001/*        BioJava development code
002 *
003 * This code may be freely distributed and modified under the
004 * terms of the GNU Lesser General Public Licence.  This should
005 * be distributed with the code.  If you do not have a copy,
006 * see:
007 *
008 *      http://www.gnu.org/copyleft/lesser.html
009 *
010 * Copyright for this code is held jointly by the individual
011 * authors.  These should be listed in @author doc comments.
012 *
013 * For more information on the BioJava project and its aims,
014 * or to join the biojava-l mailing list, visit the home page
015 * at:
016 *
017 *      http://www.biojava.org/
018 *
019 */
020package org.biojava.nbio.ronn;
021
022import org.biojava.nbio.core.sequence.ProteinSequence;
023import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
024import org.biojava.nbio.data.sequence.FastaSequence;
025import org.biojava.nbio.data.sequence.SequenceUtil;
026
027import java.io.FileInputStream;
028import java.io.FileNotFoundException;
029import java.io.IOException;
030import java.io.Serializable;
031import java.util.ArrayList;
032import java.util.List;
033import java.util.Map;
034import java.util.TreeMap;
035import java.util.stream.Collectors;
036
037
038/**
039 * This class gives public API to RONN functions.
 * It is built on top of the command line client. Due to this fact a few things
041 * could be improved and extended pending the refactoring of the command line client.
042 *
043 * The input sequence limitations - the input sequence must not contain any ambiguous characters,
044 * and have a minimum length of 19 amino acids.
045 *
046 * @author Peter Troshin
047 * @version 1.0
048 * @since 3.0.2
049 *
050 */
051public class Jronn implements Serializable {
052
053        /**
054         *
055         */
056        private static final long serialVersionUID = 8104272449130849946L;
057        // Load models
058        private static final ModelLoader loader = new ModelLoader();
059        static {
060                try {
061                        loader.loadModels();
062                } catch (NumberFormatException e) {
063                        throw new RuntimeException("Fails to load models!" + e.getMessage(), e);
064                } catch (IOException e) {
065                        throw new RuntimeException("Fails to load models!" + e.getMessage(), e);
066                }
067        }
068
069
070        /**
071         * Holder for the ranges, contain pointers to starting and ending position
072         * on the sequence which comprises a disordered region. Immutable.
073         * @author pvtroshin
074         */
075        public static class Range {
076                /**
077                 * Range starting position counts from 1 (the first position on the sequence is 1)
078                 */
079                public final int from;
080                /**
081                 * The range ending position includes the last residue.
082                 */
083                public final int to;
084
085                public final float score;
086                public Range(int from, int to, float score) {
087                        assert from>=0;
088                        assert from<to;
089                        this.from = from;
090                        this.to = to;
091                        this.score = score;
092                }
093
094                @Override
095                public String toString() {
096                        return "Range" + " From:" + from + "\t" + "to: " + to + "\n";
097                }
098
099                @Override
100                public int hashCode() {
101                        final int prime = 31;
102                        int result = 1;
103                        result = prime * result + from;
104                        result = prime * result + to;
105                        return result;
106                }
107
108                @Override
109                public boolean equals(Object obj) {
110                        if (this == obj)
111                                return true;
112                        if (obj == null)
113                                return false;
114                        if (getClass() != obj.getClass())
115                                return false;
116                        Range other = (Range) obj;
117                        if (from != other.from)
118                                return false;
119                        if (to != other.to)
120                                return false;
121                        return true;
122                }
123
124
125        }
126
127        /**
128         * Calculates the probability value for each residue in the protein sequence,
129         * telling the probability that the residue belongs to disordered region.
130         * In general, values greater than 0.5 considered to be in the disordered regions.
131         *
132         * @param sequence an instance of FastaSequence object, holding the name and the sequence.
133         * @return the probability scores for each residue in the sequence
134         */
135        public static float[] getDisorderScores(FastaSequence sequence) {
136                    return predictSerial(sequence);
137        }
138
139        /**
140         * Calculates the probability value for each residue in the protein sequence,
141         * telling the probability that the residue belongs to disordered region.
142         * In general, values greater than 0.5 considered to be in the disordered regions.
143         *
144         * @param sequence an instance of FastaSequence object, holding the name and the sequence.
145         * @return the probability scores for each residue in the sequence
146         */
147        public static float[] getDisorderScores(ProteinSequence sequence) {
148
149                FastaSequence seq = convertProteinSequencetoFasta(sequence);
150
151                return predictSerial(seq);
152        }
153
154        /** Utility method to convert a BioJava ProteinSequence object to the FastaSequence
155         *  object used internally in JRonn.
156         *
157         * @param sequence
158         * @return
159         */
160        public static FastaSequence convertProteinSequencetoFasta(ProteinSequence sequence){
161                StringBuffer buf = new StringBuffer();
162                for (AminoAcidCompound compound : sequence) {
163
164                        String c = compound.getShortName();
165
166                        if (! SequenceUtil.NON_AA.matcher(c).find()) {
167                                buf.append(c);
168                        } else {
169                                buf.append("X");
170                        }
171                }
172
173                return new FastaSequence(sequence.getAccession().getID(),buf.toString());
174        }
175
176        private static float[] predictSerial(FastaSequence fsequence) {
177                ORonn.validateSequenceForRonn(fsequence);
178                ORonn ronn;
179                float[] disorder = null;
180                try {
181                        ronn = new ORonn(fsequence, loader);
182                        disorder = ronn.call().getMeanScores();
183                } catch (NumberFormatException | IOException e) {
184                        throw new RuntimeException("Jronn fails to load models " + e.getLocalizedMessage(), e);
185                } 
186                return disorder;
187        }
188
189        /**
190         * Calculates the disordered regions of the sequence. More formally, the regions for which the
191         * probability of disorder is greater then 0.50.
192         *
193         *
194         * @param sequence an instance of FastaSequence object, holding the name and the sequence.
195         * @return the array of ranges if there are any residues predicted to have the
196         * probability of disorder greater then 0.5, null otherwise.
197         *
198         */
199        public static Range[] getDisorder(FastaSequence sequence) {
200                float[] scores = getDisorderScores(sequence);
201                return scoresToRanges(scores, RonnConstraint.DEFAULT_RANGE_PROBABILITY_THRESHOLD);
202        }
203
204        /**
205         * Convert raw scores to ranges. Gives ranges for given probability of disorder value
206         * @param scores the raw probability of disorder scores for each residue in the sequence.
207         * @param probability the cut off threshold. Include all residues with the probability of disorder greater then this value
208         * @return the array of ranges if there are any residues predicted to have the
209         * probability of disorder greater then {@code probability}, null otherwise.
210         */
211        public static Range[] scoresToRanges(float[] scores, float probability)  {
212                assert scores!=null && scores.length>0;
213                assert probability>0 && probability<1;
214
215                int count=0;
216                int regionLen=0;
217                List<Range> ranges = new ArrayList<>();
218                for(float score: scores) {
219                        count++;
220                        // Round to 2 decimal points before comparison
221                        score = (float) (Math.round(score*100.0)/100.0);
222                        if(score>probability) {
223                                regionLen++;
224                        } else {
225                                if(regionLen>0) {
226                                        ranges.add(new Range(count-regionLen, count-1,score));
227                                }
228                                regionLen=0;
229                        }
230                }
231                // In case of the range to boundary runs to the very end of the sequence
232                if(regionLen>1) {
233                        ranges.add(new Range(count-regionLen+1, count,scores[scores.length-1]));
234                }
235                return ranges.toArray(new Range[ranges.size()]);
236
237        }
238
239        /**
240         * Calculates the probability of disorder scores for each residue in the sequence for
241         * many sequences in the input.
242         *
243         * @param sequences the list of the FastaSequence objects
244         * @return the Map with key->FastaSequence, value->probability of disorder for each residue
245         * @see #getDisorder(FastaSequence)
246         */
247        public static Map<FastaSequence,float[]> getDisorderScores(List<FastaSequence> sequences) {
248                Map<FastaSequence,float[]> results = new TreeMap<>();
249                results = sequences.stream().collect(Collectors.toMap(fastaSequence ->  fastaSequence, fastaSequence -> predictSerial(fastaSequence)));
250                return results;
251        }
252
253        /**
254         * Calculates the disordered regions of the sequence for many sequences in the input.
255         *
256         * @param sequences sequences the list of the FastaSequence objects
257         * @return
258         * @see #getDisorder(FastaSequence)
259         */
260        public static Map<FastaSequence,Range[]> getDisorder(List<FastaSequence> sequences) {
261                Map<FastaSequence,Range[]> disorderRanges = new TreeMap<>();
262                disorderRanges = sequences.stream().collect(Collectors.toMap(fastaSequence -> fastaSequence, fastaSequence -> getDisorder(fastaSequence) ));
263                return disorderRanges;
264        }
265
266        /**
267         * Calculates the disordered regions of the protein sequence.
268         * @param fastaFile input file name containing the sequence in FASTA
269         * @return the Map with key->FastaSequence, value->the list of disordered regions for each sequence
270         * @throws FileNotFoundException if the input file cannot be found
271         * @throws IOException of the system cannot access or read from the input file
272         * @see #getDisorder(FastaSequence)
273         */
274        public static Map<FastaSequence,Range[]> getDisorder(String fastaFile) throws IOException {
275                final List<FastaSequence> sequences = SequenceUtil.readFasta(new FileInputStream(fastaFile));
276                return getDisorder(sequences);
277        }
278
279        /**
280         * TODO
281         *
282         * High performance method for calculating disorder. Use multiple threads to achieve the speedup.
283         *
284         * @param fastaFile  fully qualified path to the input FASTA file
285         * @param outputFile file name of the file for the results
286         * @param threadNumber the number of threads to use, default
287         * @param controls the format of the result file
         * @throws FileNotFoundException if input file is not found
289         * @throws IOException if the input or the output files cannot be accessed
290         * @see ORonn.ResultLayout
291
292        public static void calculateDisorder(String fastaFile, String outputFile, int threadNumber, ResultLayout layout) throws FileNotFoundException, IOException {
293                final List<FastaSequence> sequences = SequenceUtil.readFasta(new FileInputStream(fastaFile));
294                InputParameters in = new InputParameters();
295                in.setFilePrm(fastaFile, InputParameters.inputKey);
296                in.setFilePrm(outputFile, InputParameters.outputKey);
297                //in.setThreadNum(Integer.toString(threadNumber));
298                ORonn.predictParallel(sequences, in, loader);
299        }
300        */
301}