001/*        BioJava development code
002 *
003 * This code may be freely distributed and modified under the
004 * terms of the GNU Lesser General Public Licence.  This should
005 * be distributed with the code.  If you do not have a copy,
006 * see:
007 *
008 *      http://www.gnu.org/copyleft/lesser.html
009 *
010 * Copyright for this code is held jointly by the individual
011 * authors.  These should be listed in @author doc comments.
012 *
013 * For more information on the BioJava project and its aims,
014 * or to join the biojava-l mailing list, visit the home page
015 * at:
016 *
017 *      http://www.biojava.org/
018 *
019 */
020package org.biojava.nbio.ronn;
021
022import org.biojava.nbio.core.sequence.ProteinSequence;
023import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
024import org.biojava.nbio.data.sequence.FastaSequence;
025import org.biojava.nbio.data.sequence.SequenceUtil;
026
027import java.io.FileInputStream;
028import java.io.FileNotFoundException;
029import java.io.IOException;
030import java.io.Serializable;
031import java.util.ArrayList;
032import java.util.List;
033import java.util.Map;
034import java.util.TreeMap;
035
036
037/**
038 * This class gives public API to RONN functions.
039 * It is build on top of the command line client. Due to this fact a few things
040 * could be improved and extended pending the refactoring of the command line client.
041 *
042 * The input sequence limitations - the input sequence must not contain any ambiguous characters,
043 * and have a minimum length of 19 amino acids.
044 *
045 * @author Peter Troshin
046 * @version 1.0
047 * @since 3.0.2
048 *
049 */
050public class Jronn implements Serializable {
051
052        /**
053         *
054         */
055        private static final long serialVersionUID = 8104272449130849946L;
056        // Load models
057        private static final ModelLoader loader = new ModelLoader();
058        static {
059                try {
060                        loader.loadModels();
061                } catch (NumberFormatException e) {
062                        throw new RuntimeException("Fails to load models!" + e.getMessage(), e);
063                } catch (IOException e) {
064                        throw new RuntimeException("Fails to load models!" + e.getMessage(), e);
065                }
066        }
067
068
069        /**
070         * Holder for the ranges, contain pointers to starting and ending position
071         * on the sequence which comprises a disordered region. Immutable.
072         * @author pvtroshin
073         */
074        public static class Range {
075                /**
076                 * Range starting position counts from 1 (the first position on the sequence is 1)
077                 */
078                public final int from;
079                /**
080                 * The range ending position includes the last residue.
081                 */
082                public final int to;
083
084                public final float score;
085                public Range(int from, int to, float score) {
086                        assert from>=0;
087                        assert from<to;
088                        this.from = from;
089                        this.to = to;
090                        this.score = score;
091                }
092
093                @Override
094                public String toString() {
095                        return "Range" + " From:" + from + "\t" + "to: " + to + "\n";
096                }
097
098                @Override
099                public int hashCode() {
100                        final int prime = 31;
101                        int result = 1;
102                        result = prime * result + from;
103                        result = prime * result + to;
104                        return result;
105                }
106
107                @Override
108                public boolean equals(Object obj) {
109                        if (this == obj)
110                                return true;
111                        if (obj == null)
112                                return false;
113                        if (getClass() != obj.getClass())
114                                return false;
115                        Range other = (Range) obj;
116                        if (from != other.from)
117                                return false;
118                        if (to != other.to)
119                                return false;
120                        return true;
121                }
122
123
124        }
125
126        /**
127         * Calculates the probability value for each residue in the protein sequence,
128         * telling the probability that the residue belongs to disordered region.
129         * In general, values greater than 0.5 considered to be in the disordered regions.
130         *
131         * @param sequence an instance of FastaSequence object, holding the name and the sequence.
132         * @return the probability scores for each residue in the sequence
133         */
134        public static float[] getDisorderScores(FastaSequence sequence) {
135                    return predictSerial(sequence);
136        }
137
138        /**
139         * Calculates the probability value for each residue in the protein sequence,
140         * telling the probability that the residue belongs to disordered region.
141         * In general, values greater than 0.5 considered to be in the disordered regions.
142         *
143         * @param sequence an instance of FastaSequence object, holding the name and the sequence.
144         * @return the probability scores for each residue in the sequence
145         */
146        public static float[] getDisorderScores(ProteinSequence sequence) {
147
148                FastaSequence seq = convertProteinSequencetoFasta(sequence);
149
150                return predictSerial(seq);
151        }
152
153        /** Utility method to convert a BioJava ProteinSequence object to the FastaSequence
154         *  object used internally in JRonn.
155         *
156         * @param sequence
157         * @return
158         */
159        public static FastaSequence convertProteinSequencetoFasta(ProteinSequence sequence){
160                StringBuffer buf = new StringBuffer();
161                for (AminoAcidCompound compound : sequence) {
162
163                        String c = compound.getShortName();
164
165                        if (! SequenceUtil.NON_AA.matcher(c).find()) {
166                                buf.append(c);
167                        } else {
168                                buf.append("X");
169                        }
170                }
171
172                return new FastaSequence(sequence.getAccession().getID(),buf.toString());
173        }
174
175        private static float[] predictSerial(FastaSequence fsequence) {
176                ORonn.validateSequenceForRonn(fsequence);
177                ORonn ronn;
178                float[] disorder = null;
179                try {
180                        ronn = new ORonn(fsequence, loader);
181                        disorder = ronn.call().getMeanScores();
182                } catch (NumberFormatException e) {
183                        throw new RuntimeException("Jronn fails to load models " + e.getLocalizedMessage(), e);
184                } catch (IOException e) {
185                        throw new RuntimeException("Jronn fails to load models " + e.getLocalizedMessage(), e);
186                }
187                return disorder;
188        }
189
190        /**
191         * Calculates the disordered regions of the sequence. More formally, the regions for which the
192         * probability of disorder is greater then 0.50.
193         *
194         *
195         * @param sequence an instance of FastaSequence object, holding the name and the sequence.
196         * @return the array of ranges if there are any residues predicted to have the
197         * probability of disorder greater then 0.5, null otherwise.
198         *
199         */
200        public static Range[] getDisorder(FastaSequence sequence) {
201                float[] scores = getDisorderScores(sequence);
202                return scoresToRanges(scores, RonnConstraint.DEFAULT_RANGE_PROBABILITY_THRESHOLD);
203        }
204
205        /**
206         * Convert raw scores to ranges. Gives ranges for given probability of disorder value
207         * @param scores the raw probability of disorder scores for each residue in the sequence.
208         * @param probability the cut off threshold. Include all residues with the probability of disorder greater then this value
209         * @return the array of ranges if there are any residues predicted to have the
210         * probability of disorder greater then {@code probability}, null otherwise.
211         */
212        public static Range[] scoresToRanges(float[] scores, float probability)  {
213                assert scores!=null && scores.length>0;
214                assert probability>0 && probability<1;
215
216                int count=0;
217                int regionLen=0;
218                List<Range> ranges = new ArrayList<Range>();
219                for(float score: scores) {
220                        count++;
221                        // Round to 2 decimal points before comparison
222                        score = (float) (Math.round(score*100.0)/100.0);
223                        if(score>probability) {
224                                regionLen++;
225                        } else {
226                                if(regionLen>0) {
227                                        ranges.add(new Range(count-regionLen, count-1,score));
228                                }
229                                regionLen=0;
230                        }
231                }
232                // In case of the range to boundary runs to the very end of the sequence
233                if(regionLen>1) {
234                        ranges.add(new Range(count-regionLen+1, count,scores[scores.length-1]));
235                }
236                return ranges.toArray(new Range[ranges.size()]);
237
238        }
239
240        /**
241         * Calculates the probability of disorder scores for each residue in the sequence for
242         * many sequences in the input.
243         *
244         * @param sequences the list of the FastaSequence objects
245         * @return the Map with key->FastaSequence, value->probability of disorder for each residue
246         * @see #getDisorder(FastaSequence)
247         */
248        public static Map<FastaSequence,float[]> getDisorderScores(List<FastaSequence> sequences) {
249                Map<FastaSequence,float[]> results = new TreeMap<FastaSequence, float[]>();
250                for(FastaSequence fsequence : sequences) {
251                        results.put(fsequence, predictSerial(fsequence));
252                }
253                return results;
254        }
255
256        /**
257         * Calculates the disordered regions of the sequence for many sequences in the input.
258         *
259         * @param sequences sequences the list of the FastaSequence objects
260         * @return
261         * @see #getDisorder(FastaSequence)
262         */
263        public static Map<FastaSequence,Range[]> getDisorder(List<FastaSequence> sequences) {
264                Map<FastaSequence,Range[]> disorderRanges = new TreeMap<FastaSequence,Range[]>();
265                for(FastaSequence fs: sequences) {
266                        disorderRanges.put(fs, getDisorder(fs));
267                }
268                return disorderRanges;
269        }
270
271        /**
272         * Calculates the disordered regions of the protein sequence.
273         * @param fastaFile input file name containing the sequence in FASTA
274         * @return the Map with key->FastaSequence, value->the list of disordered regions for each sequence
275         * @throws FileNotFoundException if the input file cannot be found
276         * @throws IOException of the system cannot access or read from the input file
277         * @see #getDisorder(FastaSequence)
278         * @see #Jronn.Range
279         */
280        public static Map<FastaSequence,Range[]> getDisorder(String fastaFile) throws FileNotFoundException, IOException {
281                final List<FastaSequence> sequences = SequenceUtil.readFasta(new FileInputStream(fastaFile));
282                return getDisorder(sequences);
283        }
284
285        /**
286         * TODO
287         *
288         * High performance method for calculating disorder. Use multiple threads to achieve the speedup.
289         *
290         * @param fastaFile  fully qualified path to the input FASTA file
291         * @param outputFile file name of the file for the results
292         * @param threadNumber the number of threads to use, default
293         * @param controls the format of the result file
294         * @throws FileNotFoundException if input file in not found
295         * @throws IOException if the input or the output files cannot be accessed
296         * @see ORonn.ResultLayout
297
298        public static void calculateDisorder(String fastaFile, String outputFile, int threadNumber, ResultLayout layout) throws FileNotFoundException, IOException {
299                final List<FastaSequence> sequences = SequenceUtil.readFasta(new FileInputStream(fastaFile));
300                InputParameters in = new InputParameters();
301                in.setFilePrm(fastaFile, InputParameters.inputKey);
302                in.setFilePrm(outputFile, InputParameters.outputKey);
303                //in.setThreadNum(Integer.toString(threadNumber));
304                ORonn.predictParallel(sequences, in, loader);
305        }
306        */
307}