Source code

001/*        BioJava development code
002 *
003 * This code may be freely distributed and modified under the
004 * terms of the GNU Lesser General Public Licence.  This should
005 * be distributed with the code.  If you do not have a copy,
006 * see:
007 *
008 *      http://www.gnu.org/copyleft/lesser.html
009 *
010 * Copyright for this code is held jointly by the individual
011 * authors.  These should be listed in @author doc comments.
012 *
013 * For more information on the BioJava project and its aims,
014 * or to join the biojava-l mailing list, visit the home page
015 * at:
016 *
017 *      http://www.biojava.org/
018 *
019 */
020package org.biojava.nbio.ronn;
021
022import org.biojava.nbio.core.sequence.ProteinSequence;
023import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
024import org.biojava.nbio.data.sequence.FastaSequence;
025import org.biojava.nbio.data.sequence.SequenceUtil;
026
027import java.io.FileInputStream;
028import java.io.FileNotFoundException;
029import java.io.IOException;
030import java.util.ArrayList;
031import java.util.List;
032import java.util.Map;
033import java.util.TreeMap;
034
035
036/**
037 * This class gives public API to RONN functions.
038 * It is build on top of the command line client. Due to this fact a few things
039 * could be improved and extended pending the refactoring of the command line client.
040 *
041 * The input sequence limitations - the input sequence must not contain any ambiguous characters,
042 * and have a minimum length of 19 amino acids.
043 *
044 * @author Peter Troshin
045 * @version 1.0
046 * @since 3.0.2
047 *
048 */
049public class Jronn {
050
051        // Load models
052        private static final ModelLoader loader = new ModelLoader();
053        static {
054                try {
055                        loader.loadModels();
056                } catch (NumberFormatException e) {
057                        throw new RuntimeException("Fails to load models!" + e.getMessage(), e);
058                } catch (IOException e) {
059                        throw new RuntimeException("Fails to load models!" + e.getMessage(), e);
060                }
061        }
062
063
064        /**
065         * Holder for the ranges, contain pointers to starting and ending position
066         * on the sequence which comprises a disordered region. Immutable.
067         * @author pvtroshin
068         */
069        public static class Range {
070                /**
071                 * Range starting position counts from 1 (the first position on the sequence is 1)
072                 */
073                public final int from;
074                /**
075                 * The range ending position includes the last residue.
076                 */
077                public final int to;
078
079                public final float score;
080                public Range(int from, int to, float score) {
081                        assert from>=0;
082                        assert from<to;
083                        this.from = from;
084                        this.to = to;
085                        this.score = score;
086                }
087
088                @Override
089                public String toString() {
090                        return "Range" + " From:" + from + "\t" + "to: " + to + "\n";
091                }
092
093                @Override
094                public int hashCode() {
095                        final int prime = 31;
096                        int result = 1;
097                        result = prime * result + from;
098                        result = prime * result + to;
099                        return result;
100                }
101
102                @Override
103                public boolean equals(Object obj) {
104                        if (this == obj)
105                                return true;
106                        if (obj == null)
107                                return false;
108                        if (getClass() != obj.getClass())
109                                return false;
110                        Range other = (Range) obj;
111                        if (from != other.from)
112                                return false;
113                        if (to != other.to)
114                                return false;
115                        return true;
116                }
117
118
119        }
120
121        /**
122         * Calculates the probability value for each residue in the protein sequence,
123         * telling the probability that the residue belongs to disordered region.
124         * In general, values greater than 0.5 considered to be in the disordered regions.
125         *
126         * @param sequence an instance of FastaSequence object, holding the name and the sequence.
127         * @return the probability scores for each residue in the sequence
128         */
129        public static float[] getDisorderScores(FastaSequence sequence) {
130                    return predictSerial(sequence);
131        }
132
133        /**
134         * Calculates the probability value for each residue in the protein sequence,
135         * telling the probability that the residue belongs to disordered region.
136         * In general, values greater than 0.5 considered to be in the disordered regions.
137         *
138         * @param sequence an instance of FastaSequence object, holding the name and the sequence.
139         * @return the probability scores for each residue in the sequence
140         */
141        public static float[] getDisorderScores(ProteinSequence sequence) {
142
143                FastaSequence seq = convertProteinSequencetoFasta(sequence);
144
145                return predictSerial(seq);
146        }
147
148        /** Utility method to convert a BioJava ProteinSequence object to the FastaSequence
149         *  object used internally in JRonn.
150         *
151         * @param sequence
152         * @return
153         */
154        public static FastaSequence convertProteinSequencetoFasta(ProteinSequence sequence){
155                StringBuffer buf = new StringBuffer();
156                for (AminoAcidCompound compound : sequence) {
157
158                        String c = compound.getShortName();
159
160                        if (! SequenceUtil.NON_AA.matcher(c).find()) {
161                                buf.append(c);
162                        } else {
163                                buf.append("X");
164                        }
165                }
166
167                return new FastaSequence(sequence.getAccession().getID(),buf.toString());
168        }
169
170        private static float[] predictSerial(FastaSequence fsequence) {
171                ORonn.validateSequenceForRonn(fsequence);
172                ORonn ronn;
173                float[] disorder = null;
174                try {
175                        ronn = new ORonn(fsequence, loader);
176                        disorder = ronn.call().getMeanScores();
177                } catch (NumberFormatException e) {
178                        throw new RuntimeException("Jronn fails to load models " + e.getLocalizedMessage(), e);
179                } catch (IOException e) {
180                        throw new RuntimeException("Jronn fails to load models " + e.getLocalizedMessage(), e);
181                }
182                return disorder;
183        }
184
185        /**
186         * Calculates the disordered regions of the sequence. More formally, the regions for which the
187         * probability of disorder is greater then 0.50.
188         *
189         *
190         * @param sequence an instance of FastaSequence object, holding the name and the sequence.
191         * @return the array of ranges if there are any residues predicted to have the
192         * probability of disorder greater then 0.5, null otherwise.
193         *
194         */
195        public static Range[] getDisorder(FastaSequence sequence) {
196                float[] scores = getDisorderScores(sequence);
197                return scoresToRanges(scores, RonnConstraint.DEFAULT_RANGE_PROBABILITY_THRESHOLD);
198        }
199
200        /**
201         * Convert raw scores to ranges. Gives ranges for given probability of disorder value
202         * @param scores the raw probability of disorder scores for each residue in the sequence.
203         * @param probability the cut off threshold. Include all residues with the probability of disorder greater then this value
204         * @return the array of ranges if there are any residues predicted to have the
205         * probability of disorder greater then {@code probability}, null otherwise.
206         */
207        public static Range[] scoresToRanges(float[] scores, float probability)  {
208                assert scores!=null && scores.length>0;
209                assert probability>0 && probability<1;
210
211                int count=0;
212                int regionLen=0;
213                List<Range> ranges = new ArrayList<Range>();
214                for(float score: scores) {
215                        count++;
216                        // Round to 2 decimal points before comparison
217                        score = (float) (Math.round(score*100.0)/100.0);
218                        if(score>probability) {
219                                regionLen++;
220                        } else {
221                                if(regionLen>0) {
222                                        ranges.add(new Range(count-regionLen, count-1,score));
223                                }
224                                regionLen=0;
225                        }
226                }
227                // In case of the range to boundary runs to the very end of the sequence
228                if(regionLen>1) {
229                        ranges.add(new Range(count-regionLen+1, count,scores[scores.length-1]));
230                }
231                return ranges.toArray(new Range[ranges.size()]);
232
233        }
234
235        /**
236         * Calculates the probability of disorder scores for each residue in the sequence for
237         * many sequences in the input.
238         *
239         * @param sequences the list of the FastaSequence objects
240         * @return the Map with key->FastaSequence, value->probability of disorder for each residue
241         * @see #getDisorder(FastaSequence)
242         */
243        public static Map<FastaSequence,float[]> getDisorderScores(List<FastaSequence> sequences) {
244                Map<FastaSequence,float[]> results = new TreeMap<FastaSequence, float[]>();
245                for(FastaSequence fsequence : sequences) {
246                        results.put(fsequence, predictSerial(fsequence));
247                }
248                return results;
249        }
250
251        /**
252         * Calculates the disordered regions of the sequence for many sequences in the input.
253         *
254         * @param sequences sequences the list of the FastaSequence objects
255         * @return
256         * @see #getDisorder(FastaSequence)
257         */
258        public static Map<FastaSequence,Range[]> getDisorder(List<FastaSequence> sequences) {
259                Map<FastaSequence,Range[]> disorderRanges = new TreeMap<FastaSequence,Range[]>();
260                for(FastaSequence fs: sequences) {
261                        disorderRanges.put(fs, getDisorder(fs));
262                }
263                return disorderRanges;
264        }
265
266        /**
267         * Calculates the disordered regions of the protein sequence.
268         * @param fastaFile input file name containing the sequence in FASTA
269         * @return the Map with key->FastaSequence, value->the list of disordered regions for each sequence
270         * @throws FileNotFoundException if the input file cannot be found
271         * @throws IOException of the system cannot access or read from the input file
272         * @see #getDisorder(FastaSequence)
273         * @see #Jronn.Range
274         */
275        public static Map<FastaSequence,Range[]> getDisorder(String fastaFile) throws FileNotFoundException, IOException {
276                final List<FastaSequence> sequences = SequenceUtil.readFasta(new FileInputStream(fastaFile));
277                return getDisorder(sequences);
278        }
279
280        /**
281         * TODO
282         *
283         * High performance method for calculating disorder. Use multiple threads to achieve the speedup.
284         *
285         * @param fastaFile  fully qualified path to the input FASTA file
286         * @param outputFile file name of the file for the results
287         * @param threadNumber the number of threads to use, default
288         * @param controls the format of the result file
289         * @throws FileNotFoundException if input file in not found
290         * @throws IOException if the input or the output files cannot be accessed
291         * @see ORonn.ResultLayout
292
293        public static void calculateDisorder(String fastaFile, String outputFile, int threadNumber, ResultLayout layout) throws FileNotFoundException, IOException {
294                final List<FastaSequence> sequences = SequenceUtil.readFasta(new FileInputStream(fastaFile));
295                InputParameters in = new InputParameters();
296                in.setFilePrm(fastaFile, InputParameters.inputKey);
297                in.setFilePrm(outputFile, InputParameters.outputKey);
298                //in.setThreadNum(Integer.toString(threadNumber));
299                ORonn.predictParallel(sequences, in, loader);
300        }
301        */
302}