001/* BioJava development code 002 * 003 * This code may be freely distributed and modified under the 004 * terms of the GNU Lesser General Public Licence. This should 005 * be distributed with the code. If you do not have a copy, 006 * see: 007 * 008 * http://www.gnu.org/copyleft/lesser.html 009 * 010 * Copyright for this code is held jointly by the individual 011 * authors. These should be listed in @author doc comments. 012 * 013 * For more information on the BioJava project and its aims, 014 * or to join the biojava-l mailing list, visit the home page 015 * at: 016 * 017 * http://www.biojava.org/ 018 * 019 */ 020package org.biojava.nbio.ronn; 021 022import org.biojava.nbio.core.sequence.ProteinSequence; 023import org.biojava.nbio.core.sequence.compound.AminoAcidCompound; 024import org.biojava.nbio.data.sequence.FastaSequence; 025import org.biojava.nbio.data.sequence.SequenceUtil; 026 027import java.io.FileInputStream; 028import java.io.FileNotFoundException; 029import java.io.IOException; 030import java.io.Serializable; 031import java.util.ArrayList; 032import java.util.List; 033import java.util.Map; 034import java.util.TreeMap; 035 036 037/** 038 * This class gives public API to RONN functions. 039 * It is build on top of the command line client. Due to this fact a few things 040 * could be improved and extended pending the refactoring of the command line client. 041 * 042 * The input sequence limitations - the input sequence must not contain any ambiguous characters, 043 * and have a minimum length of 19 amino acids. 044 * 045 * @author Peter Troshin 046 * @version 1.0 047 * @since 3.0.2 048 * 049 */ 050public class Jronn implements Serializable { 051 052 /** 053 * 054 */ 055 private static final long serialVersionUID = 8104272449130849946L; 056 // Load models 057 private static final ModelLoader loader = new ModelLoader(); 058 static { 059 try { 060 loader.loadModels(); 061 } catch (NumberFormatException e) { 062 throw new RuntimeException("Fails to load models!" + e.getMessage(), e); 063 } catch (IOException e) { 064 throw new RuntimeException("Fails to load models!" + e.getMessage(), e); 065 } 066 } 067 068 069 /** 070 * Holder for the ranges, contain pointers to starting and ending position 071 * on the sequence which comprises a disordered region. Immutable. 072 * @author pvtroshin 073 */ 074 public static class Range { 075 /** 076 * Range starting position counts from 1 (the first position on the sequence is 1) 077 */ 078 public final int from; 079 /** 080 * The range ending position includes the last residue. 081 */ 082 public final int to; 083 084 public final float score; 085 public Range(int from, int to, float score) { 086 assert from>=0; 087 assert from<to; 088 this.from = from; 089 this.to = to; 090 this.score = score; 091 } 092 093 @Override 094 public String toString() { 095 return "Range" + " From:" + from + "\t" + "to: " + to + "\n"; 096 } 097 098 @Override 099 public int hashCode() { 100 final int prime = 31; 101 int result = 1; 102 result = prime * result + from; 103 result = prime * result + to; 104 return result; 105 } 106 107 @Override 108 public boolean equals(Object obj) { 109 if (this == obj) 110 return true; 111 if (obj == null) 112 return false; 113 if (getClass() != obj.getClass()) 114 return false; 115 Range other = (Range) obj; 116 if (from != other.from) 117 return false; 118 if (to != other.to) 119 return false; 120 return true; 121 } 122 123 124 } 125 126 /** 127 * Calculates the probability value for each residue in the protein sequence, 128 * telling the probability that the residue belongs to disordered region. 129 * In general, values greater than 0.5 considered to be in the disordered regions. 130 * 131 * @param sequence an instance of FastaSequence object, holding the name and the sequence. 132 * @return the probability scores for each residue in the sequence 133 */ 134 public static float[] getDisorderScores(FastaSequence sequence) { 135 return predictSerial(sequence); 136 } 137 138 /** 139 * Calculates the probability value for each residue in the protein sequence, 140 * telling the probability that the residue belongs to disordered region. 141 * In general, values greater than 0.5 considered to be in the disordered regions. 142 * 143 * @param sequence an instance of FastaSequence object, holding the name and the sequence. 144 * @return the probability scores for each residue in the sequence 145 */ 146 public static float[] getDisorderScores(ProteinSequence sequence) { 147 148 FastaSequence seq = convertProteinSequencetoFasta(sequence); 149 150 return predictSerial(seq); 151 } 152 153 /** Utility method to convert a BioJava ProteinSequence object to the FastaSequence 154 * object used internally in JRonn. 155 * 156 * @param sequence 157 * @return 158 */ 159 public static FastaSequence convertProteinSequencetoFasta(ProteinSequence sequence){ 160 StringBuffer buf = new StringBuffer(); 161 for (AminoAcidCompound compound : sequence) { 162 163 String c = compound.getShortName(); 164 165 if (! SequenceUtil.NON_AA.matcher(c).find()) { 166 buf.append(c); 167 } else { 168 buf.append("X"); 169 } 170 } 171 172 return new FastaSequence(sequence.getAccession().getID(),buf.toString()); 173 } 174 175 private static float[] predictSerial(FastaSequence fsequence) { 176 ORonn.validateSequenceForRonn(fsequence); 177 ORonn ronn; 178 float[] disorder = null; 179 try { 180 ronn = new ORonn(fsequence, loader); 181 disorder = ronn.call().getMeanScores(); 182 } catch (NumberFormatException e) { 183 throw new RuntimeException("Jronn fails to load models " + e.getLocalizedMessage(), e); 184 } catch (IOException e) { 185 throw new RuntimeException("Jronn fails to load models " + e.getLocalizedMessage(), e); 186 } 187 return disorder; 188 } 189 190 /** 191 * Calculates the disordered regions of the sequence. More formally, the regions for which the 192 * probability of disorder is greater then 0.50. 193 * 194 * 195 * @param sequence an instance of FastaSequence object, holding the name and the sequence. 196 * @return the array of ranges if there are any residues predicted to have the 197 * probability of disorder greater then 0.5, null otherwise. 198 * 199 */ 200 public static Range[] getDisorder(FastaSequence sequence) { 201 float[] scores = getDisorderScores(sequence); 202 return scoresToRanges(scores, RonnConstraint.DEFAULT_RANGE_PROBABILITY_THRESHOLD); 203 } 204 205 /** 206 * Convert raw scores to ranges. Gives ranges for given probability of disorder value 207 * @param scores the raw probability of disorder scores for each residue in the sequence. 208 * @param probability the cut off threshold. Include all residues with the probability of disorder greater then this value 209 * @return the array of ranges if there are any residues predicted to have the 210 * probability of disorder greater then {@code probability}, null otherwise. 211 */ 212 public static Range[] scoresToRanges(float[] scores, float probability) { 213 assert scores!=null && scores.length>0; 214 assert probability>0 && probability<1; 215 216 int count=0; 217 int regionLen=0; 218 List<Range> ranges = new ArrayList<Range>(); 219 for(float score: scores) { 220 count++; 221 // Round to 2 decimal points before comparison 222 score = (float) (Math.round(score*100.0)/100.0); 223 if(score>probability) { 224 regionLen++; 225 } else { 226 if(regionLen>0) { 227 ranges.add(new Range(count-regionLen, count-1,score)); 228 } 229 regionLen=0; 230 } 231 } 232 // In case of the range to boundary runs to the very end of the sequence 233 if(regionLen>1) { 234 ranges.add(new Range(count-regionLen+1, count,scores[scores.length-1])); 235 } 236 return ranges.toArray(new Range[ranges.size()]); 237 238 } 239 240 /** 241 * Calculates the probability of disorder scores for each residue in the sequence for 242 * many sequences in the input. 243 * 244 * @param sequences the list of the FastaSequence objects 245 * @return the Map with key->FastaSequence, value->probability of disorder for each residue 246 * @see #getDisorder(FastaSequence) 247 */ 248 public static Map<FastaSequence,float[]> getDisorderScores(List<FastaSequence> sequences) { 249 Map<FastaSequence,float[]> results = new TreeMap<FastaSequence, float[]>(); 250 for(FastaSequence fsequence : sequences) { 251 results.put(fsequence, predictSerial(fsequence)); 252 } 253 return results; 254 } 255 256 /** 257 * Calculates the disordered regions of the sequence for many sequences in the input. 258 * 259 * @param sequences sequences the list of the FastaSequence objects 260 * @return 261 * @see #getDisorder(FastaSequence) 262 */ 263 public static Map<FastaSequence,Range[]> getDisorder(List<FastaSequence> sequences) { 264 Map<FastaSequence,Range[]> disorderRanges = new TreeMap<FastaSequence,Range[]>(); 265 for(FastaSequence fs: sequences) { 266 disorderRanges.put(fs, getDisorder(fs)); 267 } 268 return disorderRanges; 269 } 270 271 /** 272 * Calculates the disordered regions of the protein sequence. 273 * @param fastaFile input file name containing the sequence in FASTA 274 * @return the Map with key->FastaSequence, value->the list of disordered regions for each sequence 275 * @throws FileNotFoundException if the input file cannot be found 276 * @throws IOException of the system cannot access or read from the input file 277 * @see #getDisorder(FastaSequence) 278 * @see #Jronn.Range 279 */ 280 public static Map<FastaSequence,Range[]> getDisorder(String fastaFile) throws FileNotFoundException, IOException { 281 final List<FastaSequence> sequences = SequenceUtil.readFasta(new FileInputStream(fastaFile)); 282 return getDisorder(sequences); 283 } 284 285 /** 286 * TODO 287 * 288 * High performance method for calculating disorder. Use multiple threads to achieve the speedup. 289 * 290 * @param fastaFile fully qualified path to the input FASTA file 291 * @param outputFile file name of the file for the results 292 * @param threadNumber the number of threads to use, default 293 * @param controls the format of the result file 294 * @throws FileNotFoundException if input file in not found 295 * @throws IOException if the input or the output files cannot be accessed 296 * @see ORonn.ResultLayout 297 298 public static void calculateDisorder(String fastaFile, String outputFile, int threadNumber, ResultLayout layout) throws FileNotFoundException, IOException { 299 final List<FastaSequence> sequences = SequenceUtil.readFasta(new FileInputStream(fastaFile)); 300 InputParameters in = new InputParameters(); 301 in.setFilePrm(fastaFile, InputParameters.inputKey); 302 in.setFilePrm(outputFile, InputParameters.outputKey); 303 //in.setThreadNum(Integer.toString(threadNumber)); 304 ORonn.predictParallel(sequences, in, loader); 305 } 306 */ 307}