001/* BioJava development code 002 * 003 * This code may be freely distributed and modified under the 004 * terms of the GNU Lesser General Public Licence. This should 005 * be distributed with the code. If you do not have a copy, 006 * see: 007 * 008 * http://www.gnu.org/copyleft/lesser.html 009 * 010 * Copyright for this code is held jointly by the individual 011 * authors. These should be listed in @author doc comments. 012 * 013 * For more information on the BioJava project and its aims, 014 * or to join the biojava-l mailing list, visit the home page 015 * at: 016 * 017 * http://www.biojava.org/ 018 * 019 */ 020package org.biojava.nbio.ronn; 021 022import org.biojava.nbio.core.sequence.ProteinSequence; 023import org.biojava.nbio.core.sequence.compound.AminoAcidCompound; 024import org.biojava.nbio.data.sequence.FastaSequence; 025import org.biojava.nbio.data.sequence.SequenceUtil; 026 027import java.io.FileInputStream; 028import java.io.FileNotFoundException; 029import java.io.IOException; 030import java.io.Serializable; 031import java.util.ArrayList; 032import java.util.List; 033import java.util.Map; 034import java.util.TreeMap; 035import java.util.stream.Collectors; 036 037 038/** 039 * This class gives public API to RONN functions. 040 * It is build on top of the command line client. Due to this fact a few things 041 * could be improved and extended pending the refactoring of the command line client. 042 * 043 * The input sequence limitations - the input sequence must not contain any ambiguous characters, 044 * and have a minimum length of 19 amino acids. 045 * 046 * @author Peter Troshin 047 * @version 1.0 048 * @since 3.0.2 049 * 050 */ 051public class Jronn implements Serializable { 052 053 /** 054 * 055 */ 056 private static final long serialVersionUID = 8104272449130849946L; 057 // Load models 058 private static final ModelLoader loader = new ModelLoader(); 059 static { 060 try { 061 loader.loadModels(); 062 } catch (NumberFormatException e) { 063 throw new RuntimeException("Fails to load models!" + e.getMessage(), e); 064 } catch (IOException e) { 065 throw new RuntimeException("Fails to load models!" + e.getMessage(), e); 066 } 067 } 068 069 070 /** 071 * Holder for the ranges, contain pointers to starting and ending position 072 * on the sequence which comprises a disordered region. Immutable. 073 * @author pvtroshin 074 */ 075 public static class Range { 076 /** 077 * Range starting position counts from 1 (the first position on the sequence is 1) 078 */ 079 public final int from; 080 /** 081 * The range ending position includes the last residue. 082 */ 083 public final int to; 084 085 public final float score; 086 public Range(int from, int to, float score) { 087 assert from>=0; 088 assert from<to; 089 this.from = from; 090 this.to = to; 091 this.score = score; 092 } 093 094 @Override 095 public String toString() { 096 return "Range" + " From:" + from + "\t" + "to: " + to + "\n"; 097 } 098 099 @Override 100 public int hashCode() { 101 final int prime = 31; 102 int result = 1; 103 result = prime * result + from; 104 result = prime * result + to; 105 return result; 106 } 107 108 @Override 109 public boolean equals(Object obj) { 110 if (this == obj) 111 return true; 112 if (obj == null) 113 return false; 114 if (getClass() != obj.getClass()) 115 return false; 116 Range other = (Range) obj; 117 if (from != other.from) 118 return false; 119 if (to != other.to) 120 return false; 121 return true; 122 } 123 124 125 } 126 127 /** 128 * Calculates the probability value for each residue in the protein sequence, 129 * telling the probability that the residue belongs to disordered region. 130 * In general, values greater than 0.5 considered to be in the disordered regions. 131 * 132 * @param sequence an instance of FastaSequence object, holding the name and the sequence. 133 * @return the probability scores for each residue in the sequence 134 */ 135 public static float[] getDisorderScores(FastaSequence sequence) { 136 return predictSerial(sequence); 137 } 138 139 /** 140 * Calculates the probability value for each residue in the protein sequence, 141 * telling the probability that the residue belongs to disordered region. 142 * In general, values greater than 0.5 considered to be in the disordered regions. 143 * 144 * @param sequence an instance of FastaSequence object, holding the name and the sequence. 145 * @return the probability scores for each residue in the sequence 146 */ 147 public static float[] getDisorderScores(ProteinSequence sequence) { 148 149 FastaSequence seq = convertProteinSequencetoFasta(sequence); 150 151 return predictSerial(seq); 152 } 153 154 /** Utility method to convert a BioJava ProteinSequence object to the FastaSequence 155 * object used internally in JRonn. 156 * 157 * @param sequence 158 * @return 159 */ 160 public static FastaSequence convertProteinSequencetoFasta(ProteinSequence sequence){ 161 StringBuffer buf = new StringBuffer(); 162 for (AminoAcidCompound compound : sequence) { 163 164 String c = compound.getShortName(); 165 166 if (! SequenceUtil.NON_AA.matcher(c).find()) { 167 buf.append(c); 168 } else { 169 buf.append("X"); 170 } 171 } 172 173 return new FastaSequence(sequence.getAccession().getID(),buf.toString()); 174 } 175 176 private static float[] predictSerial(FastaSequence fsequence) { 177 ORonn.validateSequenceForRonn(fsequence); 178 ORonn ronn; 179 float[] disorder = null; 180 try { 181 ronn = new ORonn(fsequence, loader); 182 disorder = ronn.call().getMeanScores(); 183 } catch (NumberFormatException | IOException e) { 184 throw new RuntimeException("Jronn fails to load models " + e.getLocalizedMessage(), e); 185 } 186 return disorder; 187 } 188 189 /** 190 * Calculates the disordered regions of the sequence. More formally, the regions for which the 191 * probability of disorder is greater then 0.50. 192 * 193 * 194 * @param sequence an instance of FastaSequence object, holding the name and the sequence. 195 * @return the array of ranges if there are any residues predicted to have the 196 * probability of disorder greater then 0.5, null otherwise. 197 * 198 */ 199 public static Range[] getDisorder(FastaSequence sequence) { 200 float[] scores = getDisorderScores(sequence); 201 return scoresToRanges(scores, RonnConstraint.DEFAULT_RANGE_PROBABILITY_THRESHOLD); 202 } 203 204 /** 205 * Convert raw scores to ranges. Gives ranges for given probability of disorder value 206 * @param scores the raw probability of disorder scores for each residue in the sequence. 207 * @param probability the cut off threshold. Include all residues with the probability of disorder greater then this value 208 * @return the array of ranges if there are any residues predicted to have the 209 * probability of disorder greater then {@code probability}, null otherwise. 210 */ 211 public static Range[] scoresToRanges(float[] scores, float probability) { 212 assert scores!=null && scores.length>0; 213 assert probability>0 && probability<1; 214 215 int count=0; 216 int regionLen=0; 217 List<Range> ranges = new ArrayList<Range>(); 218 for(float score: scores) { 219 count++; 220 // Round to 2 decimal points before comparison 221 score = (float) (Math.round(score*100.0)/100.0); 222 if(score>probability) { 223 regionLen++; 224 } else { 225 if(regionLen>0) { 226 ranges.add(new Range(count-regionLen, count-1,score)); 227 } 228 regionLen=0; 229 } 230 } 231 // In case of the range to boundary runs to the very end of the sequence 232 if(regionLen>1) { 233 ranges.add(new Range(count-regionLen+1, count,scores[scores.length-1])); 234 } 235 return ranges.toArray(new Range[ranges.size()]); 236 237 } 238 239 /** 240 * Calculates the probability of disorder scores for each residue in the sequence for 241 * many sequences in the input. 242 * 243 * @param sequences the list of the FastaSequence objects 244 * @return the Map with key->FastaSequence, value->probability of disorder for each residue 245 * @see #getDisorder(FastaSequence) 246 */ 247 public static Map<FastaSequence,float[]> getDisorderScores(List<FastaSequence> sequences) { 248 Map<FastaSequence,float[]> results = new TreeMap<FastaSequence, float[]>(); 249 results = sequences.stream().collect(Collectors.toMap(fastaSequence -> fastaSequence, fastaSequence -> predictSerial(fastaSequence))); 250 return results; 251 } 252 253 /** 254 * Calculates the disordered regions of the sequence for many sequences in the input. 255 * 256 * @param sequences sequences the list of the FastaSequence objects 257 * @return 258 * @see #getDisorder(FastaSequence) 259 */ 260 public static Map<FastaSequence,Range[]> getDisorder(List<FastaSequence> sequences) { 261 Map<FastaSequence,Range[]> disorderRanges = new TreeMap<FastaSequence,Range[]>(); 262 disorderRanges = sequences.stream().collect(Collectors.toMap(fastaSequence -> fastaSequence, fastaSequence -> getDisorder(fastaSequence) )); 263 return disorderRanges; 264 } 265 266 /** 267 * Calculates the disordered regions of the protein sequence. 268 * @param fastaFile input file name containing the sequence in FASTA 269 * @return the Map with key->FastaSequence, value->the list of disordered regions for each sequence 270 * @throws FileNotFoundException if the input file cannot be found 271 * @throws IOException of the system cannot access or read from the input file 272 * @see #getDisorder(FastaSequence) 273 * @see #Jronn.Range 274 */ 275 public static Map<FastaSequence,Range[]> getDisorder(String fastaFile) throws FileNotFoundException, IOException { 276 final List<FastaSequence> sequences = SequenceUtil.readFasta(new FileInputStream(fastaFile)); 277 return getDisorder(sequences); 278 } 279 280 /** 281 * TODO 282 * 283 * High performance method for calculating disorder. Use multiple threads to achieve the speedup. 284 * 285 * @param fastaFile fully qualified path to the input FASTA file 286 * @param outputFile file name of the file for the results 287 * @param threadNumber the number of threads to use, default 288 * @param controls the format of the result file 289 * @throws FileNotFoundException if input file in not found 290 * @throws IOException if the input or the output files cannot be accessed 291 * @see ORonn.ResultLayout 292 293 public static void calculateDisorder(String fastaFile, String outputFile, int threadNumber, ResultLayout layout) throws FileNotFoundException, IOException { 294 final List<FastaSequence> sequences = SequenceUtil.readFasta(new FileInputStream(fastaFile)); 295 InputParameters in = new InputParameters(); 296 in.setFilePrm(fastaFile, InputParameters.inputKey); 297 in.setFilePrm(outputFile, InputParameters.outputKey); 298 //in.setThreadNum(Integer.toString(threadNumber)); 299 ORonn.predictParallel(sequences, in, loader); 300 } 301 */ 302}