001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021 022package org.biojava.bio.proteomics; 023 024import java.util.HashMap; 025import java.util.HashSet; 026import java.util.Iterator; 027import java.util.LinkedList; 028import java.util.List; 029import java.util.Set; 030 031import org.biojava.bio.BioError; 032import org.biojava.bio.BioException; 033import org.biojava.bio.seq.ProteinTools; 034import org.biojava.bio.seq.io.SymbolTokenization; 035import org.biojava.bio.symbol.AlphabetManager; 036import org.biojava.bio.symbol.AtomicSymbol; 037import org.biojava.bio.symbol.FiniteAlphabet; 038import org.biojava.bio.symbol.IllegalSymbolException; 039import org.biojava.bio.symbol.Symbol; 040import org.biojava.bio.symbol.SymbolList; 041import org.biojava.bio.symbol.SymbolPropertyTable; 042 043/** 044 * <code>MassCalc</code> calculates the mass of peptides which for our 045 * purposes are <code>SymbolList</code>s which contain 046 * <code>Symbol</code>sfrom the protein <code>Alphabet</code>. It uses 047 * the mono-isotopic and average-isotopic masses identical to those 048 * specified at www.micromass.co.uk 049 * Note: This class does not handle selenocysteine and pyrrolysine. 050 * 051 * @author M. Jones sdfsd 052 * @author Keith James (minor changes) 053 * @author Mark Schreiber 054 * @author George Waldon - getMolecularWeight 055 */ 056public class MassCalc { 057 //Possibly these should be put in a configurable table somewhere 058 /** 059 * Constant value of Carbon monoisotopic mass 060 */ 061 public static final double Cmono = 12.00000; 062 063 /** 064 * Constant value of Hydrogen monoisotopic mass 065 */ 066 public static final double Hmono = 1.0078250; 067 068 /** 069 * Constant value of Nitrogen monoisotopic mass 070 */ 071 public static final double Nmono = 14.0030740; 072 073 /** 074 * Constant value of Oxygen monoisotopic mass 075 */ 076 public static final double Omono = 15.9949146; 077 078 /** 079 * Constant value of Carbon average mass 080 */ 081 public static final double Cavg = 12.011; 082 083 /** 084 * Constant value of Hydrogen average mass 085 */ 086 public static final double Havg = 1.00794; 087 088 /** 089 * Constant value of Nitrogen average mass 090 */ 091 public static final double Navg = 14.00674; 092 093 /** 094 * Constant value of Oxygen average mass 095 */ 096 public static final double Oavg = 15.9994; 097 098 //Save values here so that modifications are not global 099 private HashMap mSymbolPropertyHash; 100 101 102 private HashMap mVariableModPropertyHash; 103 104 /* 105 * If instance methods are being used this will store the 106 * isotopically and MH_PLUS correct terminal mass to be created. 107 * Need to be able to handle N and C term mods in the future. 108 */ 109 private double termMass; 110 111 /* 112 * Creates new MassCalc. 113 * @param isotopicType The type of isotopes to calculate. Either 114 * mono isotopic or average isotopic. It realy is the name of 115 * SymbolProperty table. Ex. SymbolPropertyTable.AVG_MASS or 116 * SymbolPropertyTable.MONO_MASS */ 117 118 119 /** 120 * Creates a new <code>MassCalc</code>. 121 * 122 * @param isotopicType a <code>String</code>. The type of isotopes 123 * to calculate. Either mono isotopic or average 124 * isotopic. Acceptable values are 125 * <code>SymbolPropertyTable.AVG_MASS</code> or 126 * <code>SymbolPropertyTable.MONO_MASS</code>. 127 * @param MH_PLUS a <code>boolean</code>. 128 */ 129 public MassCalc(String isotopicType, boolean MH_PLUS) { 130 //Calculate hydroxyl mass 131 termMass = calcTermMass(isotopicType, MH_PLUS); 132 mSymbolPropertyHash = new HashMap(); 133 134 SymbolPropertyTable symbolPropertyTable = 135 ProteinTools.getSymbolPropertyTable(isotopicType); 136 137 mVariableModPropertyHash = new HashMap(); 138 139 Iterator symbolList = ProteinTools.getAlphabet().iterator(); 140 141 for(; symbolList.hasNext(); ) 142 { 143 Symbol sym = (Symbol) symbolList.next(); 144 //SELENOCYSTEINE and PYRROLYSINE 145 if(sym==ProteinTools.o() || sym==ProteinTools.u()) 146 continue; 147 try { 148 try { 149 Double value = 150 new Double(symbolPropertyTable.getDoubleValue(sym)); 151 mSymbolPropertyHash.put(sym, value); 152 } catch (NullPointerException npe) { 153 //This seems to be happening when a amino acid is 154 //in the property table that doesn't have a residue value 155 } 156 } 157 catch (IllegalSymbolException ise) 158 { 159 throw new BioError("Error setting properties for Symbol " + sym, ise); 160 } 161 } 162 } 163 164 /** 165 * Use this to set a post translational modification for the 166 * <code>Symbol</code> represented by this character. It will only 167 * affect the current <code>MassCalc</code> instance and will not 168 * affect the static method. 169 * 170 * @param symbolToken a <code>char</code> representing a 171 * <code>Symbol</code>. 172 * @param mass a <code>double</code> to be the new mass of the 173 * residue. 174 * 175 * @exception IllegalSymbolException if the <code>Symbol</code> is 176 * not recognised. 177 */ 178 public void setSymbolModification(char symbolToken, double mass) 179 throws IllegalSymbolException 180 { 181 SymbolTokenization toke; 182 183 try { 184 toke = ProteinTools.getAlphabet().getTokenization("token"); 185 } catch (BioException ex) { 186 throw new BioError("Expected a tokenization", ex); 187 } 188 189 Symbol sym = toke.parseToken("" + symbolToken); 190 mSymbolPropertyHash.put(sym, new Double(mass)); 191 } 192 193 /** Add Variable modifications. If multiple masses are 194 * set by this method more then one mass will be returned for a mass 195 * calculation. For example if a peptide contains two Mets and the user sets 196 * the native and oxidized mass for the Met then the masses returned will be 197 * of the peptide with 0, 1 and 2 modified Mets. 198 * @param residue The one char id for this residue 199 * @param masses 200 * @throws IllegalSymbolException 201 * @see #getVariableMasses(SymbolList peptide) 202 * @see #addVariableModification(Symbol residue,double[] masses) 203 */ 204 public void addVariableModification(char residue, 205 double[] masses) 206 throws IllegalSymbolException{ 207 Symbol sym = getSymbolForChar(residue); 208 addVariableModification(sym, masses); 209 } 210 211 212 /** Add Variable modifications. If multiple masses are 213 * set by this method more then one mass will be returned for a mass 214 * calculation. For example if a peptide contains two Mets and the user sets 215 * the native and oxidized mass for the Met then the masses returned will be 216 * of the peptide with 0, 1 and 2 modified Mets. 217 */ 218 public void addVariableModification(Symbol residue, 219 double[] masses) 220 throws IllegalSymbolException{ 221 222 List massList = new LinkedList(); 223 for(int i=0; i<masses.length; i++){ 224 massList.add(new Double(masses[i])); 225 } 226 getVariableModMap().put(residue, massList); 227 } 228 229 /** 230 * Remove all variable modifications assocaited with this residue. 231 * 232 */ 233 public boolean removeVariableModifications(char residue) 234 throws IllegalSymbolException{ 235 Symbol sym = getSymbolForChar(residue); 236 return removeVariableModifications(sym); 237 } 238 239 /** 240 * Remove all variable modifications assocaited with this residue. 241 * 242 */ 243 public boolean removeVariableModifications(Symbol residue){ 244 if(getVariableModMap().remove(residue) != null){ 245 return true; 246 }else{ 247 return false; 248 } 249 } 250 251 /** Calculate the molecular weight of a protein, making estimates whenever it is 252 * possible like averaging mass values for ambiguity symbols or counting 253 * zero when gaps are encountered. 254 * The method is tolerant for ambiguity symbols as long as they can be 255 * resolved to a series of atomic symbols whose mass is available in the 256 * ResidueProperties.xml configuration file or they are gaps. 257 * The method returns the same value as getMass(SymbolList proteinSeq, 258 * SymbolPropertyTable.AVG_MASS, false) when only atomic symbols are found 259 * in the polypeptide. 260 * 261 * @since 1.5 262 */ 263 public static final double getMolecularWeight(SymbolList proteinSeq) 264 throws IllegalSymbolException { 265 double pepMass = 0.0; 266 Symbol gap = AlphabetManager.alphabetForName("PROTEIN").getGapSymbol(); 267 SymbolPropertyTable sPT = 268 ProteinTools.getSymbolPropertyTable(SymbolPropertyTable.AVG_MASS); 269 270 for (Iterator it = proteinSeq.iterator(); it.hasNext(); ) { 271 Symbol s = (Symbol) it.next(); 272 if( s instanceof AtomicSymbol) { 273 pepMass += sPT.getDoubleValue(s); 274 } else { 275 FiniteAlphabet matches = (FiniteAlphabet) s.getMatches(); 276 if(matches.size() == 0) { 277 if(s.equals(gap)) { 278 continue; 279 } 280 throw new IllegalSymbolException( 281 "Symbol " + s.getName() + " has no mass associated"); 282 } else { 283 int count = 0; 284 double mass = 0.0; 285 for(Iterator i = matches.iterator(); i.hasNext(); ) { 286 AtomicSymbol as = (AtomicSymbol) i.next(); 287 //SELENOCYSTEINE and PYRROLYSINE 288 if(as==ProteinTools.o() || 289 as==ProteinTools.u()) 290 continue; 291 mass += sPT.getDoubleValue(as); 292 count++; 293 } 294 pepMass += mass/((double)count); 295 } 296 } 297 } 298 299 //Calculate hydroxyl mass 300 double termMass = calcTermMass(SymbolPropertyTable.AVG_MASS, false); 301 302 if (pepMass != 0.0) { 303 pepMass += termMass; 304 } 305 306 return pepMass; 307 } 308 309 /** 310 * <code>getMass</code> calculates the mass of this peptide. This 311 * only works for the values in the ResidueProperties.xml 312 * configuration file. It is probably slightly faster than the 313 * instance method, but it does not handle post-translational 314 * modifications. 315 * 316 * @param proteinSeq a <code>SymbolList</code> whose mass is to be 317 * calculated. This should use the protein alphabet. 318 * @param isotopicType a <code>String</code> The type of isotopes 319 * to calculate. Either mono isotopic or average 320 * isotopic. Acceptable values are 321 * <code>SymbolPropertyTable.AVG_MASS</code> or 322 * <code>SymbolPropertyTable.MONO_MASS</code>. 323 * @param MH_PLUS a <code>boolean</code> true if the value needed 324 * is the MH+ mass. 325 * 326 * @return a <code>double</code> mass of the peptide. 327 * 328 * @exception IllegalSymbolException if the 329 * <code>SymbolList</code> contains illegal 330 * <code>Symbol</code>s. 331 */ 332 public static final double getMass(SymbolList proteinSeq, 333 String isotopicType, 334 boolean MH_PLUS) 335 throws IllegalSymbolException 336 { 337 338 double pepMass = 0.0; 339 340 SymbolPropertyTable sPT = 341 ProteinTools.getSymbolPropertyTable(isotopicType); 342 343 for (Iterator it = proteinSeq.iterator(); it.hasNext(); ) { 344 Symbol s = (Symbol) it.next(); 345 if(!(s instanceof AtomicSymbol)) { 346 throw new IllegalSymbolException( 347 "Symbol " + s.getName() + " is not atomic"); 348 } 349 pepMass += sPT.getDoubleValue(s); 350 } 351 352 //Calculate hydroxyl mass 353 double termMass = calcTermMass(isotopicType, MH_PLUS); 354 355 if (pepMass != 0.0) { 356 pepMass += termMass; 357 } 358 359 return pepMass; 360 } 361 362 /** 363 * Get the Mass of this peptide. Use this if you want to set fixed 364 * modifications and have created an instance of MassCalc. The 365 * value is calculated using the value of MH_PLUS defined in the 366 * constructor. The static method may be faster. 367 * 368 * @param proteinSeq The sequence for mass calculation 369 * 370 * @return The mass of the sequence */ 371 public double getMass(SymbolList proteinSeq) 372 throws IllegalSymbolException 373 { 374 375 double pepMass = 0.0; 376 377 HashMap symbolPropertyMap = getSymbolPropertyMap(); 378 379 for (Iterator it = proteinSeq.iterator(); it.hasNext(); ) { 380 Symbol s = (Symbol) it.next(); 381 if(! symbolPropertyMap.containsKey(s)){ 382 throw new IllegalSymbolException(s, "The mass of the symbol "+s.getName()+" is unknown"); 383 } 384 Double mass = (Double) symbolPropertyMap.get(s); 385 pepMass += mass.doubleValue(); 386 } 387 pepMass += getTermMass(); 388 389 return pepMass; 390 } 391 392 /** 393 * Get all masses including the variable mass. 394 * Allgorythm 395 * 396 * 1 Get the first residue of the sequence 397 * create a list of all the standard and non-standard massses for this reidue 398 * for each residue mass goto 1 with the sequence of all residues after the current residue 399 * add the residue mass to each mass from 1 to the list 400 * 401 * 402 * @see #addVariableModification 403 */ 404 public double[] getVariableMasses(SymbolList peptide) throws IllegalSymbolException { 405 double[] vMasses = getVMasses(peptide); 406 for(int i=0; i<vMasses.length; i++){ 407 vMasses[i] += getTermMass(); 408 } 409 410 return vMasses; 411 } 412 413 414 private HashMap getVariableModMap() { 415 return mVariableModPropertyHash; 416 } 417 418 419 private HashMap getSymbolPropertyMap(){ 420 return mSymbolPropertyHash;// = ProteinTools.getSymbolPropertyTable(name); 421 } 422 423 /** 424 * <code>getTermMass</code> returns the terminal mass being used 425 * by the instance methods. 426 * 427 * @return a <code>double</code> mass. 428 */ 429 public double getTermMass(){ 430 return termMass; 431 } 432 433 /** 434 * 435 */ 436 private double[] getVMasses(SymbolList peptide) throws IllegalSymbolException { 437 Set allMassList = new HashSet(); 438 439 Symbol sym = peptide.symbolAt(1); 440 if(!getSymbolPropertyMap().containsKey(sym)){ 441 String msg = "No mass Set for Symbol " + sym.getName(); 442 throw new IllegalSymbolException(msg); 443 } 444 445 //Create a list for all masses of the current residue 446 List curResMasses = null; 447 if(getVariableModMap().containsKey(sym)){ 448 curResMasses = new LinkedList((List) getVariableModMap().get(sym)); 449 }else{ 450 curResMasses = new LinkedList(); 451 } 452 curResMasses.add(getSymbolPropertyMap().get(sym)); 453 454 //Move through all masses and calculate the masses of all of the sub peptides 455 Iterator it = curResMasses.iterator(); 456 while(it.hasNext()){ 457 double resMass = ((Double)it.next()).doubleValue(); 458 if(peptide.length() == 1){ 459 allMassList.add(new Double(resMass)); 460 }else{ 461 //Get all masses of remaining peptide 462 double[] subMasses = getVMasses(peptide.subList(2, peptide.length())); 463 //Get next modified mass for symbol 464 for(int i=0; i<subMasses.length; i++){ 465 double pepMass = resMass + subMasses[i]; 466 allMassList.add(new Double(pepMass)); 467 } 468 } 469 } 470 //Convert list to an array 471 double masses[] = new double[allMassList.size()]; 472 int i=0; 473 for(Iterator mit = allMassList.iterator(); mit.hasNext(); i++){ 474 masses[i] = ((Double)mit.next()).doubleValue(); 475 } 476 return masses; 477 } 478 479 480 private static double calcTermMass(String isotopicType, boolean MH_PLUS) { 481 double termMass = 0.0; 482 if (isotopicType.equals(SymbolPropertyTable.AVG_MASS)) { 483 //Add the C-terminal OH and N-Term H 484 termMass += Havg + Oavg + Havg; 485 //Add the extra H 486 if (MH_PLUS) { 487 termMass += Havg; 488 } 489 } 490 else if (isotopicType.equals(SymbolPropertyTable.MONO_MASS)) { 491 //Add the C-terminal OH and N-Term H 492 termMass += Hmono + Omono + Hmono; 493 //Add the extra H 494 if (MH_PLUS) { 495 termMass += Hmono; 496 } 497 } 498 return termMass; 499 } 500 501 private Symbol getSymbolForChar(char symbolToken) 502 throws IllegalSymbolException{ 503 SymbolTokenization toke; 504 try { 505 toke = ProteinTools.getAlphabet().getTokenization("token"); 506 } catch (BioException ex) { 507 throw new BioError("Expected a tokenization", ex); 508 } 509 510 Symbol sym = toke.parseToken("" + symbolToken); 511 return sym; 512 } 513}