001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021
022package org.biojava.bio.proteomics;
023
024import java.util.HashMap;
025import java.util.HashSet;
026import java.util.Iterator;
027import java.util.LinkedList;
028import java.util.List;
029import java.util.Set;
030
031import org.biojava.bio.BioError;
032import org.biojava.bio.BioException;
033import org.biojava.bio.seq.ProteinTools;
034import org.biojava.bio.seq.io.SymbolTokenization;
035import org.biojava.bio.symbol.AlphabetManager;
036import org.biojava.bio.symbol.AtomicSymbol;
037import org.biojava.bio.symbol.FiniteAlphabet;
038import org.biojava.bio.symbol.IllegalSymbolException;
039import org.biojava.bio.symbol.Symbol;
040import org.biojava.bio.symbol.SymbolList;
041import org.biojava.bio.symbol.SymbolPropertyTable;
042
043/**
044 * <code>MassCalc</code> calculates the mass of peptides which for our
045 * purposes are <code>SymbolList</code>s which contain
046 * <code>Symbol</code>sfrom the protein <code>Alphabet</code>. It uses
047 * the mono-isotopic and average-isotopic masses identical to those
048 * specified at www.micromass.co.uk
049 * Note: This class does not handle selenocysteine and pyrrolysine.
050 *
051 * @author M. Jones sdfsd
052 * @author Keith James (minor changes)
053 * @author Mark Schreiber
054 * @author George Waldon - getMolecularWeight
055 */
056public class MassCalc {
057    //Possibly these should be put in a configurable table somewhere
058    /**
059     * Constant value of Carbon monoisotopic mass
060     */
061    public static final double Cmono = 12.00000;
062
063    /**
064     * Constant value of  Hydrogen monoisotopic mass
065     */
066    public static final double Hmono = 1.0078250;
067
068    /**
069     * Constant value of Nitrogen monoisotopic mass
070     */
071    public static final double Nmono = 14.0030740;
072
073    /**
074     * Constant value of Oxygen monoisotopic mass
075     */
076    public static final double Omono = 15.9949146;
077
078    /**
079     * Constant value of Carbon average mass
080     */
081    public static final double Cavg = 12.011;
082
083    /**
084     * Constant value of Hydrogen average mass
085     */
086    public static final double Havg = 1.00794;
087
088    /**
089     * Constant value of Nitrogen average mass
090     */
091    public static final double Navg = 14.00674;
092
093    /**
094     * Constant value of Oxygen average mass
095     */
096    public static final double Oavg = 15.9994;
097
098    //Save values here so that modifications are not global
099    private HashMap mSymbolPropertyHash;
100
101
102    private HashMap mVariableModPropertyHash;
103
104    /*
105     * If instance methods are being used this will store the
106     * isotopically and MH_PLUS correct terminal mass to be created.
107     * Need to be able to handle N and C term mods in the future.
108     */
109     private double termMass;
110
111    /*
112     * Creates new MassCalc.
113     * @param isotopicType The type of isotopes to calculate. Either
114     * mono isotopic or average isotopic. It realy is the name of
115     * SymbolProperty table. Ex. SymbolPropertyTable.AVG_MASS or
116     * SymbolPropertyTable.MONO_MASS */
117
118
119    /**
120     * Creates a new <code>MassCalc</code>.
121     *
122     * @param isotopicType a <code>String</code>. The type of isotopes
123     * to calculate. Either mono isotopic or average
124     * isotopic. Acceptable values are
125     * <code>SymbolPropertyTable.AVG_MASS</code> or
126     * <code>SymbolPropertyTable.MONO_MASS</code>.
127     * @param MH_PLUS a <code>boolean</code>.
128     */
129    public MassCalc(String isotopicType, boolean MH_PLUS) {
130        //Calculate hydroxyl mass
131        termMass = calcTermMass(isotopicType, MH_PLUS);
132        mSymbolPropertyHash = new HashMap();
133
134        SymbolPropertyTable symbolPropertyTable =
135            ProteinTools.getSymbolPropertyTable(isotopicType);
136
137        mVariableModPropertyHash = new HashMap();
138
139        Iterator symbolList = ProteinTools.getAlphabet().iterator();
140
141        for(; symbolList.hasNext(); )
142        {
143            Symbol sym = (Symbol) symbolList.next();
144            //SELENOCYSTEINE and PYRROLYSINE
145            if(sym==ProteinTools.o() || sym==ProteinTools.u())
146                continue;
147            try {
148                try {
149                    Double value =
150                        new Double(symbolPropertyTable.getDoubleValue(sym));
151                    mSymbolPropertyHash.put(sym, value);
152                } catch (NullPointerException npe) {
153                    //This seems to be happening when a amino acid is
154                    //in the property table that doesn't have a residue value
155                }
156            }
157            catch (IllegalSymbolException ise)
158            {
159                throw new BioError("Error setting properties for Symbol " + sym, ise);
160            }
161        }
162    }
163
164    /**
165     * Use this to set a post translational modification for the
166     * <code>Symbol</code> represented by this character. It will only
167     * affect the current <code>MassCalc</code> instance and will not
168     * affect the static method.
169     *
170     * @param symbolToken a <code>char</code> representing a
171     * <code>Symbol</code>.
172     * @param mass a <code>double</code> to be the new mass of the
173     * residue.
174     *
175     * @exception IllegalSymbolException if the <code>Symbol</code> is
176     * not recognised.
177     */
178    public void setSymbolModification(char symbolToken, double mass)
179        throws IllegalSymbolException
180    {
181        SymbolTokenization toke;
182
183        try {
184            toke = ProteinTools.getAlphabet().getTokenization("token");
185        } catch (BioException ex) {
186            throw new BioError("Expected a tokenization", ex);
187        }
188
189        Symbol sym = toke.parseToken("" + symbolToken);
190        mSymbolPropertyHash.put(sym, new Double(mass));
191    }
192
193    /** Add Variable modifications.  If multiple masses are
194     * set by this method more then one mass will be returned for a mass
195     * calculation. For example if a peptide contains two Mets and the user sets
196     * the native and oxidized mass for the Met then the masses returned will be
197     * of the peptide with 0, 1 and 2 modified Mets.
198     * @param residue The one char id for this residue
199     * @param masses
200     * @throws IllegalSymbolException
201     * @see #getVariableMasses(SymbolList peptide)
202     * @see #addVariableModification(Symbol residue,double[] masses)
203     */
204     public void addVariableModification(char residue,
205                                         double[] masses)
206        throws IllegalSymbolException{
207        Symbol sym = getSymbolForChar(residue);
208        addVariableModification(sym, masses);
209     }
210
211
212     /** Add Variable modifications. If multiple masses are
213      * set by this method more then one mass will be returned for a mass
214      * calculation. For example if a peptide contains two Mets and the user sets
215      * the native and oxidized mass for the Met then the masses returned will be
216      * of the peptide with 0, 1 and 2 modified Mets.
217      */
218     public void addVariableModification(Symbol residue,
219                                         double[] masses)
220        throws IllegalSymbolException{
221
222        List massList = new LinkedList();
223        for(int i=0; i<masses.length; i++){
224            massList.add(new Double(masses[i]));
225        }
226        getVariableModMap().put(residue, massList);
227     }
228
229     /**
230      * Remove all variable modifications assocaited with this residue.
231      *
232      */
233     public boolean removeVariableModifications(char residue)
234        throws IllegalSymbolException{
235        Symbol sym = getSymbolForChar(residue);
236        return removeVariableModifications(sym);
237     }
238
239     /**
240      * Remove all variable modifications assocaited with this residue.
241      *
242      */
243     public boolean removeVariableModifications(Symbol residue){
244        if(getVariableModMap().remove(residue) != null){
245            return true;
246        }else{
247            return false;
248        }
249     }
250
251     /** Calculate the molecular weight of a protein, making estimates whenever it is 
252      * possible like averaging mass values for ambiguity symbols or counting
253      * zero when gaps are encountered. 
254      * The method is tolerant for ambiguity symbols as long as they can be
255      * resolved to a series of atomic symbols whose mass is available in the 
256      * ResidueProperties.xml configuration file or they are gaps.
257      * The method returns the same value as getMass(SymbolList proteinSeq,
258      * SymbolPropertyTable.AVG_MASS, false) when only atomic symbols are found
259      * in the polypeptide.
260      *
261      * @since 1.5
262      */
263     public static final double getMolecularWeight(SymbolList proteinSeq)
264     throws IllegalSymbolException {
265        double pepMass = 0.0;
266        Symbol gap = AlphabetManager.alphabetForName("PROTEIN").getGapSymbol();
267        SymbolPropertyTable sPT =
268            ProteinTools.getSymbolPropertyTable(SymbolPropertyTable.AVG_MASS);
269
270        for (Iterator it = proteinSeq.iterator(); it.hasNext(); ) {
271            Symbol s = (Symbol) it.next();
272            if( s instanceof AtomicSymbol) {
273                pepMass += sPT.getDoubleValue(s);
274            } else {
275                FiniteAlphabet matches = (FiniteAlphabet) s.getMatches();
276                if(matches.size() == 0) {
277                    if(s.equals(gap)) {
278                        continue;
279                    }
280                    throw new IllegalSymbolException(
281                        "Symbol " + s.getName() + " has no mass associated");
282                } else {
283                    int count = 0;
284                    double mass = 0.0;
285                    for(Iterator i = matches.iterator(); i.hasNext(); ) {
286                        AtomicSymbol as = (AtomicSymbol) i.next();
287                        //SELENOCYSTEINE and PYRROLYSINE
288                        if(as==ProteinTools.o() ||
289                                as==ProteinTools.u())
290                            continue;
291                        mass += sPT.getDoubleValue(as);
292                        count++;
293                    }
294                    pepMass += mass/((double)count);
295                }
296            }
297        }
298
299        //Calculate hydroxyl mass
300        double termMass = calcTermMass(SymbolPropertyTable.AVG_MASS, false);
301
302        if (pepMass != 0.0) {
303            pepMass += termMass;
304        }
305
306        return pepMass;
307     }
308     
309    /**
310     * <code>getMass</code> calculates the mass of this peptide. This
311     * only works for the values in the ResidueProperties.xml
312     * configuration file. It is probably slightly faster than the
313     * instance method, but it does not handle post-translational
314     * modifications.
315     *
316     * @param proteinSeq a <code>SymbolList</code> whose mass is to be
317     * calculated. This should use the protein alphabet.
318     * @param isotopicType a <code>String</code> The type of isotopes
319     * to calculate. Either mono isotopic or average
320     * isotopic. Acceptable values are
321     * <code>SymbolPropertyTable.AVG_MASS</code> or
322     * <code>SymbolPropertyTable.MONO_MASS</code>.
323     * @param MH_PLUS a <code>boolean</code> true if the value needed
324     * is the MH+ mass.
325     *
326     * @return a <code>double</code> mass of the peptide.
327     *
328     * @exception IllegalSymbolException if the
329     * <code>SymbolList</code> contains illegal
330     * <code>Symbol</code>s.
331     */
332     public static final double getMass(SymbolList proteinSeq,
333             String isotopicType,
334             boolean MH_PLUS)
335        throws IllegalSymbolException
336    {
337
338        double pepMass = 0.0;
339
340        SymbolPropertyTable sPT =
341            ProteinTools.getSymbolPropertyTable(isotopicType);
342
343        for (Iterator it = proteinSeq.iterator(); it.hasNext(); ) {
344            Symbol s = (Symbol) it.next();
345            if(!(s instanceof AtomicSymbol)) {
346                throw new IllegalSymbolException(
347                    "Symbol " + s.getName() + " is not atomic");
348            }
349            pepMass += sPT.getDoubleValue(s);
350        }
351
352        //Calculate hydroxyl mass
353        double termMass = calcTermMass(isotopicType, MH_PLUS);
354
355        if (pepMass != 0.0) {
356            pepMass += termMass;
357        }
358
359        return pepMass;
360    }
361
362    /**
363     * Get the Mass of this peptide. Use this if you want to set fixed
364     * modifications and have created an instance of MassCalc. The
365     * value is calculated using the value of MH_PLUS defined in the
366     * constructor. The static method may be faster.
367     *
368     * @param proteinSeq The sequence for mass calculation
369     *
370     * @return The mass of the sequence */
371    public double getMass(SymbolList proteinSeq)
372        throws IllegalSymbolException
373    {
374
375        double pepMass = 0.0;
376
377        HashMap symbolPropertyMap = getSymbolPropertyMap();
378
379        for (Iterator it = proteinSeq.iterator(); it.hasNext(); ) {
380            Symbol s = (Symbol) it.next();
381            if(! symbolPropertyMap.containsKey(s)){
382                throw new IllegalSymbolException(s, "The mass of the symbol "+s.getName()+" is unknown");
383            }
384            Double mass = (Double) symbolPropertyMap.get(s);
385            pepMass += mass.doubleValue();
386        }
387        pepMass += getTermMass();
388
389        return pepMass;
390    }
391
392     /**
393      * Get all masses including the variable mass.
394      * Allgorythm
395      *
396      * 1 Get the first residue of the sequence
397      * create a list of all the standard and non-standard massses for this reidue
398      * for each residue mass goto 1 with the sequence of all residues after the current residue
399      *     add the residue mass to each mass from 1 to the list
400      *
401      *
402      *     @see #addVariableModification
403      */
404     public double[] getVariableMasses(SymbolList peptide) throws IllegalSymbolException {
405        double[] vMasses = getVMasses(peptide);
406        for(int i=0; i<vMasses.length; i++){
407            vMasses[i] +=  getTermMass();
408        }
409
410        return vMasses;
411     }
412
413
414     private HashMap getVariableModMap() {
415         return mVariableModPropertyHash;
416     }
417
418
419    private HashMap getSymbolPropertyMap(){
420        return mSymbolPropertyHash;// = ProteinTools.getSymbolPropertyTable(name);
421    }
422
423    /**
424     * <code>getTermMass</code> returns the terminal mass being used
425     * by the instance methods.
426     *
427     * @return a <code>double</code> mass.
428     */
429    public double getTermMass(){
430        return termMass;
431    }
432
433  /**
434      *
435      */
436     private double[] getVMasses(SymbolList peptide) throws IllegalSymbolException {
437        Set allMassList = new HashSet();
438
439        Symbol sym = peptide.symbolAt(1);
440        if(!getSymbolPropertyMap().containsKey(sym)){
441            String msg = "No mass Set for Symbol " + sym.getName();
442            throw new IllegalSymbolException(msg);
443        }
444
445        //Create a list for all masses of the current residue
446        List curResMasses = null;
447        if(getVariableModMap().containsKey(sym)){
448            curResMasses = new LinkedList((List) getVariableModMap().get(sym));
449        }else{
450            curResMasses = new LinkedList();
451        }
452        curResMasses.add(getSymbolPropertyMap().get(sym));
453
454        //Move through all masses and calculate the masses of all of the sub peptides
455        Iterator it = curResMasses.iterator();
456        while(it.hasNext()){
457            double resMass = ((Double)it.next()).doubleValue();
458            if(peptide.length() == 1){
459                allMassList.add(new Double(resMass));
460            }else{
461                //Get all masses of remaining peptide
462                double[] subMasses = getVMasses(peptide.subList(2, peptide.length()));
463                //Get next modified mass for symbol
464                for(int i=0; i<subMasses.length; i++){
465                    double pepMass = resMass + subMasses[i];
466                    allMassList.add(new Double(pepMass));
467                }
468            }
469        }
470        //Convert list to an array
471        double masses[] = new double[allMassList.size()];
472        int i=0;
473        for(Iterator mit = allMassList.iterator(); mit.hasNext(); i++){
474            masses[i] = ((Double)mit.next()).doubleValue();
475        }
476        return masses;
477     }
478
479
480    private static double calcTermMass(String isotopicType, boolean MH_PLUS) {
481        double termMass = 0.0;
482        if (isotopicType.equals(SymbolPropertyTable.AVG_MASS)) {
483            //Add the C-terminal OH and N-Term H
484            termMass += Havg + Oavg + Havg;
485            //Add the extra H
486            if (MH_PLUS) {
487                termMass += Havg;
488            }
489        }
490        else if (isotopicType.equals(SymbolPropertyTable.MONO_MASS)) {
491            //Add the C-terminal OH and N-Term H
492            termMass += Hmono + Omono + Hmono;
493            //Add the extra H
494            if (MH_PLUS) {
495                termMass += Hmono;
496            }
497        }
498        return termMass;
499    }
500
501     private Symbol getSymbolForChar(char symbolToken)
502         throws IllegalSymbolException{
503         SymbolTokenization toke;
504         try {
505             toke = ProteinTools.getAlphabet().getTokenization("token");
506         } catch (BioException ex) {
507             throw new BioError("Expected a tokenization", ex);
508         }
509
510         Symbol sym = toke.parseToken("" + symbolToken);
511        return sym;
512     }
513}