001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021package org.biojava.nbio.aaproperties;
022
023import org.biojava.nbio.aaproperties.xml.AminoAcidCompositionTable;
024import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
025import org.biojava.nbio.core.sequence.ProteinSequence;
026import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
027import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet;
028import org.slf4j.Logger;
029import org.slf4j.LoggerFactory;
030
031import javax.xml.bind.JAXBException;
032import java.io.File;
033import java.io.FileNotFoundException;
034import java.util.HashMap;
035import java.util.HashSet;
036import java.util.Map;
037import java.util.Set;
038
039/**
 * This is an adaptor class that makes it easy to generate protein properties.
 * At least one adaptor method is provided for each property available in IPeptideProperties.
042 *
043 * @author kohchuanhock
044 * @version 2011.08.22
045 * @since 3.0.2
046 * @see IPeptideProperties
047 * @see PeptidePropertiesImpl
048 */
049public class PeptideProperties {
050
051        private final static Logger logger = LoggerFactory.getLogger(PeptideProperties.class);
052
053        /**
054         * Enumeration of 20 standard amino acid code
055         */
056        public enum SingleLetterAACode { W, C, M, H, Y, F, Q, N, I, R, D, P, T, K, E, V, S, G, A, L}
057
058        /**
059         * Contains the 20 standard AA code in a set
060         */
061        public static Set<Character> standardAASet;
062
063        /**
064         * To initialize the standardAASet
065         */
066        static{
067                standardAASet = new HashSet<Character>();
068                for(SingleLetterAACode c:SingleLetterAACode.values()) standardAASet.add(c.toString().charAt(0));
069        }
070
071        /**
072         * An adaptor method to return the molecular weight of sequence.
073         * The sequence argument must be a protein sequence consisting of only non-ambiguous characters.
074         * This method will sum the molecular weight of each amino acid in the
075         * sequence. Molecular weights are based on <a href="http://web.expasy.org/findmod/findmod_masses.html">here</a>.
076         *
077         * @param sequence
078         *      a protein sequence consisting of non-ambiguous characters only
079         * @return the total molecular weight of sequence + weight of water molecule
080         */
081        public static final double getMolecularWeight(String sequence){
082                sequence = Utils.checkSequence(sequence);
083                ProteinSequence pSequence = null;
084                try {
085                        pSequence = new ProteinSequence(sequence);
086                } catch (CompoundNotFoundException e) {
087                        // the sequence was checked with Utils.checkSequence, this shouldn't happen
088                        logger.error("The protein sequence contains invalid characters ({}), this should not happen. This is most likely a bug in Utils.checkSequence()", e.getMessage());
089                }
090                IPeptideProperties pp = new PeptidePropertiesImpl();
091                return pp.getMolecularWeight(pSequence);
092        }
093
094        /**
095         * An adaptor method to return the molecular weight of sequence.
096         * The sequence argument must be a protein sequence consisting of only non-ambiguous characters.
097         * This method will sum the molecular weight of each amino acid in the
098         * sequence. Molecular weights are based on the input xml file.
099         *
100         * @param sequence
101         *      a protein sequence consisting of non-ambiguous characters only
102         * @param elementMassFile
103         *      xml file that details the mass of each elements and isotopes
104         * @param aminoAcidCompositionFile
105         *      xml file that details the composition of amino acids
106         * @return the total molecular weight of sequence + weight of water molecule
107         * @throws FileNotFoundException
108         *      thrown if either elementMassFile or aminoAcidCompositionFile are not found
109         * @throws JAXBException
110         *      thrown if unable to properly parse either elementMassFile or aminoAcidCompositionFile
111         */
112        public static final double getMolecularWeight(String sequence, File elementMassFile, File aminoAcidCompositionFile)
113        throws FileNotFoundException, JAXBException{
114                sequence = Utils.checkSequence(sequence);
115                ProteinSequence pSequence = null;
116                try {
117                        pSequence = new ProteinSequence(sequence);
118                } catch (CompoundNotFoundException e) {
119                        // the sequence was checked with Utils.checkSequence, this shouldn't happen
120                        logger.error("The protein sequence contains invalid characters ({}), this should not happen. This is most likely a bug in Utils.checkSequence()", e.getMessage());
121                }
122                IPeptideProperties pp = new PeptidePropertiesImpl();
123                return pp.getMolecularWeight(pSequence, elementMassFile, aminoAcidCompositionFile);
124        }
125
126        /**
127         * An adaptor method to return the molecular weight of sequence. The sequence argument must be a protein sequence consisting of only non-ambiguous characters.
128         * This method will sum the molecular weight of each amino acid in the
129         * sequence. Molecular weights are based on the input files. These input files must be XML using the defined schema.
130         * Note that it assumes that ElementMass.xml file can be found in default location.
131         *
132         * @param sequence
133         *      a protein sequence consisting of non-ambiguous characters only
134         *      xml file that details the mass of each elements and isotopes
135         * @param aminoAcidCompositionFile
136         *      xml file that details the composition of amino acids
137         * @return the total molecular weight of sequence + weight of water molecule
138         * @throws JAXBException
139         *      thrown if unable to properly parse either elementMassFile or aminoAcidCompositionFile
140         * @throws FileNotFoundException
141         *      thrown if either elementMassFile or aminoAcidCompositionFile are not found
142         */
143        public static final double getMolecularWeight(String sequence, File aminoAcidCompositionFile) throws FileNotFoundException, JAXBException{
144                sequence = Utils.checkSequence(sequence);
145                ProteinSequence pSequence = null;
146                try {
147                        pSequence = new ProteinSequence(sequence);
148                } catch (CompoundNotFoundException e) {
149                        // the sequence was checked with Utils.checkSequence, this shouldn't happen
150                        logger.error("The protein sequence contains invalid characters ({}), this should not happen. This is most likely a bug in Utils.checkSequence()", e.getMessage());
151                }
152                IPeptideProperties pp = new PeptidePropertiesImpl();
153                return pp.getMolecularWeight(pSequence, aminoAcidCompositionFile);
154        }
155
156        /**
157         * An adaptor method would initialize amino acid composition table based on the input xml files and stores the table for usage in future calls to
158         * IPeptideProperties.getMolecularWeightBasedOnXML(ProteinSequence, AminoAcidCompositionTable).
159         * Note that ElementMass.xml is assumed to be able to be seen in default location.
160         *
161         * @param aminoAcidCompositionFile
162         *      xml file that details the composition of amino acids
163         * @return the initialized amino acid composition table
164         * @throws JAXBException
165         *      thrown if unable to properly parse either elementMassFile or aminoAcidCompositionFile
166         * @throws FileNotFoundException
167         *      thrown if either elementMassFile or aminoAcidCompositionFile are not found
168         */
169        public static final AminoAcidCompositionTable obtainAminoAcidCompositionTable(File aminoAcidCompositionFile)
170        throws JAXBException, FileNotFoundException{
171                IPeptideProperties pp = new PeptidePropertiesImpl();
172                return pp.obtainAminoAcidCompositionTable(aminoAcidCompositionFile);
173        }
174
175        /**
176         * An adaptor method would initialize amino acid composition table based on the input xml files and stores the table for usage in future calls to
177         * IPeptideProperties.getMolecularWeightBasedOnXML(ProteinSequence, AminoAcidCompositionTable).
178         *
179         * @param elementMassFile
180         *      xml file that details the mass of each elements and isotopes
181         * @param aminoAcidCompositionFile
182         *      xml file that details the composition of amino acids
183         * @return the initialized amino acid composition table
184         * @throws JAXBException
185         *      thrown if unable to properly parse either elementMassFile or aminoAcidCompositionFile
186         * @throws FileNotFoundException
187         *      thrown if either elementMassFile or aminoAcidCompositionFile are not found
188         */
189        public static final AminoAcidCompositionTable obtainAminoAcidCompositionTable(File elementMassFile, File aminoAcidCompositionFile)
190        throws JAXBException, FileNotFoundException{
191                IPeptideProperties pp = new PeptidePropertiesImpl();
192                return pp.obtainAminoAcidCompositionTable(elementMassFile, aminoAcidCompositionFile);
193        }
194
195        /**
196         * An adaptor method that returns the molecular weight of sequence. The sequence argument must be a protein sequence consisting of only non-ambiguous characters.
197         * This method will sum the molecular weight of each amino acid in the
198         * sequence. Molecular weights are based on the AminoAcidCompositionTable.
199         * Those input files must be XML using the defined schema.
200         *
201         * @param sequence
202         *      a protein sequence consisting of non-ambiguous characters only
203         * @param aminoAcidCompositionTable
204         *      a amino acid composition table obtained by calling IPeptideProperties.obtainAminoAcidCompositionTable
205         * @return the total molecular weight of sequence + weight of water molecule
206         *      thrown if the method IPeptideProperties.setMolecularWeightXML(File, File) is not successfully called before calling this method.
207         */
208        public static double getMolecularWeightBasedOnXML(String sequence, AminoAcidCompositionTable aminoAcidCompositionTable){
209                sequence = Utils.checkSequence(sequence, aminoAcidCompositionTable.getSymbolSet());
210                ProteinSequence pSequence = null;
211                try {
212                        pSequence = new ProteinSequence(sequence, aminoAcidCompositionTable.getAminoAcidCompoundSet());
213                } catch (CompoundNotFoundException e) {
214                        // the sequence was checked with Utils.checkSequence, this shouldn't happen
215                        logger.error("The protein sequence contains invalid characters ({}), this should not happen. This is most likely a bug in Utils.checkSequence()", e.getMessage());
216                }
217                IPeptideProperties pp = new PeptidePropertiesImpl();
218                return pp.getMolecularWeightBasedOnXML(pSequence, aminoAcidCompositionTable);
219        }
220
221        /**
222         * An adaptor method to returns the absorbance (optical density) of sequence. The sequence argument
223         * must be a protein sequence consisting of only non-ambiguous characters.
224         * The computation of absorbance (optical density) follows the
225         * documentation in <a href="http://web.expasy.org/protparam/protparam-doc.html">here</a>.
226         *
227         * @param sequence
228         *      a protein sequence consisting of non-ambiguous characters only
229         * @param assumeCysReduced
230         *      true if Cys are assumed to be reduced and false if Cys are assumed to form cystines
231         * @return the absorbance (optical density) of sequence
232         */
233        public static final double getAbsorbance(String sequence, boolean assumeCysReduced){
234                sequence = Utils.checkSequence(sequence);
235                ProteinSequence pSequence = null;
236                try {
237                        pSequence = new ProteinSequence(sequence);
238                } catch (CompoundNotFoundException e) {
239                        // the sequence was checked with Utils.checkSequence, this shouldn't happen
240                        logger.error("The protein sequence contains invalid characters ({}), this should not happen. This is most likely a bug in Utils.checkSequence()", e.getMessage());
241                }
242                IPeptideProperties pp = new PeptidePropertiesImpl();
243                return pp.getAbsorbance(pSequence, assumeCysReduced);
244        }
245
246        /**
247         * An adaptor method to return the extinction coefficient of sequence. The sequence argument
248         * must be a protein sequence consisting of only non-ambiguous characters.
249         * The extinction coefficient indicates how much light a protein absorbs at
250         * a certain wavelength. It is useful to have an estimation of this
251         * coefficient for following a protein which a spectrophotometer when
252         * purifying it. The computation of extinction coefficient follows the
253         * documentation in <a href="http://web.expasy.org/protparam/protparam-doc.html">here</a>.
254         *
255         * @param sequence
256         *            a protein sequence consisting of non-ambiguous characters only
257         * @param assumeCysReduced
258         *            true if Cys are assumed to be reduced and false if Cys are
259         *            assumed to form cystines
260         * @return the extinction coefficient of sequence
261         */
262        public static final double getExtinctionCoefficient(String sequence, boolean assumeCysReduced) {
263                sequence = Utils.checkSequence(sequence);
264                ProteinSequence pSequence = null;
265                try {
266                        pSequence = new ProteinSequence(sequence);
267                } catch (CompoundNotFoundException e) {
268                        // the sequence was checked with Utils.checkSequence, this shouldn't happen
269                        logger.error("The protein sequence contains invalid characters ({}), this should not happen. This is most likely a bug in Utils.checkSequence()", e.getMessage());
270                }
271                IPeptideProperties pp = new PeptidePropertiesImpl();
272                return pp.getExtinctionCoefficient(pSequence, assumeCysReduced);
273        }
274
275        /**
276         * An adaptor method to return the instability index of sequence. The sequence argument must be
277         * a protein sequence consisting of only non-ambiguous characters.
278         * The instability index provides an estimate of the stability of your
279         * protein in a test tube. The computation of instability index follows the
280         * documentation in <a href="http://web.expasy.org/protparam/protparam-doc.html">here</a>.
281         *
282         * @param sequence
283         *              a protein sequence consisting of non-ambiguous characters only
284         * @return the instability index of sequence
285         */
286        public static final double getInstabilityIndex(String sequence) {
287                sequence = Utils.checkSequence(sequence);
288                ProteinSequence pSequence = null;
289                try {
290                        pSequence = new ProteinSequence(sequence);
291                } catch (CompoundNotFoundException e) {
292                        // the sequence was checked with Utils.checkSequence, this shouldn't happen
293                        logger.error("The protein sequence contains invalid characters ({}), this should not happen. This is most likely a bug in Utils.checkSequence()", e.getMessage());
294                }
295                IPeptideProperties pp = new PeptidePropertiesImpl();
296                return pp.getInstabilityIndex(pSequence);
297        }
298
299        /**
300         * An adaptor method to return the apliphatic index of sequence. The sequence argument must be a
301         * protein sequence consisting of only non-ambiguous characters.
302         * The aliphatic index of a protein is defined as the relative volume
303         * occupied by aliphatic side chains (alanine, valine, isoleucine, and
304         * leucine). It may be regarded as a positive factor for the increase of
305         * thermostability of globular proteins. The computation of aliphatic index
306         * follows the documentation in <a href="http://web.expasy.org/protparam/protparam-doc.html">here</a>.
307         * A protein whose instability index is smaller than 40 is predicted as stable, a value above 40 predicts that the protein may be unstable.
308         *
309         * @param sequence
310         *              a protein sequence consisting of non-ambiguous characters only
311         * @return the aliphatic index of sequence
312         */
313        public static final double getApliphaticIndex(String sequence) {
314                sequence = Utils.checkSequence(sequence);
315                ProteinSequence pSequence = null;
316                try {
317                        pSequence = new ProteinSequence(sequence);
318                } catch (CompoundNotFoundException e) {
319                        // the sequence was checked with Utils.checkSequence, this shouldn't happen
320                        logger.error("The protein sequence contains invalid characters ({}), this should not happen. This is most likely a bug in Utils.checkSequence()", e.getMessage());
321                }
322
323                IPeptideProperties pp = new PeptidePropertiesImpl();
324                return pp.getApliphaticIndex(pSequence);
325        }
326
327        /**
328         * An adaptor method to return the average hydropathy value of sequence. The sequence argument
329         * must be a protein sequence consisting of only non-ambiguous characters.
330         * The average value for a sequence is calculated as the sum of hydropathy
331         * values of all the amino acids, divided by the number of residues in the
332         * sequence. Hydropathy values are based on (Kyte, J. and Doolittle, R.F.
333         * (1982) A simple method for displaying the hydropathic character of a
334         * protein. J. Mol. Biol. 157, 105-132).
335         *
336         * @param sequence
337         *              a protein sequence consisting of non-ambiguous characters only
338         * @return the average hydropathy value of sequence
339         */
340        public static final double getAvgHydropathy(String sequence) {
341                sequence = Utils.checkSequence(sequence);
342                ProteinSequence pSequence = null;
343                try {
344                        pSequence = new ProteinSequence(sequence);
345                } catch (CompoundNotFoundException e) {
346                        // the sequence was checked with Utils.checkSequence, this shouldn't happen
347                        logger.error("The protein sequence contains invalid characters ({}), this should not happen. This is most likely a bug in Utils.checkSequence()", e.getMessage());
348                }
349                IPeptideProperties pp = new PeptidePropertiesImpl();
350                return pp.getAvgHydropathy(pSequence);
351        }
352
353        /**
354         * An adaptor method to return the isoelectric point of sequence. The sequence argument must be
355         * a protein sequence consisting of only non-ambiguous characters.
356         * The isoelectric point is the pH at which the protein carries no net
357         * electrical charge. The isoelectric point will be computed based on
358         * approach stated in
359         * <a href="http://www.innovagen.se/custom-peptide-synthesis/peptide-property-calculator/peptide-property-calculator-notes.asp#PI">here</a>
360         *
361         * pKa values used will be either
362         * those used by Expasy which referenced "Electrophoresis 1994, 15, 529-539"
363         * OR
364         * A.Lehninger, Principles of Biochemistry, 4th Edition (2005), Chapter 3, page78, Table 3-1.
365         *
366         * @param sequence
367         *              a protein sequence consisting of non-ambiguous characters only
368         * @param useExpasyValues
369         *              whether to use Expasy values (Default) or Innovagen values
370         * @return the isoelectric point of sequence
371         */
372        public static final double getIsoelectricPoint(String sequence, boolean useExpasyValues) {
373                sequence = Utils.checkSequence(sequence);
374                ProteinSequence pSequence = null;
375                try {
376                        pSequence = new ProteinSequence(sequence);
377                } catch (CompoundNotFoundException e) {
378                        // the sequence was checked with Utils.checkSequence, this shouldn't happen
379                        logger.error("The protein sequence contains invalid characters ({}), this should not happen. This is most likely a bug in Utils.checkSequence()", e.getMessage());
380                }
381                IPeptideProperties pp = new PeptidePropertiesImpl();
382                return pp.getIsoelectricPoint(pSequence, useExpasyValues);
383        }
384
385        public static final double getIsoelectricPoint(String sequence){
386                return getIsoelectricPoint(sequence, true);
387        }
388
389        /**
390         * An adaptor method to return the net charge of sequence at pH 7. The sequence argument must be
391         * a protein sequence consisting of only non-ambiguous characters.
392         * The net charge will be computed using the approach stated in
393         * <a href="http://www.innovagen.se/custom-peptide-synthesis/peptide-property-calculator/peptide-property-calculator-notes.asp#PI">here</a>
394         *
395         * pKa values used will be either
396         * those used by Expasy which referenced "Electrophoresis 1994, 15, 529-539"
397         * OR
398         * A.Lehninger, Principles of Biochemistry, 4th Edition (2005), Chapter 3, page78, Table 3-1.
399         *
400         * @param sequence
401         *              a protein sequence consisting of non-ambiguous characters only
402         * @param useExpasyValues
403         *              whether to use Expasy values (Default) or Innovagen values
404         * @param pHPoint
405         *              the pH value to use for computation of the net charge. Default at 7.
406         * @return the net charge of sequence at given pHPoint
407         */
408        public static final double getNetCharge(String sequence, boolean useExpasyValues, double pHPoint){
409                sequence = Utils.checkSequence(sequence);
410                ProteinSequence pSequence = null;
411                try {
412                        pSequence = new ProteinSequence(sequence);
413                } catch (CompoundNotFoundException e) {
414                        // the sequence was checked with Utils.checkSequence, this shouldn't happen
415                        logger.error("The protein sequence contains invalid characters ({}), this should not happen. This is most likely a bug in Utils.checkSequence()", e.getMessage());
416                }
417                IPeptideProperties pp = new PeptidePropertiesImpl();
418                return pp.getNetCharge(pSequence, useExpasyValues, pHPoint);
419        }
420
421        public static final double getNetCharge(String sequence, boolean useExpasyValues) {
422                return getNetCharge(sequence, useExpasyValues, 7.0);
423        }
424
425        public static final double getNetCharge(String sequence){
426                return getNetCharge(sequence, true);
427        }
428
429        /**
430         * An adaptor method to return the composition of specified amino acid in the sequence. The
431         * sequence argument must be a protein sequence consisting of only
432         * non-ambiguous characters. The aminoAcidCode must be a non-ambiguous
433         * character.
434         * The composition of an amino acid is the total number of its occurrence,
435         * divided by the total length of the sequence.
436         *
437         * @param sequence
438         *            a protein sequence consisting of non-ambiguous characters only
439         * @param aminoAcidCode
440         *            the code of the amino acid to compute
441         * @return the composition of specified amino acid in the sequence
442         * @see SingleLetterAACode
443         */
444        public static final double getEnrichment(String sequence, SingleLetterAACode aminoAcidCode) {
445                return getEnrichment(sequence, aminoAcidCode.toString());
446        }
447
448        /**
449         * An adaptor method to return the composition of specified amino acid in the sequence. The
450         * sequence argument must be a protein sequence consisting of only
451         * non-ambiguous characters. The aminoAcidCode must be a non-ambiguous
452         * character.
453         * The composition of an amino acid is the total number of its occurrence,
454         * divided by the total length of the sequence.
455         *
456         * @param sequence
457         *              a protein sequence consisting of non-ambiguous characters only
458         * @param aminoAcidCode
459         *              the code of the amino acid to compute
460         * @return the composition of specified amino acid in the sequence
461         */
462        public static final double getEnrichment(String sequence, char aminoAcidCode){
463                return getEnrichment(sequence, aminoAcidCode);
464        }
465
466        /**
467         * An adaptor method to return the composition of specified amino acid in the sequence. The
468         * sequence argument must be a protein sequence consisting of only
469         * non-ambiguous characters. The aminoAcidCode must be a non-ambiguous
470         * character.
471         * The composition of an amino acid is the total number of its occurrence,
472         * divided by the total length of the sequence.
473         *
474         * @param sequence
475         *            a protein sequence consisting of non-ambiguous characters only
476         * @param aminoAcidCode
477         *            the code of the amino acid to compute
478         * @return the composition of specified amino acid in the sequence
479         */
480        public static final double getEnrichment(String sequence, String aminoAcidCode){
481                sequence = Utils.checkSequence(sequence);
482                ProteinSequence pSequence = null;
483                try {
484                        pSequence = new ProteinSequence(sequence);
485                } catch (CompoundNotFoundException e) {
486                        // the sequence was checked with Utils.checkSequence, this shouldn't happen
487                        logger.error("The protein sequence contains invalid characters ({}), this should not happen. This is most likely a bug in Utils.checkSequence()", e.getMessage());
488                }
489                IPeptideProperties pp = new PeptidePropertiesImpl();
490                AminoAcidCompoundSet aaSet = new AminoAcidCompoundSet();
491                return pp.getEnrichment(pSequence, aaSet.getCompoundForString(aminoAcidCode));
492        }
493
494        /**
495         * An adaptor method to return the composition of the 20 standard amino acid in the sequence.
496         * The sequence argument must be a protein sequence consisting of only
497         * non-ambiguous characters.
498         * The composition of an amino acid is the total number of its occurrence,
499         * divided by the total length of the sequence.
500         *
501         * @param sequence
502         *            a protein sequence consisting of non-ambiguous characters only
503         * @return the composition of the 20 standard amino acid in the sequence
504         * @see AminoAcidCompound
505         */
506        public static final Map<AminoAcidCompound, Double> getAAComposition(String sequence) {
507                sequence = Utils.checkSequence(sequence);
508                ProteinSequence pSequence = null;
509                try {
510                        pSequence = new ProteinSequence(sequence);
511                } catch (CompoundNotFoundException e) {
512                        // the sequence was checked with Utils.checkSequence, this shouldn't happen
513                        logger.error("The protein sequence contains invalid characters ({}), this should not happen. This is most likely a bug in Utils.checkSequence()", e.getMessage());
514                }
515                IPeptideProperties pp = new PeptidePropertiesImpl();
516                return pp.getAAComposition(pSequence);
517        }
518
519        /**
520         * An adaptor method to return the composition of the 20 standard amino acid in the sequence.
521         * The sequence argument must be a protein sequence consisting of only
522         * non-ambiguous characters.
523         * The composition of an amino acid is the total number of its occurrence,
524         * divided by the total length of the sequence.
525         *
526         * @param sequence
527         *              a protein sequence consisting of non-ambiguous characters only
528         * @return the composition of the 20 standard amino acid in the sequence
529         */
530        public static final Map<String, Double> getAACompositionString(String sequence){
531                Map<AminoAcidCompound, Double> aa2Composition = getAAComposition(sequence);
532                Map<String, Double> aaString2Composition = new HashMap<String, Double>();
533                for(AminoAcidCompound aaCompound:aa2Composition.keySet()){
534                        aaString2Composition.put(aaCompound.getShortName(), aa2Composition.get(aaCompound));
535                }
536                return aaString2Composition;
537        }
538
539        /**
540         * An adaptor method to return the composition of the 20 standard amino acid in the sequence.
541         * The sequence argument must be a protein sequence consisting of only
542         * non-ambiguous characters.
543         * The composition of an amino acid is the total number of its occurrence,
544         * divided by the total length of the sequence.
545         *
546         * @param sequence
547         *              a protein sequence consisting of non-ambiguous characters only
548         * @return the composition of the 20 standard amino acid in the sequence
549         */
550        public static final Map<Character, Double> getAACompositionChar(String sequence){
551                Map<AminoAcidCompound, Double> aa2Composition = getAAComposition(sequence);
552                Map<Character, Double> aaChar2Composition = new HashMap<Character, Double>();
553                for(AminoAcidCompound aaCompound:aa2Composition.keySet()){
554                        aaChar2Composition.put(aaCompound.getShortName().charAt(0), aa2Composition.get(aaCompound));
555                }
556                return aaChar2Composition;
557        }
558
559        /**
560         * Returns the array of charges of each amino acid in a protein. At pH=7, two are negative charged: aspartic acid (Asp, D) and glutamic acid (Glu, E) (acidic side chains),
561         * and three are positive charged: lysine (Lys, K), arginine (Arg, R) and histidine (His, H) (basic side chains).
562         *
563         * @param sequence
564         *              a protein sequence consisting of non-ambiguous characters only
565         * @return the array of charges of amino acids in the protein (1 if amino acid is positively charged, -1 if negatively charged, 0 if not charged)
566         */
567        public static final int[] getChargesOfAminoAcids(String sequence) {
568                int[] charges = new int[sequence.length()];
569                for ( int i=0; i < sequence.length(); i++ ) {
570                        char aa = sequence.toCharArray()[i];
571                        charges[i] = AminoAcidProperties.getChargeOfAminoAcid(aa);
572                }
573                return charges;
574        }
575
576        /**
577         * Returns the array of polarity values of each amino acid in a protein sequence.
578         *
579         * @param sequence
580         *              a protein sequence consisting of non-ambiguous characters only
581         * @return the array of polarity of amino acids in the protein (1 if amino acid is polar, 0 if not)
582         */
583        public static final int[] getPolarityOfAminoAcids(String sequence) {
584                int[] polarity = new int[sequence.length()];
585                for ( int i=0; i < sequence.length(); i++ ) {
586                        char aa = sequence.toCharArray()[i];
587                        polarity[i] = AminoAcidProperties.getPolarityOfAminoAcid(aa);
588                }
589                return polarity;
590        }
591}