001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021package org.biojava.nbio.aaproperties;
022
023import org.biojava.nbio.aaproperties.xml.AminoAcidCompositionTable;
024import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
025import org.biojava.nbio.core.sequence.ProteinSequence;
026import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
027import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet;
028import org.slf4j.Logger;
029import org.slf4j.LoggerFactory;
030
031import jakarta.xml.bind.JAXBException;
032import java.io.File;
033import java.io.FileNotFoundException;
034import java.util.Arrays;
035import java.util.HashMap;
036import java.util.HashSet;
037import java.util.Map;
038import java.util.Set;
039import java.util.stream.Collectors;
040
041/**
 * An adaptor class that makes it easy to compute protein properties from plain strings.
 * At least one adaptor method is provided for each property available in IPeptideProperties.
044 *
045 * @author kohchuanhock
046 * @version 2011.08.22
047 * @since 3.0.2
048 * @see IPeptideProperties
049 * @see PeptidePropertiesImpl
050 */
051public class PeptideProperties {
052
053        private final static Logger logger = LoggerFactory.getLogger(PeptideProperties.class);
054
055        /**
056         * Enumeration of 20 standard amino acid code
057         */
058        public enum SingleLetterAACode { W, C, M, H, Y, F, Q, N, I, R, D, P, T, K, E, V, S, G, A, L}
059
060        /**
061         * Contains the 20 standard AA code in a set
062         */
063        public static Set<Character> standardAASet;
064
065        /**
066         * To initialize the standardAASet
067         */
068        static{
069                standardAASet = Arrays.stream(SingleLetterAACode.values())
070                                      .map(singleLetterAACode -> singleLetterAACode.toString().charAt(0))
071                                      .collect(Collectors.toCollection(HashSet::new));
072        }
073
074        /**
075         * An adaptor method to return the molecular weight of sequence.
076         * The sequence argument must be a protein sequence consisting of only non-ambiguous characters.
077         * This method will sum the molecular weight of each amino acid in the
078         * sequence. Molecular weights are based on <a href="http://web.expasy.org/findmod/findmod_masses.html">here</a>.
079         *
080         * @param sequence
081         *      a protein sequence consisting of non-ambiguous characters only
082         * @return the total molecular weight of sequence + weight of water molecule
083         */
084        public static final double getMolecularWeight(String sequence){
085                sequence = Utils.checkSequence(sequence);
086                ProteinSequence pSequence = null;
087                try {
088                        pSequence = new ProteinSequence(sequence);
089                } catch (CompoundNotFoundException e) {
090                        // the sequence was checked with Utils.checkSequence, this shouldn't happen
091                        logger.error("The protein sequence contains invalid characters ({}), this should not happen. This is most likely a bug in Utils.checkSequence()", e.getMessage());
092                }
093                IPeptideProperties pp = new PeptidePropertiesImpl();
094                return pp.getMolecularWeight(pSequence);
095        }
096
097        /**
098         * An adaptor method to return the molecular weight of sequence.
099         * The sequence argument must be a protein sequence consisting of only non-ambiguous characters.
100         * This method will sum the molecular weight of each amino acid in the
101         * sequence. Molecular weights are based on the input xml file.
102         *
103         * @param sequence
104         *      a protein sequence consisting of non-ambiguous characters only
105         * @param elementMassFile
106         *      xml file that details the mass of each elements and isotopes
107         * @param aminoAcidCompositionFile
108         *      xml file that details the composition of amino acids
109         * @return the total molecular weight of sequence + weight of water molecule
110         * @throws FileNotFoundException
111         *      thrown if either elementMassFile or aminoAcidCompositionFile are not found
112         * @throws JAXBException
113         *      thrown if unable to properly parse either elementMassFile or aminoAcidCompositionFile
114         */
115        public static final double getMolecularWeight(String sequence, File elementMassFile, File aminoAcidCompositionFile)
116        throws FileNotFoundException, JAXBException{
117                sequence = Utils.checkSequence(sequence);
118                ProteinSequence pSequence = null;
119                try {
120                        pSequence = new ProteinSequence(sequence);
121                } catch (CompoundNotFoundException e) {
122                        // the sequence was checked with Utils.checkSequence, this shouldn't happen
123                        logger.error("The protein sequence contains invalid characters ({}), this should not happen. This is most likely a bug in Utils.checkSequence()", e.getMessage());
124                }
125                IPeptideProperties pp = new PeptidePropertiesImpl();
126                return pp.getMolecularWeight(pSequence, elementMassFile, aminoAcidCompositionFile);
127        }
128
129        /**
130         * An adaptor method to return the molecular weight of sequence. The sequence argument must be a protein sequence consisting of only non-ambiguous characters.
131         * This method will sum the molecular weight of each amino acid in the
132         * sequence. Molecular weights are based on the input files. These input files must be XML using the defined schema.
133         * Note that it assumes that ElementMass.xml file can be found in default location.
134         *
135         * @param sequence
136         *      a protein sequence consisting of non-ambiguous characters only
137         *      xml file that details the mass of each elements and isotopes
138         * @param aminoAcidCompositionFile
139         *      xml file that details the composition of amino acids
140         * @return the total molecular weight of sequence + weight of water molecule
141         * @throws JAXBException
142         *      thrown if unable to properly parse either elementMassFile or aminoAcidCompositionFile
143         * @throws FileNotFoundException
144         *      thrown if either elementMassFile or aminoAcidCompositionFile are not found
145         */
146        public static final double getMolecularWeight(String sequence, File aminoAcidCompositionFile) throws FileNotFoundException, JAXBException{
147                sequence = Utils.checkSequence(sequence);
148                ProteinSequence pSequence = null;
149                try {
150                        pSequence = new ProteinSequence(sequence);
151                } catch (CompoundNotFoundException e) {
152                        // the sequence was checked with Utils.checkSequence, this shouldn't happen
153                        logger.error("The protein sequence contains invalid characters ({}), this should not happen. This is most likely a bug in Utils.checkSequence()", e.getMessage());
154                }
155                IPeptideProperties pp = new PeptidePropertiesImpl();
156                return pp.getMolecularWeight(pSequence, aminoAcidCompositionFile);
157        }
158
159        /**
160         * An adaptor method would initialize amino acid composition table based on the input xml files and stores the table for usage in future calls to
161         * IPeptideProperties.getMolecularWeightBasedOnXML(ProteinSequence, AminoAcidCompositionTable).
162         * Note that ElementMass.xml is assumed to be able to be seen in default location.
163         *
164         * @param aminoAcidCompositionFile
165         *      xml file that details the composition of amino acids
166         * @return the initialized amino acid composition table
167         * @throws JAXBException
168         *      thrown if unable to properly parse either elementMassFile or aminoAcidCompositionFile
169         * @throws FileNotFoundException
170         *      thrown if either elementMassFile or aminoAcidCompositionFile are not found
171         */
172        public static final AminoAcidCompositionTable obtainAminoAcidCompositionTable(File aminoAcidCompositionFile)
173        throws JAXBException, FileNotFoundException{
174                IPeptideProperties pp = new PeptidePropertiesImpl();
175                return pp.obtainAminoAcidCompositionTable(aminoAcidCompositionFile);
176        }
177
178        /**
179         * An adaptor method would initialize amino acid composition table based on the input xml files and stores the table for usage in future calls to
180         * IPeptideProperties.getMolecularWeightBasedOnXML(ProteinSequence, AminoAcidCompositionTable).
181         *
182         * @param elementMassFile
183         *      xml file that details the mass of each elements and isotopes
184         * @param aminoAcidCompositionFile
185         *      xml file that details the composition of amino acids
186         * @return the initialized amino acid composition table
187         * @throws JAXBException
188         *      thrown if unable to properly parse either elementMassFile or aminoAcidCompositionFile
189         * @throws FileNotFoundException
190         *      thrown if either elementMassFile or aminoAcidCompositionFile are not found
191         */
192        public static final AminoAcidCompositionTable obtainAminoAcidCompositionTable(File elementMassFile, File aminoAcidCompositionFile)
193        throws JAXBException, FileNotFoundException{
194                IPeptideProperties pp = new PeptidePropertiesImpl();
195                return pp.obtainAminoAcidCompositionTable(elementMassFile, aminoAcidCompositionFile);
196        }
197
198        /**
199         * An adaptor method that returns the molecular weight of sequence. The sequence argument must be a protein sequence consisting of only non-ambiguous characters.
200         * This method will sum the molecular weight of each amino acid in the
201         * sequence. Molecular weights are based on the AminoAcidCompositionTable.
202         * Those input files must be XML using the defined schema.
203         *
204         * @param sequence
205         *      a protein sequence consisting of non-ambiguous characters only
206         * @param aminoAcidCompositionTable
207         *      a amino acid composition table obtained by calling IPeptideProperties.obtainAminoAcidCompositionTable
208         * @return the total molecular weight of sequence + weight of water molecule
209         *      thrown if the method IPeptideProperties.setMolecularWeightXML(File, File) is not successfully called before calling this method.
210         */
211        public static double getMolecularWeightBasedOnXML(String sequence, AminoAcidCompositionTable aminoAcidCompositionTable){
212                sequence = Utils.checkSequence(sequence, aminoAcidCompositionTable.getSymbolSet());
213                ProteinSequence pSequence = null;
214                try {
215                        pSequence = new ProteinSequence(sequence, aminoAcidCompositionTable.getAminoAcidCompoundSet());
216                } catch (CompoundNotFoundException e) {
217                        // the sequence was checked with Utils.checkSequence, this shouldn't happen
218                        logger.error("The protein sequence contains invalid characters ({}), this should not happen. This is most likely a bug in Utils.checkSequence()", e.getMessage());
219                }
220                IPeptideProperties pp = new PeptidePropertiesImpl();
221                return pp.getMolecularWeightBasedOnXML(pSequence, aminoAcidCompositionTable);
222        }
223
224        /**
225         * An adaptor method to returns the absorbance (optical density) of sequence. The sequence argument
226         * must be a protein sequence consisting of only non-ambiguous characters.
227         * The computation of absorbance (optical density) follows the
228         * documentation in <a href="http://web.expasy.org/protparam/protparam-doc.html">here</a>.
229         *
230         * @param sequence
231         *      a protein sequence consisting of non-ambiguous characters only
232         * @param assumeCysReduced
233         *      true if Cys are assumed to be reduced and false if Cys are assumed to form cystines
234         * @return the absorbance (optical density) of sequence
235         */
236        public static final double getAbsorbance(String sequence, boolean assumeCysReduced){
237                sequence = Utils.checkSequence(sequence);
238                ProteinSequence pSequence = null;
239                try {
240                        pSequence = new ProteinSequence(sequence);
241                } catch (CompoundNotFoundException e) {
242                        // the sequence was checked with Utils.checkSequence, this shouldn't happen
243                        logger.error("The protein sequence contains invalid characters ({}), this should not happen. This is most likely a bug in Utils.checkSequence()", e.getMessage());
244                }
245                IPeptideProperties pp = new PeptidePropertiesImpl();
246                return pp.getAbsorbance(pSequence, assumeCysReduced);
247        }
248
249        /**
250         * An adaptor method to return the extinction coefficient of sequence. The sequence argument
251         * must be a protein sequence consisting of only non-ambiguous characters.
252         * The extinction coefficient indicates how much light a protein absorbs at
253         * a certain wavelength. It is useful to have an estimation of this
254         * coefficient for following a protein which a spectrophotometer when
255         * purifying it. The computation of extinction coefficient follows the
256         * documentation in <a href="http://web.expasy.org/protparam/protparam-doc.html">here</a>.
257         *
258         * @param sequence
259         *            a protein sequence consisting of non-ambiguous characters only
260         * @param assumeCysReduced
261         *            true if Cys are assumed to be reduced and false if Cys are
262         *            assumed to form cystines
263         * @return the extinction coefficient of sequence
264         */
265        public static final double getExtinctionCoefficient(String sequence, boolean assumeCysReduced) {
266                sequence = Utils.checkSequence(sequence);
267                ProteinSequence pSequence = null;
268                try {
269                        pSequence = new ProteinSequence(sequence);
270                } catch (CompoundNotFoundException e) {
271                        // the sequence was checked with Utils.checkSequence, this shouldn't happen
272                        logger.error("The protein sequence contains invalid characters ({}), this should not happen. This is most likely a bug in Utils.checkSequence()", e.getMessage());
273                }
274                IPeptideProperties pp = new PeptidePropertiesImpl();
275                return pp.getExtinctionCoefficient(pSequence, assumeCysReduced);
276        }
277
278        /**
279         * An adaptor method to return the instability index of sequence. The sequence argument must be
280         * a protein sequence consisting of only non-ambiguous characters.
281         * The instability index provides an estimate of the stability of your
282         * protein in a test tube. The computation of instability index follows the
283         * documentation in <a href="http://web.expasy.org/protparam/protparam-doc.html">here</a>.
284         *
285         * @param sequence
286         *              a protein sequence consisting of non-ambiguous characters only
287         * @return the instability index of sequence
288         */
289        public static final double getInstabilityIndex(String sequence) {
290                sequence = Utils.checkSequence(sequence);
291                ProteinSequence pSequence = null;
292                try {
293                        pSequence = new ProteinSequence(sequence);
294                } catch (CompoundNotFoundException e) {
295                        // the sequence was checked with Utils.checkSequence, this shouldn't happen
296                        logger.error("The protein sequence contains invalid characters ({}), this should not happen. This is most likely a bug in Utils.checkSequence()", e.getMessage());
297                }
298                IPeptideProperties pp = new PeptidePropertiesImpl();
299                return pp.getInstabilityIndex(pSequence);
300        }
301
302        /**
303         * An adaptor method to return the apliphatic index of sequence. The sequence argument must be a
304         * protein sequence consisting of only non-ambiguous characters.
305         * The aliphatic index of a protein is defined as the relative volume
306         * occupied by aliphatic side chains (alanine, valine, isoleucine, and
307         * leucine). It may be regarded as a positive factor for the increase of
308         * thermostability of globular proteins. The computation of aliphatic index
309         * follows the documentation in <a href="http://web.expasy.org/protparam/protparam-doc.html">here</a>.
310         * A protein whose instability index is smaller than 40 is predicted as stable, a value above 40 predicts that the protein may be unstable.
311         *
312         * @param sequence
313         *              a protein sequence consisting of non-ambiguous characters only
314         * @return the aliphatic index of sequence
315         */
316        public static final double getApliphaticIndex(String sequence) {
317                sequence = Utils.checkSequence(sequence);
318                ProteinSequence pSequence = null;
319                try {
320                        pSequence = new ProteinSequence(sequence);
321                } catch (CompoundNotFoundException e) {
322                        // the sequence was checked with Utils.checkSequence, this shouldn't happen
323                        logger.error("The protein sequence contains invalid characters ({}), this should not happen. This is most likely a bug in Utils.checkSequence()", e.getMessage());
324                }
325
326                IPeptideProperties pp = new PeptidePropertiesImpl();
327                return pp.getApliphaticIndex(pSequence);
328        }
329
330        /**
331         * An adaptor method to return the average hydropathy value of sequence. The sequence argument
332         * must be a protein sequence consisting of only non-ambiguous characters.
333         * The average value for a sequence is calculated as the sum of hydropathy
334         * values of all the amino acids, divided by the number of residues in the
335         * sequence. Hydropathy values are based on (Kyte, J. and Doolittle, R.F.
336         * (1982) A simple method for displaying the hydropathic character of a
337         * protein. J. Mol. Biol. 157, 105-132).
338         *
339         * @param sequence
340         *              a protein sequence consisting of non-ambiguous characters only
341         * @return the average hydropathy value of sequence
342         */
343        public static final double getAvgHydropathy(String sequence) {
344                sequence = Utils.checkSequence(sequence);
345                ProteinSequence pSequence = null;
346                try {
347                        pSequence = new ProteinSequence(sequence);
348                } catch (CompoundNotFoundException e) {
349                        // the sequence was checked with Utils.checkSequence, this shouldn't happen
350                        logger.error("The protein sequence contains invalid characters ({}), this should not happen. This is most likely a bug in Utils.checkSequence()", e.getMessage());
351                }
352                IPeptideProperties pp = new PeptidePropertiesImpl();
353                return pp.getAvgHydropathy(pSequence);
354        }
355
356        /**
357         * An adaptor method to return the isoelectric point of sequence. The sequence argument must be
358         * a protein sequence consisting of only non-ambiguous characters.
359         * The isoelectric point is the pH at which the protein carries no net
360         * electrical charge. The isoelectric point will be computed based on
361         * approach stated in
362         * <a href="http://www.innovagen.se/custom-peptide-synthesis/peptide-property-calculator/peptide-property-calculator-notes.asp#PI">here</a>
363         *
364         * pKa values used will be either
365         * those used by Expasy which referenced "Electrophoresis 1994, 15, 529-539"
366         * OR
367         * A.Lehninger, Principles of Biochemistry, 4th Edition (2005), Chapter 3, page78, Table 3-1.
368         *
369         * @param sequence
370         *              a protein sequence consisting of non-ambiguous characters only
371         * @param useExpasyValues
372         *              whether to use Expasy values (Default) or Innovagen values
373         * @return the isoelectric point of sequence
374         */
375        public static final double getIsoelectricPoint(String sequence, boolean useExpasyValues) {
376                sequence = Utils.checkSequence(sequence);
377                ProteinSequence pSequence = null;
378                try {
379                        pSequence = new ProteinSequence(sequence);
380                } catch (CompoundNotFoundException e) {
381                        // the sequence was checked with Utils.checkSequence, this shouldn't happen
382                        logger.error("The protein sequence contains invalid characters ({}), this should not happen. This is most likely a bug in Utils.checkSequence()", e.getMessage());
383                }
384                IPeptideProperties pp = new PeptidePropertiesImpl();
385                return pp.getIsoelectricPoint(pSequence, useExpasyValues);
386        }
387
388        public static final double getIsoelectricPoint(String sequence){
389                return getIsoelectricPoint(sequence, true);
390        }
391
392        /**
393         * An adaptor method to return the net charge of sequence at pH 7. The sequence argument must be
394         * a protein sequence consisting of only non-ambiguous characters.
395         * The net charge will be computed using the approach stated in
396         * <a href="http://www.innovagen.se/custom-peptide-synthesis/peptide-property-calculator/peptide-property-calculator-notes.asp#PI">here</a>
397         *
398         * pKa values used will be either
399         * those used by Expasy which referenced "Electrophoresis 1994, 15, 529-539"
400         * OR
401         * A.Lehninger, Principles of Biochemistry, 4th Edition (2005), Chapter 3, page78, Table 3-1.
402         *
403         * @param sequence
404         *              a protein sequence consisting of non-ambiguous characters only
405         * @param useExpasyValues
406         *              whether to use Expasy values (Default) or Innovagen values
407         * @param pHPoint
408         *              the pH value to use for computation of the net charge. Default at 7.
409         * @return the net charge of sequence at given pHPoint
410         */
411        public static final double getNetCharge(String sequence, boolean useExpasyValues, double pHPoint){
412                sequence = Utils.checkSequence(sequence);
413                ProteinSequence pSequence = null;
414                try {
415                        pSequence = new ProteinSequence(sequence);
416                } catch (CompoundNotFoundException e) {
417                        // the sequence was checked with Utils.checkSequence, this shouldn't happen
418                        logger.error("The protein sequence contains invalid characters ({}), this should not happen. This is most likely a bug in Utils.checkSequence()", e.getMessage());
419                }
420                IPeptideProperties pp = new PeptidePropertiesImpl();
421                return pp.getNetCharge(pSequence, useExpasyValues, pHPoint);
422        }
423
424        public static final double getNetCharge(String sequence, boolean useExpasyValues) {
425                return getNetCharge(sequence, useExpasyValues, 7.0);
426        }
427
428        public static final double getNetCharge(String sequence){
429                return getNetCharge(sequence, true);
430        }
431
432        /**
433         * An adaptor method to return the composition of specified amino acid in the sequence. The
434         * sequence argument must be a protein sequence consisting of only
435         * non-ambiguous characters. The aminoAcidCode must be a non-ambiguous
436         * character.
437         * The composition of an amino acid is the total number of its occurrence,
438         * divided by the total length of the sequence.
439         *
440         * @param sequence
441         *            a protein sequence consisting of non-ambiguous characters only
442         * @param aminoAcidCode
443         *            the code of the amino acid to compute
444         * @return the composition of specified amino acid in the sequence
445         * @see SingleLetterAACode
446         */
447        public static final double getEnrichment(String sequence, SingleLetterAACode aminoAcidCode) {
448                return getEnrichment(sequence, aminoAcidCode.toString());
449        }
450
451        /**
452         * An adaptor method to return the composition of specified amino acid in the sequence. The
453         * sequence argument must be a protein sequence consisting of only
454         * non-ambiguous characters. The aminoAcidCode must be a non-ambiguous
455         * character.
456         * The composition of an amino acid is the total number of its occurrence,
457         * divided by the total length of the sequence.
458         *
459         * @param sequence
460         *              a protein sequence consisting of non-ambiguous characters only
461         * @param aminoAcidCode
462         *              the code of the amino acid to compute
463         * @return the composition of specified amino acid in the sequence
464         */
465        public static final double getEnrichment(String sequence, char aminoAcidCode){
466                return getEnrichment(sequence, aminoAcidCode);
467        }
468
469        /**
470         * An adaptor method to return the composition of specified amino acid in the sequence. The
471         * sequence argument must be a protein sequence consisting of only
472         * non-ambiguous characters. The aminoAcidCode must be a non-ambiguous
473         * character.
474         * The composition of an amino acid is the total number of its occurrence,
475         * divided by the total length of the sequence.
476         *
477         * @param sequence
478         *            a protein sequence consisting of non-ambiguous characters only
479         * @param aminoAcidCode
480         *            the code of the amino acid to compute
481         * @return the composition of specified amino acid in the sequence
482         */
483        public static final double getEnrichment(String sequence, String aminoAcidCode){
484                sequence = Utils.checkSequence(sequence);
485                ProteinSequence pSequence = null;
486                try {
487                        pSequence = new ProteinSequence(sequence);
488                } catch (CompoundNotFoundException e) {
489                        // the sequence was checked with Utils.checkSequence, this shouldn't happen
490                        logger.error("The protein sequence contains invalid characters ({}), this should not happen. This is most likely a bug in Utils.checkSequence()", e.getMessage());
491                }
492                IPeptideProperties pp = new PeptidePropertiesImpl();
493                AminoAcidCompoundSet aaSet = new AminoAcidCompoundSet();
494                return pp.getEnrichment(pSequence, aaSet.getCompoundForString(aminoAcidCode));
495        }
496
497        /**
498         * An adaptor method to return the composition of the 20 standard amino acid in the sequence.
499         * The sequence argument must be a protein sequence consisting of only
500         * non-ambiguous characters.
501         * The composition of an amino acid is the total number of its occurrence,
502         * divided by the total length of the sequence.
503         *
504         * @param sequence
505         *            a protein sequence consisting of non-ambiguous characters only
506         * @return the composition of the 20 standard amino acid in the sequence
507         * @see AminoAcidCompound
508         */
509        public static final Map<AminoAcidCompound, Double> getAAComposition(String sequence) {
510                sequence = Utils.checkSequence(sequence);
511                ProteinSequence pSequence = null;
512                try {
513                        pSequence = new ProteinSequence(sequence);
514                } catch (CompoundNotFoundException e) {
515                        // the sequence was checked with Utils.checkSequence, this shouldn't happen
516                        logger.error("The protein sequence contains invalid characters ({}), this should not happen. This is most likely a bug in Utils.checkSequence()", e.getMessage());
517                }
518                IPeptideProperties pp = new PeptidePropertiesImpl();
519                return pp.getAAComposition(pSequence);
520        }
521
522        /**
523         * An adaptor method to return the composition of the 20 standard amino acid in the sequence.
524         * The sequence argument must be a protein sequence consisting of only
525         * non-ambiguous characters.
526         * The composition of an amino acid is the total number of its occurrence,
527         * divided by the total length of the sequence.
528         *
529         * @param sequence
530         *              a protein sequence consisting of non-ambiguous characters only
531         * @return the composition of the 20 standard amino acid in the sequence
532         */
533        public static final Map<String, Double> getAACompositionString(String sequence){
534                Map<AminoAcidCompound, Double> aa2Composition = getAAComposition(sequence);
535                Map<String, Double> aaString2Composition = new HashMap<>();
536                aaString2Composition = aa2Composition.keySet().stream() .collect(Collectors.toMap(aaCompound -> aaCompound.getShortName(),aaCompound ->aa2Composition.get(aaCompound)));
537                return aaString2Composition;
538        }
539
540        /**
         * An adaptor method to return the composition of the 20 standard amino acids in the sequence,
         * keyed by the first character of each amino acid's short name.
         * The sequence argument must be a protein sequence consisting of only
         * non-ambiguous characters.
         * The composition of an amino acid is the total number of its occurrences,
         * divided by the total length of the sequence.
546         *
547         * @param sequence
548         *              a protein sequence consisting of non-ambiguous characters only
549         * @return the composition of the 20 standard amino acid in the sequence
550         */
551        public static final Map<Character, Double> getAACompositionChar(String sequence){
552                Map<AminoAcidCompound, Double> aa2Composition = getAAComposition(sequence);
553                Map<Character, Double> aaChar2Composition = new HashMap<>();
554                for(AminoAcidCompound aaCompound:aa2Composition.keySet()){
555                        aaChar2Composition.put(aaCompound.getShortName().charAt(0), aa2Composition.get(aaCompound));
556                }
557                return aaChar2Composition;
558        }
559
560        /**
         * Returns the array of charges of each amino acid in a protein. At pH=7, two are negatively charged: aspartic acid (Asp, D) and glutamic acid (Glu, E) (acidic side chains),
         * and three are positively charged: lysine (Lys, K), arginine (Arg, R) and histidine (His, H) (basic side chains).
563         *
564         * @param sequence
565         *              a protein sequence consisting of non-ambiguous characters only
566         * @return the array of charges of amino acids in the protein (1 if amino acid is positively charged, -1 if negatively charged, 0 if not charged)
567         */
568        public static final int[] getChargesOfAminoAcids(String sequence) {
569                int[] charges = new int[sequence.length()];
570                for ( int i=0; i < sequence.length(); i++ ) {
571                        char aa = sequence.toCharArray()[i];
572                        charges[i] = AminoAcidProperties.getChargeOfAminoAcid(aa);
573                }
574                return charges;
575        }
576
577        /**
578         * Returns the array of polarity values of each amino acid in a protein sequence.
579         *
580         * @param sequence
581         *              a protein sequence consisting of non-ambiguous characters only
582         * @return the array of polarity of amino acids in the protein (1 if amino acid is polar, 0 if not)
583         */
584        public static final int[] getPolarityOfAminoAcids(String sequence) {
585                int[] polarity = new int[sequence.length()];
586                for ( int i=0; i < sequence.length(); i++ ) {
587                        char aa = sequence.toCharArray()[i];
588                        polarity[i] = AminoAcidProperties.getPolarityOfAminoAcid(aa);
589                }
590                return polarity;
591        }
592
593        /**
594         * An adaptor method to return the aromaticity value of sequence. The sequence argument
595         * must be a protein sequence consisting of only non-ambiguous characters.
596         * <p>
597         * Calculates the aromaticity value of a protein according to Lobry, 1994.
598         * It is simply the relative frequency of Phe+Trp+Tyr.
         *
601         * @param sequence a protein sequence consisting of non-ambiguous characters only
602         * @return the aromaticity value of sequence
603         */
604        public static final double getAromaticity(String sequence) {
605                sequence = Utils.checkSequence(sequence);
606                ProteinSequence pSequence = null;
607                try {
608                        pSequence = new ProteinSequence(sequence);
609                } catch (CompoundNotFoundException e) {
610                        // the sequence was checked with Utils.checkSequence, this shouldn't happen
611                        logger.error("The protein sequence contains invalid characters ({}), this should not happen. This is most likely a bug in Utils.checkSequence()", e.getMessage());
612                }
613                IPeptideProperties pp = new PeptidePropertiesImpl();
614                return pp.getAromaticity(pSequence);
615        }
616}