001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021package org.biojava.nbio.aaproperties;
022
023import org.biojava.nbio.aaproperties.xml.AminoAcidCompositionTable;
024import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
025import org.biojava.nbio.core.sequence.ProteinSequence;
026import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
027import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet;
028import org.slf4j.Logger;
029import org.slf4j.LoggerFactory;
030
031import javax.xml.bind.JAXBException;
032import java.io.File;
033import java.io.FileNotFoundException;
034import java.util.Arrays;
035import java.util.HashMap;
036import java.util.HashSet;
037import java.util.Map;
038import java.util.Set;
039import java.util.stream.Collectors;
040import java.util.stream.Stream;
041
042/**
 * This is an adaptor class that makes it easy to compute protein properties.
 * At least one adaptor method is provided for each property available in IPeptideProperties.
045 *
046 * @author kohchuanhock
047 * @version 2011.08.22
048 * @since 3.0.2
049 * @see IPeptideProperties
050 * @see PeptidePropertiesImpl
051 */
052public class PeptideProperties {
053
054        private final static Logger logger = LoggerFactory.getLogger(PeptideProperties.class);
055
056        /**
057         * Enumeration of 20 standard amino acid code
058         */
059        public enum SingleLetterAACode { W, C, M, H, Y, F, Q, N, I, R, D, P, T, K, E, V, S, G, A, L}
060
061        /**
062         * Contains the 20 standard AA code in a set
063         */
064        public static Set<Character> standardAASet;
065
066        /**
067         * To initialize the standardAASet
068         */
069        static{
070                standardAASet = Arrays.stream(SingleLetterAACode.values())
071                                      .map(singleLetterAACode -> singleLetterAACode.toString().charAt(0))
072                                      .collect(Collectors.toCollection(HashSet::new));
073        }
074
075        /**
076         * An adaptor method to return the molecular weight of sequence.
077         * The sequence argument must be a protein sequence consisting of only non-ambiguous characters.
078         * This method will sum the molecular weight of each amino acid in the
079         * sequence. Molecular weights are based on <a href="http://web.expasy.org/findmod/findmod_masses.html">here</a>.
080         *
081         * @param sequence
082         *      a protein sequence consisting of non-ambiguous characters only
083         * @return the total molecular weight of sequence + weight of water molecule
084         */
085        public static final double getMolecularWeight(String sequence){
086                sequence = Utils.checkSequence(sequence);
087                ProteinSequence pSequence = null;
088                try {
089                        pSequence = new ProteinSequence(sequence);
090                } catch (CompoundNotFoundException e) {
091                        // the sequence was checked with Utils.checkSequence, this shouldn't happen
092                        logger.error("The protein sequence contains invalid characters ({}), this should not happen. This is most likely a bug in Utils.checkSequence()", e.getMessage());
093                }
094                IPeptideProperties pp = new PeptidePropertiesImpl();
095                return pp.getMolecularWeight(pSequence);
096        }
097
098        /**
099         * An adaptor method to return the molecular weight of sequence.
100         * The sequence argument must be a protein sequence consisting of only non-ambiguous characters.
101         * This method will sum the molecular weight of each amino acid in the
102         * sequence. Molecular weights are based on the input xml file.
103         *
104         * @param sequence
105         *      a protein sequence consisting of non-ambiguous characters only
106         * @param elementMassFile
107         *      xml file that details the mass of each elements and isotopes
108         * @param aminoAcidCompositionFile
109         *      xml file that details the composition of amino acids
110         * @return the total molecular weight of sequence + weight of water molecule
111         * @throws FileNotFoundException
112         *      thrown if either elementMassFile or aminoAcidCompositionFile are not found
113         * @throws JAXBException
114         *      thrown if unable to properly parse either elementMassFile or aminoAcidCompositionFile
115         */
116        public static final double getMolecularWeight(String sequence, File elementMassFile, File aminoAcidCompositionFile)
117        throws FileNotFoundException, JAXBException{
118                sequence = Utils.checkSequence(sequence);
119                ProteinSequence pSequence = null;
120                try {
121                        pSequence = new ProteinSequence(sequence);
122                } catch (CompoundNotFoundException e) {
123                        // the sequence was checked with Utils.checkSequence, this shouldn't happen
124                        logger.error("The protein sequence contains invalid characters ({}), this should not happen. This is most likely a bug in Utils.checkSequence()", e.getMessage());
125                }
126                IPeptideProperties pp = new PeptidePropertiesImpl();
127                return pp.getMolecularWeight(pSequence, elementMassFile, aminoAcidCompositionFile);
128        }
129
130        /**
131         * An adaptor method to return the molecular weight of sequence. The sequence argument must be a protein sequence consisting of only non-ambiguous characters.
132         * This method will sum the molecular weight of each amino acid in the
133         * sequence. Molecular weights are based on the input files. These input files must be XML using the defined schema.
134         * Note that it assumes that ElementMass.xml file can be found in default location.
135         *
136         * @param sequence
137         *      a protein sequence consisting of non-ambiguous characters only
138         *      xml file that details the mass of each elements and isotopes
139         * @param aminoAcidCompositionFile
140         *      xml file that details the composition of amino acids
141         * @return the total molecular weight of sequence + weight of water molecule
142         * @throws JAXBException
143         *      thrown if unable to properly parse either elementMassFile or aminoAcidCompositionFile
144         * @throws FileNotFoundException
145         *      thrown if either elementMassFile or aminoAcidCompositionFile are not found
146         */
147        public static final double getMolecularWeight(String sequence, File aminoAcidCompositionFile) throws FileNotFoundException, JAXBException{
148                sequence = Utils.checkSequence(sequence);
149                ProteinSequence pSequence = null;
150                try {
151                        pSequence = new ProteinSequence(sequence);
152                } catch (CompoundNotFoundException e) {
153                        // the sequence was checked with Utils.checkSequence, this shouldn't happen
154                        logger.error("The protein sequence contains invalid characters ({}), this should not happen. This is most likely a bug in Utils.checkSequence()", e.getMessage());
155                }
156                IPeptideProperties pp = new PeptidePropertiesImpl();
157                return pp.getMolecularWeight(pSequence, aminoAcidCompositionFile);
158        }
159
160        /**
161         * An adaptor method would initialize amino acid composition table based on the input xml files and stores the table for usage in future calls to
162         * IPeptideProperties.getMolecularWeightBasedOnXML(ProteinSequence, AminoAcidCompositionTable).
163         * Note that ElementMass.xml is assumed to be able to be seen in default location.
164         *
165         * @param aminoAcidCompositionFile
166         *      xml file that details the composition of amino acids
167         * @return the initialized amino acid composition table
168         * @throws JAXBException
169         *      thrown if unable to properly parse either elementMassFile or aminoAcidCompositionFile
170         * @throws FileNotFoundException
171         *      thrown if either elementMassFile or aminoAcidCompositionFile are not found
172         */
173        public static final AminoAcidCompositionTable obtainAminoAcidCompositionTable(File aminoAcidCompositionFile)
174        throws JAXBException, FileNotFoundException{
175                IPeptideProperties pp = new PeptidePropertiesImpl();
176                return pp.obtainAminoAcidCompositionTable(aminoAcidCompositionFile);
177        }
178
179        /**
180         * An adaptor method would initialize amino acid composition table based on the input xml files and stores the table for usage in future calls to
181         * IPeptideProperties.getMolecularWeightBasedOnXML(ProteinSequence, AminoAcidCompositionTable).
182         *
183         * @param elementMassFile
184         *      xml file that details the mass of each elements and isotopes
185         * @param aminoAcidCompositionFile
186         *      xml file that details the composition of amino acids
187         * @return the initialized amino acid composition table
188         * @throws JAXBException
189         *      thrown if unable to properly parse either elementMassFile or aminoAcidCompositionFile
190         * @throws FileNotFoundException
191         *      thrown if either elementMassFile or aminoAcidCompositionFile are not found
192         */
193        public static final AminoAcidCompositionTable obtainAminoAcidCompositionTable(File elementMassFile, File aminoAcidCompositionFile)
194        throws JAXBException, FileNotFoundException{
195                IPeptideProperties pp = new PeptidePropertiesImpl();
196                return pp.obtainAminoAcidCompositionTable(elementMassFile, aminoAcidCompositionFile);
197        }
198
199        /**
200         * An adaptor method that returns the molecular weight of sequence. The sequence argument must be a protein sequence consisting of only non-ambiguous characters.
201         * This method will sum the molecular weight of each amino acid in the
202         * sequence. Molecular weights are based on the AminoAcidCompositionTable.
203         * Those input files must be XML using the defined schema.
204         *
205         * @param sequence
206         *      a protein sequence consisting of non-ambiguous characters only
207         * @param aminoAcidCompositionTable
208         *      a amino acid composition table obtained by calling IPeptideProperties.obtainAminoAcidCompositionTable
209         * @return the total molecular weight of sequence + weight of water molecule
210         *      thrown if the method IPeptideProperties.setMolecularWeightXML(File, File) is not successfully called before calling this method.
211         */
212        public static double getMolecularWeightBasedOnXML(String sequence, AminoAcidCompositionTable aminoAcidCompositionTable){
213                sequence = Utils.checkSequence(sequence, aminoAcidCompositionTable.getSymbolSet());
214                ProteinSequence pSequence = null;
215                try {
216                        pSequence = new ProteinSequence(sequence, aminoAcidCompositionTable.getAminoAcidCompoundSet());
217                } catch (CompoundNotFoundException e) {
218                        // the sequence was checked with Utils.checkSequence, this shouldn't happen
219                        logger.error("The protein sequence contains invalid characters ({}), this should not happen. This is most likely a bug in Utils.checkSequence()", e.getMessage());
220                }
221                IPeptideProperties pp = new PeptidePropertiesImpl();
222                return pp.getMolecularWeightBasedOnXML(pSequence, aminoAcidCompositionTable);
223        }
224
225        /**
226         * An adaptor method to returns the absorbance (optical density) of sequence. The sequence argument
227         * must be a protein sequence consisting of only non-ambiguous characters.
228         * The computation of absorbance (optical density) follows the
229         * documentation in <a href="http://web.expasy.org/protparam/protparam-doc.html">here</a>.
230         *
231         * @param sequence
232         *      a protein sequence consisting of non-ambiguous characters only
233         * @param assumeCysReduced
234         *      true if Cys are assumed to be reduced and false if Cys are assumed to form cystines
235         * @return the absorbance (optical density) of sequence
236         */
237        public static final double getAbsorbance(String sequence, boolean assumeCysReduced){
238                sequence = Utils.checkSequence(sequence);
239                ProteinSequence pSequence = null;
240                try {
241                        pSequence = new ProteinSequence(sequence);
242                } catch (CompoundNotFoundException e) {
243                        // the sequence was checked with Utils.checkSequence, this shouldn't happen
244                        logger.error("The protein sequence contains invalid characters ({}), this should not happen. This is most likely a bug in Utils.checkSequence()", e.getMessage());
245                }
246                IPeptideProperties pp = new PeptidePropertiesImpl();
247                return pp.getAbsorbance(pSequence, assumeCysReduced);
248        }
249
250        /**
251         * An adaptor method to return the extinction coefficient of sequence. The sequence argument
252         * must be a protein sequence consisting of only non-ambiguous characters.
253         * The extinction coefficient indicates how much light a protein absorbs at
254         * a certain wavelength. It is useful to have an estimation of this
255         * coefficient for following a protein which a spectrophotometer when
256         * purifying it. The computation of extinction coefficient follows the
257         * documentation in <a href="http://web.expasy.org/protparam/protparam-doc.html">here</a>.
258         *
259         * @param sequence
260         *            a protein sequence consisting of non-ambiguous characters only
261         * @param assumeCysReduced
262         *            true if Cys are assumed to be reduced and false if Cys are
263         *            assumed to form cystines
264         * @return the extinction coefficient of sequence
265         */
266        public static final double getExtinctionCoefficient(String sequence, boolean assumeCysReduced) {
267                sequence = Utils.checkSequence(sequence);
268                ProteinSequence pSequence = null;
269                try {
270                        pSequence = new ProteinSequence(sequence);
271                } catch (CompoundNotFoundException e) {
272                        // the sequence was checked with Utils.checkSequence, this shouldn't happen
273                        logger.error("The protein sequence contains invalid characters ({}), this should not happen. This is most likely a bug in Utils.checkSequence()", e.getMessage());
274                }
275                IPeptideProperties pp = new PeptidePropertiesImpl();
276                return pp.getExtinctionCoefficient(pSequence, assumeCysReduced);
277        }
278
279        /**
280         * An adaptor method to return the instability index of sequence. The sequence argument must be
281         * a protein sequence consisting of only non-ambiguous characters.
282         * The instability index provides an estimate of the stability of your
283         * protein in a test tube. The computation of instability index follows the
284         * documentation in <a href="http://web.expasy.org/protparam/protparam-doc.html">here</a>.
285         *
286         * @param sequence
287         *              a protein sequence consisting of non-ambiguous characters only
288         * @return the instability index of sequence
289         */
290        public static final double getInstabilityIndex(String sequence) {
291                sequence = Utils.checkSequence(sequence);
292                ProteinSequence pSequence = null;
293                try {
294                        pSequence = new ProteinSequence(sequence);
295                } catch (CompoundNotFoundException e) {
296                        // the sequence was checked with Utils.checkSequence, this shouldn't happen
297                        logger.error("The protein sequence contains invalid characters ({}), this should not happen. This is most likely a bug in Utils.checkSequence()", e.getMessage());
298                }
299                IPeptideProperties pp = new PeptidePropertiesImpl();
300                return pp.getInstabilityIndex(pSequence);
301        }
302
303        /**
304         * An adaptor method to return the apliphatic index of sequence. The sequence argument must be a
305         * protein sequence consisting of only non-ambiguous characters.
306         * The aliphatic index of a protein is defined as the relative volume
307         * occupied by aliphatic side chains (alanine, valine, isoleucine, and
308         * leucine). It may be regarded as a positive factor for the increase of
309         * thermostability of globular proteins. The computation of aliphatic index
310         * follows the documentation in <a href="http://web.expasy.org/protparam/protparam-doc.html">here</a>.
311         * A protein whose instability index is smaller than 40 is predicted as stable, a value above 40 predicts that the protein may be unstable.
312         *
313         * @param sequence
314         *              a protein sequence consisting of non-ambiguous characters only
315         * @return the aliphatic index of sequence
316         */
317        public static final double getApliphaticIndex(String sequence) {
318                sequence = Utils.checkSequence(sequence);
319                ProteinSequence pSequence = null;
320                try {
321                        pSequence = new ProteinSequence(sequence);
322                } catch (CompoundNotFoundException e) {
323                        // the sequence was checked with Utils.checkSequence, this shouldn't happen
324                        logger.error("The protein sequence contains invalid characters ({}), this should not happen. This is most likely a bug in Utils.checkSequence()", e.getMessage());
325                }
326
327                IPeptideProperties pp = new PeptidePropertiesImpl();
328                return pp.getApliphaticIndex(pSequence);
329        }
330
331        /**
332         * An adaptor method to return the average hydropathy value of sequence. The sequence argument
333         * must be a protein sequence consisting of only non-ambiguous characters.
334         * The average value for a sequence is calculated as the sum of hydropathy
335         * values of all the amino acids, divided by the number of residues in the
336         * sequence. Hydropathy values are based on (Kyte, J. and Doolittle, R.F.
337         * (1982) A simple method for displaying the hydropathic character of a
338         * protein. J. Mol. Biol. 157, 105-132).
339         *
340         * @param sequence
341         *              a protein sequence consisting of non-ambiguous characters only
342         * @return the average hydropathy value of sequence
343         */
344        public static final double getAvgHydropathy(String sequence) {
345                sequence = Utils.checkSequence(sequence);
346                ProteinSequence pSequence = null;
347                try {
348                        pSequence = new ProteinSequence(sequence);
349                } catch (CompoundNotFoundException e) {
350                        // the sequence was checked with Utils.checkSequence, this shouldn't happen
351                        logger.error("The protein sequence contains invalid characters ({}), this should not happen. This is most likely a bug in Utils.checkSequence()", e.getMessage());
352                }
353                IPeptideProperties pp = new PeptidePropertiesImpl();
354                return pp.getAvgHydropathy(pSequence);
355        }
356
357        /**
358         * An adaptor method to return the isoelectric point of sequence. The sequence argument must be
359         * a protein sequence consisting of only non-ambiguous characters.
360         * The isoelectric point is the pH at which the protein carries no net
361         * electrical charge. The isoelectric point will be computed based on
362         * approach stated in
363         * <a href="http://www.innovagen.se/custom-peptide-synthesis/peptide-property-calculator/peptide-property-calculator-notes.asp#PI">here</a>
364         *
365         * pKa values used will be either
366         * those used by Expasy which referenced "Electrophoresis 1994, 15, 529-539"
367         * OR
368         * A.Lehninger, Principles of Biochemistry, 4th Edition (2005), Chapter 3, page78, Table 3-1.
369         *
370         * @param sequence
371         *              a protein sequence consisting of non-ambiguous characters only
372         * @param useExpasyValues
373         *              whether to use Expasy values (Default) or Innovagen values
374         * @return the isoelectric point of sequence
375         */
376        public static final double getIsoelectricPoint(String sequence, boolean useExpasyValues) {
377                sequence = Utils.checkSequence(sequence);
378                ProteinSequence pSequence = null;
379                try {
380                        pSequence = new ProteinSequence(sequence);
381                } catch (CompoundNotFoundException e) {
382                        // the sequence was checked with Utils.checkSequence, this shouldn't happen
383                        logger.error("The protein sequence contains invalid characters ({}), this should not happen. This is most likely a bug in Utils.checkSequence()", e.getMessage());
384                }
385                IPeptideProperties pp = new PeptidePropertiesImpl();
386                return pp.getIsoelectricPoint(pSequence, useExpasyValues);
387        }
388
389        public static final double getIsoelectricPoint(String sequence){
390                return getIsoelectricPoint(sequence, true);
391        }
392
393        /**
394         * An adaptor method to return the net charge of sequence at pH 7. The sequence argument must be
395         * a protein sequence consisting of only non-ambiguous characters.
396         * The net charge will be computed using the approach stated in
397         * <a href="http://www.innovagen.se/custom-peptide-synthesis/peptide-property-calculator/peptide-property-calculator-notes.asp#PI">here</a>
398         *
399         * pKa values used will be either
400         * those used by Expasy which referenced "Electrophoresis 1994, 15, 529-539"
401         * OR
402         * A.Lehninger, Principles of Biochemistry, 4th Edition (2005), Chapter 3, page78, Table 3-1.
403         *
404         * @param sequence
405         *              a protein sequence consisting of non-ambiguous characters only
406         * @param useExpasyValues
407         *              whether to use Expasy values (Default) or Innovagen values
408         * @param pHPoint
409         *              the pH value to use for computation of the net charge. Default at 7.
410         * @return the net charge of sequence at given pHPoint
411         */
412        public static final double getNetCharge(String sequence, boolean useExpasyValues, double pHPoint){
413                sequence = Utils.checkSequence(sequence);
414                ProteinSequence pSequence = null;
415                try {
416                        pSequence = new ProteinSequence(sequence);
417                } catch (CompoundNotFoundException e) {
418                        // the sequence was checked with Utils.checkSequence, this shouldn't happen
419                        logger.error("The protein sequence contains invalid characters ({}), this should not happen. This is most likely a bug in Utils.checkSequence()", e.getMessage());
420                }
421                IPeptideProperties pp = new PeptidePropertiesImpl();
422                return pp.getNetCharge(pSequence, useExpasyValues, pHPoint);
423        }
424
425        public static final double getNetCharge(String sequence, boolean useExpasyValues) {
426                return getNetCharge(sequence, useExpasyValues, 7.0);
427        }
428
429        public static final double getNetCharge(String sequence){
430                return getNetCharge(sequence, true);
431        }
432
433        /**
434         * An adaptor method to return the composition of specified amino acid in the sequence. The
435         * sequence argument must be a protein sequence consisting of only
436         * non-ambiguous characters. The aminoAcidCode must be a non-ambiguous
437         * character.
438         * The composition of an amino acid is the total number of its occurrence,
439         * divided by the total length of the sequence.
440         *
441         * @param sequence
442         *            a protein sequence consisting of non-ambiguous characters only
443         * @param aminoAcidCode
444         *            the code of the amino acid to compute
445         * @return the composition of specified amino acid in the sequence
446         * @see SingleLetterAACode
447         */
448        public static final double getEnrichment(String sequence, SingleLetterAACode aminoAcidCode) {
449                return getEnrichment(sequence, aminoAcidCode.toString());
450        }
451
452        /**
453         * An adaptor method to return the composition of specified amino acid in the sequence. The
454         * sequence argument must be a protein sequence consisting of only
455         * non-ambiguous characters. The aminoAcidCode must be a non-ambiguous
456         * character.
457         * The composition of an amino acid is the total number of its occurrence,
458         * divided by the total length of the sequence.
459         *
460         * @param sequence
461         *              a protein sequence consisting of non-ambiguous characters only
462         * @param aminoAcidCode
463         *              the code of the amino acid to compute
464         * @return the composition of specified amino acid in the sequence
465         */
466        public static final double getEnrichment(String sequence, char aminoAcidCode){
467                return getEnrichment(sequence, aminoAcidCode);
468        }
469
470        /**
471         * An adaptor method to return the composition of specified amino acid in the sequence. The
472         * sequence argument must be a protein sequence consisting of only
473         * non-ambiguous characters. The aminoAcidCode must be a non-ambiguous
474         * character.
475         * The composition of an amino acid is the total number of its occurrence,
476         * divided by the total length of the sequence.
477         *
478         * @param sequence
479         *            a protein sequence consisting of non-ambiguous characters only
480         * @param aminoAcidCode
481         *            the code of the amino acid to compute
482         * @return the composition of specified amino acid in the sequence
483         */
484        public static final double getEnrichment(String sequence, String aminoAcidCode){
485                sequence = Utils.checkSequence(sequence);
486                ProteinSequence pSequence = null;
487                try {
488                        pSequence = new ProteinSequence(sequence);
489                } catch (CompoundNotFoundException e) {
490                        // the sequence was checked with Utils.checkSequence, this shouldn't happen
491                        logger.error("The protein sequence contains invalid characters ({}), this should not happen. This is most likely a bug in Utils.checkSequence()", e.getMessage());
492                }
493                IPeptideProperties pp = new PeptidePropertiesImpl();
494                AminoAcidCompoundSet aaSet = new AminoAcidCompoundSet();
495                return pp.getEnrichment(pSequence, aaSet.getCompoundForString(aminoAcidCode));
496        }
497
498        /**
499         * An adaptor method to return the composition of the 20 standard amino acid in the sequence.
500         * The sequence argument must be a protein sequence consisting of only
501         * non-ambiguous characters.
502         * The composition of an amino acid is the total number of its occurrence,
503         * divided by the total length of the sequence.
504         *
505         * @param sequence
506         *            a protein sequence consisting of non-ambiguous characters only
507         * @return the composition of the 20 standard amino acid in the sequence
508         * @see AminoAcidCompound
509         */
510        public static final Map<AminoAcidCompound, Double> getAAComposition(String sequence) {
511                sequence = Utils.checkSequence(sequence);
512                ProteinSequence pSequence = null;
513                try {
514                        pSequence = new ProteinSequence(sequence);
515                } catch (CompoundNotFoundException e) {
516                        // the sequence was checked with Utils.checkSequence, this shouldn't happen
517                        logger.error("The protein sequence contains invalid characters ({}), this should not happen. This is most likely a bug in Utils.checkSequence()", e.getMessage());
518                }
519                IPeptideProperties pp = new PeptidePropertiesImpl();
520                return pp.getAAComposition(pSequence);
521        }
522
523        /**
524         * An adaptor method to return the composition of the 20 standard amino acid in the sequence.
525         * The sequence argument must be a protein sequence consisting of only
526         * non-ambiguous characters.
527         * The composition of an amino acid is the total number of its occurrence,
528         * divided by the total length of the sequence.
529         *
530         * @param sequence
531         *              a protein sequence consisting of non-ambiguous characters only
532         * @return the composition of the 20 standard amino acid in the sequence
533         */
534        public static final Map<String, Double> getAACompositionString(String sequence){
535                Map<AminoAcidCompound, Double> aa2Composition = getAAComposition(sequence);
536                Map<String, Double> aaString2Composition = new HashMap<String, Double>();
537                aaString2Composition = aa2Composition.keySet().stream() .collect(Collectors.toMap(aaCompound -> aaCompound.getShortName(),aaCompound ->aa2Composition.get(aaCompound)));
538                return aaString2Composition;
539        }
540
        /**
         * An adaptor method to return the composition of the 20 standard amino acids in the sequence,
         * keyed by the first character of each amino acid's short name.
         * The sequence argument must be a protein sequence consisting of only
         * non-ambiguous characters.
         * The composition of an amino acid is the total number of its occurrences,
         * divided by the total length of the sequence.
         *
         * @param sequence
         *              a protein sequence consisting of non-ambiguous characters only
         * @return the composition of the 20 standard amino acids in the sequence
         */
552        public static final Map<Character, Double> getAACompositionChar(String sequence){
553                Map<AminoAcidCompound, Double> aa2Composition = getAAComposition(sequence);
554                Map<Character, Double> aaChar2Composition = new HashMap<Character, Double>();
555                for(AminoAcidCompound aaCompound:aa2Composition.keySet()){
556                        aaChar2Composition.put(aaCompound.getShortName().charAt(0), aa2Composition.get(aaCompound));
557                }
558                return aaChar2Composition;
559        }
560
561        /**
         * Returns the array of charges of each amino acid in a protein. At pH=7, two are negatively charged: aspartic acid (Asp, D) and glutamic acid (Glu, E) (acidic side chains),
         * and three are positively charged: lysine (Lys, K), arginine (Arg, R) and histidine (His, H) (basic side chains).
564         *
565         * @param sequence
566         *              a protein sequence consisting of non-ambiguous characters only
567         * @return the array of charges of amino acids in the protein (1 if amino acid is positively charged, -1 if negatively charged, 0 if not charged)
568         */
569        public static final int[] getChargesOfAminoAcids(String sequence) {
570                int[] charges = new int[sequence.length()];
571                for ( int i=0; i < sequence.length(); i++ ) {
572                        char aa = sequence.toCharArray()[i];
573                        charges[i] = AminoAcidProperties.getChargeOfAminoAcid(aa);
574                }
575                return charges;
576        }
577
578        /**
579         * Returns the array of polarity values of each amino acid in a protein sequence.
580         *
581         * @param sequence
582         *              a protein sequence consisting of non-ambiguous characters only
583         * @return the array of polarity of amino acids in the protein (1 if amino acid is polar, 0 if not)
584         */
585        public static final int[] getPolarityOfAminoAcids(String sequence) {
586                int[] polarity = new int[sequence.length()];
587                for ( int i=0; i < sequence.length(); i++ ) {
588                        char aa = sequence.toCharArray()[i];
589                        polarity[i] = AminoAcidProperties.getPolarityOfAminoAcid(aa);
590                }
591                return polarity;
592        }
593
594        /**
595         * An adaptor method to return the aromaticity value of sequence. The sequence argument
596         * must be a protein sequence consisting of only non-ambiguous characters.
597         * <p>
598         * Calculates the aromaticity value of a protein according to Lobry, 1994.
599         * It is simply the relative frequency of Phe+Trp+Tyr.
         *
602         * @param sequence a protein sequence consisting of non-ambiguous characters only
603         * @return the aromaticity value of sequence
604         */
605        public static final double getAromaticity(String sequence) {
606                sequence = Utils.checkSequence(sequence);
607                ProteinSequence pSequence = null;
608                try {
609                        pSequence = new ProteinSequence(sequence);
610                } catch (CompoundNotFoundException e) {
611                        // the sequence was checked with Utils.checkSequence, this shouldn't happen
612                        logger.error("The protein sequence contains invalid characters ({}), this should not happen. This is most likely a bug in Utils.checkSequence()", e.getMessage());
613                }
614                IPeptideProperties pp = new PeptidePropertiesImpl();
615                return pp.getAromaticity(pSequence);
616        }
617}