001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 * Created on 2011.05.09 by kohchuanhock
021 *
022 */
023package org.biojava.nbio.aaproperties;
024
025import org.biojava.nbio.aaproperties.xml.AminoAcidCompositionTable;
026import org.biojava.nbio.core.sequence.ProteinSequence;
027import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
028
029import jakarta.xml.bind.JAXBException;
030import java.io.File;
031import java.io.FileNotFoundException;
032import java.util.Map;
033
034
035/**
036 * An interface to generate some basic physico-chemical properties of protein sequences.<br/>
037 * The following properties could be generated:
038 * <p>
039 * Molecular weight<br/>
040 * Absorbance<br/>
041 * Extinction coefficient<br/>
042 * Instability index<br/>
043 * Aliphatic index<br/>
044 * Average hydropathy value<br/>
045 * Isoelectric point<br/>
046 * Net charge at pH 7<br/>
047 * Composition of specified amino acid<br/>
048 * Composition of the 20 standard amino acids<br/>
049 * @author kohchuanhock
050 * @version 2011.05.09
051 * @see PeptideProperties
052 */
053public interface IPeptideProperties{
054        /**
055         * Returns the molecular weight of sequence. The sequence argument must be a protein sequence consisting of only non-ambiguous characters.
056         * This method will sum the molecular weight of each amino acid in the
057         * sequence. Molecular weights are based on the masses listed <a href="http://web.expasy.org/findmod/findmod_masses.html">here</a>.
058         *
059         * @param sequence
060         *              a protein sequence consisting of non-ambiguous characters only
061         * @return the total molecular weight of sequence + weight of a water molecule
062         * @see ProteinSequence
063         */
064        public double getMolecularWeight(ProteinSequence sequence);
065
066        /**
067         * Returns the molecular weight of sequence. The sequence argument must be a protein sequence consisting of only non-ambiguous characters.
068         * This method will sum the molecular weight of each amino acid in the
069         * sequence. Molecular weights are based on the input file, which must be XML using the defined schema.
070         * No element mass file is passed in: ElementMass.xml is assumed to be found in its default location.
071         *
072         * @param sequence
073         *              a protein sequence consisting of
074         *              non-ambiguous characters only
075         * @param aminoAcidCompositionFile
076         *              xml file that details the composition of amino acids
077         * @return the total molecular weight of sequence + weight of a water molecule
078         * @throws JAXBException
079         *              thrown if unable to properly parse either the default ElementMass.xml or aminoAcidCompositionFile
080         * @throws FileNotFoundException
081         *              thrown if either the default ElementMass.xml or aminoAcidCompositionFile is not found
082         */
083        public double getMolecularWeight(ProteinSequence sequence, File aminoAcidCompositionFile) throws JAXBException, FileNotFoundException;
084
085        /**
086         * Returns the molecular weight of sequence. The sequence argument must be a protein sequence consisting of only non-ambiguous characters.
087         * This method will sum the molecular weight of each amino acid in the
088         * sequence. Molecular weights are based on the two input files. These input files must be XML using the defined schema.
089         *
090         * @param sequence
091         *              a protein sequence consisting of non-ambiguous characters only
092         * @param elementMassFile
093         *              xml file that details the mass of each element and isotope
094         * @param aminoAcidCompositionFile
095         *              xml file that details the composition of amino acids
096         * @return the total molecular weight of sequence + weight of a water molecule
097         * @throws JAXBException
098         *              thrown if unable to properly parse either elementMassFile or aminoAcidCompositionFile
099         * @throws FileNotFoundException
100         *              thrown if either elementMassFile or aminoAcidCompositionFile is not found
101         */
102        public double getMolecularWeight(ProteinSequence sequence, File elementMassFile, File aminoAcidCompositionFile)
103                throws JAXBException, FileNotFoundException;
104
105        /**
106         * Returns the molecular weight of sequence. The sequence argument must be a protein sequence consisting of only non-ambiguous characters.
107         * This method will sum the molecular weight of each amino acid in the
108         * sequence. Molecular weights are taken from the given AminoAcidCompositionTable,
109         * which is initialized from XML input files by IPeptideProperties.obtainAminoAcidCompositionTable.
110         *
111         * @param sequence
112         *              a protein sequence consisting of non-ambiguous characters only
113         * @param aminoAcidCompositionTable
114         *              an amino acid composition table obtained by calling IPeptideProperties.obtainAminoAcidCompositionTable
115         * @return the total molecular weight of sequence + weight of a water molecule
116         */
117        public double getMolecularWeightBasedOnXML(ProteinSequence sequence, AminoAcidCompositionTable aminoAcidCompositionTable);
118
119        /**
120         * Initializes an amino acid composition table based on the input xml file so that the returned table can be used in future calls to
121         * IPeptideProperties.getMolecularWeightBasedOnXML(ProteinSequence, AminoAcidCompositionTable).
122         * Note that ElementMass.xml is assumed to be available in its default location.
123         *
124         * @param aminoAcidCompositionFile
125         *              xml file that details the composition of amino acids
126         * @return the initialized amino acid composition table
127         * @throws JAXBException
128         *              thrown if unable to properly parse either the default ElementMass.xml or aminoAcidCompositionFile
129         * @throws FileNotFoundException
130         *              thrown if either the default ElementMass.xml or aminoAcidCompositionFile is not found
131         */
132        public AminoAcidCompositionTable obtainAminoAcidCompositionTable(File aminoAcidCompositionFile)
133                throws JAXBException, FileNotFoundException;
134
135        /**
136         * Initializes an amino acid composition table based on the two input xml files so that the returned table can be used in future calls to
137         * IPeptideProperties.getMolecularWeightBasedOnXML(ProteinSequence, AminoAcidCompositionTable).
138         *
139         * @param elementMassFile
140         *              xml file that details the mass of each element and isotope
141         * @param aminoAcidCompositionFile
142         *              xml file that details the composition of amino acids
143         * @return the initialized amino acid composition table
144         * @throws JAXBException
145         *              thrown if unable to properly parse either elementMassFile or aminoAcidCompositionFile
146         * @throws FileNotFoundException
147         *              thrown if either elementMassFile or aminoAcidCompositionFile is not found
148         */
149        public AminoAcidCompositionTable obtainAminoAcidCompositionTable(File elementMassFile, File aminoAcidCompositionFile)
150                throws JAXBException, FileNotFoundException;
151
152        /**
153         * Returns the extinction coefficient of sequence. The sequence argument
154         * must be a protein sequence consisting of only non-ambiguous characters.
155         * The extinction coefficient indicates how much light a protein absorbs at
156         * a certain wavelength. It is useful to have an estimation of this
157         * coefficient for following a protein with a spectrophotometer when
158         * purifying it. The computation of the extinction coefficient follows the
159         * documentation <a href="http://web.expasy.org/protparam/protparam-doc.html">here</a>.
160         *
161         * @param sequence
162         *              a protein sequence consisting of non-ambiguous characters only
163         * @param assumeCysReduced
164         *              true if Cys are assumed to be reduced and false if Cys are
165         *              assumed to form cystines
166         * @return the extinction coefficient of sequence
167         * @see ProteinSequence
168         */
169        public double getExtinctionCoefficient(ProteinSequence sequence, boolean assumeCysReduced);
170
171        /**
172         * Returns the absorbance (optical density) of sequence. The sequence argument
173         * must be a protein sequence consisting of only non-ambiguous characters.
174         * The computation of the absorbance (optical density) follows the
175         * documentation <a href="http://web.expasy.org/protparam/protparam-doc.html">here</a>.
176         *
177         * @param sequence
178         *              a protein sequence consisting of non-ambiguous characters only
179         * @param assumeCysReduced
180         *              true if Cys are assumed to be reduced and false if Cys are
181         *              assumed to form cystines
182         * @return the absorbance (optical density) of sequence
183         * @see ProteinSequence
184         */
185        public double getAbsorbance(ProteinSequence sequence, boolean assumeCysReduced);
186
187        /**
188         * Returns the instability index of sequence. The sequence argument must be
189         * a protein sequence consisting of only non-ambiguous characters.
190         * The instability index provides an estimate of the stability of your
191         * protein in a test tube. A protein whose instability index is smaller than 40 is predicted as stable, a value above 40 predicts that the protein may be unstable.
192         * The computation of the instability index follows the documentation <a href="http://web.expasy.org/protparam/protparam-doc.html">here</a>.
193         *
194         * @param sequence
195         *              a protein sequence consisting of non-ambiguous characters only
196         * @return the instability index of sequence
197         * @see ProteinSequence
198         */
199        public double getInstabilityIndex(ProteinSequence sequence);
200
201        /**
202         * Returns the aliphatic index of sequence. The sequence argument must be a
203         * protein sequence consisting of only non-ambiguous characters.
204         * The aliphatic index of a protein is defined as the relative volume
205         * occupied by aliphatic side chains (alanine, valine, isoleucine, and
206         * leucine). It may be regarded as a positive factor for the increase of
207         * thermostability of globular proteins. The computation of the aliphatic index
208         * follows the documentation <a href="http://web.expasy.org/protparam/protparam-doc.html">here</a>.
209         * Note that the method name spells the property "Apliphatic"; the value returned is the aliphatic index described above.
210         *
211         * @param sequence
212         *              a protein sequence consisting of non-ambiguous characters only
213         * @return the aliphatic index of sequence
214         * @see ProteinSequence
215         */
216        public double getApliphaticIndex(ProteinSequence sequence);
217
218        /**
219         * Returns the average hydropathy value of sequence. The sequence argument
220         * must be a protein sequence consisting of only non-ambiguous characters.
221         * The average value for a sequence is calculated as the sum of the hydropathy
222         * values of all the amino acids, divided by the number of residues in the
223         * sequence. Hydropathy values are based on (Kyte, J. and Doolittle, R.F.
224         * (1982) A simple method for displaying the hydropathic character of a
225         * protein. J. Mol. Biol. 157, 105-132).
226         *
227         * @param sequence
228         *              a protein sequence consisting of non-ambiguous characters only
229         * @return the average hydropathy value of sequence
230         * @see ProteinSequence
231         */
232        public double getAvgHydropathy(ProteinSequence sequence);
233
234        /**
235         * Returns the isoelectric point of sequence. The sequence argument must be
236         * a protein sequence consisting of only non-ambiguous characters.
237         * The isoelectric point is the pH at which the protein carries no net
238         * electrical charge. The isoelectric point will be computed based on
239         * the approach stated
240         * <a href="http://www.innovagen.se/custom-peptide-synthesis/peptide-property-calculator/peptide-property-calculator-notes.asp#PI">here</a>.
241         * <p>
242         * pKa values used will be either
243         * those used by Expasy which referenced "Electrophoresis 1994, 15, 529-539"
244         * OR
245         * A.Lehninger, Principles of Biochemistry, 4th Edition (2005), Chapter 3, page78, Table 3-1.
246         * @param sequence
247         *              a protein sequence consisting of non-ambiguous characters only
248         * @param useExpasyValues
249         *              true to use the Expasy values (Default), false to use the Innovagen values (presumably the Lehninger table cited above - confirm)
250         * @return the isoelectric point of sequence
251         * @see ProteinSequence
252         */
253        public double getIsoelectricPoint(ProteinSequence sequence, boolean useExpasyValues);
254        /** Overload of getIsoelectricPoint(ProteinSequence, boolean) without the useExpasyValues flag; presumably the Expasy values (the documented default) are used - confirm in the implementation. Note: the parameter name is spelled "seuqence" in the signature. */
255        public double getIsoelectricPoint(ProteinSequence seuqence);
256
257        /**
258         * Returns the net charge of sequence at the given pH (pHPoint). The sequence argument must be
259         * a protein sequence consisting of only non-ambiguous characters.
260         * The net charge will be computed using the approach stated
261         * <a href="http://www.innovagen.se/custom-peptide-synthesis/peptide-property-calculator/peptide-property-calculator-notes.asp#NetCharge">here</a>.
262         * <p>
263         * pKa values used will be either
264         * those used by Expasy which referenced "Electrophoresis 1994, 15, 529-539"
265         * OR
266         * A.Lehninger, Principles of Biochemistry, 4th Edition (2005), Chapter 3, page78, Table 3-1.
267         *
268         * @param sequence
269         *              a protein sequence consisting of non-ambiguous characters only
270         * @param useExpasyValues
271         *              true to use the Expasy values (Default), false to use the Innovagen values (presumably the Lehninger table cited above - confirm)
272         * @param pHPoint
273         *              the pH value to use for computation of the net charge. Default at 7.
274         * @return the net charge of sequence at given pHPoint
275         * @see ProteinSequence
276         */
277        public double getNetCharge(ProteinSequence sequence, boolean useExpasyValues, double pHPoint);
278        /** Overload of getNetCharge(ProteinSequence, boolean, double) without pHPoint; presumably the documented default of pH 7 is used - confirm in the implementation. */
279        public double getNetCharge(ProteinSequence sequence, boolean useExpasyValues);
280        /** Overload of getNetCharge(ProteinSequence, boolean, double) without useExpasyValues and pHPoint; presumably the documented defaults (Expasy values, pH 7) are used - confirm in the implementation. */
281        public double getNetCharge(ProteinSequence sequence);
282
283        /**
284         * Returns the composition of the specified amino acid in the sequence. The
285         * sequence argument must be a protein sequence consisting of only
286         * non-ambiguous characters. The aminoAcidCode must be a non-ambiguous
287         * character.
288         * The composition of an amino acid is the total number of its occurrences,
289         * divided by the total length of the sequence.
290         *
291         * @param sequence
292         *              a protein sequence consisting of non-ambiguous characters only
293         * @param aminoAcidCode
294         *              the code of the amino acid to compute
295         * @return the composition of the specified amino acid in the sequence
296         * @see ProteinSequence
297         * @see AminoAcidCompound
298         */
299        public double getEnrichment(ProteinSequence sequence, AminoAcidCompound aminoAcidCode);
300
301        /**
302         * Returns the composition of the 20 standard amino acids in the sequence.
303         * The sequence argument must be a protein sequence consisting of only
304         * non-ambiguous characters.
305         * The composition of an amino acid is the total number of its occurrences,
306         * divided by the total length of the sequence.
307         *
308         * @param sequence
309         *              a protein sequence consisting of non-ambiguous characters only
310         * @return a map from amino acid compound to its composition, for the 20 standard amino acids in the sequence
311         * @see ProteinSequence
312         * @see AminoAcidCompound
313         */
314        public Map<AminoAcidCompound, Double> getAAComposition(ProteinSequence sequence);
315
316        /**
317         * Calculates the aromaticity value of a protein according to Lobry, 1994.
318         * It is simply the relative frequency of Phe+Trp+Tyr.
319         *
320         * @param sequence a protein sequence consisting of non-ambiguous characters only
321         * @return the aromaticity of the protein sequence
322         * @see ProteinSequence
323         */
324        public double getAromaticity(ProteinSequence sequence);
325}