001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 * Created on 2011.05.09 by kohchuanhock 021 * 022 */ 023package org.biojava.nbio.aaproperties; 024 025import org.biojava.nbio.aaproperties.xml.AminoAcidCompositionTable; 026import org.biojava.nbio.core.sequence.ProteinSequence; 027import org.biojava.nbio.core.sequence.compound.AminoAcidCompound; 028 029import javax.xml.bind.JAXBException; 030import java.io.File; 031import java.io.FileNotFoundException; 032import java.util.Map; 033 034 035/** 036 * An interface to generate some basic physico-chemical properties of protein sequences.<br/> 037 * The following properties could be generated: 038 * <p/> 039 * Molecular weight<br/> 040 * Absorbance<br/> 041 * Extinction coefficient<br/> 042 * Instability index<br/> 043 * Apliphatic index<br/> 044 * Average hydropathy value<br/> 045 * Isoelectric point<br/> 046 * Net charge at pH 7<br/> 047 * Composition of specified amino acid<br/> 048 * Composition of the 20 standard amino acid<br/> 049 * @author kohchuanhock 050 * @version 2011.05.09 051 * @see PeptideProperties 052 */ 053public interface IPeptideProperties{ 054 /** 055 * Returns the molecular weight of sequence. The sequence argument must be a protein sequence consisting of only non-ambiguous characters. 056 * This method will sum the molecular weight of each amino acid in the 057 * sequence. Molecular weights are based on <a href="http://web.expasy.org/findmod/findmod_masses.html">here</a>. 058 * 059 * @param sequence 060 * a protein sequence consisting of non-ambiguous characters only 061 * @return the total molecular weight of sequence + weight of water molecule 062 * @see ProteinSequence 063 */ 064 public double getMolecularWeight(ProteinSequence sequence); 065 066 /** 067 * Returns the molecular weight of sequence. The sequence argument must be a protein sequence consisting of only non-ambiguous characters. 068 * This method will sum the molecular weight of each amino acid in the 069 * sequence. Molecular weights are based on the input files. These input files must be XML using the defined schema. 070 * Note that it assumes that ElementMass.xml file can be found in default location. 071 * 072 * @param sequence 073 * a protein sequence consisting of non-ambiguous characters only 074 * xml file that details the mass of each elements and isotopes 075 * @param aminoAcidCompositionFile 076 * xml file that details the composition of amino acids 077 * @return the total molecular weight of sequence + weight of water molecule 078 * @throws JAXBException 079 * thrown if unable to properly parse either elementMassFile or aminoAcidCompositionFile 080 * @throws FileNotFoundException 081 * thrown if either elementMassFile or aminoAcidCompositionFile are not found 082 */ 083 public double getMolecularWeight(ProteinSequence sequence, File aminoAcidCompositionFile) throws JAXBException, FileNotFoundException; 084 085 /** 086 * Returns the molecular weight of sequence. The sequence argument must be a protein sequence consisting of only non-ambiguous characters. 087 * This method will sum the molecular weight of each amino acid in the 088 * sequence. Molecular weights are based on the input files. These input files must be XML using the defined schema. 089 * 090 * @param sequence 091 * a protein sequence consisting of non-ambiguous characters only 092 * @param elementMassFile 093 * xml file that details the mass of each elements and isotopes 094 * @param aminoAcidCompositionFile 095 * xml file that details the composition of amino acids 096 * @return the total molecular weight of sequence + weight of water molecule 097 * @throws JAXBException 098 * thrown if unable to properly parse either elementMassFile or aminoAcidCompositionFile 099 * @throws FileNotFoundException 100 * thrown if either elementMassFile or aminoAcidCompositionFile are not found 101 */ 102 public double getMolecularWeight(ProteinSequence sequence, File elementMassFile, File aminoAcidCompositionFile) 103 throws JAXBException, FileNotFoundException; 104 105 /** 106 * Returns the molecular weight of sequence. The sequence argument must be a protein sequence consisting of only non-ambiguous characters. 107 * This method will sum the molecular weight of each amino acid in the 108 * sequence. Molecular weights are based on the AminoAcidCompositionTable. 109 * Those input files must be XML using the defined schema. 110 * 111 * @param sequence 112 * a protein sequence consisting of non-ambiguous characters only 113 * @param aminoAcidCompositionTable 114 * a amino acid composition table obtained by calling IPeptideProperties.obtainAminoAcidCompositionTable 115 * @return the total molecular weight of sequence + weight of water molecule 116 */ 117 public double getMolecularWeightBasedOnXML(ProteinSequence sequence, AminoAcidCompositionTable aminoAcidCompositionTable); 118 119 /** 120 * This method would initialize amino acid composition table based on the input xml files and stores the table for usage in future calls to 121 * IPeptideProperties.getMolecularWeightBasedOnXML(ProteinSequence, AminoAcidCompositionTable). 122 * Note that ElementMass.xml is assumed to be able to be seen in default location. 123 * 124 * @param aminoAcidCompositionFile 125 * xml file that details the composition of amino acids 126 * @return the initialized amino acid composition table 127 * @throws JAXBException 128 * thrown if unable to properly parse either elementMassFile or aminoAcidCompositionFile 129 * @throws FileNotFoundException 130 * thrown if either elementMassFile or aminoAcidCompositionFile are not found 131 */ 132 public AminoAcidCompositionTable obtainAminoAcidCompositionTable(File aminoAcidCompositionFile) 133 throws JAXBException, FileNotFoundException; 134 135 /** 136 * This method would initialize amino acid composition table based on the input xml files and stores the table for usage in future calls to 137 * IPeptideProperties.getMolecularWeightBasedOnXML(ProteinSequence, AminoAcidCompositionTable). 138 * 139 * @param elementMassFile 140 * xml file that details the mass of each elements and isotopes 141 * @param aminoAcidCompositionFile 142 * xml file that details the composition of amino acids 143 * @return the initialized amino acid composition table 144 * @throws JAXBException 145 * thrown if unable to properly parse either elementMassFile or aminoAcidCompositionFile 146 * @throws FileNotFoundException 147 * thrown if either elementMassFile or aminoAcidCompositionFile are not found 148 */ 149 public AminoAcidCompositionTable obtainAminoAcidCompositionTable(File elementMassFile, File aminoAcidCompositionFile) 150 throws JAXBException, FileNotFoundException; 151 152 /** 153 * Returns the extinction coefficient of sequence. The sequence argument 154 * must be a protein sequence consisting of only non-ambiguous characters. 155 * The extinction coefficient indicates how much light a protein absorbs at 156 * a certain wavelength. It is useful to have an estimation of this 157 * coefficient for following a protein which a spectrophotometer when 158 * purifying it. The computation of extinction coefficient follows the 159 * documentation in <a href="http://web.expasy.org/protparam/protparam-doc.html">here</a>. 160 * 161 * @param sequence 162 * a protein sequence consisting of non-ambiguous characters only 163 * @param assumeCysReduced 164 * true if Cys are assumed to be reduced and false if Cys are 165 * assumed to form cystines 166 * @return the extinction coefficient of sequence 167 * @see ProteinSequence 168 */ 169 public double getExtinctionCoefficient(ProteinSequence sequence, boolean assumeCysReduced); 170 171 /** 172 * Returns the absorbance (optical density) of sequence. The sequence argument 173 * must be a protein sequence consisting of only non-ambiguous characters. 174 * The computation of absorbance (optical density) follows the 175 * documentation in <a href="http://web.expasy.org/protparam/protparam-doc.html">here</a>. 176 * 177 * @param sequence 178 * a protein sequence consisting of non-ambiguous characters only 179 * @param assumeCysReduced 180 * true if Cys are assumed to be reduced and false if Cys are 181 * assumed to form cystines 182 * @return the absorbance (optical density) of sequence 183 * @see ProteinSequence 184 */ 185 public double getAbsorbance(ProteinSequence sequence, boolean assumeCysReduced); 186 187 /** 188 * Returns the instability index of sequence. The sequence argument must be 189 * a protein sequence consisting of only non-ambiguous characters. 190 * The instability index provides an estimate of the stability of your 191 * protein in a test tube. The computation of instability index follows the 192 * documentation in <a href="http://web.expasy.org/protparam/protparam-doc.html">here</a>. 193 * 194 * @param sequence 195 * a protein sequence consisting of non-ambiguous characters only 196 * @return the instability index of sequence 197 * @see ProteinSequence 198 */ 199 public double getInstabilityIndex(ProteinSequence sequence); 200 201 /** 202 * Returns the apliphatic index of sequence. The sequence argument must be a 203 * protein sequence consisting of only non-ambiguous characters. 204 * The aliphatic index of a protein is defined as the relative volume 205 * occupied by aliphatic side chains (alanine, valine, isoleucine, and 206 * leucine). It may be regarded as a positive factor for the increase of 207 * thermostability of globular proteins. The computation of aliphatic index 208 * follows the documentation in <a href="http://web.expasy.org/protparam/protparam-doc.html">here</a>. 209 * A protein whose instability index is smaller than 40 is predicted as stable, a value above 40 predicts that the protein may be unstable. 210 * 211 * @param sequence 212 * a protein sequence consisting of non-ambiguous characters only 213 * @return the aliphatic index of sequence 214 * @see ProteinSequence 215 */ 216 public double getApliphaticIndex(ProteinSequence sequence); 217 218 /** 219 * Returns the average hydropathy value of sequence. The sequence argument 220 * must be a protein sequence consisting of only non-ambiguous characters. 221 * The average value for a sequence is calculated as the sum of hydropathy 222 * values of all the amino acids, divided by the number of residues in the 223 * sequence. Hydropathy values are based on (Kyte, J. and Doolittle, R.F. 224 * (1982) A simple method for displaying the hydropathic character of a 225 * protein. J. Mol. Biol. 157, 105-132). 226 * 227 * @param sequence 228 * a protein sequence consisting of non-ambiguous characters only 229 * @return the average hydropathy value of sequence 230 * @see ProteinSequence 231 */ 232 public double getAvgHydropathy(ProteinSequence sequence); 233 234 /** 235 * Returns the isoelectric point of sequence. The sequence argument must be 236 * a protein sequence consisting of only non-ambiguous characters. 237 * The isoelectric point is the pH at which the protein carries no net 238 * electrical charge. The isoelectric point will be computed based on 239 * approach stated in 240 * <a href="http://www.innovagen.se/custom-peptide-synthesis/peptide-property-calculator/peptide-property-calculator-notes.asp#PI">here</a> 241 * 242 * pKa values used will be either 243 * those used by Expasy which referenced "Electrophoresis 1994, 15, 529-539" 244 * OR 245 * A.Lehninger, Principles of Biochemistry, 4th Edition (2005), Chapter 3, page78, Table 3-1. 246 * @param sequence 247 * a protein sequence consisting of non-ambiguous characters only 248 * @param useExpasyValues 249 * whether to use Expasy values (Default) or Innovagen values 250 * @return the isoelectric point of sequence 251 * @see ProteinSequence 252 */ 253 public double getIsoelectricPoint(ProteinSequence sequence, boolean useExpasyValues); 254 255 public double getIsoelectricPoint(ProteinSequence seuqence); 256 257 /** 258 * Returns the net charge of sequence at pH 7. The sequence argument must be 259 * a protein sequence consisting of only non-ambiguous characters. 260 * The net charge will be computed using the approach stated in 261 * <a href="http://www.innovagen.se/custom-peptide-synthesis/peptide-property-calculator/peptide-property-calculator-notes.asp#NetCharge>here</a> 262 * 263 * pKa values used will be either 264 * those used by Expasy which referenced "Electrophoresis 1994, 15, 529-539" 265 * OR 266 * A.Lehninger, Principles of Biochemistry, 4th Edition (2005), Chapter 3, page78, Table 3-1. 267 * 268 * @param sequence 269 * a protein sequence consisting of non-ambiguous characters only 270 * @param useExpasyValues 271 * whether to use Expasy values (Default) or Innovagen values 272 * @param pHPoint 273 * the pH value to use for computation of the net charge. Default at 7. 274 * @return the net charge of sequence at given pHPoint 275 * @see ProteinSequence 276 */ 277 public double getNetCharge(ProteinSequence sequence, boolean useExpasyValues, double pHPoint); 278 279 public double getNetCharge(ProteinSequence sequence, boolean useExpasyValues); 280 281 public double getNetCharge(ProteinSequence sequence); 282 283 /** 284 * Returns the composition of specified amino acid in the sequence. The 285 * sequence argument must be a protein sequence consisting of only 286 * non-ambiguous characters. The aminoAcidCode must be a non-ambiguous 287 * character. 288 * The composition of an amino acid is the total number of its occurrence, 289 * divided by the total length of the sequence. 290 * 291 * @param sequence 292 * a protein sequence consisting of non-ambiguous characters only 293 * @param aminoAcidCode 294 * the code of the amino acid to compute 295 * @return the composition of specified amino acid in the sequence 296 * @see ProteinSequence 297 * @see AminoAcidCompound 298 */ 299 public double getEnrichment(ProteinSequence sequence, AminoAcidCompound aminoAcidCode); 300 301 /** 302 * Returns the composition of the 20 standard amino acid in the sequence. 303 * The sequence argument must be a protein sequence consisting of only 304 * non-ambiguous characters. 305 * The composition of an amino acid is the total number of its occurrence, 306 * divided by the total length of the sequence. 307 * 308 * @param sequence 309 * a protein sequence consisting of non-ambiguous characters only 310 * @return the composition of the 20 standard amino acid in the sequence 311 * @see ProteinSequence 312 * @see AminoAcidCompound 313 */ 314 public Map<AminoAcidCompound, Double> getAAComposition(ProteinSequence sequence); 315 316 /** 317 * Calculates the aromaticity value of a protein according to Lobry, 1994. 318 * It is simply the relative frequency of Phe+Trp+Tyr. 319 * 320 * @param sequence a protein sequence consisting of non-ambiguous characters only 321 * @return the aromaticity of a protein sequence 322 * @see ProteinSequence 323 */ 324 public double getAromaticity(ProteinSequence sequence); 325}