001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021 022package org.biojava.bio.seq; 023 024import java.io.InputStream; 025import java.util.HashMap; 026import java.util.Iterator; 027import java.util.Map; 028import java.util.MissingResourceException; 029 030import javax.xml.parsers.DocumentBuilder; 031import javax.xml.parsers.DocumentBuilderFactory; 032 033import org.biojava.bio.BioError; 034import org.biojava.bio.BioException; 035import org.biojava.bio.SimpleAnnotation; 036import org.biojava.bio.seq.impl.SimpleGappedSequence; 037import org.biojava.bio.seq.impl.SimpleSequenceFactory; 038import org.biojava.bio.seq.io.SymbolTokenization; 039import org.biojava.bio.symbol.AlphabetManager; 040import org.biojava.bio.symbol.AtomicSymbol; 041import org.biojava.bio.symbol.FiniteAlphabet; 042import org.biojava.bio.symbol.IllegalSymbolException; 043import org.biojava.bio.symbol.SimpleSymbolList; 044import org.biojava.bio.symbol.SimpleSymbolPropertyTable; 045import org.biojava.bio.symbol.Symbol; 046import org.biojava.bio.symbol.SymbolList; 047import org.biojava.bio.symbol.SymbolPropertyTable; 048import org.biojava.utils.ClassTools; 049import org.w3c.dom.Document; 050import org.w3c.dom.Element; 051import org.w3c.dom.Node; 052import org.w3c.dom.NodeList; 053import org.xml.sax.InputSource; 054 055/** 056 * The central port-of-call for all information and functionality specific to 057 * SymbolLists over the protein alphabet. 058 * 059 * @author Matthew Pocock 060 * @author Greg Cox 061 * @author Thomas Down 062 * @author MarkSchreiber 063 * @author Jonathan Warren 064 * @author gwaldon (pyrrolysine, pKs) 065 */ 066public class ProteinTools { 067 private static final FiniteAlphabet proteinAlpha; 068 private static final FiniteAlphabet proteinTAlpha; 069 070 private static final Map tokenToSymbol = new HashMap(); 071 072 private static final Map propertyTableMap = new HashMap(); 073 074 static { 075 try { 076 proteinAlpha = (FiniteAlphabet) AlphabetManager.alphabetForName("PROTEIN"); 077 proteinTAlpha = (FiniteAlphabet) AlphabetManager.alphabetForName("PROTEIN-TERM"); 078 SymbolTokenization st = proteinTAlpha.getTokenization("token"); 079 for (Iterator i = proteinTAlpha.iterator(); i.hasNext(); ) { 080 AtomicSymbol s = (AtomicSymbol)i.next(); 081 tokenToSymbol.put(st.tokenizeSymbol(s), s); 082 } 083 084 } catch (Exception e) { 085 throw new BioError(" Could not initialize ProteinTools", e); 086 } 087 } 088 089 090 static { 091 092 Document doc = null; 093 /* try { 094 URL proteaseManagerURL = ProteinTools.class.getClassLoader().getResource( 095 "org/biojava/bio/symbol/ResidueProperties.xml" 096 ); 097 //If I try and do this here on compile it says "An exception can't be thrown by an initializer" 098 InputSource is = Resolver.createInputSource(proteaseManagerURL, true); 099 doc = XmlDocument.createXmlDocument(is, true);*/ 100 101 try { 102 InputStream tablesStream = ClassTools.getClassLoader(ProteinTools.class).getResourceAsStream( 103 "org/biojava/bio/symbol/ResidueProperties.xml" 104 ); 105 if(tablesStream == null ) { 106 throw new BioError("Couldn't locate ResidueProperties.xml."); 107 } 108 109 InputSource is = new InputSource(tablesStream); 110 DocumentBuilder parser = DocumentBuilderFactory.newInstance().newDocumentBuilder(); 111 doc = parser.parse(is); 112 }catch (MissingResourceException mre) { 113 System.err.println(mre.getMessage()); 114 }catch(Exception e){//err 115 e.printStackTrace(); 116 } 117 118 try { 119 SimpleSymbolPropertyTable monoMassPropertyTable = new SimpleSymbolPropertyTable( 120 getAlphabet(), 121 SymbolPropertyTable.MONO_MASS 122 ); 123 124 SimpleSymbolPropertyTable avgMassPropertyTable = new SimpleSymbolPropertyTable( 125 getAlphabet(), 126 SymbolPropertyTable.AVG_MASS 127 ); 128 129 SimpleSymbolPropertyTable pK_NtermPropertyTable = new SimpleSymbolPropertyTable( 130 getAlphabet(), 131 SymbolPropertyTable.PK_Nterm 132 ); 133 134 SimpleSymbolPropertyTable pKPropertyTable = new SimpleSymbolPropertyTable( 135 getAlphabet(), 136 SymbolPropertyTable.PK 137 ); 138 139 SimpleSymbolPropertyTable pK_CtermPropertyTable = new SimpleSymbolPropertyTable( 140 getAlphabet(), 141 SymbolPropertyTable.PK_Cterm 142 ); 143 144 SimpleSymbolPropertyTable HydropathicityTable = new SimpleSymbolPropertyTable( 145 getAlphabet(), 146 SymbolPropertyTable.HYDROPATHICITY 147 ); 148 149 SymbolTokenization tokens = getAlphabet().getTokenization("token"); 150 151 NodeList children = doc.getDocumentElement().getChildNodes(); 152 for(int i = 0; i < children.getLength(); i++) { 153 Node cnode = (Node) children.item(i); 154 if(! (cnode instanceof Element)) { 155 continue; 156 } 157 Element child = (Element) cnode; 158 if(child.getNodeName().equals("residue")) { 159 String token = child.getAttribute("token"); 160 Symbol s = tokens.parseToken(token); 161 162 NodeList properyNodes = child.getChildNodes(); 163 for(int j = 0; j < properyNodes.getLength(); j++) { 164 cnode = (Node) properyNodes.item(j); 165 if(! (cnode instanceof Element)) { 166 continue; 167 } 168 Element el = (Element) cnode; 169 String name = el.getAttribute("name"); 170 if(name.equals(SymbolPropertyTable.MONO_MASS)) { 171 String value = el.getAttribute("value"); 172 monoMassPropertyTable.setDoubleProperty(s, value); 173 } else if (name.equals(SymbolPropertyTable.AVG_MASS)) { 174 String value = el.getAttribute("value"); 175 avgMassPropertyTable.setDoubleProperty(s, value); 176 } else if (name.equals(SymbolPropertyTable.PK_Nterm)) { 177 String value = el.getAttribute("value"); 178 pK_NtermPropertyTable.setDoubleProperty(s, value); 179 } else if (name.equals(SymbolPropertyTable.PK)) { 180 String value = el.getAttribute("value"); 181 pKPropertyTable.setDoubleProperty(s, value); 182 } else if (name.equals(SymbolPropertyTable.PK_Cterm)) { 183 String value = el.getAttribute("value"); 184 pK_CtermPropertyTable.setDoubleProperty(s, value); 185 }else if (name.equals(SymbolPropertyTable.HYDROPATHICITY)) { 186 String value = el.getAttribute("value"); 187 HydropathicityTable.setDoubleProperty(s, value); 188 } 189 } 190 } 191 } 192 193 propertyTableMap.put(SymbolPropertyTable.MONO_MASS, (SymbolPropertyTable) monoMassPropertyTable); 194 propertyTableMap.put(SymbolPropertyTable.AVG_MASS, (SymbolPropertyTable) avgMassPropertyTable); 195 propertyTableMap.put(SymbolPropertyTable.PK_Nterm, (SymbolPropertyTable) pK_NtermPropertyTable); 196 propertyTableMap.put(SymbolPropertyTable.PK, (SymbolPropertyTable) pKPropertyTable); 197 propertyTableMap.put(SymbolPropertyTable.PK_Cterm, (SymbolPropertyTable) pK_CtermPropertyTable); 198 propertyTableMap.put(SymbolPropertyTable.HYDROPATHICITY, (SymbolPropertyTable) HydropathicityTable); 199 } catch (Exception e) { 200 throw new BioError(" Could not initialize ProteinTools", e); 201 } 202 } 203 204 private ProteinTools() { 205 } 206 207 /** 208 *Gets the protein alphabet 209 */ 210 public static final FiniteAlphabet getAlphabet() { 211 return proteinAlpha; 212 } 213 214 /** 215 *Gets the protein alphabet including the translation termination symbols 216 */ 217 public static final FiniteAlphabet getTAlphabet() { 218 return proteinTAlpha; 219 } 220 221 public static final SymbolPropertyTable getSymbolPropertyTable(String name) 222 { 223 return (SymbolPropertyTable)propertyTableMap.get(name); 224 } 225 226 /** 227 * Return a new Protein <span class="type">SymbolList</span> for <span 228 * class="arg">protein</span>. 229 * 230 * @param theProtein a <span class="type">String</span> to parse into Protein 231 * @return a <span class="type">SymbolList</span> created form <span 232 * class="arg">Protein</span> 233 * @throws IllegalSymbolException if <span class="arg">dna</span> contains 234 * any non-Amino Acid characters. 235 */ 236 public static SymbolList createProtein(String theProtein) 237 throws IllegalSymbolException 238 { 239 SymbolTokenization p = null; 240 try { 241 p = getTAlphabet().getTokenization("token"); 242 } catch (BioException e) { 243 throw new BioError("Something has gone badly wrong with Protein", e); 244 } 245 return new SimpleSymbolList(p, theProtein); 246 } 247 248 /** Get a new protein as a GappedSequence */ 249 public static GappedSequence createGappedProteinSequence(String theProtein, String name) throws IllegalSymbolException{ 250 String theProtein1 = theProtein.replaceAll("-", ""); 251 Sequence protein = createProteinSequence(theProtein1, name); 252 GappedSequence protein1 = new SimpleGappedSequence(protein); 253 int pos = theProtein.indexOf('-', 0); 254 while(pos!=-1){ 255 protein1.addGapInView(pos+1); 256 pos = theProtein.indexOf('-', pos+1); 257 } 258 return protein1; 259 } 260 261 /** 262 * Return a new PROTEIN <span class="type">Sequence</span> for 263 * <span class="arg">protein</span>. 264 * 265 * @param protein a <span class="type">String</span> to parse into PROTEIN 266 * @param name a <span class="type">String</span> to use as the name 267 * @return a <span class="type">Sequence</span> created form 268 * <span class="arg">protein</span> 269 * @throws IllegalSymbolException if <span class="arg">protein</span> contains 270 * any non-PROTEIN characters 271 */ 272 public static Sequence createProteinSequence(String protein, String name) 273 throws IllegalSymbolException { 274 try { 275 return new SimpleSequenceFactory().createSequence( 276 createProtein(protein), 277 "", name, new SimpleAnnotation() 278 ); 279 } catch (BioException se) { 280 throw new BioError("Something has gone badly wrong with ProteinTAlpha", se); 281 } 282 } 283 284 /** 285 * Returns the <code>AtomicSymbol</code> for the amino acid Alanine 286 * (A) 287 */ 288 public static AtomicSymbol ala() { 289 return (AtomicSymbol) tokenToSymbol.get("A"); 290 } 291 292 /** 293 * Returns the <code>AtomicSymbol</code> for the amino acid 294 * Alanine 295 */ 296 public static AtomicSymbol a() { 297 return ala(); 298 } 299 300 /** 301 * Returns the <code>AtomicSymbol</code> for the amino acid 302 * Arginine (R) 303 */ 304 public static AtomicSymbol arg() { 305 return (AtomicSymbol) tokenToSymbol.get("R"); 306 } 307 308 /** 309 * Returns the <code>AtomicSymbol</code> for the amino acid 310 * Arginine 311 */ 312 public static AtomicSymbol r() { 313 return arg(); 314 } 315 316 /** 317 * Returns the <code>AtomicSymbol</code> for the amino acid 318 * Asparagine (N) 319 */ 320 public static AtomicSymbol asn() { 321 return (AtomicSymbol) tokenToSymbol.get("N"); 322 } 323 324 /** 325 * Returns the <code>AtomicSymbol</code> for the amino acid 326 * Asparagine 327 */ 328 public static AtomicSymbol n() { 329 return asn(); 330 } 331 332 /** 333 * Returns the <code>AtomicSymbol</code> for the amino acid 334 * Aspartic Acid (D) 335 */ 336 public static AtomicSymbol asp() { 337 return (AtomicSymbol) tokenToSymbol.get("D"); 338 } 339 340 /** 341 * Returns the <code>AtomicSymbol</code> for the amino acid 342 * Aspartic Acid 343 */ 344 public static AtomicSymbol d() { 345 return asp(); 346 } 347 348 /** 349 * Returns the <code>AtomicSymbol</code> for the amino acid 350 * Cysteine (C) 351 */ 352 public static AtomicSymbol cys() { 353 return (AtomicSymbol) tokenToSymbol.get("C"); 354 } 355 356 /** 357 * Returns the <code>AtomicSymbol</code> for the amino acid 358 * Cysteine 359 */ 360 public static AtomicSymbol c() { 361 return cys(); 362 } 363 364 /** 365 * Returns the <code>AtomicSymbol</code> for the amino acid 366 * Glutamine (Q) 367 */ 368 public static AtomicSymbol gln() { 369 return (AtomicSymbol) tokenToSymbol.get("Q"); 370 } 371 372 /** 373 * Returns the <code>AtomicSymbol</code> for the amino acid 374 * Glutamine 375 */ 376 public static AtomicSymbol q() { 377 return gln(); 378 } 379 380 /** 381 * Returns the <code>AtomicSymbol</code> for the amino acid 382 * Glutamic Acid (E) 383 */ 384 public static AtomicSymbol glu() { 385 return (AtomicSymbol) tokenToSymbol.get("E"); 386 } 387 388 /** 389 * Returns the <code>AtomicSymbol</code> for the amino acid 390 * Glutamic Acid 391 */ 392 public static AtomicSymbol e() { 393 return glu(); 394 } 395 396 /** 397 * Returns the <code>AtomicSymbol</code> for the amino acid 398 * Glycine (G) 399 */ 400 public static AtomicSymbol gly() { 401 return (AtomicSymbol) tokenToSymbol.get("G"); 402 } 403 404 /** 405 * Returns the <code>AtomicSymbol</code> for the amino acid 406 * Glycine 407 */ 408 public static AtomicSymbol g() { 409 return gly(); 410 } 411 412 /** 413 * Returns the <code>AtomicSymbol</code> for the amino acid 414 * Histidine (H) 415 */ 416 public static AtomicSymbol his() { 417 return (AtomicSymbol) tokenToSymbol.get("H"); 418 } 419 420 /** 421 * Returns the <code>AtomicSymbol</code> for the amino acid 422 * Histidine 423 */ 424 public static AtomicSymbol h() { 425 return his(); 426 } 427 428 /** 429 * Returns the <code>AtomicSymbol</code> for the amino acid 430 * Isoleucine (I) 431 */ 432 public static AtomicSymbol ile() { 433 return (AtomicSymbol) tokenToSymbol.get("I"); 434 } 435 436 /** 437 * Returns the <code>AtomicSymbol</code> for the amino acid 438 * Isoleucine 439 */ 440 public static AtomicSymbol i() { 441 return ile(); 442 } 443 444 /** 445 * Returns the <code>AtomicSymbol</code> for the amino acid 446 * Leucine (L) 447 */ 448 public static AtomicSymbol leu() { 449 return (AtomicSymbol) tokenToSymbol.get("L"); 450 } 451 452 /** 453 * Returns the <code>AtomicSymbol</code> for the amino acid 454 * Leucine 455 */ 456 public static AtomicSymbol l() { 457 return leu(); 458 } 459 460 /** 461 * Returns the <code>AtomicSymbol</code> for the amino acid 462 * Lysine (K) 463 */ 464 public static AtomicSymbol lys() { 465 return (AtomicSymbol) tokenToSymbol.get("K"); 466 } 467 468 /** 469 * Returns the <code>AtomicSymbol</code> for the amino acid 470 * Lysine 471 */ 472 public static AtomicSymbol k() { 473 return lys(); 474 } 475 476 /** 477 * Returns the <code>AtomicSymbol</code> for the amino acid 478 * Methionine (M) 479 */ 480 public static AtomicSymbol met() { 481 return (AtomicSymbol) tokenToSymbol.get("M"); 482 } 483 484 /** 485 * Returns the <code>AtomicSymbol</code> for the amino acid 486 * Methionine 487 */ 488 public static AtomicSymbol m() { 489 return met(); 490 } 491 492 /** 493 * Returns the <code>AtomicSymbol</code> for the amino acid 494 * Phenylalanine (F) 495 */ 496 public static AtomicSymbol phe() { 497 return (AtomicSymbol) tokenToSymbol.get("F"); 498 } 499 500 /** 501 * Returns the <code>AtomicSymbol</code> for the amino acid 502 * Phenylalanine 503 */ 504 public static AtomicSymbol f() { 505 return phe(); 506 } 507 508 /** 509 * Returns the <code>AtomicSymbol</code> for the amino acid 510 * Proline (P) 511 */ 512 public static AtomicSymbol pro() { 513 return (AtomicSymbol) tokenToSymbol.get("P"); 514 } 515 516 /** 517 * Returns the <code>AtomicSymbol</code> for the amino acid 518 * Proline 519 */ 520 public static AtomicSymbol p() { 521 return pro(); 522 } 523 524 /** 525 * Returns the <code>AtomicSymbol</code> for the amino acid 526 * Pyrrolysine (O) 527 */ 528 public static AtomicSymbol pyl() { 529 return (AtomicSymbol) tokenToSymbol.get("O"); 530 } 531 532 /** 533 * Returns the <code>AtomicSymbol</code> for the amino acid 534 * Pyrrolysine 535 */ 536 public static AtomicSymbol o() { 537 return pyl(); 538 } 539 540 /** 541 * Returns the <code>AtomicSymbol</code> for the amino acid 542 * Selenocysteine (U) 543 */ 544 public static AtomicSymbol sec() { 545 return (AtomicSymbol) tokenToSymbol.get("U"); 546 } 547 548 /** 549 * Returns the <code>AtomicSymbol</code> for the amino acid 550 * Selenocysteine 551 */ 552 public static AtomicSymbol u(){ 553 return sec(); 554 } 555 556 /** 557 * Returns the <code>AtomicSymbol</code> for the amino acid 558 * Serine (S) 559 */ 560 public static AtomicSymbol ser() { 561 return (AtomicSymbol) tokenToSymbol.get("S"); 562 } 563 564 /** 565 * Returns the <code>AtomicSymbol</code> for the amino acid 566 * Serine 567 */ 568 public static AtomicSymbol s() { 569 return ser(); 570 } 571 572 /** 573 * Returns the <code>AtomicSymbol</code> for the amino acid 574 * Threonine (T) 575 */ 576 public static AtomicSymbol thr() { 577 return (AtomicSymbol) tokenToSymbol.get("T"); 578 } 579 580 /** 581 * Returns the <code>AtomicSymbol</code> for the amino acid 582 * Threonine 583 */ 584 public static AtomicSymbol t() { 585 return thr(); 586 } 587 588 /** 589 * Returns the <code>AtomicSymbol</code> for the amino acid 590 * Tryptophan (W) 591 */ 592 public static AtomicSymbol trp() { 593 return (AtomicSymbol) tokenToSymbol.get("W"); 594 } 595 596 /** 597 * Returns the <code>AtomicSymbol</code> for the amino acid 598 * Tryptophan 599 */ 600 public static AtomicSymbol w() { 601 return trp(); 602 } 603 604 /** 605 * Returns the <code>AtomicSymbol</code> for the amino acid 606 * Tyrosine (Y) 607 */ 608 public static AtomicSymbol tyr() { 609 return (AtomicSymbol) tokenToSymbol.get("Y"); 610 } 611 612 /** 613 * Returns the <code>AtomicSymbol</code> for the amino acid 614 * Tyrosine 615 */ 616 public static AtomicSymbol y() { 617 return tyr(); 618 } 619 620 /** 621 * Returns the <code>AtomicSymbol</code> for the amino acid Valine (V) 622 */ 623 public static AtomicSymbol val() { 624 return (AtomicSymbol) tokenToSymbol.get("V"); 625 } 626 627 /** 628 * Returns the <code>AtomicSymbol</code> for the amino acid 629 * Valine 630 */ 631 public static AtomicSymbol v() { 632 return val(); 633 } 634 635 636 /** 637 * Returns the <code>AtomicSymbol</code> for the termination (*) 638 * placeholder 639 */ 640 public static AtomicSymbol ter() { 641 return (AtomicSymbol) tokenToSymbol.get("*"); 642 } 643 644}