001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 * Created on Jan 4, 2006 021 * 022 */ 023package org.biojava.nbio.structure; 024 025import java.io.File; 026import java.io.FileInputStream; 027import java.io.IOException; 028import java.io.InputStream; 029import java.util.ArrayList; 030import java.util.HashMap; 031import java.util.HashSet; 032import java.util.Iterator; 033import java.util.LinkedHashSet; 034import java.util.List; 035import java.util.Map; 036import java.util.Set; 037import org.biojava.nbio.structure.align.util.AtomCache; 038import org.biojava.nbio.structure.contact.AtomContactSet; 039import org.biojava.nbio.structure.contact.Grid; 040import org.biojava.nbio.structure.io.FileParsingParameters; 041import org.biojava.nbio.structure.io.PDBFileParser; 042import org.biojava.nbio.structure.io.mmcif.chem.PolymerType; 043import org.biojava.nbio.structure.io.mmcif.chem.ResidueType; 044import org.biojava.nbio.structure.io.mmcif.model.ChemComp; 045import org.biojava.nbio.structure.io.util.FileDownloadUtils; 046import org.slf4j.Logger; 047import org.slf4j.LoggerFactory; 048 049/** 050 * A class that provides some tool methods. 051 * 052 * @author Andreas Prlic, Jules Jacobsen 053 * @since 1.0 054 */ 055public class StructureTools { 056 057 private static final Logger logger = LoggerFactory 058 .getLogger(StructureTools.class); 059 060 // Amino Acid backbone 061 /** 062 * The atom name of the backbone C-alpha atom. Note that this can be 063 * ambiguous depending on the context since Calcium atoms use the same name 064 * in PDB. 065 */ 066 public static final String CA_ATOM_NAME = "CA"; 067 068 /** 069 * The atom name for the backbone amide nitrogen 070 */ 071 public static final String N_ATOM_NAME = "N"; 072 073 /** 074 * The atom name for the backbone carbonyl 075 */ 076 public static final String C_ATOM_NAME = "C"; 077 078 /** 079 * The atom name for the backbone carbonyl oxygen 080 */ 081 public static final String O_ATOM_NAME = "O"; 082 083 /** 084 * The atom name of the side-chain C-beta atom 085 */ 086 public static final String CB_ATOM_NAME = "CB"; 087 088 // Nucleotide backbone 089 /** 090 * The atom name of the backbone C1' in RNA 091 */ 092 public static final String C1_ATOM_NAME = "C1'"; 093 /** 094 * The atom name of the backbone C2' in RNA 095 */ 096 public static final String C2_ATOM_NAME = "C2'"; 097 /** 098 * The atom name of the backbone C3' in RNA 099 */ 100 public static final String C3_ATOM_NAME = "C3'"; 101 /** 102 * The atom name of the backbone C4' in RNA 103 */ 104 public static final String C4_ATOM_NAME = "C4'"; 105 /** 106 * The atom name of the backbone O2' in RNA 107 */ 108 public static final String O2_ATOM_NAME = "O2'"; 109 /** 110 * The atom name of the backbone O3' in RNA 111 */ 112 public static final String O3_ATOM_NAME = "O3'"; 113 /** 114 * The atom name of the backbone O4' in RNA 115 */ 116 public static final String O4_ATOM_NAME = "O4'"; 117 /** 118 * The atom name of the backbone O4' in RNA 119 */ 120 public static final String O5_ATOM_NAME = "O5'"; 121 /** 122 * The atom name of the backbone O4' in RNA 123 */ 124 public static final String OP1_ATOM_NAME = "OP1"; 125 /** 126 * The atom name of the backbone O4' in RNA 127 */ 128 public static final String OP2_ATOM_NAME = "OP2"; 129 /** 130 * The atom name of the backbone phosphate in RNA 131 */ 132 public static final String P_ATOM_NAME = "P"; 133 134 /** 135 * The atom used as representative for nucleotides, equivalent to 136 * {@link #CA_ATOM_NAME} for proteins 137 */ 138 public static final String NUCLEOTIDE_REPRESENTATIVE = P_ATOM_NAME; 139 140 /** 141 * The character to use for unknown compounds in sequence strings 142 */ 143 public static final char UNKNOWN_GROUP_LABEL = 'X'; 144 145 /** 146 * Below this ratio of aminoacid/nucleotide residues to the sequence total, 147 * we use simple majority of aminoacid/nucleotide residues to decide the 148 * character of the chain (protein/nucleotide) 149 */ 150 public static final double RATIO_RESIDUES_TO_TOTAL = 0.95; 151 152 // there is a file format change in PDB 3.0 and nucleotides are being 153 // renamed 154 private static final Map<String, Character> nucleotides30; 155 private static final Map<String, Character> nucleotides23; 156 157 // amino acid 3 and 1 letter code definitions 158 private static final Map<String, Character> aminoAcids; 159 160 private static final Set<Element> hBondDonorAcceptors; 161 162 static { 163 nucleotides30 = new HashMap<String, Character>(); 164 nucleotides30.put("DA", 'A'); 165 nucleotides30.put("DC", 'C'); 166 nucleotides30.put("DG", 'G'); 167 nucleotides30.put("DT", 'T'); 168 nucleotides30.put("DI", 'I'); 169 nucleotides30.put("A", 'A'); 170 nucleotides30.put("G", 'G'); 171 nucleotides30.put("C", 'C'); 172 nucleotides30.put("U", 'U'); 173 nucleotides30.put("I", 'I'); 174 175 // the DNA linkers - the +C , +G, +A +T +U and +I have been replaced 176 // with these: 177 nucleotides30.put("TAF", UNKNOWN_GROUP_LABEL); // Fluorinated Thymine 178 nucleotides30.put("TC1", UNKNOWN_GROUP_LABEL); // Furanosyl 179 nucleotides30.put("TFE", UNKNOWN_GROUP_LABEL); // Fluorinated Thymine 180 nucleotides30.put("TFO", UNKNOWN_GROUP_LABEL); // Tenofovir (3' 181 // terminator) 182 nucleotides30.put("TGP", UNKNOWN_GROUP_LABEL); // Guanine variant 183 nucleotides30.put("THX", UNKNOWN_GROUP_LABEL); // 5' terminator 184 nucleotides30.put("TLC", UNKNOWN_GROUP_LABEL); // Thymine with dicyclic 185 // sugar 186 nucleotides30.put("TLN", UNKNOWN_GROUP_LABEL); // locked Thymine 187 nucleotides30.put("LCG", UNKNOWN_GROUP_LABEL); // locked Guanine 188 nucleotides30.put("TP1", UNKNOWN_GROUP_LABEL); // Thymine peptide 189 // nucleic acid, with 190 // added methyl 191 nucleotides30.put("CP1", UNKNOWN_GROUP_LABEL); // Cytidine peptide 192 // nucleic acid, with 193 // added methyl 194 nucleotides30.put("TPN", UNKNOWN_GROUP_LABEL); // Thymine peptide 195 // nucleic acid 196 nucleotides30.put("CPN", UNKNOWN_GROUP_LABEL); // Cytidine peptide 197 // nucleic acid 198 nucleotides30.put("GPN", UNKNOWN_GROUP_LABEL); // Guanine peptide 199 // nucleic acid 200 nucleotides30.put("APN", UNKNOWN_GROUP_LABEL); // Adenosine peptide 201 // nucleic acid 202 nucleotides30.put("TPC", UNKNOWN_GROUP_LABEL); // Thymine variant 203 204 // store nucleic acids (C, G, A, T, U, and I), and 205 // the modified versions of nucleic acids (+C, +G, +A, +T, +U, and +I), 206 // and 207 nucleotides23 = new HashMap<String, Character>(); 208 String[] names = { "C", "G", "A", "T", "U", "I", "+C", "+G", "+A", 209 "+T", "+U", "+I" }; 210 for (String n : names) { 211 nucleotides23.put(n, n.charAt(n.length() - 1)); 212 } 213 214 aminoAcids = new HashMap<String, Character>(); 215 aminoAcids.put("GLY", 'G'); 216 aminoAcids.put("ALA", 'A'); 217 aminoAcids.put("VAL", 'V'); 218 aminoAcids.put("LEU", 'L'); 219 aminoAcids.put("ILE", 'I'); 220 aminoAcids.put("PHE", 'F'); 221 aminoAcids.put("TYR", 'Y'); 222 aminoAcids.put("TRP", 'W'); 223 aminoAcids.put("PRO", 'P'); 224 aminoAcids.put("HIS", 'H'); 225 aminoAcids.put("LYS", 'K'); 226 aminoAcids.put("ARG", 'R'); 227 aminoAcids.put("SER", 'S'); 228 aminoAcids.put("THR", 'T'); 229 aminoAcids.put("GLU", 'E'); 230 aminoAcids.put("GLN", 'Q'); 231 aminoAcids.put("ASP", 'D'); 232 aminoAcids.put("ASN", 'N'); 233 aminoAcids.put("CYS", 'C'); 234 aminoAcids.put("MET", 'M'); 235 // MSE is only found as a molecular replacement for MET 236 aminoAcids.put("MSE", 'M'); 237 // 'non-standard', genetically encoded 238 // http://www.chem.qmul.ac.uk/iubmb/newsletter/1999/item3.html 239 // IUBMB recommended name is 'SEC' but the wwPDB currently use 'CSE' 240 // likewise 'PYL' (IUBMB) and 'PYH' (PDB) 241 aminoAcids.put("CSE", 'U'); 242 aminoAcids.put("SEC", 'U'); 243 aminoAcids.put("PYH", 'O'); 244 aminoAcids.put("PYL", 'O'); 245 246 hBondDonorAcceptors = new HashSet<Element>(); 247 hBondDonorAcceptors.add(Element.N); 248 hBondDonorAcceptors.add(Element.O); 249 hBondDonorAcceptors.add(Element.S); 250 251 } 252 253 /** 254 * Count how many Atoms are contained within a Structure object. 255 * 256 * @param s 257 * the structure object 258 * @return the number of Atoms in this Structure 259 */ 260 public static final int getNrAtoms(Structure s) { 261 262 int nrAtoms = 0; 263 264 Iterator<Group> iter = new GroupIterator(s); 265 266 while (iter.hasNext()) { 267 Group g = iter.next(); 268 nrAtoms += g.size(); 269 } 270 271 return nrAtoms; 272 } 273 274 /** 275 * Count how many groups are contained within a structure object. 276 * 277 * @param s 278 * the structure object 279 * @return the number of groups in the structure 280 */ 281 public static final int getNrGroups(Structure s) { 282 int nrGroups = 0; 283 284 List<Chain> chains = s.getChains(0); 285 for (Chain c : chains) { 286 nrGroups += c.getAtomLength(); 287 } 288 return nrGroups; 289 } 290 291 /** 292 * Returns an array of the requested Atoms from the Structure object. 293 * Iterates over all groups and checks if the requested atoms are in this 294 * group, no matter if this is a {@link AminoAcid} or {@link HetatomImpl} 295 * group. If the group does not contain all requested atoms then no atoms 296 * are added for that group. For structures with more than one model, only 297 * model 0 will be used. 298 * 299 * @param s 300 * the structure to get the atoms from 301 * 302 * @param atomNames 303 * contains the atom names to be used. 304 * @return an Atom[] array 305 */ 306 public static final Atom[] getAtomArray(Structure s, String[] atomNames) { 307 List<Chain> chains = s.getModel(0); 308 309 List<Atom> atoms = new ArrayList<Atom>(); 310 311 extractAtoms(atomNames, chains, atoms); 312 return atoms.toArray(new Atom[atoms.size()]); 313 314 } 315 316 /** 317 * Returns an array of the requested Atoms from the Structure object. In 318 * contrast to {@link #getAtomArray(Structure, String[])} this method 319 * iterates over all chains. Iterates over all chains and groups and checks 320 * if the requested atoms are in this group, no matter if this is a 321 * {@link AminoAcid} or {@link HetatomImpl} group. If the group does not 322 * contain all requested atoms then no atoms are added for that group. For 323 * structures with more than one model, only model 0 will be used. 324 * 325 * @param s 326 * the structure to get the atoms from 327 * 328 * @param atomNames 329 * contains the atom names to be used. 330 * @return an Atom[] array 331 */ 332 public static final Atom[] getAtomArrayAllModels(Structure s, 333 String[] atomNames) { 334 335 List<Atom> atoms = new ArrayList<Atom>(); 336 337 for (int i = 0; i < s.nrModels(); i++) { 338 List<Chain> chains = s.getModel(i); 339 extractAtoms(atomNames, chains, atoms); 340 } 341 return atoms.toArray(new Atom[atoms.size()]); 342 343 } 344 345 /** 346 * Convert all atoms of the structure (first model) into an Atom array 347 * 348 * @param s 349 * input structure 350 * @return all atom array 351 */ 352 public static final Atom[] getAllAtomArray(Structure s) { 353 List<Atom> atoms = new ArrayList<Atom>(); 354 355 AtomIterator iter = new AtomIterator(s); 356 while (iter.hasNext()) { 357 Atom a = iter.next(); 358 atoms.add(a); 359 } 360 return atoms.toArray(new Atom[atoms.size()]); 361 } 362 363 /** 364 * Returns and array of all atoms of the chain (first model), including 365 * Hydrogens (if present) and all HETATOMs. Waters are not included. 366 * 367 * @param c 368 * input chain 369 * @return all atom array 370 */ 371 public static final Atom[] getAllAtomArray(Chain c) { 372 List<Atom> atoms = new ArrayList<Atom>(); 373 374 for (Group g : c.getAtomGroups()) { 375 if (g.isWater()) 376 continue; 377 for (Atom a : g.getAtoms()) { 378 atoms.add(a); 379 } 380 } 381 return atoms.toArray(new Atom[atoms.size()]); 382 } 383 384 /** 385 * List of groups from the structure not included in ca (e.g. ligands). 386 * 387 * Unaligned groups are searched from all chains referenced in ca, as well 388 * as any chains in the first model of the structure from ca[0], if any. 389 * 390 * @param ca an array of atoms 391 * @return 392 */ 393 public static List<Group> getUnalignedGroups(Atom[] ca) { 394 Set<Chain> chains = new HashSet<Chain>(); 395 Set<Group> caGroups = new HashSet<Group>(); 396 397 // Create list of all chains in this structure 398 Structure s = null; 399 if (ca.length > 0) { 400 Group g = ca[0].getGroup(); 401 if (g != null) { 402 Chain c = g.getChain(); 403 if (c != null) { 404 s = c.getStructure(); 405 } 406 } 407 } 408 if (s != null) { 409 // Add all chains from the structure 410 for (Chain c : s.getChains(0)) { 411 chains.add(c); 412 } 413 } 414 415 // Add groups and chains from ca 416 for (Atom a : ca) { 417 Group g = a.getGroup(); 418 if (g != null) { 419 caGroups.add(g); 420 421 Chain c = g.getChain(); 422 if (c != null) { 423 chains.add(c); 424 } 425 } 426 } 427 428 // Iterate through all chains, finding groups not in ca 429 List<Group> unadded = new ArrayList<Group>(); 430 for (Chain c : chains) { 431 for (Group g : c.getAtomGroups()) { 432 if (!caGroups.contains(g)) { 433 unadded.add(g); 434 } 435 } 436 } 437 return unadded; 438 } 439 440 /** 441 * Returns and array of all non-Hydrogen atoms in the given Structure, 442 * optionally including HET atoms or not. Waters are not included. 443 * 444 * @param s 445 * @param hetAtoms 446 * if true HET atoms are included in array, if false they are not 447 * @return 448 */ 449 public static final Atom[] getAllNonHAtomArray(Structure s, boolean hetAtoms) { 450 List<Atom> atoms = new ArrayList<Atom>(); 451 452 AtomIterator iter = new AtomIterator(s); 453 while (iter.hasNext()) { 454 Atom a = iter.next(); 455 if (a.getElement() == Element.H) 456 continue; 457 458 Group g = a.getGroup(); 459 460 if (g.isWater()) 461 continue; 462 463 if (!hetAtoms && g.getType().equals(GroupType.HETATM)) 464 continue; 465 466 atoms.add(a); 467 } 468 return atoms.toArray(new Atom[atoms.size()]); 469 } 470 471 /** 472 * Returns and array of all non-Hydrogen atoms in the given Chain, 473 * optionally including HET atoms or not Waters are not included. 474 * 475 * @param c 476 * @param hetAtoms 477 * if true HET atoms are included in array, if false they are not 478 * @return 479 */ 480 public static final Atom[] getAllNonHAtomArray(Chain c, boolean hetAtoms) { 481 List<Atom> atoms = new ArrayList<Atom>(); 482 483 for (Group g : c.getAtomGroups()) { 484 if (g.isWater()) 485 continue; 486 for (Atom a : g.getAtoms()) { 487 488 if (a.getElement() == Element.H) 489 continue; 490 491 if (!hetAtoms && g.getType().equals(GroupType.HETATM)) 492 continue; 493 494 atoms.add(a); 495 } 496 } 497 return atoms.toArray(new Atom[atoms.size()]); 498 } 499 500 /** 501 * Adds to the given atoms list, all atoms of groups that contained all 502 * requested atomNames, i.e. if a group does not contain all of the 503 * requested atom names, its atoms won't be added. 504 * 505 * @param atomNames 506 * @param chains 507 * @param atoms 508 */ 509 private static void extractAtoms(String[] atomNames, List<Chain> chains, 510 List<Atom> atoms) { 511 512 for (Chain c : chains) { 513 514 for (Group g : c.getAtomGroups()) { 515 516 // a temp container for the atoms of this group 517 List<Atom> thisGroupAtoms = new ArrayList<Atom>(); 518 // flag to check if this group contains all the requested atoms. 519 boolean thisGroupAllAtoms = true; 520 for (String atomName : atomNames) { 521 Atom a = g.getAtom(atomName); 522 523 if (a == null) { 524 // this group does not have a required atom, skip it... 525 thisGroupAllAtoms = false; 526 break; 527 } 528 thisGroupAtoms.add(a); 529 } 530 if (thisGroupAllAtoms) { 531 // add the atoms of this group to the array. 532 for (Atom a : thisGroupAtoms) { 533 atoms.add(a); 534 } 535 } 536 537 } 538 } 539 } 540 541 /** 542 * Returns an array of the requested Atoms from the Chain object. Iterates 543 * over all groups and checks if the requested atoms are in this group, no 544 * matter if this is a AminoAcid or Hetatom group. If the group does not 545 * contain all requested atoms then no atoms are added for that group. 546 * 547 * @param c 548 * the Chain to get the atoms from 549 * 550 * @param atomNames 551 * contains the atom names to be used. 552 * @return an Atom[] array 553 */ 554 public static final Atom[] getAtomArray(Chain c, String[] atomNames) { 555 556 List<Atom> atoms = new ArrayList<Atom>(); 557 558 for (Group g : c.getAtomGroups()) { 559 560 // a temp container for the atoms of this group 561 List<Atom> thisGroupAtoms = new ArrayList<Atom>(); 562 // flag to check if this group contains all the requested atoms. 563 boolean thisGroupAllAtoms = true; 564 for (String atomName : atomNames) { 565 Atom a = g.getAtom(atomName); 566 if (a == null) { 567 logger.debug("Group " + g.getResidueNumber() + " (" 568 + g.getPDBName() 569 + ") does not have the required atom '" + atomName 570 + "'"); 571 // this group does not have a required atom, skip it... 572 thisGroupAllAtoms = false; 573 break; 574 } 575 thisGroupAtoms.add(a); 576 } 577 578 if (thisGroupAllAtoms) { 579 // add the atoms of this group to the array. 580 for (Atom a : thisGroupAtoms) { 581 atoms.add(a); 582 } 583 } 584 585 } 586 return atoms.toArray(new Atom[atoms.size()]); 587 588 } 589 590 /** 591 * Returns an Atom array of the C-alpha atoms. Any atom that is a carbon and 592 * has CA name will be returned. 593 * 594 * @param c 595 * the structure object 596 * @return an Atom[] array 597 * @see #getRepresentativeAtomArray(Chain) 598 */ 599 public static final Atom[] getAtomCAArray(Chain c) { 600 List<Atom> atoms = new ArrayList<Atom>(); 601 602 for (Group g : c.getAtomGroups()) { 603 if (g.hasAtom(CA_ATOM_NAME) 604 && g.getAtom(CA_ATOM_NAME).getElement() == Element.C) { 605 atoms.add(g.getAtom(CA_ATOM_NAME)); 606 } 607 } 608 609 return atoms.toArray(new Atom[atoms.size()]); 610 } 611 612 /** 613 * Gets a representative atom for each group that is part of the chain 614 * backbone. Note that modified aminoacids won't be returned as part of the 615 * backbone if the {@link org.biojava.nbio.structure.io.mmcif.ReducedChemCompProvider} was used to load the 616 * structure. 617 * 618 * For amino acids, the representative is a CA carbon. For nucleotides, the 619 * representative is the {@value #NUCLEOTIDE_REPRESENTATIVE}. Other group 620 * types will be ignored. 621 * 622 * @param c 623 * @return representative Atoms of the chain backbone 624 * @since Biojava 4.1.0 625 */ 626 public static final Atom[] getRepresentativeAtomArray(Chain c) { 627 List<Atom> atoms = new ArrayList<Atom>(); 628 629 for (Group g : c.getAtomGroups()) { 630 631 switch (g.getType()) { 632 case AMINOACID: 633 if (g.hasAtom(CA_ATOM_NAME) 634 && g.getAtom(CA_ATOM_NAME).getElement() == Element.C) { 635 atoms.add(g.getAtom(CA_ATOM_NAME)); 636 } 637 break; 638 case NUCLEOTIDE: 639 if (g.hasAtom(NUCLEOTIDE_REPRESENTATIVE)) { 640 atoms.add(g.getAtom(NUCLEOTIDE_REPRESENTATIVE)); 641 } 642 break; 643 default: 644 // don't add 645 } 646 } 647 648 return atoms.toArray(new Atom[atoms.size()]); 649 650 } 651 652 /** 653 * Provides an equivalent copy of Atoms in a new array. Clones everything, 654 * starting with parent groups and chains. The chain will only contain 655 * groups that are part of the input array. 656 * 657 * @param ca 658 * array of representative atoms, e.g. CA atoms 659 * @return Atom array 660 * @deprecated Use the better-named {@link #cloneAtomArray(Atom[])} instead 661 */ 662 @Deprecated 663 public static final Atom[] cloneCAArray(Atom[] ca) { 664 return cloneAtomArray(ca); 665 } 666 667 /** 668 * Provides an equivalent copy of Atoms in a new array. Clones everything, 669 * starting with parent groups and chains. The chain will only contain 670 * groups that are part of the input array. 671 * 672 * @param ca 673 * array of representative atoms, e.g. CA atoms 674 * @return Atom array 675 * @since Biojava 4.1.0 676 */ 677 public static final Atom[] cloneAtomArray(Atom[] ca) { 678 Atom[] newCA = new Atom[ca.length]; 679 680 List<Chain> model = new ArrayList<Chain>(); 681 int apos = -1; 682 for (Atom a : ca) { 683 apos++; 684 Group parentG = a.getGroup(); 685 Chain parentC = parentG.getChain(); 686 687 Chain newChain = null; 688 for (Chain c : model) { 689 if (c.getChainID().equals(parentC.getChainID())) { 690 newChain = c; 691 break; 692 } 693 } 694 if (newChain == null) { 695 newChain = new ChainImpl(); 696 newChain.setChainID(parentC.getChainID()); 697 model.add(newChain); 698 } 699 700 Group parentN = (Group) parentG.clone(); 701 702 newCA[apos] = parentN.getAtom(a.getName()); 703 try { 704 // if the group doesn't exist yet, this produces a StructureException 705 newChain.getGroupByPDB(parentN.getResidueNumber()); 706 } catch (StructureException e) { 707 // the group doesn't exist yet in the newChain, let's add it 708 newChain.addGroup(parentN); 709 } 710 711 } 712 return newCA; 713 } 714 715 /** 716 * Clone a set of representative Atoms, but returns the parent groups 717 * 718 * @param ca 719 * Atom array 720 * @return Group array 721 */ 722 public static Group[] cloneGroups(Atom[] ca) { 723 Group[] newGroup = new Group[ca.length]; 724 725 List<Chain> model = new ArrayList<Chain>(); 726 int apos = -1; 727 for (Atom a : ca) { 728 apos++; 729 Group parentG = a.getGroup(); 730 Chain parentC = parentG.getChain(); 731 732 Chain newChain = null; 733 for (Chain c : model) { 734 if (c.getChainID().equals(parentC.getChainID())) { 735 newChain = c; 736 break; 737 } 738 } 739 if (newChain == null) { 740 newChain = new ChainImpl(); 741 newChain.setChainID(parentC.getChainID()); 742 model.add(newChain); 743 } 744 745 Group ng = (Group) parentG.clone(); 746 newGroup[apos] = ng; 747 newChain.addGroup(ng); 748 } 749 return newGroup; 750 } 751 752 /** 753 * Utility method for working with circular permutations. Creates a 754 * duplicated and cloned set of Calpha atoms from the input array. 755 * 756 * @param ca2 757 * atom array 758 * @return cloned and duplicated set of input array 759 */ 760 public static Atom[] duplicateCA2(Atom[] ca2) { 761 // we don't want to rotate input atoms, do we? 762 Atom[] ca2clone = new Atom[ca2.length * 2]; 763 764 int pos = 0; 765 766 Chain c = null; 767 String prevChainId = ""; 768 for (Atom a : ca2) { 769 Group g = (Group) a.getGroup().clone(); // works because each group 770 // has only a single atom 771 772 if (c == null) { 773 c = new ChainImpl(); 774 Chain orig = a.getGroup().getChain(); 775 c.setChainID(orig.getChainID()); 776 } else { 777 Chain orig = a.getGroup().getChain(); 778 if (!orig.getChainID().equals(prevChainId)) { 779 c = new ChainImpl(); 780 c.setChainID(orig.getChainID()); 781 } 782 } 783 784 c.addGroup(g); 785 ca2clone[pos] = g.getAtom(a.getName()); 786 787 pos++; 788 } 789 790 // Duplicate ca2! 791 c = null; 792 prevChainId = ""; 793 for (Atom a : ca2) { 794 Group g = (Group) a.getGroup().clone(); 795 796 if (c == null) { 797 c = new ChainImpl(); 798 Chain orig = a.getGroup().getChain(); 799 c.setChainID(orig.getChainID()); 800 } else { 801 Chain orig = a.getGroup().getChain(); 802 if (!orig.getChainID().equals(prevChainId)) { 803 c = new ChainImpl(); 804 c.setChainID(orig.getChainID()); 805 } 806 } 807 808 c.addGroup(g); 809 ca2clone[pos] = g.getAtom(a.getName()); 810 811 pos++; 812 } 813 814 return ca2clone; 815 816 } 817 818 /** 819 * Return an Atom array of the C-alpha atoms. Any atom that is a carbon and 820 * has CA name will be returned. 821 * 822 * @param s 823 * the structure object 824 * @return an Atom[] array 825 * @see #getRepresentativeAtomArray(Structure) 826 */ 827 public static Atom[] getAtomCAArray(Structure s) { 828 829 List<Atom> atoms = new ArrayList<Atom>(); 830 831 for (Chain c : s.getChains()) { 832 for (Group g : c.getAtomGroups()) { 833 if (g.hasAtom(CA_ATOM_NAME) 834 && g.getAtom(CA_ATOM_NAME).getElement() == Element.C) { 835 atoms.add(g.getAtom(CA_ATOM_NAME)); 836 } 837 } 838 } 839 840 return atoms.toArray(new Atom[atoms.size()]); 841 } 842 843 /** 844 * Gets a representative atom for each group that is part of the chain 845 * backbone. Note that modified aminoacids won't be returned as part of the 846 * backbone if the {@link org.biojava.nbio.structure.io.mmcif.ReducedChemCompProvider} was used to load the 847 * structure. 848 * 849 * For amino acids, the representative is a CA carbon. For nucleotides, the 850 * representative is the {@value #NUCLEOTIDE_REPRESENTATIVE}. Other group 851 * types will be ignored. 852 * 853 * @param s 854 * Input structure 855 * @return representative Atoms of the structure backbone 856 * @since Biojava 4.1.0 857 */ 858 public static Atom[] getRepresentativeAtomArray(Structure s) { 859 860 List<Atom> atoms = new ArrayList<Atom>(); 861 862 for (Chain c : s.getChains()) { 863 Atom[] chainAtoms = getRepresentativeAtomArray(c); 864 for (Atom a : chainAtoms) { 865 atoms.add(a); 866 } 867 } 868 869 return atoms.toArray(new Atom[atoms.size()]); 870 } 871 872 /** 873 * Return an Atom array of the main chain atoms: CA, C, N, O Any group that 874 * contains those atoms will be included, be it a standard aminoacid or not 875 * 876 * @param s 877 * the structure object 878 * @return an Atom[] array 879 */ 880 public static Atom[] getBackboneAtomArray(Structure s) { 881 882 List<Atom> atoms = new ArrayList<Atom>(); 883 884 for (Chain c : s.getChains()) { 885 for (Group g : c.getAtomGroups()) { 886 if (g.hasAminoAtoms()) { 887 // this means we will only take atoms grom groups that have 888 // complete backbones 889 for (Atom a : g.getAtoms()) { 890 switch (g.getType()) { 891 case NUCLEOTIDE: 892 // Nucleotide backbone 893 if (a.getName().equals(C1_ATOM_NAME)) 894 atoms.add(a); 895 if (a.getName().equals(C2_ATOM_NAME)) 896 atoms.add(a); 897 if (a.getName().equals(C3_ATOM_NAME)) 898 atoms.add(a); 899 if (a.getName().equals(C4_ATOM_NAME)) 900 atoms.add(a); 901 if (a.getName().equals(O2_ATOM_NAME)) 902 atoms.add(a); 903 if (a.getName().equals(O3_ATOM_NAME)) 904 atoms.add(a); 905 if (a.getName().equals(O4_ATOM_NAME)) 906 atoms.add(a); 907 if (a.getName().equals(O5_ATOM_NAME)) 908 atoms.add(a); 909 if (a.getName().equals(OP1_ATOM_NAME)) 910 atoms.add(a); 911 if (a.getName().equals(OP2_ATOM_NAME)) 912 atoms.add(a); 913 if (a.getName().equals(P_ATOM_NAME)) 914 atoms.add(a); 915 // TODO Allow C4* names as well as C4'? -SB 3/2015 916 break; 917 case AMINOACID: 918 default: 919 // we do it this way instead of with g.getAtom() to 920 // be sure we always use the same order as original 921 if (a.getName().equals(CA_ATOM_NAME)) 922 atoms.add(a); 923 if (a.getName().equals(C_ATOM_NAME)) 924 atoms.add(a); 925 if (a.getName().equals(N_ATOM_NAME)) 926 atoms.add(a); 927 if (a.getName().equals(O_ATOM_NAME)) 928 atoms.add(a); 929 break; 930 } 931 } 932 } 933 } 934 935 } 936 937 return atoms.toArray(new Atom[atoms.size()]); 938 } 939 940 /** 941 * Convert three character amino acid codes into single character e.g. 942 * convert CYS to C. Valid 3-letter codes will be those of the standard 20 943 * amino acids plus MSE, CSE, SEC, PYH, PYL (see the {@link #aminoAcids} 944 * map) 945 * 946 * @return the 1 letter code, or null if the given 3 letter code does not 947 * correspond to an amino acid code 948 * @param groupCode3 949 * a three character amino acid representation String 950 * @see {@link #get1LetterCode(String)} 951 */ 952 public static final Character get1LetterCodeAmino(String groupCode3) { 953 return aminoAcids.get(groupCode3); 954 } 955 956 /** 957 * 958 * @param code3 959 * @return 960 * @deprecated Use {@link #get1LetterCodeAmino(String)} instead 961 */ 962 @Deprecated 963 public static final Character convert_3code_1code(String code3) { 964 return get1LetterCodeAmino(code3); 965 } 966 967 /** 968 * Convert a three letter amino acid or nucleotide code into a single 969 * character code. If the code does not correspond to an amino acid or 970 * nucleotide, returns {@link #UNKNOWN_GROUP_LABEL}. 971 * 972 * Returned null for nucleotides prior to version 4.0.1. 973 * 974 * @param groupCode3 975 * three letter representation 976 * @return The 1-letter abbreviation 977 */ 978 public static final Character get1LetterCode(String groupCode3) { 979 980 Character code1; 981 982 // is it a standard amino acid ? 983 code1 = get1LetterCodeAmino(groupCode3); 984 985 if (code1 == null) { 986 // hm groupCode3 is not standard 987 // perhaps it is a nucleotide? 988 groupCode3 = groupCode3.trim(); 989 if (isNucleotide(groupCode3)) { 990 code1 = nucleotides30.get(groupCode3); 991 if (code1 == null) { 992 code1 = nucleotides23.get(groupCode3); 993 } 994 if (code1 == null) { 995 code1 = UNKNOWN_GROUP_LABEL; 996 } 997 } else { 998 // does not seem to be so let's assume it is 999 // nonstandard aminoacid and label it "X" 1000 // logger.warning("unknown group name "+groupCode3 ); 1001 code1 = UNKNOWN_GROUP_LABEL; 1002 } 1003 } 1004 1005 return code1; 1006 1007 } 1008 1009 /** 1010 * Test if the three-letter code of an ATOM entry corresponds to a 1011 * nucleotide or to an aminoacid. 1012 * 1013 * @param groupCode3 1014 * 3-character code for a group. 1015 * 1016 */ 1017 public static final boolean isNucleotide(String groupCode3) { 1018 String code = groupCode3.trim(); 1019 return nucleotides30.containsKey(code) 1020 || nucleotides23.containsKey(code); 1021 } 1022 1023 /** 1024 * Reduce a structure to provide a smaller representation . Only takes the 1025 * first model of the structure. If chainId is provided only return a 1026 * structure containing that Chain ID. Converts lower case chain IDs to 1027 * upper case if structure does not contain a chain with that ID. 1028 * 1029 * @param s 1030 * @param chainId 1031 * @return Structure 1032 * @since 3.0 1033 * @deprecated Use {@link StructureIdentifier#reduce(Structure)} instead (v. 4.2.0) 1034 */ 1035 @Deprecated 1036 public static final Structure getReducedStructure(Structure s, 1037 String chainId) throws StructureException { 1038 // since we deal here with structure alignments, 1039 // only use Model 1... 1040 1041 Structure newS = new StructureImpl(); 1042 newS.setPDBCode(s.getPDBCode()); 1043 newS.setPDBHeader(s.getPDBHeader()); 1044 newS.setName(s.getName()); 1045 newS.setSSBonds(s.getSSBonds()); 1046 newS.setDBRefs(s.getDBRefs()); 1047 newS.setSites(s.getSites()); 1048 newS.setBiologicalAssembly(s.isBiologicalAssembly()); 1049 newS.setCompounds(s.getCompounds()); 1050 newS.setConnections(s.getConnections()); 1051 newS.setSSBonds(s.getSSBonds()); 1052 newS.setSites(s.getSites()); 1053 1054 if (chainId != null) 1055 chainId = chainId.trim(); 1056 1057 if (chainId == null || chainId.equals("")) { 1058 // only get model 0 1059 List<Chain> model0 = s.getModel(0); 1060 for (Chain c : model0) { 1061 newS.addChain(c); 1062 } 1063 return newS; 1064 1065 } 1066 1067 Chain c = null; 1068 try { 1069 c = s.getChainByPDB(chainId); 1070 } catch (StructureException e) { 1071 logger.warn(e.getMessage() + ". Chain id " + chainId 1072 + " did not match, trying upper case Chain id."); 1073 c = s.getChainByPDB(chainId.toUpperCase()); 1074 1075 } 1076 if (c != null) { 1077 newS.addChain(c); 1078 for (Compound comp : s.getCompounds()) { 1079 if (comp.getChainIds() != null 1080 && comp.getChainIds().contains(c.getChainID())) { 1081 // found matching compound. set description... 1082 newS.getPDBHeader().setDescription( 1083 "Chain " + c.getChainID() + " of " + s.getPDBCode() 1084 + " " + comp.getMolName()); 1085 } 1086 } 1087 } 1088 1089 return newS; 1090 } 1091 1092 /** 1093 * Reduce a structure to provide a smaller representation. Only takes the 1094 * first model of the structure. If chainNr >=0 only takes the chain at that 1095 * position into account. 1096 * 1097 * @param s 1098 * @param chainNr 1099 * can be -1 to request all chains of model 0, otherwise will 1100 * only add chain at this position 1101 * @return Structure object 1102 * @since 3.0 1103 * @deprecated Use {@link StructureIdentifier#reduce(Structure)} instead (v. 4.2.0) 1104 */ 1105 @Deprecated 1106 public static final Structure getReducedStructure(Structure s, int chainNr) 1107 throws StructureException { 1108 // since we deal here with structure alignments, 1109 // only use Model 1... 1110 1111 Structure newS = new StructureImpl(); 1112 newS.setPDBCode(s.getPDBCode()); 1113 newS.setPDBHeader(s.getPDBHeader()); 1114 newS.setName(s.getName()); 1115 newS.setSSBonds(s.getSSBonds()); 1116 newS.setDBRefs(s.getDBRefs()); 1117 newS.setSites(s.getSites()); 1118 newS.setBiologicalAssembly(s.isBiologicalAssembly()); 1119 newS.setCompounds(s.getCompounds()); 1120 newS.setConnections(s.getConnections()); 1121 newS.setSSBonds(s.getSSBonds()); 1122 newS.setSites(s.getSites()); 1123 newS.setCrystallographicInfo(s.getCrystallographicInfo()); 1124 newS.getPDBHeader().setDescription( 1125 "subset of " + s.getPDBCode() + " " 1126 + s.getPDBHeader().getDescription()); 1127 1128 if (chainNr < 0) { 1129 1130 // only get model 0 1131 List<Chain> model0 = s.getModel(0); 1132 for (Chain c : model0) { 1133 newS.addChain(c); 1134 } 1135 return newS; 1136 } 1137 Chain c = null; 1138 1139 c = s.getChain(0, chainNr); 1140 1141 newS.addChain(c); 1142 1143 return newS; 1144 } 1145 1146 /** 1147 * In addition to the functionality provided by 1148 * {@link #getReducedStructure(Structure, int)} and 1149 * {@link #getReducedStructure(Structure, String)}, also provides a way to 1150 * specify sub-regions of a structure with the following specification: 1151 * 1152 * <p> 1153 * <li>ranges can be surrounded by ( and ). (but will be removed).</li> 1154 * <li>ranges are specified as PDBresnum1 : PDBresnum2</li> 1155 * 1156 * <li>a list of ranges is separated by ,</li> 1157 * </p> 1158 * Example 1159 * 1160 * <pre> 1161 * 4GCR (A:1-83) 1162 * 1CDG (A:407-495,A:582-686) 1163 * 1CDG (A_407-495,A_582-686) 1164 * </pre> 1165 * 1166 * @param s 1167 * The full structure 1168 * @param ranges 1169 * A comma-separated list of ranges, optionally surrounded by 1170 * parentheses 1171 * @return Substructure of s specified by ranges 1172 * @throws IllegalArgumentException for malformed range strings 1173 * @throws StructureException for errors when reducing the Structure 1174 * @deprecated Use {@link StructureIdentifier} instead (4.2.0) 1175 */ 1176 @Deprecated 1177 public static final Structure getSubRanges(Structure s, String ranges ) 1178 throws StructureException 1179 { 1180 if (ranges == null || ranges.equals("")) 1181 throw new IllegalArgumentException("ranges can't be null or empty"); 1182 1183 ranges = ranges.trim(); 1184 1185 if (ranges.startsWith("(")) 1186 ranges = ranges.substring(1); 1187 if (ranges.endsWith(")")) { 1188 ranges = ranges.substring(0, ranges.length() - 1); 1189 } 1190 1191 // special case: '-' means 'everything' 1192 if (ranges.equals("-")) { 1193 return s; 1194 } 1195 1196 List<ResidueRange> resRanges = ResidueRange.parseMultiple(ranges); 1197 SubstructureIdentifier structId = new SubstructureIdentifier(null,resRanges); 1198 return structId.reduce(s); 1199 } 1200 1201 public static final String convertAtomsToSeq(Atom[] atoms) { 1202 1203 StringBuilder buf = new StringBuilder(); 1204 Group prevGroup = null; 1205 for (Atom a : atoms) { 1206 Group g = a.getGroup(); 1207 if (prevGroup != null) { 1208 if (prevGroup.equals(g)) { 1209 // we add each group only once. 1210 continue; 1211 } 1212 } 1213 String code3 = g.getPDBName(); 1214 Character code1 = get1LetterCodeAmino(code3); 1215 if (code1 == null) 1216 code1 = UNKNOWN_GROUP_LABEL; 1217 1218 buf.append(code1); 1219 1220 prevGroup = g; 1221 1222 } 1223 return buf.toString(); 1224 } 1225 1226 /** 1227 * Get a group represented by a ResidueNumber. 1228 * 1229 * @param struc 1230 * a {@link Structure} 1231 * @param pdbResNum 1232 * a {@link ResidueNumber} 1233 * @return a group in the structure that is represented by the pdbResNum. 1234 * @throws StructureException 1235 * if the group cannot be found. 1236 */ 1237 public static final Group getGroupByPDBResidueNumber(Structure struc, 1238 ResidueNumber pdbResNum) throws StructureException { 1239 if (struc == null || pdbResNum == null) { 1240 throw new IllegalArgumentException("Null argument(s)."); 1241 } 1242 1243 Chain chain = struc.findChain(pdbResNum.getChainId()); 1244 1245 return chain.getGroupByPDB(pdbResNum); 1246 } 1247 1248 /** 1249 * Returns the set of intra-chain contacts for the given chain for given 1250 * atom names, i.e. the contact map. Uses a geometric hashing algorithm that 1251 * speeds up the calculation without need of full distance matrix. The 1252 * parsing mode {@link FileParsingParameters#setAlignSeqRes(boolean)} needs 1253 * to be set to true for this to work. 1254 * 1255 * @param chain 1256 * @param atomNames 1257 * the array with atom names to be used. Beware: CA will do both 1258 * C-alphas an Calciums if null all non-H atoms of non-hetatoms 1259 * will be used 1260 * @param cutoff 1261 * @return 1262 */ 1263 public static AtomContactSet getAtomsInContact(Chain chain, 1264 String[] atomNames, double cutoff) { 1265 Grid grid = new Grid(cutoff); 1266 1267 Atom[] atoms = null; 1268 if (atomNames == null) { 1269 atoms = getAllNonHAtomArray(chain, false); 1270 } else { 1271 atoms = getAtomArray(chain, atomNames); 1272 } 1273 1274 grid.addAtoms(atoms); 1275 1276 return grid.getContacts(); 1277 } 1278 1279 /** 1280 * Returns the set of intra-chain contacts for the given chain for all non-H 1281 * atoms of non-hetatoms, i.e. the contact map. Uses a geometric hashing 1282 * algorithm that speeds up the calculation without need of full distance 1283 * matrix. The parsing mode 1284 * {@link FileParsingParameters#setAlignSeqRes(boolean)} needs to be set to 1285 * true for this to work. 1286 * 1287 * @param chain 1288 * @param cutoff 1289 * @return 1290 */ 1291 public static AtomContactSet getAtomsInContact(Chain chain, double cutoff) { 1292 return getAtomsInContact(chain, (String[]) null, cutoff); 1293 } 1294 1295 /** 1296 * Returns the set of intra-chain contacts for the given chain for C-alpha 1297 * atoms (including non-standard aminoacids appearing as HETATM groups), 1298 * i.e. the contact map. Uses a geometric hashing algorithm that speeds up 1299 * the calculation without need of full distance matrix. The parsing mode 1300 * {@link FileParsingParameters#setAlignSeqRes(boolean)} needs to be set to 1301 * true for this to work. 1302 * 1303 * @param chain 1304 * @param cutoff 1305 * @return 1306 * @see {@link #getRepresentativeAtomsInContact(Chain, double)} 1307 */ 1308 public static AtomContactSet getAtomsCAInContact(Chain chain, double cutoff) { 1309 Grid grid = new Grid(cutoff); 1310 1311 Atom[] atoms = getAtomCAArray(chain); 1312 1313 grid.addAtoms(atoms); 1314 1315 return grid.getContacts(); 1316 } 1317 1318 /** 1319 * Returns the set of intra-chain contacts for the given chain for C-alpha 1320 * or C3' atoms (including non-standard aminoacids appearing as HETATM 1321 * groups), i.e. the contact map. Uses a geometric hashing algorithm that 1322 * speeds up the calculation without need of full distance matrix. 1323 * 1324 * @param chain 1325 * @param cutoff 1326 * @return 1327 * @since Biojava 4.1.0 1328 */ 1329 public static AtomContactSet getRepresentativeAtomsInContact(Chain chain, 1330 double cutoff) { 1331 Grid grid = new Grid(cutoff); 1332 1333 Atom[] atoms = getRepresentativeAtomArray(chain); 1334 1335 grid.addAtoms(atoms); 1336 1337 return grid.getContacts(); 1338 } 1339 1340 /** 1341 * Returns the set of inter-chain contacts between the two given chains for 1342 * the given atom names. Uses a geometric hashing algorithm that speeds up 1343 * the calculation without need of full distance matrix. The parsing mode 1344 * {@link FileParsingParameters#setAlignSeqRes(boolean)} needs to be set to 1345 * true for this to work. 1346 * 1347 * @param chain1 1348 * @param chain2 1349 * @param atomNames 1350 * the array with atom names to be used. For Calphas use {"CA"}, 1351 * if null all non-H atoms will be used. Note HET atoms are 1352 * ignored unless this parameter is null. 1353 * @param cutoff 1354 * @param hetAtoms 1355 * if true HET atoms are included, if false they are not 1356 * @return 1357 */ 1358 public static AtomContactSet getAtomsInContact(Chain chain1, Chain chain2, 1359 String[] atomNames, double cutoff, boolean hetAtoms) { 1360 Grid grid = new Grid(cutoff); 1361 Atom[] atoms1 = null; 1362 Atom[] atoms2 = null; 1363 if (atomNames == null) { 1364 atoms1 = getAllNonHAtomArray(chain1, hetAtoms); 1365 atoms2 = getAllNonHAtomArray(chain2, hetAtoms); 1366 } else { 1367 atoms1 = getAtomArray(chain1, atomNames); 1368 atoms2 = getAtomArray(chain2, atomNames); 1369 } 1370 grid.addAtoms(atoms1, atoms2); 1371 1372 return grid.getContacts(); 1373 } 1374 1375 /** 1376 * Returns the set of inter-chain contacts between the two given chains for 1377 * all non-H atoms. Uses a geometric hashing algorithm that speeds up the 1378 * calculation without need of full distance matrix. The parsing mode 1379 * {@link FileParsingParameters#setAlignSeqRes(boolean)} needs to be set to 1380 * true for this to work. 1381 * 1382 * @param chain1 1383 * @param chain2 1384 * @param cutoff 1385 * @param hetAtoms 1386 * if true HET atoms are included, if false they are not 1387 * @return 1388 */ 1389 public static AtomContactSet getAtomsInContact(Chain chain1, Chain chain2, 1390 double cutoff, boolean hetAtoms) { 1391 return getAtomsInContact(chain1, chain2, null, cutoff, hetAtoms); 1392 } 1393 1394 /** 1395 * Finds Groups in {@code structure} that contain at least one Atom that is 1396 * within {@code radius} Angstroms of {@code centroid}. 1397 * 1398 * @param structure 1399 * The structure from which to find Groups 1400 * @param centroid 1401 * The centroid of the shell 1402 * @param excludeResidues 1403 * A list of ResidueNumbers to exclude 1404 * @param radius 1405 * The radius from {@code centroid}, in Angstroms 1406 * @param includeWater 1407 * Whether to include Groups whose <em>only</em> atoms are water 1408 * @param useAverageDistance 1409 * When set to true, distances are the arithmetic mean (1-norm) 1410 * of the distances of atoms that belong to the group and that 1411 * are within the shell; otherwise, distances are the minimum of 1412 * these values 1413 * @return A map of Groups within (or partially within) the shell, to their 1414 * distances in Angstroms 1415 */ 1416 public static Map<Group, Double> getGroupDistancesWithinShell( 1417 Structure structure, Atom centroid, 1418 Set<ResidueNumber> excludeResidues, double radius, 1419 boolean includeWater, boolean useAverageDistance) { 1420 1421 // for speed, we avoid calculating square roots 1422 radius = radius * radius; 1423 1424 Map<Group, Double> distances = new HashMap<Group, Double>(); 1425 1426 // we only need this if we're averaging distances 1427 // note that we can't use group.getAtoms().size() because some the 1428 // group's atoms be outside the shell 1429 Map<Group, Integer> atomCounts = new HashMap<Group, Integer>(); 1430 1431 for (Chain chain : structure.getChains()) { 1432 groupLoop: for (Group chainGroup : chain.getAtomGroups()) { 1433 1434 // exclude water 1435 if (!includeWater && chainGroup.isWater()) 1436 continue; 1437 1438 // check blacklist of residue numbers 1439 for (ResidueNumber rn : excludeResidues) { 1440 if (rn.equals(chainGroup.getResidueNumber())) 1441 continue groupLoop; 1442 } 1443 1444 for (Atom testAtom : chainGroup.getAtoms()) { 1445 1446 // use getDistanceFast as we are doing a lot of comparisons 1447 double dist = Calc.getDistanceFast(centroid, testAtom); 1448 1449 // if we're the shell 1450 if (dist <= radius) { 1451 if (!distances.containsKey(chainGroup)) 1452 distances.put(chainGroup, Double.POSITIVE_INFINITY); 1453 if (useAverageDistance) { 1454 // sum the distance; we'll divide by the total 1455 // number later 1456 // here, we CANNOT use fastDistance (distance 1457 // squared) because we want the arithmetic mean 1458 distances.put(chainGroup, distances.get(chainGroup) 1459 + Math.sqrt(dist)); 1460 if (!atomCounts.containsKey(chainGroup)) 1461 atomCounts.put(chainGroup, 0); 1462 atomCounts.put(chainGroup, 1463 atomCounts.get(chainGroup) + 1); 1464 } else { 1465 // take the minimum distance among all atoms of 1466 // chainGroup 1467 // note that we can't break here because we might 1468 // find a smaller distance 1469 if (dist < distances.get(chainGroup)) { 1470 distances.put(chainGroup, dist); 1471 } 1472 } 1473 } 1474 1475 } 1476 } 1477 } 1478 1479 if (useAverageDistance) { 1480 for (Map.Entry<Group, Double> entry : distances.entrySet()) { 1481 int count = atomCounts.get(entry.getKey()); 1482 distances.put(entry.getKey(), entry.getValue() / count); 1483 } 1484 } else { 1485 // in this case we used getDistanceFast 1486 for (Map.Entry<Group, Double> entry : distances.entrySet()) { 1487 distances.put(entry.getKey(), Math.sqrt(entry.getValue())); 1488 } 1489 } 1490 1491 return distances; 1492 1493 } 1494 1495 public static Set<Group> getGroupsWithinShell(Structure structure, 1496 Atom atom, Set<ResidueNumber> excludeResidues, double distance, 1497 boolean includeWater) { 1498 1499 // square the distance to use as a comparison against getDistanceFast 1500 // which returns the square of a distance. 1501 distance = distance * distance; 1502 1503 Set<Group> returnSet = new LinkedHashSet<Group>(); 1504 for (Chain chain : structure.getChains()) { 1505 groupLoop: for (Group chainGroup : chain.getAtomGroups()) { 1506 if (!includeWater && chainGroup.isWater()) 1507 continue; 1508 for (ResidueNumber rn : excludeResidues) { 1509 if (rn.equals(chainGroup.getResidueNumber())) 1510 continue groupLoop; 1511 } 1512 for (Atom atomB : chainGroup.getAtoms()) { 1513 1514 // use getDistanceFast as we are doing a lot of comparisons 1515 double dist = Calc.getDistanceFast(atom, atomB); 1516 if (dist <= distance) { 1517 returnSet.add(chainGroup); 1518 break; 1519 } 1520 1521 } 1522 } 1523 } 1524 return returnSet; 1525 } 1526 1527 /** 1528 * <p> 1529 * Returns a Set of Groups in a structure within the distance specified of a 1530 * given group. 1531 * </p> 1532 * <p> 1533 * Updated 18-Sep-2015 sroughley to return a Set so only a unique set of 1534 * Groups returned 1535 * 1536 * @param structure 1537 * The structure to work with 1538 * @param group 1539 * The 'query' group 1540 * @param distance 1541 * The cutoff distance 1542 * @param includeWater 1543 * Should water residues be included in the output? 1544 * @return {@link LinkedHashSet} of {@link Group}s within at least one atom 1545 * with {@code distance} of at least one atom in {@code group} 1546 */ 1547 public static Set<Group> getGroupsWithinShell(Structure structure, 1548 Group group, double distance, boolean includeWater) { 1549 1550 Set<Group> returnList = new LinkedHashSet<Group>(); 1551 1552 Set<ResidueNumber> excludeGroups = new HashSet<ResidueNumber>(); 1553 excludeGroups.add(group.getResidueNumber()); 1554 for (Atom atom : group.getAtoms()) { 1555 Set<Group> set = getGroupsWithinShell(structure, atom, 1556 excludeGroups, distance, includeWater); 1557 returnList.addAll(set); 1558 } 1559 1560 return returnList; 1561 } 1562 1563 /** 1564 * Remove all models from a Structure and keep only the first 1565 * 1566 * @param s 1567 * original Structure 1568 * @return a structure that contains only the first model 1569 * @since 3.0.5 1570 */ 1571 public static Structure removeModels(Structure s) { 1572 if (s.nrModels() == 1) 1573 return s; 1574 1575 Structure n = new StructureImpl(); 1576 // go through whole substructure and clone ... 1577 1578 // copy structure data 1579 1580 n.setPDBCode(s.getPDBCode()); 1581 n.setName(s.getName()); 1582 1583 // TODO: do deep copying of data! 1584 n.setPDBHeader(s.getPDBHeader()); 1585 n.setDBRefs(s.getDBRefs()); 1586 1587 n.setSites(s.getSites()); 1588 1589 n.setChains(s.getModel(0)); 1590 1591 return n; 1592 1593 } 1594 1595 /** 1596 * Removes all polymeric and solvent groups from a list of groups 1597 * 1598 */ 1599 public static List<Group> filterLigands(List<Group> allGroups) { 1600 1601 List<Group> groups = new ArrayList<Group>(); 1602 for (Group g : allGroups) { 1603 1604 ChemComp cc = g.getChemComp(); 1605 1606 if (ResidueType.lPeptideLinking.equals(cc.getResidueType()) 1607 || PolymerType.PROTEIN_ONLY.contains(cc.getPolymerType()) 1608 || PolymerType.POLYNUCLEOTIDE_ONLY.contains(cc 1609 .getPolymerType())) { 1610 continue; 1611 } 1612 if (!g.isWater()) { 1613 groups.add(g); 1614 } 1615 } 1616 1617 return groups; 1618 } 1619 1620 /** 1621 * Short version of {@link #getStructure(String, PDBFileParser, AtomCache)} 1622 * which creates new parsers when needed 1623 * 1624 * @param name 1625 * @return 1626 * @throws IOException 1627 * @throws StructureException 1628 */ 1629 public static Structure getStructure(String name) throws IOException, 1630 StructureException { 1631 return StructureTools.getStructure(name, null, null); 1632 } 1633 1634 /** 1635 * Flexibly get a structure from an input String. The intent of this method 1636 * is to allow any reasonable string which could refer to a structure to be 1637 * correctly parsed. The following are currently supported: 1638 * <ol> 1639 * <li>Filename (if name refers to an existing file) 1640 * <li>PDB ID 1641 * <li>SCOP domains 1642 * <li>PDP domains 1643 * <li>Residue ranges 1644 * <li>Other formats supported by AtomCache 1645 * </ol> 1646 * 1647 * @param name 1648 * Some reference to the protein structure 1649 * @param parser 1650 * A clean PDBFileParser to use if it is a file. If null, a 1651 * PDBFileParser will be instantiated if needed. 1652 * @param cache 1653 * An AtomCache to use if the structure can be fetched from the 1654 * PDB. If null, a AtomCache will be instantiated if needed. 1655 * @return A Structure object 1656 * @throws IOException 1657 * if name is an existing file, but doesn't parse correctly 1658 * @throws StructureException 1659 * if the format is unknown, or if AtomCache throws an 1660 * exception. 1661 */ 1662 public static Structure getStructure(String name, PDBFileParser parser, 1663 AtomCache cache) throws IOException, StructureException { 1664 File f = new File(FileDownloadUtils.expandUserHome(name)); 1665 if (f.exists()) { 1666 if (parser == null) { 1667 parser = new PDBFileParser(); 1668 } 1669 InputStream inStream = new FileInputStream(f); 1670 return parser.parsePDBFile(inStream); 1671 } else { 1672 if (cache == null) { 1673 cache = new AtomCache(); 1674 } 1675 return cache.getStructure(name); 1676 } 1677 } 1678 1679 /** 1680 * Tell whether given chain is a protein chain 1681 * 1682 * @param c 1683 * @return true if protein, false if nucleotide or ligand 1684 * @see #getPredominantGroupType(Chain) 1685 */ 1686 public static boolean isProtein(Chain c) { 1687 return getPredominantGroupType(c) == GroupType.AMINOACID; 1688 } 1689 1690 /** 1691 * Tell whether given chain is DNA or RNA 1692 * 1693 * @param c 1694 * @return true if nucleic acid, false if protein or ligand 1695 * @see #getPredominantGroupType(Chain) 1696 */ 1697 public static boolean isNucleicAcid(Chain c) { 1698 return getPredominantGroupType(c) == GroupType.NUCLEOTIDE; 1699 } 1700 1701 /** 1702 * Get the predominant {@link GroupType} for a given Chain, following these 1703 * rules: <li>if the ratio of number of residues of a certain 1704 * {@link GroupType} to total non-water residues is above the threshold 1705 * {@value #RATIO_RESIDUES_TO_TOTAL}, then that {@link GroupType} is 1706 * returned</li> <li>if there is no {@link GroupType} that is above the 1707 * threshold then the {@link GroupType} with most members is chosen, logging 1708 * it</li> 1709 * <p> 1710 * See also {@link ChemComp#getPolymerType()} and 1711 * {@link ChemComp#getResidueType()} which follow the PDB chemical component 1712 * dictionary and provide a much more accurate description of groups and 1713 * their linking. 1714 * </p> 1715 * 1716 * @param c 1717 * @return 1718 */ 1719 public static GroupType getPredominantGroupType(Chain c) { 1720 int sizeAminos = c.getAtomGroups(GroupType.AMINOACID).size(); 1721 int sizeNucleotides = c.getAtomGroups(GroupType.NUCLEOTIDE).size(); 1722 List<Group> hetAtoms = c.getAtomGroups(GroupType.HETATM); 1723 int sizeHetatoms = hetAtoms.size(); 1724 int sizeWaters = 0; 1725 for (Group g : hetAtoms) { 1726 if (g.isWater()) 1727 sizeWaters++; 1728 } 1729 int sizeHetatomsWithoutWater = sizeHetatoms - sizeWaters; 1730 1731 int fullSize = sizeAminos + sizeNucleotides + sizeHetatomsWithoutWater; 1732 1733 if ((double) sizeAminos / (double) fullSize > RATIO_RESIDUES_TO_TOTAL) 1734 return GroupType.AMINOACID; 1735 1736 if ((double) sizeNucleotides / (double) fullSize > RATIO_RESIDUES_TO_TOTAL) 1737 return GroupType.NUCLEOTIDE; 1738 1739 if ((double) (sizeHetatomsWithoutWater) / (double) fullSize > RATIO_RESIDUES_TO_TOTAL) 1740 return GroupType.HETATM; 1741 1742 // finally if neither condition works, we try based on majority, but log 1743 // it 1744 GroupType max; 1745 if (sizeNucleotides > sizeAminos) { 1746 if (sizeNucleotides > sizeHetatomsWithoutWater) { 1747 max = GroupType.NUCLEOTIDE; 1748 } else { 1749 max = GroupType.HETATM; 1750 } 1751 } else { 1752 if (sizeAminos > sizeHetatomsWithoutWater) { 1753 max = GroupType.AMINOACID; 1754 } else { 1755 max = GroupType.HETATM; 1756 } 1757 } 1758 logger.debug( 1759 "Ratio of residues to total for chain {} is below {}. Assuming it is a {} chain. " 1760 + "Counts: # aa residues: {}, # nuc residues: {}, # non-water het residues: {}, # waters: {}, " 1761 + "ratio aa/total: {}, ratio nuc/total: {}", 1762 c.getChainID(), RATIO_RESIDUES_TO_TOTAL, max, sizeAminos, 1763 sizeNucleotides, sizeHetatomsWithoutWater, sizeWaters, 1764 (double) sizeAminos / (double) fullSize, 1765 (double) sizeNucleotides / (double) fullSize); 1766 1767 return max; 1768 } 1769 1770 /** 1771 * Returns true if the given chain is composed of water molecules only 1772 * 1773 * @param c 1774 * @return 1775 */ 1776 public static boolean isChainWaterOnly(Chain c) { 1777 boolean waterOnly = true; 1778 for (Group g : c.getAtomGroups()) { 1779 if (!g.isWater()) 1780 waterOnly = false; 1781 break; 1782 } 1783 return waterOnly; 1784 } 1785 1786 /** 1787 * Returns true if the given chain is composed of non-polymeric groups only 1788 * 1789 * @param c 1790 * @return 1791 */ 1792 public static boolean isChainPureNonPolymer(Chain c) { 1793 1794 for (Group g : c.getAtomGroups()) { 1795 if (g.getType() == GroupType.AMINOACID 1796 || g.getType() == GroupType.NUCLEOTIDE) 1797 return false; 1798 1799 } 1800 return true; 1801 } 1802}