001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 * Created on 12.03.2004 021 * @author Andreas Prlic 022 * 023 */ 024package org.biojava.nbio.structure; 025 026 027import org.biojava.nbio.structure.io.FileConvert; 028import org.biojava.nbio.structure.io.mmcif.ChemCompGroupFactory; 029import org.biojava.nbio.structure.io.mmcif.chem.PolymerType; 030import org.biojava.nbio.structure.io.mmcif.model.ChemComp; 031import org.biojava.nbio.core.exceptions.CompoundNotFoundException; 032import org.biojava.nbio.core.sequence.ProteinSequence; 033import org.biojava.nbio.core.sequence.compound.AminoAcidCompound; 034import org.biojava.nbio.core.sequence.template.Sequence; 035import org.slf4j.Logger; 036import org.slf4j.LoggerFactory; 037 038import java.util.*; 039 040 041/** 042 * A Chain in a PDB file. It contains several groups which can be of 043 * one of the types defined in the {@link GroupType} constants. 044 * 045 * @author Andreas Prlic 046 * @author Jules Jacobsen 047 * @since 1.4 048 */ 049public class ChainImpl implements Chain { 050 051 private final static Logger logger = LoggerFactory.getLogger(ChainImpl.class); 052 053 private static final long serialVersionUID = 1990171805277911840L; 054 055 /** 056 * The default chain identifier used to be an empty space 057 */ 058 private static final String DEFAULT_CHAIN_ID = "A"; 059 060 private String swissprot_id ; 061 private String authId; // the 'public' chain identifier as assigned by authors in PDB files 062 063 private List <Group> groups; 064 private List<Group> seqResGroups; 065 066 private EntityInfo entity; 067 private Structure parent; 068 069 private Map<String, Integer> pdbResnumMap; 070 private String asymId; // the 'internal' chain identifier as used in mmCIF files 071 072 073 private List<SeqMisMatch> seqMisMatches = null; 074 /** 075 * Constructs a ChainImpl object. 076 */ 077 public ChainImpl() { 078 super(); 079 080 authId = DEFAULT_CHAIN_ID; 081 groups = new ArrayList<>() ; 082 083 seqResGroups = new ArrayList<>(); 084 pdbResnumMap = new HashMap<>(); 085 asymId = null; 086 087 } 088 089 /** {@inheritDoc} 090 * 091 */ 092 @Override 093 public String getId() { 094 return asymId; 095 } 096 097 /** {@inheritDoc} 098 * 099 */ 100 @Override 101 public void setId(String asymId) { 102 this.asymId = asymId; 103 } 104 105 /** {@inheritDoc} 106 * 107 */ 108 @Override 109 public String getName() { return authId; } 110 111 /** {@inheritDoc} 112 * 113 */ 114 @Override 115 public void setName(String authId) { this.authId = authId; } 116 117 /** {@inheritDoc} 118 * 119 */ 120 @Override 121 @Deprecated 122 public void setParent(Structure parent) { 123 setStructure(parent); 124 } 125 126 /** {@inheritDoc} 127 * 128 */ 129 @Override 130 public void setStructure(Structure parent){ 131 this.parent = parent; 132 } 133 134 /** Returns the parent Structure of this chain. 135 * 136 * @return the parent Structure object 137 */ 138 @Override 139 public Structure getStructure() { 140 141 return parent; 142 } 143 144 145 /** Returns the parent Structure of this chain. 146 * 147 * @return the parent Structure object 148 * @deprecated use getStructure instead. 149 */ 150 @Override 151 @Deprecated 152 public Structure getParent() { 153 154 return getStructure(); 155 } 156 157 /** Returns an identical copy of this Chain . 158 * @return an identical copy of this Chain 159 */ 160 @Override 161 public Object clone() { 162 // go through all groups and add to new Chain. 163 ChainImpl n = new ChainImpl(); 164 // copy chain data: 165 166 n.setId(getId()); 167 n.setName(getName()); 168 n.setSwissprotId ( getSwissprotId()); 169 170 // NOTE the EntityInfo will be reset at the parent level (Structure) if cloning is happening from parent level 171 // here we don't deep-copy it and just keep the same reference, in case the cloning is happening at the Chain level only 172 n.setEntityInfo(this.entity); 173 174 175 for (Group group : groups) { 176 Group g = (Group) group.clone(); 177 n.addGroup(g); 178 g.setChain(n); 179 } 180 181 if (seqResGroups!=null){ 182 183 List<Group> tmpSeqRes = new ArrayList<>(); 184 185 // cloning seqres and atom groups is ugly, due to their 186 // nested relationship (some of the atoms can be in the seqres, but not all) 187 188 for (Group seqResGroup : seqResGroups) { 189 190 if (seqResGroup==null) { 191 tmpSeqRes.add(null); 192 continue; 193 } 194 195 int i = groups.indexOf(seqResGroup); 196 197 Group g ; 198 199 if (i!=-1) { 200 // group found in atom groups, we get the equivalent reference from the newly cloned atom groups 201 g = n.getAtomGroup(i); 202 } else { 203 // group not found in atom groups, we clone the seqres group 204 g = (Group) seqResGroup.clone(); 205 } 206 g.setChain(n); 207 tmpSeqRes.add(g); 208 } 209 210 n.setSeqResGroups(tmpSeqRes); 211 } 212 213 return n ; 214 } 215 216 /** {@inheritDoc} 217 * 218 */ 219 @Override 220 public void setEntityInfo(EntityInfo mol) { 221 this.entity = mol; 222 } 223 224 /** {@inheritDoc} 225 * 226 */ 227 @Override 228 public EntityInfo getEntityInfo() { 229 return this.entity; 230 } 231 232 /** set the Swissprot id of this chains . 233 * @param sp_id a String specifying the swissprot id value 234 * @see #getSwissprotId 235 */ 236 @Override 237 public void setSwissprotId(String sp_id){ 238 swissprot_id = sp_id ; 239 } 240 241 /** get the Swissprot id of this chains . 242 * @return a String representing the swissprot id value 243 * @see #setSwissprotId 244 */ 245 @Override 246 public String getSwissprotId() { 247 return swissprot_id ; 248 } 249 250 /** {@inheritDoc} 251 * 252 */ 253 @Override 254 public void addGroup(Group group) { 255 256 group.setChain(this); 257 258 // Set the altlocs chain as well 259 for(Group g : group.getAltLocs()) { 260 g.setChain(this); 261 } 262 263 groups.add(group); 264 265 // store the position internally for quick access of this group 266 267 String pdbResnum = null ; 268 ResidueNumber resNum = group.getResidueNumber(); 269 if ( resNum != null) 270 pdbResnum = resNum.toString(); 271 if ( pdbResnum != null) { 272 Integer pos = groups.size() - 1; 273 // ARGH sometimes numbering in PDB files is confusing. 274 // e.g. PDB: 1sfe 275 /* 276 * ATOM 620 N GLY 93 -24.320 -6.591 4.210 1.00 46.82 N 277 * ATOM 621 CA GLY 93 -24.960 -6.849 5.497 1.00 47.35 C 278 * ATOM 622 C GLY 93 -26.076 -5.873 5.804 1.00 47.24 C 279 * ATOM 623 O GLY 93 -26.382 -4.986 5.006 1.00 47.56 O 280 * and ... 281 * HETATM 1348 O HOH 92 -21.853 -16.886 19.138 1.00 66.92 O 282 * HETATM 1349 O HOH 93 -26.126 1.226 29.069 1.00 71.69 O 283 * HETATM 1350 O HOH 94 -22.250 -18.060 -6.401 1.00 61.97 O 284 */ 285 286 // this check is to give in this case the entry priority that is an AminoAcid / comes first... 287 // a good example of same residue number for 2 residues is 3th3, chain T, residue 201 (a LYS and a sugar BGC covalently attached to it) - JD 2016-03-09 288 if ( pdbResnumMap.containsKey(pdbResnum)) { 289 290 logger.warn("Adding residue {}({}) to chain {} but a residue with same residue number is already present: {}({}). Will add only the aminoacid residue (if any) to the lookup, lookups for that residue number won't work properly.", 291 pdbResnum, group.getPDBName(), getChainID(), groups.get(pdbResnumMap.get(pdbResnum)).getResidueNumber(), groups.get(pdbResnumMap.get(pdbResnum)).getPDBName()); 292 if ( group instanceof AminoAcid) 293 pdbResnumMap.put(pdbResnum,pos); 294 } else 295 pdbResnumMap.put(pdbResnum,pos); 296 } 297 298 } 299 300 301 /** 302 * {@inheritDoc} 303 */ 304 @Override 305 public Group getAtomGroup(int position) { 306 307 return groups.get(position); 308 } 309 310 /** 311 * {@inheritDoc} 312 */ 313 @Override 314 public List<Group> getAtomGroups(GroupType type){ 315 316 List<Group> tmp = new ArrayList<>() ; 317 for (Group g : groups) { 318 if (g.getType().equals(type)) { 319 tmp.add(g); 320 } 321 } 322 323 return tmp ; 324 } 325 326 327 /** {@inheritDoc} 328 * 329 */ 330 @Override 331 public List<Group> getAtomGroups(){ 332 return groups ; 333 } 334 335 /** {@inheritDoc} 336 * 337 */ 338 @Override 339 public void setAtomGroups(List<Group> groups){ 340 for (Group g:groups){ 341 g.setChain(this); 342 } 343 this.groups = groups; 344 } 345 346 @Override 347 public Group[] getGroupsByPDB(ResidueNumber start, ResidueNumber end, boolean ignoreMissing) 348 throws StructureException { 349 // Short-circut for include all groups 350 if(start == null && end == null) { 351 return groups.toArray(new Group[groups.size()]); 352 } 353 354 355 List<Group> retlst = new ArrayList<>(); 356 357 boolean adding, foundStart; 358 if( start == null ) { 359 // start with first group 360 adding = true; 361 foundStart = true; 362 } else { 363 adding = false; 364 foundStart = false; 365 } 366 367 368 for (Group g: groups){ 369 370 // Check for start 371 if (!adding && start.equalsPositional(g.getResidueNumber())) { 372 adding = true; 373 foundStart = true; 374 } 375 376 // Check if past start 377 if ( ignoreMissing && ! (foundStart && adding) ) { 378 ResidueNumber pos = g.getResidueNumber(); 379 380 if ( start != null && start.compareToPositional(pos) <= 0) { 381 foundStart = true; 382 adding = true; 383 } 384 } 385 386 if ( adding) 387 retlst.add(g); 388 389 // check for end 390 if ( end != null && end.equalsPositional(g.getResidueNumber())) { 391 if ( ! adding) 392 throw new StructureException("did not find start PDB residue number " + start + " in chain " + authId); 393 adding = false; 394 break; 395 } 396 // check if past end 397 if ( ignoreMissing && adding && end != null){ 398 399 ResidueNumber pos = g.getResidueNumber(); 400 if ( end.compareToPositional(pos) <= 0) { 401 adding = false; 402 break; 403 } 404 405 } 406 } 407 408 if ( ! foundStart){ 409 throw new StructureException("did not find start PDB residue number " + start + " in chain " + authId); 410 } 411 if ( end != null && adding && !ignoreMissing) { 412 throw new StructureException("did not find end PDB residue number " + end + " in chain " + authId); 413 } 414 415 416 //not checking if the end has been found in this case... 417 418 return retlst.toArray(new Group[retlst.size()] ); 419 } 420 421 422 /** 423 * {@inheritDoc} 424 * 425 */ 426 @Override 427 public Group getGroupByPDB(ResidueNumber resNum) throws StructureException { 428 String pdbresnum = resNum.toString(); 429 if ( pdbResnumMap.containsKey(pdbresnum)) { 430 Integer pos = pdbResnumMap.get(pdbresnum); 431 return groups.get(pos); 432 } else { 433 throw new StructureException("unknown PDB residue number " + pdbresnum + " in chain " + authId); 434 } 435 } 436 437 /** 438 * {@inheritDoc} 439 * 440 */ 441 @Override 442 public Group[] getGroupsByPDB(ResidueNumber start, ResidueNumber end) 443 throws StructureException { 444 return getGroupsByPDB(start, end, false); 445 } 446 447 448 449 /** 450 * {@inheritDoc} 451 */ 452 @Override 453 public int getSeqResLength() { 454 //new method returns the length of the sequence defined in the SEQRES records 455 return seqResGroups.size(); 456 } 457 458 /** 459 * {@inheritDoc} 460 */ 461 @Override 462 public void setChainID(String asymId) { this.asymId = asymId; } 463 464 465 /** 466 * {@inheritDoc} 467 */ 468 @Override 469 public String getChainID() { return this.asymId; } 470 471 472 473 /** String representation. 474 * @return String representation of the Chain 475 */ 476 @Override 477 public String toString(){ 478 String newline = System.getProperty("line.separator"); 479 StringBuilder str = new StringBuilder(); 480 str.append("Chain asymId:").append(getChainID()).append(" authId:").append(getName()).append(newline); 481 if ( entity != null ){ 482 if ( entity.getDescription() != null){ 483 str.append(entity.getDescription()).append(newline); 484 } 485 } 486 str.append("total SEQRES length: ").append(getSeqResGroups().size()).append(" total ATOM length:") 487 .append(getAtomLength()).append(" residues ").append(newline); 488 489 return str.toString() ; 490 491 } 492 493 /** 494 * {@inheritDoc} 495 */ 496 @Override 497 public Sequence<?> getBJSequence() { 498 499 String seq = getSeqResSequence(); 500 501 Sequence<AminoAcidCompound> s = null; 502 503 try { 504 s = new ProteinSequence(seq); 505 } catch (CompoundNotFoundException e) { 506 logger.error("Could not create sequence object from seqres sequence. Some unknown compound: {}",e.getMessage()); 507 } 508 509 //TODO: return a DNA sequence if the content is DNA... 510 return s; 511 512 } 513 514 /** 515 * {@inheritDoc} 516 */ 517 @Override 518 public String getAtomSequence(){ 519 520 521 List<Group> groups = getAtomGroups(); 522 StringBuilder sequence = new StringBuilder() ; 523 524 for ( Group g: groups){ 525 ChemComp cc = g.getChemComp(); 526 527 if ( PolymerType.PROTEIN_ONLY.contains(cc.getPolymerType()) || 528 PolymerType.POLYNUCLEOTIDE_ONLY.contains(cc.getPolymerType())){ 529 // an amino acid residue.. use for alignment 530 String oneLetter= ChemCompGroupFactory.getOneLetterCode(cc); 531 if ( oneLetter == null) 532 oneLetter = Character.toString(StructureTools.UNKNOWN_GROUP_LABEL); 533 sequence.append(oneLetter); 534 } 535 536 } 537 return sequence.toString(); 538 539 540 } 541 542 /** 543 * {@inheritDoc} 544 */ 545 @Override 546 public String getSeqResSequence(){ 547 548 StringBuilder str = new StringBuilder(); 549 for (Group g : seqResGroups) { 550 ChemComp cc = g.getChemComp(); 551 if ( cc == null) { 552 logger.warn("Could not load ChemComp for group: ", g); 553 str.append(StructureTools.UNKNOWN_GROUP_LABEL); 554 } else if ( PolymerType.PROTEIN_ONLY.contains(cc.getPolymerType()) || 555 PolymerType.POLYNUCLEOTIDE_ONLY.contains(cc.getPolymerType())){ 556 // an amino acid residue.. use for alignment 557 String oneLetter= ChemCompGroupFactory.getOneLetterCode(cc); 558 // AB oneLetter.length() should be one. e.g. in 1EMA it is 3 and this makes mapping residue to sequence impossible. 559 if ( oneLetter == null || oneLetter.isEmpty() || oneLetter.equals("?")) { 560 oneLetter = Character.toString(StructureTools.UNKNOWN_GROUP_LABEL); 561 } 562 str.append(oneLetter); 563 } else { 564 str.append(StructureTools.UNKNOWN_GROUP_LABEL); 565 } 566 } 567 return str.toString(); 568 } 569 570 /** 571 * Get the one letter sequence so that Sequence is guaranteed to 572 * be the same length as seqResGroups. 573 * Method related to https://github.com/biojava/biojava/issues/457 574 * @return a string of the sequence guaranteed to be the same length 575 * as seqResGroups. 576 */ 577 public String getSeqResOneLetterSeq(){ 578 579 StringBuilder str = new StringBuilder(); 580 for (Group g : seqResGroups) { 581 ChemComp cc = g.getChemComp(); 582 if ( cc == null) { 583 logger.warn("Could not load ChemComp for group: ", g); 584 str.append(StructureTools.UNKNOWN_GROUP_LABEL); 585 } else if ( PolymerType.PROTEIN_ONLY.contains(cc.getPolymerType()) || 586 PolymerType.POLYNUCLEOTIDE_ONLY.contains(cc.getPolymerType())){ 587 // an amino acid residue.. use for alignment 588 String oneLetter= ChemCompGroupFactory.getOneLetterCode(cc); 589 // AB oneLetter.length() should be one. e.g. in 1EMA it is 3 and this makes mapping residue to sequence impossible. 590 if ( oneLetter == null || oneLetter.isEmpty() || oneLetter.equals("?") || oneLetter.length()!=1) { 591 oneLetter = Character.toString(StructureTools.UNKNOWN_GROUP_LABEL); 592 } 593 str.append(oneLetter); 594 } else { 595 str.append(StructureTools.UNKNOWN_GROUP_LABEL); 596 } 597 } 598 return str.toString(); 599 } 600 601 602 /** 603 * {@inheritDoc} 604 */ 605 @Override 606 public Group getSeqResGroup(int position) { 607 608 return seqResGroups.get(position); 609 } 610 611 /** 612 * {@inheritDoc} 613 */ 614 @Override 615 public List<Group> getSeqResGroups(GroupType type) { 616 List<Group> tmp = new ArrayList<>() ; 617 for (Group g : seqResGroups) { 618 if (g.getType().equals(type)) { 619 tmp.add(g); 620 } 621 } 622 623 return tmp ; 624 } 625 626 /** {@inheritDoc} 627 * 628 */ 629 @Override 630 public List<Group> getSeqResGroups() { 631 return seqResGroups; 632 } 633 634 /** {@inheritDoc} 635 * 636 */ 637 @Override 638 public void setSeqResGroups(List<Group> groups){ 639 for (Group g: groups){ 640 g.setChain(this); 641 } 642 this.seqResGroups = groups; 643 } 644 645 646 /** {@inheritDoc} 647 * 648 */ 649 @Override 650 public int getAtomLength() { 651 652 return groups.size(); 653 } 654 655 /** {@inheritDoc} 656 * 657 */ 658 @Override 659 public List<Group> getAtomLigands(){ 660 List<Group> ligands = new ArrayList<>(); 661 662 for (Group g : groups) 663 if (!seqResGroups.contains(g) && !g.isWater()) 664 ligands.add(g); 665 666 return ligands; 667 } 668 669 @Override 670 public String getInternalChainID() { 671 return asymId; 672 } 673 674 @Override 675 public void setInternalChainID(String internalChainID) { 676 this.asymId = internalChainID; 677 678 } 679 680 @Override 681 public String toPDB() { 682 return FileConvert.toPDB(this); 683 } 684 685 @Override 686 public String toMMCIF() { 687 return FileConvert.toMMCIF(this, true); 688 } 689 690 @Override 691 public void setSeqMisMatches(List<SeqMisMatch> seqMisMatches) { 692 this.seqMisMatches = seqMisMatches; 693 } 694 695 @Override 696 public List<SeqMisMatch> getSeqMisMatches() { 697 return seqMisMatches; 698 } 699 700 @Override 701 public EntityType getEntityType() { 702 if (getEntityInfo()==null) return null; 703 return getEntityInfo().getType(); 704 } 705 706 @Override 707 public boolean isWaterOnly() { 708 for (Group g : getAtomGroups()) { 709 if (!g.isWater()) 710 return false; 711 } 712 return true; 713 } 714 715 @Override 716 public boolean isPureNonPolymer() { 717 for (Group g : getAtomGroups()) { 718 719 //ChemComp cc = g.getChemComp(); 720 721 if ( g.isPolymeric() && 722 !g.isHetAtomInFile() ) { 723 724 // important: the aminoacid or nucleotide residue can be in Atom records 725 726 return false; 727 } 728 729 } 730 return true; 731 } 732 733 @Override 734 public GroupType getPredominantGroupType(){ 735 736 double ratioResiduesToTotal = StructureTools.RATIO_RESIDUES_TO_TOTAL; 737 738 int sizeAminos = getAtomGroups(GroupType.AMINOACID).size(); 739 int sizeNucleotides = getAtomGroups(GroupType.NUCLEOTIDE).size(); 740 List<Group> hetAtoms = getAtomGroups(GroupType.HETATM); 741 int sizeHetatoms = hetAtoms.size(); 742 int sizeWaters = 0; 743 for (Group g : hetAtoms) { 744 if (g.isWater()) 745 sizeWaters++; 746 } 747 int sizeHetatomsWithoutWater = sizeHetatoms - sizeWaters; 748 749 int fullSize = sizeAminos + sizeNucleotides + sizeHetatomsWithoutWater; 750 751 if ((double) sizeAminos / (double) fullSize > ratioResiduesToTotal) 752 return GroupType.AMINOACID; 753 754 if ((double) sizeNucleotides / (double) fullSize > ratioResiduesToTotal) 755 return GroupType.NUCLEOTIDE; 756 757 if ((double) (sizeHetatomsWithoutWater) / (double) fullSize > ratioResiduesToTotal) 758 return GroupType.HETATM; 759 760 // finally if neither condition works, we try based on majority, but log 761 // it 762 GroupType max; 763 if (sizeNucleotides > sizeAminos) { 764 if (sizeNucleotides > sizeHetatomsWithoutWater) { 765 max = GroupType.NUCLEOTIDE; 766 } else { 767 max = GroupType.HETATM; 768 } 769 } else { 770 if (sizeAminos > sizeHetatomsWithoutWater) { 771 max = GroupType.AMINOACID; 772 } else { 773 max = GroupType.HETATM; 774 } 775 } 776 logger.debug( 777 "Ratio of residues to total for chain with asym_id {} is below {}. Assuming it is a {} chain. " 778 + "Counts: # aa residues: {}, # nuc residues: {}, # non-water het residues: {}, # waters: {}, " 779 + "ratio aa/total: {}, ratio nuc/total: {}", 780 getId(), ratioResiduesToTotal, max, sizeAminos, 781 sizeNucleotides, sizeHetatomsWithoutWater, sizeWaters, 782 (double) sizeAminos / (double) fullSize, 783 (double) sizeNucleotides / (double) fullSize); 784 785 return max; 786 } 787 788 @Override 789 public boolean isProtein() { 790 return getPredominantGroupType() == GroupType.AMINOACID; 791 } 792 793 @Override 794 public boolean isNucleicAcid() { 795 return getPredominantGroupType() == GroupType.NUCLEOTIDE; 796 } 797 798 799} 800