001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 * Created on 12.03.2004 021 * @author Andreas Prlic 022 * 023 */ 024package org.biojava.nbio.structure; 025 026 027import org.biojava.nbio.structure.chem.ChemComp; 028import org.biojava.nbio.structure.chem.ChemCompGroupFactory; 029import org.biojava.nbio.structure.chem.PolymerType; 030import org.biojava.nbio.structure.io.FileConvert; 031import org.biojava.nbio.core.exceptions.CompoundNotFoundException; 032import org.biojava.nbio.core.sequence.ProteinSequence; 033import org.biojava.nbio.core.sequence.compound.AminoAcidCompound; 034import org.biojava.nbio.core.sequence.template.Sequence; 035import org.slf4j.Logger; 036import org.slf4j.LoggerFactory; 037 038import java.util.*; 039 040 041/** 042 * A Chain in a PDB file. It contains several groups which can be of 043 * one of the types defined in the {@link GroupType} constants. 044 * 045 * @author Andreas Prlic 046 * @author Jules Jacobsen 047 * @since 1.4 048 */ 049public class ChainImpl implements Chain { 050 051 private final static Logger logger = LoggerFactory.getLogger(ChainImpl.class); 052 053 private static final long serialVersionUID = 1990171805277911840L; 054 055 /** 056 * The default chain identifier used to be an empty space 057 */ 058 private static final String DEFAULT_CHAIN_ID = "A"; 059 060 private String authId; // the 'public' chain identifier as assigned by authors in PDB files 061 private String asymId; // the 'internal' chain identifier as used in mmCIF files 062 063 private List<Group> groups; 064 private List<Group> seqResGroups; 065 066 private EntityInfo entity; 067 private Structure parent; 068 069 private final Map<String, Integer> pdbResnumMap; 070 071 private List<SeqMisMatch> seqMisMatches; 072 073 /** 074 * Constructs a ChainImpl object. 075 */ 076 public ChainImpl() { 077 super(); 078 079 authId = DEFAULT_CHAIN_ID; 080 groups = new ArrayList<>() ; 081 082 seqResGroups = new ArrayList<>(); 083 pdbResnumMap = new HashMap<>(); 084 asymId = null; 085 086 } 087 088 @Override 089 public String getId() { 090 return asymId; 091 } 092 093 @Override 094 public void setId(String asymId) { 095 this.asymId = asymId; 096 } 097 098 @Override 099 public String getName() { return authId; } 100 101 @Override 102 public void setName(String authId) { this.authId = authId; } 103 104 @Override 105 public void setStructure(Structure parent){ 106 this.parent = parent; 107 } 108 109 @Override 110 public Structure getStructure() { 111 112 return parent; 113 } 114 115 @Override 116 public Object clone() { 117 118 // go through all groups and add to new Chain. 119 ChainImpl n = new ChainImpl(); 120 // copy chain data: 121 122 n.setId(getId()); 123 n.setName(getName()); 124 125 // NOTE the EntityInfo will be reset at the parent level (Structure) if cloning is happening from parent level 126 // here we don't deep-copy it and just keep the same reference, in case the cloning is happening at the Chain level only 127 n.setEntityInfo(this.entity); 128 129 130 for (Group group : groups) { 131 Group g = (Group) group.clone(); 132 n.addGroup(g); 133 g.setChain(n); 134 } 135 136 if (seqResGroups!=null){ 137 138 List<Group> tmpSeqRes = new ArrayList<>(); 139 140 // cloning seqres and atom groups is ugly, due to their 141 // nested relationship (some of the atoms can be in the seqres, but not all) 142 143 for (Group seqResGroup : seqResGroups) { 144 145 if (seqResGroup==null) { 146 tmpSeqRes.add(null); 147 continue; 148 } 149 150 int i = groups.indexOf(seqResGroup); 151 152 Group g ; 153 154 if (i!=-1) { 155 // group found in atom groups, we get the equivalent reference from the newly cloned atom groups 156 g = n.getAtomGroup(i); 157 } else { 158 // group not found in atom groups, we clone the seqres group 159 g = (Group) seqResGroup.clone(); 160 } 161 g.setChain(n); 162 tmpSeqRes.add(g); 163 } 164 165 n.setSeqResGroups(tmpSeqRes); 166 } 167 168 return n ; 169 } 170 171 @Override 172 public void setEntityInfo(EntityInfo mol) { 173 this.entity = mol; 174 } 175 176 @Override 177 public EntityInfo getEntityInfo() { 178 return this.entity; 179 } 180 181 @Override 182 public void addGroup(Group group) { 183 184 group.setChain(this); 185 186 // Set the altlocs chain as well 187 for(Group g : group.getAltLocs()) { 188 g.setChain(this); 189 } 190 191 groups.add(group); 192 193 // store the position internally for quick access of this group 194 195 String pdbResnum = null ; 196 ResidueNumber resNum = group.getResidueNumber(); 197 if ( resNum != null) 198 pdbResnum = resNum.toString(); 199 if ( pdbResnum != null) { 200 Integer pos = groups.size() - 1; 201 // ARGH sometimes numbering in PDB files is confusing. 202 // e.g. PDB: 1sfe 203 /* 204 * ATOM 620 N GLY 93 -24.320 -6.591 4.210 1.00 46.82 N 205 * ATOM 621 CA GLY 93 -24.960 -6.849 5.497 1.00 47.35 C 206 * ATOM 622 C GLY 93 -26.076 -5.873 5.804 1.00 47.24 C 207 * ATOM 623 O GLY 93 -26.382 -4.986 5.006 1.00 47.56 O 208 * and ... 209 * HETATM 1348 O HOH 92 -21.853 -16.886 19.138 1.00 66.92 O 210 * HETATM 1349 O HOH 93 -26.126 1.226 29.069 1.00 71.69 O 211 * HETATM 1350 O HOH 94 -22.250 -18.060 -6.401 1.00 61.97 O 212 */ 213 214 // this check is to give in this case the entry priority that is an AminoAcid / comes first... 215 // a good example of same residue number for 2 residues is 3th3, chain T, residue 201 (a LYS and a sugar BGC covalently attached to it) - JD 2016-03-09 216 if ( pdbResnumMap.containsKey(pdbResnum)) { 217 218 logger.warn("Adding residue {}({}) to chain {} but a residue with same residue number is already present: {}({}). Will add only the aminoacid residue (if any) to the lookup, lookups for that residue number won't work properly.", 219 pdbResnum, group.getPDBName(), getId(), groups.get(pdbResnumMap.get(pdbResnum)).getResidueNumber(), groups.get(pdbResnumMap.get(pdbResnum)).getPDBName()); 220 if ( group instanceof AminoAcid) 221 pdbResnumMap.put(pdbResnum,pos); 222 } else 223 pdbResnumMap.put(pdbResnum,pos); 224 } 225 226 } 227 228 @Override 229 public Group getAtomGroup(int position) { 230 231 return groups.get(position); 232 } 233 234 @Override 235 public List<Group> getAtomGroups(GroupType type){ 236 237 List<Group> tmp = new ArrayList<>() ; 238 for (Group g : groups) { 239 if (g.getType().equals(type)) { 240 tmp.add(g); 241 } 242 } 243 244 return tmp ; 245 } 246 247 @Override 248 public List<Group> getAtomGroups(){ 249 return groups ; 250 } 251 252 @Override 253 public void setAtomGroups(List<Group> groups){ 254 for (Group g:groups){ 255 g.setChain(this); 256 } 257 this.groups = groups; 258 } 259 260 @Override 261 public Group[] getGroupsByPDB(ResidueNumber start, ResidueNumber end, boolean ignoreMissing) 262 throws StructureException { 263 // Short-circut for include all groups 264 if(start == null && end == null) { 265 return groups.toArray(new Group[0]); 266 } 267 268 List<Group> retlst = new ArrayList<>(); 269 270 boolean adding, foundStart; 271 if( start == null ) { 272 // start with first group 273 adding = true; 274 foundStart = true; 275 } else { 276 adding = false; 277 foundStart = false; 278 } 279 280 for (Group g: groups){ 281 282 // Check for start 283 if (!adding && start.equalsPositional(g.getResidueNumber())) { 284 adding = true; 285 foundStart = true; 286 } 287 288 // Check if past start 289 if ( ignoreMissing && ! (foundStart && adding) ) { 290 ResidueNumber pos = g.getResidueNumber(); 291 292 if ( start != null && start.compareToPositional(pos) <= 0) { 293 foundStart = true; 294 adding = true; 295 } 296 } 297 298 if ( adding) 299 retlst.add(g); 300 301 // check for end 302 if ( end != null && end.equalsPositional(g.getResidueNumber())) { 303 if ( ! adding) 304 throw new StructureException("did not find start PDB residue number " + start + " in chain " + authId); 305 adding = false; 306 break; 307 } 308 // check if past end 309 if ( ignoreMissing && adding && end != null){ 310 311 ResidueNumber pos = g.getResidueNumber(); 312 if ( end.compareToPositional(pos) <= 0) { 313 adding = false; 314 break; 315 } 316 317 } 318 } 319 320 if ( ! foundStart){ 321 throw new StructureException("did not find start PDB residue number " + start + " in chain " + authId); 322 } 323 if ( end != null && adding && !ignoreMissing) { 324 throw new StructureException("did not find end PDB residue number " + end + " in chain " + authId); 325 } 326 327 //not checking if the end has been found in this case... 328 329 return retlst.toArray(new Group[0]); 330 } 331 332 @Override 333 public Group getGroupByPDB(ResidueNumber resNum) throws StructureException { 334 String pdbresnum = resNum.toString(); 335 if ( pdbResnumMap.containsKey(pdbresnum)) { 336 Integer pos = pdbResnumMap.get(pdbresnum); 337 return groups.get(pos); 338 } else { 339 throw new StructureException("unknown PDB residue number " + pdbresnum + " in chain " + authId); 340 } 341 } 342 343 @Override 344 public Group[] getGroupsByPDB(ResidueNumber start, ResidueNumber end) 345 throws StructureException { 346 return getGroupsByPDB(start, end, false); 347 } 348 349 @Override 350 public int getSeqResLength() { 351 //new method returns the length of the sequence defined in the SEQRES records 352 return seqResGroups.size(); 353 } 354 355 @Override 356 public String toString(){ 357 String newline = System.getProperty("line.separator"); 358 StringBuilder str = new StringBuilder(); 359 str.append("Chain asymId:").append(getId()).append(" authId:").append(getName()).append(newline); 360 if ( entity != null ){ 361 if ( entity.getDescription() != null){ 362 str.append(entity.getDescription()).append(newline); 363 } 364 } 365 str.append("total SEQRES length: ").append(getSeqResGroups().size()).append(" total ATOM length:") 366 .append(getAtomLength()).append(" residues ").append(newline); 367 368 return str.toString() ; 369 } 370 371 @Override 372 public Sequence<?> getBJSequence() { 373 374 String seq = getSeqResSequence(); 375 376 Sequence<AminoAcidCompound> s = null; 377 378 try { 379 s = new ProteinSequence(seq); 380 } catch (CompoundNotFoundException e) { 381 logger.error("Could not create sequence object from seqres sequence. Some unknown compound: {}",e.getMessage()); 382 } 383 384 //TODO: return a DNA sequence if the content is DNA... 385 return s; 386 } 387 388 @Override 389 public String getAtomSequence(){ 390 391 List<Group> groups = getAtomGroups(); 392 StringBuilder sequence = new StringBuilder() ; 393 394 for ( Group g: groups){ 395 ChemComp cc = g.getChemComp(); 396 397 if ( PolymerType.PROTEIN_ONLY.contains(cc.getPolymerType()) || 398 PolymerType.POLYNUCLEOTIDE_ONLY.contains(cc.getPolymerType())){ 399 // an amino acid residue.. use for alignment 400 String oneLetter= ChemCompGroupFactory.getOneLetterCode(cc); 401 if ( oneLetter == null) 402 oneLetter = Character.toString(StructureTools.UNKNOWN_GROUP_LABEL); 403 sequence.append(oneLetter); 404 } 405 406 } 407 return sequence.toString(); 408 } 409 410 @Override 411 public String getSeqResSequence(){ 412 413 StringBuilder str = new StringBuilder(); 414 for (Group g : seqResGroups) { 415 ChemComp cc = g.getChemComp(); 416 if ( cc == null) { 417 logger.warn("Could not load ChemComp for group: {}", g); 418 str.append(StructureTools.UNKNOWN_GROUP_LABEL); 419 } else if ( PolymerType.PROTEIN_ONLY.contains(cc.getPolymerType()) || 420 PolymerType.POLYNUCLEOTIDE_ONLY.contains(cc.getPolymerType())){ 421 // an amino acid residue.. use for alignment 422 String oneLetter= ChemCompGroupFactory.getOneLetterCode(cc); 423 // AB oneLetter.length() should be one. e.g. in 1EMA it is 3 and this makes mapping residue to sequence impossible. 424 if ( oneLetter == null || oneLetter.isEmpty() || "?".equals(oneLetter)) { 425 oneLetter = Character.toString(StructureTools.UNKNOWN_GROUP_LABEL); 426 } 427 str.append(oneLetter); 428 } else { 429 str.append(StructureTools.UNKNOWN_GROUP_LABEL); 430 } 431 } 432 return str.toString(); 433 } 434 435 /** 436 * Get the one letter sequence so that Sequence is guaranteed to 437 * be the same length as seqResGroups. 438 * Method related to https://github.com/biojava/biojava/issues/457 439 * @return a string of the sequence guaranteed to be the same length 440 * as seqResGroups. 441 */ 442 public String getSeqResOneLetterSeq(){ 443 444 StringBuilder str = new StringBuilder(); 445 for (Group g : seqResGroups) { 446 ChemComp cc = g.getChemComp(); 447 if ( cc == null) { 448 logger.warn("Could not load ChemComp for group: {}", g); 449 str.append(StructureTools.UNKNOWN_GROUP_LABEL); 450 } else if ( PolymerType.PROTEIN_ONLY.contains(cc.getPolymerType()) || 451 PolymerType.POLYNUCLEOTIDE_ONLY.contains(cc.getPolymerType())){ 452 // an amino acid residue.. use for alignment 453 String oneLetter= ChemCompGroupFactory.getOneLetterCode(cc); 454 // AB oneLetter.length() should be one. e.g. in 1EMA it is 3 and this makes mapping residue to sequence impossible. 455 if ( oneLetter == null || oneLetter.isEmpty() || "?".equals(oneLetter) || oneLetter.length()!=1) { 456 oneLetter = Character.toString(StructureTools.UNKNOWN_GROUP_LABEL); 457 } 458 str.append(oneLetter); 459 } else { 460 str.append(StructureTools.UNKNOWN_GROUP_LABEL); 461 } 462 } 463 return str.toString(); 464 } 465 466 @Override 467 public Group getSeqResGroup(int position) { 468 return seqResGroups.get(position); 469 } 470 471 @Override 472 public List<Group> getSeqResGroups(GroupType type) { 473 List<Group> tmp = new ArrayList<>() ; 474 for (Group g : seqResGroups) { 475 if (g.getType().equals(type)) { 476 tmp.add(g); 477 } 478 } 479 480 return tmp ; 481 } 482 483 @Override 484 public List<Group> getSeqResGroups() { 485 return seqResGroups; 486 } 487 488 @Override 489 public void setSeqResGroups(List<Group> groups){ 490 for (Group g: groups){ 491 if (g != null) { 492 g.setChain(this); 493 } 494 } 495 this.seqResGroups = groups; 496 } 497 498 @Override 499 public int getAtomLength() { 500 501 return groups.size(); 502 } 503 504 @Override 505 public String toPDB() { 506 return FileConvert.toPDB(this); 507 } 508 509 @Override 510 public String toMMCIF() { 511 return FileConvert.toMMCIF(this); 512 } 513 514 @Override 515 public void setSeqMisMatches(List<SeqMisMatch> seqMisMatches) { 516 this.seqMisMatches = seqMisMatches; 517 } 518 519 @Override 520 public List<SeqMisMatch> getSeqMisMatches() { 521 return seqMisMatches; 522 } 523 524 @Override 525 public EntityType getEntityType() { 526 if (getEntityInfo()==null) return null; 527 return getEntityInfo().getType(); 528 } 529 530 @Override 531 public boolean isWaterOnly() { 532 for (Group g : getAtomGroups()) { 533 if (!g.isWater()) 534 return false; 535 } 536 return true; 537 } 538 539 @Override 540 public boolean isPureNonPolymer() { 541 for (Group g : getAtomGroups()) { 542 543 //ChemComp cc = g.getChemComp(); 544 545 if ( g.isPolymeric() && 546 !g.isHetAtomInFile() ) { 547 548 // important: the aminoacid or nucleotide residue can be in Atom records 549 550 return false; 551 } 552 553 } 554 return true; 555 } 556 557 @Override 558 public GroupType getPredominantGroupType(){ 559 560 double ratioResiduesToTotal = StructureTools.RATIO_RESIDUES_TO_TOTAL; 561 562 int sizeAminos = getAtomGroups(GroupType.AMINOACID).size(); 563 int sizeNucleotides = getAtomGroups(GroupType.NUCLEOTIDE).size(); 564 List<Group> hetAtoms = getAtomGroups(GroupType.HETATM); 565 int sizeHetatoms = hetAtoms.size(); 566 int sizeWaters = 0; 567 for (Group g : hetAtoms) { 568 if (g.isWater()) 569 sizeWaters++; 570 } 571 int sizeHetatomsWithoutWater = sizeHetatoms - sizeWaters; 572 573 int fullSize = sizeAminos + sizeNucleotides + sizeHetatomsWithoutWater; 574 575 if ((double) sizeAminos / (double) fullSize > ratioResiduesToTotal) 576 return GroupType.AMINOACID; 577 578 if ((double) sizeNucleotides / (double) fullSize > ratioResiduesToTotal) 579 return GroupType.NUCLEOTIDE; 580 581 if ((double) (sizeHetatomsWithoutWater) / (double) fullSize > ratioResiduesToTotal) 582 return GroupType.HETATM; 583 584 // finally if neither condition works, we try based on majority, but log 585 // it 586 GroupType max; 587 if (sizeNucleotides > sizeAminos) { 588 if (sizeNucleotides > sizeHetatomsWithoutWater) { 589 max = GroupType.NUCLEOTIDE; 590 } else { 591 max = GroupType.HETATM; 592 } 593 } else { 594 if (sizeAminos > sizeHetatomsWithoutWater) { 595 max = GroupType.AMINOACID; 596 } else { 597 max = GroupType.HETATM; 598 } 599 } 600 logger.debug("Ratio of residues to total for chain with asym_id {} is below {}. Assuming it is a {} chain. Counts: # aa residues: {}, # nuc residues: {}, # non-water het residues: {}, # waters: {}, ratio aa/total: {}, ratio nuc/total: {}{}{}{}{}", getId(), ratioResiduesToTotal, max, sizeAminos, sizeNucleotides, sizeHetatomsWithoutWater, sizeWaters, (double) sizeAminos, (double) fullSize, (double) sizeNucleotides, (double) fullSize); 601 602 return max; 603 } 604 605 @Override 606 public boolean isProtein() { 607 return getPredominantGroupType() == GroupType.AMINOACID; 608 } 609 610 @Override 611 public boolean isNucleicAcid() { 612 return getPredominantGroupType() == GroupType.NUCLEOTIDE; 613 } 614 615 616} 617