001package org.biojava.nbio.structure.io.cif; 002 003import java.time.LocalDate; 004import java.time.ZoneId; 005import java.time.format.DateTimeFormatter; 006import java.time.format.DateTimeFormatterBuilder; 007import java.util.ArrayList; 008import java.util.Date; 009import java.util.HashMap; 010import java.util.LinkedHashMap; 011import java.util.List; 012import java.util.Locale; 013import java.util.Map; 014import java.util.NoSuchElementException; 015import java.util.Optional; 016import java.util.OptionalInt; 017import java.util.stream.IntStream; 018 019import javax.vecmath.Matrix4d; 020 021import org.biojava.nbio.structure.AminoAcid; 022import org.biojava.nbio.structure.AminoAcidImpl; 023import org.biojava.nbio.structure.Atom; 024import org.biojava.nbio.structure.AtomImpl; 025import org.biojava.nbio.structure.Chain; 026import org.biojava.nbio.structure.ChainImpl; 027import org.biojava.nbio.structure.DBRef; 028import org.biojava.nbio.structure.Element; 029import org.biojava.nbio.structure.EntityInfo; 030import org.biojava.nbio.structure.EntityType; 031import org.biojava.nbio.structure.Group; 032import org.biojava.nbio.structure.GroupType; 033import org.biojava.nbio.structure.HetatomImpl; 034import org.biojava.nbio.structure.NucleotideImpl; 035import org.biojava.nbio.structure.PDBCrystallographicInfo; 036import org.biojava.nbio.structure.PDBHeader; 037import org.biojava.nbio.structure.PdbId; 038import org.biojava.nbio.structure.ResidueNumber; 039import org.biojava.nbio.structure.SeqMisMatch; 040import org.biojava.nbio.structure.SeqMisMatchImpl; 041import org.biojava.nbio.structure.Site; 042import org.biojava.nbio.structure.Structure; 043import org.biojava.nbio.structure.StructureException; 044import org.biojava.nbio.structure.StructureImpl; 045import org.biojava.nbio.structure.StructureTools; 046import org.biojava.nbio.structure.chem.ChemCompGroupFactory; 047import org.biojava.nbio.structure.io.BondMaker; 048import org.biojava.nbio.structure.io.ChargeAdder; 
049import org.biojava.nbio.structure.io.EntityFinder; 050import org.biojava.nbio.structure.io.FileParsingParameters; 051import org.biojava.nbio.structure.io.SeqRes2AtomAligner; 052import org.biojava.nbio.structure.quaternary.BioAssemblyInfo; 053import org.biojava.nbio.structure.quaternary.BiologicalAssemblyBuilder; 054import org.biojava.nbio.structure.quaternary.BiologicalAssemblyTransformation; 055import org.biojava.nbio.structure.xtal.CrystalCell; 056import org.biojava.nbio.structure.xtal.SpaceGroup; 057import org.biojava.nbio.structure.xtal.SymoplibParser; 058import org.rcsb.cif.model.FloatColumn; 059import org.rcsb.cif.model.IntColumn; 060import org.rcsb.cif.model.StrColumn; 061import org.rcsb.cif.model.ValueKind; 062import org.rcsb.cif.schema.mm.AtomSite; 063import org.rcsb.cif.schema.mm.AtomSites; 064import org.rcsb.cif.schema.mm.AuditAuthor; 065import org.rcsb.cif.schema.mm.Cell; 066import org.rcsb.cif.schema.mm.ChemComp; 067import org.rcsb.cif.schema.mm.ChemCompBond; 068import org.rcsb.cif.schema.mm.DatabasePDBRemark; 069import org.rcsb.cif.schema.mm.DatabasePDBRev; 070import org.rcsb.cif.schema.mm.DatabasePDBRevRecord; 071import org.rcsb.cif.schema.mm.Em3dReconstruction; 072import org.rcsb.cif.schema.mm.Entity; 073import org.rcsb.cif.schema.mm.EntityPoly; 074import org.rcsb.cif.schema.mm.EntityPolySeq; 075import org.rcsb.cif.schema.mm.EntitySrcGen; 076import org.rcsb.cif.schema.mm.EntitySrcNat; 077import org.rcsb.cif.schema.mm.Exptl; 078import org.rcsb.cif.schema.mm.PdbxAuditRevisionHistory; 079import org.rcsb.cif.schema.mm.PdbxChemCompIdentifier; 080import org.rcsb.cif.schema.mm.PdbxDatabaseStatus; 081import org.rcsb.cif.schema.mm.PdbxEntityBranchDescriptor; 082import org.rcsb.cif.schema.mm.PdbxEntitySrcSyn; 083import org.rcsb.cif.schema.mm.PdbxMolecule; 084import org.rcsb.cif.schema.mm.PdbxMoleculeFeatures; 085import org.rcsb.cif.schema.mm.PdbxNonpolyScheme; 086import org.rcsb.cif.schema.mm.PdbxReferenceEntityLink; 087import 
org.rcsb.cif.schema.mm.PdbxReferenceEntityList; 088import org.rcsb.cif.schema.mm.PdbxReferenceEntityPolyLink; 089import org.rcsb.cif.schema.mm.PdbxStructAssembly; 090import org.rcsb.cif.schema.mm.PdbxStructAssemblyGen; 091import org.rcsb.cif.schema.mm.PdbxStructModResidue; 092import org.rcsb.cif.schema.mm.PdbxStructOperList; 093import org.rcsb.cif.schema.mm.Refine; 094import org.rcsb.cif.schema.mm.Struct; 095import org.rcsb.cif.schema.mm.StructAsym; 096import org.rcsb.cif.schema.mm.StructConf; 097import org.rcsb.cif.schema.mm.StructConn; 098import org.rcsb.cif.schema.mm.StructConnType; 099import org.rcsb.cif.schema.mm.StructKeywords; 100import org.rcsb.cif.schema.mm.StructNcsOper; 101import org.rcsb.cif.schema.mm.StructRef; 102import org.rcsb.cif.schema.mm.StructRefSeq; 103import org.rcsb.cif.schema.mm.StructRefSeqDif; 104import org.rcsb.cif.schema.mm.StructSheetRange; 105import org.rcsb.cif.schema.mm.StructSite; 106import org.rcsb.cif.schema.mm.StructSiteGen; 107import org.rcsb.cif.schema.mm.Symmetry; 108import org.slf4j.Logger; 109import org.slf4j.LoggerFactory; 110 111/** 112 * An implementation of a CifFileConsumer for BioJava. Will process the information provided by a CifFile instance and 113 * use it to build up a {@link Structure} object. 
114 * @author Sebastian Bittrich 115 * @since 6.0.0 116 */ 117public class CifStructureConsumerImpl implements CifStructureConsumer { 118 private static final Logger logger = LoggerFactory.getLogger(CifStructureConsumerImpl.class); 119 private static final DateTimeFormatter DATE_FORMAT = new DateTimeFormatterBuilder() 120 .parseCaseInsensitive() 121 .appendPattern("yyyy-MM-dd") 122 .toFormatter(Locale.US); 123 124 private Structure structure; 125 private Chain currentChain; 126 private Group currentGroup; 127 private List<List<Chain>> allModels; 128 private List<Chain> currentModel; 129 private PDBHeader pdbHeader; 130 private String currentNmrModelNumber; 131 private Em3dReconstruction em3dReconstruction; 132 private List<Chain> entityChains; 133 134 private Entity entity; 135 private EntityPoly entityPoly; 136 private EntitySrcGen entitySrcGen; 137 private EntitySrcNat entitySrcNat; 138 private PdbxEntitySrcSyn entitySrcSyn; 139 private List<Chain> seqResChains; 140 private PdbxStructAssembly structAssembly; 141 private PdbxStructAssemblyGen structAssemblyGen; 142 private StructAsym structAsym; 143 private StructConn structConn; 144 private StructNcsOper structNcsOper; 145 private PdbxStructOperList structOpers; 146 private StructRef structRef; 147 private StructRefSeqDif structRefSeqDif; 148 private StructSiteGen structSiteGen; 149 150 private Map<String, String> asymId2entityId; 151 private Map<String, String> asymId2authorId; 152 private Matrix4d parsedScaleMatrix; 153 154 private final FileParsingParameters params; 155 156 public CifStructureConsumerImpl(FileParsingParameters params) { 157 this.params = params; 158 } 159 160 @Override 161 public void prepare() { 162 this.structure = new StructureImpl(); 163 this.pdbHeader = new PDBHeader(); 164 structure.setPDBHeader(pdbHeader); 165 166 this.allModels = new ArrayList<>(); 167 this.currentModel = new ArrayList<>(); 168 169 this.seqResChains = new ArrayList<>(); 170 this.asymId2entityId = new HashMap<>(); 171 
this.asymId2authorId = new HashMap<>(); 172 173 this.entityChains = new ArrayList<>(); 174 } 175 176 @Override 177 public void consumeAtomSite(AtomSite atomSite) { 178 if (params.isHeaderOnly()) { 179 return; 180 } 181 182 StrColumn labelAsymId = atomSite.getLabelAsymId(); 183 StrColumn authAsymId = atomSite.getAuthAsymId(); 184 185 StrColumn groupPDB = atomSite.getGroupPDB(); 186 IntColumn authSeqId = atomSite.getAuthSeqId(); 187 188 StrColumn labelCompId = atomSite.getLabelCompId(); 189 190 IntColumn id = atomSite.getId(); 191 StrColumn labelAtomId = atomSite.getLabelAtomId(); 192 193 FloatColumn cartnX = atomSite.getCartnX(); 194 FloatColumn cartnY = atomSite.getCartnY(); 195 FloatColumn cartnZ = atomSite.getCartnZ(); 196 197 FloatColumn occupancy = atomSite.getOccupancy(); 198 FloatColumn bIsoOrEquiv = atomSite.getBIsoOrEquiv(); 199 200 StrColumn labelAltId = atomSite.getLabelAltId(); 201 StrColumn typeSymbol = atomSite.getTypeSymbol(); 202 203 StrColumn pdbxPDBInsCode = atomSite.getPdbxPDBInsCode(); 204 IntColumn labelSeqId = atomSite.getLabelSeqId(); 205 IntColumn pdbx_pdb_model_num = atomSite.getPdbxPDBModelNum(); 206 207 for (int atomIndex = 0; atomIndex < atomSite.getRowCount(); atomIndex++) { 208 boolean startOfNewChain = false; 209 Character oneLetterCode = StructureTools.get1LetterCodeAmino(labelCompId.get(atomIndex)); 210 211 boolean isHetAtmInFile = false; 212 if (!"ATOM".equals(groupPDB.get(atomIndex))) { 213 if (oneLetterCode != null && oneLetterCode.equals(StructureTools.UNKNOWN_GROUP_LABEL)) { 214 oneLetterCode = null; 215 } 216 217 isHetAtmInFile = true; 218 } 219 220 String insCodeString = pdbxPDBInsCode.isDefined()? pdbxPDBInsCode.get(atomIndex) : null; 221 222 Character insCode = null; 223 if (insCodeString != null && !insCodeString.isEmpty() && !"?".equals(insCodeString)) { 224 insCode = insCodeString.charAt(0); 225 } 226 227 // non polymer chains (ligands and small molecules) will have a label_seq_id set to '.' 
228 long seqId = labelSeqId.get(atomIndex); 229 230 String nmrModelNumber = pdbx_pdb_model_num.getStringData(atomIndex); 231 232 if (currentNmrModelNumber == null) { 233 currentNmrModelNumber = nmrModelNumber; 234 } 235 if (!currentNmrModelNumber.equals(nmrModelNumber)) { 236 currentNmrModelNumber = nmrModelNumber; 237 238 if (currentChain != null) { 239 currentChain.addGroup(currentGroup); 240 currentGroup.trimToSize(); 241 } 242 243 allModels.add(currentModel); 244 currentModel = new ArrayList<>(); 245 currentChain = null; 246 currentGroup = null; 247 } 248 249 String asymId = labelAsymId.get(atomIndex); 250 String authId = authAsymId.isDefined()? authAsymId.get(atomIndex) : asymId; 251 252 if (currentChain == null) { 253 currentChain = new ChainImpl(); 254 currentChain.setName(authId); 255 currentChain.setId(asymId); 256 currentModel.add(currentChain); 257 startOfNewChain = true; 258 } 259 260 if (!asymId.equals(currentChain.getId())) { 261 startOfNewChain = true; 262 263 currentChain.addGroup(currentGroup); 264 265 Optional<Chain> testChain = currentModel.stream() 266 .filter(chain -> chain.getId().equals(asymId)) 267 .findFirst(); 268 269 if (testChain.isPresent()) { 270 currentChain = testChain.get(); 271 } else { 272 currentChain = new ChainImpl(); 273 currentChain.setName(authId); 274 currentChain.setId(asymId); 275 } 276 277 if (!currentModel.contains(currentChain)) { 278 currentModel.add(currentChain); 279 } 280 } 281 282 int authSeqIdInt = authSeqId.isDefined()? 
authSeqId.get(atomIndex) : (int)seqId; 283 284 ResidueNumber residueNumber = new ResidueNumber(authId, authSeqIdInt, insCode); 285 286 String recordName = groupPDB.get(atomIndex); 287 String compId = labelCompId.get(atomIndex); 288 if (currentGroup == null) { 289 currentGroup = createGroup(recordName, oneLetterCode, compId, seqId); 290 currentGroup.setResidueNumber(residueNumber); 291 currentGroup.setPDBName(compId); 292 currentGroup.setHetAtomInFile(isHetAtmInFile); 293 } 294 295 Group altGroup = null; 296 String altLocation = labelAltId.isDefined()? labelAltId.get(atomIndex) : null; 297 298 if (startOfNewChain) { 299 currentGroup = createGroup(recordName, oneLetterCode, compId, seqId); 300 currentGroup.setResidueNumber(residueNumber); 301 currentGroup.setPDBName(compId); 302 currentGroup.setHetAtomInFile(isHetAtmInFile); 303 } else { 304 if (!residueNumber.equals(currentGroup.getResidueNumber())) { 305 currentChain.addGroup(currentGroup); 306 currentGroup.trimToSize(); 307 currentGroup = createGroup(recordName, oneLetterCode, compId, seqId); 308 currentGroup.setPDBName(compId); 309 currentGroup.setResidueNumber(residueNumber); 310 currentGroup.setHetAtomInFile(isHetAtmInFile); 311 } else { 312 if (altLocation != null && !altLocation.isEmpty() && !".".equals(altLocation)) { 313 altGroup = getAltLocGroup(recordName, altLocation.charAt(0), oneLetterCode, compId, seqId); 314 if (altGroup.getChain() == null) { 315 altGroup.setChain(currentChain); 316 } 317 } 318 } 319 } 320 321 if (params.isParseCAOnly()) { 322 if (!labelAtomId.get(atomIndex).equals(StructureTools.CA_ATOM_NAME) && "C".equals(typeSymbol.get(atomIndex))) { 323 continue; 324 } 325 } 326 327 Atom atom = new AtomImpl(); 328 329 atom.setPDBserial(id.get(atomIndex)); 330 atom.setName(labelAtomId.get(atomIndex)); 331 332 atom.setX(cartnX.get(atomIndex)); 333 atom.setY(cartnY.get(atomIndex)); 334 atom.setZ(cartnZ.get(atomIndex)); 335 336 atom.setOccupancy((float) occupancy.get(atomIndex)); 337 
atom.setTempFactor((float) bIsoOrEquiv.get(atomIndex)); 338 339 if (altLocation == null || altLocation.isEmpty() || ".".equals(altLocation)) { 340 atom.setAltLoc(' '); 341 } else { 342 atom.setAltLoc(altLocation.charAt(0)); 343 } 344 345 String ts = typeSymbol.get(atomIndex); 346 try { 347 Element element = Element.valueOfIgnoreCase(ts); 348 atom.setElement(element); 349 } catch (IllegalArgumentException e) { 350 logger.info("Element {} was not recognised as a BioJava-known element, the element will be " + 351 "represented as the generic element {}", ts, Element.R.name()); 352 atom.setElement(Element.R); 353 } 354 355 if (altGroup != null) { 356 altGroup.addAtom(atom); 357 } else { 358 currentGroup.addAtom(atom); 359 } 360 361 String atomName = atom.getName(); 362 if (!currentGroup.hasAtom(atomName)) { 363 if (currentGroup.getPDBName().equals(atom.getGroup().getPDBName())) { 364 if (!StructureTools.hasNonDeuteratedEquiv(atom, currentGroup)) { 365 currentGroup.addAtom(atom); 366 } 367 } 368 } 369 } 370 } 371 372 private Group getAltLocGroup(String recordName, Character altLoc, Character oneLetterCode, String threeLetterCode, 373 long seqId) { 374 List<Atom> atoms = currentGroup.getAtoms(); 375 if (atoms.size() > 0) { 376 if (atoms.get(0).getAltLoc().equals(altLoc)) { 377 return currentGroup; 378 } 379 } 380 381 List<Group> altLocs = currentGroup.getAltLocs(); 382 for (Group altLocGroup : altLocs) { 383 atoms = altLocGroup.getAtoms(); 384 if (atoms.size() > 0) { 385 for (Atom a1 : atoms) { 386 if (a1.getAltLoc().equals(altLoc)) { 387 return altLocGroup; 388 } 389 } 390 } 391 } 392 393 if (threeLetterCode.equals(currentGroup.getPDBName())) { 394 if (currentGroup.getAtoms().isEmpty()) { 395 return currentGroup; 396 } 397 398 Group altLocGroup = (Group) currentGroup.clone(); 399 altLocGroup.setAtoms(new ArrayList<>()); 400 altLocGroup.getAltLocs().clear(); 401 currentGroup.addAltLoc(altLocGroup); 402 return altLocGroup; 403 } 404 405 Group altLocGroup = 
createGroup(recordName, oneLetterCode, threeLetterCode, seqId); 406 altLocGroup.setPDBName(threeLetterCode); 407 altLocGroup.setResidueNumber(currentGroup.getResidueNumber()); 408 currentGroup.addAltLoc(altLocGroup); 409 return altLocGroup; 410 } 411 412 private Group createGroup(String record, Character oneLetterCode, String threeLetterCode, long seqId) { 413 Group group = ChemCompGroupFactory.getGroupFromChemCompDictionary(threeLetterCode); 414 if (group != null && !group.getChemComp().isEmpty()) { 415 if (group instanceof AminoAcidImpl) { 416 AminoAcidImpl aminoAcid = (AminoAcidImpl) group; 417 aminoAcid.setId(seqId); 418 } else if (group instanceof NucleotideImpl) { 419 NucleotideImpl nucleotide = (NucleotideImpl) group; 420 nucleotide.setId(seqId); 421 } else if (group instanceof HetatomImpl) { 422 HetatomImpl hetatom = (HetatomImpl) group; 423 hetatom.setId(seqId); 424 } 425 return group; 426 } 427 428 if ("ATOM".equals(record)) { 429 if (StructureTools.isNucleotide(threeLetterCode)) { 430 NucleotideImpl nucleotide = new NucleotideImpl(); 431 group = nucleotide; 432 nucleotide.setId(seqId); 433 } else if (oneLetterCode == null || oneLetterCode == StructureTools.UNKNOWN_GROUP_LABEL) { 434 HetatomImpl hetatom = new HetatomImpl(); 435 group = hetatom; 436 hetatom.setId(seqId); 437 } else { 438 AminoAcidImpl aminoAcid = new AminoAcidImpl(); 439 group = aminoAcid; 440 aminoAcid.setAminoType(oneLetterCode); 441 aminoAcid.setId(seqId); 442 } 443 } else { 444 if (StructureTools.isNucleotide(threeLetterCode)) { 445 NucleotideImpl nucleotide = new NucleotideImpl(); 446 group = nucleotide; 447 nucleotide.setId(seqId); 448 } else if (oneLetterCode != null) { 449 AminoAcidImpl aminoAcid = new AminoAcidImpl(); 450 group = aminoAcid; 451 aminoAcid.setAminoType(oneLetterCode); 452 aminoAcid.setId(seqId); 453 } else { 454 HetatomImpl hetatom = new HetatomImpl(); 455 hetatom.setId(seqId); 456 group = hetatom; 457 } 458 } 459 return group; 460 } 461 462 @Override 463 public 
void consumeAtomSites(AtomSites atomSites) { 464 // no atom sites present 465 if (!atomSites.isDefined() || atomSites.getRowCount() == 0) { 466 return; 467 } 468 469 try { 470 parsedScaleMatrix = new Matrix4d( 471 atomSites.getFractTransfMatrix11().get(0), 472 atomSites.getFractTransfMatrix12().get(0), 473 atomSites.getFractTransfMatrix13().get(0), 474 atomSites.getFractTransfVector1().get(0), 475 476 atomSites.getFractTransfMatrix21().get(0), 477 atomSites.getFractTransfMatrix22().get(0), 478 atomSites.getFractTransfMatrix23().get(0), 479 atomSites.getFractTransfVector2().get(0), 480 481 atomSites.getFractTransfMatrix31().get(0), 482 atomSites.getFractTransfMatrix32().get(0), 483 atomSites.getFractTransfMatrix33().get(0), 484 atomSites.getFractTransfVector3().get(0), 485 486 0, 487 0, 488 0, 489 1 490 ); 491 } catch (NumberFormatException e) { 492 logger.warn("Some values in _atom_sites.fract_transf_matrix or _atom_sites.fract_transf_vector could not " + 493 "be parsed as numbers. Can't check whether coordinate frame convention is correct! 
Error: {}", 494 e.getMessage()); 495 structure.getPDBHeader().getCrystallographicInfo().setNonStandardCoordFrameConvention(false); 496 } 497 } 498 499 @Override 500 public void consumeAuditAuthor(AuditAuthor auditAuthor) { 501 for (int rowIndex = 0; rowIndex < auditAuthor.getRowCount(); rowIndex++) { 502 String name = auditAuthor.getName().get(rowIndex); 503 504 StringBuilder last = new StringBuilder(); 505 StringBuilder initials = new StringBuilder(); 506 boolean afterComma = false; 507 for (char c : name.toCharArray()) { 508 if (c == ' ') { 509 continue; 510 } 511 if (c == ',') { 512 afterComma = true; 513 continue; 514 } 515 516 if (afterComma) { 517 initials.append(c); 518 } else { 519 last.append(c); 520 } 521 } 522 523 StringBuilder newaa = new StringBuilder(); 524 newaa.append(initials); 525 newaa.append(last); 526 527 String auth = pdbHeader.getAuthors(); 528 if (auth == null) { 529 pdbHeader.setAuthors(newaa.toString()); 530 } else { 531 auth += "," + newaa.toString(); 532 pdbHeader.setAuthors(auth); 533 } 534 } 535 } 536 537 @Override 538 public void consumeCell(Cell cell) { 539 if (!cell.isDefined() || cell.getRowCount() == 0) { 540 return; 541 } 542 543 try { 544 float a = (float) cell.getLengthA().get(0); 545 float b = (float) cell.getLengthB().get(0); 546 float c = (float) cell.getLengthC().get(0); 547 float alpha = (float) cell.getAngleAlpha().get(0); 548 float beta = (float) cell.getAngleBeta().get(0); 549 float gamma = (float) cell.getAngleGamma().get(0); 550 551 CrystalCell crystalCell = new CrystalCell(); 552 crystalCell.setA(a); 553 crystalCell.setB(b); 554 crystalCell.setC(c); 555 crystalCell.setAlpha(alpha); 556 crystalCell.setBeta(beta); 557 crystalCell.setGamma(gamma); 558 559 if (!crystalCell.isCellReasonable()) { 560 // If the entry describes a structure determined by a technique other than X-ray crystallography, 561 // cell is (sometimes!) 
a = b = c = 1.0, alpha = beta = gamma = 90 degrees 562 // if so we don't add and CrystalCell will be null 563 logger.debug("The crystal cell read from file does not have reasonable dimensions (at least one dimension is below {}), discarding it.", CrystalCell.MIN_VALID_CELL_SIZE); 564 return; 565 } 566 567 structure.getPDBHeader() 568 .getCrystallographicInfo() 569 .setCrystalCell(crystalCell); 570 571 } catch (NumberFormatException e){ 572 structure.getPDBHeader() 573 .getCrystallographicInfo() 574 .setCrystalCell(null); 575 logger.info("could not parse some cell parameters ({}), ignoring _cell", e.getMessage()); 576 } 577 } 578 579 @Override 580 public void consumeChemComp(ChemComp chemComp) { 581 // TODO not impled in ref 582 } 583 584 @Override 585 public void consumeChemCompBond(ChemCompBond chemCompBond) { 586 // TODO not impled in ref 587 } 588 589 @Override 590 public void consumeDatabasePDBRemark(DatabasePDBRemark databasePDBremark) { 591 for (int rowIndex = 0; rowIndex < databasePDBremark.getRowCount(); rowIndex++) { 592 int id = databasePDBremark.getId().get(rowIndex); 593 if (id == 2) { 594 String line = databasePDBremark.getText().get(rowIndex); 595 int i = line.indexOf("ANGSTROM"); 596 597 if (i > 5) { 598 // line contains ANGSTROM info... 
599 String resolution = line.substring(i - 5, i).trim(); 600 // convert string to float 601 try { 602 float res = Float.parseFloat(resolution); 603 pdbHeader.setResolution(res); 604 } catch (NumberFormatException e) { 605 logger.info("could not parse resolution from line and ignoring it {}", line); 606 return; 607 } 608 } 609 } 610 } 611 } 612 613 private Date convert(LocalDate localDate) { 614 return Date.from(localDate.atStartOfDay().atZone(ZoneId.systemDefault()).toInstant()); 615 } 616 617 @Override 618 public void consumeDatabasePDBRev(DatabasePDBRev databasePDBrev) { 619 logger.debug("got a database revision:{}", databasePDBrev); 620 621 Date modDate = null; 622 for (int rowIndex = 0; rowIndex < databasePDBrev.getRowCount(); rowIndex++) { 623 if (databasePDBrev.getNum().get(rowIndex) == 1) { 624 String dateOriginal = databasePDBrev.getDateOriginal().get(rowIndex); 625 pdbHeader.setDepDate(convert(LocalDate.parse(dateOriginal, DATE_FORMAT))); 626 627 String date = databasePDBrev.getDate().get(rowIndex); 628 final Date relDate = convert(LocalDate.parse(date, DATE_FORMAT)); 629 pdbHeader.setRelDate(relDate); 630 modDate = relDate; 631 } else { 632 String dbrev = databasePDBrev.getDate().get(rowIndex); 633 modDate = convert(LocalDate.parse(dbrev, DATE_FORMAT)); 634 } 635 pdbHeader.setModDate(modDate); 636 } 637 } 638 639 @Override 640 public void consumeDatabasePDBRevRecord(DatabasePDBRevRecord databasePDBrevRecord) { 641 List<org.biojava.nbio.structure.DatabasePDBRevRecord> revRecords = pdbHeader.getRevisionRecords(); 642 if (revRecords == null) { 643 revRecords = new ArrayList<>(); 644 pdbHeader.setRevisionRecords(revRecords); 645 } 646 647 for (int i = 0; i < databasePDBrevRecord.getRowCount(); i++) { 648 revRecords.add(new org.biojava.nbio.structure.DatabasePDBRevRecord(databasePDBrevRecord, i)); 649 } 650 } 651 652 @Override 653 public void consumeEm3dReconstruction(Em3dReconstruction em3dReconstruction) { 654 this.em3dReconstruction = em3dReconstruction; 
655 656 for (int rowIndex = 0; rowIndex < em3dReconstruction.getRowCount(); rowIndex++) { //can it have more than 1 value? 657 final FloatColumn resolution = em3dReconstruction.getResolution(); 658 if (ValueKind.PRESENT.equals(resolution.getValueKind(rowIndex))) 659 pdbHeader.setResolution((float) resolution.get(rowIndex)); 660 } 661 //TODO other fields (maybe RFree)? 662 } 663 664 @Override 665 public void consumeEntity(Entity entity) { 666 this.entity = entity; 667 } 668 669 @Override 670 public void consumeEntityPoly(EntityPoly entityPoly) { 671 this.entityPoly = entityPoly; 672 } 673 674 @Override 675 public void consumeEntitySrcGen(EntitySrcGen entitySrcGen) { 676 this.entitySrcGen = entitySrcGen; 677 } 678 679 @Override 680 public void consumeEntitySrcNat(EntitySrcNat entitySrcNat) { 681 this.entitySrcNat = entitySrcNat; 682 } 683 684 @Override 685 public void consumeEntitySrcSyn(PdbxEntitySrcSyn entitySrcSyn) { 686 this.entitySrcSyn = entitySrcSyn; 687 } 688 689 @Override 690 public void consumeEntityPolySeq(EntityPolySeq entityPolySeq) { 691 for (int rowIndex = 0; rowIndex < entityPolySeq.getRowCount(); rowIndex++) { 692 Chain entityChain = getEntityChain(entityPolySeq.getEntityId().get(rowIndex)); 693 694 // first we check through the chemcomp provider, if it fails we do some heuristics to guess the type of group 695 // TODO some of this code is analogous to getNewGroup() and we should try to unify them - JD 2016-03-08 696 697 Group g = ChemCompGroupFactory.getGroupFromChemCompDictionary(entityPolySeq.getMonId().get(rowIndex)); 698 //int seqId = Integer.parseInt(entityPolySeq.getNum()); 699 if (g != null && !g.getChemComp().isEmpty()) { 700 if (g instanceof AminoAcidImpl) { 701 AminoAcidImpl aa = (AminoAcidImpl) g; 702 aa.setRecordType(AminoAcid.SEQRESRECORD); 703 } 704 } else { 705 if (entityPolySeq.getMonId().get(rowIndex).length() == 3 && 706 StructureTools.get1LetterCodeAmino(entityPolySeq.getMonId().get(rowIndex)) != null) { 707 AminoAcidImpl a = new 
AminoAcidImpl(); 708 a.setRecordType(AminoAcid.SEQRESRECORD); 709 Character code1 = StructureTools.get1LetterCodeAmino(entityPolySeq.getMonId().get(rowIndex)); 710 a.setAminoType(code1); 711 g = a; 712 713 } else if (StructureTools.isNucleotide(entityPolySeq.getMonId().get(rowIndex))) { 714 // the group is actually a nucleotide group... 715 g = new NucleotideImpl(); 716 } else { 717 logger.debug("Residue {} {} is not a standard aminoacid or nucleotide, will create a het group for it", entityPolySeq.getNum().get(rowIndex), entityPolySeq.getMonId().get(rowIndex)); 718 g = new HetatomImpl(); 719 } 720 } 721 // at this stage we don't know about author residue numbers (insertion codes) 722 // we abuse now the ResidueNumber field setting the internal residue numbers (label_seq_id, strictly 723 // sequential and follow the seqres sequence 1 to n) 724 // later the actual ResidueNumbers (author residue numbers) have to be corrected in alignSeqRes() 725 g.setResidueNumber(ResidueNumber.fromString(entityPolySeq.getNum().getStringData(rowIndex))); 726 g.setPDBName(entityPolySeq.getMonId().get(rowIndex)); 727 entityChain.addGroup(g); 728 } 729 } 730 731 private Chain getEntityChain(String entityId) { 732 for (Chain chain : entityChains) { 733 if (chain.getId().equals(entityId)) { 734 return chain; 735 } 736 } 737 738 // does not exist yet, so create... 
739 Chain chain = new ChainImpl(); 740 chain.setId(entityId); 741 entityChains.add(chain); 742 743 return chain; 744 } 745 746 @Override 747 public void consumeExptl(Exptl exptl) { 748 for (int rowIndex = 0; rowIndex < exptl.getRowCount(); rowIndex++) { 749 pdbHeader.setExperimentalTechnique(exptl.getMethod().get(rowIndex)); 750 } 751 } 752 753 @Override 754 public void consumePdbxAuditRevisionHistory(PdbxAuditRevisionHistory pdbxAuditRevisionHistory) { 755 Date date = null; 756 for (int rowIndex = 0; rowIndex < pdbxAuditRevisionHistory.getRowCount(); rowIndex++) { 757 // first entry in revision history is the release date 758 if (pdbxAuditRevisionHistory.getOrdinal().get(rowIndex) == 1) { 759 String release = pdbxAuditRevisionHistory.getRevisionDate().get(rowIndex); 760 date = convert(LocalDate.parse(release, DATE_FORMAT)); 761 pdbHeader.setRelDate(date); 762 } else { 763 // all other dates are revision dates; 764 // since this method may be called multiple times, 765 // the last revision date will "stick" 766 String revision = pdbxAuditRevisionHistory.getRevisionDate().get(rowIndex); 767 date = convert(LocalDate.parse(revision, DATE_FORMAT)); 768 } 769 pdbHeader.setModDate(date); 770 } 771 } 772 773 @Override 774 public void consumePdbxChemCompIdentifier(PdbxChemCompIdentifier pdbxChemCompIdentifier) { 775 // TODO not impled in ref 776 } 777 778 @Override 779 public void consumePdbxDatabaseStatus(PdbxDatabaseStatus pdbxDatabaseStatus) { 780 for (int rowIndex = 0; rowIndex < pdbxDatabaseStatus.getRowCount(); rowIndex++) { 781 // the deposition date field is only available in mmCIF 5.0 782 StrColumn recvdInitialDepositionDate = pdbxDatabaseStatus.getRecvdInitialDepositionDate(); 783 if (recvdInitialDepositionDate.isDefined()) { 784 String deposition = recvdInitialDepositionDate.get(rowIndex); 785 pdbHeader.setDepDate(convert(LocalDate.parse(deposition, DATE_FORMAT))); 786 } 787 } 788 } 789 790 @Override 791 public void 
consumePdbxEntityBranchDescriptor(PdbxEntityBranchDescriptor pdbxEntityBranchDescriptor) { 792 // TODO not considered in ref 793 } 794 795 @Override 796 public void consumePdbxMolecule(PdbxMolecule pdbxMolecule) { 797 // TODO not considered in ref 798 } 799 800 @Override 801 public void consumePdbxMoleculeFeatures(PdbxMoleculeFeatures pdbxMoleculeFeatures) { 802 // TODO not considered in ref 803 } 804 805 @Override 806 public void consumePdbxNonpolyScheme(PdbxNonpolyScheme pdbxNonpolyScheme) { 807 // TODO not impled in ref 808 } 809 810 @Override 811 public void consumePdbxReferenceEntityLink(PdbxReferenceEntityLink pdbxReferenceEntityLink) { 812 // TODO not considered in ref 813 } 814 815 @Override 816 public void consumePdbxReferenceEntityList(PdbxReferenceEntityList pdbxReferenceEntityList) { 817 // TODO not considered in ref 818 } 819 820 @Override 821 public void consumePdbxReferenceEntityPolyLink(PdbxReferenceEntityPolyLink pdbxReferenceEntityPolyLink) { 822 // TODO not considered in ref 823 } 824 825 @Override 826 public void consumePdbxStructAssembly(PdbxStructAssembly pdbxStructAssembly) { 827 this.structAssembly = pdbxStructAssembly; 828 } 829 830 @Override 831 public void consumePdbxStructAssemblyGen(PdbxStructAssemblyGen pdbxStructAssemblyGen) { 832 this.structAssemblyGen = pdbxStructAssemblyGen; 833 } 834 835 @Override 836 public void consumePdbxStructModResidue(PdbxStructModResidue pdbxStructModResidue) { 837 // TODO not considered in ref 838 } 839 840 @Override 841 public void consumePdbxStructOperList(PdbxStructOperList pdbxStructOperList) { 842 this.structOpers = pdbxStructOperList; 843 } 844 845 @Override 846 public void consumeRefine(Refine refine) { 847 for (int rowIndex = 0; rowIndex < refine.getRowCount(); rowIndex++) { 848 // RESOLUTION 849 ValueKind valueKind = refine.getLsDResHigh().getValueKind(rowIndex); 850 if (! 
ValueKind.PRESENT.equals(valueKind)) { 851 continue; 852 } 853 // in very rare cases (for instance hybrid methods x-ray + neutron diffraction, e.g. 3ins, 4n9m) 854 // there are 2 resolution values, one for each method 855 // we take the last one found so that behaviour is like in PDB file parsing 856 double lsDResHigh = refine.getLsDResHigh().get(rowIndex); 857 // TODO this could use a check to keep reasonable values - 1.5 may be overwritten by 0.0 858 if (pdbHeader.getResolution() != PDBHeader.DEFAULT_RESOLUTION) { 859 logger.warn("More than 1 resolution value present, will use last one {} and discard previous {}", 860 lsDResHigh, String.format("%4.2f",pdbHeader.getResolution())); 861 } 862 pdbHeader.setResolution((float) lsDResHigh); 863 864 FloatColumn lsRFactorRFree = refine.getLsRFactorRFree(); 865 // RFREE 866 if (pdbHeader.getRfree() != PDBHeader.DEFAULT_RFREE) { 867 logger.warn("More than 1 Rfree value present, will use last one {} and discard previous {}", 868 lsRFactorRFree, String.format("%4.2f",pdbHeader.getRfree())); 869 } 870 if (lsRFactorRFree.isDefined() && lsRFactorRFree.getValueKind(rowIndex) == ValueKind.PRESENT) { 871 pdbHeader.setRfree((float) lsRFactorRFree.get(rowIndex)); 872 } else { 873 // some entries like 2ifo haven't got this field at all 874 logger.info("_refine.ls_R_factor_R_free not present, not parsing Rfree value"); 875 } 876 877 // RWORK 878 FloatColumn lsRFactorRWork = refine.getLsRFactorRWork(); 879 if(pdbHeader.getRwork() != PDBHeader.DEFAULT_RFREE) { 880 logger.warn("More than 1 R work value present, will use last one {} and discard previous {} ", 881 lsRFactorRWork, String.format("%4.2f",pdbHeader.getRwork())); 882 } 883 if (lsRFactorRWork.isDefined() && lsRFactorRWork.getValueKind(rowIndex) == ValueKind.PRESENT) { 884 pdbHeader.setRwork((float) lsRFactorRWork.get(rowIndex)); 885 } else { 886 logger.info("_refine.ls_R_factor_R_work not present, not parsing R-work value"); 887 } 888 } 889 } 890 891 @Override 892 public void 
consumeStruct(Struct struct) { 893 if (struct.isDefined() && struct.getTitle().isDefined()) { 894 pdbHeader.setTitle(struct.getTitle().get(0)); 895 } 896 897 if (struct.isDefined() && struct.getEntryId().isDefined()) { 898 PdbId pdbId; 899 String pdbCode = struct.getEntryId().get(0); 900 if(pdbCode.isBlank()){ 901 pdbId = null; 902 } else { 903 try { 904 pdbId = new PdbId(pdbCode); 905 } catch (IllegalArgumentException e) { 906 logger.warn("Malformed PDB ID {}. setting PdbId to null", pdbCode); 907 pdbId = null; 908 } 909 } 910 pdbHeader.setPdbId(pdbId); 911 structure.setPdbId(pdbId); 912 } 913 } 914 915 @Override 916 public void consumeStructAsym(StructAsym structAsym) { 917 this.structAsym = structAsym; 918 } 919 920 @Override 921 public void consumeStructConf(StructConf structConf) { 922 // TODO not considered in ref 923 } 924 925 @Override 926 public void consumeStructConn(StructConn structConn) { 927 this.structConn = structConn; 928 } 929 930 @Override 931 public void consumeStructConnType(StructConnType structConnType) { 932 // TODO not considered in ref 933 } 934 935 @Override 936 public void consumeStructKeywords(StructKeywords structKeywords) { 937 ArrayList<String> keywordsList = new ArrayList<>(); 938 939 StrColumn text = structKeywords.getText(); 940 if (text.isDefined()) { 941 String keywords = text.get(0); 942 String[] strings = keywords.split(" *, *"); 943 for (String string : strings) { 944 keywordsList.add(string.trim()); 945 } 946 } 947 structure.getPDBHeader().setKeywords(keywordsList); 948 949 StrColumn pdbxKeywords = structKeywords.getPdbxKeywords(); 950 if (pdbxKeywords.isDefined()) { 951 String keywords = pdbxKeywords.get(0); 952 pdbHeader.setClassification(keywords); 953 //This field should be left empty. 
TODO The next line should be removed later 954 pdbHeader.setDescription(keywords); 955 } 956 } 957 958 @Override 959 public void consumeStructNcsOper(StructNcsOper structNcsOper) { 960 this.structNcsOper = structNcsOper; 961 } 962 963 @Override 964 public void consumeStructRef(StructRef structRef) { 965 this.structRef = structRef; 966 } 967 968 @Override 969 public void consumeStructRefSeq(StructRefSeq structRefSeq) { 970 for (int rowIndex = 0; rowIndex < structRefSeq.getRowCount(); rowIndex++) { 971 String refId = structRefSeq.getRefId().get(rowIndex); 972 973 DBRef dbRef = new DBRef(); 974 975 dbRef.setIdCode(structRefSeq.getPdbxPDBIdCode().isDefined()? structRefSeq.getPdbxPDBIdCode().get(rowIndex):null); 976 dbRef.setDbAccession(structRefSeq.getPdbxDbAccession().isDefined()? structRefSeq.getPdbxDbAccession().get(rowIndex):null); 977 dbRef.setDbIdCode(structRefSeq.getPdbxDbAccession().isDefined()? structRefSeq.getPdbxDbAccession().get(rowIndex):null); 978 dbRef.setChainName(structRefSeq.getPdbxStrandId().get(rowIndex)); 979 980 OptionalInt structRefRowIndex = IntStream.range(0, structRef.getRowCount()) 981 .filter(i -> structRef.getId().get(i).equals(refId)) 982 .findFirst(); 983 984 if (structRefRowIndex.isPresent()) { 985 dbRef.setDatabase(structRef.getDbName().get(structRefRowIndex.getAsInt())); 986 dbRef.setDbIdCode(structRef.getDbCode().get(structRefRowIndex.getAsInt())); 987 } else { 988 logger.info("could not find StructRef `{} for StructRefSeq {}", refId, rowIndex); 989 } 990 991 int seqBegin; 992 int seqEnd; 993 994 try { 995 seqBegin = Integer.parseInt(structRefSeq.getPdbxAuthSeqAlignBeg().get(rowIndex)); 996 seqEnd = Integer.parseInt(structRefSeq.getPdbxAuthSeqAlignEnd().get(rowIndex)); 997 } catch (NumberFormatException e) { 998 // this happens in a few entries, annotation error? e.g. 6eoj 999 logger.warn("Couldn't parse pdbx_auth_seq_align_beg/end in _struct_ref_seq. Will not store dbref " + 1000 "alignment info for accession {}. 
Error: {}", dbRef.getDbAccession(), e.getMessage()); 1001 return; 1002 } 1003 1004 char beginInsCode = ' '; 1005 String pdbxSeqAlignBegInsCode = structRefSeq.getPdbxSeqAlignBegInsCode().get(rowIndex); 1006 if (pdbxSeqAlignBegInsCode.length() > 0) { 1007 beginInsCode = pdbxSeqAlignBegInsCode.charAt(0); 1008 } 1009 1010 char endInsCode = ' '; 1011 String pdbxSeqAlignEndInsCode = structRefSeq.getPdbxSeqAlignEndInsCode().get(rowIndex); 1012 if (pdbxSeqAlignEndInsCode.length() > 0) { 1013 endInsCode = pdbxSeqAlignEndInsCode.charAt(0); 1014 } 1015 1016 if (beginInsCode == '?') { 1017 beginInsCode = ' '; 1018 } 1019 if (endInsCode == '?') { 1020 endInsCode = ' '; 1021 } 1022 1023 dbRef.setSeqBegin(seqBegin); 1024 dbRef.setInsertBegin(beginInsCode); 1025 dbRef.setSeqEnd(seqEnd); 1026 dbRef.setInsertEnd(endInsCode); 1027 1028 int dbSeqBegin = structRefSeq.getDbAlignBeg().get(rowIndex); 1029 int dbSeqEnd = structRefSeq.getDbAlignEnd().get(rowIndex); 1030 1031 char dbBeginInsCode = ' '; 1032 StrColumn pdbxDbAlignBegInsCodeCol = structRefSeq.getPdbxDbAlignBegInsCode(); 1033 if (pdbxDbAlignBegInsCodeCol.isDefined()) { 1034 String pdbxDbAlignBegInsCode = pdbxDbAlignBegInsCodeCol.get(rowIndex); 1035 if (pdbxDbAlignBegInsCode.length() > 0) { 1036 dbBeginInsCode = pdbxDbAlignBegInsCode.charAt(0); 1037 } 1038 } 1039 1040 char dbEndInsCode = ' '; 1041 StrColumn pdbxDbAlignEndInsCodeCol = structRefSeq.getPdbxDbAlignEndInsCode(); 1042 if (pdbxDbAlignEndInsCodeCol.isDefined()) { 1043 String pdbxDbAlignEndInsCode = pdbxDbAlignEndInsCodeCol.get(rowIndex); 1044 if (pdbxDbAlignEndInsCode.length() > 0) { 1045 dbEndInsCode = pdbxDbAlignEndInsCode.charAt(0); 1046 } 1047 } 1048 1049 if (dbBeginInsCode == '?') { 1050 dbBeginInsCode = ' '; 1051 } 1052 if (dbEndInsCode == '?') { 1053 dbEndInsCode = ' '; 1054 } 1055 1056 dbRef.setDbSeqBegin(dbSeqBegin); 1057 dbRef.setIdbnsBegin(dbBeginInsCode); 1058 dbRef.setDbSeqEnd(dbSeqEnd); 1059 dbRef.setIdbnsEnd(dbEndInsCode); 1060 1061 List<DBRef> dbrefs = 
structure.getDBRefs(); 1062 if (dbrefs == null) { 1063 dbrefs = new ArrayList<>(); 1064 } 1065 dbrefs.add(dbRef); 1066 1067 logger.debug(dbRef.toPDB()); 1068 1069 structure.setDBRefs(dbrefs); 1070 } 1071 } 1072 1073 @Override 1074 public void consumeStructRefSeqDif(StructRefSeqDif structRefSeqDif) { 1075 this.structRefSeqDif = structRefSeqDif; 1076 } 1077 1078 @Override 1079 public void consumeStructSheetRange(StructSheetRange structSheetRange) { 1080 // TODO not considered in ref 1081 } 1082 1083 @Override 1084 public void consumeStructSite(StructSite structSite) { 1085 if (params.isHeaderOnly()) { 1086 return; 1087 } 1088 1089 List<Site> sites = structure.getSites(); 1090 if (sites == null) { 1091 sites = new ArrayList<>(); 1092 } 1093 1094 for (int rowIndex = 0; rowIndex < structSite.getRowCount(); rowIndex++) { 1095 Site site = null; 1096 for (Site asite : sites) { 1097 if (asite.getSiteID().equals(structSite.getId().get(rowIndex))) { 1098 site = asite; // prevent duplicate siteIds 1099 } 1100 } 1101 1102 boolean addSite = false; 1103 if (site == null) { 1104 site = new Site(); 1105 addSite = true; 1106 } 1107 1108 site.setSiteID(structSite.getId().get(rowIndex)); 1109 site.setDescription(structSite.getDetails().get(rowIndex)); 1110 site.setEvCode(structSite.getPdbxEvidenceCode().get(rowIndex)); 1111 1112 if (addSite) { 1113 sites.add(site); 1114 } 1115 } 1116 1117 structure.setSites(sites); 1118 } 1119 1120 @Override 1121 public void consumeStructSiteGen(StructSiteGen structSiteGen) { 1122 this.structSiteGen = structSiteGen; 1123 } 1124 1125 @Override 1126 public void consumeSymmetry(Symmetry symmetry) { 1127 for (int rowIndex = 0; rowIndex < symmetry.getRowCount(); rowIndex++) { 1128 String spaceGroupString = symmetry.getSpaceGroupNameH_M().get(rowIndex); 1129 SpaceGroup spaceGroup = SymoplibParser.getSpaceGroup(spaceGroupString); 1130 if (spaceGroup == null) { 1131 logger.warn("Space group '{}' not recognised as a standard space group", spaceGroupString); 
structure.getPDBHeader()
                        .getCrystallographicInfo()
                        .setNonStandardSg(true);
            } else {
                // recognised space group: store it and clear the non-standard flag
                structure.getPDBHeader()
                        .getCrystallographicInfo()
                        .setSpaceGroup(spaceGroup);
                structure.getPDBHeader()
                        .getCrystallographicInfo()
                        .setNonStandardSg(false);
            }
        }
    }

    /**
     * Finalises the structure once all categories have been consumed.
     * <p>
     * In order: closes the last chain/model, initialises the id maps, builds SEQRES chains and
     * entities from {@code _struct_asym}, links entities to chains, adds all models to the
     * structure, aligns (or just stores) SEQRES groups, cleans up alt locs, adds bonds, charges and
     * sites (unless header-only), builds the bio assembly map (if requested), sets NCS operators and
     * crystallographic metadata, and finally attaches sequence mismatches to their chains.
     * The steps rely on state produced by the preceding ones, so the order matters.
     */
    @Override
    public void finish() {
        if (currentChain != null) {
            currentChain.addGroup(currentGroup);

            // only add the chain if no chain with the same id is already in the current model
            Optional<Chain> testChain = currentModel.stream()
                    .filter(chain -> chain.getId().equals(currentChain.getId()))
                    .findFirst();

            if (!testChain.isPresent()) {
                currentModel.add(currentChain);
            }
        } else if (!params.isHeaderOnly()) {
            logger.warn("current chain is null at end of document.");
        }

        allModels.add(currentModel);

        initMaps();

        // one SEQRES chain (and, if needed, one entity) per _struct_asym row
        for (int rowIndex = 0; rowIndex < structAsym.getRowCount(); rowIndex++) {
            String id = structAsym.getId().get(rowIndex);
            String entityId = structAsym.getEntityId().get(rowIndex);
            logger.debug("Entity {} matches asym_id: {}", entityId, id);

            Chain chain = getEntityChain(entityId);
            Chain seqRes = (Chain) chain.clone();
            // to solve issue #160 (e.g. 3u7t)
            seqRes = removeSeqResHeterogeneity(seqRes);
            seqRes.setId(id);
            // falls back to the asym id when no author id is mapped
            seqRes.setName(asymId2authorId.getOrDefault(id, id));

            // only polymers (or entities of unrecognised type) contribute SEQRES chains
            EntityType type = EntityType.entityTypeFromString(getEntityType(entityId));
            if (type == null || type == EntityType.POLYMER) {
                seqResChains.add(seqRes);
            }

            logger.debug(" seqres: {} {}<", id, seqRes);
            addEntity(rowIndex, entityId, getEntityDescription(entityId), getEntityType(entityId));
        }

        if (!structAsym.isDefined() || structAsym.getRowCount() == 0) {
            logger.warn("No _struct_asym category in file, no SEQRES groups will be added.");
        }

        // entities
        // In addEntities above we created the entities if they were present in the file
        // Now we need to make sure that they are linked to chains and also that if they are not present in the file we
        // need to add them now
        linkEntities();

        // now that we know the entities, we can add all chains to structure so that they are stored
        // properly as polymer/nonpolymer/water chains inside structure
        allModels.forEach(structure::addModel);

        // Only align if requested (default) and not when headerOnly mode with no Atoms.
        // Otherwise, we store the empty SeqRes Groups unchanged in the right chains.
        if (params.isAlignSeqRes() && !params.isHeaderOnly()){
            logger.debug("Parsing mode align_seqres, will parse SEQRES and align to ATOM sequence");
            alignSeqRes();
        } else {
            logger.debug("Parsing mode unalign_seqres, will parse SEQRES but not align it to ATOM sequence");
            SeqRes2AtomAligner.storeUnAlignedSeqRes(structure, seqResChains, params.isHeaderOnly());
        }

        // Now make sure all altlocgroups have all the atoms in all the groups
        StructureTools.cleanUpAltLocs(structure);

        // NOTE bonds and charges can only be done at this point that the chain id mapping is properly sorted out
        if (!params.isHeaderOnly()) {
            if (params.shouldCreateAtomBonds()) {
                addBonds();
            }

            if (params.shouldCreateAtomCharges()) {
                addCharges();
            }
        }

        if (!params.isHeaderOnly()) {
            addSites();
        }

        // set the oligomeric state info in the header...
        if (params.isParseBioAssembly()) {
            // the more detailed mapping of chains to rotation operations happens in StructureIO...

            // LinkedHashMap: assemblies are kept in the order they appear in the file
            Map<Integer, BioAssemblyInfo> bioAssemblies = new LinkedHashMap<>();
            for (int i = 0; i < structAssembly.getRowCount(); i++) {
                String assemblyId = structAssembly.getId().get(i);
                // NOTE(review): this list is filled but not read again in this method
                List<Integer> structAssemblyGenIndices = new ArrayList<>();
                for (int j = 0; j < structAssemblyGen.getRowCount(); j++) {
                    if (structAssemblyGen.getAssemblyId().get(j).equals(assemblyId)) {
                        structAssemblyGenIndices.add(j);
                    }
                }
                BiologicalAssemblyBuilder builder = new BiologicalAssemblyBuilder();
                // these are the transformations that need to be applied to our model
                List<BiologicalAssemblyTransformation> transformations = builder.getBioUnitTransformationList(structAssembly,
                        i, structAssemblyGen, structOpers);

                // -1 marks "assembly id is not numeric"
                int bioAssemblyId = -1;
                try {
                    bioAssemblyId = Integer.parseInt(assemblyId);
                } catch (NumberFormatException e) {
                    logger.info("Could not parse a numerical bio assembly id from '{}'", assemblyId);
                }

                // if bioassembly id is not numerical we throw it away
                // this happens usually for viral capsid entries, like 1ei7
                // see issue #230 in github
                if (bioAssemblyId != -1) {
                    int mmSize = 0;
                    // note that the transforms contain asym ids of both polymers and non-polymers
                    // For the mmsize, we are only interested in the polymers
                    for (BiologicalAssemblyTransformation transf : transformations) {
                        Chain c = structure.getChain(transf.getChainId());
                        if (c == null) {
                            logger.info("Could not find asym id {} specified in struct_assembly_gen", transf.getChainId());
                            continue;
                        }
                        if (c.getEntityType() == EntityType.POLYMER &&
                                // for entries like 4kro, sugars are annotated as polymers but we
                                // don't want them in the macromolecularSize count
                                !c.getEntityInfo().getDescription().contains("SUGAR")) {
                            mmSize++;
                        }
                    }

                    BioAssemblyInfo bioAssembly = new BioAssemblyInfo();
                    bioAssembly.setId(bioAssemblyId);
                    bioAssembly.setMacromolecularSize(mmSize);
                    bioAssembly.setTransforms(transformations);
                    bioAssemblies.put(bioAssemblyId, bioAssembly);
                }

            }
            structure.getPDBHeader()
                    .setBioAssemblies(bioAssemblies);
        }

        setStructNcsOps();
        setCrystallographicInfoMetadata();

        // group the _struct_ref_seq_dif rows by author chain (strand) id
        Map<String, List<SeqMisMatch>> misMatchMap = new HashMap<>();
        for (int rowIndex = 0; rowIndex < structRefSeqDif.getRowCount(); rowIndex++) {
            SeqMisMatch seqMisMatch = new SeqMisMatchImpl();
            seqMisMatch.setDetails(structRefSeqDif.getDetails().get(rowIndex));

            String insCode = structRefSeqDif.getPdbxPdbInsCode().get(rowIndex);
            // "?" is stored as "no insertion code"
            if ("?".equals(insCode)) {
                insCode = null;
            }
            seqMisMatch.setInsCode(insCode);
            seqMisMatch.setOrigGroup(structRefSeqDif.getDbMonId().get(rowIndex));
            seqMisMatch.setPdbGroup(structRefSeqDif.getMonId().get(rowIndex));
            seqMisMatch.setPdbResNum(structRefSeqDif.getPdbxAuthSeqNum().get(rowIndex));
            seqMisMatch.setUniProtId(structRefSeqDif.getPdbxSeqDbAccessionCode().get(rowIndex));
            seqMisMatch.setSeqNum(structRefSeqDif.getSeqNum().get(rowIndex));

            String strandId = structRefSeqDif.getPdbxPdbStrandId().get(rowIndex);
            List<SeqMisMatch> seqMisMatches = misMatchMap.computeIfAbsent(strandId, k -> new ArrayList<>());
            seqMisMatches.add(seqMisMatch);
        }

        // attach each group of mismatches to the polymer chain with that author id
        for (String chainId : misMatchMap.keySet()){
            Chain chain = structure.getPolyChainByPDB(chainId);
            if (chain == null) {
                logger.warn("Could not set mismatches for chain with author id {}", chainId);
                continue;
            }

            chain.setSeqMisMatches(misMatchMap.get(chainId));
        }
    }

    /**
     * Returns the {@code _entity.type} value of the first {@code _entity} row with the given id.
     *
     * @param entityId the entity id to look up
     * @return the type string of that entity
     * @throws NoSuchElementException if no row has this id
     */
    private String getEntityType(String entityId) {
        return IntStream.range(0, entity.getRowCount())
                .filter(i -> entity.getId().get(i).equals(entityId))
                .mapToObj(i -> entity.getType().get(i))
                .findFirst()
                .orElseThrow(() -> new NoSuchElementException("could not find entity with id " + entityId));
    }

    /**
     * Returns the {@code _entity.pdbx_description} value of the first {@code _entity} row with the
     * given id, or an empty string if that column is not defined.
     *
     * @param entityId the entity id to look up
     * @return the description of that entity, possibly empty
     * @throws NoSuchElementException if no row has this id
     */
    private String getEntityDescription(String entityId) {
        return IntStream.range(0, entity.getRowCount())
                .filter(i -> entity.getId().get(i).equals(entityId))
                .mapToObj(i -> entity.getPdbxDescription().isDefined()? entity.getPdbxDescription().get(i):"")
                .findFirst()
                .orElseThrow(() -> new NoSuchElementException("could not find entity with id " + entityId));
    }

    /**
     * Registers an {@link EntityInfo} for the given entity id on the structure, unless one with the
     * same numeric id already exists or the id has no row in {@code _entity}.
     *
     * @param asymRowIndex row in {@code _struct_asym} that referenced this entity
     * @param entityId entity id as found in the file; a non-numeric id is mapped to 0
     * @param pdbxDescription description to store on the new entity
     * @param type entity type string, converted with {@link EntityType#entityTypeFromString}
     */
    private void addEntity(int asymRowIndex, String entityId, String pdbxDescription, String type) {
        int eId = 0;
        try {
            eId = Integer.parseInt(entityId);
        } catch (NumberFormatException e) {
            logger.warn("Could not parse mol_id from string {}. Will use 0 for creating Entity", entityId);
        }

        // -1 when the id has no row in _entity
        int entityRowIndex = IntStream.range(0, entity.getRowCount())
                .filter(i -> entity.getId().get(i).equals(entityId))
                .findFirst()
                .orElse(-1);

        EntityInfo entityInfo = structure.getEntityById(eId);

        if (entityInfo == null) {
            entityInfo = new EntityInfo();
            entityInfo.setMolId(eId);
            // we only add the compound if a polymeric one (to match what the PDB parser does)
            if (entityRowIndex != -1) {
                entityInfo.setDescription(pdbxDescription);

                EntityType eType = EntityType.entityTypeFromString(type);
                if (eType != null) {
                    entityInfo.setType(eType);
                } else {
                    logger.warn("Type '{}' is not recognised as a valid entity type for entity {}", type, eId);
                }
                addAncilliaryEntityData(asymRowIndex, entityInfo);
                structure.addEntityInfo(entityInfo);
                logger.debug("Adding Entity with entity id {} from _entity, with name: {}", eId,
                        entityInfo.getDescription());
            }
        }
    }

    /**
     * Copies source-organism data onto the entity from every row of {@code _entity_src_gen},
     * {@code _entity_src_nat} and {@code _pdbx_entity_src_syn} whose entity id equals the entity id
     * of the given {@code _struct_asym} row. Later rows/categories overwrite earlier values.
     *
     * @param asymRowIndex row in {@code _struct_asym} identifying the entity
     * @param entityInfo the entity to populate
     */
    private void addAncilliaryEntityData(int asymRowIndex, EntityInfo entityInfo) {
        // Loop through each of the entity types and add the corresponding data
        // We're assuming if data is duplicated between sources it is consistent
        // This is a potentially huge assumption...

        for (int rowIndex = 0; rowIndex < entitySrcGen.getRowCount(); rowIndex++) {
            if (!entitySrcGen.getEntityId().get(rowIndex).equals(structAsym.getEntityId().get(asymRowIndex))) {
                continue;
            }

            addInformationFromEntitySrcGen(rowIndex, entityInfo);
        }

        for (int rowIndex = 0; rowIndex < entitySrcNat.getRowCount(); rowIndex++) {
            if (!entitySrcNat.getEntityId().get(rowIndex).equals(structAsym.getEntityId().get(asymRowIndex))) {
                continue;
            }

            addInformationFromEntitySrcNat(rowIndex, entityInfo);
        }

        for (int rowIndex = 0; rowIndex < entitySrcSyn.getRowCount(); rowIndex++) {
            if (!entitySrcSyn.getEntityId().get(rowIndex).equals(structAsym.getEntityId().get(asymRowIndex))) {
                continue;
            }

            addInformationFromEntitySrcSyn(rowIndex, entityInfo);
        }
    }

    /**
     * Sets organism common name, scientific name and taxonomy id from one {@code entitySrcSyn} row;
     * a value is {@code null} when its column is not defined.
     */
    private void addInformationFromEntitySrcSyn(int rowIndex, EntityInfo entityInfo) {
        entityInfo.setOrganismCommon(getCifFieldNullAware(entitySrcSyn.getOrganismCommonName(), rowIndex, null));
        entityInfo.setOrganismScientific(getCifFieldNullAware(entitySrcSyn.getOrganismScientific(), rowIndex, null));
        entityInfo.setOrganismTaxId(getCifFieldNullAware(entitySrcSyn.getNcbiTaxonomyId(), rowIndex, null));
    }

    /**
     * Sets ATCC, cell and organism fields from one {@code entitySrcNat} row; a value is
     * {@code null} when its column is not defined.
     */
    private void addInformationFromEntitySrcNat(int rowIndex, EntityInfo entityInfo) {
        entityInfo.setAtcc(getCifFieldNullAware(entitySrcNat.getPdbxAtcc(), rowIndex, null));
        entityInfo.setCell(getCifFieldNullAware(entitySrcNat.getPdbxCell(), rowIndex, null));
        entityInfo.setOrganismCommon(getCifFieldNullAware(entitySrcNat.getCommonName(), rowIndex, null));
        entityInfo.setOrganismScientific(getCifFieldNullAware(entitySrcNat.getPdbxOrganismScientific(), rowIndex, null));
        entityInfo.setOrganismTaxId(getCifFieldNullAware(entitySrcNat.getPdbxNcbiTaxonomyId(), rowIndex, null));
    }

    private void
addInformationFromEntitySrcGen(int rowIndex, EntityInfo entityInfo) { 1419 entityInfo.setAtcc(getCifFieldNullAware(entitySrcGen.getPdbxGeneSrcAtcc(), rowIndex, null)); 1420 entityInfo.setCell(getCifFieldNullAware(entitySrcGen.getPdbxGeneSrcCell(), rowIndex, null)); 1421 entityInfo.setOrganismCommon(getCifFieldNullAware(entitySrcGen.getGeneSrcCommonName(), rowIndex, null)); 1422 entityInfo.setOrganismScientific(getCifFieldNullAware(entitySrcGen.getPdbxGeneSrcScientificName(), rowIndex, null)); 1423 entityInfo.setOrganismTaxId(getCifFieldNullAware(entitySrcGen.getPdbxGeneSrcNcbiTaxonomyId(), rowIndex, null)); 1424 entityInfo.setExpressionSystemTaxId(getCifFieldNullAware(entitySrcGen.getPdbxHostOrgNcbiTaxonomyId(), rowIndex, null)); 1425 entityInfo.setExpressionSystem(getCifFieldNullAware(entitySrcGen.getPdbxHostOrgScientificName(), rowIndex, null)); 1426 } 1427 1428 private String getCifFieldNullAware(StrColumn column, int rowIndex, String defaultValue) { 1429 if (column.isDefined()) 1430 return column.get(rowIndex); 1431 else 1432 return defaultValue; 1433 } 1434 1435 private void setStructNcsOps() { 1436 List<Matrix4d> ncsOperators = new ArrayList<>(); 1437 1438 for (int rowIndex = 0; rowIndex < structNcsOper.getRowCount(); rowIndex++) { 1439 if (!"generate".equals(structNcsOper.getCode().get(rowIndex))) { 1440 continue; 1441 } 1442 1443 try { 1444 Matrix4d operator = new Matrix4d(); 1445 1446 operator.setElement(0, 0, structNcsOper.getMatrix11().get(rowIndex)); 1447 operator.setElement(0, 1, structNcsOper.getMatrix12().get(rowIndex)); 1448 operator.setElement(0, 2, structNcsOper.getMatrix13().get(rowIndex)); 1449 operator.setElement(0, 3, structNcsOper.getVector1().get(rowIndex)); 1450 1451 operator.setElement(1, 0, structNcsOper.getMatrix21().get(rowIndex)); 1452 operator.setElement(1, 1, structNcsOper.getMatrix22().get(rowIndex)); 1453 operator.setElement(1, 2, structNcsOper.getMatrix23().get(rowIndex)); 1454 operator.setElement(1, 3, 
structNcsOper.getVector2().get(rowIndex)); 1455 1456 operator.setElement(2, 0, structNcsOper.getMatrix31().get(rowIndex)); 1457 operator.setElement(2, 1, structNcsOper.getMatrix32().get(rowIndex)); 1458 operator.setElement(2, 2, structNcsOper.getMatrix33().get(rowIndex)); 1459 operator.setElement(2, 3, structNcsOper.getVector3().get(rowIndex)); 1460 1461 operator.setElement(3, 0, 0); 1462 operator.setElement(3, 1, 0); 1463 operator.setElement(3, 2, 0); 1464 operator.setElement(3, 3, 1); 1465 1466 ncsOperators.add(operator); 1467 } catch (NumberFormatException e) { 1468 logger.warn("Error parsing doubles in NCS operator list, skipping operator {}", rowIndex + 1); 1469 } 1470 } 1471 1472 if (ncsOperators.size() > 0) { 1473 structure.getCrystallographicInfo() 1474 .setNcsOperators(ncsOperators.toArray(new Matrix4d[0])); 1475 } 1476 } 1477 1478 private void setCrystallographicInfoMetadata() { 1479 if (parsedScaleMatrix != null) { 1480 PDBCrystallographicInfo crystalInfo = structure.getCrystallographicInfo(); 1481 boolean nonStd = false; 1482 if (crystalInfo.getCrystalCell() != null && !crystalInfo.getCrystalCell().checkScaleMatrix(parsedScaleMatrix)) { 1483 nonStd = true; 1484 } 1485 1486 crystalInfo.setNonStandardCoordFrameConvention(nonStd); 1487 } 1488 } 1489 1490 private void addSites() { 1491 List<Site> sites = structure.getSites(); 1492 if (sites == null) sites = new ArrayList<>(); 1493 1494 for (int rowIndex = 0; rowIndex < structSiteGen.getRowCount(); rowIndex++) { 1495 // For each StructSiteGen, find the residues involved, if they exist then 1496 String site_id = structSiteGen.getSiteId().get(rowIndex); // multiple could be in same site. 1497 if (site_id == null) { 1498 site_id = ""; 1499 } 1500 String comp_id = structSiteGen.getLabelCompId().get(rowIndex); // PDBName 1501 1502 // Assumption: the author chain ID and residue number for the site is consistent with the original 1503 // author chain id and residue numbers. 
1504 1505 String asymId = structSiteGen.getLabelAsymId().get(rowIndex); // chain name 1506 String authId = structSiteGen.getAuthAsymId().get(rowIndex); // chain Id 1507 String auth_seq_id = structSiteGen.getAuthSeqId().get(rowIndex); // Res num 1508 1509 String insCode = structSiteGen.getPdbxAuthInsCode().get(rowIndex); 1510 if ("?".equals(insCode)) { 1511 insCode = null; 1512 } 1513 1514 // Look for asymID = chainID and seqID = seq_ID. Check that comp_id matches the resname. 1515 Group g = null; 1516 try { 1517 Chain chain = structure.getChain(asymId); 1518 1519 if (null != chain) { 1520 try { 1521 Character insChar = null; 1522 if (null != insCode && insCode.length() > 0) { 1523 insChar = insCode.charAt(0); 1524 } 1525 g = chain.getGroupByPDB(new ResidueNumber(null, Integer.parseInt(auth_seq_id), insChar)); 1526 } catch (NumberFormatException e) { 1527 logger.warn("Could not lookup residue : {}{}", authId, auth_seq_id); 1528 } 1529 } 1530 } catch (StructureException e) { 1531 logger.warn("Problem finding residue in site entry {} - {}", 1532 structSiteGen.getSiteId().get(rowIndex), e.getMessage()); 1533 } 1534 1535 if (g != null) { 1536 // 2. find the site_id, if not existing, create anew. 1537 Site site = null; 1538 for (Site asite : sites) { 1539 if (site_id.equals(asite.getSiteID())) { 1540 site = asite; 1541 } 1542 } 1543 1544 boolean addSite = false; 1545 1546 // 3. add this residue to the site. 
1547 if (site == null) { 1548 addSite = true; 1549 site = new Site(); 1550 site.setSiteID(site_id); 1551 } 1552 1553 List<Group> groups = site.getGroups(); 1554 if (groups == null) { 1555 groups = new ArrayList<>(); 1556 } 1557 1558 // Check the self-consistency of the residue reference from auth_seq_id and chain_id 1559 if (!comp_id.equals(g.getPDBName())) { 1560 logger.warn("comp_id doesn't match the residue at {} {} - skipping", authId, auth_seq_id); 1561 } else { 1562 groups.add(g); 1563 site.setGroups(groups); 1564 } 1565 if (addSite) { 1566 sites.add(site); 1567 } 1568 } 1569 } 1570 structure.setSites(sites); 1571 } 1572 1573 private void addCharges() { 1574 ChargeAdder.addCharges(structure); 1575 } 1576 1577 /** 1578 * The method will return a new reference to a Chain with any consecutive groups 1579 * having same residue numbers removed. 1580 * This is necessary to solve the microheterogeneity issue in entries like 3u7t (see github issue #160) 1581 */ 1582 private static Chain removeSeqResHeterogeneity(Chain c) { 1583 Chain trimmedChain = new ChainImpl(); 1584 ResidueNumber lastResNum = null; 1585 1586 for (Group g : c.getAtomGroups()) { 1587 // note we have to deep copy this, otherwise they stay linked and would get altered in addGroup(g) 1588 ResidueNumber currentResNum = new ResidueNumber( 1589 g.getResidueNumber().getChainName(), 1590 g.getResidueNumber().getSeqNum(), 1591 g.getResidueNumber().getInsCode()); 1592 1593 if (lastResNum == null || !lastResNum.equals(currentResNum)) { 1594 trimmedChain.addGroup(g); 1595 } else { 1596 logger.debug("Removing seqres group because it seems to be repeated in entity_poly_seq, most likely has hetero='y': {}", g); 1597 } 1598 lastResNum = currentResNum; 1599 1600 } 1601 return trimmedChain; 1602 } 1603 1604 private void addBonds() { 1605 BondMaker maker = new BondMaker(structure, params); 1606 maker.makeBonds(); 1607 maker.formBondsFromStructConn(structConn); 1608 } 1609 1610 private void alignSeqRes() { 1611 
logger.debug("Parsing mode align_seqres, will align to ATOM to SEQRES sequence"); 1612 1613 // fix SEQRES residue numbering for all models 1614 1615 for (int model = 0; model < structure.nrModels(); model++) { 1616 List<Chain> atomList = structure.getPolyChains(model); 1617 1618 if (seqResChains.isEmpty()) { 1619 // in files without _entity, seqResChains object is empty: we replace by atomChains resulting below in a trivial alignment and a copy of atom groups to seqres groups 1620 seqResChains = atomList; 1621 } 1622 1623 for (Chain seqResChain : seqResChains){ 1624 1625 // this extracts the matching atom chain from atomList 1626 Chain atomChain = SeqRes2AtomAligner.getMatchingAtomRes(seqResChain, atomList, true); 1627 1628 if (atomChain == null) { 1629 // most likely there's no observed residues at all for the seqres chain: can't map 1630 // e.g. 3zyb: chains with asym_id L,M,N,O,P have no observed residues 1631 logger.info("Could not map SEQRES chain with asym_id={} to any ATOM chain. Most likely there's " + 1632 "no observed residues in the chain.", seqResChain.getId()); 1633 continue; 1634 } 1635 1636 //map the atoms to the seqres... 1637 1638 // we need to first clone the seqres so that they stay independent for different models 1639 List<Group> seqResGroups = new ArrayList<>(); 1640 for (int i = 0; i < seqResChain.getAtomGroups().size(); i++) { 1641 seqResGroups.add((Group)seqResChain.getAtomGroups().get(i).clone()); 1642 } 1643 1644 for (int seqResPos = 0 ; seqResPos < seqResGroups.size(); seqResPos++) { 1645 Group seqresG = seqResGroups.get(seqResPos); 1646 boolean found = false; 1647 for (Group atomG : atomChain.getAtomGroups()) { 1648 1649 int internalNr = getInternalNr(atomG); 1650 1651 if (seqresG.getResidueNumber().getSeqNum() == internalNr) { 1652 seqResGroups.set(seqResPos, atomG); 1653 found = true; 1654 break; 1655 } 1656 } 1657 1658 if (!found) 1659 // so far the residue number has tracked internal numbering. 
1660 // however there are no atom records, as such this can't be a PDB residue number... 1661 seqresG.setResidueNumber(null); 1662 } 1663 atomChain.setSeqResGroups(seqResGroups); 1664 } 1665 } 1666 } 1667 1668 private int getInternalNr(Group atomG) { 1669 if (atomG.getType().equals(GroupType.AMINOACID)) { 1670 AminoAcidImpl aa = (AminoAcidImpl) atomG; 1671 return (int) aa.getId(); 1672 } else if (atomG.getType().equals(GroupType.NUCLEOTIDE)) { 1673 NucleotideImpl nu = (NucleotideImpl) atomG; 1674 return (int) nu.getId(); 1675 } else { 1676 HetatomImpl he = (HetatomImpl) atomG; 1677 return (int) he.getId(); 1678 } 1679 } 1680 1681 private void linkEntities() { 1682 for (List<Chain> allModel : allModels) { 1683 for (Chain chain : allModel) { 1684 //logger.info("linking entities for " + chain.getId() + " " + chain.getName()); 1685 String entityId = asymId2entityId.get(chain.getId()); 1686 1687 if (entityId == null) { 1688 // this can happen for instance if the cif file didn't have _struct_asym category at all 1689 // and thus we have no asymId2entityId mapping at all 1690 logger.info("No entity id could be found for chain {}", chain.getId()); 1691 continue; 1692 } 1693 1694 int eId = Integer.parseInt(entityId); 1695 1696 // Entities are not added for non-polymeric entities, if a chain is non-polymeric its entity won't be found. 1697 // TODO: add all entities and unique compounds and add methods to directly get polymer or non-polymer 1698 // asyms (chains). Either create a unique StructureImpl or modify existing for a better representation of the 1699 // mmCIF internal data structures but is compatible with Structure interface. 
1700 // Some examples of PDB entries with this kind of problem: 1701 // - 2uub: asym_id X, chainName Z, entity_id 24: fully non-polymeric but still with its own chainName 1702 // - 3o6j: asym_id K, chainName Z, entity_id 6 : a single water molecule 1703 // - 1dz9: asym_id K, chainName K, entity_id 6 : a potassium ion alone 1704 1705 EntityInfo entityInfo = structure.getEntityById(eId); 1706 if (entityInfo == null) { 1707 // Supports the case where the only chain members were from non-polymeric entity that is missing. 1708 // Solved by creating a new Compound(entity) to which this chain will belong. 1709 logger.info("Could not find an Entity for entity_id {}, for chain id {}, creating a new Entity.", 1710 eId, chain.getId()); 1711 entityInfo = new EntityInfo(); 1712 entityInfo.setMolId(eId); 1713 entityInfo.addChain(chain); 1714 if (chain.isWaterOnly()) { 1715 entityInfo.setType(EntityType.WATER); 1716 } else { 1717 entityInfo.setType(EntityType.NONPOLYMER); 1718 } 1719 chain.setEntityInfo(entityInfo); 1720 structure.addEntityInfo(entityInfo); 1721 } else { 1722 logger.debug("Adding chain with chain id {} (auth id {}) to Entity with entity_id {}", 1723 chain.getId(), chain.getName(), eId); 1724 entityInfo.addChain(chain); 1725 chain.setEntityInfo(entityInfo); 1726 } 1727 1728 } 1729 1730 } 1731 1732 // if no entity information was present in file we then go and find the entities heuristically with EntityFinder 1733 List<EntityInfo> entityInfos = structure.getEntityInfos(); 1734 if (entityInfos == null || entityInfos.isEmpty()) { 1735 List<List<Chain>> polyModels = new ArrayList<>(); 1736 List<List<Chain>> nonPolyModels = new ArrayList<>(); 1737 List<List<Chain>> waterModels = new ArrayList<>(); 1738 1739 for (List<Chain> model : allModels) { 1740 List<Chain> polyChains = new ArrayList<>(); 1741 List<Chain> nonPolyChains = new ArrayList<>(); 1742 List<Chain> waterChains = new ArrayList<>(); 1743 1744 polyModels.add(polyChains); 1745 nonPolyModels.add(nonPolyChains); 
1746 waterModels.add(waterChains); 1747 1748 for (Chain chain : model) { 1749 // we only have entities for polymeric chains, all others are ignored for assigning entities 1750 if (chain.isWaterOnly()) { 1751 waterChains.add(chain); 1752 } else if (chain.isPureNonPolymer()) { 1753 nonPolyChains.add(chain); 1754 } else { 1755 polyChains.add(chain); 1756 } 1757 } 1758 } 1759 1760 entityInfos = EntityFinder.findPolyEntities(polyModels); 1761 EntityFinder.createPurelyNonPolyEntities(nonPolyModels, waterModels, entityInfos); 1762 1763 structure.setEntityInfos(entityInfos); 1764 } 1765 1766 // final sanity check: it can happen that from the annotated entities some are not linked to any chains 1767 // e.g. 3s26: a sugar entity does not have any chains associated to it (it seems to be happening with many sugar compounds) 1768 // we simply log it, this can sign some other problems if the entities are used down the line 1769 for (EntityInfo e : entityInfos) { 1770 if (e.getChains().isEmpty()) { 1771 logger.info("Entity {} '{}' has no chains associated to it", 1772 e.getMolId() < 0 ? "with no entity id" : e.getMolId(), e.getDescription()); 1773 } 1774 } 1775 } 1776 1777 private void initMaps() { 1778 if (structAsym == null || !structAsym.isDefined() || structAsym.getRowCount() == 0) { 1779 logger.info("No _struct_asym category found in file. 
No asym id to entity_id mapping will be available"); 1780 return; 1781 } 1782 1783 Map<String, List<String>> entityId2asymId = new HashMap<>(); 1784 for (int rowIndex = 0; rowIndex < structAsym.getRowCount(); rowIndex++) { 1785 String id = structAsym.getId().get(rowIndex); 1786 String entityId = structAsym.getEntityId().get(rowIndex); 1787 1788 logger.debug("Entity {} matches asym_id: {}", entityId, id); 1789 1790 asymId2entityId.put(id, entityId); 1791 1792 if (entityId2asymId.containsKey(entityId)) { 1793 List<String> asymIds = entityId2asymId.get(entityId); 1794 asymIds.add(id); 1795 } else { 1796 List<String> asymIds = new ArrayList<>(); 1797 asymIds.add(id); 1798 entityId2asymId.put(entityId, asymIds); 1799 } 1800 } 1801 1802 if (entityPoly == null || !entityPoly.isDefined() || entityPoly.getRowCount() == 0) { 1803 logger.info("No _entity_poly category found in file. No asym id to author id mapping will be available " + 1804 "for header only parsing"); 1805 return; 1806 } 1807 1808 for (int rowIndex = 0; rowIndex < entityPoly.getRowCount(); rowIndex++) { 1809 if (!entityPoly.getPdbxStrandId().isDefined()) { 1810 logger.info("_entity_poly.pdbx_strand_id is null for entity {}. Won't be able to map asym ids to " + 1811 "author ids for this entity.", entityPoly.getEntityId().get(rowIndex)); 1812 break; 1813 } 1814 1815 String[] chainNames = entityPoly.getPdbxStrandId().get(rowIndex).split(","); 1816 List<String> asymIds = entityId2asymId.get(entityPoly.getEntityId().get(rowIndex)); 1817 if (chainNames.length != asymIds.size()) { 1818 logger.warn("The list of asym ids (from _struct_asym) and the list of author ids (from _entity_poly) " + 1819 "for entity {} have different lengths! 
Can't provide a mapping from asym ids to author chain " + 1820 "ids", entityPoly.getEntityId().get(rowIndex)); 1821 break; 1822 } 1823 1824 for (int i = 0; i < chainNames.length; i++) { 1825 asymId2authorId.put(asymIds.get(i), chainNames[i]); 1826 } 1827 } 1828 } 1829 1830 @Override 1831 public Structure getContainer() { 1832 return structure; 1833 } 1834}