001package org.biojava.nbio.structure.io.cif; 002 003import java.time.LocalDate; 004import java.time.ZoneId; 005import java.time.format.DateTimeFormatter; 006import java.time.format.DateTimeFormatterBuilder; 007import java.util.ArrayList; 008import java.util.Date; 009import java.util.HashMap; 010import java.util.LinkedHashMap; 011import java.util.List; 012import java.util.Locale; 013import java.util.Map; 014import java.util.NoSuchElementException; 015import java.util.Optional; 016import java.util.OptionalInt; 017import java.util.stream.Collectors; 018import java.util.stream.IntStream; 019 020import javax.vecmath.Matrix4d; 021 022import org.biojava.nbio.structure.AminoAcid; 023import org.biojava.nbio.structure.AminoAcidImpl; 024import org.biojava.nbio.structure.Atom; 025import org.biojava.nbio.structure.AtomImpl; 026import org.biojava.nbio.structure.Chain; 027import org.biojava.nbio.structure.ChainImpl; 028import org.biojava.nbio.structure.DBRef; 029import org.biojava.nbio.structure.Element; 030import org.biojava.nbio.structure.EntityInfo; 031import org.biojava.nbio.structure.EntityType; 032import org.biojava.nbio.structure.Group; 033import org.biojava.nbio.structure.GroupType; 034import org.biojava.nbio.structure.HetatomImpl; 035import org.biojava.nbio.structure.NucleotideImpl; 036import org.biojava.nbio.structure.PDBCrystallographicInfo; 037import org.biojava.nbio.structure.PDBHeader; 038import org.biojava.nbio.structure.PdbId; 039import org.biojava.nbio.structure.ResidueNumber; 040import org.biojava.nbio.structure.SeqMisMatch; 041import org.biojava.nbio.structure.SeqMisMatchImpl; 042import org.biojava.nbio.structure.Site; 043import org.biojava.nbio.structure.Structure; 044import org.biojava.nbio.structure.StructureException; 045import org.biojava.nbio.structure.StructureImpl; 046import org.biojava.nbio.structure.StructureTools; 047import org.biojava.nbio.structure.chem.ChemCompGroupFactory; 048import org.biojava.nbio.structure.io.BondMaker; 049import org.biojava.nbio.structure.io.ChargeAdder; 050import org.biojava.nbio.structure.io.EntityFinder; 051import org.biojava.nbio.structure.io.FileParsingParameters; 052import org.biojava.nbio.structure.io.SeqRes2AtomAligner; 053import org.biojava.nbio.structure.quaternary.BioAssemblyInfo; 054import org.biojava.nbio.structure.quaternary.BiologicalAssemblyBuilder; 055import org.biojava.nbio.structure.quaternary.BiologicalAssemblyTransformation; 056import org.biojava.nbio.structure.xtal.CrystalCell; 057import org.biojava.nbio.structure.xtal.SpaceGroup; 058import org.biojava.nbio.structure.xtal.SymoplibParser; 059import org.rcsb.cif.model.FloatColumn; 060import org.rcsb.cif.model.IntColumn; 061import org.rcsb.cif.model.StrColumn; 062import org.rcsb.cif.model.ValueKind; 063import org.rcsb.cif.schema.mm.AtomSite; 064import org.rcsb.cif.schema.mm.AtomSites; 065import org.rcsb.cif.schema.mm.AuditAuthor; 066import org.rcsb.cif.schema.mm.Cell; 067import org.rcsb.cif.schema.mm.ChemComp; 068import org.rcsb.cif.schema.mm.ChemCompBond; 069import org.rcsb.cif.schema.mm.DatabasePDBRemark; 070import org.rcsb.cif.schema.mm.DatabasePDBRev; 071import org.rcsb.cif.schema.mm.DatabasePDBRevRecord; 072import org.rcsb.cif.schema.mm.Em3dReconstruction; 073import org.rcsb.cif.schema.mm.Entity; 074import org.rcsb.cif.schema.mm.EntityPoly; 075import org.rcsb.cif.schema.mm.EntityPolySeq; 076import org.rcsb.cif.schema.mm.EntitySrcGen; 077import org.rcsb.cif.schema.mm.EntitySrcNat; 078import org.rcsb.cif.schema.mm.Exptl; 079import org.rcsb.cif.schema.mm.PdbxAuditRevisionHistory; 080import org.rcsb.cif.schema.mm.PdbxChemCompIdentifier; 081import org.rcsb.cif.schema.mm.PdbxDatabaseStatus; 082import org.rcsb.cif.schema.mm.PdbxEntityBranchDescriptor; 083import org.rcsb.cif.schema.mm.PdbxEntitySrcSyn; 084import org.rcsb.cif.schema.mm.PdbxMolecule; 085import org.rcsb.cif.schema.mm.PdbxMoleculeFeatures; 086import org.rcsb.cif.schema.mm.PdbxNonpolyScheme; 087import org.rcsb.cif.schema.mm.PdbxReferenceEntityLink; 088import org.rcsb.cif.schema.mm.PdbxReferenceEntityList; 089import org.rcsb.cif.schema.mm.PdbxReferenceEntityPolyLink; 090import org.rcsb.cif.schema.mm.PdbxStructAssembly; 091import org.rcsb.cif.schema.mm.PdbxStructAssemblyGen; 092import org.rcsb.cif.schema.mm.PdbxStructModResidue; 093import org.rcsb.cif.schema.mm.PdbxStructOperList; 094import org.rcsb.cif.schema.mm.Refine; 095import org.rcsb.cif.schema.mm.Struct; 096import org.rcsb.cif.schema.mm.StructAsym; 097import org.rcsb.cif.schema.mm.StructConf; 098import org.rcsb.cif.schema.mm.StructConn; 099import org.rcsb.cif.schema.mm.StructConnType; 100import org.rcsb.cif.schema.mm.StructKeywords; 101import org.rcsb.cif.schema.mm.StructNcsOper; 102import org.rcsb.cif.schema.mm.StructRef; 103import org.rcsb.cif.schema.mm.StructRefSeq; 104import org.rcsb.cif.schema.mm.StructRefSeqDif; 105import org.rcsb.cif.schema.mm.StructSheetRange; 106import org.rcsb.cif.schema.mm.StructSite; 107import org.rcsb.cif.schema.mm.StructSiteGen; 108import org.rcsb.cif.schema.mm.Symmetry; 109import org.slf4j.Logger; 110import org.slf4j.LoggerFactory; 111 112/** 113 * An implementation of a CifFileConsumer for BioJava. Will process the information provided by a CifFile instance and 114 * use it to build up a {@link Structure} object. 115 * @author Sebastian Bittrich 116 * @since 6.0.0 117 */ 118public class CifStructureConsumerImpl implements CifStructureConsumer { 119 private static final Logger logger = LoggerFactory.getLogger(CifStructureConsumerImpl.class); 120 private static final DateTimeFormatter DATE_FORMAT = new DateTimeFormatterBuilder() 121 .parseCaseInsensitive() 122 .appendPattern("yyyy-MM-dd") 123 .toFormatter(Locale.US); 124 125 private Structure structure; 126 private Chain currentChain; 127 private Group currentGroup; 128 private List<List<Chain>> allModels; 129 private List<Chain> currentModel; 130 private PDBHeader pdbHeader; 131 private String currentNmrModelNumber; 132 private Em3dReconstruction em3dReconstruction; 133 private List<Chain> entityChains; 134 135 private Entity entity; 136 private EntityPoly entityPoly; 137 private EntitySrcGen entitySrcGen; 138 private EntitySrcNat entitySrcNat; 139 private PdbxEntitySrcSyn entitySrcSyn; 140 private List<Chain> seqResChains; 141 private PdbxStructAssembly structAssembly; 142 private PdbxStructAssemblyGen structAssemblyGen; 143 private StructAsym structAsym; 144 private StructConn structConn; 145 private StructNcsOper structNcsOper; 146 private PdbxStructOperList structOpers; 147 private StructRef structRef; 148 private StructRefSeqDif structRefSeqDif; 149 private StructSiteGen structSiteGen; 150 151 private Map<String, String> asymId2entityId; 152 private Map<String, String> asymId2authorId; 153 private Matrix4d parsedScaleMatrix; 154 155 private final FileParsingParameters params; 156 157 public CifStructureConsumerImpl(FileParsingParameters params) { 158 this.params = params; 159 } 160 161 @Override 162 public void prepare() { 163 this.structure = new StructureImpl(); 164 this.pdbHeader = new PDBHeader(); 165 structure.setPDBHeader(pdbHeader); 166 167 this.allModels = new ArrayList<>(); 168 this.currentModel = new ArrayList<>(); 169 170 this.seqResChains = new ArrayList<>(); 171 this.asymId2entityId = new HashMap<>(); 172 this.asymId2authorId = new HashMap<>(); 173 174 this.entityChains = new ArrayList<>(); 175 } 176 177 @Override 178 public void consumeAtomSite(AtomSite atomSite) { 179 if (params.isHeaderOnly()) { 180 return; 181 } 182 183 StrColumn labelAsymId = atomSite.getLabelAsymId(); 184 StrColumn authAsymId = atomSite.getAuthAsymId(); 185 186 StrColumn groupPDB = atomSite.getGroupPDB(); 187 IntColumn authSeqId = atomSite.getAuthSeqId(); 188 189 StrColumn labelCompId = atomSite.getLabelCompId(); 190 191 IntColumn id = atomSite.getId(); 192 StrColumn labelAtomId = atomSite.getLabelAtomId(); 193 194 FloatColumn cartnX = atomSite.getCartnX(); 195 FloatColumn cartnY = atomSite.getCartnY(); 196 FloatColumn cartnZ = atomSite.getCartnZ(); 197 198 FloatColumn occupancy = atomSite.getOccupancy(); 199 FloatColumn bIsoOrEquiv = atomSite.getBIsoOrEquiv(); 200 201 StrColumn labelAltId = atomSite.getLabelAltId(); 202 StrColumn typeSymbol = atomSite.getTypeSymbol(); 203 204 StrColumn pdbxPDBInsCode = atomSite.getPdbxPDBInsCode(); 205 IntColumn labelSeqId = atomSite.getLabelSeqId(); 206 IntColumn pdbx_pdb_model_num = atomSite.getPdbxPDBModelNum(); 207 208 for (int atomIndex = 0; atomIndex < atomSite.getRowCount(); atomIndex++) { 209 boolean startOfNewChain = false; 210 Character oneLetterCode = StructureTools.get1LetterCodeAmino(labelCompId.get(atomIndex)); 211 212 boolean isHetAtmInFile = false; 213 if (!"ATOM".equals(groupPDB.get(atomIndex))) { 214 if (oneLetterCode != null && oneLetterCode.equals(StructureTools.UNKNOWN_GROUP_LABEL)) { 215 oneLetterCode = null; 216 } 217 218 isHetAtmInFile = true; 219 } 220 221 String insCodeString = pdbxPDBInsCode.get(atomIndex); 222 Character insCode = null; 223 if (insCodeString != null && !insCodeString.isEmpty() && !"?".equals(insCodeString)) { 224 insCode = insCodeString.charAt(0); 225 } 226 227 // non polymer chains (ligands and small molecules) will have a label_seq_id set to '.' 228 long seqId = labelSeqId.get(atomIndex); 229 230 String nmrModelNumber = pdbx_pdb_model_num.getStringData(atomIndex); 231 232 if (currentNmrModelNumber == null) { 233 currentNmrModelNumber = nmrModelNumber; 234 } 235 if (!currentNmrModelNumber.equals(nmrModelNumber)) { 236 currentNmrModelNumber = nmrModelNumber; 237 238 if (currentChain != null) { 239 currentChain.addGroup(currentGroup); 240 currentGroup.trimToSize(); 241 } 242 243 allModels.add(currentModel); 244 currentModel = new ArrayList<>(); 245 currentChain = null; 246 currentGroup = null; 247 } 248 249 String asymId = labelAsymId.get(atomIndex); 250 String authId = authAsymId.get(atomIndex); 251 if (currentChain == null) { 252 currentChain = new ChainImpl(); 253 currentChain.setName(authId); 254 currentChain.setId(asymId); 255 currentModel.add(currentChain); 256 startOfNewChain = true; 257 } 258 259 if (!asymId.equals(currentChain.getId())) { 260 startOfNewChain = true; 261 262 currentChain.addGroup(currentGroup); 263 264 Optional<Chain> testChain = currentModel.stream() 265 .filter(chain -> chain.getId().equals(asymId)) 266 .findFirst(); 267 268 if (testChain.isPresent()) { 269 currentChain = testChain.get(); 270 } else { 271 currentChain = new ChainImpl(); 272 currentChain.setName(authId); 273 currentChain.setId(asymId); 274 } 275 276 if (!currentModel.contains(currentChain)) { 277 currentModel.add(currentChain); 278 } 279 } 280 281 ResidueNumber residueNumber = new ResidueNumber(authId, authSeqId.get(atomIndex), insCode); 282 283 String recordName = groupPDB.get(atomIndex); 284 String compId = labelCompId.get(atomIndex); 285 if (currentGroup == null) { 286 currentGroup = createGroup(recordName, oneLetterCode, compId, seqId); 287 currentGroup.setResidueNumber(residueNumber); 288 currentGroup.setPDBName(compId); 289 currentGroup.setHetAtomInFile(isHetAtmInFile); 290 } 291 292 Group altGroup = null; 293 String altLocation = labelAltId.get(atomIndex); 294 295 if (startOfNewChain) { 296 currentGroup = createGroup(recordName, oneLetterCode, compId, seqId); 297 currentGroup.setResidueNumber(residueNumber); 298 currentGroup.setPDBName(compId); 299 currentGroup.setHetAtomInFile(isHetAtmInFile); 300 } else { 301 if (!residueNumber.equals(currentGroup.getResidueNumber())) { 302 currentChain.addGroup(currentGroup); 303 currentGroup.trimToSize(); 304 currentGroup = createGroup(recordName, oneLetterCode, compId, seqId); 305 currentGroup.setPDBName(compId); 306 currentGroup.setResidueNumber(residueNumber); 307 currentGroup.setHetAtomInFile(isHetAtmInFile); 308 } else { 309 if (altLocation != null && !altLocation.isEmpty() && !altLocation.equals(".")) { 310 altGroup = getAltLocGroup(recordName, altLocation.charAt(0), oneLetterCode, compId, seqId); 311 if (altGroup.getChain() == null) { 312 altGroup.setChain(currentChain); 313 } 314 } 315 } 316 } 317 318 if (params.isParseCAOnly()) { 319 if (!labelAtomId.get(atomIndex).equals(StructureTools.CA_ATOM_NAME) && "C".equals(typeSymbol.get(atomIndex))) { 320 continue; 321 } 322 } 323 324 Atom atom = new AtomImpl(); 325 326 atom.setPDBserial(id.get(atomIndex)); 327 atom.setName(labelAtomId.get(atomIndex)); 328 329 atom.setX(cartnX.get(atomIndex)); 330 atom.setY(cartnY.get(atomIndex)); 331 atom.setZ(cartnZ.get(atomIndex)); 332 333 atom.setOccupancy((float) occupancy.get(atomIndex)); 334 atom.setTempFactor((float) bIsoOrEquiv.get(atomIndex)); 335 336 if (altLocation == null || altLocation.isEmpty() || altLocation.equals(".")) { 337 atom.setAltLoc(' '); 338 } else { 339 atom.setAltLoc(altLocation.charAt(0)); 340 } 341 342 String ts = typeSymbol.get(atomIndex); 343 try { 344 Element element = Element.valueOfIgnoreCase(ts); 345 atom.setElement(element); 346 } catch (IllegalArgumentException e) { 347 logger.info("Element {} was not recognised as a BioJava-known element, the element will be " + 348 "represented as the generic element {}", ts, Element.R.name()); 349 atom.setElement(Element.R); 350 } 351 352 if (altGroup != null) { 353 altGroup.addAtom(atom); 354 } else { 355 currentGroup.addAtom(atom); 356 } 357 358 String atomName = atom.getName(); 359 if (!currentGroup.hasAtom(atomName)) { 360 if (currentGroup.getPDBName().equals(atom.getGroup().getPDBName())) { 361 if (!StructureTools.hasNonDeuteratedEquiv(atom, currentGroup)) { 362 currentGroup.addAtom(atom); 363 } 364 } 365 } 366 } 367 } 368 369 private Group getAltLocGroup(String recordName, Character altLoc, Character oneLetterCode, String threeLetterCode, 370 long seqId) { 371 List<Atom> atoms = currentGroup.getAtoms(); 372 if (atoms.size() > 0) { 373 if (atoms.get(0).getAltLoc().equals(altLoc)) { 374 return currentGroup; 375 } 376 } 377 378 List<Group> altLocs = currentGroup.getAltLocs(); 379 for (Group altLocGroup : altLocs) { 380 atoms = altLocGroup.getAtoms(); 381 if (atoms.size() > 0) { 382 for (Atom a1 : atoms) { 383 if (a1.getAltLoc().equals(altLoc)) { 384 return altLocGroup; 385 } 386 } 387 } 388 } 389 390 if (threeLetterCode.equals(currentGroup.getPDBName())) { 391 if (currentGroup.getAtoms().isEmpty()) { 392 return currentGroup; 393 } 394 395 Group altLocGroup = (Group) currentGroup.clone(); 396 altLocGroup.setAtoms(new ArrayList<>()); 397 altLocGroup.getAltLocs().clear(); 398 currentGroup.addAltLoc(altLocGroup); 399 return altLocGroup; 400 } 401 402 Group altLocGroup = createGroup(recordName, oneLetterCode, threeLetterCode, seqId); 403 altLocGroup.setPDBName(threeLetterCode); 404 altLocGroup.setResidueNumber(currentGroup.getResidueNumber()); 405 currentGroup.addAltLoc(altLocGroup); 406 return altLocGroup; 407 } 408 409 private Group createGroup(String record, Character oneLetterCode, String threeLetterCode, long seqId) { 410 Group group = ChemCompGroupFactory.getGroupFromChemCompDictionary(threeLetterCode); 411 if (group != null && !group.getChemComp().isEmpty()) { 412 if (group instanceof AminoAcidImpl) { 413 AminoAcidImpl aminoAcid = (AminoAcidImpl) group; 414 aminoAcid.setId(seqId); 415 } else if (group instanceof NucleotideImpl) { 416 NucleotideImpl nucleotide = (NucleotideImpl) group; 417 nucleotide.setId(seqId); 418 } else if (group instanceof HetatomImpl) { 419 HetatomImpl hetatom = (HetatomImpl) group; 420 hetatom.setId(seqId); 421 } 422 return group; 423 } 424 425 if ("ATOM".equals(record)) { 426 if (StructureTools.isNucleotide(threeLetterCode)) { 427 NucleotideImpl nucleotide = new NucleotideImpl(); 428 group = nucleotide; 429 nucleotide.setId(seqId); 430 } else if (oneLetterCode == null || oneLetterCode == StructureTools.UNKNOWN_GROUP_LABEL) { 431 HetatomImpl hetatom = new HetatomImpl(); 432 group = hetatom; 433 hetatom.setId(seqId); 434 } else { 435 AminoAcidImpl aminoAcid = new AminoAcidImpl(); 436 group = aminoAcid; 437 aminoAcid.setAminoType(oneLetterCode); 438 aminoAcid.setId(seqId); 439 } 440 } else { 441 if (StructureTools.isNucleotide(threeLetterCode)) { 442 NucleotideImpl nucleotide = new NucleotideImpl(); 443 group = nucleotide; 444 nucleotide.setId(seqId); 445 } else if (oneLetterCode != null) { 446 AminoAcidImpl aminoAcid = new AminoAcidImpl(); 447 group = aminoAcid; 448 aminoAcid.setAminoType(oneLetterCode); 449 aminoAcid.setId(seqId); 450 } else { 451 HetatomImpl hetatom = new HetatomImpl(); 452 hetatom.setId(seqId); 453 group = hetatom; 454 } 455 } 456 return group; 457 } 458 459 @Override 460 public void consumeAtomSites(AtomSites atomSites) { 461 // no atom sites present 462 if (!atomSites.isDefined() || atomSites.getRowCount() == 0) { 463 return; 464 } 465 466 try { 467 parsedScaleMatrix = new Matrix4d( 468 atomSites.getFractTransfMatrix11().get(0), 469 atomSites.getFractTransfMatrix12().get(0), 470 atomSites.getFractTransfMatrix13().get(0), 471 atomSites.getFractTransfVector1().get(0), 472 473 atomSites.getFractTransfMatrix21().get(0), 474 atomSites.getFractTransfMatrix22().get(0), 475 atomSites.getFractTransfMatrix23().get(0), 476 atomSites.getFractTransfVector2().get(0), 477 478 atomSites.getFractTransfMatrix31().get(0), 479 atomSites.getFractTransfMatrix32().get(0), 480 atomSites.getFractTransfMatrix33().get(0), 481 atomSites.getFractTransfVector3().get(0), 482 483 0, 484 0, 485 0, 486 1 487 ); 488 } catch (NumberFormatException e) { 489 logger.warn("Some values in _atom_sites.fract_transf_matrix or _atom_sites.fract_transf_vector could not " + 490 "be parsed as numbers. Can't check whether coordinate frame convention is correct! Error: {}", 491 e.getMessage()); 492 structure.getPDBHeader().getCrystallographicInfo().setNonStandardCoordFrameConvention(false); 493 } 494 } 495 496 @Override 497 public void consumeAuditAuthor(AuditAuthor auditAuthor) { 498 for (int rowIndex = 0; rowIndex < auditAuthor.getRowCount(); rowIndex++) { 499 String name = auditAuthor.getName().get(rowIndex); 500 501 StringBuilder last = new StringBuilder(); 502 StringBuilder initials = new StringBuilder(); 503 boolean afterComma = false; 504 for (char c : name.toCharArray()) { 505 if (c == ' ') { 506 continue; 507 } 508 if (c == ',') { 509 afterComma = true; 510 continue; 511 } 512 513 if (afterComma) { 514 initials.append(c); 515 } else { 516 last.append(c); 517 } 518 } 519 520 StringBuilder newaa = new StringBuilder(); 521 newaa.append(initials); 522 newaa.append(last); 523 524 String auth = pdbHeader.getAuthors(); 525 if (auth == null) { 526 pdbHeader.setAuthors(newaa.toString()); 527 } else { 528 auth += "," + newaa.toString(); 529 pdbHeader.setAuthors(auth); 530 } 531 } 532 } 533 534 @Override 535 public void consumeCell(Cell cell) { 536 if (!cell.isDefined() || cell.getRowCount() == 0) { 537 return; 538 } 539 540 try { 541 float a = (float) cell.getLengthA().get(0); 542 float b = (float) cell.getLengthB().get(0); 543 float c = (float) cell.getLengthC().get(0); 544 float alpha = (float) cell.getAngleAlpha().get(0); 545 float beta = (float) cell.getAngleBeta().get(0); 546 float gamma = (float) cell.getAngleGamma().get(0); 547 548 CrystalCell crystalCell = new CrystalCell(); 549 crystalCell.setA(a); 550 crystalCell.setB(b); 551 crystalCell.setC(c); 552 crystalCell.setAlpha(alpha); 553 crystalCell.setBeta(beta); 554 crystalCell.setGamma(gamma); 555 556 if (!crystalCell.isCellReasonable()) { 557 // If the entry describes a structure determined by a technique other than X-ray crystallography, 558 // cell is (sometimes!) a = b = c = 1.0, alpha = beta = gamma = 90 degrees 559 // if so we don't add and CrystalCell will be null 560 logger.debug("The crystal cell read from file does not have reasonable dimensions (at least one " + 561 "dimension is below {}), discarding it.", CrystalCell.MIN_VALID_CELL_SIZE); 562 return; 563 } 564 565 structure.getPDBHeader() 566 .getCrystallographicInfo() 567 .setCrystalCell(crystalCell); 568 569 } catch (NumberFormatException e){ 570 structure.getPDBHeader() 571 .getCrystallographicInfo() 572 .setCrystalCell(null); 573 logger.info("could not parse some cell parameters ({}), ignoring _cell", e.getMessage()); 574 } 575 } 576 577 @Override 578 public void consumeChemComp(ChemComp chemComp) { 579 // TODO not impled in ref 580 } 581 582 @Override 583 public void consumeChemCompBond(ChemCompBond chemCompBond) { 584 // TODO not impled in ref 585 } 586 587 @Override 588 public void consumeDatabasePDBRemark(DatabasePDBRemark databasePDBremark) { 589 for (int rowIndex = 0; rowIndex < databasePDBremark.getRowCount(); rowIndex++) { 590 int id = databasePDBremark.getId().get(rowIndex); 591 if (id == 2) { 592 String line = databasePDBremark.getText().get(rowIndex); 593 int i = line.indexOf("ANGSTROM"); 594 595 if (i > 5) { 596 // line contains ANGSTROM info... 597 String resolution = line.substring(i - 5, i).trim(); 598 // convert string to float 599 try { 600 float res = Float.parseFloat(resolution); 601 pdbHeader.setResolution(res); 602 } catch (NumberFormatException e) { 603 logger.info("could not parse resolution from line and ignoring it {}", line); 604 return; 605 } 606 } 607 } 608 } 609 } 610 611 private Date convert(LocalDate localDate) { 612 return Date.from(localDate.atStartOfDay().atZone(ZoneId.systemDefault()).toInstant()); 613 } 614 615 @Override 616 public void consumeDatabasePDBRev(DatabasePDBRev databasePDBrev) { 617 logger.debug("got a database revision:" + databasePDBrev); 618 619 Date modDate = null; 620 for (int rowIndex = 0; rowIndex < databasePDBrev.getRowCount(); rowIndex++) { 621 if (databasePDBrev.getNum().get(rowIndex) == 1) { 622 String dateOriginal = databasePDBrev.getDateOriginal().get(rowIndex); 623 pdbHeader.setDepDate(convert(LocalDate.parse(dateOriginal, DATE_FORMAT))); 624 625 String date = databasePDBrev.getDate().get(rowIndex); 626 final Date relDate = convert(LocalDate.parse(date, DATE_FORMAT)); 627 pdbHeader.setRelDate(relDate); 628 modDate = relDate; 629 } else { 630 String dbrev = databasePDBrev.getDate().get(rowIndex); 631 modDate = convert(LocalDate.parse(dbrev, DATE_FORMAT)); 632 } 633 pdbHeader.setModDate(modDate); 634 } 635 } 636 637 @Override 638 public void consumeDatabasePDBRevRecord(DatabasePDBRevRecord databasePDBrevRecord) { 639 List<org.biojava.nbio.structure.DatabasePDBRevRecord> revRecords = pdbHeader.getRevisionRecords(); 640 if (revRecords == null) { 641 revRecords = new ArrayList<>(); 642 pdbHeader.setRevisionRecords(revRecords); 643 } 644 645 for (int i = 0; i < databasePDBrevRecord.getRowCount(); i++) { 646 revRecords.add(new org.biojava.nbio.structure.DatabasePDBRevRecord(databasePDBrevRecord, i)); 647 } 648 } 649 650 @Override 651 public void consumeEm3dReconstruction(Em3dReconstruction em3dReconstruction) { 652 this.em3dReconstruction = em3dReconstruction; 653 654 for (int rowIndex = 0; rowIndex < em3dReconstruction.getRowCount(); rowIndex++) { //can it have more than 1 value? 655 final FloatColumn resolution = em3dReconstruction.getResolution(); 656 if (ValueKind.PRESENT.equals(resolution.getValueKind(rowIndex))) 657 pdbHeader.setResolution((float) resolution.get(rowIndex)); 658 } 659 //TODO other fields (maybe RFree)? 660 } 661 662 @Override 663 public void consumeEntity(Entity entity) { 664 this.entity = entity; 665 } 666 667 @Override 668 public void consumeEntityPoly(EntityPoly entityPoly) { 669 this.entityPoly = entityPoly; 670 } 671 672 @Override 673 public void consumeEntitySrcGen(EntitySrcGen entitySrcGen) { 674 this.entitySrcGen = entitySrcGen; 675 } 676 677 @Override 678 public void consumeEntitySrcNat(EntitySrcNat entitySrcNat) { 679 this.entitySrcNat = entitySrcNat; 680 } 681 682 @Override 683 public void consumeEntitySrcSyn(PdbxEntitySrcSyn entitySrcSyn) { 684 this.entitySrcSyn = entitySrcSyn; 685 } 686 687 @Override 688 public void consumeEntityPolySeq(EntityPolySeq entityPolySeq) { 689 for (int rowIndex = 0; rowIndex < entityPolySeq.getRowCount(); rowIndex++) { 690 Chain entityChain = getEntityChain(entityPolySeq.getEntityId().get(rowIndex)); 691 692 // first we check through the chemcomp provider, if it fails we do some heuristics to guess the type of group 693 // TODO some of this code is analogous to getNewGroup() and we should try to unify them - JD 2016-03-08 694 695 Group g = ChemCompGroupFactory.getGroupFromChemCompDictionary(entityPolySeq.getMonId().get(rowIndex)); 696 //int seqId = Integer.parseInt(entityPolySeq.getNum()); 697 if (g != null && !g.getChemComp().isEmpty()) { 698 if (g instanceof AminoAcidImpl) { 699 AminoAcidImpl aa = (AminoAcidImpl) g; 700 aa.setRecordType(AminoAcid.SEQRESRECORD); 701 } 702 } else { 703 if (entityPolySeq.getMonId().get(rowIndex).length() == 3 && 704 StructureTools.get1LetterCodeAmino(entityPolySeq.getMonId().get(rowIndex)) != null) { 705 AminoAcidImpl a = new AminoAcidImpl(); 706 a.setRecordType(AminoAcid.SEQRESRECORD); 707 Character code1 = StructureTools.get1LetterCodeAmino(entityPolySeq.getMonId().get(rowIndex)); 708 a.setAminoType(code1); 709 g = a; 710 711 } else if (StructureTools.isNucleotide(entityPolySeq.getMonId().get(rowIndex))) { 712 // the group is actually a nucleotide group... 713 g = new NucleotideImpl(); 714 } else { 715 logger.debug("Residue {} {} is not a standard aminoacid or nucleotide, will create a het group " + 716 "for it", entityPolySeq.getNum().get(rowIndex), entityPolySeq.getMonId().get(rowIndex)); 717 g = new HetatomImpl(); 718 } 719 } 720 // at this stage we don't know about author residue numbers (insertion codes) 721 // we abuse now the ResidueNumber field setting the internal residue numbers (label_seq_id, strictly 722 // sequential and follow the seqres sequence 1 to n) 723 // later the actual ResidueNumbers (author residue numbers) have to be corrected in alignSeqRes() 724 g.setResidueNumber(ResidueNumber.fromString(entityPolySeq.getNum().getStringData(rowIndex))); 725 g.setPDBName(entityPolySeq.getMonId().get(rowIndex)); 726 entityChain.addGroup(g); 727 } 728 } 729 730 private Chain getEntityChain(String entityId) { 731 for (Chain chain : entityChains) { 732 if (chain.getId().equals(entityId)) { 733 return chain; 734 } 735 } 736 737 // does not exist yet, so create... 738 Chain chain = new ChainImpl(); 739 chain.setId(entityId); 740 entityChains.add(chain); 741 742 return chain; 743 } 744 745 @Override 746 public void consumeExptl(Exptl exptl) { 747 for (int rowIndex = 0; rowIndex < exptl.getRowCount(); rowIndex++) { 748 pdbHeader.setExperimentalTechnique(exptl.getMethod().get(rowIndex)); 749 } 750 } 751 752 @Override 753 public void consumePdbxAuditRevisionHistory(PdbxAuditRevisionHistory pdbxAuditRevisionHistory) { 754 Date date = null; 755 for (int rowIndex = 0; rowIndex < pdbxAuditRevisionHistory.getRowCount(); rowIndex++) { 756 // first entry in revision history is the release date 757 if (pdbxAuditRevisionHistory.getOrdinal().get(rowIndex) == 1) { 758 String release = pdbxAuditRevisionHistory.getRevisionDate().get(rowIndex); 759 date = convert(LocalDate.parse(release, DATE_FORMAT)); 760 pdbHeader.setRelDate(date); 761 } else { 762 // all other dates are revision dates; 763 // since this method may be called multiple times, 764 // the last revision date will "stick" 765 String revision = pdbxAuditRevisionHistory.getRevisionDate().get(rowIndex); 766 date = convert(LocalDate.parse(revision, DATE_FORMAT)); 767 } 768 pdbHeader.setModDate(date); 769 } 770 } 771 772 @Override 773 public void consumePdbxChemCompIdentifier(PdbxChemCompIdentifier pdbxChemCompIdentifier) { 774 // TODO not impled in ref 775 } 776 777 @Override 778 public void consumePdbxDatabaseStatus(PdbxDatabaseStatus pdbxDatabaseStatus) { 779 for (int rowIndex = 0; rowIndex < pdbxDatabaseStatus.getRowCount(); rowIndex++) { 780 // the deposition date field is only available in mmCIF 5.0 781 StrColumn recvdInitialDepositionDate = pdbxDatabaseStatus.getRecvdInitialDepositionDate(); 782 if (recvdInitialDepositionDate.isDefined()) { 783 String deposition = recvdInitialDepositionDate.get(rowIndex); 784 pdbHeader.setDepDate(convert(LocalDate.parse(deposition, DATE_FORMAT))); 785 } 786 } 787 } 788 789 @Override 790 public void consumePdbxEntityBranchDescriptor(PdbxEntityBranchDescriptor pdbxEntityBranchDescriptor) { 791 // TODO not considered in ref 792 } 793 794 @Override 795 public void consumePdbxMolecule(PdbxMolecule pdbxMolecule) { 796 // TODO not considered in ref 797 } 798 799 @Override 800 public void consumePdbxMoleculeFeatures(PdbxMoleculeFeatures pdbxMoleculeFeatures) { 801 // TODO not considered in ref 802 } 803 804 @Override 805 public void consumePdbxNonpolyScheme(PdbxNonpolyScheme pdbxNonpolyScheme) { 806 // TODO not impled in ref 807 } 808 809 @Override 810 public void consumePdbxReferenceEntityLink(PdbxReferenceEntityLink pdbxReferenceEntityLink) { 811 // TODO not considered in ref 812 } 813 814 @Override 815 public void consumePdbxReferenceEntityList(PdbxReferenceEntityList pdbxReferenceEntityList) { 816 // TODO not considered in ref 817 } 818 819 @Override 820 public void consumePdbxReferenceEntityPolyLink(PdbxReferenceEntityPolyLink pdbxReferenceEntityPolyLink) { 821 // TODO not considered in ref 822 } 823 824 @Override 825 public void consumePdbxStructAssembly(PdbxStructAssembly pdbxStructAssembly) { 826 this.structAssembly = pdbxStructAssembly; 827 } 828 829 @Override 830 public void consumePdbxStructAssemblyGen(PdbxStructAssemblyGen pdbxStructAssemblyGen) { 831 this.structAssemblyGen = pdbxStructAssemblyGen; 832 } 833 834 @Override 835 public void consumePdbxStructModResidue(PdbxStructModResidue pdbxStructModResidue) { 836 // TODO not considered in ref 837 } 838 839 @Override 840 public void consumePdbxStructOperList(PdbxStructOperList pdbxStructOperList) { 841 this.structOpers = pdbxStructOperList; 842 } 843 844 @Override 845 public void consumeRefine(Refine refine) { 846 for (int rowIndex = 0; rowIndex < refine.getRowCount(); rowIndex++) { 847 // RESOLUTION 848 ValueKind valueKind = refine.getLsDResHigh().getValueKind(rowIndex); 849 if (! ValueKind.PRESENT.equals(valueKind)) { 850 continue; 851 } 852 // in very rare cases (for instance hybrid methods x-ray + neutron diffraction, e.g. 3ins, 4n9m) 853 // there are 2 resolution values, one for each method 854 // we take the last one found so that behaviour is like in PDB file parsing 855 double lsDResHigh = refine.getLsDResHigh().get(rowIndex); 856 // TODO this could use a check to keep reasonable values - 1.5 may be overwritten by 0.0 857 if (pdbHeader.getResolution() != PDBHeader.DEFAULT_RESOLUTION) { 858 logger.warn("More than 1 resolution value present, will use last one {} and discard previous {}", 859 lsDResHigh, String.format("%4.2f",pdbHeader.getResolution())); 860 } 861 pdbHeader.setResolution((float) lsDResHigh); 862 863 FloatColumn lsRFactorRFree = refine.getLsRFactorRFree(); 864 // RFREE 865 if (pdbHeader.getRfree() != PDBHeader.DEFAULT_RFREE) { 866 logger.warn("More than 1 Rfree value present, will use last one {} and discard previous {}", 867 lsRFactorRFree, String.format("%4.2f",pdbHeader.getRfree())); 868 } 869 if (lsRFactorRFree.isDefined() && lsRFactorRFree.getValueKind(rowIndex) == ValueKind.PRESENT) { 870 pdbHeader.setRfree((float) lsRFactorRFree.get(rowIndex)); 871 } else { 872 // some entries like 2ifo haven't got this field at all 873 logger.info("_refine.ls_R_factor_R_free not present, not parsing Rfree value"); 874 } 875 876 // RWORK 877 FloatColumn lsRFactorRWork = refine.getLsRFactorRWork(); 878 if(pdbHeader.getRwork() != PDBHeader.DEFAULT_RFREE) { 879 logger.warn("More than 1 R work value present, will use last one {} and discard previous {} ", 880 lsRFactorRWork, String.format("%4.2f",pdbHeader.getRwork())); 881 } 882 if (lsRFactorRWork.isDefined() && lsRFactorRWork.getValueKind(rowIndex) == ValueKind.PRESENT) { 883 pdbHeader.setRwork((float) lsRFactorRWork.get(rowIndex)); 884 } else { 885 logger.info("_refine.ls_R_factor_R_work not present, not parsing R-work value"); 886 } 887 } 888 } 889 890 @Override 891 public void consumeStruct(Struct struct) { 892 if (struct.isDefined() && struct.getTitle().isDefined()) { 893 pdbHeader.setTitle(struct.getTitle().get(0)); 894 } 895 896 if (struct.isDefined() && struct.getEntryId().isDefined()) { 897 PdbId pdbId; 898 String pdbCode = struct.getEntryId().get(0); 899 try { 900 pdbId = new PdbId(pdbCode); 901 } catch (IllegalArgumentException e) { 902 logger.info("Malformed (or null) PDB ID {}. setting PdbId to null", pdbCode); 903 pdbId = null; 904 } 905 pdbHeader.setPdbId(pdbId); 906 structure.setPdbId(pdbId); 907 } 908 } 909 910 @Override 911 public void consumeStructAsym(StructAsym structAsym) { 912 this.structAsym = structAsym; 913 } 914 915 @Override 916 public void consumeStructConf(StructConf structConf) { 917 // TODO not considered in ref 918 } 919 920 @Override 921 public void consumeStructConn(StructConn structConn) { 922 this.structConn = structConn; 923 } 924 925 @Override 926 public void consumeStructConnType(StructConnType structConnType) { 927 // TODO not considered in ref 928 } 929 930 @Override 931 public void consumeStructKeywords(StructKeywords structKeywords) { 932 ArrayList<String> keywordsList = new ArrayList<String>(); 933 934 StrColumn text = structKeywords.getText(); 935 if (text.isDefined()) { 936 String keywords = text.get(0); 937 String[] strings = keywords.split(" *, *"); 938 for (String string : strings) { 939 keywordsList.add(string.trim()); 940 } 941 } 942 structure.getPDBHeader().setKeywords(keywordsList); 943 944 StrColumn pdbxKeywords = structKeywords.getPdbxKeywords(); 945 if (pdbxKeywords.isDefined()) { 946 String keywords = pdbxKeywords.get(0); 947 pdbHeader.setClassification(keywords); 948 //This field should be left empty. TODO The next line should be removed later 949 pdbHeader.setDescription(keywords); 950 } 951 } 952 953 @Override 954 public void consumeStructNcsOper(StructNcsOper structNcsOper) { 955 this.structNcsOper = structNcsOper; 956 } 957 958 @Override 959 public void consumeStructRef(StructRef structRef) { 960 this.structRef = structRef; 961 } 962 963 @Override 964 public void consumeStructRefSeq(StructRefSeq structRefSeq) { 965 for (int rowIndex = 0; rowIndex < structRefSeq.getRowCount(); rowIndex++) { 966 String refId = structRefSeq.getRefId().get(rowIndex); 967 968 DBRef dbRef = new DBRef(); 969 970 dbRef.setIdCode(structRefSeq.getPdbxPDBIdCode().get(rowIndex)); 971 dbRef.setDbAccession(structRefSeq.getPdbxDbAccession().get(rowIndex)); 972 dbRef.setDbIdCode(structRefSeq.getPdbxDbAccession().get(rowIndex)); 973 dbRef.setChainName(structRefSeq.getPdbxStrandId().get(rowIndex)); 974 975 OptionalInt structRefRowIndex = IntStream.range(0, structRef.getRowCount()) 976 .filter(i -> structRef.getId().get(i).equals(refId)) 977 .findFirst(); 978 979 if (structRefRowIndex.isPresent()) { 980 dbRef.setDatabase(structRef.getDbName().get(structRefRowIndex.getAsInt())); 981 dbRef.setDbIdCode(structRef.getDbCode().get(structRefRowIndex.getAsInt())); 982 } else { 983 logger.info("could not find StructRef `{} for StructRefSeq {}", refId, rowIndex); 984 } 985 986 int seqBegin; 987 int seqEnd; 988 989 try { 990 seqBegin = Integer.parseInt(structRefSeq.getPdbxAuthSeqAlignBeg().get(rowIndex)); 991 seqEnd = Integer.parseInt(structRefSeq.getPdbxAuthSeqAlignEnd().get(rowIndex)); 992 } catch (NumberFormatException e) { 993 // this happens in a few entries, annotation error? e.g. 6eoj 994 logger.warn("Couldn't parse pdbx_auth_seq_align_beg/end in _struct_ref_seq. Will not store dbref " + 995 "alignment info for accession {}. Error: {}", dbRef.getDbAccession(), e.getMessage()); 996 return; 997 } 998 999 char beginInsCode = ' '; 1000 String pdbxSeqAlignBegInsCode = structRefSeq.getPdbxSeqAlignBegInsCode().get(rowIndex); 1001 if (pdbxSeqAlignBegInsCode.length() > 0) { 1002 beginInsCode = pdbxSeqAlignBegInsCode.charAt(0); 1003 } 1004 1005 char endInsCode = ' '; 1006 String pdbxSeqAlignEndInsCode = structRefSeq.getPdbxSeqAlignEndInsCode().get(rowIndex); 1007 if (pdbxSeqAlignEndInsCode.length() > 0) { 1008 endInsCode = pdbxSeqAlignEndInsCode.charAt(0); 1009 } 1010 1011 if (beginInsCode == '?') { 1012 beginInsCode = ' '; 1013 } 1014 if (endInsCode == '?') { 1015 endInsCode = ' '; 1016 } 1017 1018 dbRef.setSeqBegin(seqBegin); 1019 dbRef.setInsertBegin(beginInsCode); 1020 dbRef.setSeqEnd(seqEnd); 1021 dbRef.setInsertEnd(endInsCode); 1022 1023 int dbSeqBegin = structRefSeq.getDbAlignBeg().get(rowIndex); 1024 int dbSeqEnd = structRefSeq.getDbAlignEnd().get(rowIndex); 1025 1026 char dbBeginInsCode = ' '; 1027 StrColumn pdbxDbAlignBegInsCodeCol = structRefSeq.getPdbxDbAlignBegInsCode(); 1028 if (pdbxDbAlignBegInsCodeCol.isDefined()) { 1029 String pdbxDbAlignBegInsCode = pdbxDbAlignBegInsCodeCol.get(rowIndex); 1030 if (pdbxDbAlignBegInsCode.length() > 0) { 1031 dbBeginInsCode = pdbxDbAlignBegInsCode.charAt(0); 1032 } 1033 } 1034 1035 char dbEndInsCode = ' '; 1036 StrColumn pdbxDbAlignEndInsCodeCol = structRefSeq.getPdbxDbAlignEndInsCode(); 1037 if (pdbxDbAlignEndInsCodeCol.isDefined()) { 1038 String pdbxDbAlignEndInsCode = pdbxDbAlignEndInsCodeCol.get(rowIndex); 1039 if (pdbxDbAlignEndInsCode.length() > 0) { 1040 dbEndInsCode = pdbxDbAlignEndInsCode.charAt(0); 1041 } 1042 } 1043 1044 if (dbBeginInsCode == '?') { 1045 dbBeginInsCode = ' '; 1046 } 1047 if (dbEndInsCode == '?') { 1048 dbEndInsCode = ' '; 1049 } 1050 1051 dbRef.setDbSeqBegin(dbSeqBegin); 1052 dbRef.setIdbnsBegin(dbBeginInsCode); 1053 dbRef.setDbSeqEnd(dbSeqEnd); 1054 dbRef.setIdbnsEnd(dbEndInsCode); 1055 1056 List<DBRef> dbrefs = structure.getDBRefs(); 1057 if (dbrefs == null) { 1058 dbrefs = new ArrayList<>(); 1059 } 1060 dbrefs.add(dbRef); 1061 1062 logger.debug(dbRef.toPDB()); 1063 1064 structure.setDBRefs(dbrefs); 1065 } 1066 } 1067 1068 @Override 1069 public void consumeStructRefSeqDif(StructRefSeqDif structRefSeqDif) { 1070 this.structRefSeqDif = structRefSeqDif; 1071 } 1072 1073 @Override 1074 public void consumeStructSheetRange(StructSheetRange structSheetRange) { 1075 // TODO not considered in ref 1076 } 1077 1078 @Override 1079 public void consumeStructSite(StructSite structSite) { 1080 if (params.isHeaderOnly()) { 1081 return; 1082 } 1083 1084 List<Site> sites = structure.getSites(); 1085 if (sites == null) { 1086 sites = new ArrayList<>(); 1087 } 1088 1089 for (int rowIndex = 0; rowIndex < structSite.getRowCount(); rowIndex++) { 1090 Site site = null; 1091 for (Site asite : sites) { 1092 if (asite.getSiteID().equals(structSite.getId().get(rowIndex))) { 1093 site = asite; // prevent duplicate siteIds 1094 } 1095 } 1096 1097 boolean addSite = false; 1098 if (site == null) { 1099 site = new Site(); 1100 addSite = true; 1101 } 1102 1103 site.setSiteID(structSite.getId().get(rowIndex)); 1104 site.setDescription(structSite.getDetails().get(rowIndex)); 1105 site.setEvCode(structSite.getPdbxEvidenceCode().get(rowIndex)); 1106 1107 if (addSite) { 1108 sites.add(site); 1109 } 1110 } 1111 1112 structure.setSites(sites); 1113 } 1114 1115 @Override 1116 public void consumeStructSiteGen(StructSiteGen structSiteGen) { 1117 this.structSiteGen = structSiteGen; 1118 } 1119 1120 @Override 1121 public void consumeSymmetry(Symmetry symmetry) { 1122 for (int rowIndex = 0; rowIndex < symmetry.getRowCount(); rowIndex++) { 1123 String spaceGroupString = symmetry.getSpaceGroupNameH_M().get(rowIndex); 1124 SpaceGroup spaceGroup = SymoplibParser.getSpaceGroup(spaceGroupString); 1125 if (spaceGroup == null) { 1126 logger.warn("Space group '{}' not recognised as a standard space group", spaceGroupString); 1127 structure.getPDBHeader() 1128 .getCrystallographicInfo() 1129 .setNonStandardSg(true); 1130 } else { 1131 structure.getPDBHeader() 1132 .getCrystallographicInfo() 1133 .setSpaceGroup(spaceGroup); 1134 structure.getPDBHeader() 1135 .getCrystallographicInfo() 1136 .setNonStandardSg(false); 1137 } 1138 } 1139 } 1140 1141 @Override 1142 public void finish() { 1143 if (currentChain != null) { 1144 currentChain.addGroup(currentGroup); 1145 1146 Optional<Chain> testChain = currentModel.stream() 1147 .filter(chain -> chain.getId().equals(currentChain.getId())) 1148 .findFirst(); 1149 1150 if (!testChain.isPresent()) { 1151 currentModel.add(currentChain); 1152 } 1153 } else if (!params.isHeaderOnly()) { 1154 logger.warn("current chain is null at end of document."); 1155 } 1156 1157 allModels.add(currentModel); 1158 1159 initMaps(); 1160 1161 for (int rowIndex = 0; rowIndex < structAsym.getRowCount(); rowIndex++) { 1162 String id = structAsym.getId().get(rowIndex); 1163 String entityId = structAsym.getEntityId().get(rowIndex); 1164 logger.debug("Entity {} matches asym_id: {}", entityId, id); 1165 1166 Chain chain = getEntityChain(entityId); 1167 Chain seqRes = (Chain) chain.clone(); 1168 // to solve issue #160 (e.g. 3u7t) 1169 seqRes = removeSeqResHeterogeneity(seqRes); 1170 seqRes.setId(id); 1171 seqRes.setName(asymId2authorId.getOrDefault(id, id)); 1172 1173 EntityType type = EntityType.entityTypeFromString(getEntityType(entityId)); 1174 if (type == null || type == EntityType.POLYMER) { 1175 seqResChains.add(seqRes); 1176 } 1177 1178 logger.debug(" seqres: {} {}<", id, seqRes); 1179 addEntity(rowIndex, entityId, getEntityDescription(entityId), getEntityType(entityId)); 1180 } 1181 1182 if (!structAsym.isDefined() || structAsym.getRowCount() == 0) { 1183 logger.warn("No _struct_asym category in file, no SEQRES groups will be added."); 1184 } 1185 1186 // entities 1187 // In addEntities above we created the entities if they were present in the file 1188 // Now we need to make sure that they are linked to chains and also that if they are not present in the file we 1189 // need to add them now 1190 linkEntities(); 1191 1192 // now that we know the entities, we can add all chains to structure so that they are stored 1193 // properly as polymer/nonpolymer/water chains inside structure 1194 allModels.forEach(structure::addModel); 1195 1196 // Only align if requested (default) and not when headerOnly mode with no Atoms. 1197 // Otherwise, we store the empty SeqRes Groups unchanged in the right chains. 1198 if (params.isAlignSeqRes() && !params.isHeaderOnly()){ 1199 logger.debug("Parsing mode align_seqres, will parse SEQRES and align to ATOM sequence"); 1200 alignSeqRes(); 1201 } else { 1202 logger.debug("Parsing mode unalign_seqres, will parse SEQRES but not align it to ATOM sequence"); 1203 SeqRes2AtomAligner.storeUnAlignedSeqRes(structure, seqResChains, params.isHeaderOnly()); 1204 } 1205 1206 // Now make sure all altlocgroups have all the atoms in all the groups 1207 StructureTools.cleanUpAltLocs(structure); 1208 1209 // NOTE bonds and charges can only be done at this point that the chain id mapping is properly sorted out 1210 if (!params.isHeaderOnly()) { 1211 if (params.shouldCreateAtomBonds()) { 1212 addBonds(); 1213 } 1214 1215 if (params.shouldCreateAtomCharges()) { 1216 addCharges(); 1217 } 1218 } 1219 1220 if (!params.isHeaderOnly()) { 1221 addSites(); 1222 } 1223 1224 // set the oligomeric state info in the header... 1225 if (params.isParseBioAssembly()) { 1226 // the more detailed mapping of chains to rotation operations happens in StructureIO... 1227 1228 Map<Integer, BioAssemblyInfo> bioAssemblies = new LinkedHashMap<>(); 1229 for (int i = 0; i < structAssembly.getRowCount(); i++) { 1230 String assemblyId = structAssembly.getId().get(i); 1231 List<Integer> structAssemblyGenIndices = new ArrayList<>(); 1232 for (int j = 0; j < structAssemblyGen.getRowCount(); j++) { 1233 if (structAssemblyGen.getAssemblyId().get(j).equals(assemblyId)) { 1234 structAssemblyGenIndices.add(j); 1235 } 1236 } 1237 BiologicalAssemblyBuilder builder = new BiologicalAssemblyBuilder(); 1238 // these are the transformations that need to be applied to our model 1239 List<BiologicalAssemblyTransformation> transformations = builder.getBioUnitTransformationList(structAssembly, 1240 i, structAssemblyGen, structOpers); 1241 1242 int bioAssemblyId = -1; 1243 try { 1244 bioAssemblyId = Integer.parseInt(assemblyId); 1245 } catch (NumberFormatException e) { 1246 logger.info("Could not parse a numerical bio assembly id from '{}'", assemblyId); 1247 } 1248 1249 // if bioassembly id is not numerical we throw it away 1250 // this happens usually for viral capsid entries, like 1ei7 1251 // see issue #230 in github 1252 if (bioAssemblyId != -1) { 1253 int mmSize = 0; 1254 // note that the transforms contain asym ids of both polymers and non-polymers 1255 // For the mmsize, we are only interested in the polymers 1256 for (BiologicalAssemblyTransformation transf : transformations) { 1257 Chain c = structure.getChain(transf.getChainId()); 1258 if (c == null) { 1259 logger.info("Could not find asym id {} specified in struct_assembly_gen", transf.getChainId()); 1260 continue; 1261 } 1262 if (c.getEntityType() == EntityType.POLYMER && 1263 // for entries like 4kro, sugars are annotated as polymers but we 1264 // don't want them in the macromolecularSize count 1265 !c.getEntityInfo().getDescription().contains("SUGAR")) { 1266 mmSize++; 1267 } 1268 } 1269 1270 BioAssemblyInfo bioAssembly = new BioAssemblyInfo(); 1271 bioAssembly.setId(bioAssemblyId); 1272 bioAssembly.setMacromolecularSize(mmSize); 1273 bioAssembly.setTransforms(transformations); 1274 bioAssemblies.put(bioAssemblyId, bioAssembly); 1275 } 1276 1277 } 1278 structure.getPDBHeader() 1279 .setBioAssemblies(bioAssemblies); 1280 } 1281 1282 setStructNcsOps(); 1283 setCrystallographicInfoMetadata(); 1284 1285 Map<String, List<SeqMisMatch>> misMatchMap = new HashMap<>(); 1286 for (int rowIndex = 0; rowIndex < structRefSeqDif.getRowCount(); rowIndex++) { 1287 SeqMisMatch seqMisMatch = new SeqMisMatchImpl(); 1288 seqMisMatch.setDetails(structRefSeqDif.getDetails().get(rowIndex)); 1289 1290 String insCode = structRefSeqDif.getPdbxPdbInsCode().get(rowIndex); 1291 if (insCode != null && insCode.equals("?")) { 1292 insCode = null; 1293 } 1294 seqMisMatch.setInsCode(insCode); 1295 seqMisMatch.setOrigGroup(structRefSeqDif.getDbMonId().get(rowIndex)); 1296 seqMisMatch.setPdbGroup(structRefSeqDif.getMonId().get(rowIndex)); 1297 seqMisMatch.setPdbResNum(structRefSeqDif.getPdbxAuthSeqNum().get(rowIndex)); 1298 seqMisMatch.setUniProtId(structRefSeqDif.getPdbxSeqDbAccessionCode().get(rowIndex)); 1299 seqMisMatch.setSeqNum(structRefSeqDif.getSeqNum().get(rowIndex)); 1300 1301 String strandId = structRefSeqDif.getPdbxPdbStrandId().get(rowIndex); 1302 List<SeqMisMatch> seqMisMatches = misMatchMap.computeIfAbsent(strandId, k -> new ArrayList<>()); 1303 seqMisMatches.add(seqMisMatch); 1304 } 1305 1306 for (String chainId : misMatchMap.keySet()){ 1307 Chain chain = structure.getPolyChainByPDB(chainId); 1308 if (chain == null) { 1309 logger.warn("Could not set mismatches for chain with author id {}", chainId); 1310 continue; 1311 } 1312 1313 chain.setSeqMisMatches(misMatchMap.get(chainId)); 1314 } 1315 } 1316 1317 private String getEntityType(String entityId) { 1318 return IntStream.range(0, entity.getRowCount()) 1319 .filter(i -> entity.getId().get(i).equals(entityId)) 1320 .mapToObj(i -> entity.getType().get(i)) 1321 .findFirst() 1322 .orElseThrow(() -> new NoSuchElementException("could not find entity with id " + entityId)); 1323 } 1324 1325 private String getEntityDescription(String entityId) { 1326 return IntStream.range(0, entity.getRowCount()) 1327 .filter(i -> entity.getId().get(i).equals(entityId)) 1328 .mapToObj(i -> entity.getPdbxDescription().get(i)) 1329 .findFirst() 1330 .orElseThrow(() -> new NoSuchElementException("could not find entity with id " + entityId)); 1331 } 1332 1333 private void addEntity(int asymRowIndex, String entityId, String pdbxDescription, String type) { 1334 int eId = 0; 1335 try { 1336 eId = Integer.parseInt(entityId); 1337 } catch (NumberFormatException e) { 1338 logger.warn("Could not parse mol_id from string {}. Will use 0 for creating Entity", entityId); 1339 } 1340 1341 int entityRowIndex = IntStream.range(0, entity.getRowCount()) 1342 .filter(i -> entity.getId().get(i).equals(entityId)) 1343 .findFirst() 1344 .orElse(-1); 1345 1346 EntityInfo entityInfo = structure.getEntityById(eId); 1347 1348 if (entityInfo == null) { 1349 entityInfo = new EntityInfo(); 1350 entityInfo.setMolId(eId); 1351 // we only add the compound if a polymeric one (to match what the PDB parser does) 1352 if (entityRowIndex != -1) { 1353 entityInfo.setDescription(pdbxDescription); 1354 1355 EntityType eType = EntityType.entityTypeFromString(type); 1356 if (eType != null) { 1357 entityInfo.setType(eType); 1358 } else { 1359 logger.warn("Type '{}' is not recognised as a valid entity type for entity {}", type, eId); 1360 } 1361 addAncilliaryEntityData(asymRowIndex, entityInfo); 1362 structure.addEntityInfo(entityInfo); 1363 logger.debug("Adding Entity with entity id {} from _entity, with name: {}", eId, 1364 entityInfo.getDescription()); 1365 } 1366 } 1367 } 1368 1369 private void addAncilliaryEntityData(int asymRowIndex, EntityInfo entityInfo) { 1370 // Loop through each of the entity types and add the corresponding data 1371 // We're assuming if data is duplicated between sources it is consistent 1372 // This is a potentially huge assumption... 1373 1374 for (int rowIndex = 0; rowIndex < entitySrcGen.getRowCount(); rowIndex++) { 1375 if (!entitySrcGen.getEntityId().get(rowIndex).equals(structAsym.getEntityId().get(asymRowIndex))) { 1376 continue; 1377 } 1378 1379 addInformationFromEntitySrcGen(rowIndex, entityInfo); 1380 } 1381 1382 for (int rowIndex = 0; rowIndex < entitySrcNat.getRowCount(); rowIndex++) { 1383 if (!entitySrcNat.getEntityId().get(rowIndex).equals(structAsym.getEntityId().get(asymRowIndex))) { 1384 continue; 1385 } 1386 1387 addInformationFromEntitySrcNat(rowIndex, entityInfo); 1388 } 1389 1390 for (int rowIndex = 0; rowIndex < entitySrcSyn.getRowCount(); rowIndex++) { 1391 if (!entitySrcSyn.getEntityId().get(rowIndex).equals(structAsym.getEntityId().get(asymRowIndex))) { 1392 continue; 1393 } 1394 1395 addInformationFromEntitySrcSyn(rowIndex, entityInfo); 1396 } 1397 } 1398 1399 private void addInformationFromEntitySrcSyn(int rowIndex, EntityInfo entityInfo) { 1400 entityInfo.setOrganismCommon(entitySrcSyn.getOrganismCommonName().get(rowIndex)); 1401 entityInfo.setOrganismScientific(entitySrcSyn.getOrganismScientific().get(rowIndex)); 1402 entityInfo.setOrganismTaxId(entitySrcSyn.getNcbiTaxonomyId().get(rowIndex)); 1403 } 1404 1405 private void addInformationFromEntitySrcNat(int rowIndex, EntityInfo entityInfo) { 1406 entityInfo.setAtcc(entitySrcNat.getPdbxAtcc().get(rowIndex)); 1407 entityInfo.setCell(entitySrcNat.getPdbxCell().get(rowIndex)); 1408 entityInfo.setOrganismCommon(entitySrcNat.getCommonName().get(rowIndex)); 1409 entityInfo.setOrganismScientific(entitySrcNat.getPdbxOrganismScientific().get(rowIndex)); 1410 entityInfo.setOrganismTaxId(entitySrcNat.getPdbxNcbiTaxonomyId().get(rowIndex)); 1411 } 1412 1413 private void addInformationFromEntitySrcGen(int rowIndex, EntityInfo entityInfo) { 1414 entityInfo.setAtcc(entitySrcGen.getPdbxGeneSrcAtcc().get(rowIndex)); 1415 entityInfo.setCell(entitySrcGen.getPdbxGeneSrcCell().get(rowIndex)); 1416 entityInfo.setOrganismCommon(entitySrcGen.getGeneSrcCommonName().get(rowIndex)); 1417 entityInfo.setOrganismScientific(entitySrcGen.getPdbxGeneSrcScientificName().get(rowIndex)); 1418 entityInfo.setOrganismTaxId(entitySrcGen.getPdbxGeneSrcNcbiTaxonomyId().get(rowIndex)); 1419 entityInfo.setExpressionSystemTaxId(entitySrcGen.getPdbxHostOrgNcbiTaxonomyId().get(rowIndex)); 1420 entityInfo.setExpressionSystem(entitySrcGen.getPdbxHostOrgScientificName().get(rowIndex)); 1421 } 1422 1423 private void setStructNcsOps() { 1424 List<Matrix4d> ncsOperators = new ArrayList<>(); 1425 1426 for (int rowIndex = 0; rowIndex < structNcsOper.getRowCount(); rowIndex++) { 1427 if (!"generate".equals(structNcsOper.getCode().get(rowIndex))) { 1428 continue; 1429 } 1430 1431 try { 1432 Matrix4d operator = new Matrix4d(); 1433 1434 operator.setElement(0, 0, structNcsOper.getMatrix11().get(rowIndex)); 1435 operator.setElement(0, 1, structNcsOper.getMatrix12().get(rowIndex)); 1436 operator.setElement(0, 2, structNcsOper.getMatrix13().get(rowIndex)); 1437 operator.setElement(0, 3, structNcsOper.getVector1().get(rowIndex)); 1438 1439 operator.setElement(1, 0, structNcsOper.getMatrix21().get(rowIndex)); 1440 operator.setElement(1, 1, structNcsOper.getMatrix22().get(rowIndex)); 1441 operator.setElement(1, 2, structNcsOper.getMatrix23().get(rowIndex)); 1442 operator.setElement(1, 3, structNcsOper.getVector2().get(rowIndex)); 1443 1444 operator.setElement(2, 0, structNcsOper.getMatrix31().get(rowIndex)); 1445 operator.setElement(2, 1, structNcsOper.getMatrix32().get(rowIndex)); 1446 operator.setElement(2, 2, structNcsOper.getMatrix33().get(rowIndex)); 1447 operator.setElement(2, 3, structNcsOper.getVector3().get(rowIndex)); 1448 1449 operator.setElement(3, 0, 0); 1450 operator.setElement(3, 1, 0); 1451 operator.setElement(3, 2, 0); 1452 operator.setElement(3, 3, 1); 1453 1454 ncsOperators.add(operator); 1455 } catch (NumberFormatException e) { 1456 logger.warn("Error parsing doubles in NCS operator list, skipping operator {}", rowIndex + 1); 1457 } 1458 } 1459 1460 if (ncsOperators.size() > 0) { 1461 structure.getCrystallographicInfo() 1462 .setNcsOperators(ncsOperators.toArray(new Matrix4d[0])); 1463 } 1464 } 1465 1466 private void setCrystallographicInfoMetadata() { 1467 if (parsedScaleMatrix != null) { 1468 PDBCrystallographicInfo crystalInfo = structure.getCrystallographicInfo(); 1469 boolean nonStd = false; 1470 if (crystalInfo.getCrystalCell() != null && !crystalInfo.getCrystalCell().checkScaleMatrix(parsedScaleMatrix)) { 1471 nonStd = true; 1472 } 1473 1474 crystalInfo.setNonStandardCoordFrameConvention(nonStd); 1475 } 1476 } 1477 1478 private void addSites() { 1479 List<Site> sites = structure.getSites(); 1480 if (sites == null) sites = new ArrayList<>(); 1481 1482 for (int rowIndex = 0; rowIndex < structSiteGen.getRowCount(); rowIndex++) { 1483 // For each StructSiteGen, find the residues involved, if they exist then 1484 String site_id = structSiteGen.getSiteId().get(rowIndex); // multiple could be in same site. 1485 if (site_id == null) { 1486 site_id = ""; 1487 } 1488 String comp_id = structSiteGen.getLabelCompId().get(rowIndex); // PDBName 1489 1490 // Assumption: the author chain ID and residue number for the site is consistent with the original 1491 // author chain id and residue numbers. 1492 1493 String asymId = structSiteGen.getLabelAsymId().get(rowIndex); // chain name 1494 String authId = structSiteGen.getAuthAsymId().get(rowIndex); // chain Id 1495 String auth_seq_id = structSiteGen.getAuthSeqId().get(rowIndex); // Res num 1496 1497 String insCode = structSiteGen.getPdbxAuthInsCode().get(rowIndex); 1498 if (insCode != null && insCode.equals("?")) { 1499 insCode = null; 1500 } 1501 1502 // Look for asymID = chainID and seqID = seq_ID. Check that comp_id matches the resname. 1503 Group g = null; 1504 try { 1505 Chain chain = structure.getChain(asymId); 1506 1507 if (null != chain) { 1508 try { 1509 Character insChar = null; 1510 if (null != insCode && insCode.length() > 0) { 1511 insChar = insCode.charAt(0); 1512 } 1513 g = chain.getGroupByPDB(new ResidueNumber(null, Integer.parseInt(auth_seq_id), insChar)); 1514 } catch (NumberFormatException e) { 1515 logger.warn("Could not lookup residue : {}{}", authId, auth_seq_id); 1516 } 1517 } 1518 } catch (StructureException e) { 1519 logger.warn("Problem finding residue in site entry {} - {}", 1520 structSiteGen.getSiteId().get(rowIndex), e.getMessage()); 1521 } 1522 1523 if (g != null) { 1524 // 2. find the site_id, if not existing, create anew. 1525 Site site = null; 1526 for (Site asite : sites) { 1527 if (site_id.equals(asite.getSiteID())) { 1528 site = asite; 1529 } 1530 } 1531 1532 boolean addSite = false; 1533 1534 // 3. add this residue to the site. 1535 if (site == null) { 1536 addSite = true; 1537 site = new Site(); 1538 site.setSiteID(site_id); 1539 } 1540 1541 List<Group> groups = site.getGroups(); 1542 if (groups == null) { 1543 groups = new ArrayList<>(); 1544 } 1545 1546 // Check the self-consistency of the residue reference from auth_seq_id and chain_id 1547 if (!comp_id.equals(g.getPDBName())) { 1548 logger.warn("comp_id doesn't match the residue at {} {} - skipping", authId, auth_seq_id); 1549 } else { 1550 groups.add(g); 1551 site.setGroups(groups); 1552 } 1553 if (addSite) { 1554 sites.add(site); 1555 } 1556 } 1557 } 1558 structure.setSites(sites); 1559 } 1560 1561 private void addCharges() { 1562 ChargeAdder.addCharges(structure); 1563 } 1564 1565 /** 1566 * The method will return a new reference to a Chain with any consecutive groups 1567 * having same residue numbers removed. 1568 * This is necessary to solve the microheterogeneity issue in entries like 3u7t (see github issue #160) 1569 */ 1570 private static Chain removeSeqResHeterogeneity(Chain c) { 1571 Chain trimmedChain = new ChainImpl(); 1572 ResidueNumber lastResNum = null; 1573 1574 for (Group g : c.getAtomGroups()) { 1575 // note we have to deep copy this, otherwise they stay linked and would get altered in addGroup(g) 1576 ResidueNumber currentResNum = new ResidueNumber( 1577 g.getResidueNumber().getChainName(), 1578 g.getResidueNumber().getSeqNum(), 1579 g.getResidueNumber().getInsCode()); 1580 1581 if (lastResNum == null || !lastResNum.equals(currentResNum)) { 1582 trimmedChain.addGroup(g); 1583 } else { 1584 logger.debug("Removing seqres group because it seems to be repeated in entity_poly_seq, most likely " + 1585 "has hetero='y': {}", g); 1586 } 1587 lastResNum = currentResNum; 1588 1589 } 1590 return trimmedChain; 1591 } 1592 1593 private void addBonds() { 1594 BondMaker maker = new BondMaker(structure, params); 1595 maker.makeBonds(); 1596 maker.formBondsFromStructConn(structConn); 1597 } 1598 1599 private void alignSeqRes() { 1600 logger.debug("Parsing mode align_seqres, will align to ATOM to SEQRES sequence"); 1601 1602 // fix SEQRES residue numbering for all models 1603 1604 for (int model = 0; model < structure.nrModels(); model++) { 1605 List<Chain> atomList = structure.getModel(model); 1606 1607 for (Chain seqResChain : seqResChains){ 1608 1609 // this extracts the matching atom chain from atomList 1610 Chain atomChain = SeqRes2AtomAligner.getMatchingAtomRes(seqResChain, atomList, true); 1611 1612 if (atomChain == null) { 1613 // most likely there's no observed residues at all for the seqres chain: can't map 1614 // e.g. 3zyb: chains with asym_id L,M,N,O,P have no observed residues 1615 logger.info("Could not map SEQRES chain with asym_id={} to any ATOM chain. Most likely there's " + 1616 "no observed residues in the chain.", seqResChain.getId()); 1617 continue; 1618 } 1619 1620 //map the atoms to the seqres... 1621 1622 // we need to first clone the seqres so that they stay independent for different models 1623 List<Group> seqResGroups = new ArrayList<>(); 1624 for (int i = 0; i < seqResChain.getAtomGroups().size(); i++) { 1625 seqResGroups.add((Group)seqResChain.getAtomGroups().get(i).clone()); 1626 } 1627 1628 for (int seqResPos = 0 ; seqResPos < seqResGroups.size(); seqResPos++) { 1629 Group seqresG = seqResGroups.get(seqResPos); 1630 boolean found = false; 1631 for (Group atomG : atomChain.getAtomGroups()) { 1632 1633 int internalNr = getInternalNr(atomG); 1634 1635 if (seqresG.getResidueNumber().getSeqNum() == internalNr) { 1636 seqResGroups.set(seqResPos, atomG); 1637 found = true; 1638 break; 1639 } 1640 } 1641 1642 if (!found) 1643 // so far the residue number has tracked internal numbering. 1644 // however there are no atom records, as such this can't be a PDB residue number... 1645 seqresG.setResidueNumber(null); 1646 } 1647 atomChain.setSeqResGroups(seqResGroups); 1648 } 1649 } 1650 } 1651 1652 private int getInternalNr(Group atomG) { 1653 if (atomG.getType().equals(GroupType.AMINOACID)) { 1654 AminoAcidImpl aa = (AminoAcidImpl) atomG; 1655 return (int) aa.getId(); 1656 } else if (atomG.getType().equals(GroupType.NUCLEOTIDE)) { 1657 NucleotideImpl nu = (NucleotideImpl) atomG; 1658 return (int) nu.getId(); 1659 } else { 1660 HetatomImpl he = (HetatomImpl) atomG; 1661 return (int) he.getId(); 1662 } 1663 } 1664 1665 private void linkEntities() { 1666 for (List<Chain> allModel : allModels) { 1667 for (Chain chain : allModel) { 1668 //logger.info("linking entities for " + chain.getId() + " " + chain.getName()); 1669 String entityId = asymId2entityId.get(chain.getId()); 1670 1671 if (entityId == null) { 1672 // this can happen for instance if the cif file didn't have _struct_asym category at all 1673 // and thus we have no asymId2entityId mapping at all 1674 logger.info("No entity id could be found for chain {}", chain.getId()); 1675 continue; 1676 } 1677 1678 int eId = Integer.parseInt(entityId); 1679 1680 // Entities are not added for non-polymeric entities, if a chain is non-polymeric its entity won't be found. 1681 // TODO: add all entities and unique compounds and add methods to directly get polymer or non-polymer 1682 // asyms (chains). Either create a unique StructureImpl or modify existing for a better representation of the 1683 // mmCIF internal data structures but is compatible with Structure interface. 1684 // Some examples of PDB entries with this kind of problem: 1685 // - 2uub: asym_id X, chainName Z, entity_id 24: fully non-polymeric but still with its own chainName 1686 // - 3o6j: asym_id K, chainName Z, entity_id 6 : a single water molecule 1687 // - 1dz9: asym_id K, chainName K, entity_id 6 : a potassium ion alone 1688 1689 EntityInfo entityInfo = structure.getEntityById(eId); 1690 if (entityInfo == null) { 1691 // Supports the case where the only chain members were from non-polymeric entity that is missing. 1692 // Solved by creating a new Compound(entity) to which this chain will belong. 1693 logger.info("Could not find an Entity for entity_id {}, for chain id {}, creating a new Entity.", 1694 eId, chain.getId()); 1695 entityInfo = new EntityInfo(); 1696 entityInfo.setMolId(eId); 1697 entityInfo.addChain(chain); 1698 if (chain.isWaterOnly()) { 1699 entityInfo.setType(EntityType.WATER); 1700 } else { 1701 entityInfo.setType(EntityType.NONPOLYMER); 1702 } 1703 chain.setEntityInfo(entityInfo); 1704 structure.addEntityInfo(entityInfo); 1705 } else { 1706 logger.debug("Adding chain with chain id {} (auth id {}) to Entity with entity_id {}", 1707 chain.getId(), chain.getName(), eId); 1708 entityInfo.addChain(chain); 1709 chain.setEntityInfo(entityInfo); 1710 } 1711 1712 } 1713 1714 } 1715 1716 // if no entity information was present in file we then go and find the entities heuristically with EntityFinder 1717 List<EntityInfo> entityInfos = structure.getEntityInfos(); 1718 if (entityInfos == null || entityInfos.isEmpty()) { 1719 List<List<Chain>> polyModels = new ArrayList<>(); 1720 List<List<Chain>> nonPolyModels = new ArrayList<>(); 1721 List<List<Chain>> waterModels = new ArrayList<>(); 1722 1723 for (List<Chain> model : allModels) { 1724 List<Chain> polyChains = new ArrayList<>(); 1725 List<Chain> nonPolyChains = new ArrayList<>(); 1726 List<Chain> waterChains = new ArrayList<>(); 1727 1728 polyModels.add(polyChains); 1729 nonPolyModels.add(nonPolyChains); 1730 waterModels.add(waterChains); 1731 1732 for (Chain chain : model) { 1733 // we only have entities for polymeric chains, all others are ignored for assigning entities 1734 if (chain.isWaterOnly()) { 1735 waterChains.add(chain); 1736 } else if (chain.isPureNonPolymer()) { 1737 nonPolyChains.add(chain); 1738 } else { 1739 polyChains.add(chain); 1740 } 1741 } 1742 } 1743 1744 entityInfos = EntityFinder.findPolyEntities(polyModels); 1745 EntityFinder.createPurelyNonPolyEntities(nonPolyModels, waterModels, entityInfos); 1746 1747 structure.setEntityInfos(entityInfos); 1748 } 1749 1750 // final sanity check: it can happen that from the annotated entities some are not linked to any chains 1751 // e.g. 3s26: a sugar entity does not have any chains associated to it (it seems to be happening with many sugar compounds) 1752 // we simply log it, this can sign some other problems if the entities are used down the line 1753 for (EntityInfo e : entityInfos) { 1754 if (e.getChains().isEmpty()) { 1755 logger.info("Entity {} '{}' has no chains associated to it", 1756 e.getMolId() < 0 ? "with no entity id" : e.getMolId(), e.getDescription()); 1757 } 1758 } 1759 } 1760 1761 private void initMaps() { 1762 if (structAsym == null || !structAsym.isDefined() || structAsym.getRowCount() == 0) { 1763 logger.info("No _struct_asym category found in file. No asym id to entity_id mapping will be available"); 1764 return; 1765 } 1766 1767 Map<String, List<String>> entityId2asymId = new HashMap<>(); 1768 for (int rowIndex = 0; rowIndex < structAsym.getRowCount(); rowIndex++) { 1769 String id = structAsym.getId().get(rowIndex); 1770 String entityId = structAsym.getEntityId().get(rowIndex); 1771 1772 logger.debug("Entity {} matches asym_id: {}", entityId, id); 1773 1774 asymId2entityId.put(id, entityId); 1775 1776 if (entityId2asymId.containsKey(entityId)) { 1777 List<String> asymIds = entityId2asymId.get(entityId); 1778 asymIds.add(id); 1779 } else { 1780 List<String> asymIds = new ArrayList<>(); 1781 asymIds.add(id); 1782 entityId2asymId.put(entityId, asymIds); 1783 } 1784 } 1785 1786 if (entityPoly == null || !entityPoly.isDefined() || entityPoly.getRowCount() == 0) { 1787 logger.info("No _entity_poly category found in file. No asym id to author id mapping will be available " + 1788 "for header only parsing"); 1789 return; 1790 } 1791 1792 for (int rowIndex = 0; rowIndex < entityPoly.getRowCount(); rowIndex++) { 1793 if (!entityPoly.getPdbxStrandId().isDefined()) { 1794 logger.info("_entity_poly.pdbx_strand_id is null for entity {}. Won't be able to map asym ids to " + 1795 "author ids for this entity.", entityPoly.getEntityId().get(rowIndex)); 1796 break; 1797 } 1798 1799 String[] chainNames = entityPoly.getPdbxStrandId().get(rowIndex).split(","); 1800 List<String> asymIds = entityId2asymId.get(entityPoly.getEntityId().get(rowIndex)); 1801 if (chainNames.length != asymIds.size()) { 1802 logger.warn("The list of asym ids (from _struct_asym) and the list of author ids (from _entity_poly) " + 1803 "for entity {} have different lengths! Can't provide a mapping from asym ids to author chain " + 1804 "ids", entityPoly.getEntityId().get(rowIndex)); 1805 break; 1806 } 1807 1808 for (int i = 0; i < chainNames.length; i++) { 1809 asymId2authorId.put(asymIds.get(i), chainNames[i]); 1810 } 1811 } 1812 } 1813 1814 @Override 1815 public Structure getContainer() { 1816 return structure; 1817 } 1818}