001package org.biojava.nbio.structure.io.cif; 002 003import java.time.LocalDate; 004import java.time.ZoneId; 005import java.time.format.DateTimeFormatter; 006import java.time.format.DateTimeFormatterBuilder; 007import java.util.ArrayList; 008import java.util.Date; 009import java.util.HashMap; 010import java.util.LinkedHashMap; 011import java.util.List; 012import java.util.Locale; 013import java.util.Map; 014import java.util.NoSuchElementException; 015import java.util.Optional; 016import java.util.OptionalInt; 017import java.util.stream.Collectors; 018import java.util.stream.IntStream; 019 020import javax.vecmath.Matrix4d; 021 022import org.biojava.nbio.structure.AminoAcid; 023import org.biojava.nbio.structure.AminoAcidImpl; 024import org.biojava.nbio.structure.Atom; 025import org.biojava.nbio.structure.AtomImpl; 026import org.biojava.nbio.structure.Chain; 027import org.biojava.nbio.structure.ChainImpl; 028import org.biojava.nbio.structure.DBRef; 029import org.biojava.nbio.structure.Element; 030import org.biojava.nbio.structure.EntityInfo; 031import org.biojava.nbio.structure.EntityType; 032import org.biojava.nbio.structure.Group; 033import org.biojava.nbio.structure.GroupType; 034import org.biojava.nbio.structure.HetatomImpl; 035import org.biojava.nbio.structure.NucleotideImpl; 036import org.biojava.nbio.structure.PDBCrystallographicInfo; 037import org.biojava.nbio.structure.PDBHeader; 038import org.biojava.nbio.structure.PdbId; 039import org.biojava.nbio.structure.ResidueNumber; 040import org.biojava.nbio.structure.SeqMisMatch; 041import org.biojava.nbio.structure.SeqMisMatchImpl; 042import org.biojava.nbio.structure.Site; 043import org.biojava.nbio.structure.Structure; 044import org.biojava.nbio.structure.StructureException; 045import org.biojava.nbio.structure.StructureImpl; 046import org.biojava.nbio.structure.StructureTools; 047import org.biojava.nbio.structure.chem.ChemCompGroupFactory; 048import org.biojava.nbio.structure.io.BondMaker; 049import org.biojava.nbio.structure.io.ChargeAdder; 050import org.biojava.nbio.structure.io.EntityFinder; 051import org.biojava.nbio.structure.io.FileParsingParameters; 052import org.biojava.nbio.structure.io.SeqRes2AtomAligner; 053import org.biojava.nbio.structure.quaternary.BioAssemblyInfo; 054import org.biojava.nbio.structure.quaternary.BiologicalAssemblyBuilder; 055import org.biojava.nbio.structure.quaternary.BiologicalAssemblyTransformation; 056import org.biojava.nbio.structure.xtal.CrystalCell; 057import org.biojava.nbio.structure.xtal.SpaceGroup; 058import org.biojava.nbio.structure.xtal.SymoplibParser; 059import org.rcsb.cif.model.FloatColumn; 060import org.rcsb.cif.model.IntColumn; 061import org.rcsb.cif.model.StrColumn; 062import org.rcsb.cif.model.ValueKind; 063import org.rcsb.cif.schema.mm.AtomSite; 064import org.rcsb.cif.schema.mm.AtomSites; 065import org.rcsb.cif.schema.mm.AuditAuthor; 066import org.rcsb.cif.schema.mm.Cell; 067import org.rcsb.cif.schema.mm.ChemComp; 068import org.rcsb.cif.schema.mm.ChemCompBond; 069import org.rcsb.cif.schema.mm.DatabasePDBRemark; 070import org.rcsb.cif.schema.mm.DatabasePDBRev; 071import org.rcsb.cif.schema.mm.DatabasePDBRevRecord; 072import org.rcsb.cif.schema.mm.Entity; 073import org.rcsb.cif.schema.mm.EntityPoly; 074import org.rcsb.cif.schema.mm.EntityPolySeq; 075import org.rcsb.cif.schema.mm.EntitySrcGen; 076import org.rcsb.cif.schema.mm.EntitySrcNat; 077import org.rcsb.cif.schema.mm.Exptl; 078import org.rcsb.cif.schema.mm.PdbxAuditRevisionHistory; 079import org.rcsb.cif.schema.mm.PdbxChemCompIdentifier; 080import org.rcsb.cif.schema.mm.PdbxDatabaseStatus; 081import org.rcsb.cif.schema.mm.PdbxEntityBranchDescriptor; 082import org.rcsb.cif.schema.mm.PdbxEntitySrcSyn; 083import org.rcsb.cif.schema.mm.PdbxMolecule; 084import org.rcsb.cif.schema.mm.PdbxMoleculeFeatures; 085import org.rcsb.cif.schema.mm.PdbxNonpolyScheme; 086import org.rcsb.cif.schema.mm.PdbxReferenceEntityLink; 087import org.rcsb.cif.schema.mm.PdbxReferenceEntityList; 088import org.rcsb.cif.schema.mm.PdbxReferenceEntityPolyLink; 089import org.rcsb.cif.schema.mm.PdbxStructAssembly; 090import org.rcsb.cif.schema.mm.PdbxStructAssemblyGen; 091import org.rcsb.cif.schema.mm.PdbxStructModResidue; 092import org.rcsb.cif.schema.mm.PdbxStructOperList; 093import org.rcsb.cif.schema.mm.Refine; 094import org.rcsb.cif.schema.mm.Struct; 095import org.rcsb.cif.schema.mm.StructAsym; 096import org.rcsb.cif.schema.mm.StructConf; 097import org.rcsb.cif.schema.mm.StructConn; 098import org.rcsb.cif.schema.mm.StructConnType; 099import org.rcsb.cif.schema.mm.StructKeywords; 100import org.rcsb.cif.schema.mm.StructNcsOper; 101import org.rcsb.cif.schema.mm.StructRef; 102import org.rcsb.cif.schema.mm.StructRefSeq; 103import org.rcsb.cif.schema.mm.StructRefSeqDif; 104import org.rcsb.cif.schema.mm.StructSheetRange; 105import org.rcsb.cif.schema.mm.StructSite; 106import org.rcsb.cif.schema.mm.StructSiteGen; 107import org.rcsb.cif.schema.mm.Symmetry; 108import org.slf4j.Logger; 109import org.slf4j.LoggerFactory; 110 111/** 112 * An implementation of a CifFileConsumer for BioJava. Will process the information provided by a CifFile instance and 113 * use it to build up a {@link Structure} object. 114 * @author Sebastian Bittrich 115 * @since 6.0.0 116 */ 117public class CifStructureConsumerImpl implements CifStructureConsumer { 118 private static final Logger logger = LoggerFactory.getLogger(CifStructureConsumerImpl.class); 119 private static final DateTimeFormatter DATE_FORMAT = new DateTimeFormatterBuilder() 120 .parseCaseInsensitive() 121 .appendPattern("yyyy-MM-dd") 122 .toFormatter(Locale.US); 123 124 private Structure structure; 125 private Chain currentChain; 126 private Group currentGroup; 127 private List<List<Chain>> allModels; 128 private List<Chain> currentModel; 129 private PDBHeader pdbHeader; 130 private String currentNmrModelNumber; 131 private List<Chain> entityChains; 132 133 private Entity entity; 134 private EntityPoly entityPoly; 135 private EntitySrcGen entitySrcGen; 136 private EntitySrcNat entitySrcNat; 137 private PdbxEntitySrcSyn entitySrcSyn; 138 private List<Chain> seqResChains; 139 private PdbxStructAssembly structAssembly; 140 private PdbxStructAssemblyGen structAssemblyGen; 141 private StructAsym structAsym; 142 private StructConn structConn; 143 private StructNcsOper structNcsOper; 144 private PdbxStructOperList structOpers; 145 private StructRef structRef; 146 private StructRefSeqDif structRefSeqDif; 147 private StructSiteGen structSiteGen; 148 149 private Map<String, String> asymId2entityId; 150 private Map<String, String> asymId2authorId; 151 private Matrix4d parsedScaleMatrix; 152 153 private final FileParsingParameters params; 154 155 public CifStructureConsumerImpl(FileParsingParameters params) { 156 this.params = params; 157 } 158 159 @Override 160 public void prepare() { 161 this.structure = new StructureImpl(); 162 this.pdbHeader = new PDBHeader(); 163 structure.setPDBHeader(pdbHeader); 164 165 this.allModels = new ArrayList<>(); 166 this.currentModel = new ArrayList<>(); 167 168 this.seqResChains = new ArrayList<>(); 169 this.asymId2entityId = new HashMap<>(); 170 this.asymId2authorId = new HashMap<>(); 171 172 this.entityChains = new ArrayList<>(); 173 } 174 175 @Override 176 public void consumeAtomSite(AtomSite atomSite) { 177 if (params.isHeaderOnly()) { 178 return; 179 } 180 181 StrColumn labelAsymId = atomSite.getLabelAsymId(); 182 StrColumn authAsymId = atomSite.getAuthAsymId(); 183 184 StrColumn groupPDB = atomSite.getGroupPDB(); 185 IntColumn authSeqId = atomSite.getAuthSeqId(); 186 187 StrColumn labelCompId = atomSite.getLabelCompId(); 188 189 IntColumn id = atomSite.getId(); 190 StrColumn labelAtomId = atomSite.getLabelAtomId(); 191 192 FloatColumn cartnX = atomSite.getCartnX(); 193 FloatColumn cartnY = atomSite.getCartnY(); 194 FloatColumn cartnZ = atomSite.getCartnZ(); 195 196 FloatColumn occupancy = atomSite.getOccupancy(); 197 FloatColumn bIsoOrEquiv = atomSite.getBIsoOrEquiv(); 198 199 StrColumn labelAltId = atomSite.getLabelAltId(); 200 StrColumn typeSymbol = atomSite.getTypeSymbol(); 201 202 StrColumn pdbxPDBInsCode = atomSite.getPdbxPDBInsCode(); 203 IntColumn labelSeqId = atomSite.getLabelSeqId(); 204 IntColumn pdbx_pdb_model_num = atomSite.getPdbxPDBModelNum(); 205 206 for (int atomIndex = 0; atomIndex < atomSite.getRowCount(); atomIndex++) { 207 boolean startOfNewChain = false; 208 Character oneLetterCode = StructureTools.get1LetterCodeAmino(labelCompId.get(atomIndex)); 209 210 boolean isHetAtmInFile = false; 211 if (!"ATOM".equals(groupPDB.get(atomIndex))) { 212 if (oneLetterCode != null && oneLetterCode.equals(StructureTools.UNKNOWN_GROUP_LABEL)) { 213 oneLetterCode = null; 214 } 215 216 isHetAtmInFile = true; 217 } 218 219 String insCodeString = pdbxPDBInsCode.get(atomIndex); 220 Character insCode = null; 221 if (insCodeString != null && !insCodeString.isEmpty() && !"?".equals(insCodeString)) { 222 insCode = insCodeString.charAt(0); 223 } 224 225 // non polymer chains (ligands and small molecules) will have a label_seq_id set to '.' 226 long seqId = labelSeqId.get(atomIndex); 227 228 String nmrModelNumber = pdbx_pdb_model_num.getStringData(atomIndex); 229 230 if (currentNmrModelNumber == null) { 231 currentNmrModelNumber = nmrModelNumber; 232 } 233 if (!currentNmrModelNumber.equals(nmrModelNumber)) { 234 currentNmrModelNumber = nmrModelNumber; 235 236 if (currentChain != null) { 237 currentChain.addGroup(currentGroup); 238 currentGroup.trimToSize(); 239 } 240 241 allModels.add(currentModel); 242 currentModel = new ArrayList<>(); 243 currentChain = null; 244 currentGroup = null; 245 } 246 247 String asymId = labelAsymId.get(atomIndex); 248 String authId = authAsymId.get(atomIndex); 249 if (currentChain == null) { 250 currentChain = new ChainImpl(); 251 currentChain.setName(authId); 252 currentChain.setId(asymId); 253 currentModel.add(currentChain); 254 startOfNewChain = true; 255 } 256 257 if (!asymId.equals(currentChain.getId())) { 258 startOfNewChain = true; 259 260 currentChain.addGroup(currentGroup); 261 262 Optional<Chain> testChain = currentModel.stream() 263 .filter(chain -> chain.getId().equals(asymId)) 264 .findFirst(); 265 266 if (testChain.isPresent()) { 267 currentChain = testChain.get(); 268 } else { 269 currentChain = new ChainImpl(); 270 currentChain.setName(authId); 271 currentChain.setId(asymId); 272 } 273 274 if (!currentModel.contains(currentChain)) { 275 currentModel.add(currentChain); 276 } 277 } 278 279 ResidueNumber residueNumber = new ResidueNumber(authId, authSeqId.get(atomIndex), insCode); 280 281 String recordName = groupPDB.get(atomIndex); 282 String compId = labelCompId.get(atomIndex); 283 if (currentGroup == null) { 284 currentGroup = createGroup(recordName, oneLetterCode, compId, seqId); 285 currentGroup.setResidueNumber(residueNumber); 286 currentGroup.setPDBName(compId); 287 currentGroup.setHetAtomInFile(isHetAtmInFile); 288 } 289 290 Group altGroup = null; 291 String altLocation = labelAltId.get(atomIndex); 292 293 if (startOfNewChain) { 294 currentGroup = createGroup(recordName, oneLetterCode, compId, seqId); 295 currentGroup.setResidueNumber(residueNumber); 296 currentGroup.setPDBName(compId); 297 currentGroup.setHetAtomInFile(isHetAtmInFile); 298 } else { 299 if (!residueNumber.equals(currentGroup.getResidueNumber())) { 300 currentChain.addGroup(currentGroup); 301 currentGroup.trimToSize(); 302 currentGroup = createGroup(recordName, oneLetterCode, compId, seqId); 303 currentGroup.setPDBName(compId); 304 currentGroup.setResidueNumber(residueNumber); 305 currentGroup.setHetAtomInFile(isHetAtmInFile); 306 } else { 307 if (altLocation != null && !altLocation.isEmpty() && !altLocation.equals(".")) { 308 altGroup = getAltLocGroup(recordName, altLocation.charAt(0), oneLetterCode, compId, seqId); 309 if (altGroup.getChain() == null) { 310 altGroup.setChain(currentChain); 311 } 312 } 313 } 314 } 315 316 if (params.isParseCAOnly()) { 317 if (!labelAtomId.get(atomIndex).equals(StructureTools.CA_ATOM_NAME) && "C".equals(typeSymbol.get(atomIndex))) { 318 continue; 319 } 320 } 321 322 Atom atom = new AtomImpl(); 323 324 atom.setPDBserial(id.get(atomIndex)); 325 atom.setName(labelAtomId.get(atomIndex)); 326 327 atom.setX(cartnX.get(atomIndex)); 328 atom.setY(cartnY.get(atomIndex)); 329 atom.setZ(cartnZ.get(atomIndex)); 330 331 atom.setOccupancy((float) occupancy.get(atomIndex)); 332 atom.setTempFactor((float) bIsoOrEquiv.get(atomIndex)); 333 334 if (altLocation == null || altLocation.isEmpty() || altLocation.equals(".")) { 335 atom.setAltLoc(' '); 336 } else { 337 atom.setAltLoc(altLocation.charAt(0)); 338 } 339 340 String ts = typeSymbol.get(atomIndex); 341 try { 342 Element element = Element.valueOfIgnoreCase(ts); 343 atom.setElement(element); 344 } catch (IllegalArgumentException e) { 345 logger.info("Element {} was not recognised as a BioJava-known element, the element will be " + 346 "represented as the generic element {}", ts, Element.R.name()); 347 atom.setElement(Element.R); 348 } 349 350 if (altGroup != null) { 351 altGroup.addAtom(atom); 352 } else { 353 currentGroup.addAtom(atom); 354 } 355 356 String atomName = atom.getName(); 357 if (!currentGroup.hasAtom(atomName)) { 358 if (currentGroup.getPDBName().equals(atom.getGroup().getPDBName())) { 359 if (!StructureTools.hasNonDeuteratedEquiv(atom, currentGroup)) { 360 currentGroup.addAtom(atom); 361 } 362 } 363 } 364 } 365 } 366 367 private Group getAltLocGroup(String recordName, Character altLoc, Character oneLetterCode, String threeLetterCode, 368 long seqId) { 369 List<Atom> atoms = currentGroup.getAtoms(); 370 if (atoms.size() > 0) { 371 if (atoms.get(0).getAltLoc().equals(altLoc)) { 372 return currentGroup; 373 } 374 } 375 376 List<Group> altLocs = currentGroup.getAltLocs(); 377 for (Group altLocGroup : altLocs) { 378 atoms = altLocGroup.getAtoms(); 379 if (atoms.size() > 0) { 380 for (Atom a1 : atoms) { 381 if (a1.getAltLoc().equals(altLoc)) { 382 return altLocGroup; 383 } 384 } 385 } 386 } 387 388 if (threeLetterCode.equals(currentGroup.getPDBName())) { 389 if (currentGroup.getAtoms().isEmpty()) { 390 return currentGroup; 391 } 392 393 Group altLocGroup = (Group) currentGroup.clone(); 394 altLocGroup.setAtoms(new ArrayList<>()); 395 altLocGroup.getAltLocs().clear(); 396 currentGroup.addAltLoc(altLocGroup); 397 return altLocGroup; 398 } 399 400 Group altLocGroup = createGroup(recordName, oneLetterCode, threeLetterCode, seqId); 401 altLocGroup.setPDBName(threeLetterCode); 402 altLocGroup.setResidueNumber(currentGroup.getResidueNumber()); 403 currentGroup.addAltLoc(altLocGroup); 404 return altLocGroup; 405 } 406 407 private Group createGroup(String record, Character oneLetterCode, String threeLetterCode, long seqId) { 408 Group group = ChemCompGroupFactory.getGroupFromChemCompDictionary(threeLetterCode); 409 if (group != null && !group.getChemComp().isEmpty()) { 410 if (group instanceof AminoAcidImpl) { 411 AminoAcidImpl aminoAcid = (AminoAcidImpl) group; 412 aminoAcid.setId(seqId); 413 } else if (group instanceof NucleotideImpl) { 414 NucleotideImpl nucleotide = (NucleotideImpl) group; 415 nucleotide.setId(seqId); 416 } else if (group instanceof HetatomImpl) { 417 HetatomImpl hetatom = (HetatomImpl) group; 418 hetatom.setId(seqId); 419 } 420 return group; 421 } 422 423 if ("ATOM".equals(record)) { 424 if (StructureTools.isNucleotide(threeLetterCode)) { 425 NucleotideImpl nucleotide = new NucleotideImpl(); 426 group = nucleotide; 427 nucleotide.setId(seqId); 428 } else if (oneLetterCode == null || oneLetterCode == StructureTools.UNKNOWN_GROUP_LABEL) { 429 HetatomImpl hetatom = new HetatomImpl(); 430 group = hetatom; 431 hetatom.setId(seqId); 432 } else { 433 AminoAcidImpl aminoAcid = new AminoAcidImpl(); 434 group = aminoAcid; 435 aminoAcid.setAminoType(oneLetterCode); 436 aminoAcid.setId(seqId); 437 } 438 } else { 439 if (StructureTools.isNucleotide(threeLetterCode)) { 440 NucleotideImpl nucleotide = new NucleotideImpl(); 441 group = nucleotide; 442 nucleotide.setId(seqId); 443 } else if (oneLetterCode != null) { 444 AminoAcidImpl aminoAcid = new AminoAcidImpl(); 445 group = aminoAcid; 446 aminoAcid.setAminoType(oneLetterCode); 447 aminoAcid.setId(seqId); 448 } else { 449 HetatomImpl hetatom = new HetatomImpl(); 450 hetatom.setId(seqId); 451 group = hetatom; 452 } 453 } 454 return group; 455 } 456 457 @Override 458 public void consumeAtomSites(AtomSites atomSites) { 459 // no atom sites present 460 if (!atomSites.isDefined() || atomSites.getRowCount() == 0) { 461 return; 462 } 463 464 try { 465 parsedScaleMatrix = new Matrix4d( 466 atomSites.getFractTransfMatrix11().get(0), 467 atomSites.getFractTransfMatrix12().get(0), 468 atomSites.getFractTransfMatrix13().get(0), 469 atomSites.getFractTransfVector1().get(0), 470 471 atomSites.getFractTransfMatrix21().get(0), 472 atomSites.getFractTransfMatrix22().get(0), 473 atomSites.getFractTransfMatrix23().get(0), 474 atomSites.getFractTransfVector2().get(0), 475 476 atomSites.getFractTransfMatrix31().get(0), 477 atomSites.getFractTransfMatrix32().get(0), 478 atomSites.getFractTransfMatrix33().get(0), 479 atomSites.getFractTransfVector3().get(0), 480 481 0, 482 0, 483 0, 484 1 485 ); 486 } catch (NumberFormatException e) { 487 logger.warn("Some values in _atom_sites.fract_transf_matrix or _atom_sites.fract_transf_vector could not " + 488 "be parsed as numbers. Can't check whether coordinate frame convention is correct! Error: {}", 489 e.getMessage()); 490 structure.getPDBHeader().getCrystallographicInfo().setNonStandardCoordFrameConvention(false); 491 } 492 } 493 494 @Override 495 public void consumeAuditAuthor(AuditAuthor auditAuthor) { 496 for (int rowIndex = 0; rowIndex < auditAuthor.getRowCount(); rowIndex++) { 497 String name = auditAuthor.getName().get(rowIndex); 498 499 StringBuilder last = new StringBuilder(); 500 StringBuilder initials = new StringBuilder(); 501 boolean afterComma = false; 502 for (char c : name.toCharArray()) { 503 if (c == ' ') { 504 continue; 505 } 506 if (c == ',') { 507 afterComma = true; 508 continue; 509 } 510 511 if (afterComma) { 512 initials.append(c); 513 } else { 514 last.append(c); 515 } 516 } 517 518 StringBuilder newaa = new StringBuilder(); 519 newaa.append(initials); 520 newaa.append(last); 521 522 String auth = pdbHeader.getAuthors(); 523 if (auth == null) { 524 pdbHeader.setAuthors(newaa.toString()); 525 } else { 526 auth += "," + newaa.toString(); 527 pdbHeader.setAuthors(auth); 528 } 529 } 530 } 531 532 @Override 533 public void consumeCell(Cell cell) { 534 if (!cell.isDefined() || cell.getRowCount() == 0) { 535 return; 536 } 537 538 try { 539 float a = (float) cell.getLengthA().get(0); 540 float b = (float) cell.getLengthB().get(0); 541 float c = (float) cell.getLengthC().get(0); 542 float alpha = (float) cell.getAngleAlpha().get(0); 543 float beta = (float) cell.getAngleBeta().get(0); 544 float gamma = (float) cell.getAngleGamma().get(0); 545 546 CrystalCell crystalCell = new CrystalCell(); 547 crystalCell.setA(a); 548 crystalCell.setB(b); 549 crystalCell.setC(c); 550 crystalCell.setAlpha(alpha); 551 crystalCell.setBeta(beta); 552 crystalCell.setGamma(gamma); 553 554 if (!crystalCell.isCellReasonable()) { 555 // If the entry describes a structure determined by a technique other than X-ray crystallography, 556 // cell is (sometimes!) a = b = c = 1.0, alpha = beta = gamma = 90 degrees 557 // if so we don't add and CrystalCell will be null 558 logger.debug("The crystal cell read from file does not have reasonable dimensions (at least one " + 559 "dimension is below {}), discarding it.", CrystalCell.MIN_VALID_CELL_SIZE); 560 return; 561 } 562 563 structure.getPDBHeader() 564 .getCrystallographicInfo() 565 .setCrystalCell(crystalCell); 566 567 } catch (NumberFormatException e){ 568 structure.getPDBHeader() 569 .getCrystallographicInfo() 570 .setCrystalCell(null); 571 logger.info("could not parse some cell parameters ({}), ignoring _cell", e.getMessage()); 572 } 573 } 574 575 @Override 576 public void consumeChemComp(ChemComp chemComp) { 577 // TODO not impled in ref 578 } 579 580 @Override 581 public void consumeChemCompBond(ChemCompBond chemCompBond) { 582 // TODO not impled in ref 583 } 584 585 @Override 586 public void consumeDatabasePDBRemark(DatabasePDBRemark databasePDBremark) { 587 for (int rowIndex = 0; rowIndex < databasePDBremark.getRowCount(); rowIndex++) { 588 int id = databasePDBremark.getId().get(rowIndex); 589 if (id == 2) { 590 String line = databasePDBremark.getText().get(rowIndex); 591 int i = line.indexOf("ANGSTROM"); 592 593 if (i > 5) { 594 // line contains ANGSTROM info... 595 String resolution = line.substring(i - 5, i).trim(); 596 // convert string to float 597 try { 598 float res = Float.parseFloat(resolution); 599 pdbHeader.setResolution(res); 600 } catch (NumberFormatException e) { 601 logger.info("could not parse resolution from line and ignoring it {}", line); 602 return; 603 } 604 } 605 } 606 } 607 } 608 609 private Date convert(LocalDate localDate) { 610 return Date.from(localDate.atStartOfDay().atZone(ZoneId.systemDefault()).toInstant()); 611 } 612 613 @Override 614 public void consumeDatabasePDBRev(DatabasePDBRev databasePDBrev) { 615 logger.debug("got a database revision:" + databasePDBrev); 616 617 for (int rowIndex = 0; rowIndex < databasePDBrev.getRowCount(); rowIndex++) { 618 if (databasePDBrev.getNum().get(rowIndex) == 1) { 619 String dateOriginal = databasePDBrev.getDateOriginal().get(rowIndex); 620 pdbHeader.setDepDate(convert(LocalDate.parse(dateOriginal, DATE_FORMAT))); 621 622 String date = databasePDBrev.getDate().get(rowIndex); 623 pdbHeader.setRelDate(convert(LocalDate.parse(date, DATE_FORMAT))); 624 } else { 625 String dbrev = databasePDBrev.getDate().get(rowIndex); 626 pdbHeader.setModDate(convert(LocalDate.parse(dbrev, DATE_FORMAT))); 627 } 628 } 629 } 630 631 @Override 632 public void consumeDatabasePDBRevRecord(DatabasePDBRevRecord databasePDBrevRecord) { 633 List<org.biojava.nbio.structure.DatabasePDBRevRecord> revRecords = pdbHeader.getRevisionRecords(); 634 if (revRecords == null) { 635 revRecords = new ArrayList<>(); 636 pdbHeader.setRevisionRecords(revRecords); 637 } 638 639 for (int i = 0; i < databasePDBrevRecord.getRowCount(); i++) { 640 revRecords.add(new org.biojava.nbio.structure.DatabasePDBRevRecord(databasePDBrevRecord, i)); 641 } 642 } 643 644 @Override 645 public void consumeEntity(Entity entity) { 646 this.entity = entity; 647 } 648 649 @Override 650 public void consumeEntityPoly(EntityPoly entityPoly) { 651 this.entityPoly = entityPoly; 652 } 653 654 @Override 655 public void consumeEntitySrcGen(EntitySrcGen entitySrcGen) { 656 this.entitySrcGen = entitySrcGen; 657 } 658 659 @Override 660 public void consumeEntitySrcNat(EntitySrcNat entitySrcNat) { 661 this.entitySrcNat = entitySrcNat; 662 } 663 664 @Override 665 public void consumeEntitySrcSyn(PdbxEntitySrcSyn entitySrcSyn) { 666 this.entitySrcSyn = entitySrcSyn; 667 } 668 669 @Override 670 public void consumeEntityPolySeq(EntityPolySeq entityPolySeq) { 671 for (int rowIndex = 0; rowIndex < entityPolySeq.getRowCount(); rowIndex++) { 672 Chain entityChain = getEntityChain(entityPolySeq.getEntityId().get(rowIndex)); 673 674 // first we check through the chemcomp provider, if it fails we do some heuristics to guess the type of group 675 // TODO some of this code is analogous to getNewGroup() and we should try to unify them - JD 2016-03-08 676 677 Group g = ChemCompGroupFactory.getGroupFromChemCompDictionary(entityPolySeq.getMonId().get(rowIndex)); 678 //int seqId = Integer.parseInt(entityPolySeq.getNum()); 679 if (g != null && !g.getChemComp().isEmpty()) { 680 if (g instanceof AminoAcidImpl) { 681 AminoAcidImpl aa = (AminoAcidImpl) g; 682 aa.setRecordType(AminoAcid.SEQRESRECORD); 683 } 684 } else { 685 if (entityPolySeq.getMonId().get(rowIndex).length() == 3 && 686 StructureTools.get1LetterCodeAmino(entityPolySeq.getMonId().get(rowIndex)) != null) { 687 AminoAcidImpl a = new AminoAcidImpl(); 688 a.setRecordType(AminoAcid.SEQRESRECORD); 689 Character code1 = StructureTools.get1LetterCodeAmino(entityPolySeq.getMonId().get(rowIndex)); 690 a.setAminoType(code1); 691 g = a; 692 693 } else if (StructureTools.isNucleotide(entityPolySeq.getMonId().get(rowIndex))) { 694 // the group is actually a nucleotide group... 695 g = new NucleotideImpl(); 696 } else { 697 logger.debug("Residue {} {} is not a standard aminoacid or nucleotide, will create a het group " + 698 "for it", entityPolySeq.getNum().get(rowIndex), entityPolySeq.getMonId().get(rowIndex)); 699 g = new HetatomImpl(); 700 } 701 } 702 // at this stage we don't know about author residue numbers (insertion codes) 703 // we abuse now the ResidueNumber field setting the internal residue numbers (label_seq_id, strictly 704 // sequential and follow the seqres sequence 1 to n) 705 // later the actual ResidueNumbers (author residue numbers) have to be corrected in alignSeqRes() 706 g.setResidueNumber(ResidueNumber.fromString(entityPolySeq.getNum().getStringData(rowIndex))); 707 g.setPDBName(entityPolySeq.getMonId().get(rowIndex)); 708 entityChain.addGroup(g); 709 } 710 } 711 712 private Chain getEntityChain(String entityId) { 713 for (Chain chain : entityChains) { 714 if (chain.getId().equals(entityId)) { 715 return chain; 716 } 717 } 718 719 // does not exist yet, so create... 720 Chain chain = new ChainImpl(); 721 chain.setId(entityId); 722 entityChains.add(chain); 723 724 return chain; 725 } 726 727 @Override 728 public void consumeExptl(Exptl exptl) { 729 for (int rowIndex = 0; rowIndex < exptl.getRowCount(); rowIndex++) { 730 pdbHeader.setExperimentalTechnique(exptl.getMethod().get(rowIndex)); 731 } 732 } 733 734 @Override 735 public void consumePdbxAuditRevisionHistory(PdbxAuditRevisionHistory pdbxAuditRevisionHistory) { 736 for (int rowIndex = 0; rowIndex < pdbxAuditRevisionHistory.getRowCount(); rowIndex++) { 737 // first entry in revision history is the release date 738 if (pdbxAuditRevisionHistory.getOrdinal().get(rowIndex) == 1) { 739 String release = pdbxAuditRevisionHistory.getRevisionDate().get(rowIndex); 740 pdbHeader.setRelDate(convert(LocalDate.parse(release, DATE_FORMAT))); 741 } else { 742 // all other dates are revision dates; 743 // since this method may be called multiple times, 744 // the last revision date will "stick" 745 String revision = pdbxAuditRevisionHistory.getRevisionDate().get(rowIndex); 746 pdbHeader.setModDate(convert(LocalDate.parse(revision, DATE_FORMAT))); 747 } 748 } 749 } 750 751 @Override 752 public void consumePdbxChemCompIdentifier(PdbxChemCompIdentifier pdbxChemCompIdentifier) { 753 // TODO not impled in ref 754 } 755 756 @Override 757 public void consumePdbxDatabaseStatus(PdbxDatabaseStatus pdbxDatabaseStatus) { 758 for (int rowIndex = 0; rowIndex < pdbxDatabaseStatus.getRowCount(); rowIndex++) { 759 // the deposition date field is only available in mmCIF 5.0 760 StrColumn recvdInitialDepositionDate = pdbxDatabaseStatus.getRecvdInitialDepositionDate(); 761 if (recvdInitialDepositionDate.isDefined()) { 762 String deposition = recvdInitialDepositionDate.get(rowIndex); 763 pdbHeader.setDepDate(convert(LocalDate.parse(deposition, DATE_FORMAT))); 764 } 765 } 766 } 767 768 @Override 769 public void consumePdbxEntityBranchDescriptor(PdbxEntityBranchDescriptor pdbxEntityBranchDescriptor) { 770 // TODO not considered in ref 771 } 772 773 @Override 774 public void consumePdbxMolecule(PdbxMolecule pdbxMolecule) { 775 // TODO not considered in ref 776 } 777 778 @Override 779 public void consumePdbxMoleculeFeatures(PdbxMoleculeFeatures pdbxMoleculeFeatures) { 780 // TODO not considered in ref 781 } 782 783 @Override 784 public void consumePdbxNonpolyScheme(PdbxNonpolyScheme pdbxNonpolyScheme) { 785 // TODO not impled in ref 786 } 787 788 @Override 789 public void consumePdbxReferenceEntityLink(PdbxReferenceEntityLink pdbxReferenceEntityLink) { 790 // TODO not considered in ref 791 } 792 793 @Override 794 public void consumePdbxReferenceEntityList(PdbxReferenceEntityList pdbxReferenceEntityList) { 795 // TODO not considered in ref 796 } 797 798 @Override 799 public void consumePdbxReferenceEntityPolyLink(PdbxReferenceEntityPolyLink pdbxReferenceEntityPolyLink) { 800 // TODO not considered in ref 801 } 802 803 @Override 804 public void consumePdbxStructAssembly(PdbxStructAssembly pdbxStructAssembly) { 805 this.structAssembly = pdbxStructAssembly; 806 } 807 808 @Override 809 public void consumePdbxStructAssemblyGen(PdbxStructAssemblyGen pdbxStructAssemblyGen) { 810 this.structAssemblyGen = pdbxStructAssemblyGen; 811 } 812 813 @Override 814 public void consumePdbxStructModResidue(PdbxStructModResidue pdbxStructModResidue) { 815 // TODO not considered in ref 816 } 817 818 @Override 819 public void consumePdbxStructOperList(PdbxStructOperList pdbxStructOperList) { 820 this.structOpers = pdbxStructOperList; 821 } 822 823 @Override 824 public void consumeRefine(Refine refine) { 825 for (int rowIndex = 0; rowIndex < refine.getRowCount(); rowIndex++) { 826 // RESOLUTION 827 // in very rare cases (for instance hybrid methods x-ray + neutron diffraction, e.g. 3ins, 4n9m) 828 // there are 2 resolution values, one for each method 829 // we take the last one found so that behaviour is like in PDB file parsing 830 double lsDResHigh = refine.getLsDResHigh().get(rowIndex); 831 // TODO this could use a check to keep reasonable values - 1.5 may be overwritten by 0.0 832 if (pdbHeader.getResolution() != PDBHeader.DEFAULT_RESOLUTION) { 833 logger.warn("More than 1 resolution value present, will use last one {} and discard previous {}", 834 lsDResHigh, String.format("%4.2f",pdbHeader.getResolution())); 835 } 836 pdbHeader.setResolution((float) lsDResHigh); 837 838 FloatColumn lsRFactorRFree = refine.getLsRFactorRFree(); 839 // RFREE 840 if (pdbHeader.getRfree() != PDBHeader.DEFAULT_RFREE) { 841 logger.warn("More than 1 Rfree value present, will use last one {} and discard previous {}", 842 lsRFactorRFree, String.format("%4.2f",pdbHeader.getRfree())); 843 } 844 if (lsRFactorRFree.isDefined() && lsRFactorRFree.getValueKind(rowIndex) == ValueKind.PRESENT) { 845 pdbHeader.setRfree((float) lsRFactorRFree.get(rowIndex)); 846 } else { 847 // some entries like 2ifo haven't got this field at all 848 logger.info("_refine.ls_R_factor_R_free not present, not parsing Rfree value"); 849 } 850 851 // RWORK 852 FloatColumn lsRFactorRWork = refine.getLsRFactorRWork(); 853 if(pdbHeader.getRwork() != PDBHeader.DEFAULT_RFREE) { 854 logger.warn("More than 1 R work value present, will use last one {} and discard previous {} ", 855 lsRFactorRWork, String.format("%4.2f",pdbHeader.getRwork())); 856 } 857 if (lsRFactorRWork.isDefined() && lsRFactorRWork.getValueKind(rowIndex) == ValueKind.PRESENT) { 858 pdbHeader.setRwork((float) lsRFactorRWork.get(rowIndex)); 859 } else { 860 logger.info("_refine.ls_R_factor_R_work not present, not parsing R-work value"); 861 } 862 } 863 } 864 865 @Override 866 public void consumeStruct(Struct struct) { 867 if (struct.isDefined() && struct.getTitle().isDefined()) { 868 pdbHeader.setTitle(struct.getTitle().get(0)); 869 } 870 871 if (struct.isDefined() && struct.getEntryId().isDefined()) { 872 PdbId pdbId; 873 String pdbCode = struct.getEntryId().get(0); 874 try { 875 pdbId = new PdbId(pdbCode); 876 } catch (IllegalArgumentException e) { 877 logger.info("Malformed (or null) PDB ID {}. setting PdbId to null", pdbCode); 878 pdbId = null; 879 } 880 pdbHeader.setPdbId(pdbId); 881 structure.setPdbId(pdbId); 882 } 883 } 884 885 @Override 886 public void consumeStructAsym(StructAsym structAsym) { 887 this.structAsym = structAsym; 888 } 889 890 @Override 891 public void consumeStructConf(StructConf structConf) { 892 // TODO not considered in ref 893 } 894 895 @Override 896 public void consumeStructConn(StructConn structConn) { 897 this.structConn = structConn; 898 } 899 900 @Override 901 public void consumeStructConnType(StructConnType structConnType) { 902 // TODO not considered in ref 903 } 904 905 @Override 906 public void consumeStructKeywords(StructKeywords structKeywords) { 907 ArrayList<String> keywordsList = new ArrayList<String>(); 908 909 StrColumn text = structKeywords.getText(); 910 if (text.isDefined()) { 911 String keywords = text.get(0); 912 String[] strings = keywords.split(" *, *"); 913 for (String string : strings) { 914 keywordsList.add(string.trim()); 915 } 916 } 917 structure.getPDBHeader().setKeywords(keywordsList); 918 919 StrColumn pdbxKeywords = structKeywords.getPdbxKeywords(); 920 if (pdbxKeywords.isDefined()) { 921 String keywords = pdbxKeywords.get(0); 922 pdbHeader.setClassification(keywords); 923 //This field should be left empty. TODO The next line should be removed later 924 pdbHeader.setDescription(keywords); 925 } 926 } 927 928 @Override 929 public void consumeStructNcsOper(StructNcsOper structNcsOper) { 930 this.structNcsOper = structNcsOper; 931 } 932 933 @Override 934 public void consumeStructRef(StructRef structRef) { 935 this.structRef = structRef; 936 } 937 938 @Override 939 public void consumeStructRefSeq(StructRefSeq structRefSeq) { 940 for (int rowIndex = 0; rowIndex < structRefSeq.getRowCount(); rowIndex++) { 941 String refId = structRefSeq.getRefId().get(rowIndex); 942 943 DBRef dbRef = new DBRef(); 944 945 dbRef.setIdCode(structRefSeq.getPdbxPDBIdCode().get(rowIndex)); 946 dbRef.setDbAccession(structRefSeq.getPdbxDbAccession().get(rowIndex)); 947 dbRef.setDbIdCode(structRefSeq.getPdbxDbAccession().get(rowIndex)); 948 dbRef.setChainName(structRefSeq.getPdbxStrandId().get(rowIndex)); 949 950 OptionalInt structRefRowIndex = IntStream.range(0, structRef.getRowCount()) 951 .filter(i -> structRef.getId().get(i).equals(refId)) 952 .findFirst(); 953 954 if (structRefRowIndex.isPresent()) { 955 dbRef.setDatabase(structRef.getDbName().get(structRefRowIndex.getAsInt())); 956 dbRef.setDbIdCode(structRef.getDbCode().get(structRefRowIndex.getAsInt())); 957 } else { 958 logger.info("could not find StructRef `{} for StructRefSeq {}", refId, rowIndex); 959 } 960 961 int seqBegin; 962 int seqEnd; 963 964 try { 965 seqBegin = Integer.parseInt(structRefSeq.getPdbxAuthSeqAlignBeg().get(rowIndex)); 966 seqEnd = Integer.parseInt(structRefSeq.getPdbxAuthSeqAlignEnd().get(rowIndex)); 967 } catch (NumberFormatException e) { 968 // this happens in a few entries, annotation error? e.g. 6eoj 969 logger.warn("Couldn't parse pdbx_auth_seq_align_beg/end in _struct_ref_seq. Will not store dbref " + 970 "alignment info for accession {}. Error: {}", dbRef.getDbAccession(), e.getMessage()); 971 return; 972 } 973 974 char beginInsCode = ' '; 975 String pdbxSeqAlignBegInsCode = structRefSeq.getPdbxSeqAlignBegInsCode().get(rowIndex); 976 if (pdbxSeqAlignBegInsCode.length() > 0) { 977 beginInsCode = pdbxSeqAlignBegInsCode.charAt(0); 978 } 979 980 char endInsCode = ' '; 981 String pdbxSeqAlignEndInsCode = structRefSeq.getPdbxSeqAlignEndInsCode().get(rowIndex); 982 if (pdbxSeqAlignEndInsCode.length() > 0) { 983 endInsCode = pdbxSeqAlignEndInsCode.charAt(0); 984 } 985 986 if (beginInsCode == '?') { 987 beginInsCode = ' '; 988 } 989 if (endInsCode == '?') { 990 endInsCode = ' '; 991 } 992 993 dbRef.setSeqBegin(seqBegin); 994 dbRef.setInsertBegin(beginInsCode); 995 dbRef.setSeqEnd(seqEnd); 996 dbRef.setInsertEnd(endInsCode); 997 998 int dbSeqBegin = structRefSeq.getDbAlignBeg().get(rowIndex); 999 int dbSeqEnd = structRefSeq.getDbAlignEnd().get(rowIndex); 1000 1001 char dbBeginInsCode = ' '; 1002 StrColumn pdbxDbAlignBegInsCodeCol = structRefSeq.getPdbxDbAlignBegInsCode(); 1003 if (pdbxDbAlignBegInsCodeCol.isDefined()) { 1004 String pdbxDbAlignBegInsCode = pdbxDbAlignBegInsCodeCol.get(rowIndex); 1005 if (pdbxDbAlignBegInsCode.length() > 0) { 1006 dbBeginInsCode = pdbxDbAlignBegInsCode.charAt(0); 1007 } 1008 } 1009 1010 char dbEndInsCode = ' '; 1011 StrColumn pdbxDbAlignEndInsCodeCol = structRefSeq.getPdbxDbAlignEndInsCode(); 1012 if (pdbxDbAlignEndInsCodeCol.isDefined()) { 1013 String pdbxDbAlignEndInsCode = pdbxDbAlignEndInsCodeCol.get(rowIndex); 1014 if (pdbxDbAlignEndInsCode.length() > 0) { 1015 dbEndInsCode = pdbxDbAlignEndInsCode.charAt(0); 1016 } 1017 } 1018 1019 if (dbBeginInsCode == '?') { 1020 dbBeginInsCode = ' '; 1021 } 1022 if (dbEndInsCode == '?') { 1023 dbEndInsCode = ' '; 1024 } 1025 1026 dbRef.setDbSeqBegin(dbSeqBegin); 1027 dbRef.setIdbnsBegin(dbBeginInsCode); 1028 dbRef.setDbSeqEnd(dbSeqEnd); 1029 dbRef.setIdbnsEnd(dbEndInsCode); 1030 1031 List<DBRef> dbrefs = structure.getDBRefs(); 1032 if (dbrefs == null) { 1033 dbrefs = new ArrayList<>(); 1034 } 1035 dbrefs.add(dbRef); 1036 1037 logger.debug(dbRef.toPDB()); 1038 1039 structure.setDBRefs(dbrefs); 1040 } 1041 } 1042 1043 @Override 1044 public void consumeStructRefSeqDif(StructRefSeqDif structRefSeqDif) { 1045 this.structRefSeqDif = structRefSeqDif; 1046 } 1047 1048 @Override 1049 public void consumeStructSheetRange(StructSheetRange structSheetRange) { 1050 // TODO not considered in ref 1051 } 1052 1053 @Override 1054 public void consumeStructSite(StructSite structSite) { 1055 if (params.isHeaderOnly()) { 1056 return; 1057 } 1058 1059 List<Site> sites = structure.getSites(); 1060 if (sites == null) { 1061 sites = new ArrayList<>(); 1062 } 1063 1064 for (int rowIndex = 0; rowIndex < structSite.getRowCount(); rowIndex++) { 1065 Site site = null; 1066 for (Site asite : sites) { 1067 if (asite.getSiteID().equals(structSite.getId().get(rowIndex))) { 1068 site = asite; // prevent duplicate siteIds 1069 } 1070 } 1071 1072 boolean addSite = false; 1073 if (site == null) { 1074 site = new Site(); 1075 addSite = true; 1076 } 1077 1078 site.setSiteID(structSite.getId().get(rowIndex)); 1079 site.setDescription(structSite.getDetails().get(rowIndex)); 1080 site.setEvCode(structSite.getPdbxEvidenceCode().get(rowIndex)); 1081 1082 if (addSite) { 1083 sites.add(site); 1084 } 1085 } 1086 1087 structure.setSites(sites); 1088 } 1089 1090 @Override 1091 public void consumeStructSiteGen(StructSiteGen structSiteGen) { 1092 this.structSiteGen = structSiteGen; 1093 } 1094 1095 @Override 1096 public void consumeSymmetry(Symmetry symmetry) { 1097 for (int rowIndex = 0; rowIndex < symmetry.getRowCount(); rowIndex++) { 1098 String spaceGroupString = symmetry.getSpaceGroupNameH_M().get(rowIndex); 1099 SpaceGroup spaceGroup = SymoplibParser.getSpaceGroup(spaceGroupString); 1100 if (spaceGroup == null) { 1101 logger.warn("Space group '{}' not recognised as a standard space group", spaceGroupString); 1102 structure.getPDBHeader() 1103 .getCrystallographicInfo() 1104 .setNonStandardSg(true); 1105 } else { 1106 structure.getPDBHeader() 1107 .getCrystallographicInfo() 1108 .setSpaceGroup(spaceGroup); 1109 structure.getPDBHeader() 1110 .getCrystallographicInfo() 1111 .setNonStandardSg(false); 1112 } 1113 } 1114 } 1115 1116 @Override 1117 public void finish() { 1118 if (currentChain != null) { 1119 currentChain.addGroup(currentGroup); 1120 1121 Optional<Chain> testChain = currentModel.stream() 1122 .filter(chain -> chain.getId().equals(currentChain.getId())) 1123 .findFirst(); 1124 1125 if (!testChain.isPresent()) { 1126 currentModel.add(currentChain); 1127 } 1128 } else if (!params.isHeaderOnly()) { 1129 logger.warn("current chain is null at end of document."); 1130 } 1131 1132 allModels.add(currentModel); 1133 1134 initMaps(); 1135 1136 for (int rowIndex = 0; rowIndex < structAsym.getRowCount(); rowIndex++) { 1137 String id = structAsym.getId().get(rowIndex); 1138 String entityId = structAsym.getEntityId().get(rowIndex); 1139 logger.debug("Entity {} matches asym_id: {}", entityId, id); 1140 1141 Chain chain = getEntityChain(entityId); 1142 Chain seqRes = (Chain) chain.clone(); 1143 // to solve issue #160 (e.g. 3u7t) 1144 seqRes = removeSeqResHeterogeneity(seqRes); 1145 seqRes.setId(id); 1146 seqRes.setName(asymId2authorId.getOrDefault(id, id)); 1147 1148 EntityType type = EntityType.entityTypeFromString(getEntityType(entityId)); 1149 if (type == null || type == EntityType.POLYMER) { 1150 seqResChains.add(seqRes); 1151 } 1152 1153 logger.debug(" seqres: {} {}<", id, seqRes); 1154 addEntity(rowIndex, entityId, getEntityDescription(entityId), getEntityType(entityId)); 1155 } 1156 1157 if (!structAsym.isDefined() || structAsym.getRowCount() == 0) { 1158 logger.warn("No _struct_asym category in file, no SEQRES groups will be added."); 1159 } 1160 1161 // entities 1162 // In addEntities above we created the entities if they were present in the file 1163 // Now we need to make sure that they are linked to chains and also that if they are not present in the file we 1164 // need to add them now 1165 linkEntities(); 1166 1167 // now that we know the entities, we can add all chains to structure so that they are stored 1168 // properly as polymer/nonpolymer/water chains inside structure 1169 allModels.forEach(structure::addModel); 1170 1171 // Only align if requested (default) and not when headerOnly mode with no Atoms. 1172 // Otherwise, we store the empty SeqRes Groups unchanged in the right chains. 1173 if (params.isAlignSeqRes() && !params.isHeaderOnly()){ 1174 logger.debug("Parsing mode align_seqres, will parse SEQRES and align to ATOM sequence"); 1175 alignSeqRes(); 1176 } else { 1177 logger.debug("Parsing mode unalign_seqres, will parse SEQRES but not align it to ATOM sequence"); 1178 SeqRes2AtomAligner.storeUnAlignedSeqRes(structure, seqResChains, params.isHeaderOnly()); 1179 } 1180 1181 // Now make sure all altlocgroups have all the atoms in all the groups 1182 StructureTools.cleanUpAltLocs(structure); 1183 1184 // NOTE bonds and charges can only be done at this point that the chain id mapping is properly sorted out 1185 if (!params.isHeaderOnly()) { 1186 if (params.shouldCreateAtomBonds()) { 1187 addBonds(); 1188 } 1189 1190 if (params.shouldCreateAtomCharges()) { 1191 addCharges(); 1192 } 1193 } 1194 1195 if (!params.isHeaderOnly()) { 1196 addSites(); 1197 } 1198 1199 // set the oligomeric state info in the header... 1200 if (params.isParseBioAssembly()) { 1201 // the more detailed mapping of chains to rotation operations happens in StructureIO... 1202 1203 Map<Integer, BioAssemblyInfo> bioAssemblies = new LinkedHashMap<>(); 1204 for (int i = 0; i < structAssembly.getRowCount(); i++) { 1205 String assemblyId = structAssembly.getId().get(i); 1206 List<Integer> structAssemblyGenIndices = new ArrayList<>(); 1207 for (int j = 0; j < structAssemblyGen.getRowCount(); j++) { 1208 if (structAssemblyGen.getAssemblyId().get(j).equals(assemblyId)) { 1209 structAssemblyGenIndices.add(j); 1210 } 1211 } 1212 BiologicalAssemblyBuilder builder = new BiologicalAssemblyBuilder(); 1213 // these are the transformations that need to be applied to our model 1214 List<BiologicalAssemblyTransformation> transformations = builder.getBioUnitTransformationList(structAssembly, 1215 i, structAssemblyGen, structOpers); 1216 1217 int bioAssemblyId = -1; 1218 try { 1219 bioAssemblyId = Integer.parseInt(assemblyId); 1220 } catch (NumberFormatException e) { 1221 logger.info("Could not parse a numerical bio assembly id from '{}'", assemblyId); 1222 } 1223 1224 // if bioassembly id is not numerical we throw it away 1225 // this happens usually for viral capsid entries, like 1ei7 1226 // see issue #230 in github 1227 if (bioAssemblyId != -1) { 1228 int mmSize = 0; 1229 // note that the transforms contain asym ids of both polymers and non-polymers 1230 // For the mmsize, we are only interested in the polymers 1231 for (BiologicalAssemblyTransformation transf : transformations) { 1232 Chain c = structure.getChain(transf.getChainId()); 1233 if (c == null) { 1234 logger.info("Could not find asym id {} specified in struct_assembly_gen", transf.getChainId()); 1235 continue; 1236 } 1237 if (c.getEntityType() == EntityType.POLYMER && 1238 // for entries like 4kro, sugars are annotated as polymers but we 1239 // don't want them in the macromolecularSize count 1240 !c.getEntityInfo().getDescription().contains("SUGAR")) { 1241 mmSize++; 1242 } 1243 } 1244 1245 BioAssemblyInfo bioAssembly = new BioAssemblyInfo(); 1246 bioAssembly.setId(bioAssemblyId); 1247 bioAssembly.setMacromolecularSize(mmSize); 1248 bioAssembly.setTransforms(transformations); 1249 bioAssemblies.put(bioAssemblyId, bioAssembly); 1250 } 1251 1252 } 1253 structure.getPDBHeader() 1254 .setBioAssemblies(bioAssemblies); 1255 } 1256 1257 setStructNcsOps(); 1258 setCrystallographicInfoMetadata(); 1259 1260 Map<String, List<SeqMisMatch>> misMatchMap = new HashMap<>(); 1261 for (int rowIndex = 0; rowIndex < structRefSeqDif.getRowCount(); rowIndex++) { 1262 SeqMisMatch seqMisMatch = new SeqMisMatchImpl(); 1263 seqMisMatch.setDetails(structRefSeqDif.getDetails().get(rowIndex)); 1264 1265 String insCode = structRefSeqDif.getPdbxPdbInsCode().get(rowIndex); 1266 if (insCode != null && insCode.equals("?")) { 1267 insCode = null; 1268 } 1269 seqMisMatch.setInsCode(insCode); 1270 seqMisMatch.setOrigGroup(structRefSeqDif.getDbMonId().get(rowIndex)); 1271 seqMisMatch.setPdbGroup(structRefSeqDif.getMonId().get(rowIndex)); 1272 seqMisMatch.setPdbResNum(structRefSeqDif.getPdbxAuthSeqNum().get(rowIndex)); 1273 seqMisMatch.setUniProtId(structRefSeqDif.getPdbxSeqDbAccessionCode().get(rowIndex)); 1274 seqMisMatch.setSeqNum(structRefSeqDif.getSeqNum().get(rowIndex)); 1275 1276 String strandId = structRefSeqDif.getPdbxPdbStrandId().get(rowIndex); 1277 List<SeqMisMatch> seqMisMatches = misMatchMap.computeIfAbsent(strandId, k -> new ArrayList<>()); 1278 seqMisMatches.add(seqMisMatch); 1279 } 1280 1281 for (String chainId : misMatchMap.keySet()){ 1282 Chain chain = structure.getPolyChainByPDB(chainId); 1283 if (chain == null) { 1284 logger.warn("Could not set mismatches for chain with author id {}", chainId); 1285 continue; 1286 } 1287 1288 chain.setSeqMisMatches(misMatchMap.get(chainId)); 1289 } 1290 } 1291 1292 private String getEntityType(String entityId) { 1293 return IntStream.range(0, entity.getRowCount()) 1294 .filter(i -> entity.getId().get(i).equals(entityId)) 1295 .mapToObj(i -> entity.getType().get(i)) 1296 .findFirst() 1297 .orElseThrow(() -> new NoSuchElementException("could not find entity with id " + entityId)); 1298 } 1299 1300 private String getEntityDescription(String entityId) { 1301 return IntStream.range(0, entity.getRowCount()) 1302 .filter(i -> entity.getId().get(i).equals(entityId)) 1303 .mapToObj(i -> entity.getPdbxDescription().get(i)) 1304 .findFirst() 1305 .orElseThrow(() -> new NoSuchElementException("could not find entity with id " + entityId)); 1306 } 1307 1308 private void addEntity(int asymRowIndex, String entityId, String pdbxDescription, String type) { 1309 int eId = 0; 1310 try { 1311 eId = Integer.parseInt(entityId); 1312 } catch (NumberFormatException e) { 1313 logger.warn("Could not parse mol_id from string {}. Will use 0 for creating Entity", entityId); 1314 } 1315 1316 int entityRowIndex = IntStream.range(0, entity.getRowCount()) 1317 .filter(i -> entity.getId().get(i).equals(entityId)) 1318 .findFirst() 1319 .orElse(-1); 1320 1321 EntityInfo entityInfo = structure.getEntityById(eId); 1322 1323 if (entityInfo == null) { 1324 entityInfo = new EntityInfo(); 1325 entityInfo.setMolId(eId); 1326 // we only add the compound if a polymeric one (to match what the PDB parser does) 1327 if (entityRowIndex != -1) { 1328 entityInfo.setDescription(pdbxDescription); 1329 1330 EntityType eType = EntityType.entityTypeFromString(type); 1331 if (eType != null) { 1332 entityInfo.setType(eType); 1333 } else { 1334 logger.warn("Type '{}' is not recognised as a valid entity type for entity {}", type, eId); 1335 } 1336 addAncilliaryEntityData(asymRowIndex, entityInfo); 1337 structure.addEntityInfo(entityInfo); 1338 logger.debug("Adding Entity with entity id {} from _entity, with name: {}", eId, 1339 entityInfo.getDescription()); 1340 } 1341 } 1342 } 1343 1344 private void addAncilliaryEntityData(int asymRowIndex, EntityInfo entityInfo) { 1345 // Loop through each of the entity types and add the corresponding data 1346 // We're assuming if data is duplicated between sources it is consistent 1347 // This is a potentially huge assumption... 1348 1349 for (int rowIndex = 0; rowIndex < entitySrcGen.getRowCount(); rowIndex++) { 1350 if (!entitySrcGen.getEntityId().get(rowIndex).equals(structAsym.getEntityId().get(asymRowIndex))) { 1351 continue; 1352 } 1353 1354 addInformationFromEntitySrcGen(rowIndex, entityInfo); 1355 } 1356 1357 for (int rowIndex = 0; rowIndex < entitySrcNat.getRowCount(); rowIndex++) { 1358 if (!entitySrcNat.getEntityId().get(rowIndex).equals(structAsym.getEntityId().get(asymRowIndex))) { 1359 continue; 1360 } 1361 1362 addInformationFromEntitySrcNat(rowIndex, entityInfo); 1363 } 1364 1365 for (int rowIndex = 0; rowIndex < entitySrcSyn.getRowCount(); rowIndex++) { 1366 if (!entitySrcSyn.getEntityId().get(rowIndex).equals(structAsym.getEntityId().get(asymRowIndex))) { 1367 continue; 1368 } 1369 1370 addInformationFromEntitySrcSyn(rowIndex, entityInfo); 1371 } 1372 } 1373 1374 private void addInformationFromEntitySrcSyn(int rowIndex, EntityInfo entityInfo) { 1375 entityInfo.setOrganismCommon(entitySrcSyn.getOrganismCommonName().get(rowIndex)); 1376 entityInfo.setOrganismScientific(entitySrcSyn.getOrganismScientific().get(rowIndex)); 1377 entityInfo.setOrganismTaxId(entitySrcSyn.getNcbiTaxonomyId().get(rowIndex)); 1378 } 1379 1380 private void addInformationFromEntitySrcNat(int rowIndex, EntityInfo entityInfo) { 1381 entityInfo.setAtcc(entitySrcNat.getPdbxAtcc().get(rowIndex)); 1382 entityInfo.setCell(entitySrcNat.getPdbxCell().get(rowIndex)); 1383 entityInfo.setOrganismCommon(entitySrcNat.getCommonName().get(rowIndex)); 1384 entityInfo.setOrganismScientific(entitySrcNat.getPdbxOrganismScientific().get(rowIndex)); 1385 entityInfo.setOrganismTaxId(entitySrcNat.getPdbxNcbiTaxonomyId().get(rowIndex)); 1386 } 1387 1388 private void addInformationFromEntitySrcGen(int rowIndex, EntityInfo entityInfo) { 1389 entityInfo.setAtcc(entitySrcGen.getPdbxGeneSrcAtcc().get(rowIndex)); 1390 entityInfo.setCell(entitySrcGen.getPdbxGeneSrcCell().get(rowIndex)); 1391 entityInfo.setOrganismCommon(entitySrcGen.getGeneSrcCommonName().get(rowIndex)); 1392 entityInfo.setOrganismScientific(entitySrcGen.getPdbxGeneSrcScientificName().get(rowIndex)); 1393 entityInfo.setOrganismTaxId(entitySrcGen.getPdbxGeneSrcNcbiTaxonomyId().get(rowIndex)); 1394 entityInfo.setExpressionSystemTaxId(entitySrcGen.getPdbxHostOrgNcbiTaxonomyId().get(rowIndex)); 1395 entityInfo.setExpressionSystem(entitySrcGen.getPdbxHostOrgScientificName().get(rowIndex)); 1396 } 1397 1398 private void setStructNcsOps() { 1399 List<Matrix4d> ncsOperators = new ArrayList<>(); 1400 1401 for (int rowIndex = 0; rowIndex < structNcsOper.getRowCount(); rowIndex++) { 1402 if (!"generate".equals(structNcsOper.getCode().get(rowIndex))) { 1403 continue; 1404 } 1405 1406 try { 1407 Matrix4d operator = new Matrix4d(); 1408 1409 operator.setElement(0, 0, structNcsOper.getMatrix11().get(rowIndex)); 1410 operator.setElement(0, 1, structNcsOper.getMatrix12().get(rowIndex)); 1411 operator.setElement(0, 2, structNcsOper.getMatrix13().get(rowIndex)); 1412 operator.setElement(0, 3, structNcsOper.getVector1().get(rowIndex)); 1413 1414 operator.setElement(1, 0, structNcsOper.getMatrix21().get(rowIndex)); 1415 operator.setElement(1, 1, structNcsOper.getMatrix22().get(rowIndex)); 1416 operator.setElement(1, 2, structNcsOper.getMatrix23().get(rowIndex)); 1417 operator.setElement(1, 3, structNcsOper.getVector2().get(rowIndex)); 1418 1419 operator.setElement(2, 0, structNcsOper.getMatrix31().get(rowIndex)); 1420 operator.setElement(2, 1, structNcsOper.getMatrix32().get(rowIndex)); 1421 operator.setElement(2, 2, structNcsOper.getMatrix33().get(rowIndex)); 1422 operator.setElement(2, 3, structNcsOper.getVector3().get(rowIndex)); 1423 1424 operator.setElement(3, 0, 0); 1425 operator.setElement(3, 1, 0); 1426 operator.setElement(3, 2, 0); 1427 operator.setElement(3, 3, 1); 1428 1429 ncsOperators.add(operator); 1430 } catch (NumberFormatException e) { 1431 logger.warn("Error parsing doubles in NCS operator list, skipping operator {}", rowIndex + 1); 1432 } 1433 } 1434 1435 if (ncsOperators.size() > 0) { 1436 structure.getCrystallographicInfo() 1437 .setNcsOperators(ncsOperators.toArray(new Matrix4d[0])); 1438 } 1439 } 1440 1441 private void setCrystallographicInfoMetadata() { 1442 if (parsedScaleMatrix != null) { 1443 PDBCrystallographicInfo crystalInfo = structure.getCrystallographicInfo(); 1444 boolean nonStd = false; 1445 if (crystalInfo.getCrystalCell() != null && !crystalInfo.getCrystalCell().checkScaleMatrix(parsedScaleMatrix)) { 1446 nonStd = true; 1447 } 1448 1449 crystalInfo.setNonStandardCoordFrameConvention(nonStd); 1450 } 1451 } 1452 1453 private void addSites() { 1454 List<Site> sites = structure.getSites(); 1455 if (sites == null) sites = new ArrayList<>(); 1456 1457 for (int rowIndex = 0; rowIndex < structSiteGen.getRowCount(); rowIndex++) { 1458 // For each StructSiteGen, find the residues involved, if they exist then 1459 String site_id = structSiteGen.getSiteId().get(rowIndex); // multiple could be in same site. 1460 if (site_id == null) { 1461 site_id = ""; 1462 } 1463 String comp_id = structSiteGen.getLabelCompId().get(rowIndex); // PDBName 1464 1465 // Assumption: the author chain ID and residue number for the site is consistent with the original 1466 // author chain id and residue numbers. 1467 1468 String asymId = structSiteGen.getLabelAsymId().get(rowIndex); // chain name 1469 String authId = structSiteGen.getAuthAsymId().get(rowIndex); // chain Id 1470 String auth_seq_id = structSiteGen.getAuthSeqId().get(rowIndex); // Res num 1471 1472 String insCode = structSiteGen.getPdbxAuthInsCode().get(rowIndex); 1473 if (insCode != null && insCode.equals("?")) { 1474 insCode = null; 1475 } 1476 1477 // Look for asymID = chainID and seqID = seq_ID. Check that comp_id matches the resname. 1478 Group g = null; 1479 try { 1480 Chain chain = structure.getChain(asymId); 1481 1482 if (null != chain) { 1483 try { 1484 Character insChar = null; 1485 if (null != insCode && insCode.length() > 0) { 1486 insChar = insCode.charAt(0); 1487 } 1488 g = chain.getGroupByPDB(new ResidueNumber(null, Integer.parseInt(auth_seq_id), insChar)); 1489 } catch (NumberFormatException e) { 1490 logger.warn("Could not lookup residue : {}{}", authId, auth_seq_id); 1491 } 1492 } 1493 } catch (StructureException e) { 1494 logger.warn("Problem finding residue in site entry {} - {}", 1495 structSiteGen.getSiteId().get(rowIndex), e.getMessage()); 1496 } 1497 1498 if (g != null) { 1499 // 2. find the site_id, if not existing, create anew. 1500 Site site = null; 1501 for (Site asite : sites) { 1502 if (site_id.equals(asite.getSiteID())) { 1503 site = asite; 1504 } 1505 } 1506 1507 boolean addSite = false; 1508 1509 // 3. add this residue to the site. 1510 if (site == null) { 1511 addSite = true; 1512 site = new Site(); 1513 site.setSiteID(site_id); 1514 } 1515 1516 List<Group> groups = site.getGroups(); 1517 if (groups == null) { 1518 groups = new ArrayList<>(); 1519 } 1520 1521 // Check the self-consistency of the residue reference from auth_seq_id and chain_id 1522 if (!comp_id.equals(g.getPDBName())) { 1523 logger.warn("comp_id doesn't match the residue at {} {} - skipping", authId, auth_seq_id); 1524 } else { 1525 groups.add(g); 1526 site.setGroups(groups); 1527 } 1528 if (addSite) { 1529 sites.add(site); 1530 } 1531 } 1532 } 1533 structure.setSites(sites); 1534 } 1535 1536 private void addCharges() { 1537 ChargeAdder.addCharges(structure); 1538 } 1539 1540 /** 1541 * The method will return a new reference to a Chain with any consecutive groups 1542 * having same residue numbers removed. 1543 * This is necessary to solve the microheterogeneity issue in entries like 3u7t (see github issue #160) 1544 */ 1545 private static Chain removeSeqResHeterogeneity(Chain c) { 1546 Chain trimmedChain = new ChainImpl(); 1547 ResidueNumber lastResNum = null; 1548 1549 for (Group g : c.getAtomGroups()) { 1550 // note we have to deep copy this, otherwise they stay linked and would get altered in addGroup(g) 1551 ResidueNumber currentResNum = new ResidueNumber( 1552 g.getResidueNumber().getChainName(), 1553 g.getResidueNumber().getSeqNum(), 1554 g.getResidueNumber().getInsCode()); 1555 1556 if (lastResNum == null || !lastResNum.equals(currentResNum)) { 1557 trimmedChain.addGroup(g); 1558 } else { 1559 logger.debug("Removing seqres group because it seems to be repeated in entity_poly_seq, most likely " + 1560 "has hetero='y': {}", g); 1561 } 1562 lastResNum = currentResNum; 1563 1564 } 1565 return trimmedChain; 1566 } 1567 1568 private void addBonds() { 1569 BondMaker maker = new BondMaker(structure, params); 1570 maker.makeBonds(); 1571 maker.formBondsFromStructConn(structConn); 1572 } 1573 1574 private void alignSeqRes() { 1575 logger.debug("Parsing mode align_seqres, will align to ATOM to SEQRES sequence"); 1576 1577 // fix SEQRES residue numbering for all models 1578 1579 for (int model = 0; model < structure.nrModels(); model++) { 1580 List<Chain> atomList = structure.getModel(model); 1581 1582 for (Chain seqResChain : seqResChains){ 1583 1584 // this extracts the matching atom chain from atomList 1585 Chain atomChain = SeqRes2AtomAligner.getMatchingAtomRes(seqResChain, atomList, true); 1586 1587 if (atomChain == null) { 1588 // most likely there's no observed residues at all for the seqres chain: can't map 1589 // e.g. 3zyb: chains with asym_id L,M,N,O,P have no observed residues 1590 logger.info("Could not map SEQRES chain with asym_id={} to any ATOM chain. Most likely there's " + 1591 "no observed residues in the chain.", seqResChain.getId()); 1592 continue; 1593 } 1594 1595 //map the atoms to the seqres... 1596 1597 // we need to first clone the seqres so that they stay independent for different models 1598 List<Group> seqResGroups = new ArrayList<>(); 1599 for (int i = 0; i < seqResChain.getAtomGroups().size(); i++) { 1600 seqResGroups.add((Group)seqResChain.getAtomGroups().get(i).clone()); 1601 } 1602 1603 for (int seqResPos = 0 ; seqResPos < seqResGroups.size(); seqResPos++) { 1604 Group seqresG = seqResGroups.get(seqResPos); 1605 boolean found = false; 1606 for (Group atomG : atomChain.getAtomGroups()) { 1607 1608 int internalNr = getInternalNr(atomG); 1609 1610 if (seqresG.getResidueNumber().getSeqNum() == internalNr) { 1611 seqResGroups.set(seqResPos, atomG); 1612 found = true; 1613 break; 1614 } 1615 } 1616 1617 if (!found) 1618 // so far the residue number has tracked internal numbering. 1619 // however there are no atom records, as such this can't be a PDB residue number... 1620 seqresG.setResidueNumber(null); 1621 } 1622 atomChain.setSeqResGroups(seqResGroups); 1623 } 1624 } 1625 } 1626 1627 private int getInternalNr(Group atomG) { 1628 if (atomG.getType().equals(GroupType.AMINOACID)) { 1629 AminoAcidImpl aa = (AminoAcidImpl) atomG; 1630 return (int) aa.getId(); 1631 } else if (atomG.getType().equals(GroupType.NUCLEOTIDE)) { 1632 NucleotideImpl nu = (NucleotideImpl) atomG; 1633 return (int) nu.getId(); 1634 } else { 1635 HetatomImpl he = (HetatomImpl) atomG; 1636 return (int) he.getId(); 1637 } 1638 } 1639 1640 private void linkEntities() { 1641 for (List<Chain> allModel : allModels) { 1642 for (Chain chain : allModel) { 1643 //logger.info("linking entities for " + chain.getId() + " " + chain.getName()); 1644 String entityId = asymId2entityId.get(chain.getId()); 1645 1646 if (entityId == null) { 1647 // this can happen for instance if the cif file didn't have _struct_asym category at all 1648 // and thus we have no asymId2entityId mapping at all 1649 logger.info("No entity id could be found for chain {}", chain.getId()); 1650 continue; 1651 } 1652 1653 int eId = Integer.parseInt(entityId); 1654 1655 // Entities are not added for non-polymeric entities, if a chain is non-polymeric its entity won't be found. 1656 // TODO: add all entities and unique compounds and add methods to directly get polymer or non-polymer 1657 // asyms (chains). Either create a unique StructureImpl or modify existing for a better representation of the 1658 // mmCIF internal data structures but is compatible with Structure interface. 1659 // Some examples of PDB entries with this kind of problem: 1660 // - 2uub: asym_id X, chainName Z, entity_id 24: fully non-polymeric but still with its own chainName 1661 // - 3o6j: asym_id K, chainName Z, entity_id 6 : a single water molecule 1662 // - 1dz9: asym_id K, chainName K, entity_id 6 : a potassium ion alone 1663 1664 EntityInfo entityInfo = structure.getEntityById(eId); 1665 if (entityInfo == null) { 1666 // Supports the case where the only chain members were from non-polymeric entity that is missing. 1667 // Solved by creating a new Compound(entity) to which this chain will belong. 1668 logger.info("Could not find an Entity for entity_id {}, for chain id {}, creating a new Entity.", 1669 eId, chain.getId()); 1670 entityInfo = new EntityInfo(); 1671 entityInfo.setMolId(eId); 1672 entityInfo.addChain(chain); 1673 if (chain.isWaterOnly()) { 1674 entityInfo.setType(EntityType.WATER); 1675 } else { 1676 entityInfo.setType(EntityType.NONPOLYMER); 1677 } 1678 chain.setEntityInfo(entityInfo); 1679 structure.addEntityInfo(entityInfo); 1680 } else { 1681 logger.debug("Adding chain with chain id {} (auth id {}) to Entity with entity_id {}", 1682 chain.getId(), chain.getName(), eId); 1683 entityInfo.addChain(chain); 1684 chain.setEntityInfo(entityInfo); 1685 } 1686 1687 } 1688 1689 } 1690 1691 // if no entity information was present in file we then go and find the entities heuristically with EntityFinder 1692 List<EntityInfo> entityInfos = structure.getEntityInfos(); 1693 if (entityInfos == null || entityInfos.isEmpty()) { 1694 List<List<Chain>> polyModels = new ArrayList<>(); 1695 List<List<Chain>> nonPolyModels = new ArrayList<>(); 1696 List<List<Chain>> waterModels = new ArrayList<>(); 1697 1698 for (List<Chain> model : allModels) { 1699 List<Chain> polyChains = new ArrayList<>(); 1700 List<Chain> nonPolyChains = new ArrayList<>(); 1701 List<Chain> waterChains = new ArrayList<>(); 1702 1703 polyModels.add(polyChains); 1704 nonPolyModels.add(nonPolyChains); 1705 waterModels.add(waterChains); 1706 1707 for (Chain chain : model) { 1708 // we only have entities for polymeric chains, all others are ignored for assigning entities 1709 if (chain.isWaterOnly()) { 1710 waterChains.add(chain); 1711 } else if (chain.isPureNonPolymer()) { 1712 nonPolyChains.add(chain); 1713 } else { 1714 polyChains.add(chain); 1715 } 1716 } 1717 } 1718 1719 entityInfos = EntityFinder.findPolyEntities(polyModels); 1720 EntityFinder.createPurelyNonPolyEntities(nonPolyModels, waterModels, entityInfos); 1721 1722 structure.setEntityInfos(entityInfos); 1723 } 1724 1725 // final sanity check: it can happen that from the annotated entities some are not linked to any chains 1726 // e.g. 3s26: a sugar entity does not have any chains associated to it (it seems to be happening with many sugar compounds) 1727 // we simply log it, this can sign some other problems if the entities are used down the line 1728 for (EntityInfo e : entityInfos) { 1729 if (e.getChains().isEmpty()) { 1730 logger.info("Entity {} '{}' has no chains associated to it", 1731 e.getMolId() < 0 ? "with no entity id" : e.getMolId(), e.getDescription()); 1732 } 1733 } 1734 } 1735 1736 private void initMaps() { 1737 if (structAsym == null || !structAsym.isDefined() || structAsym.getRowCount() == 0) { 1738 logger.info("No _struct_asym category found in file. No asym id to entity_id mapping will be available"); 1739 return; 1740 } 1741 1742 Map<String, List<String>> entityId2asymId = new HashMap<>(); 1743 for (int rowIndex = 0; rowIndex < structAsym.getRowCount(); rowIndex++) { 1744 String id = structAsym.getId().get(rowIndex); 1745 String entityId = structAsym.getEntityId().get(rowIndex); 1746 1747 logger.debug("Entity {} matches asym_id: {}", entityId, id); 1748 1749 asymId2entityId.put(id, entityId); 1750 1751 if (entityId2asymId.containsKey(entityId)) { 1752 List<String> asymIds = entityId2asymId.get(entityId); 1753 asymIds.add(id); 1754 } else { 1755 List<String> asymIds = new ArrayList<>(); 1756 asymIds.add(id); 1757 entityId2asymId.put(entityId, asymIds); 1758 } 1759 } 1760 1761 if (entityPoly == null || !entityPoly.isDefined() || entityPoly.getRowCount() == 0) { 1762 logger.info("No _entity_poly category found in file. No asym id to author id mapping will be available " + 1763 "for header only parsing"); 1764 return; 1765 } 1766 1767 for (int rowIndex = 0; rowIndex < entityPoly.getRowCount(); rowIndex++) { 1768 if (!entityPoly.getPdbxStrandId().isDefined()) { 1769 logger.info("_entity_poly.pdbx_strand_id is null for entity {}. Won't be able to map asym ids to " + 1770 "author ids for this entity.", entityPoly.getEntityId().get(rowIndex)); 1771 break; 1772 } 1773 1774 String[] chainNames = entityPoly.getPdbxStrandId().get(rowIndex).split(","); 1775 List<String> asymIds = entityId2asymId.get(entityPoly.getEntityId().get(rowIndex)); 1776 if (chainNames.length != asymIds.size()) { 1777 logger.warn("The list of asym ids (from _struct_asym) and the list of author ids (from _entity_poly) " + 1778 "for entity {} have different lengths! Can't provide a mapping from asym ids to author chain " + 1779 "ids", entityPoly.getEntityId().get(rowIndex)); 1780 break; 1781 } 1782 1783 for (int i = 0; i < chainNames.length; i++) { 1784 asymId2authorId.put(asymIds.get(i), chainNames[i]); 1785 } 1786 } 1787 } 1788 1789 @Override 1790 public Structure getContainer() { 1791 return structure; 1792 } 1793}