001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 * created at Apr 26, 2008 021 */ 022package org.biojava.nbio.structure.io.mmcif; 023 024import java.text.ParseException; 025import java.text.SimpleDateFormat; 026import java.util.ArrayList; 027import java.util.Date; 028import java.util.HashMap; 029import java.util.List; 030import java.util.Locale; 031import java.util.Map; 032 033import javax.vecmath.Matrix4d; 034 035import org.biojava.nbio.structure.AminoAcid; 036import org.biojava.nbio.structure.AminoAcidImpl; 037import org.biojava.nbio.structure.Atom; 038import org.biojava.nbio.structure.AtomImpl; 039import org.biojava.nbio.structure.Chain; 040import org.biojava.nbio.structure.ChainImpl; 041import org.biojava.nbio.structure.EntityInfo; 042import org.biojava.nbio.structure.EntityType; 043import org.biojava.nbio.structure.DBRef; 044import org.biojava.nbio.structure.Element; 045import org.biojava.nbio.structure.Group; 046import org.biojava.nbio.structure.GroupType; 047import org.biojava.nbio.structure.HetatomImpl; 048import org.biojava.nbio.structure.NucleotideImpl; 049import org.biojava.nbio.structure.PDBCrystallographicInfo; 050import org.biojava.nbio.structure.PDBHeader; 051import org.biojava.nbio.structure.ResidueNumber; 052import org.biojava.nbio.structure.SeqMisMatch; 053import org.biojava.nbio.structure.SeqMisMatchImpl; 054import org.biojava.nbio.structure.Site; 055import org.biojava.nbio.structure.Structure; 056import org.biojava.nbio.structure.StructureException; 057import org.biojava.nbio.structure.StructureImpl; 058import org.biojava.nbio.structure.StructureTools; 059import org.biojava.nbio.structure.io.BondMaker; 060import org.biojava.nbio.structure.io.ChargeAdder; 061import org.biojava.nbio.structure.io.EntityFinder; 062import org.biojava.nbio.structure.io.FileParsingParameters; 063import org.biojava.nbio.structure.io.SeqRes2AtomAligner; 064import org.biojava.nbio.structure.io.mmcif.model.AtomSite; 065import org.biojava.nbio.structure.io.mmcif.model.AtomSites; 066import org.biojava.nbio.structure.io.mmcif.model.AuditAuthor; 067import org.biojava.nbio.structure.io.mmcif.model.Cell; 068import org.biojava.nbio.structure.io.mmcif.model.ChemComp; 069import org.biojava.nbio.structure.io.mmcif.model.ChemCompAtom; 070import org.biojava.nbio.structure.io.mmcif.model.ChemCompBond; 071import org.biojava.nbio.structure.io.mmcif.model.ChemCompDescriptor; 072import org.biojava.nbio.structure.io.mmcif.model.DatabasePDBremark; 073import org.biojava.nbio.structure.io.mmcif.model.DatabasePDBrev; 074import org.biojava.nbio.structure.io.mmcif.model.DatabasePdbrevRecord; 075import org.biojava.nbio.structure.io.mmcif.model.Entity; 076import org.biojava.nbio.structure.io.mmcif.model.EntityPoly; 077import org.biojava.nbio.structure.io.mmcif.model.EntityPolySeq; 078import org.biojava.nbio.structure.io.mmcif.model.EntitySrcGen; 079import org.biojava.nbio.structure.io.mmcif.model.EntitySrcNat; 080import org.biojava.nbio.structure.io.mmcif.model.EntitySrcSyn; 081import org.biojava.nbio.structure.io.mmcif.model.Exptl; 082import org.biojava.nbio.structure.io.mmcif.model.PdbxAuditRevisionHistory; 083import org.biojava.nbio.structure.io.mmcif.model.PdbxChemCompDescriptor; 084import org.biojava.nbio.structure.io.mmcif.model.PdbxChemCompIdentifier; 085import org.biojava.nbio.structure.io.mmcif.model.PdbxDatabaseStatus; 086import org.biojava.nbio.structure.io.mmcif.model.PdbxEntityNonPoly; 087import org.biojava.nbio.structure.io.mmcif.model.PdbxNonPolyScheme; 088import org.biojava.nbio.structure.io.mmcif.model.PdbxPolySeqScheme; 089import org.biojava.nbio.structure.io.mmcif.model.PdbxStructAssembly; 090import org.biojava.nbio.structure.io.mmcif.model.PdbxStructAssemblyGen; 091import org.biojava.nbio.structure.io.mmcif.model.PdbxStructOperList; 092import org.biojava.nbio.structure.io.mmcif.model.Refine; 093import org.biojava.nbio.structure.io.mmcif.model.Struct; 094import org.biojava.nbio.structure.io.mmcif.model.StructAsym; 095import org.biojava.nbio.structure.io.mmcif.model.StructConn; 096import org.biojava.nbio.structure.io.mmcif.model.StructKeywords; 097import org.biojava.nbio.structure.io.mmcif.model.StructNcsOper; 098import org.biojava.nbio.structure.io.mmcif.model.StructRef; 099import org.biojava.nbio.structure.io.mmcif.model.StructRefSeq; 100import org.biojava.nbio.structure.io.mmcif.model.StructRefSeqDif; 101import org.biojava.nbio.structure.io.mmcif.model.StructSite; 102import org.biojava.nbio.structure.io.mmcif.model.StructSiteGen; 103import org.biojava.nbio.structure.io.mmcif.model.Symmetry; 104import org.biojava.nbio.structure.quaternary.BioAssemblyInfo; 105import org.biojava.nbio.structure.quaternary.BiologicalAssemblyBuilder; 106import org.biojava.nbio.structure.quaternary.BiologicalAssemblyTransformation; 107import org.biojava.nbio.structure.xtal.CrystalCell; 108import org.biojava.nbio.structure.xtal.SpaceGroup; 109import org.biojava.nbio.structure.xtal.SymoplibParser; 110import org.slf4j.Logger; 111import org.slf4j.LoggerFactory; 112 113/** 114 * A MMcifConsumer implementation that builds an in-memory representation of the 115 * content of a mmcif file as a BioJava Structure object. 116 * 117 * @author Andreas Prlic 118 * @since 1.7 119 */ 120 121public class SimpleMMcifConsumer implements MMcifConsumer { 122 123 private static final Logger logger = LoggerFactory.getLogger(SimpleMMcifConsumer.class); 124 125 private Structure structure; 126 private Chain currentChain; 127 private Group currentGroup; 128 129 /** 130 * A temporary data structure to hold all parsed chains 131 */ 132 private ArrayList<List<Chain>> allModels; 133 /** 134 * The current set of chains per model 135 */ 136 private List<Chain> currentModel; 137 private List<Entity> entities; 138 /** 139 * Needed in header only mode to get mapping between asym ids and author ids 140 */ 141 private List<EntityPoly> entityPolys; 142 private List<StructRef> strucRefs; 143 private List<Chain> seqResChains; 144 private List<Chain> entityChains; // needed to link entities, chains and compounds... 145 private List<StructAsym> structAsyms; // needed to link entities, chains and compounds... 146 private List<PdbxStructOperList> structOpers ; // 147 private List<PdbxStructAssembly> strucAssemblies; 148 private List<PdbxStructAssemblyGen> strucAssemblyGens; 149 private List<EntitySrcGen> entitySrcGens; 150 private List<EntitySrcNat> entitySrcNats; 151 private List<EntitySrcSyn> entitySrcSyns; 152 private List<StructConn> structConn; 153 private List<StructNcsOper> structNcsOper; 154 private List<StructRefSeqDif> sequenceDifs; 155 private List<StructSiteGen> structSiteGens; 156 157 private Matrix4d parsedScaleMatrix; 158 159 160 161 /** 162 * A map of asym ids (internal chain ids) to entity ids extracted from 163 * the _struct_asym category 164 */ 165 private Map<String,String> asymId2entityId; 166 167 /** 168 * A map of asym ids (internal chain ids) to author ids extracted from 169 * the _entity_poly category. Used in header only parsing. 170 */ 171 private Map<String,String> asymId2authorId; 172 173 private String currentNmrModelNumber ; 174 175 private FileParsingParameters params; 176 177 public SimpleMMcifConsumer(){ 178 params = new FileParsingParameters(); 179 documentStart(); 180 181 } 182 183 @Override 184 public void newEntity(Entity entity) { 185 logger.debug("New entity: {}",entity.toString()); 186 entities.add(entity); 187 } 188 189 @Override 190 public void newEntityPoly(EntityPoly entityPoly) { 191 entityPolys.add(entityPoly); 192 } 193 194 @Override 195 public void newPdbxStructOperList(PdbxStructOperList structOper){ 196 197 structOpers.add(structOper); 198 } 199 200 @Override 201 public void newStructAsym(StructAsym sasym){ 202 203 structAsyms.add(sasym); 204 } 205 206 private Entity getEntity(int entity_id){ 207 try { 208 for (Entity e: entities){ 209 int eId = Integer.parseInt(e.getId()); 210 if (eId== entity_id){ 211 return e; 212 } 213 } 214 } catch (NumberFormatException e) { 215 logger.warn("Entity id does not look like a number:", e.getMessage()); 216 } 217 return null; 218 } 219 220 @Override 221 public void newStructKeywords(StructKeywords kw){ 222 PDBHeader header = structure.getPDBHeader(); 223 if ( header == null) 224 header = new PDBHeader(); 225 header.setDescription(kw.getPdbx_keywords()); 226 header.setClassification(kw.getPdbx_keywords()); 227 } 228 229 @Override 230 public void setStruct(Struct struct) { 231 232 PDBHeader header = structure.getPDBHeader(); 233 if ( header == null) 234 header = new PDBHeader(); 235 236 header.setTitle(struct.getTitle()); 237 header.setIdCode(struct.getEntry_id()); 238 //header.setDescription(struct.getPdbx_descriptor()); 239 //header.setClassification(struct.getPdbx_descriptor()); 240 //header.setDescription(struct.getPdbx_descriptor()); 241 242 243 244 structure.setPDBHeader(header); 245 structure.setPDBCode(struct.getEntry_id()); 246 } 247 248 /** initiate new group, either Hetatom, Nucleotide, or AminoAcid */ 249 private Group getNewGroup(String recordName,Character aminoCode1, long seq_id,String groupCode3) { 250 251 Group g = ChemCompGroupFactory.getGroupFromChemCompDictionary(groupCode3); 252 if ( g != null && !g.getChemComp().isEmpty()) { 253 if ( g instanceof AminoAcidImpl) { 254 AminoAcidImpl aa = (AminoAcidImpl) g; 255 aa.setId(seq_id); 256 } else if ( g instanceof NucleotideImpl) { 257 NucleotideImpl nuc = (NucleotideImpl) g; 258 nuc.setId(seq_id); 259 } else if ( g instanceof HetatomImpl) { 260 HetatomImpl het = (HetatomImpl)g; 261 het.setId(seq_id); 262 } 263 return g; 264 } 265 266 267 268 Group group; 269 if ( recordName.equals("ATOM") ) { 270 if (StructureTools.isNucleotide(groupCode3)) { 271 // it is a nucleotide 272 NucleotideImpl nu = new NucleotideImpl(); 273 group = nu; 274 nu.setId(seq_id); 275 276 } else if (aminoCode1==null || aminoCode1 == StructureTools.UNKNOWN_GROUP_LABEL){ 277 HetatomImpl h = new HetatomImpl(); 278 h.setId(seq_id); 279 group = h; 280 281 } else { 282 AminoAcidImpl aa = new AminoAcidImpl() ; 283 aa.setAminoType(aminoCode1); 284 aa.setId(seq_id); 285 group = aa ; 286 } 287 } 288 else { 289 if (StructureTools.isNucleotide(groupCode3)) { 290 // it is a nucleotide 291 NucleotideImpl nu = new NucleotideImpl(); 292 group = nu; 293 nu.setId(seq_id); 294 } 295 else if (aminoCode1 != null ) { 296 AminoAcidImpl aa = new AminoAcidImpl() ; 297 aa.setAminoType(aminoCode1); 298 aa.setId(seq_id); 299 group = aa ; 300 } else { 301 HetatomImpl h = new HetatomImpl(); 302 h.setId(seq_id); 303 group = h; 304 } 305 } 306 return group ; 307 } 308 309 /** 310 * Test if the given asymId is already present in the list of chains given. If yes, returns the chain 311 * otherwise returns null. 312 */ 313 private static Chain isKnownChain(String asymId, List<Chain> chains){ 314 315 for (int i = 0; i< chains.size();i++){ 316 Chain testchain = chains.get(i); 317 //System.out.println("comparing chainID >"+chainID+"< against testchain " + i+" >" +testchain.getName()+"<"); 318 if (asymId.equals(testchain.getId())) { 319 //System.out.println("chain "+ chainID+" already known ..."); 320 return testchain; 321 } 322 } 323 324 return null; 325 } 326 327 @Override 328 public void newAtomSite(AtomSite atom) { 329 330 if (params.isHeaderOnly()) return; 331 332 // Warning: getLabel_asym_id is not the "chain id" in the PDB file 333 // it is the internally used chain id. 334 // later on we will fix this... 335 336 // later one needs to map the asym id to the pdb_strand_id 337 338 //TODO: add support for FileParsingParams.getMaxAtoms() 339 340 boolean startOfNewChain = false; 341 342 String asymId = atom.getLabel_asym_id(); 343 String authId = atom.getAuth_asym_id(); 344 345 String recordName = atom.getGroup_PDB(); 346 String residueNumberS = atom.getAuth_seq_id(); 347 Integer residueNrInt = Integer.parseInt(residueNumberS); 348 349 // the 3-letter name of the group: 350 String groupCode3 = atom.getLabel_comp_id(); 351 352 boolean isHetAtomInFile = false; 353 354 Character aminoCode1 = null; 355 if ( recordName.equals("ATOM") ) 356 aminoCode1 = StructureTools.get1LetterCodeAmino(groupCode3); 357 else { 358 aminoCode1 = StructureTools.get1LetterCodeAmino(groupCode3); 359 360 // for nucleotides this will be null.. 361 if (aminoCode1 != null && aminoCode1.equals(StructureTools.UNKNOWN_GROUP_LABEL)) 362 aminoCode1 = null; 363 364 isHetAtomInFile = true; 365 } 366 String insCodeS = atom.getPdbx_PDB_ins_code(); 367 Character insCode = null; 368 if (! insCodeS.equals("?")) { 369 insCode = insCodeS.charAt(0); 370 } 371 // we store the internal seq id in the Atom._id field 372 // this is not a PDB file field but we need this to internally assign the insertion codes later 373 // from the pdbx_poly_seq entries.. 374 375 long seq_id = -1; 376 try { 377 seq_id = Long.parseLong(atom.getLabel_seq_id()); 378 } catch (NumberFormatException e){ 379 // non polymer chains (ligands and small molecules) will have a label_seq_id set to '.', thus it is ok to 380 // silently ignore this 381 //logger.debug("Could not parse number for _atom_site.label_seq_id: "+e.getMessage()); 382 } 383 384 String nmrModelNumber = atom.getPdbx_PDB_model_num(); 385 386 if ( currentNmrModelNumber == null) { 387 currentNmrModelNumber = nmrModelNumber; 388 } 389 390 if (! currentNmrModelNumber.equals(nmrModelNumber)){ 391 currentNmrModelNumber = nmrModelNumber; 392 393 // add previous data 394 if ( currentChain != null ) { 395 currentChain.addGroup(currentGroup); 396 currentGroup.trimToSize(); 397 } 398 399 // we came to the beginning of a new NMR model 400 allModels.add(currentModel); 401 currentModel = new ArrayList<Chain>(); 402 currentChain = null; 403 currentGroup = null; 404 } 405 406 407 if (currentChain == null) { 408 409 currentChain = new ChainImpl(); 410 currentChain.setName(authId); 411 currentChain.setId(asymId); 412 currentModel.add(currentChain); 413 startOfNewChain = true; 414 } 415 416 //System.out.println("BEFORE: " + chain_id + " " + current_chain.getName()); 417 if ( ! asymId.equals(currentChain.getId()) ) { 418 //logger.info("unknown chain. creating new chain. authId:" + authId + " asymId: " + asymId); 419 startOfNewChain = true; 420 421 // end up old chain... 422 currentChain.addGroup(currentGroup); 423 424 // see if old chain is known ... 425 Chain testchain = isKnownChain(asymId,currentModel); 426 427 if ( testchain == null) { 428 //logger.info("unknown chain. creating new chain. authId:" + authId + " asymId: " + asymId); 429 430 currentChain = new ChainImpl(); 431 currentChain.setName(authId); 432 currentChain.setId(asymId); 433 434 } else { 435 currentChain = testchain; 436 } 437 438 if ( ! currentModel.contains(currentChain)) 439 currentModel.add(currentChain); 440 441 } 442 443 444 ResidueNumber residueNumber = new ResidueNumber(authId,residueNrInt, insCode); 445 446 if (currentGroup == null) { 447 448 449 currentGroup = getNewGroup(recordName,aminoCode1,seq_id, groupCode3); 450 451 currentGroup.setResidueNumber(residueNumber); 452 currentGroup.setPDBName(groupCode3); 453 currentGroup.setHetAtomInFile(isHetAtomInFile); 454 } 455 456 // SET UP THE ALT LOC GROUP 457 Group altGroup = null; 458 String altLocS = atom.getLabel_alt_id(); 459 Character altLoc = ' '; 460 if ( altLocS.length()>0) { 461 altLoc = altLocS.charAt(0); 462 if ( altLoc.equals('.') ) 463 altLoc = ' '; 464 465 } 466 // If it's the start of the new chain 467 if ( startOfNewChain){ 468 currentGroup = getNewGroup(recordName,aminoCode1,seq_id, groupCode3); 469 currentGroup.setResidueNumber(residueNumber); 470 currentGroup.setPDBName(groupCode3); 471 currentGroup.setHetAtomInFile(isHetAtomInFile); 472 } 473 // ANTHONY BRADLEY ADDED THIS -> WE ONLY WAN'T TO CHECK FOR ALT LOCS WHEN IT's NOT THE FIRST GROUP IN CHAIN 474 else{ 475 // check if residue number is the same ... 476 // insertion code is part of residue number 477 if ( ! residueNumber.equals(currentGroup.getResidueNumber())) { 478 //System.out.println("end of residue: "+current_group.getPDBCode()+" "+residueNrInt); 479 currentChain.addGroup(currentGroup); 480 currentGroup.trimToSize(); 481 currentGroup = getNewGroup(recordName,aminoCode1,seq_id,groupCode3); 482 currentGroup.setPDBName(groupCode3); 483 currentGroup.setResidueNumber(residueNumber); 484 currentGroup.setHetAtomInFile(isHetAtomInFile); 485 486 487 } else { 488 // same residueNumber, but altLocs... 489 // test altLoc 490 491 if ( ! altLoc.equals(' ') && ( ! altLoc.equals('.'))) { 492 logger.debug("found altLoc! " + altLoc + " " + currentGroup + " " + altGroup); 493 altGroup = getCorrectAltLocGroup( altLoc,recordName,aminoCode1,groupCode3, seq_id); 494 if (altGroup.getChain()==null) { 495 altGroup.setChain(currentChain); 496 } 497 } 498 } 499 } 500 //atomCount++; 501 //System.out.println("fixing atom name for >" + atom.getLabel_atom_id() + "< >" + fullname + "<"); 502 503 504 if ( params.isParseCAOnly() ){ 505 // yes , user wants to get CA only 506 // only parse CA atoms... 507 if (! (atom.getLabel_atom_id().equals(StructureTools.CA_ATOM_NAME) && atom.getType_symbol().equals("C"))) { 508 //System.out.println("ignoring " + line); 509 //atomCount--; 510 return; 511 } 512 } 513 514 //see if chain_id is one of the previous chains ... 515 516 Atom a = convertAtom(atom); 517 518 //see if chain_id is one of the previous chains ... 519 if ( altGroup != null) { 520 altGroup.addAtom(a); 521 altGroup = null; 522 } 523 else { 524 currentGroup.addAtom(a); 525 } 526 527 528 String atomName = a.getName(); 529 // make sure that main group has all atoms 530 // GitHub issue: #76 531 if ( ! currentGroup.hasAtom(atomName)) { 532 // Unless it's microheterogenity https://github.com/rcsb/codec-devel/issues/81 533 if (currentGroup.getPDBName().equals(a.getGroup().getPDBName())) { 534 if(!StructureTools.hasNonDeuteratedEquiv(a,currentGroup)){ 535 currentGroup.addAtom(a); 536 } 537 } 538 539 } 540 } 541 542 /** 543 * Convert a mmCIF AtomSite object to a BioJava Atom object 544 * 545 * @param atom the mmmcif AtomSite record 546 * @return an Atom 547 */ 548 private Atom convertAtom(AtomSite atom){ 549 550 551 Atom a = new AtomImpl(); 552 553 a.setPDBserial(Integer.parseInt(atom.getId())); 554 a.setName(atom.getLabel_atom_id()); 555 556 double x = Double.parseDouble (atom.getCartn_x()); 557 double y = Double.parseDouble (atom.getCartn_y()); 558 double z = Double.parseDouble (atom.getCartn_z()); 559 a.setX(x); 560 a.setY(y); 561 a.setZ(z); 562 563 float occupancy = Float.parseFloat (atom.getOccupancy()); 564 a.setOccupancy(occupancy); 565 566 float temp = Float.parseFloat (atom.getB_iso_or_equiv()); 567 a.setTempFactor(temp); 568 569 String alt = atom.getLabel_alt_id(); 570 if (( alt != null ) && ( alt.length() > 0) && (! alt.equals("."))){ 571 a.setAltLoc(new Character(alt.charAt(0))); 572 } else { 573 a.setAltLoc(new Character(' ')); 574 } 575 576 Element element = Element.R; 577 try { 578 element = Element.valueOfIgnoreCase(atom.getType_symbol()); 579 } catch (IllegalArgumentException e) { 580 logger.info("Element {} was not recognised as a BioJava-known element, the element will be represented as the generic element {}", atom.getType_symbol(), Element.R.name()); 581 } 582 a.setElement(element); 583 584 return a; 585 586 } 587 588 589 private Group getCorrectAltLocGroup( Character altLoc, 590 String recordName, 591 Character aminoCode1, 592 String groupCode3, 593 long seq_id) { 594 595 // see if we know this altLoc already; 596 List<Atom> atoms = currentGroup.getAtoms(); 597 if ( atoms.size() > 0) { 598 Atom a1 = atoms.get(0); 599 // we are just adding atoms to the current group 600 // probably there is a second group following later... 601 if (a1.getAltLoc().equals(altLoc)) { 602 603 return currentGroup; 604 } 605 } 606 607 List<Group> altLocs = currentGroup.getAltLocs(); 608 for ( Group altLocG : altLocs ){ 609 atoms = altLocG.getAtoms(); 610 if ( atoms.size() > 0) { 611 for ( Atom a1 : atoms) { 612 if (a1.getAltLoc().equals( altLoc)) { 613 614 return altLocG; 615 } 616 } 617 } 618 } 619 620 // no matching altLoc group found. 621 // build it up. 622 623 if ( groupCode3.equals(currentGroup.getPDBName())) { 624 if ( currentGroup.getAtoms().size() == 0) { 625 //System.out.println("current group is empty " + current_group + " " + altLoc); 626 return currentGroup; 627 } 628 //System.out.println("cloning current group " + current_group + " " + current_group.getAtoms().get(0).getAltLoc() + " altLoc " + altLoc); 629 Group altLocG = (Group) currentGroup.clone(); 630 // drop atoms from cloned group... 631 // https://redmine.open-bio.org/issues/3307 632 altLocG.setAtoms(new ArrayList<Atom>()); 633 altLocG.getAltLocs().clear(); 634 currentGroup.addAltLoc(altLocG); 635 return altLocG; 636 } 637 638 // System.out.println("new group " + recordName + " " + aminoCode1 + " " +groupCode3); 639 //String recordName,Character aminoCode1, long seq_id,String groupCode3) { 640 Group altLocG = getNewGroup(recordName,aminoCode1,seq_id,groupCode3); 641 642 altLocG.setPDBName(groupCode3); 643 altLocG.setResidueNumber(currentGroup.getResidueNumber()); 644 currentGroup.addAltLoc(altLocG); 645 return altLocG; 646 } 647 648 /** 649 * Start the parsing 650 */ 651 @Override 652 public void documentStart() { 653 structure = new StructureImpl(); 654 655 currentChain = null; 656 currentGroup = null; 657 currentNmrModelNumber = null; 658 //atomCount = 0; 659 660 allModels = new ArrayList<List<Chain>>(); 661 currentModel = new ArrayList<Chain>(); 662 entities = new ArrayList<Entity>(); 663 entityPolys = new ArrayList<>(); 664 strucRefs = new ArrayList<StructRef>(); 665 seqResChains = new ArrayList<Chain>(); 666 entityChains = new ArrayList<Chain>(); 667 structAsyms = new ArrayList<StructAsym>(); 668 669 asymId2entityId = new HashMap<String,String>(); 670 asymId2authorId = new HashMap<>(); 671 structOpers = new ArrayList<PdbxStructOperList>(); 672 strucAssemblies = new ArrayList<PdbxStructAssembly>(); 673 strucAssemblyGens = new ArrayList<PdbxStructAssemblyGen>(); 674 entitySrcGens = new ArrayList<EntitySrcGen>(); 675 entitySrcNats = new ArrayList<EntitySrcNat>(); 676 entitySrcSyns = new ArrayList<EntitySrcSyn>(); 677 structConn = new ArrayList<StructConn>(); 678 structNcsOper = new ArrayList<StructNcsOper>(); 679 sequenceDifs = new ArrayList<StructRefSeqDif>(); 680 structSiteGens = new ArrayList<StructSiteGen>(); 681 } 682 683 684 @Override 685 public void documentEnd() { 686 687 // Expected that there is one current_chain that needs to be added to the model 688 // When in headerOnly mode, no Atoms are read, and there will not be an active 689 // current_chain. 690 if ( currentChain != null ) { 691 692 currentChain.addGroup(currentGroup); 693 if (isKnownChain(currentChain.getId(),currentModel) == null) { 694 currentModel.add(currentChain); 695 } 696 } else if (!params.isHeaderOnly()){ 697 logger.warn("current chain is null at end of document."); 698 } 699 700 allModels.add(currentModel); 701 702 // this populates the asymId2authorId and asymId2entityId maps, needed in header only mode to get the mapping 703 // between the 2 chain identifiers. 704 initMaps(); 705 706 for (StructAsym asym : structAsyms) { 707 708 logger.debug("Entity {} matches asym_id: {}", asym.getEntity_id(), asym.getId() ); 709 710 Chain s = getEntityChain(asym.getEntity_id()); 711 Chain seqres = (Chain)s.clone(); 712 // to solve issue #160 (e.g. 3u7t) 713 seqres = removeSeqResHeterogeneity(seqres); 714 seqres.setId(asym.getId()); 715 if (asymId2authorId.get(asym.getId()) !=null ){ 716 seqres.setName(asymId2authorId.get(asym.getId())); 717 } else { 718 seqres.setName(asym.getId()); 719 } 720 721 EntityType type = null; 722 try { 723 Entity ent = getEntity(Integer.parseInt(asym.getEntity_id())); 724 type = EntityType.entityTypeFromString(ent.getType()); 725 } catch (NumberFormatException e) { 726 logger.debug("Could not parse integer from entity id field {}", asym.getEntity_id()); 727 } 728 729 // we'll only add seqres chains that are polymeric or unknown 730 if (type==null || type==EntityType.POLYMER ) { 731 seqResChains.add(seqres); 732 } 733 734 logger.debug(" seqres: " + asym.getId() + " " + seqres + "<") ; 735 // adding the entities to structure 736 addEntities(asym); 737 738 } 739 740 if (structAsyms.isEmpty()) { 741 logger.warn("No _struct_asym category in file, no SEQRES groups will be added."); 742 } 743 744 // entities 745 // In addEntities above we created the entities if they were present in the file 746 // Now we need to make sure that they are linked to chains and also that if they are not present in the file we need to add them now 747 linkEntities(); 748 749 // now that we know the entities, we can add all chains to structure so that they are stored 750 // properly as polymer/nonpolymer/water chains inside structure 751 for (List<Chain> model:allModels) { 752 structure.addModel(model); 753 } 754 755 // Only align if requested (default) and not when headerOnly mode with no Atoms. 756 // Otherwise, we store the empty SeqRes Groups unchanged in the right chains. 757 if ( params.isAlignSeqRes() && !params.isHeaderOnly() ){ 758 logger.debug("Parsing mode align_seqres, will parse SEQRES and align to ATOM sequence"); 759 alignSeqRes(); 760 } else { 761 logger.debug("Parsing mode unalign_seqres, will parse SEQRES but not align it to ATOM sequence"); 762 SeqRes2AtomAligner.storeUnAlignedSeqRes(structure, seqResChains, params.isHeaderOnly()); 763 } 764 765 766 // Now make sure all altlocgroups have all the atoms in all the groups 767 StructureTools.cleanUpAltLocs(structure); 768 769 770 // NOTE bonds and charges can only be done at this point that the chain id mapping is properly sorted out 771 if (!params.isHeaderOnly()) { 772 if ( params.shouldCreateAtomBonds()) { 773 addBonds(); 774 } 775 776 if ( params.shouldCreateAtomCharges()) { 777 addCharges(); 778 } 779 } 780 781 if (!params.isHeaderOnly()) { 782 783 // Do structure.setSites(sites) after any chain renaming to be like PDB. 784 addSites(); 785 } 786 787 788 789 // set the oligomeric state info in the header... 790 if (params.isParseBioAssembly()) { 791 792 // the more detailed mapping of chains to rotation operations happens in StructureIO... 793 794 Map<Integer,BioAssemblyInfo> bioAssemblies = new HashMap<Integer, BioAssemblyInfo>(); 795 796 for ( PdbxStructAssembly psa : strucAssemblies){ 797 798 List<PdbxStructAssemblyGen> psags = new ArrayList<PdbxStructAssemblyGen>(1); 799 800 for ( PdbxStructAssemblyGen psag: strucAssemblyGens ) { 801 if ( psag.getAssembly_id().equals(psa.getId())) { 802 psags.add(psag); 803 } 804 } 805 806 BiologicalAssemblyBuilder builder = new BiologicalAssemblyBuilder(); 807 808 // these are the transformations that need to be applied to our model 809 List<BiologicalAssemblyTransformation> transformations = builder.getBioUnitTransformationList(psa, psags, structOpers); 810 811 int bioAssemblyId = -1; 812 try { 813 bioAssemblyId = Integer.parseInt(psa.getId()); 814 } catch (NumberFormatException e) { 815 logger.info("Could not parse a numerical bio assembly id from '{}'",psa.getId()); 816 } 817 818 // if bioassembly id is not numerical we throw it away 819 // this happens usually for viral capsid entries, like 1ei7 820 // see issue #230 in github 821 if (bioAssemblyId!=-1) { 822 int mmSize = 0; 823 // note that the transforms contain asym ids of both polymers and non-polymers 824 // For the mmsize, we are only interested in the polymers 825 for (BiologicalAssemblyTransformation transf:transformations) { 826 Chain c = structure.getChain(transf.getChainId()); 827 if (c==null) { 828 logger.info("Could not find asym id {} specified in struct_assembly_gen", transf.getChainId()); 829 continue; 830 } 831 if (c.getEntityType() == EntityType.POLYMER && 832 // for entries like 4kro, sugars are annotated as polymers but we 833 // don't want them in the macromolecularSize count 834 !c.getEntityInfo().getDescription().contains("SUGAR") ) { 835 836 mmSize++; 837 } 838 } 839 840 BioAssemblyInfo bioAssembly = new BioAssemblyInfo(); 841 bioAssembly.setId(bioAssemblyId); 842 bioAssembly.setMacromolecularSize(mmSize); 843 bioAssembly.setTransforms(transformations); 844 bioAssemblies.put(bioAssemblyId,bioAssembly); 845 } 846 847 } 848 structure.getPDBHeader().setBioAssemblies(bioAssemblies); 849 } 850 851 setStructNcsOps(); 852 853 setCrystallographicInfoMetadata(); 854 855 856 Map<String,List<SeqMisMatch>> misMatchMap = new HashMap<String, List<SeqMisMatch>>(); 857 for (StructRefSeqDif sdif : sequenceDifs) { 858 SeqMisMatch misMatch = new SeqMisMatchImpl(); 859 misMatch.setDetails(sdif.getDetails()); 860 861 String insCode = sdif.getPdbx_pdb_ins_code(); 862 if ( insCode != null && insCode.equals("?")) 863 insCode = null; 864 misMatch.setInsCode(insCode); 865 misMatch.setOrigGroup(sdif.getDb_mon_id()); 866 misMatch.setPdbGroup(sdif.getMon_id()); 867 misMatch.setPdbResNum(sdif.getPdbx_auth_seq_num()); 868 misMatch.setUniProtId(sdif.getPdbx_seq_db_accession_code()); 869 misMatch.setSeqNum(sdif.getSeq_num()); 870 871 872 List<SeqMisMatch> mms = misMatchMap.get(sdif.getPdbx_pdb_strand_id()); 873 if ( mms == null) { 874 mms = new ArrayList<SeqMisMatch>(); 875 misMatchMap.put(sdif.getPdbx_pdb_strand_id(),mms); 876 } 877 mms.add(misMatch); 878 879 } 880 881 for (String chainId : misMatchMap.keySet()){ 882 883 Chain chain = structure.getPolyChainByPDB(chainId); 884 885 if ( chain == null) { 886 logger.warn("Could not set mismatches for chain with author id" + chainId); 887 continue; 888 } 889 890 chain.setSeqMisMatches(misMatchMap.get(chainId)); 891 892 893 } 894 895 } 896 897 /** 898 * Here we link entities to chains. 899 * Also if entities are not present in file, this initialises the entities with some heuristics, see {@link org.biojava.nbio.structure.io.EntityFinder} 900 */ 901 private void linkEntities() { 902 903 for (int i =0; i< allModels.size() ; i++){ 904 for (Chain chain : allModels.get(i)) { 905 //logger.info("linking entities for " + chain.getId() + " " + chain.getName()); 906 String entityId = asymId2entityId.get(chain.getId()); 907 908 if (entityId==null) { 909 // this can happen for instance if the cif file didn't have _struct_asym category at all 910 // and thus we have no asymId2entityId mapping at all 911 logger.info("No entity id could be found for chain {}", chain.getId()); 912 continue; 913 } 914 int eId = Integer.parseInt(entityId); 915 916 // Entities are not added for non-polymeric entities, if a chain is non-polymeric its entity won't be found. 917 // TODO: add all entities and unique compounds and add methods to directly get polymer or non-polymer 918 // asyms (chains). Either create a unique StructureImpl or modify existing for a better representation of the 919 // mmCIF internal data structures but is compatible with Structure interface. 920 // Some examples of PDB entries with this kind of problem: 921 // - 2uub: asym_id X, chainName Z, entity_id 24: fully non-polymeric but still with its own chainName 922 // - 3o6j: asym_id K, chainName Z, entity_id 6 : a single water molecule 923 // - 1dz9: asym_id K, chainName K, entity_id 6 : a potassium ion alone 924 925 EntityInfo entityInfo = structure.getEntityById(eId); 926 if (entityInfo==null) { 927 // Supports the case where the only chain members were from non-polymeric entity that is missing. 928 // Solved by creating a new Compound(entity) to which this chain will belong. 929 logger.info("Could not find an Entity for entity_id {}, for chain id {}, creating a new Entity.", 930 eId, chain.getId()); 931 entityInfo = new EntityInfo(); 932 entityInfo.setMolId(eId); 933 entityInfo.addChain(chain); 934 if (chain.isWaterOnly()) { 935 entityInfo.setType(EntityType.WATER); 936 } else { 937 entityInfo.setType(EntityType.NONPOLYMER); 938 } 939 chain.setEntityInfo(entityInfo); 940 structure.addEntityInfo(entityInfo); 941 } else { 942 logger.debug("Adding chain with chain id {} (auth id {}) to Entity with entity_id {}", 943 chain.getId(), chain.getName(), eId); 944 entityInfo.addChain(chain); 945 chain.setEntityInfo(entityInfo); 946 } 947 948 } 949 950 } 951 952 // if no entity information was present in file we then go and find the entities heuristically with EntityFinder 953 List<EntityInfo> entityInfos = structure.getEntityInfos(); 954 if (entityInfos==null || entityInfos.isEmpty()) { 955 956 List<List<Chain>> polyModels = new ArrayList<>(); 957 List<List<Chain>> nonPolyModels = new ArrayList<>(); 958 List<List<Chain>> waterModels = new ArrayList<>(); 959 960 for (List<Chain> model:allModels) { 961 962 List<Chain> polyChains = new ArrayList<>(); 963 List<Chain> nonPolyChains = new ArrayList<>(); 964 List<Chain> waterChains = new ArrayList<>(); 965 966 polyModels.add(polyChains); 967 nonPolyModels.add(nonPolyChains); 968 waterModels.add(waterChains); 969 970 for (Chain c:model) { 971 972 // we only have entities for polymeric chains, all others are ignored for assigning entities 973 if (c.isWaterOnly()) { 974 waterChains.add(c); 975 976 } else if (c.isPureNonPolymer()) { 977 nonPolyChains.add(c); 978 979 } else { 980 polyChains.add(c); 981 } 982 } 983 } 984 985 entityInfos = EntityFinder.findPolyEntities(polyModels); 986 EntityFinder.createPurelyNonPolyEntities(nonPolyModels, waterModels, entityInfos); 987 988 989 structure.setEntityInfos(entityInfos); 990 } 991 992 // final sanity check: it can happen that from the annotated entities some are not linked to any chains 993 // e.g. 3s26: a sugar entity does not have any chains associated to it (it seems to be happening with many sugar compounds) 994 // we simply log it, this can sign some other problems if the entities are used down the line 995 for (EntityInfo e:entityInfos) { 996 if (e.getChains().isEmpty()) { 997 logger.info("Entity {} '{}' has no chains associated to it", 998 e.getMolId()<0?"with no entity id":e.getMolId(), e.getDescription()); 999 } 1000 } 1001 1002 } 1003 1004 private void addCharges() { 1005 ChargeAdder.addCharges(structure); 1006 } 1007 1008 /** 1009 * The method will return a new reference to a Chain with any consecutive groups 1010 * having same residue numbers removed. 1011 * This is necessary to solve the microheterogeneity issue in entries like 3u7t (see github issue #160) 1012 * @param c 1013 * @return 1014 */ 1015 private static Chain removeSeqResHeterogeneity(Chain c) { 1016 1017 Chain trimmedChain = new ChainImpl(); 1018 1019 ResidueNumber lastResNum = null; 1020 1021 for (Group g:c.getAtomGroups()) { 1022 1023 // note we have to deep copy this, otherwise they stay linked and would get altered in addGroup(g) 1024 ResidueNumber currentResNum = new ResidueNumber( 1025 g.getResidueNumber().getChainName(), 1026 g.getResidueNumber().getSeqNum(), 1027 g.getResidueNumber().getInsCode()); 1028 1029 if (lastResNum == null || !lastResNum.equals(currentResNum) ) { 1030 trimmedChain.addGroup(g); 1031 } else { 1032 logger.debug("Removing seqres group because it seems to be repeated in entity_poly_seq, most likely has hetero='y': "+g); 1033 } 1034 1035 lastResNum = currentResNum; 1036 1037 } 1038 return trimmedChain; 1039 } 1040 1041 private void addBonds() { 1042 BondMaker maker = new BondMaker(structure, params); 1043 maker.makeBonds(); 1044 maker.formBondsFromStructConn(structConn); 1045 } 1046 1047 private void alignSeqRes() { 1048 1049 logger.debug("Parsing mode align_seqres, will align to ATOM to SEQRES sequence"); 1050 1051 // fix SEQRES residue numbering for all models 1052 1053 for (int model=0;model<structure.nrModels();model++) { 1054 1055 List<Chain> atomList = structure.getModel(model); 1056 1057 for (Chain seqResChain: seqResChains){ 1058 1059 // this extracts the matching atom chain from atomList 1060 Chain atomChain = SeqRes2AtomAligner.getMatchingAtomRes(seqResChain, atomList, true); 1061 1062 if (atomChain == null) { 1063 // most likely there's no observed residues at all for the seqres chain: can't map 1064 // e.g. 3zyb: chains with asym_id L,M,N,O,P have no observed residues 1065 logger.info("Could not map SEQRES chain with asym_id={} to any ATOM chain. Most likely there's no observed residues in the chain.", 1066 seqResChain.getId()); 1067 continue; 1068 } 1069 1070 //map the atoms to the seqres... 1071 1072 // we need to first clone the seqres so that they stay independent for different models 1073 List<Group> seqResGroups = new ArrayList<Group>(); 1074 for (int i=0;i<seqResChain.getAtomGroups().size();i++) { 1075 seqResGroups.add((Group)seqResChain.getAtomGroups().get(i).clone()); 1076 } 1077 1078 for ( int seqResPos = 0 ; seqResPos < seqResGroups.size(); seqResPos++) { 1079 Group seqresG = seqResGroups.get(seqResPos); 1080 boolean found = false; 1081 for ( Group atomG: atomChain.getAtomGroups()) { 1082 1083 int internalNr = getInternalNr (atomG); 1084 1085 if (seqresG.getResidueNumber().getSeqNum() == internalNr ) { 1086 seqResGroups.set(seqResPos, atomG); 1087 found = true; 1088 break; 1089 } 1090 1091 1092 } 1093 if ( ! found) 1094 // so far the residue number has tracked internal numbering. 1095 // however there are no atom records, as such this can't be a PDB residue number... 1096 seqresG.setResidueNumber(null); 1097 } 1098 atomChain.setSeqResGroups(seqResGroups); 1099 1100 } 1101 } 1102 } 1103 1104 private int getInternalNr(Group atomG) { 1105 if ( atomG.getType().equals(GroupType.AMINOACID)) { 1106 AminoAcidImpl aa = (AminoAcidImpl) atomG; 1107 return new Long(aa.getId()).intValue(); 1108 } else if ( atomG.getType().equals(GroupType.NUCLEOTIDE)) { 1109 NucleotideImpl nu = (NucleotideImpl) atomG; 1110 return new Long(nu.getId()).intValue(); 1111 } else { 1112 HetatomImpl he = (HetatomImpl) atomG; 1113 return new Long(he.getId()).intValue(); 1114 } 1115 } 1116 1117 private void addEntities(StructAsym asym) { 1118 int eId = 0; 1119 try { 1120 eId = Integer.parseInt(asym.getEntity_id()); 1121 } catch (NumberFormatException e) { 1122 logger.warn("Could not parse mol_id from string {}. Will use 0 for creating Entity",asym.getEntity_id()); 1123 } 1124 Entity e = getEntity(eId); 1125 1126 // for some mmCIF files like 1yrm all 3 of _entity_src_gen, _entity_src_nat and _pdbx_entity_src_syn are missing 1127 // we need to fill the Compounds in some other way: 1128 1129 EntityInfo entityInfo = structure.getEntityById(eId); 1130 1131 if (entityInfo==null) { 1132 //logger.info("Creating new EntityInfo " + eId + " " + e.getId() + " " + e.getPdbx_description()); 1133 entityInfo = new EntityInfo(); 1134 entityInfo.setMolId(eId); 1135 // we only add the compound if a polymeric one (to match what the PDB parser does) 1136 if (e!=null) { 1137 entityInfo.setDescription(e.getPdbx_description()); 1138 1139 EntityType eType = EntityType.entityTypeFromString(e.getType()); 1140 if (eType!=null) { 1141 entityInfo.setType(eType); 1142 } else { 1143 logger.warn("Type '{}' is not recognised as a valid entity type for entity {}", e.getType(), eId); 1144 } 1145 addAncilliaryEntityData(asym, eId, e, entityInfo); 1146 structure.addEntityInfo(entityInfo); 1147 logger.debug("Adding Entity with entity id {} from _entity, with name: {}",eId, entityInfo.getDescription()); 1148 } 1149 } 1150 } 1151 1152 1153 /** 1154 * Add any extra information to the entity information. 1155 * @param asym 1156 * @param entityId 1157 * @param entity 1158 * @param entityInfo 1159 */ 1160 private void addAncilliaryEntityData(StructAsym asym, int entityId, Entity entity, EntityInfo entityInfo) { 1161 // Loop through each of the entity types and add the corresponding data 1162 // We're assuming if data is duplicated between sources it is consistent 1163 // This is a potentially huge assumption... 1164 1165 1166 for (EntitySrcGen esg : entitySrcGens) { 1167 1168 if (! esg.getEntity_id().equals(asym.getEntity_id())) 1169 continue; 1170 1171 addInformationFromESG(esg, entityId, entityInfo); 1172 1173 } 1174 1175 for (EntitySrcNat esn : entitySrcNats) { 1176 if (! esn.getEntity_id().equals(asym.getEntity_id())) 1177 continue; 1178 addInformationFromESN(esn, entityId, entityInfo); 1179 1180 } 1181 1182 for (EntitySrcSyn ess : entitySrcSyns) { 1183 if (! ess.getEntity_id().equals(asym.getEntity_id())) 1184 continue; 1185 addInfoFromESS(ess, entityId, entityInfo); 1186 1187 } 1188 } 1189 1190 /** 1191 * Add the information from an ESG to a compound. 1192 * @param entitySrcInfo 1193 * @param entityId 1194 * @param c 1195 */ 1196 private void addInformationFromESG(EntitySrcGen entitySrcInfo, int entityId, EntityInfo c) { 1197 c.setAtcc(entitySrcInfo.getPdbx_gene_src_atcc()); 1198 c.setCell(entitySrcInfo.getPdbx_gene_src_cell()); 1199 c.setOrganismCommon(entitySrcInfo.getGene_src_common_name()); 1200 c.setOrganismScientific(entitySrcInfo.getPdbx_gene_src_scientific_name()); 1201 c.setOrganismTaxId(entitySrcInfo.getPdbx_gene_src_ncbi_taxonomy_id()); 1202 c.setExpressionSystemTaxId(entitySrcInfo.getPdbx_host_org_ncbi_taxonomy_id()); 1203 c.setExpressionSystem(entitySrcInfo.getPdbx_host_org_scientific_name()); 1204 } 1205 1206 /** 1207 * Add the information to entity info from ESN. 1208 * @param esn 1209 * @param eId 1210 * @param c 1211 */ 1212 private void addInformationFromESN(EntitySrcNat esn, int eId, EntityInfo c) { 1213 1214 c.setAtcc(esn.getPdbx_atcc()); 1215 c.setCell(esn.getPdbx_cell()); 1216 c.setOrganismCommon(esn.getCommon_name()); 1217 c.setOrganismScientific(esn.getPdbx_organism_scientific()); 1218 c.setOrganismTaxId(esn.getPdbx_ncbi_taxonomy_id()); 1219 1220 } 1221 /** 1222 * Add the information from ESS to Entity info. 1223 * @param ess 1224 * @param eId 1225 * @param c 1226 */ 1227 private void addInfoFromESS(EntitySrcSyn ess, int eId, EntityInfo c) { 1228 c.setOrganismCommon(ess.getOrganism_common_name()); 1229 c.setOrganismScientific(ess.getOrganism_scientific()); 1230 c.setOrganismTaxId(ess.getNcbi_taxonomy_id()); 1231 1232 } 1233 1234 private void initMaps() { 1235 1236 1237 if (structAsyms == null || structAsyms.isEmpty()) { 1238 logger.info("No _struct_asym category found in file. No asym id to entity_id mapping will be available"); 1239 return; 1240 } 1241 1242 Map<String, List<String>> entityId2asymId = new HashMap<>(); 1243 1244 for (StructAsym asym : structAsyms) { 1245 1246 logger.debug("Entity {} matches asym_id: {}", asym.getEntity_id(), asym.getId() ); 1247 1248 asymId2entityId.put(asym.getId(), asym.getEntity_id()); 1249 1250 if (entityId2asymId.containsKey(asym.getEntity_id())) { 1251 List<String> asymIds = entityId2asymId.get(asym.getEntity_id()); 1252 asymIds.add(asym.getId()); 1253 } else { 1254 List<String> asymIds = new ArrayList<>(); 1255 asymIds.add(asym.getId()); 1256 entityId2asymId.put(asym.getEntity_id(), asymIds); 1257 } 1258 } 1259 1260 if (entityPolys==null || entityPolys.isEmpty()) { 1261 logger.info("No _entity_poly category found in file. No asym id to author id mapping will be available for header only parsing"); 1262 return; 1263 } 1264 1265 for (EntityPoly ep:entityPolys) { 1266 if (ep.getPdbx_strand_id()==null) { 1267 logger.info("_entity_poly.pdbx_strand_id is null for entity {}. Won't be able to map asym ids to author ids for this entity.", ep.getEntity_id()); 1268 continue; 1269 } 1270 String[] chainNames = ep.getPdbx_strand_id().split(","); 1271 List<String> asymIds = entityId2asymId.get(ep.getEntity_id()); 1272 if (chainNames.length!=asymIds.size()) { 1273 logger.warn("The list of asym ids (from _struct_asym) and the list of author ids (from _entity_poly) for entity {} have different lengths! Can't provide a mapping from asym ids to author chain ids", ep.getEntity_id()); 1274 continue; 1275 } 1276 for (int i=0; i<chainNames.length; i++) { 1277 asymId2authorId.put(asymIds.get(i), chainNames[i]); 1278 } 1279 } 1280 } 1281 1282 private void setStructNcsOps() { 1283 1284 ArrayList<Matrix4d> ncsOperators = new ArrayList<Matrix4d>(); 1285 1286 for (StructNcsOper sNcsOper:structNcsOper) { 1287 1288 if (!sNcsOper.getCode().equals("generate")) continue; 1289 1290 try { 1291 Matrix4d op = new Matrix4d(); 1292 op.setElement(3, 0, 0.0); 1293 op.setElement(3, 1, 0.0); 1294 op.setElement(3, 2, 0.0); 1295 op.setElement(3, 3, 1.0); 1296 1297 1298 op.setElement(0, 0, Double.parseDouble(sNcsOper.getMatrix11())); 1299 op.setElement(0, 1, Double.parseDouble(sNcsOper.getMatrix12())); 1300 op.setElement(0, 2, Double.parseDouble(sNcsOper.getMatrix13())); 1301 1302 op.setElement(1, 0, Double.parseDouble(sNcsOper.getMatrix21())); 1303 op.setElement(1, 1, Double.parseDouble(sNcsOper.getMatrix22())); 1304 op.setElement(1, 2, Double.parseDouble(sNcsOper.getMatrix23())); 1305 1306 op.setElement(2, 0, Double.parseDouble(sNcsOper.getMatrix31())); 1307 op.setElement(2, 1, Double.parseDouble(sNcsOper.getMatrix32())); 1308 op.setElement(2, 2, Double.parseDouble(sNcsOper.getMatrix33())); 1309 1310 op.setElement(0, 3, Double.parseDouble(sNcsOper.getVector1())); 1311 op.setElement(1, 3, Double.parseDouble(sNcsOper.getVector2())); 1312 op.setElement(2, 3, Double.parseDouble(sNcsOper.getVector3())); 1313 1314 ncsOperators.add(op); 1315 1316 } catch (NumberFormatException e) { 1317 logger.warn("Error parsing doubles in NCS operator list, skipping operator {}", structNcsOper.indexOf(sNcsOper)+1); 1318 } 1319 1320 } 1321 1322 // we only set it if not empty, otherwise remains null 1323 if (ncsOperators.size()>0) { 1324 structure.getCrystallographicInfo().setNcsOperators( 1325 ncsOperators.toArray(new Matrix4d[ncsOperators.size()])); 1326 } 1327 } 1328 1329 private void setCrystallographicInfoMetadata() { 1330 if (parsedScaleMatrix!=null) { 1331 1332 PDBCrystallographicInfo crystalInfo = structure.getCrystallographicInfo(); 1333 1334 boolean nonStd = false; 1335 if (crystalInfo.getCrystalCell()!=null && !crystalInfo.getCrystalCell().checkScaleMatrix(parsedScaleMatrix)) { 1336 nonStd = true; 1337 } 1338 1339 crystalInfo.setNonStandardCoordFrameConvention(nonStd); 1340 } 1341 } 1342 1343 1344 /** This method will return the parsed protein structure, once the parsing has been finished 1345 * 1346 * @return a BioJava protein structure object 1347 */ 1348 public Structure getStructure() { 1349 1350 return structure; 1351 } 1352 1353 @Override 1354 public void newDatabasePDBrevRecord(DatabasePdbrevRecord record) { 1355 1356 PDBHeader header = structure.getPDBHeader(); 1357 1358 if ( header == null) { 1359 header = new PDBHeader(); 1360 structure.setPDBHeader(header); 1361 } 1362 1363 List<DatabasePdbrevRecord> revRecords = header.getRevisionRecords(); 1364 if ( revRecords == null) { 1365 revRecords = new ArrayList<DatabasePdbrevRecord>(); 1366 header.setRevisionRecords(revRecords); 1367 } 1368 revRecords.add(record); 1369 1370 1371 } 1372 1373 1374 @Override 1375 public void newDatabasePDBrev(DatabasePDBrev dbrev) { 1376 1377 logger.debug("got a database revision:" + dbrev); 1378 1379 SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd",Locale.US); 1380 PDBHeader header = structure.getPDBHeader(); 1381 1382 if ( header == null) { 1383 header = new PDBHeader(); 1384 } 1385 1386 if (dbrev.getNum().equals("1")){ 1387 1388 try { 1389 Date dep = dateFormat.parse(dbrev.getDate_original()); 1390 header.setDepDate(dep); 1391 1392 } catch (ParseException e){ 1393 logger.warn("Could not parse date string '{}', deposition date will be unavailable", dbrev.getDate_original()); 1394 } 1395 1396 try { 1397 Date rel = dateFormat.parse(dbrev.getDate()); 1398 header.setRelDate(rel); 1399 1400 } catch (ParseException e){ 1401 logger.warn("Could not parse date string '{}', modification date will be unavailable", dbrev.getDate()); 1402 } 1403 1404 1405 } else { 1406 try { 1407 1408 Date mod = dateFormat.parse(dbrev.getDate()); 1409 header.setModDate(mod); 1410 1411 } catch (ParseException e){ 1412 logger.warn("Could not parse date string '{}', modification date will be unavailable", dbrev.getDate()); 1413 } 1414 } 1415 1416 structure.setPDBHeader(header); 1417 } 1418 1419 @Override 1420 public void newPdbxAuditRevisionHistory(PdbxAuditRevisionHistory history) { 1421 1422 SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd",Locale.US); 1423 PDBHeader header = structure.getPDBHeader(); 1424 1425 if ( header == null) { 1426 header = new PDBHeader(); 1427 } 1428 1429 // first entry in revision history is the release date 1430 if (history.getOrdinal().equals("1")){ 1431 try { 1432 Date releaseDate = dateFormat.parse(history.getRevision_date()); 1433 header.setRelDate(releaseDate); 1434 1435 } catch (ParseException e){ 1436 logger.warn("Could not parse date string '{}', release date will be unavailable", history.getRevision_date()); 1437 } 1438 } else { 1439 // all other dates are revision dates; 1440 // since this method may be called multiple times, 1441 // the last revision date will "stick" 1442 try { 1443 Date revisionDate = dateFormat.parse(history.getRevision_date()); 1444 header.setModDate(revisionDate); 1445 } catch (ParseException e){ 1446 logger.warn("Could not parse date string '{}', revision date will be unavailable", history.getRevision_date()); 1447 } 1448 } 1449 1450 structure.setPDBHeader(header); 1451 } 1452 1453 @Override 1454 public void newPdbxDatabaseStatus(PdbxDatabaseStatus status) { 1455 1456 // the deposition date field is only available in mmCIF 5.0 1457 1458 if (status.getRecvd_initial_deposition_date() == null) { 1459 // skip this method for older mmCIF versions 1460 return; 1461 } 1462 1463 SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd",Locale.US); 1464 PDBHeader header = structure.getPDBHeader(); 1465 1466 if (header == null) { 1467 header = new PDBHeader(); 1468 } 1469 1470 try { 1471 Date depositionDate = dateFormat.parse(status.getRecvd_initial_deposition_date()); 1472 header.setDepDate(depositionDate); 1473 } catch (ParseException e){ 1474 logger.warn("Could not parse date string '{}', deposition date will be unavailable", status.getRecvd_initial_deposition_date()); 1475 } 1476 1477 structure.setPDBHeader(header); 1478 } 1479 1480 @Override 1481 public void newDatabasePDBremark(DatabasePDBremark remark) { 1482 //System.out.println(remark); 1483 String id = remark.getId(); 1484 if (id.equals("2")){ 1485 1486 //this remark field contains the resolution information: 1487 String line = remark.getText(); 1488 1489 int i = line.indexOf("ANGSTROM"); 1490 if ( i > 5) { 1491 // line contains ANGSTROM info... 1492 String resolution = line.substring(i-5,i).trim(); 1493 // convert string to float 1494 float res = 99 ; 1495 try { 1496 res = Float.parseFloat(resolution); 1497 1498 } catch (NumberFormatException e) { 1499 logger.info("could not parse resolution from line and ignoring it " + line); 1500 return ; 1501 1502 1503 } 1504 // support for old style header 1505 1506 PDBHeader pdbHeader = structure.getPDBHeader(); 1507 pdbHeader.setResolution(res); 1508 1509 } 1510 1511 } 1512 } 1513 1514 @Override 1515 public void newRefine(Refine r){ 1516 1517 PDBHeader pdbHeader = structure.getPDBHeader(); 1518 // RESOLUTION 1519 // in very rare cases (for instance hybrid methods x-ray + neutron diffraction, e.g. 3ins, 4n9m) 1520 // there are 2 resolution values, one for each method 1521 // we take the last one found so that behaviour is like in PDB file parsing 1522 if (pdbHeader.getResolution()!=PDBHeader.DEFAULT_RESOLUTION) { 1523 logger.warn("More than 1 resolution value present, will use last one {} and discard previous {} " 1524 ,r.getLs_d_res_high(), String.format("%4.2f",pdbHeader.getResolution())); 1525 } 1526 try { 1527 pdbHeader.setResolution(Float.parseFloat(r.getLs_d_res_high())); 1528 } catch (NumberFormatException e){ 1529 logger.info("Could not parse resolution from " + r.getLs_d_res_high() + " " + e.getMessage()); 1530 } 1531 1532 1533 // RFREE 1534 if (pdbHeader.getRfree()!=PDBHeader.DEFAULT_RFREE) { 1535 logger.warn("More than 1 Rfree value present, will use last one {} and discard previous {} ", 1536 r.getLs_R_factor_R_free(), String.format("%4.2f",pdbHeader.getRfree())); 1537 } 1538 if (r.getLs_R_factor_R_free()==null) { 1539 // some entries like 2ifo haven't got this field at all 1540 logger.info("_refine.ls_R_factor_R_free not present, not parsing Rfree value"); 1541 } else { 1542 try { 1543 pdbHeader.setRfree(Float.parseFloat(r.getLs_R_factor_R_free())); 1544 } catch (NumberFormatException e){ 1545 // no rfree present ('?') is very usual, that's why we set it to debug 1546 logger.debug("Could not parse Rfree from string '{}'", r.getLs_R_factor_R_free()); 1547 } 1548 } 1549 1550 // RWORK 1551 if(pdbHeader.getRwork()!=PDBHeader.DEFAULT_RFREE) { 1552 logger.warn("More than 1 R work value present, will use last one {} and discard previous {} ", 1553 r.getLs_R_factor_R_work(), String.format("%4.2f",pdbHeader.getRwork())); 1554 } 1555 if(r.getLs_R_factor_R_work()==null){ 1556 logger.info("_refine.ls_R_factor_R_work not present, not parsing R-work value"); 1557 } 1558 else{ 1559 try{ 1560 pdbHeader.setRwork(Float.parseFloat(r.getLs_R_factor_R_work())); 1561 } 1562 catch (NumberFormatException e){ 1563 logger.debug("Could not parse R-work from string '{}'", r.getLs_R_factor_R_work()); 1564 } 1565 1566 } 1567 1568 } 1569 1570 1571 @Override 1572 public void newAuditAuthor(AuditAuthor aa){ 1573 1574 String name = aa.getName(); 1575 1576 StringBuffer famName = new StringBuffer(); 1577 StringBuffer initials = new StringBuffer(); 1578 boolean afterComma = false; 1579 for ( char c: name.toCharArray()) { 1580 if ( c == ' ') 1581 continue; 1582 if ( c == ','){ 1583 afterComma = true; 1584 continue; 1585 } 1586 1587 if ( afterComma) 1588 initials.append(c); 1589 else 1590 famName.append(c); 1591 } 1592 1593 StringBuffer newaa = new StringBuffer(); 1594 newaa.append(initials); 1595 newaa.append(famName); 1596 1597 PDBHeader header = structure.getPDBHeader(); 1598 String auth = header.getAuthors(); 1599 if (auth == null) { 1600 header.setAuthors(newaa.toString()); 1601 }else { 1602 auth += "," + newaa.toString(); 1603 header.setAuthors(auth); 1604 1605 } 1606 } 1607 1608 @Override 1609 public void newExptl(Exptl exptl) { 1610 1611 PDBHeader pdbHeader = structure.getPDBHeader(); 1612 String method = exptl.getMethod(); 1613 pdbHeader.setExperimentalTechnique(method); 1614 1615 } 1616 1617 @Override 1618 public void newCell(Cell cell) { 1619 1620 try { 1621 float a = Float.parseFloat(cell.getLength_a()); 1622 float b = Float.parseFloat(cell.getLength_b()); 1623 float c = Float.parseFloat(cell.getLength_c()); 1624 float alpha = Float.parseFloat(cell.getAngle_alpha()); 1625 float beta = Float.parseFloat(cell.getAngle_beta()); 1626 float gamma = Float.parseFloat(cell.getAngle_gamma()); 1627 1628 CrystalCell xtalCell = new CrystalCell(); 1629 xtalCell.setA(a); 1630 xtalCell.setB(b); 1631 xtalCell.setC(c); 1632 xtalCell.setAlpha(alpha); 1633 xtalCell.setBeta(beta); 1634 xtalCell.setGamma(gamma); 1635 1636 if (!xtalCell.isCellReasonable()) { 1637 // If the entry describes a structure determined by a technique other than X-ray crystallography, 1638 // cell is (sometimes!) a = b = c = 1.0, alpha = beta = gamma = 90 degrees 1639 // if so we don't add and CrystalCell will be null 1640 logger.debug("The crystal cell read from file does not have reasonable dimensions (at least one dimension is below {}), discarding it.", 1641 CrystalCell.MIN_VALID_CELL_SIZE); 1642 return; 1643 } 1644 1645 structure.getPDBHeader().getCrystallographicInfo().setCrystalCell(xtalCell); 1646 1647 } catch (NumberFormatException e){ 1648 structure.getPDBHeader().getCrystallographicInfo().setCrystalCell(null); 1649 logger.info("could not parse some cell parameters ("+e.getMessage()+"), ignoring _cell "); 1650 } 1651 } 1652 1653 @Override 1654 public void newSymmetry(Symmetry symmetry) { 1655 String spaceGroup = symmetry.getSpace_group_name_H_M(); 1656 SpaceGroup sg = SymoplibParser.getSpaceGroup(spaceGroup); 1657 if (sg==null) { 1658 logger.warn("Space group '"+spaceGroup+"' not recognised as a standard space group"); 1659 structure.getPDBHeader().getCrystallographicInfo().setNonStandardSg(true); 1660 } else { 1661 structure.getPDBHeader().getCrystallographicInfo().setSpaceGroup(sg); 1662 structure.getPDBHeader().getCrystallographicInfo().setNonStandardSg(false); 1663 } 1664 } 1665 1666 @Override 1667 public void newStructNcsOper(StructNcsOper sNcsOper) { 1668 structNcsOper.add(sNcsOper); 1669 } 1670 1671 public void newAtomSites(AtomSites atomSites) { 1672 1673 try { 1674 Matrix4d m = new Matrix4d( 1675 Double.parseDouble(atomSites.getFract_transf_matrix11()), Double.parseDouble(atomSites.getFract_transf_matrix12()), Double.parseDouble(atomSites.getFract_transf_matrix13()), Double.parseDouble(atomSites.getFract_transf_vector1()), 1676 Double.parseDouble(atomSites.getFract_transf_matrix21()), Double.parseDouble(atomSites.getFract_transf_matrix22()), Double.parseDouble(atomSites.getFract_transf_matrix23()), Double.parseDouble(atomSites.getFract_transf_vector2()), 1677 Double.parseDouble(atomSites.getFract_transf_matrix31()), Double.parseDouble(atomSites.getFract_transf_matrix32()), Double.parseDouble(atomSites.getFract_transf_matrix33()), Double.parseDouble(atomSites.getFract_transf_vector3()), 1678 0,0,0,1); 1679 1680 parsedScaleMatrix = m; 1681 1682 } catch (NumberFormatException e) { 1683 logger.warn("Some values in _atom_sites.fract_transf_matrix or _atom_sites.fract_transf_vector could not be parsed as numbers. Can't check whether coordinate frame convention is correct! Error: {}", e.getMessage()); 1684 structure.getPDBHeader().getCrystallographicInfo().setNonStandardCoordFrameConvention(false); 1685 1686 // in this case parsedScaleMatrix stays null and can't be used in documentEnd() 1687 } 1688 } 1689 1690 @Override 1691 public void newStructRef(StructRef sref) { 1692 logger.debug(sref.toString()); 1693 strucRefs.add(sref); 1694 } 1695 1696 private StructRef getStructRef(String ref_id){ 1697 for (StructRef structRef : strucRefs) { 1698 1699 if (structRef.getId().equals(ref_id)){ 1700 return structRef; 1701 } 1702 1703 } 1704 return null; 1705 1706 } 1707 1708 /** 1709 * create a DBRef record from the StrucRefSeq record: 1710 * <pre> 1711 * PDB record DBREF 1712 * Field Name mmCIF Data Item 1713 * Section n.a. 1714 * PDB_ID_Code _struct_ref_seq.pdbx_PDB_id_code 1715 * Strand_ID _struct_ref_seq.pdbx_strand_id 1716 * Begin_Residue_Number _struct_ref_seq.pdbx_auth_seq_align_beg 1717 * Begin_Ins_Code _struct_ref_seq.pdbx_seq_align_beg_ins_code 1718 * End_Residue_Number _struct_ref_seq.pdbx_auth_seq_align_end 1719 * End_Ins_Code _struct_ref_seq.pdbx_seq_align_end_ins_code 1720 * Database _struct_ref.db_name 1721 * Database_Accession_No _struct_ref_seq.pdbx_db_accession 1722 * Database_ID_Code _struct_ref.db_code 1723 * Database_Begin_Residue_Number _struct_ref_seq.db_align_beg 1724 * Databaes_Begin_Ins_Code _struct_ref_seq.pdbx_db_align_beg_ins_code 1725 * Database_End_Residue_Number _struct_ref_seq.db_align_end 1726 * Databaes_End_Ins_Code _struct_ref_seq.pdbx_db_align_end_ins_code 1727 * </pre> 1728 * 1729 * 1730 */ 1731 @Override 1732 public void newStructRefSeq(StructRefSeq sref) { 1733 DBRef r = new DBRef(); 1734 1735 r.setIdCode(sref.getPdbx_PDB_id_code()); 1736 r.setDbAccession(sref.getPdbx_db_accession()); 1737 r.setDbIdCode(sref.getPdbx_db_accession()); 1738 1739 r.setChainName(sref.getPdbx_strand_id()); 1740 StructRef structRef = getStructRef(sref.getRef_id()); 1741 if (structRef == null){ 1742 logger.info("could not find StructRef " + sref.getRef_id() + " for StructRefSeq " + sref); 1743 } else { 1744 r.setDatabase(structRef.getDb_name()); 1745 r.setDbIdCode(structRef.getDb_code()); 1746 } 1747 1748 int seqbegin; 1749 int seqend; 1750 try{ 1751 seqbegin = Integer.parseInt(sref.getPdbx_auth_seq_align_beg()); 1752 seqend = Integer.parseInt(sref.getPdbx_auth_seq_align_end()); 1753 } 1754 catch(NumberFormatException e){ 1755 // this happens in a few entries, annotation error? e.g. 6eoj 1756 logger.warn("Couldn't parse pdbx_auth_seq_align_beg/end in _struct_ref_seq. Will not store dbref alignment info for accession {}. Error: {}", r.getDbAccession(), e.getMessage()); 1757 return; 1758 } 1759 1760 Character begin_ins_code = ' '; 1761 if (sref.getPdbx_seq_align_beg_ins_code() != null ) { 1762 begin_ins_code = new Character(sref.getPdbx_seq_align_beg_ins_code().charAt(0)); 1763 } 1764 1765 Character end_ins_code = ' '; 1766 if (sref.getPdbx_seq_align_end_ins_code() != null) { 1767 end_ins_code = new Character(sref.getPdbx_seq_align_end_ins_code().charAt(0)); 1768 } 1769 1770 if (begin_ins_code == '?') 1771 begin_ins_code = ' '; 1772 1773 if (end_ins_code == '?') 1774 end_ins_code = ' '; 1775 1776 r.setSeqBegin(seqbegin); 1777 r.setInsertBegin(begin_ins_code); 1778 1779 r.setSeqEnd(seqend); 1780 r.setInsertEnd(end_ins_code); 1781 1782 int dbseqbegin = Integer.parseInt(sref.getDb_align_beg()); 1783 int dbseqend = Integer.parseInt(sref.getDb_align_end()); 1784 1785 Character db_begin_in_code = ' '; 1786 if (sref.getPdbx_db_align_beg_ins_code() != null) { 1787 db_begin_in_code = new Character(sref.getPdbx_db_align_beg_ins_code().charAt(0)); 1788 } 1789 1790 Character db_end_in_code = ' '; 1791 if (sref.getPdbx_db_align_end_ins_code() != null) { 1792 db_end_in_code = new Character(sref.getPdbx_db_align_end_ins_code().charAt(0)); 1793 } 1794 1795 if (db_begin_in_code == '?') 1796 db_begin_in_code = ' '; 1797 1798 if (db_end_in_code == '?') 1799 db_end_in_code = ' '; 1800 1801 1802 r.setDbSeqBegin(dbseqbegin); 1803 r.setIdbnsBegin(db_begin_in_code); 1804 1805 r.setDbSeqEnd(dbseqend); 1806 r.setIdbnsEnd(db_end_in_code); 1807 1808 List<DBRef> dbrefs = structure.getDBRefs(); 1809 if ( dbrefs == null) 1810 dbrefs = new ArrayList<DBRef>(); 1811 dbrefs.add(r); 1812 1813 logger.debug(r.toPDB()); 1814 1815 structure.setDBRefs(dbrefs); 1816 1817 } 1818 1819 @Override 1820 public void newStructRefSeqDif(StructRefSeqDif sref) { 1821 sequenceDifs.add(sref); 1822 } 1823 1824 private Chain getEntityChain(String entity_id){ 1825 1826 for (Chain chain : entityChains) { 1827 if ( chain.getId().equals(entity_id)){ 1828 1829 return chain; 1830 } 1831 } 1832 // does not exist yet, so create... 1833 1834 Chain chain = new ChainImpl(); 1835 chain.setId(entity_id); 1836 entityChains.add(chain); 1837 1838 return chain; 1839 1840 } 1841 1842 //private Chain getSeqResChain(String chainID){ 1843 // return getChainFromList(seqResChains, chainID); 1844 //} 1845 1846 1847 /** 1848 * Data items in the ENTITY_SRC_GEN category record details of 1849 * the source from which the entity was obtained in cases 1850 * where the source was genetically manipulated. The 1851 * following are treated separately: items pertaining to the tissue 1852 * from which the gene was obtained, items pertaining to the host 1853 * organism for gene expression and items pertaining to the actual 1854 * producing organism (plasmid). 1855 */ 1856 @Override 1857 public void newEntitySrcGen(EntitySrcGen entitySrcGen){ 1858 1859 // add to internal list. Map to Compound object later on... 1860 entitySrcGens.add(entitySrcGen); 1861 } 1862 1863 @Override 1864 public void newEntitySrcNat(EntitySrcNat entitySrcNat){ 1865 1866 // add to internal list. Map to Compound object later on... 1867 entitySrcNats.add(entitySrcNat); 1868 } 1869 1870 @Override 1871 public void newEntitySrcSyn(EntitySrcSyn entitySrcSyn){ 1872 1873 // add to internal list. Map to Compound object later on... 1874 entitySrcSyns.add(entitySrcSyn); 1875 } 1876 1877 /** 1878 * The EntityPolySeq object provide the amino acid sequence objects for the Entities. 1879 * Later on the entities are mapped to the BioJava {@link Chain} and {@link EntityInfo} objects. 1880 * @param epolseq the EntityPolySeq record for one amino acid 1881 */ 1882 @Override 1883 public void newEntityPolySeq(EntityPolySeq epolseq) { 1884 1885 logger.debug("NEW entity poly seq " + epolseq); 1886 1887 int eId = -1; 1888 try { 1889 eId = Integer.parseInt(epolseq.getEntity_id()); 1890 } catch (NumberFormatException e) { 1891 logger.warn("Could not parse entity id from EntityPolySeq: "+e.getMessage()); 1892 } 1893 Entity e = getEntity(eId); 1894 1895 if (e == null){ 1896 logger.info("Could not find entity "+ epolseq.getEntity_id()+". Can not match sequence to it."); 1897 return; 1898 } 1899 1900 Chain entityChain = getEntityChain(epolseq.getEntity_id()); 1901 1902 // first we check through the chemcomp provider, if it fails we do some heuristics to guess the type of group 1903 // TODO some of this code is analogous to getNewGroup() and we should try to unify them - JD 2016-03-08 1904 1905 Group g = ChemCompGroupFactory.getGroupFromChemCompDictionary(epolseq.getMon_id()); 1906 //int seqId = Integer.parseInt(epolseq.getNum()); 1907 if ( g != null && !g.getChemComp().isEmpty()) { 1908 if ( g instanceof AminoAcidImpl) { 1909 AminoAcidImpl aa = (AminoAcidImpl) g; 1910 aa.setRecordType(AminoAcid.SEQRESRECORD); 1911 //aa.setId(seqId); 1912 } 1913 } else { 1914 1915 if (epolseq.getMon_id().length()==3 && StructureTools.get1LetterCodeAmino(epolseq.getMon_id())!=null){ 1916 AminoAcidImpl a = new AminoAcidImpl(); 1917 a.setRecordType(AminoAcid.SEQRESRECORD); 1918 Character code1 = StructureTools.get1LetterCodeAmino(epolseq.getMon_id()); 1919 a.setAminoType(code1); 1920 g = a; 1921 1922 } else if ( StructureTools.isNucleotide(epolseq.getMon_id())) { 1923 // the group is actually a nucleotide group... 1924 NucleotideImpl n = new NucleotideImpl(); 1925 g = n; 1926 1927 } else { 1928 logger.debug("Residue {} {} is not a standard aminoacid or nucleotide, will create a het group for it", epolseq.getNum(),epolseq.getMon_id()); 1929 HetatomImpl h = new HetatomImpl(); 1930 g = h; 1931 1932 } 1933 1934 1935 } 1936 // at this stage we don't know about author residue numbers (insertion codes) 1937 // we abuse now the ResidueNumber field setting the internal residue numbers (label_seq_id, strictly sequential and follow the seqres sequence 1 to n) 1938 // later the actual ResidueNumbers (author residue numbers) have to be corrected in alignSeqRes() 1939 g.setResidueNumber(ResidueNumber.fromString(epolseq.getNum())); 1940 1941 g.setPDBName(epolseq.getMon_id()); 1942 1943 entityChain.addGroup(g); 1944 1945 } 1946 1947 @Override 1948 public void newPdbxPolySeqScheme(PdbxPolySeqScheme ppss) { 1949 1950 //if ( headerOnly) 1951 // return; 1952 1953 // replace the group asym ids with the real PDB ids! 1954 // replaceGroupSeqPos(ppss); // This might be incorrect in some pdb, to use auth_seq_id of the pdbx_poly_seq_scheme. 1955 1956 1957 } 1958 1959 1960 @Override 1961 public void newPdbxNonPolyScheme(PdbxNonPolyScheme ppss) { 1962 1963 //if (headerOnly) 1964 // return; 1965 1966 // merge the EntityPolySeq info and the AtomSite chains into one... 1967 //already known ignore: 1968 1969 } 1970 1971 @Override 1972 public void newPdbxEntityNonPoly(PdbxEntityNonPoly pen){ 1973 // TODO: do something with them... 1974 // not implemented yet... 1975 logger.debug(pen.getEntity_id() + " " + pen.getName() + " " + pen.getComp_id()); 1976 1977 } 1978 1979 @Override 1980 public void newChemComp(ChemComp c) { 1981 // TODO: do something with them... 1982 1983 } 1984 1985 @Override 1986 public void newGenericData(String category, List<String> loopFields, 1987 List<String> lineData) { 1988 1989 //logger.debug("unhandled category so far: " + category); 1990 } 1991 1992 @Override 1993 public FileParsingParameters getFileParsingParameters() 1994 { 1995 return params; 1996 } 1997 1998 @Override 1999 public void setFileParsingParameters(FileParsingParameters params) 2000 { 2001 this.params = params; 2002 2003 } 2004 2005 @Override 2006 public void newChemCompDescriptor(ChemCompDescriptor ccd) { 2007 2008 // TODO nothing happening here yet. 2009 2010 } 2011 2012 2013 2014 public List<PdbxStructOperList> getStructOpers() { 2015 return structOpers; 2016 } 2017 2018 @Override 2019 public void newPdbxStrucAssembly(PdbxStructAssembly strucAssembly) { 2020 strucAssemblies.add(strucAssembly); 2021 2022 } 2023 2024 public List<PdbxStructAssembly> getStructAssemblies(){ 2025 return strucAssemblies; 2026 } 2027 2028 @Override 2029 public void newPdbxStrucAssemblyGen(PdbxStructAssemblyGen strucAssembly) { 2030 strucAssemblyGens.add(strucAssembly); 2031 2032 } 2033 2034 public List<PdbxStructAssemblyGen> getStructAssemblyGens(){ 2035 return strucAssemblyGens; 2036 } 2037 2038 @Override 2039 public void newChemCompAtom(ChemCompAtom atom) { 2040 2041 } 2042 2043 @Override 2044 public void newPdbxChemCompIndentifier(PdbxChemCompIdentifier id) { 2045 2046 } 2047 2048 @Override 2049 public void newChemCompBond(ChemCompBond bond) { 2050 2051 } 2052 2053 @Override 2054 public void newPdbxChemCompDescriptor(PdbxChemCompDescriptor desc) { 2055 2056 } 2057 2058 @Override 2059 public void newStructConn(StructConn structConn) { 2060 this.structConn.add(structConn); 2061 } 2062 2063 @Override 2064 public void newStructSiteGen(StructSiteGen siteGen) { this.structSiteGens.add(siteGen); } 2065 2066 @Override 2067 public void newStructSite(StructSite structSite) { 2068 2069 if (params.isHeaderOnly()) { 2070 return; 2071 } 2072 2073 // Simply implement the method. 2074 List<Site> sites = structure.getSites(); 2075 if (sites == null) sites = new ArrayList<Site>(); 2076 2077 Site site = null; 2078 for (Site asite : sites) { 2079 if (asite.getSiteID().equals(structSite.getId())) { 2080 site = asite; // Prevent duplicate siteIds 2081 } 2082 } 2083 boolean addSite = false; 2084 if (site == null) { site = new Site(); addSite = true; } 2085 site.setSiteID(structSite.getId()); 2086 site.setDescription(structSite.getDetails()); 2087 // site.setPdbxEvidenceCode(structSite.getPdbxEvidenceCode()); // TODO - add addition fields in Sites 2088 if (addSite) sites.add(site); 2089 2090 structure.setSites(sites); 2091 } 2092 2093 /** 2094 * Build sites in a BioJava Structure using the original author chain id & residue numbers. 2095 * Sites are built from struct_site_gen records that have been parsed. 2096 */ 2097 private void addSites() { 2098 List<Site> sites = structure.getSites(); 2099 if (sites == null) sites = new ArrayList<Site>(); 2100 2101 for (StructSiteGen siteGen : structSiteGens) { 2102 // For each StructSiteGen, find the residues involved, if they exist then 2103 String site_id = siteGen.getSite_id(); // multiple could be in same site. 2104 if (site_id == null) site_id = ""; 2105 String comp_id = siteGen.getLabel_comp_id(); // PDBName 2106 2107 // Assumption: the author chain ID and residue number for the site is consistent with the original 2108 // author chain id and residue numbers. 2109 2110 String asymId = siteGen.getLabel_asym_id(); // chain name 2111 String authId = siteGen.getAuth_asym_id(); // chain Id 2112 String auth_seq_id = siteGen.getAuth_seq_id(); // Res num 2113 2114 String insCode = siteGen.getPdbx_auth_ins_code(); 2115 if ( insCode != null && insCode.equals("?")) 2116 insCode = null; 2117 2118 // Look for asymID = chainID and seqID = seq_ID. Check that comp_id matches the resname. 2119 Group g = null; 2120 try { 2121 Chain chain = structure.getChain(asymId); 2122 2123 if (null != chain) { 2124 try { 2125 Character insChar = null; 2126 if (null != insCode && insCode.length() > 0) insChar = insCode.charAt(0); 2127 g = chain.getGroupByPDB(new ResidueNumber(null, Integer.parseInt(auth_seq_id), insChar)); 2128 } catch (NumberFormatException e) { 2129 logger.warn("Could not lookup residue : " + authId + auth_seq_id); 2130 } 2131 } 2132 } catch (StructureException e) { 2133 logger.warn("Problem finding residue in site entry " + siteGen.getSite_id() + " - " + e.getMessage(), e.getMessage()); 2134 } 2135 2136 if (g != null) { 2137 // 2. find the site_id, if not existing, create anew. 2138 Site site = null; 2139 for (Site asite: sites) { 2140 if (site_id.equals(asite.getSiteID())) site = asite; 2141 } 2142 2143 boolean addSite = false; 2144 2145 // 3. add this residue to the site. 2146 if (site == null) { 2147 addSite = true; 2148 site = new Site(); 2149 site.setSiteID(site_id); 2150 } 2151 2152 List<Group> groups = site.getGroups(); 2153 if (groups == null) groups = new ArrayList<Group>(); 2154 2155 // Check the self-consistency of the residue reference from auth_seq_id and chain_id 2156 if (!comp_id.equals(g.getPDBName())) { 2157 logger.warn("comp_id doesn't match the residue at " + authId + " " + auth_seq_id + " - skipping"); 2158 } else { 2159 groups.add(g); 2160 site.setGroups(groups); 2161 } 2162 if (addSite) sites.add(site); 2163 } 2164 } 2165 structure.setSites(sites); 2166 } 2167}