001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 * created at Apr 26, 2008 021 */ 022package org.biojava.nbio.structure.io.mmcif; 023 024import java.text.ParseException; 025import java.text.SimpleDateFormat; 026import java.util.ArrayList; 027import java.util.Date; 028import java.util.HashMap; 029import java.util.LinkedHashMap; 030import java.util.List; 031import java.util.Locale; 032import java.util.Map; 033 034import javax.vecmath.Matrix4d; 035 036import org.biojava.nbio.structure.AminoAcid; 037import org.biojava.nbio.structure.AminoAcidImpl; 038import org.biojava.nbio.structure.Atom; 039import org.biojava.nbio.structure.AtomImpl; 040import org.biojava.nbio.structure.Chain; 041import org.biojava.nbio.structure.ChainImpl; 042import org.biojava.nbio.structure.EntityInfo; 043import org.biojava.nbio.structure.EntityType; 044import org.biojava.nbio.structure.DBRef; 045import org.biojava.nbio.structure.Element; 046import org.biojava.nbio.structure.Group; 047import org.biojava.nbio.structure.GroupType; 048import org.biojava.nbio.structure.HetatomImpl; 049import org.biojava.nbio.structure.NucleotideImpl; 050import org.biojava.nbio.structure.PDBCrystallographicInfo; 051import org.biojava.nbio.structure.PDBHeader; 052import org.biojava.nbio.structure.ResidueNumber; 053import org.biojava.nbio.structure.SeqMisMatch; 054import org.biojava.nbio.structure.SeqMisMatchImpl; 055import org.biojava.nbio.structure.Site; 056import org.biojava.nbio.structure.Structure; 057import org.biojava.nbio.structure.StructureException; 058import org.biojava.nbio.structure.StructureImpl; 059import org.biojava.nbio.structure.StructureTools; 060import org.biojava.nbio.structure.io.BondMaker; 061import org.biojava.nbio.structure.io.ChargeAdder; 062import org.biojava.nbio.structure.io.EntityFinder; 063import org.biojava.nbio.structure.io.FileParsingParameters; 064import org.biojava.nbio.structure.io.SeqRes2AtomAligner; 065import org.biojava.nbio.structure.io.mmcif.model.AtomSite; 066import org.biojava.nbio.structure.io.mmcif.model.AtomSites; 067import org.biojava.nbio.structure.io.mmcif.model.AuditAuthor; 068import org.biojava.nbio.structure.io.mmcif.model.Cell; 069import org.biojava.nbio.structure.io.mmcif.model.ChemComp; 070import org.biojava.nbio.structure.io.mmcif.model.ChemCompAtom; 071import org.biojava.nbio.structure.io.mmcif.model.ChemCompBond; 072import org.biojava.nbio.structure.io.mmcif.model.ChemCompDescriptor; 073import org.biojava.nbio.structure.io.mmcif.model.DatabasePDBremark; 074import org.biojava.nbio.structure.io.mmcif.model.DatabasePDBrev; 075import org.biojava.nbio.structure.io.mmcif.model.DatabasePdbrevRecord; 076import org.biojava.nbio.structure.io.mmcif.model.Entity; 077import org.biojava.nbio.structure.io.mmcif.model.EntityPoly; 078import org.biojava.nbio.structure.io.mmcif.model.EntityPolySeq; 079import org.biojava.nbio.structure.io.mmcif.model.EntitySrcGen; 080import org.biojava.nbio.structure.io.mmcif.model.EntitySrcNat; 081import org.biojava.nbio.structure.io.mmcif.model.EntitySrcSyn; 082import org.biojava.nbio.structure.io.mmcif.model.Exptl; 083import org.biojava.nbio.structure.io.mmcif.model.PdbxAuditRevisionHistory; 084import org.biojava.nbio.structure.io.mmcif.model.PdbxChemCompDescriptor; 085import org.biojava.nbio.structure.io.mmcif.model.PdbxChemCompIdentifier; 086import org.biojava.nbio.structure.io.mmcif.model.PdbxDatabaseStatus; 087import org.biojava.nbio.structure.io.mmcif.model.PdbxEntityNonPoly; 088import org.biojava.nbio.structure.io.mmcif.model.PdbxNonPolyScheme; 089import org.biojava.nbio.structure.io.mmcif.model.PdbxPolySeqScheme; 090import org.biojava.nbio.structure.io.mmcif.model.PdbxStructAssembly; 091import org.biojava.nbio.structure.io.mmcif.model.PdbxStructAssemblyGen; 092import org.biojava.nbio.structure.io.mmcif.model.PdbxStructOperList; 093import org.biojava.nbio.structure.io.mmcif.model.Refine; 094import org.biojava.nbio.structure.io.mmcif.model.Struct; 095import org.biojava.nbio.structure.io.mmcif.model.StructAsym; 096import org.biojava.nbio.structure.io.mmcif.model.StructConn; 097import org.biojava.nbio.structure.io.mmcif.model.StructKeywords; 098import org.biojava.nbio.structure.io.mmcif.model.StructNcsOper; 099import org.biojava.nbio.structure.io.mmcif.model.StructRef; 100import org.biojava.nbio.structure.io.mmcif.model.StructRefSeq; 101import org.biojava.nbio.structure.io.mmcif.model.StructRefSeqDif; 102import org.biojava.nbio.structure.io.mmcif.model.StructSite; 103import org.biojava.nbio.structure.io.mmcif.model.StructSiteGen; 104import org.biojava.nbio.structure.io.mmcif.model.Symmetry; 105import org.biojava.nbio.structure.quaternary.BioAssemblyInfo; 106import org.biojava.nbio.structure.quaternary.BiologicalAssemblyBuilder; 107import org.biojava.nbio.structure.quaternary.BiologicalAssemblyTransformation; 108import org.biojava.nbio.structure.xtal.CrystalCell; 109import org.biojava.nbio.structure.xtal.SpaceGroup; 110import org.biojava.nbio.structure.xtal.SymoplibParser; 111import org.slf4j.Logger; 112import org.slf4j.LoggerFactory; 113 114/** 115 * A MMcifConsumer implementation that builds an in-memory representation of the 116 * content of a mmcif file as a BioJava Structure object. 117 * 118 * @author Andreas Prlic 119 * @since 1.7 120 */ 121 122public class SimpleMMcifConsumer implements MMcifConsumer { 123 124 private static final Logger logger = LoggerFactory.getLogger(SimpleMMcifConsumer.class); 125 126 private Structure structure; 127 private Chain currentChain; 128 private Group currentGroup; 129 130 /** 131 * A temporary data structure to hold all parsed chains 132 */ 133 private ArrayList<List<Chain>> allModels; 134 /** 135 * The current set of chains per model 136 */ 137 private List<Chain> currentModel; 138 private List<Entity> entities; 139 /** 140 * Needed in header only mode to get mapping between asym ids and author ids 141 */ 142 private List<EntityPoly> entityPolys; 143 private List<StructRef> strucRefs; 144 private List<Chain> seqResChains; 145 private List<Chain> entityChains; // needed to link entities, chains and compounds... 146 private List<StructAsym> structAsyms; // needed to link entities, chains and compounds... 147 private List<PdbxStructOperList> structOpers ; // 148 private List<PdbxStructAssembly> strucAssemblies; 149 private List<PdbxStructAssemblyGen> strucAssemblyGens; 150 private List<EntitySrcGen> entitySrcGens; 151 private List<EntitySrcNat> entitySrcNats; 152 private List<EntitySrcSyn> entitySrcSyns; 153 private List<StructConn> structConn; 154 private List<StructNcsOper> structNcsOper; 155 private List<StructRefSeqDif> sequenceDifs; 156 private List<StructSiteGen> structSiteGens; 157 158 private Matrix4d parsedScaleMatrix; 159 160 161 162 /** 163 * A map of asym ids (internal chain ids) to entity ids extracted from 164 * the _struct_asym category 165 */ 166 private Map<String,String> asymId2entityId; 167 168 /** 169 * A map of asym ids (internal chain ids) to author ids extracted from 170 * the _entity_poly category. Used in header only parsing. 171 */ 172 private Map<String,String> asymId2authorId; 173 174 private String currentNmrModelNumber ; 175 176 private FileParsingParameters params; 177 178 public SimpleMMcifConsumer(){ 179 params = new FileParsingParameters(); 180 documentStart(); 181 182 } 183 184 @Override 185 public void newEntity(Entity entity) { 186 logger.debug("New entity: {}",entity.toString()); 187 entities.add(entity); 188 } 189 190 @Override 191 public void newEntityPoly(EntityPoly entityPoly) { 192 entityPolys.add(entityPoly); 193 } 194 195 @Override 196 public void newPdbxStructOperList(PdbxStructOperList structOper){ 197 198 structOpers.add(structOper); 199 } 200 201 @Override 202 public void newStructAsym(StructAsym sasym){ 203 204 structAsyms.add(sasym); 205 } 206 207 private Entity getEntity(int entity_id){ 208 try { 209 for (Entity e: entities){ 210 int eId = Integer.parseInt(e.getId()); 211 if (eId== entity_id){ 212 return e; 213 } 214 } 215 } catch (NumberFormatException e) { 216 logger.warn("Entity id does not look like a number:", e.getMessage()); 217 } 218 return null; 219 } 220 221 @Override 222 public void newStructKeywords(StructKeywords kw){ 223 PDBHeader header = structure.getPDBHeader(); 224 if ( header == null) 225 header = new PDBHeader(); 226 header.setDescription(kw.getPdbx_keywords()); 227 header.setClassification(kw.getPdbx_keywords()); 228 } 229 230 @Override 231 public void setStruct(Struct struct) { 232 233 PDBHeader header = structure.getPDBHeader(); 234 if ( header == null) 235 header = new PDBHeader(); 236 237 header.setTitle(struct.getTitle()); 238 header.setIdCode(struct.getEntry_id()); 239 //header.setDescription(struct.getPdbx_descriptor()); 240 //header.setClassification(struct.getPdbx_descriptor()); 241 //header.setDescription(struct.getPdbx_descriptor()); 242 243 244 245 structure.setPDBHeader(header); 246 structure.setPDBCode(struct.getEntry_id()); 247 } 248 249 /** initiate new group, either Hetatom, Nucleotide, or AminoAcid */ 250 private Group getNewGroup(String recordName,Character aminoCode1, long seq_id,String groupCode3) { 251 252 Group g = ChemCompGroupFactory.getGroupFromChemCompDictionary(groupCode3); 253 if ( g != null && !g.getChemComp().isEmpty()) { 254 if ( g instanceof AminoAcidImpl) { 255 AminoAcidImpl aa = (AminoAcidImpl) g; 256 aa.setId(seq_id); 257 } else if ( g instanceof NucleotideImpl) { 258 NucleotideImpl nuc = (NucleotideImpl) g; 259 nuc.setId(seq_id); 260 } else if ( g instanceof HetatomImpl) { 261 HetatomImpl het = (HetatomImpl)g; 262 het.setId(seq_id); 263 } 264 return g; 265 } 266 267 268 269 Group group; 270 if ( recordName.equals("ATOM") ) { 271 if (StructureTools.isNucleotide(groupCode3)) { 272 // it is a nucleotide 273 NucleotideImpl nu = new NucleotideImpl(); 274 group = nu; 275 nu.setId(seq_id); 276 277 } else if (aminoCode1==null || aminoCode1 == StructureTools.UNKNOWN_GROUP_LABEL){ 278 HetatomImpl h = new HetatomImpl(); 279 h.setId(seq_id); 280 group = h; 281 282 } else { 283 AminoAcidImpl aa = new AminoAcidImpl() ; 284 aa.setAminoType(aminoCode1); 285 aa.setId(seq_id); 286 group = aa ; 287 } 288 } 289 else { 290 if (StructureTools.isNucleotide(groupCode3)) { 291 // it is a nucleotide 292 NucleotideImpl nu = new NucleotideImpl(); 293 group = nu; 294 nu.setId(seq_id); 295 } 296 else if (aminoCode1 != null ) { 297 AminoAcidImpl aa = new AminoAcidImpl() ; 298 aa.setAminoType(aminoCode1); 299 aa.setId(seq_id); 300 group = aa ; 301 } else { 302 HetatomImpl h = new HetatomImpl(); 303 h.setId(seq_id); 304 group = h; 305 } 306 } 307 return group ; 308 } 309 310 /** 311 * Test if the given asymId is already present in the list of chains given. If yes, returns the chain 312 * otherwise returns null. 313 */ 314 private static Chain isKnownChain(String asymId, List<Chain> chains){ 315 316 for (int i = 0; i< chains.size();i++){ 317 Chain testchain = chains.get(i); 318 //System.out.println("comparing chainID >"+chainID+"< against testchain " + i+" >" +testchain.getName()+"<"); 319 if (asymId.equals(testchain.getId())) { 320 //System.out.println("chain "+ chainID+" already known ..."); 321 return testchain; 322 } 323 } 324 325 return null; 326 } 327 328 @Override 329 public void newAtomSite(AtomSite atom) { 330 331 if (params.isHeaderOnly()) return; 332 333 // Warning: getLabel_asym_id is not the "chain id" in the PDB file 334 // it is the internally used chain id. 335 // later on we will fix this... 336 337 // later one needs to map the asym id to the pdb_strand_id 338 339 //TODO: add support for FileParsingParams.getMaxAtoms() 340 341 boolean startOfNewChain = false; 342 343 String asymId = atom.getLabel_asym_id(); 344 String authId = atom.getAuth_asym_id(); 345 346 String recordName = atom.getGroup_PDB(); 347 String residueNumberS = atom.getAuth_seq_id(); 348 Integer residueNrInt = Integer.parseInt(residueNumberS); 349 350 // the 3-letter name of the group: 351 String groupCode3 = atom.getLabel_comp_id(); 352 353 boolean isHetAtomInFile = false; 354 355 Character aminoCode1 = null; 356 if ( recordName.equals("ATOM") ) 357 aminoCode1 = StructureTools.get1LetterCodeAmino(groupCode3); 358 else { 359 aminoCode1 = StructureTools.get1LetterCodeAmino(groupCode3); 360 361 // for nucleotides this will be null.. 362 if (aminoCode1 != null && aminoCode1.equals(StructureTools.UNKNOWN_GROUP_LABEL)) 363 aminoCode1 = null; 364 365 isHetAtomInFile = true; 366 } 367 String insCodeS = atom.getPdbx_PDB_ins_code(); 368 Character insCode = null; 369 if (! insCodeS.equals("?")) { 370 insCode = insCodeS.charAt(0); 371 } 372 // we store the internal seq id in the Atom._id field 373 // this is not a PDB file field but we need this to internally assign the insertion codes later 374 // from the pdbx_poly_seq entries.. 375 376 long seq_id = -1; 377 try { 378 seq_id = Long.parseLong(atom.getLabel_seq_id()); 379 } catch (NumberFormatException e){ 380 // non polymer chains (ligands and small molecules) will have a label_seq_id set to '.', thus it is ok to 381 // silently ignore this 382 //logger.debug("Could not parse number for _atom_site.label_seq_id: "+e.getMessage()); 383 } 384 385 String nmrModelNumber = atom.getPdbx_PDB_model_num(); 386 387 if ( currentNmrModelNumber == null) { 388 currentNmrModelNumber = nmrModelNumber; 389 } 390 391 if (! currentNmrModelNumber.equals(nmrModelNumber)){ 392 currentNmrModelNumber = nmrModelNumber; 393 394 // add previous data 395 if ( currentChain != null ) { 396 currentChain.addGroup(currentGroup); 397 currentGroup.trimToSize(); 398 } 399 400 // we came to the beginning of a new NMR model 401 allModels.add(currentModel); 402 currentModel = new ArrayList<Chain>(); 403 currentChain = null; 404 currentGroup = null; 405 } 406 407 408 if (currentChain == null) { 409 410 currentChain = new ChainImpl(); 411 currentChain.setName(authId); 412 currentChain.setId(asymId); 413 currentModel.add(currentChain); 414 startOfNewChain = true; 415 } 416 417 //System.out.println("BEFORE: " + chain_id + " " + current_chain.getName()); 418 if ( ! asymId.equals(currentChain.getId()) ) { 419 //logger.info("unknown chain. creating new chain. authId:" + authId + " asymId: " + asymId); 420 startOfNewChain = true; 421 422 // end up old chain... 423 currentChain.addGroup(currentGroup); 424 425 // see if old chain is known ... 426 Chain testchain = isKnownChain(asymId,currentModel); 427 428 if ( testchain == null) { 429 //logger.info("unknown chain. creating new chain. authId:" + authId + " asymId: " + asymId); 430 431 currentChain = new ChainImpl(); 432 currentChain.setName(authId); 433 currentChain.setId(asymId); 434 435 } else { 436 currentChain = testchain; 437 } 438 439 if ( ! currentModel.contains(currentChain)) 440 currentModel.add(currentChain); 441 442 } 443 444 445 ResidueNumber residueNumber = new ResidueNumber(authId,residueNrInt, insCode); 446 447 if (currentGroup == null) { 448 449 450 currentGroup = getNewGroup(recordName,aminoCode1,seq_id, groupCode3); 451 452 currentGroup.setResidueNumber(residueNumber); 453 currentGroup.setPDBName(groupCode3); 454 currentGroup.setHetAtomInFile(isHetAtomInFile); 455 } 456 457 // SET UP THE ALT LOC GROUP 458 Group altGroup = null; 459 String altLocS = atom.getLabel_alt_id(); 460 Character altLoc = ' '; 461 if ( altLocS.length()>0) { 462 altLoc = altLocS.charAt(0); 463 if ( altLoc.equals('.') ) 464 altLoc = ' '; 465 466 } 467 // If it's the start of the new chain 468 if ( startOfNewChain){ 469 currentGroup = getNewGroup(recordName,aminoCode1,seq_id, groupCode3); 470 currentGroup.setResidueNumber(residueNumber); 471 currentGroup.setPDBName(groupCode3); 472 currentGroup.setHetAtomInFile(isHetAtomInFile); 473 } 474 // ANTHONY BRADLEY ADDED THIS -> WE ONLY WAN'T TO CHECK FOR ALT LOCS WHEN IT's NOT THE FIRST GROUP IN CHAIN 475 else{ 476 // check if residue number is the same ... 477 // insertion code is part of residue number 478 if ( ! residueNumber.equals(currentGroup.getResidueNumber())) { 479 //System.out.println("end of residue: "+current_group.getPDBCode()+" "+residueNrInt); 480 currentChain.addGroup(currentGroup); 481 currentGroup.trimToSize(); 482 currentGroup = getNewGroup(recordName,aminoCode1,seq_id,groupCode3); 483 currentGroup.setPDBName(groupCode3); 484 currentGroup.setResidueNumber(residueNumber); 485 currentGroup.setHetAtomInFile(isHetAtomInFile); 486 487 488 } else { 489 // same residueNumber, but altLocs... 490 // test altLoc 491 492 if ( ! altLoc.equals(' ') && ( ! altLoc.equals('.'))) { 493 logger.debug("found altLoc! " + altLoc + " " + currentGroup + " " + altGroup); 494 altGroup = getCorrectAltLocGroup( altLoc,recordName,aminoCode1,groupCode3, seq_id); 495 if (altGroup.getChain()==null) { 496 altGroup.setChain(currentChain); 497 } 498 } 499 } 500 } 501 //atomCount++; 502 //System.out.println("fixing atom name for >" + atom.getLabel_atom_id() + "< >" + fullname + "<"); 503 504 505 if ( params.isParseCAOnly() ){ 506 // yes , user wants to get CA only 507 // only parse CA atoms... 508 if (! (atom.getLabel_atom_id().equals(StructureTools.CA_ATOM_NAME) && atom.getType_symbol().equals("C"))) { 509 //System.out.println("ignoring " + line); 510 //atomCount--; 511 return; 512 } 513 } 514 515 //see if chain_id is one of the previous chains ... 516 517 Atom a = convertAtom(atom); 518 519 //see if chain_id is one of the previous chains ... 520 if ( altGroup != null) { 521 altGroup.addAtom(a); 522 altGroup = null; 523 } 524 else { 525 currentGroup.addAtom(a); 526 } 527 528 529 String atomName = a.getName(); 530 // make sure that main group has all atoms 531 // GitHub issue: #76 532 if ( ! currentGroup.hasAtom(atomName)) { 533 // Unless it's microheterogenity https://github.com/rcsb/codec-devel/issues/81 534 if (currentGroup.getPDBName().equals(a.getGroup().getPDBName())) { 535 if(!StructureTools.hasNonDeuteratedEquiv(a,currentGroup)){ 536 currentGroup.addAtom(a); 537 } 538 } 539 540 } 541 } 542 543 /** 544 * Convert a mmCIF AtomSite object to a BioJava Atom object 545 * 546 * @param atom the mmmcif AtomSite record 547 * @return an Atom 548 */ 549 private Atom convertAtom(AtomSite atom){ 550 551 552 Atom a = new AtomImpl(); 553 554 a.setPDBserial(Integer.parseInt(atom.getId())); 555 a.setName(atom.getLabel_atom_id()); 556 557 double x = Double.parseDouble (atom.getCartn_x()); 558 double y = Double.parseDouble (atom.getCartn_y()); 559 double z = Double.parseDouble (atom.getCartn_z()); 560 a.setX(x); 561 a.setY(y); 562 a.setZ(z); 563 564 float occupancy = Float.parseFloat (atom.getOccupancy()); 565 a.setOccupancy(occupancy); 566 567 float temp = Float.parseFloat (atom.getB_iso_or_equiv()); 568 a.setTempFactor(temp); 569 570 String alt = atom.getLabel_alt_id(); 571 if (( alt != null ) && ( alt.length() > 0) && (! alt.equals("."))){ 572 a.setAltLoc(new Character(alt.charAt(0))); 573 } else { 574 a.setAltLoc(new Character(' ')); 575 } 576 577 Element element = Element.R; 578 try { 579 element = Element.valueOfIgnoreCase(atom.getType_symbol()); 580 } catch (IllegalArgumentException e) { 581 logger.info("Element {} was not recognised as a BioJava-known element, the element will be represented as the generic element {}", atom.getType_symbol(), Element.R.name()); 582 } 583 a.setElement(element); 584 585 return a; 586 587 } 588 589 590 private Group getCorrectAltLocGroup( Character altLoc, 591 String recordName, 592 Character aminoCode1, 593 String groupCode3, 594 long seq_id) { 595 596 // see if we know this altLoc already; 597 List<Atom> atoms = currentGroup.getAtoms(); 598 if ( atoms.size() > 0) { 599 Atom a1 = atoms.get(0); 600 // we are just adding atoms to the current group 601 // probably there is a second group following later... 602 if (a1.getAltLoc().equals(altLoc)) { 603 604 return currentGroup; 605 } 606 } 607 608 List<Group> altLocs = currentGroup.getAltLocs(); 609 for ( Group altLocG : altLocs ){ 610 atoms = altLocG.getAtoms(); 611 if ( atoms.size() > 0) { 612 for ( Atom a1 : atoms) { 613 if (a1.getAltLoc().equals( altLoc)) { 614 615 return altLocG; 616 } 617 } 618 } 619 } 620 621 // no matching altLoc group found. 622 // build it up. 623 624 if ( groupCode3.equals(currentGroup.getPDBName())) { 625 if ( currentGroup.getAtoms().size() == 0) { 626 //System.out.println("current group is empty " + current_group + " " + altLoc); 627 return currentGroup; 628 } 629 //System.out.println("cloning current group " + current_group + " " + current_group.getAtoms().get(0).getAltLoc() + " altLoc " + altLoc); 630 Group altLocG = (Group) currentGroup.clone(); 631 // drop atoms from cloned group... 632 // https://redmine.open-bio.org/issues/3307 633 altLocG.setAtoms(new ArrayList<Atom>()); 634 altLocG.getAltLocs().clear(); 635 currentGroup.addAltLoc(altLocG); 636 return altLocG; 637 } 638 639 // System.out.println("new group " + recordName + " " + aminoCode1 + " " +groupCode3); 640 //String recordName,Character aminoCode1, long seq_id,String groupCode3) { 641 Group altLocG = getNewGroup(recordName,aminoCode1,seq_id,groupCode3); 642 643 altLocG.setPDBName(groupCode3); 644 altLocG.setResidueNumber(currentGroup.getResidueNumber()); 645 currentGroup.addAltLoc(altLocG); 646 return altLocG; 647 } 648 649 /** 650 * Start the parsing 651 */ 652 @Override 653 public void documentStart() { 654 structure = new StructureImpl(); 655 656 currentChain = null; 657 currentGroup = null; 658 currentNmrModelNumber = null; 659 //atomCount = 0; 660 661 allModels = new ArrayList<List<Chain>>(); 662 currentModel = new ArrayList<Chain>(); 663 entities = new ArrayList<Entity>(); 664 entityPolys = new ArrayList<>(); 665 strucRefs = new ArrayList<StructRef>(); 666 seqResChains = new ArrayList<Chain>(); 667 entityChains = new ArrayList<Chain>(); 668 structAsyms = new ArrayList<StructAsym>(); 669 670 asymId2entityId = new HashMap<String,String>(); 671 asymId2authorId = new HashMap<>(); 672 structOpers = new ArrayList<PdbxStructOperList>(); 673 strucAssemblies = new ArrayList<PdbxStructAssembly>(); 674 strucAssemblyGens = new ArrayList<PdbxStructAssemblyGen>(); 675 entitySrcGens = new ArrayList<EntitySrcGen>(); 676 entitySrcNats = new ArrayList<EntitySrcNat>(); 677 entitySrcSyns = new ArrayList<EntitySrcSyn>(); 678 structConn = new ArrayList<StructConn>(); 679 structNcsOper = new ArrayList<StructNcsOper>(); 680 sequenceDifs = new ArrayList<StructRefSeqDif>(); 681 structSiteGens = new ArrayList<StructSiteGen>(); 682 } 683 684 685 @Override 686 public void documentEnd() { 687 688 // Expected that there is one current_chain that needs to be added to the model 689 // When in headerOnly mode, no Atoms are read, and there will not be an active 690 // current_chain. 691 if ( currentChain != null ) { 692 693 currentChain.addGroup(currentGroup); 694 if (isKnownChain(currentChain.getId(),currentModel) == null) { 695 currentModel.add(currentChain); 696 } 697 } else if (!params.isHeaderOnly()){ 698 logger.warn("current chain is null at end of document."); 699 } 700 701 allModels.add(currentModel); 702 703 // this populates the asymId2authorId and asymId2entityId maps, needed in header only mode to get the mapping 704 // between the 2 chain identifiers. 705 initMaps(); 706 707 for (StructAsym asym : structAsyms) { 708 709 logger.debug("Entity {} matches asym_id: {}", asym.getEntity_id(), asym.getId() ); 710 711 Chain s = getEntityChain(asym.getEntity_id()); 712 Chain seqres = (Chain)s.clone(); 713 // to solve issue #160 (e.g. 3u7t) 714 seqres = removeSeqResHeterogeneity(seqres); 715 seqres.setId(asym.getId()); 716 if (asymId2authorId.get(asym.getId()) !=null ){ 717 seqres.setName(asymId2authorId.get(asym.getId())); 718 } else { 719 seqres.setName(asym.getId()); 720 } 721 722 EntityType type = null; 723 try { 724 Entity ent = getEntity(Integer.parseInt(asym.getEntity_id())); 725 type = EntityType.entityTypeFromString(ent.getType()); 726 } catch (NumberFormatException e) { 727 logger.debug("Could not parse integer from entity id field {}", asym.getEntity_id()); 728 } 729 730 // we'll only add seqres chains that are polymeric or unknown 731 if (type==null || type==EntityType.POLYMER ) { 732 seqResChains.add(seqres); 733 } 734 735 logger.debug(" seqres: " + asym.getId() + " " + seqres + "<") ; 736 // adding the entities to structure 737 addEntities(asym); 738 739 } 740 741 if (structAsyms.isEmpty()) { 742 logger.warn("No _struct_asym category in file, no SEQRES groups will be added."); 743 } 744 745 // entities 746 // In addEntities above we created the entities if they were present in the file 747 // Now we need to make sure that they are linked to chains and also that if they are not present in the file we need to add them now 748 linkEntities(); 749 750 // now that we know the entities, we can add all chains to structure so that they are stored 751 // properly as polymer/nonpolymer/water chains inside structure 752 for (List<Chain> model:allModels) { 753 structure.addModel(model); 754 } 755 756 // Only align if requested (default) and not when headerOnly mode with no Atoms. 757 // Otherwise, we store the empty SeqRes Groups unchanged in the right chains. 758 if ( params.isAlignSeqRes() && !params.isHeaderOnly() ){ 759 logger.debug("Parsing mode align_seqres, will parse SEQRES and align to ATOM sequence"); 760 alignSeqRes(); 761 } else { 762 logger.debug("Parsing mode unalign_seqres, will parse SEQRES but not align it to ATOM sequence"); 763 SeqRes2AtomAligner.storeUnAlignedSeqRes(structure, seqResChains, params.isHeaderOnly()); 764 } 765 766 767 // Now make sure all altlocgroups have all the atoms in all the groups 768 StructureTools.cleanUpAltLocs(structure); 769 770 // NOTE bonds and charges can only be done at this point that the chain id mapping is properly sorted out 771 if (!params.isHeaderOnly()) { 772 if ( params.shouldCreateAtomBonds()) { 773 addBonds(); 774 } 775 776 if ( params.shouldCreateAtomCharges()) { 777 addCharges(); 778 } 779 } 780 781 if (!params.isHeaderOnly()) { 782 783 // Do structure.setSites(sites) after any chain renaming to be like PDB. 784 addSites(); 785 } 786 787 788 789 // set the oligomeric state info in the header... 790 if (params.isParseBioAssembly()) { 791 792 // the more detailed mapping of chains to rotation operations happens in StructureIO... 793 794 Map<Integer,BioAssemblyInfo> bioAssemblies = new LinkedHashMap<Integer, BioAssemblyInfo>(); 795 796 for ( PdbxStructAssembly psa : strucAssemblies){ 797 798 List<PdbxStructAssemblyGen> psags = new ArrayList<PdbxStructAssemblyGen>(1); 799 800 for ( PdbxStructAssemblyGen psag: strucAssemblyGens ) { 801 if ( psag.getAssembly_id().equals(psa.getId())) { 802 psags.add(psag); 803 } 804 } 805 806 BiologicalAssemblyBuilder builder = new BiologicalAssemblyBuilder(); 807 808 // these are the transformations that need to be applied to our model 809 List<BiologicalAssemblyTransformation> transformations = builder.getBioUnitTransformationList(psa, psags, structOpers); 810 811 int bioAssemblyId = -1; 812 try { 813 bioAssemblyId = Integer.parseInt(psa.getId()); 814 } catch (NumberFormatException e) { 815 logger.info("Could not parse a numerical bio assembly id from '{}'",psa.getId()); 816 } 817 818 // if bioassembly id is not numerical we throw it away 819 // this happens usually for viral capsid entries, like 1ei7 820 // see issue #230 in github 821 if (bioAssemblyId!=-1) { 822 int mmSize = 0; 823 // note that the transforms contain asym ids of both polymers and non-polymers 824 // For the mmsize, we are only interested in the polymers 825 for (BiologicalAssemblyTransformation transf:transformations) { 826 Chain c = structure.getChain(transf.getChainId()); 827 if (c==null) { 828 logger.info("Could not find asym id {} specified in struct_assembly_gen", transf.getChainId()); 829 continue; 830 } 831 if (c.getEntityType() == EntityType.POLYMER && 832 // for entries like 4kro, sugars are annotated as polymers but we 833 // don't want them in the macromolecularSize count 834 !c.getEntityInfo().getDescription().contains("SUGAR") ) { 835 836 mmSize++; 837 } 838 } 839 840 BioAssemblyInfo bioAssembly = new BioAssemblyInfo(); 841 bioAssembly.setId(bioAssemblyId); 842 bioAssembly.setMacromolecularSize(mmSize); 843 bioAssembly.setTransforms(transformations); 844 bioAssemblies.put(bioAssemblyId,bioAssembly); 845 } 846 847 } 848 structure.getPDBHeader().setBioAssemblies(bioAssemblies); 849 } 850 851 setStructNcsOps(); 852 853 setCrystallographicInfoMetadata(); 854 855 856 Map<String,List<SeqMisMatch>> misMatchMap = new HashMap<String, List<SeqMisMatch>>(); 857 for (StructRefSeqDif sdif : sequenceDifs) { 858 SeqMisMatch misMatch = new SeqMisMatchImpl(); 859 misMatch.setDetails(sdif.getDetails()); 860 861 String insCode = sdif.getPdbx_pdb_ins_code(); 862 if ( insCode != null && insCode.equals("?")) 863 insCode = null; 864 misMatch.setInsCode(insCode); 865 misMatch.setOrigGroup(sdif.getDb_mon_id()); 866 misMatch.setPdbGroup(sdif.getMon_id()); 867 misMatch.setPdbResNum(sdif.getPdbx_auth_seq_num()); 868 misMatch.setUniProtId(sdif.getPdbx_seq_db_accession_code()); 869 misMatch.setSeqNum(sdif.getSeq_num()); 870 871 872 List<SeqMisMatch> mms = misMatchMap.get(sdif.getPdbx_pdb_strand_id()); 873 if ( mms == null) { 874 mms = new ArrayList<SeqMisMatch>(); 875 misMatchMap.put(sdif.getPdbx_pdb_strand_id(),mms); 876 } 877 mms.add(misMatch); 878 879 } 880 881 for (String chainId : misMatchMap.keySet()){ 882 883 Chain chain = structure.getPolyChainByPDB(chainId); 884 885 if ( chain == null) { 886 logger.warn("Could not set mismatches for chain with author id" + chainId); 887 continue; 888 } 889 890 chain.setSeqMisMatches(misMatchMap.get(chainId)); 891 892 893 } 894 895 } 896 897 /** 898 * Here we link entities to chains. 899 * Also if entities are not present in file, this initialises the entities with some heuristics, see {@link org.biojava.nbio.structure.io.EntityFinder} 900 */ 901 private void linkEntities() { 902 903 for (int i =0; i< allModels.size() ; i++){ 904 for (Chain chain : allModels.get(i)) { 905 //logger.info("linking entities for " + chain.getId() + " " + chain.getName()); 906 String entityId = asymId2entityId.get(chain.getId()); 907 908 if (entityId==null) { 909 // this can happen for instance if the cif file didn't have _struct_asym category at all 910 // and thus we have no asymId2entityId mapping at all 911 logger.info("No entity id could be found for chain {}", chain.getId()); 912 continue; 913 } 914 int eId = Integer.parseInt(entityId); 915 916 // Entities are not added for non-polymeric entities, if a chain is non-polymeric its entity won't be found. 917 // TODO: add all entities and unique compounds and add methods to directly get polymer or non-polymer 918 // asyms (chains). Either create a unique StructureImpl or modify existing for a better representation of the 919 // mmCIF internal data structures but is compatible with Structure interface. 920 // Some examples of PDB entries with this kind of problem: 921 // - 2uub: asym_id X, chainName Z, entity_id 24: fully non-polymeric but still with its own chainName 922 // - 3o6j: asym_id K, chainName Z, entity_id 6 : a single water molecule 923 // - 1dz9: asym_id K, chainName K, entity_id 6 : a potassium ion alone 924 925 EntityInfo entityInfo = structure.getEntityById(eId); 926 if (entityInfo==null) { 927 // Supports the case where the only chain members were from non-polymeric entity that is missing. 928 // Solved by creating a new Compound(entity) to which this chain will belong. 929 logger.info("Could not find an Entity for entity_id {}, for chain id {}, creating a new Entity.", 930 eId, chain.getId()); 931 entityInfo = new EntityInfo(); 932 entityInfo.setMolId(eId); 933 entityInfo.addChain(chain); 934 if (chain.isWaterOnly()) { 935 entityInfo.setType(EntityType.WATER); 936 } else { 937 entityInfo.setType(EntityType.NONPOLYMER); 938 } 939 chain.setEntityInfo(entityInfo); 940 structure.addEntityInfo(entityInfo); 941 } else { 942 logger.debug("Adding chain with chain id {} (auth id {}) to Entity with entity_id {}", 943 chain.getId(), chain.getName(), eId); 944 entityInfo.addChain(chain); 945 chain.setEntityInfo(entityInfo); 946 } 947 948 } 949 950 } 951 952 // if no entity information was present in file we then go and find the entities heuristically with EntityFinder 953 List<EntityInfo> entityInfos = structure.getEntityInfos(); 954 if (entityInfos==null || entityInfos.isEmpty()) { 955 956 List<List<Chain>> polyModels = new ArrayList<>(); 957 List<List<Chain>> nonPolyModels = new ArrayList<>(); 958 List<List<Chain>> waterModels = new ArrayList<>(); 959 960 for (List<Chain> model:allModels) { 961 962 List<Chain> polyChains = new ArrayList<>(); 963 List<Chain> nonPolyChains = new ArrayList<>(); 964 List<Chain> waterChains = new ArrayList<>(); 965 966 polyModels.add(polyChains); 967 nonPolyModels.add(nonPolyChains); 968 waterModels.add(waterChains); 969 970 for (Chain c:model) { 971 972 // we only have entities for polymeric chains, all others are ignored for assigning entities 973 if (c.isWaterOnly()) { 974 waterChains.add(c); 975 976 } else if (c.isPureNonPolymer()) { 977 nonPolyChains.add(c); 978 979 } else { 980 polyChains.add(c); 981 } 982 } 983 } 984 985 entityInfos = EntityFinder.findPolyEntities(polyModels); 986 EntityFinder.createPurelyNonPolyEntities(nonPolyModels, waterModels, entityInfos); 987 988 989 structure.setEntityInfos(entityInfos); 990 } 991 992 // final sanity check: it can happen that from the annotated entities some are not linked to any chains 993 // e.g. 3s26: a sugar entity does not have any chains associated to it (it seems to be happening with many sugar compounds) 994 // we simply log it, this can sign some other problems if the entities are used down the line 995 for (EntityInfo e:entityInfos) { 996 if (e.getChains().isEmpty()) { 997 logger.info("Entity {} '{}' has no chains associated to it", 998 e.getMolId()<0?"with no entity id":e.getMolId(), e.getDescription()); 999 } 1000 } 1001 1002 } 1003 1004 private void addCharges() { 1005 ChargeAdder.addCharges(structure); 1006 } 1007 1008 /** 1009 * The method will return a new reference to a Chain with any consecutive groups 1010 * having same residue numbers removed. 1011 * This is necessary to solve the microheterogeneity issue in entries like 3u7t (see github issue #160) 1012 * @param c 1013 * @return 1014 */ 1015 private static Chain removeSeqResHeterogeneity(Chain c) { 1016 1017 Chain trimmedChain = new ChainImpl(); 1018 1019 ResidueNumber lastResNum = null; 1020 1021 for (Group g:c.getAtomGroups()) { 1022 1023 // note we have to deep copy this, otherwise they stay linked and would get altered in addGroup(g) 1024 ResidueNumber currentResNum = new ResidueNumber( 1025 g.getResidueNumber().getChainName(), 1026 g.getResidueNumber().getSeqNum(), 1027 g.getResidueNumber().getInsCode()); 1028 1029 if (lastResNum == null || !lastResNum.equals(currentResNum) ) { 1030 trimmedChain.addGroup(g); 1031 } else { 1032 logger.debug("Removing seqres group because it seems to be repeated in entity_poly_seq, most likely has hetero='y': "+g); 1033 } 1034 1035 lastResNum = currentResNum; 1036 1037 } 1038 return trimmedChain; 1039 } 1040 1041 private void addBonds() { 1042 BondMaker maker = new BondMaker(structure, params); 1043 maker.makeBonds(); 1044 maker.formBondsFromStructConn(structConn); 1045 } 1046 1047 private void alignSeqRes() { 1048 1049 logger.debug("Parsing mode align_seqres, will align to ATOM to SEQRES sequence"); 1050 1051 // fix SEQRES residue numbering for all models 1052 1053 for (int model=0;model<structure.nrModels();model++) { 1054 1055 List<Chain> atomList = structure.getModel(model); 1056 1057 for (Chain seqResChain: seqResChains){ 1058 1059 // this extracts the matching atom chain from atomList 1060 Chain atomChain = SeqRes2AtomAligner.getMatchingAtomRes(seqResChain, atomList, true); 1061 1062 if (atomChain == null) { 1063 // most likely there's no observed residues at all for the seqres chain: can't map 1064 // e.g. 3zyb: chains with asym_id L,M,N,O,P have no observed residues 1065 logger.info("Could not map SEQRES chain with asym_id={} to any ATOM chain. Most likely there's no observed residues in the chain.", 1066 seqResChain.getId()); 1067 continue; 1068 } 1069 1070 //map the atoms to the seqres... 1071 1072 // we need to first clone the seqres so that they stay independent for different models 1073 List<Group> seqResGroups = new ArrayList<Group>(); 1074 for (int i=0;i<seqResChain.getAtomGroups().size();i++) { 1075 seqResGroups.add((Group)seqResChain.getAtomGroups().get(i).clone()); 1076 } 1077 1078 for ( int seqResPos = 0 ; seqResPos < seqResGroups.size(); seqResPos++) { 1079 Group seqresG = seqResGroups.get(seqResPos); 1080 boolean found = false; 1081 for ( Group atomG: atomChain.getAtomGroups()) { 1082 1083 int internalNr = getInternalNr (atomG); 1084 1085 if (seqresG.getResidueNumber().getSeqNum() == internalNr ) { 1086 seqResGroups.set(seqResPos, atomG); 1087 found = true; 1088 break; 1089 } 1090 1091 1092 } 1093 if ( ! found) 1094 // so far the residue number has tracked internal numbering. 1095 // however there are no atom records, as such this can't be a PDB residue number... 1096 seqresG.setResidueNumber(null); 1097 } 1098 atomChain.setSeqResGroups(seqResGroups); 1099 1100 } 1101 } 1102 } 1103 1104 private int getInternalNr(Group atomG) { 1105 if ( atomG.getType().equals(GroupType.AMINOACID)) { 1106 AminoAcidImpl aa = (AminoAcidImpl) atomG; 1107 return new Long(aa.getId()).intValue(); 1108 } else if ( atomG.getType().equals(GroupType.NUCLEOTIDE)) { 1109 NucleotideImpl nu = (NucleotideImpl) atomG; 1110 return new Long(nu.getId()).intValue(); 1111 } else { 1112 HetatomImpl he = (HetatomImpl) atomG; 1113 return new Long(he.getId()).intValue(); 1114 } 1115 } 1116 1117 private void addEntities(StructAsym asym) { 1118 int eId = 0; 1119 try { 1120 eId = Integer.parseInt(asym.getEntity_id()); 1121 } catch (NumberFormatException e) { 1122 logger.warn("Could not parse mol_id from string {}. Will use 0 for creating Entity",asym.getEntity_id()); 1123 } 1124 Entity e = getEntity(eId); 1125 1126 // for some mmCIF files like 1yrm all 3 of _entity_src_gen, _entity_src_nat and _pdbx_entity_src_syn are missing 1127 // we need to fill the Compounds in some other way: 1128 1129 EntityInfo entityInfo = structure.getEntityById(eId); 1130 1131 if (entityInfo==null) { 1132 //logger.info("Creating new EntityInfo " + eId + " " + e.getId() + " " + e.getPdbx_description()); 1133 entityInfo = new EntityInfo(); 1134 entityInfo.setMolId(eId); 1135 // we only add the compound if a polymeric one (to match what the PDB parser does) 1136 if (e!=null) { 1137 entityInfo.setDescription(e.getPdbx_description()); 1138 1139 EntityType eType = EntityType.entityTypeFromString(e.getType()); 1140 if (eType!=null) { 1141 entityInfo.setType(eType); 1142 } else { 1143 logger.warn("Type '{}' is not recognised as a valid entity type for entity {}", e.getType(), eId); 1144 } 1145 addAncilliaryEntityData(asym, eId, e, entityInfo); 1146 structure.addEntityInfo(entityInfo); 1147 logger.debug("Adding Entity with entity id {} from _entity, with name: {}",eId, entityInfo.getDescription()); 1148 } 1149 } 1150 } 1151 1152 1153 /** 1154 * Add any extra information to the entity information. 1155 * @param asym 1156 * @param entityId 1157 * @param entity 1158 * @param entityInfo 1159 */ 1160 private void addAncilliaryEntityData(StructAsym asym, int entityId, Entity entity, EntityInfo entityInfo) { 1161 // Loop through each of the entity types and add the corresponding data 1162 // We're assuming if data is duplicated between sources it is consistent 1163 // This is a potentially huge assumption... 1164 1165 1166 for (EntitySrcGen esg : entitySrcGens) { 1167 1168 if (! esg.getEntity_id().equals(asym.getEntity_id())) 1169 continue; 1170 1171 addInformationFromESG(esg, entityId, entityInfo); 1172 1173 } 1174 1175 for (EntitySrcNat esn : entitySrcNats) { 1176 if (! esn.getEntity_id().equals(asym.getEntity_id())) 1177 continue; 1178 addInformationFromESN(esn, entityId, entityInfo); 1179 1180 } 1181 1182 for (EntitySrcSyn ess : entitySrcSyns) { 1183 if (! ess.getEntity_id().equals(asym.getEntity_id())) 1184 continue; 1185 addInfoFromESS(ess, entityId, entityInfo); 1186 1187 } 1188 } 1189 1190 /** 1191 * Add the information from an ESG to a compound. 1192 * @param entitySrcInfo 1193 * @param entityId 1194 * @param c 1195 */ 1196 private void addInformationFromESG(EntitySrcGen entitySrcInfo, int entityId, EntityInfo c) { 1197 c.setAtcc(entitySrcInfo.getPdbx_gene_src_atcc()); 1198 c.setCell(entitySrcInfo.getPdbx_gene_src_cell()); 1199 c.setOrganismCommon(entitySrcInfo.getGene_src_common_name()); 1200 c.setOrganismScientific(entitySrcInfo.getPdbx_gene_src_scientific_name()); 1201 c.setOrganismTaxId(entitySrcInfo.getPdbx_gene_src_ncbi_taxonomy_id()); 1202 c.setExpressionSystemTaxId(entitySrcInfo.getPdbx_host_org_ncbi_taxonomy_id()); 1203 c.setExpressionSystem(entitySrcInfo.getPdbx_host_org_scientific_name()); 1204 } 1205 1206 /** 1207 * Add the information to entity info from ESN. 1208 * @param esn 1209 * @param eId 1210 * @param c 1211 */ 1212 private void addInformationFromESN(EntitySrcNat esn, int eId, EntityInfo c) { 1213 1214 c.setAtcc(esn.getPdbx_atcc()); 1215 c.setCell(esn.getPdbx_cell()); 1216 c.setOrganismCommon(esn.getCommon_name()); 1217 c.setOrganismScientific(esn.getPdbx_organism_scientific()); 1218 c.setOrganismTaxId(esn.getPdbx_ncbi_taxonomy_id()); 1219 1220 } 1221 /** 1222 * Add the information from ESS to Entity info. 1223 * @param ess 1224 * @param eId 1225 * @param c 1226 */ 1227 private void addInfoFromESS(EntitySrcSyn ess, int eId, EntityInfo c) { 1228 c.setOrganismCommon(ess.getOrganism_common_name()); 1229 c.setOrganismScientific(ess.getOrganism_scientific()); 1230 c.setOrganismTaxId(ess.getNcbi_taxonomy_id()); 1231 1232 } 1233 1234 private void initMaps() { 1235 1236 1237 if (structAsyms == null || structAsyms.isEmpty()) { 1238 logger.info("No _struct_asym category found in file. No asym id to entity_id mapping will be available"); 1239 return; 1240 } 1241 1242 Map<String, List<String>> entityId2asymId = new HashMap<>(); 1243 1244 for (StructAsym asym : structAsyms) { 1245 1246 logger.debug("Entity {} matches asym_id: {}", asym.getEntity_id(), asym.getId() ); 1247 1248 asymId2entityId.put(asym.getId(), asym.getEntity_id()); 1249 1250 if (entityId2asymId.containsKey(asym.getEntity_id())) { 1251 List<String> asymIds = entityId2asymId.get(asym.getEntity_id()); 1252 asymIds.add(asym.getId()); 1253 } else { 1254 List<String> asymIds = new ArrayList<>(); 1255 asymIds.add(asym.getId()); 1256 entityId2asymId.put(asym.getEntity_id(), asymIds); 1257 } 1258 } 1259 1260 if (entityPolys==null || entityPolys.isEmpty()) { 1261 logger.info("No _entity_poly category found in file. No asym id to author id mapping will be available for header only parsing"); 1262 return; 1263 } 1264 1265 for (EntityPoly ep:entityPolys) { 1266 if (ep.getPdbx_strand_id()==null) { 1267 logger.info("_entity_poly.pdbx_strand_id is null for entity {}. Won't be able to map asym ids to author ids for this entity.", ep.getEntity_id()); 1268 continue; 1269 } 1270 String[] chainNames = ep.getPdbx_strand_id().split(","); 1271 List<String> asymIds = entityId2asymId.get(ep.getEntity_id()); 1272 if (chainNames.length!=asymIds.size()) { 1273 logger.warn("The list of asym ids (from _struct_asym) and the list of author ids (from _entity_poly) for entity {} have different lengths! Can't provide a mapping from asym ids to author chain ids", ep.getEntity_id()); 1274 continue; 1275 } 1276 for (int i=0; i<chainNames.length; i++) { 1277 asymId2authorId.put(asymIds.get(i), chainNames[i]); 1278 } 1279 } 1280 } 1281 1282 private void setStructNcsOps() { 1283 1284 ArrayList<Matrix4d> ncsOperators = new ArrayList<Matrix4d>(); 1285 1286 for (StructNcsOper sNcsOper:structNcsOper) { 1287 1288 if (!sNcsOper.getCode().equals("generate")) continue; 1289 1290 try { 1291 Matrix4d op = new Matrix4d(); 1292 op.setElement(3, 0, 0.0); 1293 op.setElement(3, 1, 0.0); 1294 op.setElement(3, 2, 0.0); 1295 op.setElement(3, 3, 1.0); 1296 1297 1298 op.setElement(0, 0, Double.parseDouble(sNcsOper.getMatrix11())); 1299 op.setElement(0, 1, Double.parseDouble(sNcsOper.getMatrix12())); 1300 op.setElement(0, 2, Double.parseDouble(sNcsOper.getMatrix13())); 1301 1302 op.setElement(1, 0, Double.parseDouble(sNcsOper.getMatrix21())); 1303 op.setElement(1, 1, Double.parseDouble(sNcsOper.getMatrix22())); 1304 op.setElement(1, 2, Double.parseDouble(sNcsOper.getMatrix23())); 1305 1306 op.setElement(2, 0, Double.parseDouble(sNcsOper.getMatrix31())); 1307 op.setElement(2, 1, Double.parseDouble(sNcsOper.getMatrix32())); 1308 op.setElement(2, 2, Double.parseDouble(sNcsOper.getMatrix33())); 1309 1310 op.setElement(0, 3, Double.parseDouble(sNcsOper.getVector1())); 1311 op.setElement(1, 3, Double.parseDouble(sNcsOper.getVector2())); 1312 op.setElement(2, 3, Double.parseDouble(sNcsOper.getVector3())); 1313 1314 ncsOperators.add(op); 1315 1316 } catch (NumberFormatException e) { 1317 logger.warn("Error parsing doubles in NCS operator list, skipping operator {}", structNcsOper.indexOf(sNcsOper)+1); 1318 } 1319 1320 } 1321 1322 // we only set it if not empty, otherwise remains null 1323 if (ncsOperators.size()>0) { 1324 structure.getCrystallographicInfo().setNcsOperators( 1325 ncsOperators.toArray(new Matrix4d[ncsOperators.size()])); 1326 } 1327 } 1328 1329 private void setCrystallographicInfoMetadata() { 1330 if (parsedScaleMatrix!=null) { 1331 1332 PDBCrystallographicInfo crystalInfo = structure.getCrystallographicInfo(); 1333 1334 boolean nonStd = false; 1335 if (crystalInfo.getCrystalCell()!=null && !crystalInfo.getCrystalCell().checkScaleMatrix(parsedScaleMatrix)) { 1336 nonStd = true; 1337 } 1338 1339 crystalInfo.setNonStandardCoordFrameConvention(nonStd); 1340 } 1341 } 1342 1343 1344 /** This method will return the parsed protein structure, once the parsing has been finished 1345 * 1346 * @return a BioJava protein structure object 1347 */ 1348 public Structure getStructure() { 1349 1350 return structure; 1351 } 1352 1353 @Override 1354 public void newDatabasePDBrevRecord(DatabasePdbrevRecord record) { 1355 1356 PDBHeader header = structure.getPDBHeader(); 1357 1358 if ( header == null) { 1359 header = new PDBHeader(); 1360 structure.setPDBHeader(header); 1361 } 1362 1363 List<DatabasePdbrevRecord> revRecords = header.getRevisionRecords(); 1364 if ( revRecords == null) { 1365 revRecords = new ArrayList<DatabasePdbrevRecord>(); 1366 header.setRevisionRecords(revRecords); 1367 } 1368 revRecords.add(record); 1369 1370 1371 } 1372 1373 1374 @Override 1375 public void newDatabasePDBrev(DatabasePDBrev dbrev) { 1376 1377 logger.debug("got a database revision:" + dbrev); 1378 1379 SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd",Locale.US); 1380 PDBHeader header = structure.getPDBHeader(); 1381 1382 if ( header == null) { 1383 header = new PDBHeader(); 1384 } 1385 1386 if (dbrev.getNum().equals("1")){ 1387 1388 try { 1389 Date dep = dateFormat.parse(dbrev.getDate_original()); 1390 header.setDepDate(dep); 1391 1392 } catch (ParseException e){ 1393 logger.warn("Could not parse date string '{}', deposition date will be unavailable", dbrev.getDate_original()); 1394 } 1395 1396 try { 1397 Date rel = dateFormat.parse(dbrev.getDate()); 1398 header.setRelDate(rel); 1399 1400 } catch (ParseException e){ 1401 logger.warn("Could not parse date string '{}', modification date will be unavailable", dbrev.getDate()); 1402 } 1403 1404 1405 } else { 1406 try { 1407 1408 Date mod = dateFormat.parse(dbrev.getDate()); 1409 header.setModDate(mod); 1410 1411 } catch (ParseException e){ 1412 logger.warn("Could not parse date string '{}', modification date will be unavailable", dbrev.getDate()); 1413 } 1414 } 1415 1416 structure.setPDBHeader(header); 1417 } 1418 1419 @Override 1420 public void newPdbxAuditRevisionHistory(PdbxAuditRevisionHistory history) { 1421 1422 SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd",Locale.US); 1423 PDBHeader header = structure.getPDBHeader(); 1424 1425 if ( header == null) { 1426 header = new PDBHeader(); 1427 } 1428 1429 // first entry in revision history is the release date 1430 if (history.getOrdinal().equals("1")){ 1431 try { 1432 Date releaseDate = dateFormat.parse(history.getRevision_date()); 1433 header.setRelDate(releaseDate); 1434 1435 } catch (ParseException e){ 1436 logger.warn("Could not parse date string '{}', release date will be unavailable", history.getRevision_date()); 1437 } 1438 } else { 1439 // all other dates are revision dates; 1440 // since this method may be called multiple times, 1441 // the last revision date will "stick" 1442 try { 1443 Date revisionDate = dateFormat.parse(history.getRevision_date()); 1444 header.setModDate(revisionDate); 1445 } catch (ParseException e){ 1446 logger.warn("Could not parse date string '{}', revision date will be unavailable", history.getRevision_date()); 1447 } 1448 } 1449 1450 structure.setPDBHeader(header); 1451 } 1452 1453 @Override 1454 public void newPdbxDatabaseStatus(PdbxDatabaseStatus status) { 1455 1456 // the deposition date field is only available in mmCIF 5.0 1457 1458 if (status.getRecvd_initial_deposition_date() == null) { 1459 // skip this method for older mmCIF versions 1460 return; 1461 } 1462 1463 SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd",Locale.US); 1464 PDBHeader header = structure.getPDBHeader(); 1465 1466 if (header == null) { 1467 header = new PDBHeader(); 1468 } 1469 1470 try { 1471 Date depositionDate = dateFormat.parse(status.getRecvd_initial_deposition_date()); 1472 header.setDepDate(depositionDate); 1473 } catch (ParseException e){ 1474 logger.warn("Could not parse date string '{}', deposition date will be unavailable", status.getRecvd_initial_deposition_date()); 1475 } 1476 1477 structure.setPDBHeader(header); 1478 } 1479 1480 @Override 1481 public void newDatabasePDBremark(DatabasePDBremark remark) { 1482 //System.out.println(remark); 1483 String id = remark.getId(); 1484 if (id.equals("2")){ 1485 1486 //this remark field contains the resolution information: 1487 String line = remark.getText(); 1488 1489 int i = line.indexOf("ANGSTROM"); 1490 if ( i > 5) { 1491 // line contains ANGSTROM info... 1492 String resolution = line.substring(i-5,i).trim(); 1493 // convert string to float 1494 float res = 99 ; 1495 try { 1496 res = Float.parseFloat(resolution); 1497 1498 } catch (NumberFormatException e) { 1499 logger.info("could not parse resolution from line and ignoring it " + line); 1500 return ; 1501 1502 1503 } 1504 // support for old style header 1505 1506 PDBHeader pdbHeader = structure.getPDBHeader(); 1507 pdbHeader.setResolution(res); 1508 1509 } 1510 1511 } 1512 } 1513 1514 @Override 1515 public void newRefine(Refine r){ 1516 1517 PDBHeader pdbHeader = structure.getPDBHeader(); 1518 // RESOLUTION 1519 // in very rare cases (for instance hybrid methods x-ray + neutron diffraction, e.g. 3ins, 4n9m) 1520 // there are 2 resolution values, one for each method 1521 // we take the last one found so that behaviour is like in PDB file parsing 1522 if (pdbHeader.getResolution()!=PDBHeader.DEFAULT_RESOLUTION) { 1523 logger.warn("More than 1 resolution value present, will use last one {} and discard previous {} " 1524 ,r.getLs_d_res_high(), String.format("%4.2f",pdbHeader.getResolution())); 1525 } 1526 try { 1527 pdbHeader.setResolution(Float.parseFloat(r.getLs_d_res_high())); 1528 } catch (NumberFormatException e){ 1529 logger.info("Could not parse resolution from " + r.getLs_d_res_high() + " " + e.getMessage()); 1530 } 1531 1532 1533 // RFREE 1534 if (pdbHeader.getRfree()!=PDBHeader.DEFAULT_RFREE) { 1535 logger.warn("More than 1 Rfree value present, will use last one {} and discard previous {} ", 1536 r.getLs_R_factor_R_free(), String.format("%4.2f",pdbHeader.getRfree())); 1537 } 1538 if (r.getLs_R_factor_R_free()==null) { 1539 // some entries like 2ifo haven't got this field at all 1540 logger.info("_refine.ls_R_factor_R_free not present, not parsing Rfree value"); 1541 } else { 1542 try { 1543 pdbHeader.setRfree(Float.parseFloat(r.getLs_R_factor_R_free())); 1544 } catch (NumberFormatException e){ 1545 // no rfree present ('?') is very usual, that's why we set it to debug 1546 logger.debug("Could not parse Rfree from string '{}'", r.getLs_R_factor_R_free()); 1547 } 1548 } 1549 1550 // RWORK 1551 if(pdbHeader.getRwork()!=PDBHeader.DEFAULT_RFREE) { 1552 logger.warn("More than 1 R work value present, will use last one {} and discard previous {} ", 1553 r.getLs_R_factor_R_work(), String.format("%4.2f",pdbHeader.getRwork())); 1554 } 1555 if(r.getLs_R_factor_R_work()==null){ 1556 logger.info("_refine.ls_R_factor_R_work not present, not parsing R-work value"); 1557 } 1558 else{ 1559 try{ 1560 pdbHeader.setRwork(Float.parseFloat(r.getLs_R_factor_R_work())); 1561 } 1562 catch (NumberFormatException e){ 1563 logger.debug("Could not parse R-work from string '{}'", r.getLs_R_factor_R_work()); 1564 } 1565 1566 } 1567 1568 } 1569 1570 1571 @Override 1572 public void newAuditAuthor(AuditAuthor aa){ 1573 1574 String name = aa.getName(); 1575 1576 StringBuffer famName = new StringBuffer(); 1577 StringBuffer initials = new StringBuffer(); 1578 boolean afterComma = false; 1579 for ( char c: name.toCharArray()) { 1580 if ( c == ' ') 1581 continue; 1582 if ( c == ','){ 1583 afterComma = true; 1584 continue; 1585 } 1586 1587 if ( afterComma) 1588 initials.append(c); 1589 else 1590 famName.append(c); 1591 } 1592 1593 StringBuffer newaa = new StringBuffer(); 1594 newaa.append(initials); 1595 newaa.append(famName); 1596 1597 PDBHeader header = structure.getPDBHeader(); 1598 String auth = header.getAuthors(); 1599 if (auth == null) { 1600 header.setAuthors(newaa.toString()); 1601 }else { 1602 auth += "," + newaa.toString(); 1603 header.setAuthors(auth); 1604 1605 } 1606 } 1607 1608 @Override 1609 public void newExptl(Exptl exptl) { 1610 1611 PDBHeader pdbHeader = structure.getPDBHeader(); 1612 String method = exptl.getMethod(); 1613 pdbHeader.setExperimentalTechnique(method); 1614 1615 } 1616 1617 @Override 1618 public void newCell(Cell cell) { 1619 1620 try { 1621 float a = Float.parseFloat(cell.getLength_a()); 1622 float b = Float.parseFloat(cell.getLength_b()); 1623 float c = Float.parseFloat(cell.getLength_c()); 1624 float alpha = Float.parseFloat(cell.getAngle_alpha()); 1625 float beta = Float.parseFloat(cell.getAngle_beta()); 1626 float gamma = Float.parseFloat(cell.getAngle_gamma()); 1627 1628 CrystalCell xtalCell = new CrystalCell(); 1629 xtalCell.setA(a); 1630 xtalCell.setB(b); 1631 xtalCell.setC(c); 1632 xtalCell.setAlpha(alpha); 1633 xtalCell.setBeta(beta); 1634 xtalCell.setGamma(gamma); 1635 1636 if (!xtalCell.isCellReasonable()) { 1637 // If the entry describes a structure determined by a technique other than X-ray crystallography, 1638 // cell is (sometimes!) a = b = c = 1.0, alpha = beta = gamma = 90 degrees 1639 // if so we don't add and CrystalCell will be null 1640 logger.debug("The crystal cell read from file does not have reasonable dimensions (at least one dimension is below {}), discarding it.", 1641 CrystalCell.MIN_VALID_CELL_SIZE); 1642 return; 1643 } 1644 1645 structure.getPDBHeader().getCrystallographicInfo().setCrystalCell(xtalCell); 1646 1647 } catch (NumberFormatException e){ 1648 structure.getPDBHeader().getCrystallographicInfo().setCrystalCell(null); 1649 logger.info("could not parse some cell parameters ("+e.getMessage()+"), ignoring _cell "); 1650 } 1651 } 1652 1653 @Override 1654 public void newSymmetry(Symmetry symmetry) { 1655 String spaceGroup = symmetry.getSpace_group_name_H_M(); 1656 SpaceGroup sg = SymoplibParser.getSpaceGroup(spaceGroup); 1657 if (sg==null) { 1658 logger.warn("Space group '"+spaceGroup+"' not recognised as a standard space group"); 1659 structure.getPDBHeader().getCrystallographicInfo().setNonStandardSg(true); 1660 } else { 1661 structure.getPDBHeader().getCrystallographicInfo().setSpaceGroup(sg); 1662 structure.getPDBHeader().getCrystallographicInfo().setNonStandardSg(false); 1663 } 1664 } 1665 1666 @Override 1667 public void newStructNcsOper(StructNcsOper sNcsOper) { 1668 structNcsOper.add(sNcsOper); 1669 } 1670 1671 public void newAtomSites(AtomSites atomSites) { 1672 1673 try { 1674 Matrix4d m = new Matrix4d( 1675 Double.parseDouble(atomSites.getFract_transf_matrix11()), Double.parseDouble(atomSites.getFract_transf_matrix12()), Double.parseDouble(atomSites.getFract_transf_matrix13()), Double.parseDouble(atomSites.getFract_transf_vector1()), 1676 Double.parseDouble(atomSites.getFract_transf_matrix21()), Double.parseDouble(atomSites.getFract_transf_matrix22()), Double.parseDouble(atomSites.getFract_transf_matrix23()), Double.parseDouble(atomSites.getFract_transf_vector2()), 1677 Double.parseDouble(atomSites.getFract_transf_matrix31()), Double.parseDouble(atomSites.getFract_transf_matrix32()), Double.parseDouble(atomSites.getFract_transf_matrix33()), Double.parseDouble(atomSites.getFract_transf_vector3()), 1678 0,0,0,1); 1679 1680 parsedScaleMatrix = m; 1681 1682 } catch (NumberFormatException e) { 1683 logger.warn("Some values in _atom_sites.fract_transf_matrix or _atom_sites.fract_transf_vector could not be parsed as numbers. Can't check whether coordinate frame convention is correct! Error: {}", e.getMessage()); 1684 structure.getPDBHeader().getCrystallographicInfo().setNonStandardCoordFrameConvention(false); 1685 1686 // in this case parsedScaleMatrix stays null and can't be used in documentEnd() 1687 } 1688 } 1689 1690 @Override 1691 public void newStructRef(StructRef sref) { 1692 logger.debug(sref.toString()); 1693 strucRefs.add(sref); 1694 } 1695 1696 private StructRef getStructRef(String ref_id){ 1697 for (StructRef structRef : strucRefs) { 1698 1699 if (structRef.getId().equals(ref_id)){ 1700 return structRef; 1701 } 1702 1703 } 1704 return null; 1705 1706 } 1707 1708 /** 1709 * create a DBRef record from the StrucRefSeq record: 1710 * <pre> 1711 * PDB record DBREF 1712 * Field Name mmCIF Data Item 1713 * Section n.a. 1714 * PDB_ID_Code _struct_ref_seq.pdbx_PDB_id_code 1715 * Strand_ID _struct_ref_seq.pdbx_strand_id 1716 * Begin_Residue_Number _struct_ref_seq.pdbx_auth_seq_align_beg 1717 * Begin_Ins_Code _struct_ref_seq.pdbx_seq_align_beg_ins_code 1718 * End_Residue_Number _struct_ref_seq.pdbx_auth_seq_align_end 1719 * End_Ins_Code _struct_ref_seq.pdbx_seq_align_end_ins_code 1720 * Database _struct_ref.db_name 1721 * Database_Accession_No _struct_ref_seq.pdbx_db_accession 1722 * Database_ID_Code _struct_ref.db_code 1723 * Database_Begin_Residue_Number _struct_ref_seq.db_align_beg 1724 * Databaes_Begin_Ins_Code _struct_ref_seq.pdbx_db_align_beg_ins_code 1725 * Database_End_Residue_Number _struct_ref_seq.db_align_end 1726 * Databaes_End_Ins_Code _struct_ref_seq.pdbx_db_align_end_ins_code 1727 * </pre> 1728 * 1729 * 1730 */ 1731 @Override 1732 public void newStructRefSeq(StructRefSeq sref) { 1733 DBRef r = new DBRef(); 1734 1735 r.setIdCode(sref.getPdbx_PDB_id_code()); 1736 r.setDbAccession(sref.getPdbx_db_accession()); 1737 r.setDbIdCode(sref.getPdbx_db_accession()); 1738 1739 r.setChainName(sref.getPdbx_strand_id()); 1740 StructRef structRef = getStructRef(sref.getRef_id()); 1741 if (structRef == null){ 1742 logger.info("could not find StructRef " + sref.getRef_id() + " for StructRefSeq " + sref); 1743 } else { 1744 r.setDatabase(structRef.getDb_name()); 1745 r.setDbIdCode(structRef.getDb_code()); 1746 } 1747 1748 int seqbegin; 1749 int seqend; 1750 try{ 1751 seqbegin = Integer.parseInt(sref.getPdbx_auth_seq_align_beg()); 1752 seqend = Integer.parseInt(sref.getPdbx_auth_seq_align_end()); 1753 } 1754 catch(NumberFormatException e){ 1755 // this happens in a few entries, annotation error? e.g. 6eoj 1756 logger.warn("Couldn't parse pdbx_auth_seq_align_beg/end in _struct_ref_seq. Will not store dbref alignment info for accession {}. Error: {}", r.getDbAccession(), e.getMessage()); 1757 return; 1758 } 1759 1760 Character begin_ins_code = ' '; 1761 if (sref.getPdbx_seq_align_beg_ins_code() != null ) { 1762 begin_ins_code = new Character(sref.getPdbx_seq_align_beg_ins_code().charAt(0)); 1763 } 1764 1765 Character end_ins_code = ' '; 1766 if (sref.getPdbx_seq_align_end_ins_code() != null) { 1767 end_ins_code = new Character(sref.getPdbx_seq_align_end_ins_code().charAt(0)); 1768 } 1769 1770 if (begin_ins_code == '?') 1771 begin_ins_code = ' '; 1772 1773 if (end_ins_code == '?') 1774 end_ins_code = ' '; 1775 1776 r.setSeqBegin(seqbegin); 1777 r.setInsertBegin(begin_ins_code); 1778 1779 r.setSeqEnd(seqend); 1780 r.setInsertEnd(end_ins_code); 1781 1782 int dbseqbegin = Integer.parseInt(sref.getDb_align_beg()); 1783 int dbseqend = Integer.parseInt(sref.getDb_align_end()); 1784 1785 Character db_begin_in_code = ' '; 1786 if (sref.getPdbx_db_align_beg_ins_code() != null) { 1787 db_begin_in_code = new Character(sref.getPdbx_db_align_beg_ins_code().charAt(0)); 1788 } 1789 1790 Character db_end_in_code = ' '; 1791 if (sref.getPdbx_db_align_end_ins_code() != null) { 1792 db_end_in_code = new Character(sref.getPdbx_db_align_end_ins_code().charAt(0)); 1793 } 1794 1795 if (db_begin_in_code == '?') 1796 db_begin_in_code = ' '; 1797 1798 if (db_end_in_code == '?') 1799 db_end_in_code = ' '; 1800 1801 1802 r.setDbSeqBegin(dbseqbegin); 1803 r.setIdbnsBegin(db_begin_in_code); 1804 1805 r.setDbSeqEnd(dbseqend); 1806 r.setIdbnsEnd(db_end_in_code); 1807 1808 List<DBRef> dbrefs = structure.getDBRefs(); 1809 if ( dbrefs == null) 1810 dbrefs = new ArrayList<DBRef>(); 1811 dbrefs.add(r); 1812 1813 logger.debug(r.toPDB()); 1814 1815 structure.setDBRefs(dbrefs); 1816 1817 } 1818 1819 @Override 1820 public void newStructRefSeqDif(StructRefSeqDif sref) { 1821 sequenceDifs.add(sref); 1822 } 1823 1824 private Chain getEntityChain(String entity_id){ 1825 1826 for (Chain chain : entityChains) { 1827 if ( chain.getId().equals(entity_id)){ 1828 1829 return chain; 1830 } 1831 } 1832 // does not exist yet, so create... 1833 1834 Chain chain = new ChainImpl(); 1835 chain.setId(entity_id); 1836 entityChains.add(chain); 1837 1838 return chain; 1839 1840 } 1841 1842 //private Chain getSeqResChain(String chainID){ 1843 // return getChainFromList(seqResChains, chainID); 1844 //} 1845 1846 1847 /** 1848 * Data items in the ENTITY_SRC_GEN category record details of 1849 * the source from which the entity was obtained in cases 1850 * where the source was genetically manipulated. The 1851 * following are treated separately: items pertaining to the tissue 1852 * from which the gene was obtained, items pertaining to the host 1853 * organism for gene expression and items pertaining to the actual 1854 * producing organism (plasmid). 1855 */ 1856 @Override 1857 public void newEntitySrcGen(EntitySrcGen entitySrcGen){ 1858 1859 // add to internal list. Map to Compound object later on... 1860 entitySrcGens.add(entitySrcGen); 1861 } 1862 1863 @Override 1864 public void newEntitySrcNat(EntitySrcNat entitySrcNat){ 1865 1866 // add to internal list. Map to Compound object later on... 1867 entitySrcNats.add(entitySrcNat); 1868 } 1869 1870 @Override 1871 public void newEntitySrcSyn(EntitySrcSyn entitySrcSyn){ 1872 1873 // add to internal list. Map to Compound object later on... 1874 entitySrcSyns.add(entitySrcSyn); 1875 } 1876 1877 /** 1878 * The EntityPolySeq object provide the amino acid sequence objects for the Entities. 1879 * Later on the entities are mapped to the BioJava {@link Chain} and {@link EntityInfo} objects. 1880 * @param epolseq the EntityPolySeq record for one amino acid 1881 */ 1882 @Override 1883 public void newEntityPolySeq(EntityPolySeq epolseq) { 1884 1885 logger.debug("NEW entity poly seq " + epolseq); 1886 1887 int eId = -1; 1888 try { 1889 eId = Integer.parseInt(epolseq.getEntity_id()); 1890 } catch (NumberFormatException e) { 1891 logger.warn("Could not parse entity id from EntityPolySeq: "+e.getMessage()); 1892 } 1893 Entity e = getEntity(eId); 1894 1895 if (e == null){ 1896 logger.info("Could not find entity "+ epolseq.getEntity_id()+". Can not match sequence to it."); 1897 return; 1898 } 1899 1900 Chain entityChain = getEntityChain(epolseq.getEntity_id()); 1901 1902 // first we check through the chemcomp provider, if it fails we do some heuristics to guess the type of group 1903 // TODO some of this code is analogous to getNewGroup() and we should try to unify them - JD 2016-03-08 1904 1905 Group g = ChemCompGroupFactory.getGroupFromChemCompDictionary(epolseq.getMon_id()); 1906 //int seqId = Integer.parseInt(epolseq.getNum()); 1907 if ( g != null && !g.getChemComp().isEmpty()) { 1908 if ( g instanceof AminoAcidImpl) { 1909 AminoAcidImpl aa = (AminoAcidImpl) g; 1910 aa.setRecordType(AminoAcid.SEQRESRECORD); 1911 //aa.setId(seqId); 1912 } 1913 } else { 1914 1915 if (epolseq.getMon_id().length()==3 && StructureTools.get1LetterCodeAmino(epolseq.getMon_id())!=null){ 1916 AminoAcidImpl a = new AminoAcidImpl(); 1917 a.setRecordType(AminoAcid.SEQRESRECORD); 1918 Character code1 = StructureTools.get1LetterCodeAmino(epolseq.getMon_id()); 1919 a.setAminoType(code1); 1920 g = a; 1921 1922 } else if ( StructureTools.isNucleotide(epolseq.getMon_id())) { 1923 // the group is actually a nucleotide group... 1924 NucleotideImpl n = new NucleotideImpl(); 1925 g = n; 1926 1927 } else { 1928 logger.debug("Residue {} {} is not a standard aminoacid or nucleotide, will create a het group for it", epolseq.getNum(),epolseq.getMon_id()); 1929 HetatomImpl h = new HetatomImpl(); 1930 g = h; 1931 1932 } 1933 1934 1935 } 1936 // at this stage we don't know about author residue numbers (insertion codes) 1937 // we abuse now the ResidueNumber field setting the internal residue numbers (label_seq_id, strictly sequential and follow the seqres sequence 1 to n) 1938 // later the actual ResidueNumbers (author residue numbers) have to be corrected in alignSeqRes() 1939 g.setResidueNumber(ResidueNumber.fromString(epolseq.getNum())); 1940 1941 g.setPDBName(epolseq.getMon_id()); 1942 1943 entityChain.addGroup(g); 1944 1945 } 1946 1947 @Override 1948 public void newPdbxPolySeqScheme(PdbxPolySeqScheme ppss) { 1949 1950 //if ( headerOnly) 1951 // return; 1952 1953 // replace the group asym ids with the real PDB ids! 1954 // replaceGroupSeqPos(ppss); // This might be incorrect in some pdb, to use auth_seq_id of the pdbx_poly_seq_scheme. 1955 1956 1957 } 1958 1959 1960 @Override 1961 public void newPdbxNonPolyScheme(PdbxNonPolyScheme ppss) { 1962 1963 //if (headerOnly) 1964 // return; 1965 1966 // merge the EntityPolySeq info and the AtomSite chains into one... 1967 //already known ignore: 1968 1969 } 1970 1971 @Override 1972 public void newPdbxEntityNonPoly(PdbxEntityNonPoly pen){ 1973 // TODO: do something with them... 1974 // not implemented yet... 1975 logger.debug(pen.getEntity_id() + " " + pen.getName() + " " + pen.getComp_id()); 1976 1977 } 1978 1979 @Override 1980 public void newChemComp(ChemComp c) { 1981 // TODO: do something with them... 1982 1983 } 1984 1985 @Override 1986 public void newGenericData(String category, List<String> loopFields, 1987 List<String> lineData) { 1988 1989 //logger.debug("unhandled category so far: " + category); 1990 } 1991 1992 @Override 1993 public FileParsingParameters getFileParsingParameters() 1994 { 1995 return params; 1996 } 1997 1998 @Override 1999 public void setFileParsingParameters(FileParsingParameters params) 2000 { 2001 this.params = params; 2002 2003 } 2004 2005 @Override 2006 public void newChemCompDescriptor(ChemCompDescriptor ccd) { 2007 2008 // TODO nothing happening here yet. 2009 2010 } 2011 2012 2013 2014 public List<PdbxStructOperList> getStructOpers() { 2015 return structOpers; 2016 } 2017 2018 @Override 2019 public void newPdbxStrucAssembly(PdbxStructAssembly strucAssembly) { 2020 strucAssemblies.add(strucAssembly); 2021 2022 } 2023 2024 public List<PdbxStructAssembly> getStructAssemblies(){ 2025 return strucAssemblies; 2026 } 2027 2028 @Override 2029 public void newPdbxStrucAssemblyGen(PdbxStructAssemblyGen strucAssembly) { 2030 strucAssemblyGens.add(strucAssembly); 2031 2032 } 2033 2034 public List<PdbxStructAssemblyGen> getStructAssemblyGens(){ 2035 return strucAssemblyGens; 2036 } 2037 2038 @Override 2039 public void newChemCompAtom(ChemCompAtom atom) { 2040 2041 } 2042 2043 @Override 2044 public void newPdbxChemCompIndentifier(PdbxChemCompIdentifier id) { 2045 2046 } 2047 2048 @Override 2049 public void newChemCompBond(ChemCompBond bond) { 2050 2051 } 2052 2053 @Override 2054 public void newPdbxChemCompDescriptor(PdbxChemCompDescriptor desc) { 2055 2056 } 2057 2058 @Override 2059 public void newStructConn(StructConn structConn) { 2060 this.structConn.add(structConn); 2061 } 2062 2063 @Override 2064 public void newStructSiteGen(StructSiteGen siteGen) { this.structSiteGens.add(siteGen); } 2065 2066 @Override 2067 public void newStructSite(StructSite structSite) { 2068 2069 if (params.isHeaderOnly()) { 2070 return; 2071 } 2072 2073 // Simply implement the method. 2074 List<Site> sites = structure.getSites(); 2075 if (sites == null) sites = new ArrayList<Site>(); 2076 2077 Site site = null; 2078 for (Site asite : sites) { 2079 if (asite.getSiteID().equals(structSite.getId())) { 2080 site = asite; // Prevent duplicate siteIds 2081 } 2082 } 2083 boolean addSite = false; 2084 if (site == null) { site = new Site(); addSite = true; } 2085 site.setSiteID(structSite.getId()); 2086 site.setDescription(structSite.getDetails()); 2087 // site.setPdbxEvidenceCode(structSite.getPdbxEvidenceCode()); // TODO - add addition fields in Sites 2088 if (addSite) sites.add(site); 2089 2090 structure.setSites(sites); 2091 } 2092 2093 /** 2094 * Build sites in a BioJava Structure using the original author chain id & residue numbers. 2095 * Sites are built from struct_site_gen records that have been parsed. 2096 */ 2097 private void addSites() { 2098 List<Site> sites = structure.getSites(); 2099 if (sites == null) sites = new ArrayList<Site>(); 2100 2101 for (StructSiteGen siteGen : structSiteGens) { 2102 // For each StructSiteGen, find the residues involved, if they exist then 2103 String site_id = siteGen.getSite_id(); // multiple could be in same site. 2104 if (site_id == null) site_id = ""; 2105 String comp_id = siteGen.getLabel_comp_id(); // PDBName 2106 2107 // Assumption: the author chain ID and residue number for the site is consistent with the original 2108 // author chain id and residue numbers. 2109 2110 String asymId = siteGen.getLabel_asym_id(); // chain name 2111 String authId = siteGen.getAuth_asym_id(); // chain Id 2112 String auth_seq_id = siteGen.getAuth_seq_id(); // Res num 2113 2114 String insCode = siteGen.getPdbx_auth_ins_code(); 2115 if ( insCode != null && insCode.equals("?")) 2116 insCode = null; 2117 2118 // Look for asymID = chainID and seqID = seq_ID. Check that comp_id matches the resname. 2119 Group g = null; 2120 try { 2121 Chain chain = structure.getChain(asymId); 2122 2123 if (null != chain) { 2124 try { 2125 Character insChar = null; 2126 if (null != insCode && insCode.length() > 0) insChar = insCode.charAt(0); 2127 g = chain.getGroupByPDB(new ResidueNumber(null, Integer.parseInt(auth_seq_id), insChar)); 2128 } catch (NumberFormatException e) { 2129 logger.warn("Could not lookup residue : " + authId + auth_seq_id); 2130 } 2131 } 2132 } catch (StructureException e) { 2133 logger.warn("Problem finding residue in site entry " + siteGen.getSite_id() + " - " + e.getMessage(), e.getMessage()); 2134 } 2135 2136 if (g != null) { 2137 // 2. find the site_id, if not existing, create anew. 2138 Site site = null; 2139 for (Site asite: sites) { 2140 if (site_id.equals(asite.getSiteID())) site = asite; 2141 } 2142 2143 boolean addSite = false; 2144 2145 // 3. add this residue to the site. 2146 if (site == null) { 2147 addSite = true; 2148 site = new Site(); 2149 site.setSiteID(site_id); 2150 } 2151 2152 List<Group> groups = site.getGroups(); 2153 if (groups == null) groups = new ArrayList<Group>(); 2154 2155 // Check the self-consistency of the residue reference from auth_seq_id and chain_id 2156 if (!comp_id.equals(g.getPDBName())) { 2157 logger.warn("comp_id doesn't match the residue at " + authId + " " + auth_seq_id + " - skipping"); 2158 } else { 2159 groups.add(g); 2160 site.setGroups(groups); 2161 } 2162 if (addSite) sites.add(site); 2163 } 2164 } 2165 structure.setSites(sites); 2166 } 2167}