001/* 002 * 003 * This code may be freely distributed and modified under the 004 * terms of the GNU Lesser General Public Licence. This should 005 * be distributed with the code. If you do not have a copy, 006 * see: 007 * 008 * http://www.gnu.org/copyleft/lesser.html 009 * 010 * Copyright for this code is held jointly by the individual 011 * authors. These should be listed in @author doc comments. 012 * 013 * For more information on the BioJava project and its aims, 014 * or to join the biojava-l mailing list, visit the home page 015 * at: 016 * 017 * http://www.biojava.org/ 018 * 019 * Created on 16.03.2004 020 * 021 */ 022package org.biojava.nbio.structure.io; 023 024import static java.lang.Math.min; 025 026import java.io.BufferedReader; 027import java.io.IOException; 028import java.io.InputStream; 029import java.io.InputStreamReader; 030import java.text.DateFormat; 031import java.text.ParseException; 032import java.text.SimpleDateFormat; 033import java.util.ArrayList; 034import java.util.Arrays; 035import java.util.Date; 036import java.util.HashMap; 037import java.util.Iterator; 038import java.util.LinkedHashMap; 039import java.util.List; 040import java.util.Locale; 041import java.util.Map; 042import java.util.StringTokenizer; 043import java.util.regex.Matcher; 044import java.util.regex.Pattern; 045 046import javax.vecmath.Matrix4d; 047 048import org.biojava.nbio.structure.AminoAcid; 049import org.biojava.nbio.structure.AminoAcidImpl; 050import org.biojava.nbio.structure.Atom; 051import org.biojava.nbio.structure.AtomImpl; 052import org.biojava.nbio.structure.Author; 053import org.biojava.nbio.structure.Chain; 054import org.biojava.nbio.structure.ChainImpl; 055import org.biojava.nbio.structure.DBRef; 056import org.biojava.nbio.structure.Element; 057import org.biojava.nbio.structure.EntityInfo; 058import org.biojava.nbio.structure.EntityType; 059import org.biojava.nbio.structure.Group; 060import org.biojava.nbio.structure.GroupIterator; 061import 
org.biojava.nbio.structure.HetatomImpl; 062import org.biojava.nbio.structure.JournalArticle; 063import org.biojava.nbio.structure.NucleotideImpl; 064import org.biojava.nbio.structure.PDBCrystallographicInfo; 065import org.biojava.nbio.structure.PDBHeader; 066import org.biojava.nbio.structure.PdbId; 067import org.biojava.nbio.structure.ResidueNumber; 068import org.biojava.nbio.structure.Site; 069import org.biojava.nbio.structure.Structure; 070import org.biojava.nbio.structure.StructureException; 071import org.biojava.nbio.structure.StructureImpl; 072import org.biojava.nbio.structure.StructureTools; 073import org.biojava.nbio.structure.chem.ChemCompAtom; 074import org.biojava.nbio.structure.chem.ChemCompGroupFactory; 075import org.biojava.nbio.structure.io.util.PDBTemporaryStorageUtils.LinkRecord; 076import org.biojava.nbio.structure.secstruc.SecStrucInfo; 077import org.biojava.nbio.structure.secstruc.SecStrucType; 078import org.biojava.nbio.structure.xtal.CrystalCell; 079import org.biojava.nbio.structure.xtal.SpaceGroup; 080import org.biojava.nbio.structure.xtal.SymoplibParser; 081import org.slf4j.Logger; 082import org.slf4j.LoggerFactory; 083 084 085/** 086 * This class implements the actual PDB file parsing. Do not access it directly, but 087 * via the PDBFileReader class. 088 * 089 * <h2>Parsing</h2> 090 * 091 * During the PDBfile parsing several Flags can be set. See the {@link #setFileParsingParameters(FileParsingParameters)} methods. 092 * 093 * 094 * <p> 095 * To prevent excessive memory usage for large PDB files, there is the ATOM_CA_THRESHOLD. 096 * If more Atoms than this threshold are being parsed in a PDB file, the parser will automatically 097 * switch to a C-alpha only representation. 098 * 099 * <p> 100 * The result of the parsing of the PDB file is a new {@link Structure} object. 
101 * 102 * <p> 103 * For more documentation on how to work with the Structure API please 104 * see <a href="http://biojava.org/wiki/BioJava:CookBook#Protein_Structure" target="_top"> 105 * http://biojava.org/wiki/BioJava:CookBook#Protein_Structure</a> 106 * 107 * 108 * 109 * 110 * <h2>Example</h2> 111 * <p> 112 * Q: How can I get a Structure object from a PDB file? 113 * <p> 114 * A: 115 * <pre> 116 * public {@link Structure} loadStructure(String pathToPDBFile){ 117 * // The PDBFileParser is wrapped by the PDBFileReader 118 * {@link PDBFileReader} pdbreader = new {@link PDBFileReader}(); 119 * 120 * {@link Structure} structure = null; 121 * try{ 122 * structure = pdbreader.getStructure(pathToPDBFile); 123 * System.out.println(structure); 124 * } catch (IOException e) { 125 * e.printStackTrace(); 126 * } 127 * return structure; 128 * } 129 * </pre> 130 * 131 * 132 * @author Andreas Prlic 133 * @author Jules Jacobsen 134 * @author Jose Duarte 135 * @since 1.4 136 */ 137public class PDBFileParser { 138 139 140 141 private static final Logger logger = LoggerFactory.getLogger(PDBFileParser.class); 142 143 // for printing 144 private static final String NEWLINE = System.getProperty("line.separator"); 145 146 147 // required for parsing: 148 private String pdbId; //the actual id of the entry 149 private Structure structure; 150 private List<List<Chain>> allModels; // a temp data structure to keep all models 151 private List<Chain> currentModel; // contains the ATOM records for each model 152 private Chain currentChain; 153 private Group currentGroup; 154 155 private List<Chain> seqResChains; // contains all the chains for the SEQRES records 156 //we're going to work on the assumption that the files are current - 157 //if the pdb_HEADER_Handler detects a legacy format, this will be changed to true. 
158 //if true then lines will be truncated at 72 characters in certain cases 159 //(pdb_COMPOUND_handler for example) 160 private boolean isLegacyFormat = false; 161 162 private boolean blankChainIdsPresent = false; 163 164 // for re-creating the biological assembly 165 private PDBBioAssemblyParser bioAssemblyParser = null; 166 167 private PDBHeader pdbHeader; 168 private PDBCrystallographicInfo crystallographicInfo; 169 private JournalArticle journalArticle; 170 private List<Map<String, Integer>> connects ; 171 private List<Map<String,String>> helixList; 172 private List<Map<String,String>> strandList; 173 private List<Map<String,String>> turnList; 174 175 private int lengthCheck ; 176 177 private boolean isLastCompndLine = false; 178 private boolean isLastSourceLine = false; 179 private EntityInfo current_compound; 180 private List<EntityInfo> entities = new ArrayList<EntityInfo>(); 181 private HashMap<Integer,List<String>> compoundMolIds2chainIds = new HashMap<Integer, List<String>>(); 182 private List<String> compndLines = new ArrayList<String>(); 183 private List<String> sourceLines = new ArrayList<String>(); 184 private List<String> journalLines = new ArrayList<String>(); 185 private List<String> keywordsLines = new ArrayList<String>(); 186 private List<DBRef> dbrefs; 187 private Map<String, Site> siteMap = new LinkedHashMap<String, Site>(); 188 private Map<String, List<ResidueNumber>> siteToResidueMap = new LinkedHashMap<String, List<ResidueNumber>>(); 189 190 private List<SSBondImpl> ssbonds = new ArrayList<>(); 191 192 // for storing LINK until we have all the atoms parsed 193 private List<LinkRecord> linkRecords; 194 195 private Matrix4d currentNcsOp; 196 private List<Matrix4d> ncsOperators; 197 198 // for parsing COMPOUND and SOURCE Header lines 199 private int prevMolId; 200 private String previousContinuationField; 201 private String continuationField; 202 private String continuationString; 203 204 private DateFormat dateFormat; 205 206 // for rfree 
// for rfree parsing; both values start at -1
private float rfreeStandardLine = -1;
private float rfreeNoCutoffLine = -1;

// field names that pdb_COMPND_Handler recognises as the first token of a COMPND line
private static final List<String> compndFieldValues = new ArrayList<String>(
		Arrays.asList(
				"MOL_ID:", "MOLECULE:", "CHAIN:", "SYNONYM:",
				"EC:", "FRAGMENT:", "ENGINEERED:", "MUTATION:",
				"BIOLOGICAL_UNIT:", "OTHER_DETAILS:"
				));


// COMPND tokens that pdb_COMPND_Handler deliberately ignores
private static final List<String> ignoreCompndFieldValues = new ArrayList<String>(
		Arrays.asList(
				"HETEROGEN:","ENGINEEREED:","FRAGMENT,",
				"MUTANT:","SYNTHETIC:"
				));
// ENGINEEREED in pdb219d

// NOTE(review): presumably the field names accepted in SOURCE records - confirm against the SOURCE handler
private static final List<String> sourceFieldValues = new ArrayList<String>(
		Arrays.asList("ENGINEERED:", "MOL_ID:", "SYNTHETIC:", "FRAGMENT:",
				"ORGANISM_SCIENTIFIC:", "ORGANISM_COMMON:",
				"ORGANISM_TAXID:","STRAIN:",
				"VARIANT:", "CELL_LINE:", "ATCC:", "ORGAN:", "TISSUE:",
				"CELL:", "ORGANELLE:", "SECRETION:", "GENE:",
				"CELLULAR_LOCATION:", "EXPRESSION_SYSTEM:",
				"EXPRESSION_SYSTEM_TAXID:",
				"EXPRESSION_SYSTEM_STRAIN:", "EXPRESSION_SYSTEM_VARIANT:",
				"EXPRESSION_SYSTEM_CELL_LINE:",
				"EXPRESSION_SYSTEM_ATCC_NUMBER:",
				"EXPRESSION_SYSTEM_ORGAN:", "EXPRESSION_SYSTEM_TISSUE:",
				"EXPRESSION_SYSTEM_CELL:", "EXPRESSION_SYSTEM_ORGANELLE:",
				"EXPRESSION_SYSTEM_CELLULAR_LOCATION:",
				"EXPRESSION_SYSTEM_VECTOR_TYPE:",
				"EXPRESSION_SYSTEM_VECTOR:", "EXPRESSION_SYSTEM_PLASMID:",
				"EXPRESSION_SYSTEM_GENE:", "OTHER_DETAILS:"));

private int atomCount;

// parsing options:

private int atomCAThreshold ;

private int loadMaxAtoms;

private boolean atomOverflow;

/** flag to tell parser to only read Calpha coordinates **/
private boolean parseCAonly;


private FileParsingParameters params;

private boolean startOfMolecule;
private boolean startOfModel;

/**
 * Creates a parser with default {@link FileParsingParameters} and empty
 * temporary storage for the records collected while reading a file.
 */
public PDBFileParser() {
	params = new FileParsingParameters();

	allModels = new ArrayList<>();
	structure = null;
	currentModel = null;
	currentChain = null;
	currentGroup = null;
	// we initialise to true since at the beginning of the file we are always starting a new molecule
	startOfMolecule = true;
	startOfModel = true;

	pdbHeader = new PDBHeader();
	crystallographicInfo = new PDBCrystallographicInfo();
	connects = new ArrayList<Map<String,Integer>>();

	helixList = new ArrayList<Map<String,String>>();
	strandList = new ArrayList<Map<String,String>>();
	turnList = new ArrayList<Map<String,String>>();
	current_compound = null;
	dbrefs = new ArrayList<DBRef>();
	// NOTE(review): this replaces the map created by the field initialiser with null;
	// presumably the map is created on demand elsewhere - confirm before changing.
	siteMap = null;
	dateFormat = new SimpleDateFormat("dd-MMM-yy", Locale.US);
	atomCount = 0;
	atomOverflow = false;
	parseCAonly = false;

	// this SHOULD not be done
	// DONOT:setFileParsingParameters(params);
	// set the correct max values for parsing...
	loadMaxAtoms = params.getMaxAtoms();
	atomCAThreshold = params.getAtomCaThreshold();

	linkRecords = new ArrayList<LinkRecord>();

	blankChainIdsPresent = false;
}

/**
 * Creates a new, empty group for a residue: either a Hetatom, a Nucleotide or an AminoAcid.
 * <p>
 * The chemical component dictionary is consulted first; if it yields a group with a
 * non-empty chemical component that group is returned. Otherwise the type is chosen
 * from the one- and three-letter codes.
 *
 * @param recordName the PDB record name; not used by this method
 * @param aminoCode1 the one-letter code, or null if there is none
 * @param aminoCode3 the three-letter residue name
 * @return a new group instance, never null
 */
private Group getNewGroup(String recordName,Character aminoCode1, String aminoCode3) {

	Group g = ChemCompGroupFactory.getGroupFromChemCompDictionary(aminoCode3);
	if ( g != null && !g.getChemComp().isEmpty())
		return g;

	// compare by value: '==' between two boxed Characters is a reference comparison
	if (aminoCode1 == null || aminoCode1.charValue() == StructureTools.UNKNOWN_GROUP_LABEL) {
		return new HetatomImpl();
	}

	if (StructureTools.isNucleotide(aminoCode3)) {
		// it is a nucleotide
		return new NucleotideImpl();
	}

	AminoAcidImpl aa = new AminoAcidImpl();
	aa.setAminoType(aminoCode1);
	return aa;
}



// Handler methods to deal with PDB file records properly.
333 /** 334 Handler for 335 HEADER Record Format 336 <pre> 337 COLUMNS DATA TYPE FIELD DEFINITION 338 ---------------------------------------------------------------------------------- 339 1 - 6 Record name "HEADER" 340 11 - 50 String(40) classification Classifies the molecule(s) 341 51 - 59 Date depDate Deposition date. This is the date 342 the coordinates were received by 343 the PDB 344 63 - 66 IDcode idCode This identifier is unique within PDB 345 </pre> 346 */ 347 private void pdb_HEADER_Handler(String line) { 348 349 String classification = null; 350 String deposition_date = null; 351 String pdbCode = null; 352 353 int len = line.trim().length(); 354 if(len > 10) { 355 classification = line.substring (10, min(len,50)).trim() ; 356 pdbHeader.setClassification(classification); 357 } 358 if(len > 50) { 359 deposition_date = line.substring (50, min(len,59)).trim() ; 360 try { 361 Date dep = dateFormat.parse(deposition_date); 362 pdbHeader.setDepDate(dep); 363 364 } catch (ParseException e){ 365 logger.info("Could not parse deposition date string '"+deposition_date+"'. Will continue without deposition date"); 366 } 367 } 368 if(len > 62) { 369 pdbCode = line.substring (62, min(len,66)).trim() ; 370 pdbId = pdbCode; 371 372 logger.debug("Parsing entry " + pdbId); 373 374 375 PdbId pdbIdToSet; 376 try { 377 pdbIdToSet = new PdbId(pdbCode); 378 } catch (IllegalArgumentException e) { 379 logger.info("Malformed (or null) PDB ID {}. setting PdbId to null", pdbCode); 380 pdbIdToSet = null; 381 } 382 structure.setPdbId(pdbIdToSet); 383 pdbHeader.setPdbId(pdbIdToSet); 384 } 385 386 //*really* old files (you'll need to hunt to find these as they 387 //should have been remediated) have headers like below. 
Plus the 388 //pdbId at positions 72-76 is present in every line 389 390 //HEADER PROTEINASE INHIBITOR (TRYPSIN) 05-OCT-84 5PTI 5PTI 3 391 //HEADER TRANSFERASE (ACYLTRANSFERASE) 02-SEP-92 1LAC 1LAC 2 392 if (len > 66) { 393 if (pdbId.equals(line.substring (72, 76))){ 394 isLegacyFormat = true; 395 logger.warn(pdbId + " is a LEGACY entry - this will most likely not parse correctly."); 396 } 397 } 398 399 } 400 401 402 /** 403 * Parses the following record: 404 * <pre> 405 * COLUMNS DATA TYPE FIELD DEFINITION 406 * ------------------------------------------------------------------------------------ 407 * 1 - 6 Record name "AUTHOR" 408 * 9 - 10 Continuation continuation Allows concatenation of multiple records. 409 * 11 - 79 List authorList List of the author names, separated 410 * by commas. 411 * 412 * </pre> 413 * @param line 414 */ 415 private void pdb_AUTHOR_Handler(String line) { 416 417 String authors = line.substring(10).trim(); 418 419 String auth = pdbHeader.getAuthors(); 420 if (auth == null){ 421 pdbHeader.setAuthors(authors); 422 } else { 423 auth += authors; 424 pdbHeader.setAuthors(auth); 425 } 426 427 } 428 429 430 431 /** 432 * Parses the following record: 433 * 434 * <pre> 435 * COLUMNS DATA TYPE FIELD DEFINITION 436 * -------------------------------------------------------------------- 437 * 1 - 6 Record name "HELIX " 438 * 8 - 10 Integer serNum Serial number of the helix. 439 * This starts at 1 and increases 440 * incrementally. 441 * 12 - 14 LString(3) helixID Helix identifier. In addition 442 * to a serial number, each helix is 443 * given an alphanumeric character 444 * helix identifier. 445 * 16 - 18 Residue name initResName Name of the initial residue. 446 * 20 Character initChainID Chain identifier for the chain 447 * containing this helix. 448 * 22 - 25 Integer initSeqNum Sequence number of the initial 449 * residue. 450 * 26 AChar initICode Insertion code of the initial 451 * residue. 
452 * 28 - 30 Residue name endResName Name of the terminal residue of 453 * the helix. 454 * 32 Character endChainID Chain identifier for the chain 455 * containing this helix. 456 * 34 - 37 Integer endSeqNum Sequence number of the terminal 457 * residue. 458 * 38 AChar endICode Insertion code of the terminal 459 * residue. 460 * 39 - 40 Integer helixClass Helix class (see below). 461 * 41 - 70 String comment Comment about this helix. 462 * 72 - 76 Integer length Length of this helix. 463 * </pre> 464 */ 465 private void pdb_HELIX_Handler(String line){ 466 467 if (params.isHeaderOnly()) return; 468 469 if (line.length()<38) { 470 logger.info("HELIX line has length under 38. Ignoring it."); 471 return; 472 } 473 474 String initResName = line.substring(15,18).trim(); 475 String initChainId = line.substring(19,20); 476 String initSeqNum = line.substring(21,25).trim(); 477 String initICode = line.substring(25,26); 478 String endResName = line.substring(27,30).trim(); 479 String endChainId = line.substring(31,32); 480 String endSeqNum = line.substring(33,37).trim(); 481 String endICode = line.substring(37,38); 482 483 //System.out.println(initResName + " " + initChainId + " " + initSeqNum + " " + initICode + " " + 484 // endResName + " " + endChainId + " " + endSeqNum + " " + endICode); 485 486 Map<String,String> m = new HashMap<String,String>(); 487 488 m.put("initResName",initResName); 489 m.put("initChainId", initChainId); 490 m.put("initSeqNum", initSeqNum); 491 m.put("initICode", initICode); 492 m.put("endResName", endResName); 493 m.put("endChainId", endChainId); 494 m.put("endSeqNum",endSeqNum); 495 m.put("endICode",endICode); 496 497 helixList.add(m); 498 499 } 500 501 /** 502 * Handler for 503 * <pre> 504 * COLUMNS DATA TYPE FIELD DEFINITION 505 * -------------------------------------------------------------- 506 * 1 - 6 Record name "SHEET " 507 * 8 - 10 Integer strand Strand number which starts at 1 508 * for each strand within a sheet 509 * and increases by 
one. 510 * 12 - 14 LString(3) sheetID Sheet identifier. 511 * 15 - 16 Integer numStrands Number of strands in sheet. 512 * 18 - 20 Residue name initResName Residue name of initial residue. 513 * 22 Character initChainID Chain identifier of initial 514 * residue in strand. 515 * 23 - 26 Integer initSeqNum Sequence number of initial 516 * residue in strand. 517 * 27 AChar initICode Insertion code of initial residue 518 * in strand. 519 * 29 - 31 Residue name endResName Residue name of terminal residue. 520 * 33 Character endChainID Chain identifier of terminal 521 * residue. 522 * 34 - 37 Integer endSeqNum Sequence number of terminal 523 * residue. 524 * 38 AChar endICode Insertion code of terminal 525 * residue. 526 * 39 - 40 Integer sense Sense of strand with respect to 527 * previous strand in the sheet. 0 528 * if first strand, 1 if parallel, 529 * -1 if anti-parallel. 530 * 42 - 45 Atom curAtom Registration. Atom name in 531 * current strand. 532 * 46 - 48 Residue name curResName Registration. Residue name in 533 * current strand. 534 * 50 Character curChainId Registration. Chain identifier in 535 * current strand. 536 * 51 - 54 Integer curResSeq Registration. Residue sequence 537 * number in current strand. 538 * 55 AChar curICode Registration. Insertion code in 539 * current strand. 540 * 57 - 60 Atom prevAtom Registration. Atom name in 541 * previous strand. 542 * 61 - 63 Residue name prevResName Registration. Residue name in 543 * previous strand. 544 * 65 Character prevChainId Registration. Chain identifier in 545 * previous strand. 546 * 66 - 69 Integer prevResSeq Registration. Residue sequence 547 * number in previous strand. 548 * 70 AChar prevICode Registration. Insertion code in 549 * previous strand. 550 * </pre> 551 */ 552 private void pdb_SHEET_Handler( String line){ 553 554 if (params.isHeaderOnly()) return; 555 556 if (line.length()<38) { 557 logger.info("SHEET line has length under 38. 
Ignoring it."); 558 return; 559 } 560 561 String initResName = line.substring(17,20).trim(); 562 String initChainId = line.substring(21,22); 563 String initSeqNum = line.substring(22,26).trim(); 564 String initICode = line.substring(26,27); 565 String endResName = line.substring(28,31).trim(); 566 String endChainId = line.substring(32,33); 567 String endSeqNum = line.substring(33,37).trim(); 568 String endICode = line.substring(37,38); 569 570 //System.out.println(initResName + " " + initChainId + " " + initSeqNum + " " + initICode + " " + 571 // endResName + " " + endChainId + " " + endSeqNum + " " + endICode); 572 573 Map<String,String> m = new HashMap<String,String>(); 574 575 m.put("initResName",initResName); 576 m.put("initChainId", initChainId); 577 m.put("initSeqNum", initSeqNum); 578 m.put("initICode", initICode); 579 m.put("endResName", endResName); 580 m.put("endChainId", endChainId); 581 m.put("endSeqNum",endSeqNum); 582 m.put("endICode",endICode); 583 584 strandList.add(m); 585 } 586 587 588 /** 589 * Handler for TURN lines 590 * <pre> 591 * COLUMNS DATA TYPE FIELD DEFINITION 592 * -------------------------------------------------------------------- 593 * 1 - 6 Record name "TURN " 594 * 8 - 10 Integer seq Turn number; starts with 1 and 595 * increments by one. 596 * 12 - 14 LString(3) turnId Turn identifier 597 * 16 - 18 Residue name initResName Residue name of initial residue in 598 * turn. 599 * 20 Character initChainId Chain identifier for the chain 600 * containing this turn. 601 * 21 - 24 Integer initSeqNum Sequence number of initial residue 602 * in turn. 603 * 25 AChar initICode Insertion code of initial residue 604 * in turn. 605 * 27 - 29 Residue name endResName Residue name of terminal residue 606 * of turn. 607 * 31 Character endChainId Chain identifier for the chain 608 * containing this turn. 609 * 32 - 35 Integer endSeqNum Sequence number of terminal 610 * residue of turn. 
611 * 36 AChar endICode Insertion code of terminal residue 612 * of turn. 613 * 41 - 70 String comment Associated comment. 614 * </pre> 615 * @param line 616 */ 617 private void pdb_TURN_Handler( String line){ 618 619 if (params.isHeaderOnly()) return; 620 621 if (line.length()<36) { 622 logger.info("TURN line has length under 36. Ignoring it."); 623 return; 624 } 625 626 String initResName = line.substring(15,18).trim(); 627 String initChainId = line.substring(19,20); 628 String initSeqNum = line.substring(20,24).trim(); 629 String initICode = line.substring(24,25); 630 String endResName = line.substring(26,29).trim(); 631 String endChainId = line.substring(30,31); 632 String endSeqNum = line.substring(31,35).trim(); 633 String endICode = line.substring(35,36); 634 635 //System.out.println(initResName + " " + initChainId + " " + initSeqNum + " " + initICode + " " + 636 // endResName + " " + endChainId + " " + endSeqNum + " " + endICode); 637 638 Map<String,String> m = new HashMap<String,String>(); 639 640 m.put("initResName",initResName); 641 m.put("initChainId", initChainId); 642 m.put("initSeqNum", initSeqNum); 643 m.put("initICode", initICode); 644 m.put("endResName", endResName); 645 m.put("endChainId", endChainId); 646 m.put("endSeqNum",endSeqNum); 647 m.put("endICode",endICode); 648 649 turnList.add(m); 650 } 651 652 /** 653 * Handler for 654 * REVDAT Record format: 655 * <pre> 656 * 657 * COLUMNS DATA TYPE FIELD DEFINITION 658 * ---------------------------------------------------------------------------------- 659 * 1 - 6 Record name "REVDAT" 660 * 8 - 10 Integer modNum Modification number. 661 * 11 - 12 Continuation continuation Allows concatenation of multiple 662 * records. 663 * 14 - 22 Date modDate Date of modification (or release for 664 * new entries). This is not repeated 665 * on continuation lines. 666 * 24 - 28 String(5) modId Identifies this particular 667 * modification. It links to the 668 * archive used internally by PDB. 
669 * This is not repeated on continuation 670 * lines. 671 * 32 Integer modType An integer identifying the type of 672 * modification. In case of revisions 673 * with more than one possible modType, 674 * the highest value applicable will be 675 * assigned. 676 * 40 - 45 LString(6) record Name of the modified record. 677 * 47 - 52 LString(6) record Name of the modified record. 678 * 54 - 59 LString(6) record Name of the modified record. 679 * 61 - 66 LString(6) record Name of the modified record. 680 * </pre> 681 */ 682 private void pdb_REVDAT_Handler(String line) { 683 684 // keep the first as latest modified date and the last as release date 685 Date modDate = pdbHeader.getModDate(); 686 687 if ( modDate==null || modDate.equals(new Date(0)) ) { 688 689 // modified date is still uninitialized 690 String modificationDate = line.substring (13, 22).trim() ; 691 692 try { 693 Date dep = dateFormat.parse(modificationDate); 694 pdbHeader.setModDate(dep); 695 pdbHeader.setRelDate(dep); 696 } catch (ParseException e){ 697 logger.info("Could not parse revision date string '"+modificationDate+"'. "); 698 } 699 700 } else { 701 702 // set as the release date 703 String releaseDate = line.substring (13, 22).trim() ; 704 705 try { 706 Date dep = dateFormat.parse(releaseDate); 707 pdbHeader.setRelDate(dep); 708 } catch (ParseException e){ 709 logger.info("Could not parse revision date string '"+releaseDate+"'. "); 710 } 711 } 712 } 713 714 /** 715 * Handler for 716 * SEQRES record format 717 * SEQRES records contain the amino acid or nucleic acid sequence of residues in each chain of the macromolecule that was studied. 718 * <p> 719 * Record Format: 720 * <p> 721 * <pre> 722 * COLUMNS DATA TYPE FIELD DEFINITION 723 * --------------------------------------------------------------------------------- 724 * 1 - 6 Record name "SEQRES" 725 * 9 - 10 Integer serNum Serial number of the SEQRES record 726 * for the current chain. Starts at 1 727 * and increments by one each line. 
728 * Reset to 1 for each chain. 729 * 12 Character chainID Chain identifier. This may be any 730 * single legal character, including a 731 * blank which is used if there is 732 * only one chain. 733 * 14 - 17 Integer numRes Number of residues in the chain. 734 * This value is repeated on every 735 * record. 736 * 20 - 22 Residue name resName Residue name. 737 * 24 - 26 Residue name resName Residue name. 738 * 28 - 30 Residue name resName Residue name. 739 * 32 - 34 Residue name resName Residue name. 740 * 36 - 38 Residue name resName Residue name. 741 * 40 - 42 Residue name resName Residue name. 742 * 44 - 46 Residue name resName Residue name. 743 * 48 - 50 Residue name resName Residue name. 744 * 52 - 54 Residue name resName Residue name. 745 * 56 - 58 Residue name resName Residue name. 746 * 60 - 62 Residue name resName Residue name. 747 * 64 - 66 Residue name resName Residue name. 748 * 68 - 70 Residue name resName Residue name. 749 * </pre> 750 * @author Jules Jacobsen 751 */ 752 private void pdb_SEQRES_Handler(String line) { 753 754 /* 755 * 1 2 3 4 5 6 7 756 * 1234567890123456789012345678901234567890123456789012345678901234567890 757 * SEQRES 1 A 376 LYS PRO VAL THR VAL LYS LEU VAL ASP SER GLN ALA THR 758 * SEQRES 1 A 21 GLY ILE VAL GLU GLN CYS CYS THR SER ILE CYS SER LEU 759 * SEQRES 2 A 21 TYR GLN LEU GLU ASN TYR CYS ASN 760 * SEQRES 1 B 30 PHE VAL ASN GLN HIS LEU CYS GLY SER HIS LEU VAL GLU 761 * SEQRES 2 B 30 ALA LEU TYR LEU VAL CYS GLY GLU ARG GLY PHE PHE TYR 762 * SEQRES 3 B 30 THR PRO LYS ALA 763 * SEQRES 1 C 21 GLY ILE VAL GLU GLN CYS CYS THR SER ILE CYS SER LEU 764 * SEQRES 2 C 21 TYR GLN LEU GLU ASN TYR CYS ASN 765 * SEQRES 1 D 30 PHE VAL ASN GLN HIS LEU CYS GLY SER HIS LEU VAL GLU 766 * SEQRES 2 D 30 ALA LEU TYR LEU VAL CYS GLY GLU ARG GLY PHE PHE TYR 767 * SEQRES 3 D 30 THR PRO LYS ALA 768 */ 769 770 String recordName = line.substring(0, 6).trim(); 771 String chainID = line.substring(11, 12); 772 String newLength = line.substring(13,17).trim(); 
773 String subSequence = line.substring(18); 774 775 if ( lengthCheck == -1 ){ 776 lengthCheck = Integer.parseInt(newLength); 777 } 778 779 StringTokenizer subSequenceResidues = new StringTokenizer(subSequence); 780 781 Character aminoCode1 = null; 782 if (! recordName.equals(AminoAcid.SEQRESRECORD)) { 783 // should not have been called 784 return; 785 } 786 787 currentChain = isKnownChain(chainID, seqResChains); 788 if ( currentChain == null) { 789 790 currentChain = new ChainImpl(); 791 currentChain.setId(chainID); 792 currentChain.setName(chainID); 793 794 } 795 796 while (subSequenceResidues.hasMoreTokens()) { 797 798 String threeLetter = subSequenceResidues.nextToken(); 799 800 aminoCode1 = StructureTools.get1LetterCode(threeLetter); 801 802 //if (aminoCode1 == null) { 803 // could be a nucleotide... 804 // but getNewGroup takes care of that and converts ATOM records with aminoCode1 == nnull to nucleotide... 805 //} 806 currentGroup = getNewGroup("ATOM", aminoCode1, threeLetter); 807 808 currentGroup.setPDBName(threeLetter); 809 810 if ( currentGroup instanceof AminoAcid){ 811 AminoAcid aa = (AminoAcid)currentGroup; 812 aa.setRecordType(AminoAcid.SEQRESRECORD); 813 } 814 // add the current resNum to the new chain. 815 currentChain.addGroup(currentGroup); 816 817 } 818 Chain test = isKnownChain(chainID, seqResChains); 819 820 if ( test == null) 821 seqResChains.add(currentChain); 822 823 if (currentGroup != null) 824 currentGroup.trimToSize(); 825 826 currentGroup = null; 827 currentChain = null; 828 829 // the current chain is finished! 
830 //if ( current_chain.getLength() != lengthCheck ){ 831 // System.err.println("the length of chain " + current_chain.getName() + "(" + 832 // current_chain.getLength() + ") does not match the expected " + lengthCheck); 833 //} 834 835 lengthCheck = Integer.parseInt(newLength); 836 837 } 838 839 840 841 /** 842 * Handler for 843 * TITLE Record Format 844 * <pre> 845 COLUMNS DATA TYPE FIELD DEFINITION 846 ---------------------------------------------------------------------------------- 847 1 - 6 Record name "TITLE " 848 9 - 10 Continuation continuation Allows concatenation of multiple 849 records. 850 11 - 70 String title Title of the experiment. 851 * </pre> 852 * 853 */ 854 private void pdb_TITLE_Handler(String line) { 855 String title; 856 if ( line.length() > 79) 857 title = line.substring(10,80).trim(); 858 else 859 title = line.substring(10,line.length()).trim(); 860 861 String t = pdbHeader.getTitle(); 862 if ( (t != null) && (! t.equals("")) ){ 863 if (t.endsWith("-")) 864 t += ""; // if last line ends with a hyphen then we don't add space 865 else 866 t += " "; 867 } 868 else t = ""; 869 870 t += title; 871 872 pdbHeader.setTitle(t); 873 } 874 875 /** 876 * JRNL handler. 877 * The JRNL record contains the primary literature citation that describes the experiment which resulted 878 * in the deposited coordinate set. There is at most one JRNL reference per entry. If there is no primary 879 * reference, then there is no JRNL reference. Other references are given in REMARK 1. 880 * 881 * Record Format 882 * <pre> 883 * COLUMNS DATA TYPE FIELD DEFINITION 884 * ----------------------------------------------------------------------- 885 * 1 - 6 Record name "JRNL " 886 * 887 * 13 - 70 LString text See Details below. 
888 * </pre> 889 */ 890 private void pdb_JRNL_Handler(String line) { 891 //add the strings to the journalLines 892 //the actual JournalArticle is then built when the whole entry is being 893 //finalized with triggerEndFileChecks() 894 //JRNL TITL NMR SOLUTION STRUCTURE OF RECOMBINANT TICK 1TAP 10 895 if (line.substring(line.length() - 8, line.length() - 4).equals(pdbId)) { 896 //trim off the trailing PDB id from legacy files. 897 //are we really trying to still cater for these museum pieces? 898 899 logger.debug("trimming legacy PDB id from end of JRNL section line"); 900 901 line = line.substring(0, line.length() - 8); 902 journalLines.add(line); 903 } else { 904 journalLines.add(line); 905 } 906 } 907 908 /** 909 * This should not be accessed directly, other than by </code>makeCompounds</code>. It still deals with the same 910 * lines in a similar manner but if not accessed from </code>makeCompounds</code> the last element will be 911 * missing. Don't say I didn't warn you. 912 * 913 * @param line 914 */ 915 private void pdb_COMPND_Handler(String line) { 916 917 logger.debug("previousContinuationField is " 918 + previousContinuationField); 919 logger.debug("current continuationField is " 920 + continuationField); 921 logger.debug("current continuationString is " 922 + continuationString); 923 logger.debug("current compound is " 924 + current_compound); 925 926 927 // In legacy PDB files the line ends with the PDB code and a serial number, chop those off! 
928 //format version 3.0 onwards will have 80 characters in a line 929 // if (line.length() > 72) { 930 if (isLegacyFormat) { 931 // if (DEBUG) { 932 // System.out.println("We have a legacy file - truncating line length to 71 characters:"); 933 // System.out.println(line); 934 // } 935 line = line.substring(0, 72); 936 } 937 938 line = line.substring(10, line.length()); 939 940 941 String[] fieldList = line.trim().split("\\s+"); 942 int fl = fieldList.length; 943 if (fl > 0) { 944 String field0 = fieldList[0]; 945 if (compndFieldValues.contains(field0)) { 946 continuationField = field0; 947 if (previousContinuationField.equals("")) { 948 previousContinuationField = continuationField; 949 } 950 } else if (field0.endsWith(";") && compndFieldValues.contains(field0.substring(0, field0.length()-1)) ) { 951 // the ':' character indicates the end of a field name and should be invalid as part the first data token 952 // e.g. obsolete file 1hhb has a malformed COMPND line that can only be caught with this kind of check 953 // UPDATE: There is no harm of having a ':' in the first data token. e.g. 3fdj contains a ':'. 954 // The intended case occurs only if the token is a key followed by a colon and a semicolon without spaces, e.g. "COMPND 2 MOLECULE:;" 955 logger.info("COMPND line does not follow the PDB 3.0 format. 
Note that COMPND parsing is not supported any longer in format 2.3 or earlier"); 956 return; 957 } 958 } else { 959 // the line will be added as data to the previous field 960 } 961 962 963 line = line.replace(continuationField, "").trim(); 964 965 StringTokenizer compndTokens = new StringTokenizer(line); 966 967 // System.out.println("PDBFileParser.pdb_COMPND_Handler: Tokenizing '" + line + "'"); 968 969 while (compndTokens.hasMoreTokens()) { 970 String token = compndTokens.nextToken(); 971 972 if (previousContinuationField.equals("")) { 973 previousContinuationField = continuationField; 974 } 975 976 if (previousContinuationField.equals(continuationField) 977 && compndFieldValues.contains(continuationField)) { 978 979 logger.debug("Still in field " + continuationField); 980 logger.debug("token = " + token); 981 982 continuationString = continuationString.concat(token + " "); 983 984 logger.debug("continuationString = " 985 + continuationString); 986 987 } 988 if (!continuationField.equals(previousContinuationField)) { 989 990 if (continuationString.equals("")) { 991 continuationString = token; 992 993 } else { 994 995 compndValueSetter(previousContinuationField, 996 continuationString); 997 previousContinuationField = continuationField; 998 continuationString = token + " "; 999 } 1000 } else if (ignoreCompndFieldValues.contains(token)) { 1001 // this field shall be ignored 1002 //continuationField = token; 1003 } 1004 } 1005 if (isLastCompndLine) { 1006 // final line in the section - finish off the compound 1007 // System.out.println("[pdb_COMPND_Handler] Final COMPND line - Finishing off final MolID header."); 1008 compndValueSetter(continuationField, continuationString); 1009 continuationString = ""; 1010 if (current_compound!=null) entities.add(current_compound); 1011 } 1012 } 1013 1014 /** 1015 * Set the value in the current molId object 1016 * @param field 1017 * @param value 1018 */ 1019 private void compndValueSetter(String field, String value) { 1020 1021 
value = value.trim().replace(";", ""); 1022 if (field.equals("MOL_ID:")) { 1023 1024 int i = -1; 1025 try { 1026 i = Integer.valueOf(value); 1027 } catch (NumberFormatException e){ 1028 logger.warn("Value '{}' does not look like a number, while trying to parse COMPND MOL_ID line.",value); 1029 } 1030 if (i>0 && prevMolId!=i) { 1031 1032 if (current_compound!=null) entities.add(current_compound); 1033 1034 logger.debug("Initialising new Compound with mol_id {}", i); 1035 1036 current_compound = new EntityInfo(); 1037 1038 current_compound.setMolId(i); 1039 1040 // we will set polymer for all defined compounds in PDB file (non-polymer compounds are not defined in header) - JD 2016-03-25 1041 current_compound.setType(EntityType.POLYMER); 1042 1043 prevMolId = i; 1044 } 1045 1046 } 1047 1048 // if for some reason (e.g. missing mol_id line) the current_compound is null we can't add anything to it, return 1049 if (current_compound==null) { 1050 return; 1051 } 1052 1053 if (field.equals("MOLECULE:")) { 1054 current_compound.setDescription(value); 1055 1056 } 1057 if (field.equals("CHAIN:")) { 1058 //System.out.println(value); 1059 StringTokenizer chainTokens = new StringTokenizer(value, ","); 1060 List<String> chains = new ArrayList<String>(); 1061 1062 while (chainTokens.hasMoreTokens()) { 1063 String chainID = chainTokens.nextToken().trim(); 1064 // NULL is used in old PDB files to represent empty chain DI 1065 if (chainID.equals("NULL")) 1066 chainID = " "; 1067 chains.add(chainID); 1068 } 1069 compoundMolIds2chainIds.put(current_compound.getMolId(),chains); 1070 1071 } 1072 if (field.equals("SYNONYM:")) { 1073 1074 StringTokenizer synonyms = new StringTokenizer(value, ","); 1075 List<String> names = new ArrayList<String>(); 1076 1077 while (synonyms.hasMoreTokens()) { 1078 names.add(synonyms.nextToken()); 1079 1080 current_compound.setSynonyms(names); 1081 } 1082 1083 } 1084 1085 if (field.equals("EC:")) { 1086 1087 StringTokenizer ecNumTokens = new 
StringTokenizer(value, ","); 1088 List<String> ecNums = new ArrayList<String>(); 1089 1090 while (ecNumTokens.hasMoreTokens()) { 1091 ecNums.add(ecNumTokens.nextToken()); 1092 1093 current_compound.setEcNums(ecNums); 1094 } 1095 1096 } 1097 if (field.equals("FRAGMENT:")) { 1098 1099 current_compound.setFragment(value); 1100 1101 } 1102 if (field.equals("ENGINEERED:")) { 1103 1104 current_compound.setEngineered(value); 1105 1106 } 1107 if (field.equals("MUTATION:")) { 1108 1109 current_compound.setMutation(value); 1110 1111 } 1112 if (field.equals("BIOLOGICAL_UNIT:")) { 1113 1114 current_compound.setBiologicalUnit(value); 1115 1116 } 1117 if (field.equals("OTHER_DETAILS:")) { 1118 1119 current_compound.setDetails(value); 1120 1121 } 1122 1123 } 1124 1125 1126 /** 1127 * Handler for 1128 * SOURCE Record format 1129 * 1130 * The SOURCE record specifies the biological and/or chemical source of each biological molecule in the entry. Sources are described by both the common name and the scientific name, e.g., genus and species. Strain and/or cell-line for immortalized cells are given when they help to uniquely identify the biological entity studied. 1131 * Record Format 1132 * <pre> 1133 * COLUMNS DATA TYPE FIELD DEFINITION 1134 * ------------------------------------------------------------------------------- 1135 * 1 - 6 Record name "SOURCE" 1136 * 9 - 10 Continuation continuation Allows concatenation of multiple records. 1137 * 11 - 70 Specification srcName Identifies the source of the macromolecule in 1138 * list a token: value format. 1139 * </pre> 1140 * @param line the line to be parsed 1141 */ 1142 private void pdb_SOURCE_Handler(String line) { 1143 // works in the same way as the pdb_COMPND_Handler. 
1144 String continuationNr = line.substring(9, 10).trim(); 1145 1146 1147 1148 logger.debug("current continuationNo is " 1149 + continuationNr); 1150 logger.debug("previousContinuationField is " 1151 + previousContinuationField); 1152 logger.debug("current continuationField is " 1153 + continuationField); 1154 logger.debug("current continuationString is " 1155 + continuationString); 1156 logger.debug("current compound is " 1157 + current_compound); 1158 1159 1160 // following the docs, the last valid character should be 79, chop off the rest 1161 if (line.length() > 79) { 1162 line = line.substring(0, 79); 1163 } 1164 1165 line = line.substring(10, line.length()); 1166 1167 logger.debug("LINE: >" + line + "<"); 1168 1169 String[] fieldList = line.split("\\s+"); 1170 1171 if (!fieldList[0].equals("") 1172 && sourceFieldValues.contains(fieldList[0])) { 1173 // System.out.println("[PDBFileParser.pdb_COMPND_Handler] Setting continuationField to '" + fieldList[0] + "'"); 1174 continuationField = fieldList[0]; 1175 if (previousContinuationField.equals("")) { 1176 previousContinuationField = continuationField; 1177 } 1178 1179 } else if ((fieldList.length > 1) && ( sourceFieldValues.contains(fieldList[1]))) { 1180 // System.out.println("[PDBFileParser.pdb_COMPND_Handler] Setting continuationField to '" + fieldList[1] + "'"); 1181 continuationField = fieldList[1]; 1182 if (previousContinuationField.equals("")) { 1183 previousContinuationField = continuationField; 1184 } 1185 1186 } else { 1187 if (continuationNr.equals("")) { 1188 1189 logger.debug("looks like an old PDB file"); 1190 1191 continuationField = "MOLECULE:"; 1192 if (previousContinuationField.equals("")) { 1193 previousContinuationField = continuationField; 1194 } 1195 } 1196 1197 } 1198 1199 line = line.replace(continuationField, "").trim(); 1200 1201 StringTokenizer compndTokens = new StringTokenizer(line); 1202 1203 // System.out.println("PDBFileParser.pdb_COMPND_Handler: Tokenizing '" + line + "'"); 1204 
1205 while (compndTokens.hasMoreTokens()) { 1206 String token = compndTokens.nextToken(); 1207 1208 if (previousContinuationField.equals("")) { 1209 // System.out.println("previousContinuationField is empty. Setting to : " + continuationField); 1210 previousContinuationField = continuationField; 1211 } 1212 1213 if (previousContinuationField.equals(continuationField) 1214 && sourceFieldValues.contains(continuationField)) { 1215 1216 logger.debug("Still in field " + continuationField); 1217 1218 continuationString = continuationString.concat(token + " "); 1219 1220 logger.debug("continuationString = " 1221 + continuationString); 1222 } 1223 if (!continuationField.equals(previousContinuationField)) { 1224 1225 if (continuationString.equals("")) { 1226 continuationString = token; 1227 1228 } else { 1229 1230 sourceValueSetter(previousContinuationField, 1231 continuationString); 1232 previousContinuationField = continuationField; 1233 continuationString = token + " "; 1234 } 1235 } else if (ignoreCompndFieldValues.contains(token)) { 1236 // this field shall be ignored 1237 //continuationField = token; 1238 } 1239 } 1240 if (isLastSourceLine) { 1241 // final line in the section - finish off the compound 1242 // System.out.println("[pdb_SOURCE_Handler] Final SOURCE line - Finishing off final MolID header."); 1243 sourceValueSetter(continuationField, continuationString); 1244 continuationString = ""; 1245 //compounds.add(current_compound); 1246 } 1247 1248 } 1249 1250 1251 /** 1252 * Set the value in the current molId object 1253 * 1254 * @param field 1255 * @param value 1256 */ 1257 private void sourceValueSetter(String field, String value) { 1258 1259 value = value.trim().replace(";", ""); 1260 // System.out.println("[sourceValueSetter] " + field); 1261 if (field.equals("MOL_ID:")) { 1262 1263 try { 1264 current_compound = entities.get(Integer.valueOf(value) - 1); 1265 } catch (NumberFormatException e){ 1266 logger.info("could not process SOURCE MOL_ID record 
correctly:" + e.getMessage()); 1267 return; 1268 } 1269 1270 1271 // System.out.println("[sourceValueSetter] Fetching compound " + value + " " + current_compound.getMolId()); 1272 1273 } 1274 if (field.equals("SYNTHETIC:")) { 1275 current_compound.setSynthetic(value); 1276 } else if (field.equals("FRAGMENT:")) { 1277 current_compound.setFragment(value); 1278 } else if (field.equals("ORGANISM_SCIENTIFIC:")) { 1279 current_compound.setOrganismScientific(value); 1280 } else if (field.equals("ORGANISM_TAXID:")) { 1281 current_compound.setOrganismTaxId(value); 1282 } else if (field.equals("ORGANISM_COMMON:")) { 1283 current_compound.setOrganismCommon(value); 1284 } else if (field.equals("STRAIN:")) { 1285 current_compound.setStrain(value); 1286 } else if (field.equals("VARIANT:")) { 1287 current_compound.setVariant(value); 1288 } else if (field.equals("CELL_LINE:")) { 1289 current_compound.setCellLine(value); 1290 } else if (field.equals("ATCC:")) { 1291 current_compound.setAtcc(value); 1292 } else if (field.equals("ORGAN:")) { 1293 current_compound.setOrgan(value); 1294 } else if (field.equals("TISSUE:")) { 1295 current_compound.setTissue(value); 1296 } else if (field.equals("CELL:")) { 1297 current_compound.setCell(value); 1298 } else if (field.equals("ORGANELLE:")) { 1299 current_compound.setOrganelle(value); 1300 } else if (field.equals("SECRETION:")) { 1301 current_compound.setSecretion(value); 1302 } else if (field.equals("GENE:")) { 1303 current_compound.setGene(value); 1304 } else if (field.equals("CELLULAR_LOCATION:")) { 1305 current_compound.setCellularLocation(value); 1306 } else if (field.equals("EXPRESSION_SYSTEM:")) { 1307 current_compound.setExpressionSystem(value); 1308 } else if (field.equals("EXPRESSION_SYSTEM_TAXID:")) { 1309 current_compound.setExpressionSystemTaxId(value); 1310 } else if (field.equals("EXPRESSION_SYSTEM_STRAIN:")) { 1311 current_compound.setExpressionSystemStrain(value); 1312 } else if (field.equals("EXPRESSION_SYSTEM_VARIANT:")) { 
1313 current_compound.setExpressionSystemVariant(value); 1314 } else if (field.equals("EXPRESSION_SYSTEM_CELL_LINE:")) { 1315 current_compound.setExpressionSystemCellLine(value); 1316 } else if (field.equals("EXPRESSION_SYSTEM_ATCC_NUMBER:")) { 1317 current_compound.setExpressionSystemAtccNumber(value); 1318 } else if (field.equals("EXPRESSION_SYSTEM_ORGAN:")) { 1319 current_compound.setExpressionSystemOrgan(value); 1320 } else if (field.equals("EXPRESSION_SYSTEM_TISSUE:")) { 1321 current_compound.setExpressionSystemTissue(value); 1322 } else if (field.equals("EXPRESSION_SYSTEM_CELL:")) { 1323 current_compound.setExpressionSystemCell(value); 1324 } else if (field.equals("EXPRESSION_SYSTEM_ORGANELLE:")) { 1325 current_compound.setExpressionSystemOrganelle(value); 1326 } else if (field.equals("EXPRESSION_SYSTEM_CELLULAR_LOCATION:")) { 1327 current_compound.setExpressionSystemCellularLocation(value); 1328 } else if (field.equals("EXPRESSION_SYSTEM_VECTOR_TYPE:")) { 1329 current_compound.setExpressionSystemVectorType(value); 1330 } else if (field.equals("EXPRESSION_SYSTEM_VECTOR:")) { 1331 current_compound.setExpressionSystemVector(value); 1332 } else if (field.equals("EXPRESSION_SYSTEM_PLASMID:")) { 1333 current_compound.setExpressionSystemPlasmid(value); 1334 } else if (field.equals("EXPRESSION_SYSTEM_GENE:")) { 1335 current_compound.setExpressionSystemGene(value); 1336 } else if (field.equals("OTHER_DETAILS:")) { 1337 current_compound.setExpressionSystemOtherDetails(value); 1338 } 1339 1340 } 1341 1342 /** 1343 * Handler for REMARK lines 1344 */ 1345 private void pdb_REMARK_Handler(String line) { 1346 1347 if ( line == null || line.length() < 11) 1348 return; 1349 1350 1351 if (line.startsWith("REMARK 800")) { 1352 pdb_REMARK_800_Handler(line); 1353 1354 } else if ( line.startsWith("REMARK 350")){ 1355 1356 if ( params.isParseBioAssembly()) { 1357 1358 if (bioAssemblyParser == null){ 1359 bioAssemblyParser = new PDBBioAssemblyParser(); 1360 } 1361 1362 
bioAssemblyParser.pdb_REMARK_350_Handler(line); 1363 } 1364 1365 // REMARK 3 (for R free) 1366 // note: if more than 1 value present (occurring in hybrid experimental technique entries, e.g. 3ins, 4n9m) 1367 // then last one encountered will be taken 1368 } else if (line.startsWith("REMARK 3 FREE R VALUE")) { 1369 1370 // Rfree annotation is not very consistent in PDB format, it varies depending on the software 1371 // Here we follow this strategy: 1372 // a) take the '(NO CUTOFF)' value if the only one available (shelx software, e.g. 1x7q) 1373 // b) don't take it if also a line without '(NO CUTOFF)' is present (CNX software, e.g. 3lak) 1374 1375 Pattern pR = Pattern.compile("^REMARK 3 FREE R VALUE\\s+(?:\\(NO CUTOFF\\))?\\s+:\\s+(\\d?\\.\\d+).*"); 1376 Matcher mR = pR.matcher(line); 1377 if (mR.matches()) { 1378 try { 1379 rfreeNoCutoffLine = Float.parseFloat(mR.group(1)); 1380 } catch (NumberFormatException e) { 1381 logger.info("Rfree value "+mR.group(1)+" does not look like a number, will ignore it"); 1382 } 1383 } 1384 pR = Pattern.compile("^REMARK 3 FREE R VALUE\\s+:\\s+(\\d?\\.\\d+).*"); 1385 mR = pR.matcher(line); 1386 if (mR.matches()) { 1387 try { 1388 rfreeStandardLine = Float.parseFloat(mR.group(1)); 1389 } catch (NumberFormatException e) { 1390 logger.info("Rfree value '{}' does not look like a number, will ignore it", mR.group(1)); 1391 } 1392 } 1393 1394 // REMARK 3 RESOLUTION (contains more info than REMARK 2, for instance multiple resolutions in hybrid experimental technique entries) 1395 // note: if more than 1 value present (occurring in hybrid experimental technique entries, e.g. 
3ins, 4n9m) 1396 // then last one encountered will be taken 1397 } else if (line.startsWith("REMARK 3 RESOLUTION RANGE HIGH")){ 1398 Pattern pR = Pattern.compile("^REMARK 3 RESOLUTION RANGE HIGH \\(ANGSTROMS\\) :\\s+(\\d+\\.\\d+).*"); 1399 Matcher mR = pR.matcher(line); 1400 if (mR.matches()) { 1401 try { 1402 float res = Float.parseFloat(mR.group(1)); 1403 if (pdbHeader.getResolution()!=PDBHeader.DEFAULT_RESOLUTION) { 1404 logger.warn("More than 1 resolution value present, will use last one {} and discard previous {} " 1405 ,mR.group(1), String.format("%4.2f",pdbHeader.getResolution())); 1406 } 1407 pdbHeader.setResolution(res); 1408 } catch (NumberFormatException e) { 1409 logger.info("Could not parse resolution '{}', ignoring it",mR.group(1)); 1410 } 1411 } 1412 } 1413 1414 } 1415 1416 1417 1418 1419 1420 1421 /** 1422 * Handler for 1423 * EXPDTA Record Format 1424 <pre> 1425 COLUMNS DATA TYPE FIELD DEFINITION 1426 ------------------------------------------------------------------------------- 1427 1 - 6 Record name "EXPDTA" 1428 9 - 10 Continuation continuation Allows concatenation of multiple 1429 records. 1430 11 - 70 SList technique The experimental technique(s) with 1431 optional comment describing the 1432 sample or experiment. 1433 1434 allowed techniques are: 1435 ELECTRON DIFFRACTION 1436 FIBER DIFFRACTION 1437 FLUORESCENCE TRANSFER 1438 NEUTRON DIFFRACTION 1439 NMR 1440 THEORETICAL MODEL 1441 X-RAY DIFFRACTION 1442 </pre> 1443 */ 1444 private void pdb_EXPDTA_Handler(String line) { 1445 1446 String technique ; 1447 if (line.length() > 69) 1448 technique = line.substring (10, 70).trim() ; 1449 else 1450 technique = line.substring(10).trim(); 1451 1452 for (String singleTechnique: technique.split(";\\s+")) { 1453 pdbHeader.setExperimentalTechnique(singleTechnique); 1454 } 1455 1456 1457 } 1458 1459 /** 1460 * Handler for 1461 * CRYST1 Record Format 1462 * The CRYST1 record presents the unit cell parameters, space group, and Z value. 
1463 * If the entry describes a structure determined by a technique other than X-ray crystallography, 1464 * CRYST1 contains a = b = c = 1.0, alpha = beta = gamma = 90 degrees, space group = P 1, and Z =1. 1465 * <pre> 1466 * COLUMNS DATA TYPE FIELD DEFINITION 1467 * ------------------------------------------------------------- 1468 * 1 - 6 Record name "CRYST1" 1469 * 7 - 15 Real(9.3) a a (Angstroms). 1470 * 16 - 24 Real(9.3) b b (Angstroms). 1471 * 25 - 33 Real(9.3) c c (Angstroms). 1472 * 34 - 40 Real(7.2) alpha alpha (degrees). 1473 * 41 - 47 Real(7.2) beta beta (degrees). 1474 * 48 - 54 Real(7.2) gamma gamma (degrees). 1475 * 56 - 66 LString sGroup Space group. 1476 * 67 - 70 Integer z Z value. 1477 * </pre> 1478 */ 1479 private void pdb_CRYST1_Handler(String line) { 1480 // for badly formatted files (e.g. phenix-produced ones), there's no z and the min length is 58 (e.g. for SG 'P 1') 1481 if (line.length() < 58) { 1482 logger.warn("CRYST1 record has fewer than 58 columns: will ignore it"); 1483 return; 1484 } 1485 1486 float a; 1487 float b; 1488 float c; 1489 float alpha; 1490 float beta; 1491 float gamma; 1492 String spaceGroup = ""; 1493 1494 try { 1495 a = Float.parseFloat(line.substring(6,15).trim()); 1496 b = Float.parseFloat(line.substring(15,24).trim()); 1497 c = Float.parseFloat(line.substring(24,33).trim()); 1498 alpha = Float.parseFloat(line.substring(33,40).trim()); 1499 beta = Float.parseFloat(line.substring(40,47).trim()); 1500 gamma = Float.parseFloat(line.substring(47,54).trim()); 1501 } catch (NumberFormatException e) { 1502 logger.info("could not parse CRYST1 record ("+e.getMessage()+") from line and ignoring it " + line); 1503 return ; 1504 } 1505 if (line.length()>=66) { 1506 // for well formatted files 1507 spaceGroup = line.substring(55,66).trim(); 1508 } else { 1509 // for not-so-well formatted files, e.g. 
phenix-produced ones: they lack a Z value 1510 spaceGroup = line.substring(55,line.length()).trim(); 1511 } 1512 1513 CrystalCell xtalCell = new CrystalCell(); 1514 xtalCell.setA(a); 1515 xtalCell.setB(b); 1516 xtalCell.setC(c); 1517 xtalCell.setAlpha(alpha); 1518 xtalCell.setBeta(beta); 1519 xtalCell.setGamma(gamma); 1520 1521 if (!xtalCell.isCellReasonable()) { 1522 // If the entry describes a structure determined by a technique other than X-ray crystallography, 1523 // CRYST1 contains a = b = c = 1.0, alpha = beta = gamma = 90 degrees, space group = P 1, and Z =1. 1524 // if so we don't add the crystal cell and it remains null 1525 logger.debug("The crystal cell read from file does not have reasonable dimensions (at least one dimension is below {}), discarding it.", 1526 CrystalCell.MIN_VALID_CELL_SIZE); 1527 } else { 1528 crystallographicInfo.setCrystalCell(xtalCell); 1529 } 1530 1531 SpaceGroup sg = SymoplibParser.getSpaceGroup(spaceGroup); 1532 if (sg==null) { 1533 logger.warn("Space group '"+spaceGroup+"' not recognised as a standard space group"); 1534 crystallographicInfo.setNonStandardSg(true); 1535 } else { 1536 crystallographicInfo.setSpaceGroup(sg); 1537 crystallographicInfo.setNonStandardSg(false); 1538 } 1539 } 1540 1541 /** 1542 * Handler for MTRIXn records. They specify extra NCS operators (usually in virus entries) 1543 * 1544 * See http://www.wwpdb.org/documentation/format33/sect8.html#MTRIXn 1545 * <pre> 1546 * COLUMNS DATA TYPE FIELD DEFINITION 1547 * ------------------------------------------------------------- 1548 * 1549 * 1 - 6 Record name "MTRIXn" n=1, 2, or 3 1550 * 8 - 10 Integer serial Serial number. 
1551 * 11 - 20 Real(10.6) m[n][1] Mn1 1552 * 21 - 30 Real(10.6) m[n][2] Mn2 1553 * 31 - 40 Real(10.6) m[n][3] Mn3 1554 * 46 - 55 Real(10.5) v[n] Vn 1555 * 60 Integer iGiven 1 1556 * 1557 * </pre> 1558 * Note that we ignore operators with iGiven==1 1559 * 1560 * @param line 1561 */ 1562 private void pdb_MTRIXn_Handler(String line) { 1563 1564 // don't process incomplete records 1565 if (line.length() < 55) { 1566 logger.info("MTRIXn record has fewer than 55 columns: will ignore it"); 1567 return; 1568 } 1569 1570 1571 try { 1572 1573 int rowIndex = Integer.parseInt(line.substring(5,6)); 1574 double col1Value = Double.parseDouble(line.substring(10,20)); 1575 double col2Value = Double.parseDouble(line.substring(20,30)); 1576 double col3Value = Double.parseDouble(line.substring(30,40)); 1577 double translValue = Double.parseDouble(line.substring(45,55)); 1578 int iGiven = 0; 1579 if (line.length()>=60 && !line.substring(59,60).trim().isEmpty()) { 1580 iGiven = Integer.parseInt(line.substring(59,60)); 1581 } 1582 1583 if (iGiven == 1) return; 1584 1585 if (ncsOperators==null) { 1586 // we initialise on first pass 1587 ncsOperators = new ArrayList<Matrix4d>(); 1588 } 1589 1590 if (currentNcsOp==null) { 1591 currentNcsOp = new Matrix4d(1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1); // initialised to identity 1592 } 1593 1594 currentNcsOp.setElement(rowIndex-1, 0, col1Value); 1595 currentNcsOp.setElement(rowIndex-1, 1, col2Value); 1596 currentNcsOp.setElement(rowIndex-1, 2, col3Value); 1597 currentNcsOp.setElement(rowIndex-1, 3, translValue); 1598 1599 1600 if (rowIndex==3) { 1601 ncsOperators.add(currentNcsOp); 1602 // we initialise for next matrix to come 1603 currentNcsOp = new Matrix4d(1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1); // initialised to identity 1604 } 1605 1606 } catch (NumberFormatException e) { 1607 logger.info("Could not parse a number in MTRIXn record ("+e.getMessage()+") from line: >" + line+"<"); 1608 } 1609 } 1610 1611 /** 1612 * Handler for ATOM. 
1613 * Record Format: 1614 * 1615 * <pre> 1616 * ATOM 1 N ASP A 15 110.964 24.941 59.191 1.00 83.44 N 1617 * 1618 * COLUMNS DATA TYPE FIELD DEFINITION 1619 * --------------------------------------------------------------------------------- 1620 * 1 - 6 Record name "ATOM " 1621 * 7 - 11 Integer serial Atom serial number. 1622 * 13 - 16 Atom name Atom name. 1623 * 17 Character altLoc Alternate location indicator. 1624 * 18 - 20 Residue name resName Residue name. 1625 * 22 Character chainID Chain identifier. 1626 * 23 - 26 Integer resSeq Residue sequence number. 1627 * 27 AChar iCode Code for insertion of residues. 1628 * 31 - 38 Real(8.3) x Orthogonal coordinates for X in Angstroms. 1629 * 39 - 46 Real(8.3) y Orthogonal coordinates for Y in Angstroms. 1630 * 47 - 54 Real(8.3) z Orthogonal coordinates for Z in Angstroms. 1631 * 55 - 60 Real(6.2) occupancy Occupancy. 1632 * 61 - 66 Real(6.2) tempFactor Temperature factor. 1633 * 73 - 76 LString(4) segID Segment identifier, left-justified. 1634 * 77 - 78 LString(2) element Element symbol, right-justified. 1635 * 79 - 80 LString(2) charge Charge on the atom. 
1636 * </pre> 1637 */ 1638 private void pdb_ATOM_Handler(String line) { 1639 1640 if ( params.isHeaderOnly()) 1641 return; 1642 1643 // let's first get the chain name which will serve to identify if we are starting a new molecule 1644 String chainName = line.substring(21,22); 1645 1646 if (chainName.equals(" ")) { 1647 blankChainIdsPresent = true; 1648 } 1649 1650 if (currentChain!=null && !currentChain.getName().equals(chainName)) { 1651 // new chain name: another molecule coming 1652 startOfMolecule = true; 1653 } 1654 1655 if (startOfMolecule) { 1656 // we add last chain if there was one 1657 if (currentChain!=null) { 1658 currentModel.add(currentChain); 1659 // let's not forget adding the last group to the finishing chain 1660 if (currentGroup!=null) { 1661 currentChain.addGroup(currentGroup); 1662 } 1663 } 1664 // we initialise the new molecule to come 1665 currentChain = new ChainImpl(); 1666 // note that the chainId (asym id) is set properly later in assignAsymIds 1667 currentChain.setId(chainName); 1668 currentChain.setName(chainName); 1669 1670 } 1671 1672 if (startOfModel) { 1673 // we add last model if there was one 1674 if (currentModel!=null) { 1675 allModels.add(currentModel); 1676 } 1677 // we initialise the model to come 1678 currentModel = new ArrayList<>(); 1679 } 1680 1681 1682 // let's get the residue number and see if we need to start a new group 1683 1684 String groupCode3 = line.substring(17,20).trim(); 1685 String resNum = line.substring(22,26).trim(); 1686 Character iCode = line.substring(26,27).charAt(0); 1687 if ( iCode == ' ') 1688 iCode = null; 1689 ResidueNumber residueNumber = new ResidueNumber(chainName, Integer.valueOf(resNum), iCode); 1690 1691 //recordName groupCode3 1692 //| | resNum 1693 //| | | iCode 1694 //| | | | | || 1695 //ATOM 1 N ASP A 15 110.964 24.941 59.191 1.00 83.44 N 1696 //ATOM 1964 N ARG H 221A 5.963 -16.715 27.669 1.00 28.59 N 1697 1698 Character aminoCode1 = StructureTools.get1LetterCode(groupCode3); 1699 1700 
String recordName = line.substring (0, 6).trim (); 1701 1702 boolean isHetAtomInFile = false; 1703 1704 if (recordName.equals("HETATM") ){ 1705 // HETATOM RECORDS are treated slightly differently 1706 // some modified amino acids that we want to treat as amino acids 1707 // can be found as HETATOM records 1708 if ( aminoCode1 != null && aminoCode1.equals(StructureTools.UNKNOWN_GROUP_LABEL)) 1709 aminoCode1 = null; 1710 1711 isHetAtomInFile = true; 1712 } 1713 1714 if ( startOfMolecule) { 1715 1716 currentGroup = getNewGroup(recordName, aminoCode1, groupCode3); 1717 1718 currentGroup.setPDBName(groupCode3); 1719 currentGroup.setResidueNumber(residueNumber); 1720 currentGroup.setHetAtomInFile(isHetAtomInFile); 1721 1722 } 1723 1724 // resetting states 1725 startOfModel = false; 1726 startOfMolecule = false; 1727 1728 1729 Character altLoc = new Character(line.substring (16, 17).charAt(0)); 1730 Group altGroup = null; 1731 1732 1733 // check if residue number is the same ... 1734 if ( ! residueNumber.equals(currentGroup.getResidueNumber())) { 1735 1736 currentChain.addGroup(currentGroup); 1737 currentGroup.trimToSize(); 1738 1739 currentGroup = getNewGroup(recordName, aminoCode1, groupCode3); 1740 1741 currentGroup.setPDBName(groupCode3); 1742 currentGroup.setResidueNumber(residueNumber); 1743 currentGroup.setHetAtomInFile(isHetAtomInFile); 1744 1745 } else { 1746 // same residueNumber, but altLocs... 1747 1748 // test altLoc 1749 if ( ! altLoc.equals(' ')) { 1750 logger.debug("found altLoc! " + currentGroup + " " + altGroup); 1751 altGroup = getCorrectAltLocGroup( altLoc,recordName,aminoCode1,groupCode3); 1752 if ( altGroup.getChain() == null) { 1753 // need to set current chain 1754 altGroup.setChain(currentChain); 1755 } 1756 1757 } 1758 } 1759 1760 atomCount++; 1761 1762 if ( atomCount == atomCAThreshold ) { 1763 // throw away the SEQRES lines - too much to deal with... 
1764 logger.warn("more than " + atomCAThreshold + " atoms in this structure, ignoring the SEQRES lines"); 1765 seqResChains.clear(); 1766 1767 switchCAOnly(); 1768 1769 } 1770 1771 1772 1773 if ( atomCount == loadMaxAtoms){ 1774 logger.warn("File has more atoms than max specified in parsing parameters ({}). Ignoring atoms after line: {}", loadMaxAtoms, line); 1775 return; 1776 } 1777 if ( atomCount > loadMaxAtoms){ 1778 return; 1779 } 1780 1781 1782 // 1 2 3 4 5 6 1783 //012345678901234567890123456789012345678901234567890123456789 1784 //ATOM 1 N MET 1 20.154 29.699 5.276 1.0 1785 //ATOM 112 CA ASP 112 41.017 33.527 28.371 1.00 0.00 1786 //ATOM 53 CA MET 7 23.772 33.989 -21.600 1.00 0.00 C 1787 //ATOM 112 CA ASP 112 37.613 26.621 33.571 0 0 1788 1789 1790 String fullname = line.substring (12, 16); 1791 1792 // check for CA only if requested 1793 if ( parseCAonly ){ 1794 // yes , user wants to get CA only 1795 // only parse CA atoms... 1796 if (! fullname.equals(" CA ")){ 1797 //System.out.println("ignoring " + line); 1798 atomCount--; 1799 return; 1800 } 1801 } 1802 1803 if ( params.getAcceptedAtomNames() != null) { 1804 1805 boolean found = false; 1806 for (String ok : params.getAcceptedAtomNames()){ 1807 //System.out.println(ok + "< >" + fullname +"<"); 1808 1809 if ( ok.equals(fullname.trim())) { 1810 found = true; 1811 break; 1812 } 1813 } 1814 if ( ! 
found) { 1815 atomCount--; 1816 return; 1817 } 1818 } 1819 // create new atom 1820 1821 int pdbnumber = Integer.parseInt (line.substring (6, 11).trim ()); 1822 AtomImpl atom = new AtomImpl() ; 1823 atom.setPDBserial(pdbnumber) ; 1824 1825 atom.setAltLoc(altLoc); 1826 atom.setName(fullname.trim()); 1827 1828 double x = Double.parseDouble (line.substring (30, 38).trim()); 1829 double y = Double.parseDouble (line.substring (38, 46).trim()); 1830 double z = Double.parseDouble (line.substring (46, 54).trim()); 1831 1832 double[] coords = new double[3]; 1833 coords[0] = x ; 1834 coords[1] = y ; 1835 coords[2] = z ; 1836 atom.setCoords(coords); 1837 1838 float occu = 1.0f; 1839 if ( line.length() > 59 ) { 1840 try { 1841 // occu and tempf are sometimes not used :-/ 1842 occu = Float.parseFloat (line.substring (54, 60).trim()); 1843 } catch (NumberFormatException e){} 1844 } 1845 1846 float tempf = 0.0f; 1847 if ( line.length() > 65) { 1848 try { 1849 tempf = Float.parseFloat (line.substring (60, 66).trim()); 1850 } catch (NumberFormatException e){} 1851 } 1852 1853 atom.setOccupancy( occu ); 1854 atom.setTempFactor( tempf ); 1855 1856 1857 1858 1859 // Parse element from the element field. If this field is 1860 // missing (i.e. misformatted PDB file), then parse the 1861 // element from the chemical component. 1862 Element element = Element.R; 1863 boolean guessElement = true; 1864 if ( line.length() > 77 ) { 1865 // parse element from element field 1866 String elementSymbol = line.substring(76, 78).trim(); 1867 if (elementSymbol.isEmpty()) { 1868 logger.info("Element column was empty for atom {} {}. Assigning atom element " 1869 + "from Chemical Component Dictionary information", fullname.trim(), pdbnumber); 1870 } else { 1871 1872 try { 1873 element = Element.valueOfIgnoreCase(elementSymbol); 1874 guessElement = false; 1875 } catch (IllegalArgumentException e){ 1876 logger.info("Element {} of atom {} {} was not recognised. 
Assigning atom element " 1877 + "from Chemical Component Dictionary information", elementSymbol, 1878 fullname.trim(), pdbnumber); 1879 } 1880 } 1881 } else { 1882 logger.info("Missformatted PDB file: element column of atom {} {} is not present. " 1883 + "Assigning atom element from Chemical Component Dictionary information", 1884 fullname.trim(), pdbnumber); 1885 } 1886 if (guessElement) { 1887 String elementSymbol = null; 1888 if (currentGroup.getChemComp() != null) { 1889 for (ChemCompAtom a : currentGroup.getChemComp().getAtoms()) { 1890 if (a.getAtomId().equals(fullname.trim())) { 1891 elementSymbol = a.getTypeSymbol(); 1892 break; 1893 } 1894 } 1895 if (elementSymbol == null) { 1896 logger.info("Atom name {} was not found in the Chemical Component Dictionary information of {}. " 1897 + "Assigning generic element R to it", fullname.trim(), currentGroup.getPDBName()); 1898 } else { 1899 try { 1900 element = Element.valueOfIgnoreCase(elementSymbol); 1901 } catch (IllegalArgumentException e) { 1902 // this can still happen for cases like UNK 1903 logger.info("Element symbol {} found in chemical component dictionary for Atom {} {} could not be recognised as a known element. " 1904 + "Assigning generic element R to it", elementSymbol, fullname.trim(), pdbnumber); 1905 } 1906 } 1907 } else { 1908 logger.warn("Chemical Component Dictionary information was not found for Atom name {}. " 1909 + "Assigning generic element R to it", fullname.trim()); 1910 } 1911 1912 } 1913 atom.setElement(element); 1914 1915 1916 //see if chain_id is one of the previous chains ... 1917 if ( altGroup != null) { 1918 altGroup.addAtom(atom); 1919 altGroup = null; 1920 } 1921 else { 1922 currentGroup.addAtom(atom); 1923 } 1924 1925 1926 // make sure that main group has all atoms 1927 // GitHub issue: #76 1928 if ( ! 
currentGroup.hasAtom(atom.getName())) { 1929 currentGroup.addAtom(atom); 1930 } 1931 1932 1933 1934 } 1935 1936 1937 private Group getCorrectAltLocGroup( Character altLoc, 1938 String recordName, Character aminoCode1, String groupCode3) { 1939 1940 // see if we know this altLoc already; 1941 List<Atom> atoms = currentGroup.getAtoms(); 1942 if ( atoms.size() > 0) { 1943 Atom a1 = atoms.get(0); 1944 // we are just adding atoms to the current group 1945 // probably there is a second group following later... 1946 if (a1.getAltLoc().equals(altLoc)) { 1947 1948 return currentGroup; 1949 } 1950 } 1951 1952 List<Group> altLocs = currentGroup.getAltLocs(); 1953 for ( Group altLocG : altLocs ){ 1954 atoms = altLocG.getAtoms(); 1955 if ( atoms.size() > 0) { 1956 for ( Atom a1 : atoms) { 1957 if (a1.getAltLoc().equals( altLoc)) { 1958 1959 return altLocG; 1960 } 1961 } 1962 } 1963 } 1964 1965 // no matching altLoc group found. 1966 // build it up. 1967 1968 if ( groupCode3.equals(currentGroup.getPDBName())) { 1969 if ( currentGroup.getAtoms().size() == 0) { 1970 //System.out.println("current group is empty " + current_group + " " + altLoc); 1971 return currentGroup; 1972 } 1973 //System.out.println("cloning current group " + current_group + " " + current_group.getAtoms().get(0).getAltLoc() + " altLoc " + altLoc); 1974 Group altLocG = (Group) currentGroup.clone(); 1975 // drop atoms from cloned group... 
1976 // https://redmine.open-bio.org/issues/3307 1977 altLocG.setAtoms(new ArrayList<Atom>()); 1978 altLocG.getAltLocs().clear(); 1979 currentGroup.addAltLoc(altLocG); 1980 return altLocG; 1981 } 1982 1983 // System.out.println("new group " + recordName + " " + aminoCode1 + " " +groupCode3); 1984 Group altLocG = getNewGroup(recordName,aminoCode1,groupCode3); 1985 1986 1987 altLocG.setPDBName(groupCode3); 1988 1989 altLocG.setResidueNumber(currentGroup.getResidueNumber()); 1990 currentGroup.addAltLoc(altLocG); 1991 return altLocG; 1992 } 1993 1994 private void switchCAOnly(){ 1995 parseCAonly = true; 1996 1997 1998 currentModel = CAConverter.getRepresentativeAtomsOnly(currentModel); 1999 2000 for ( int i =0; i< structure.nrModels() ; i++){ 2001 // iterate over all known models ... 2002 List<Chain> model = structure.getModel(i); 2003 model = CAConverter.getRepresentativeAtomsOnly(model); 2004 structure.setModel(i,model); 2005 } 2006 2007 currentChain = CAConverter.getRepresentativeAtomsOnly(currentChain); 2008 2009 } 2010 2011 2012 /** safes repeating a few lines ... */ 2013 private Integer conect_helper (String line,int start,int end) { 2014 if (line.length() < end) return null; 2015 2016 String sbond = line.substring(start,end).trim(); 2017 int bond = -1 ; 2018 Integer b = null ; 2019 2020 if ( ! 
sbond.equals("")) { 2021 bond = Integer.parseInt(sbond); 2022 b = new Integer(bond); 2023 } 2024 2025 return b ; 2026 } 2027 2028 /** 2029 * Handler for CONECT Record Format 2030 <pre> 2031 COLUMNS DATA TYPE FIELD DEFINITION 2032 --------------------------------------------------------------------------------- 2033 1 - 6 Record name "CONECT" 2034 7 - 11 Integer serial Atom serial number 2035 12 - 16 Integer serial Serial number of bonded atom 2036 17 - 21 Integer serial Serial number of bonded atom 2037 22 - 26 Integer serial Serial number of bonded atom 2038 27 - 31 Integer serial Serial number of bonded atom 2039 32 - 36 Integer serial Serial number of hydrogen bonded 2040 atom 2041 37 - 41 Integer serial Serial number of hydrogen bonded 2042 atom 2043 42 - 46 Integer serial Serial number of salt bridged 2044 atom 2045 47 - 51 Integer serial Serial number of hydrogen bonded 2046 atom 2047 52 - 56 Integer serial Serial number of hydrogen bonded 2048 atom 2049 57 - 61 Integer serial Serial number of salt bridged 2050 atom 2051 </pre> 2052 */ 2053 private void pdb_CONECT_Handler(String line) { 2054 2055 if ( atomOverflow) { 2056 return ; 2057 } 2058 if (params.isHeaderOnly()) { 2059 return; 2060 } 2061 2062 // this try .. catch is e.g. to catch 1gte which has wrongly formatted lines... 
2063 try { 2064 int atomserial = Integer.parseInt (line.substring(6 ,11).trim()); 2065 Integer bond1 = conect_helper(line,11,16); 2066 Integer bond2 = conect_helper(line,16,21); 2067 Integer bond3 = conect_helper(line,21,26); 2068 Integer bond4 = conect_helper(line,26,31); 2069 Integer hyd1 = conect_helper(line,31,36); 2070 Integer hyd2 = conect_helper(line,36,41); 2071 Integer salt1 = conect_helper(line,41,46); 2072 Integer hyd3 = conect_helper(line,46,51); 2073 Integer hyd4 = conect_helper(line,51,56); 2074 Integer salt2 = conect_helper(line,56,61); 2075 2076 //System.out.println(atomserial+ " "+ bond1 +" "+bond2+ " " +bond3+" "+bond4+" "+ 2077 // hyd1+" "+hyd2 +" "+salt1+" "+hyd3+" "+hyd4+" "+salt2); 2078 HashMap<String, Integer> cons = new HashMap<String, Integer>(); 2079 cons.put("atomserial",new Integer(atomserial)); 2080 2081 if ( bond1 != null) cons.put("bond1",bond1); 2082 if ( bond2 != null) cons.put("bond2",bond2); 2083 if ( bond3 != null) cons.put("bond3",bond3); 2084 if ( bond4 != null) cons.put("bond4",bond4); 2085 if ( hyd1 != null) cons.put("hydrogen1",hyd1); 2086 if ( hyd2 != null) cons.put("hydrogen2",hyd2); 2087 if ( salt1 != null) cons.put("salt1",salt1); 2088 if ( hyd3 != null) cons.put("hydrogen3",hyd3); 2089 if ( hyd4 != null) cons.put("hydrogen4",hyd4); 2090 if ( salt2 != null) cons.put("salt2",salt2); 2091 2092 connects.add(cons); 2093 } catch (NumberFormatException e){ 2094 logger.info("could not parse CONECT line correctly ("+e.getMessage()+"), at line : " + line); 2095 return; 2096 } 2097 } 2098 2099 /** 2100 * Handler for MODEL Record Format 2101 * <pre> 2102 * COLUMNS DATA TYPE FIELD DEFINITION 2103 * ---------------------------------------------------------------------- 2104 * 1 - 6 Record name "MODEL " 2105 * 11 - 14 Integer serial Model serial number. 
2106 * </pre> 2107 */ 2108 private void pdb_MODEL_Handler(String line) { 2109 2110 if (params.isHeaderOnly()) return; 2111 2112 // new model: we start a new molecule 2113 startOfMolecule = true; 2114 startOfModel = true; 2115 2116 } 2117 2118 /** 2119 * Handler for TER record. The record is used in deposited PDB files and many others, 2120 * but it's often forgotten by some softwares. In any case it helps identifying the 2121 * start of ligand molecules so we use it for that. 2122 */ 2123 private void pdb_TER_Handler() { 2124 startOfMolecule = true; 2125 } 2126 2127 2128 /** 2129 * DBREF handler 2130 * <pre> 2131 * COLUMNS DATA TYPE FIELD DEFINITION 2132 * ---------------------------------------------------------------- 2133 * 1 - 6 Record name "DBREF " 2134 * 8 - 11 IDcode idCode ID code of this entry. 2135 * 13 Character chainID Chain identifier. 2136 * 15 - 18 Integer seqBegin Initial sequence number 2137 * of the PDB sequence segment. 2138 * 19 AChar insertBegin Initial insertion code 2139 * of the PDB sequence segment. 2140 * 21 - 24 Integer seqEnd Ending sequence number 2141 * of the PDB sequence segment. 2142 * 25 AChar insertEnd Ending insertion code 2143 * of the PDB sequence segment. 2144 * 27 - 32 LString database Sequence database name. 2145 * 34 - 41 LString dbAccession Sequence database accession code. 2146 * 43 - 54 LString dbIdCode Sequence database 2147 * identification code. 2148 * 56 - 60 Integer dbseqBegin Initial sequence number of the 2149 * database seqment. 2150 * 61 AChar idbnsBeg Insertion code of initial residue 2151 * of the segment, if PDB is the 2152 * reference. 2153 * 63 - 67 Integer dbseqEnd Ending sequence number of the 2154 * database segment. 2155 * 68 AChar dbinsEnd Insertion code of the ending 2156 * residue of the segment, if PDB is 2157 * the reference. 
2158 * </pre> 2159 */ 2160 private void pdb_DBREF_Handler(String line){ 2161 2162 logger.debug("Parsing DBREF " + line); 2163 2164 DBRef dbref = new DBRef(); 2165 String idCode = line.substring(7,11); 2166 String chainName = line.substring(12,13); 2167 String seqBegin = line.substring(14,18); 2168 String insertBegin = line.substring(18,19); 2169 String seqEnd = line.substring(20,24); 2170 String insertEnd = line.substring(24,25); 2171 String database = line.substring(26,32); 2172 String dbAccession = line.substring(33,41); 2173 String dbIdCode = line.substring(42,54); 2174 String dbseqBegin = line.substring(55,60); 2175 String idbnsBeg = line.substring(60,61); 2176 String dbseqEnd = line.substring(62,67); 2177 // Support implicit space character at end 2178 String dbinsEnd; 2179 if(line.length() >= 68) 2180 dbinsEnd = line.substring(67,68); 2181 else 2182 dbinsEnd = " "; 2183 2184 dbref.setIdCode(idCode); 2185 dbref.setChainName(chainName); 2186 dbref.setSeqBegin(intFromString(seqBegin)); 2187 dbref.setInsertBegin(insertBegin.charAt(0)); 2188 dbref.setSeqEnd(intFromString(seqEnd)); 2189 dbref.setInsertEnd(insertEnd.charAt(0)); 2190 dbref.setDatabase(database.trim()); 2191 dbref.setDbAccession(dbAccession.trim()); 2192 dbref.setDbIdCode(dbIdCode.trim()); 2193 dbref.setDbSeqBegin(intFromString(dbseqBegin)); 2194 dbref.setIdbnsBegin(idbnsBeg.charAt(0)); 2195 dbref.setDbSeqEnd(intFromString(dbseqEnd)); 2196 dbref.setIdbnsEnd(dbinsEnd.charAt(0)); 2197 2198 //System.out.println(dbref.toPDB()); 2199 dbrefs.add(dbref); 2200 } 2201 2202 2203 /** 2204 * Process the disulfide bond info provided by an SSBOND record 2205 * 2206 * <pre> 2207 COLUMNS DATA TYPE FIELD DEFINITION 2208 ------------------------------------------------------------------- 2209 1 - 6 Record name "SSBOND" 2210 8 - 10 Integer serNum Serial number. 2211 12 - 14 LString(3) "CYS" Residue name. 2212 16 Character chainID1 Chain identifier. 2213 18 - 21 Integer seqNum1 Residue sequence number. 
2214 22 AChar icode1 Insertion code. 2215 26 - 28 LString(3) "CYS" Residue name. 2216 30 Character chainID2 Chain identifier. 2217 32 - 35 Integer seqNum2 Residue sequence number. 2218 36 AChar icode2 Insertion code. 2219 60 - 65 SymOP sym1 Symmetry oper for 1st resid 2220 67 - 72 SymOP sym2 Symmetry oper for 2nd resid 2221 * </pre> 2222 */ 2223 private void pdb_SSBOND_Handler(String line){ 2224 2225 if (params.isHeaderOnly()) return; 2226 2227 if (line.length()<36) { 2228 logger.info("SSBOND line has length under 36. Ignoring it."); 2229 return; 2230 } 2231 2232 String chain1 = line.substring(15,16); 2233 String seqNum1 = line.substring(17,21).trim(); 2234 String icode1 = line.substring(21,22); 2235 String chain2 = line.substring(29,30); 2236 String seqNum2 = line.substring(31,35).trim(); 2237 String icode2 = line.substring(35,36); 2238 2239 if (line.length()>=72) { 2240 String symop1 = line.substring(59, 65).trim(); 2241 String symop2 = line.substring(66, 72).trim(); 2242 2243 // until we implement proper treatment of symmetry in biojava #220, we can't deal with sym-related parteners properly, skipping them 2244 if (!symop1.equals("") && !symop2.equals("") && // in case the field is missing 2245 (!symop1.equals("1555") || !symop2.equals("1555")) ) { 2246 logger.info("Skipping ss bond between groups {} and {} belonging to different symmetry partners, because it is not supported yet", seqNum1+icode1, seqNum2+icode2); 2247 return; 2248 } 2249 } 2250 2251 if (icode1.equals(" ")) 2252 icode1 = ""; 2253 if (icode2.equals(" ")) 2254 icode2 = ""; 2255 2256 SSBondImpl ssbond = new SSBondImpl(); 2257 2258 ssbond.setChainID1(chain1); 2259 ssbond.setResnum1(seqNum1); 2260 ssbond.setChainID2(chain2); 2261 ssbond.setResnum2(seqNum2); 2262 ssbond.setInsCode1(icode1); 2263 ssbond.setInsCode2(icode2); 2264 ssbonds.add(ssbond); 2265 } 2266 2267 2268 /** 2269 * Takes care of LINK records. 
These take the format of: 2270 * 2271 * <pre> 2272 * COLUMNS DATA TYPE FIELD DEFINITION 2273 * -------------------------------------------------------------------------------- 2274 * 1 - 6 Record name "LINK " 2275 * 13 - 16 Atom name1 Atom name. 2276 * 17 Character altLoc1 Alternate location indicator. 2277 * 18 - 20 Residue name resName1 Residue name. 2278 * 22 Character chainID1 Chain identifier. 2279 * 23 - 26 Integer resSeq1 Residue sequence number. 2280 * 27 AChar iCode1 Insertion code. 2281 * 43 - 46 Atom name2 Atom name. 2282 * 47 Character altLoc2 Alternate location indicator. 2283 * 48 - 50 Residue name resName2 Residue name. 2284 * 52 Character chainID2 Chain identifier. 2285 * 53 - 56 Integer resSeq2 Residue sequence number. 2286 * 57 AChar iCode2 Insertion code. 2287 * 60 - 65 SymOP sym1 Symmetry operator for 1st atom. 2288 * 67 - 72 SymOP sym2 Symmetry operator for 2nd atom. 2289 * </pre> 2290 * 2291 * (From http://www.wwpdb.org/documentation/format32/sect6.html#LINK) 2292 * 2293 * @param line the LINK record line to parse. 2294 */ 2295 private void pdb_LINK_Handler(String line) { 2296 2297 if (params.isHeaderOnly()) return; 2298 2299 // Check for the minimal set of fields. 2300 if (line.length()<56) { 2301 logger.info("LINK line has length under 56. 
Ignoring it."); 2302 return; 2303 } 2304 2305 int len = line.length(); 2306 2307 String name1 = line.substring(12, 16).trim(); 2308 String altLoc1 = line.substring(16, 17).trim(); 2309 String resName1 = line.substring(17, 20).trim(); 2310 String chainID1 = line.substring(21, 22).trim(); 2311 String resSeq1 = line.substring(22, 26).trim(); 2312 String iCode1 = line.substring(26, 27).trim(); 2313 2314 String name2 = line.substring(42, 46).trim(); 2315 String altLoc2 = line.substring(46, 47).trim(); 2316 String resName2 = line.substring(47, 50).trim(); 2317 String chainID2 = line.substring(51, 52).trim(); 2318 String resSeq2 = line.substring(52, 56).trim(); 2319 String iCode2 = null; // Might get trimmed if blank. 2320 if (len > 56) iCode2 = line.substring(56, 57).trim(); 2321 2322 String sym1 = null; 2323 if (len > 64) sym1 = line.substring(59, 65).trim(); 2324 String sym2 = null; 2325 if (len > 71) sym2 = line.substring(66, 72).trim(); 2326 2327 linkRecords.add(new LinkRecord( 2328 name1, altLoc1, resName1, chainID1, resSeq1, iCode1, 2329 name2, altLoc2, resName2, chainID2, resSeq2, iCode2, 2330 sym1, sym2)); 2331 } 2332 2333 /** 2334 * Handler for the SITE records. <br> 2335 * 2336 * <pre> 2337 * 2338 * COLUMNS DATA TYPE FIELD DEFINITION 2339 * --------------------------------------------------------------------------------- 2340 * 1 - 6 Record name "SITE " 2341 * 8 - 10 Integer seqNum Sequence number. 2342 * 12 - 14 LString(3) siteID Site name. 2343 * 16 - 17 Integer numRes Number of residues that compose the siteResidues. 2344 * 19 - 21 Residue name resName1 Residue name for first residue that 2345 * creates the siteResidues. 2346 * 23 Character chainID1 Chain identifier for first residue of siteResidues. 2347 * 24 - 27 Integer seq1 Residue sequence number for first residue 2348 * of the siteResidues. 2349 * 28 AChar iCode1 Insertion code for first residue of the siteResidues. 
2350 * 2351 * example: 2352 * 1 2 3 4 5 6 7 8 2353 * 12345678901234567890123456789012345678901234567890123456789012345678901234567890 2354 * SITE 1 AC1 3 HIS A 94 HIS A 96 HIS A 119 2355 * SITE 1 AC2 5 ASN A 62 GLY A 63 HIS A 64 HOH A 328 2356 * SITE 2 AC2 5 HOH A 634 2357 * SITE 1 AC3 5 GLN A 136 GLN A 137 PRO A 138 GLU A 205 2358 * SITE 2 AC3 5 CYS A 206 2359 * SITE 1 AC4 11 HIS A 64 HIS A 94 HIS A 96 HIS A 119 2360 * SITE 2 AC4 11 LEU A 198 THR A 199 THR A 200 TRP A 209 2361 * SITE 3 AC4 11 HOH A 572 HOH A 582 HOH A 635 2362 * </pre> 2363 * @param line the SITE line record being currently read 2364 * @author Amr ALHOSSARY 2365 * @author Jules Jacobsen 2366 */ 2367 private void pdb_SITE_Handler(String line){ 2368 2369 if (params.isHeaderOnly()) return; 2370 2371 // make a map of: SiteId to List<ResidueNumber> 2372 2373 logger.debug("Site Line:"+line); 2374 2375 2376 String siteID = line.substring(11, 14); 2377 //fetch the siteResidues from the map 2378 List<ResidueNumber> siteResidues = siteToResidueMap.get(siteID); 2379 2380 //if the siteResidues doesn't yet exist, make a new one. 2381 if (siteResidues == null || ! 
siteToResidueMap.containsKey(siteID.trim())){ 2382 siteResidues = new ArrayList<ResidueNumber>(); 2383 siteToResidueMap.put(siteID.trim(), siteResidues); 2384 2385 logger.debug(String.format("New Site made: %s %s", siteID, siteResidues)); 2386 logger.debug("Now made " + siteMap.size() + " sites"); 2387 2388 } 2389 2390 logger.debug(String.format("SiteId: %s", siteID)); 2391 2392 2393 //line = 'SITE 1 AC1 6 ARG H 221A LYS H 224 HOH H 403 HOH H 460' 2394 //line.substring(18) = 'ARG H 221A LYS H 224 HOH H 403 HOH H 460' 2395 line = line.substring(18); 2396 String groupString = null; 2397 //groupString = 'ARG H 221A' 2398 //keep iterating through chunks of 10 characters - these are the groups in the siteResidues 2399 while (!(groupString = line.substring(0, 10)).equals(" ")) { 2400 //groupstring: 'ARG H 221A' 2401 2402 logger.debug("groupString: '" + groupString + "'"); 2403 2404 //set the residue name 2405 //residueName = 'ARG' 2406 String residueName = groupString.substring(0, 3); 2407 Character aminoCode1 = StructureTools.get1LetterCode(residueName); 2408 if (aminoCode1 != null) { 2409 if (aminoCode1.equals(StructureTools.UNKNOWN_GROUP_LABEL)) { 2410 aminoCode1 = null; 2411 } 2412 } 2413 2414 //this is already in the right format, so no need to fiddle with it... 
2415 //pdbCode = 'H 221A' 2416 // String pdbCode = groupString.substring(4, 10).trim(); 2417 String chainId = groupString.substring(4, 5); 2418 Integer resNum = Integer.valueOf(groupString.substring(5, 9).trim()); 2419 Character insCode = groupString.substring(9, 10).charAt(0); 2420 //set insCode to null as a measure to prevent storing thousands of empty Strings 2421 //- the empty value is returned using Group.getInsCode() 2422 // if (insCode.equals(" ")) { 2423 // insCode = null; 2424 // } 2425 2426 logger.debug(String.format("Site: %s: 'resName:%s resNum:%s insCode:%s'", siteID, residueName, resNum, insCode)); 2427 2428 //make a new resNum with the data - this will be linked up with a site later 2429 ResidueNumber residueNumber = new ResidueNumber(); 2430 2431 2432 logger.debug("pdbCode: '" + resNum + insCode + "'"); 2433 2434 residueNumber.setChainName(chainId); 2435 residueNumber.setSeqNum(resNum); 2436 residueNumber.setInsCode(insCode); 2437 //add the resNum to the groups 2438 siteResidues.add(residueNumber); 2439 2440 logger.debug("Adding residueNumber " + residueNumber + " to site " + siteID); 2441 2442 line = line.substring(11); 2443 } 2444 2445 logger.debug("Current SiteMap (contains "+ siteToResidueMap.keySet().size() + " sites):"); 2446 for (String key : siteToResidueMap.keySet()) { 2447 logger.debug(key + " : " + siteToResidueMap.get(key)); 2448 } 2449 2450 } 2451 2452 //Site variable related to parsing the REMARK 800 records. 
2453 Site site; 2454 2455 private String[] keywords; 2456 private void pdb_REMARK_800_Handler(String line){ 2457 2458 if (params.isHeaderOnly()) return; 2459 2460 // 'REMARK 800 SITE_IDENTIFIER: CAT ' 2461 line = line.substring(11); 2462 String[] fields = line.split(": "); 2463 2464 if (fields.length == 2) { 2465 if (fields[0].equals("SITE_IDENTIFIER")) { 2466 // remark800Counter++; 2467 String siteID = fields[1].trim(); 2468 2469 logger.debug("siteID: '" + siteID +"'"); 2470 2471 //fetch the siteResidues from the map 2472 site = siteMap.get(siteID); 2473 2474 //if the siteResidues doesn't yet exist, make a new one. 2475 if (site == null || !siteID.equals(site.getSiteID())) { 2476 site = new Site(siteID, new ArrayList<Group>()); 2477 siteMap.put(site.getSiteID(), site); 2478 2479 logger.debug("New Site made: " + site); 2480 logger.debug("Now made " + siteMap.size() + " sites"); 2481 2482 } 2483 } 2484 if (fields[0].equals("EVIDENCE_CODE")) { 2485 // remark800Counter++; 2486 String evCode = fields[1].trim(); 2487 2488 logger.debug("evCode: '" + evCode +"'"); 2489 2490 //fetch the siteResidues from the map 2491 site.setEvCode(evCode); 2492 } 2493 if (fields[0].equals("SITE_DESCRIPTION")) { 2494 // remark800Counter++; 2495 String desc = fields[1].trim(); 2496 2497 logger.debug("desc: '" + desc +"'"); 2498 2499 //fetch the siteResidues from the map 2500 site.setDescription(desc); 2501 2502 logger.debug("Finished making REMARK 800 for site " + site.getSiteID()); 2503 logger.debug(site.remark800toPDB()); 2504 2505 } 2506 } 2507 } 2508 2509 private int intFromString(String intString){ 2510 int val = Integer.MIN_VALUE; 2511 try { 2512 val = Integer.parseInt(intString.trim()); 2513 } catch (NumberFormatException ex){ 2514 logger.info("Could not parse a number: " + ex.getMessage()); 2515 } 2516 return val; 2517 } 2518 2519 2520 2521 /** 2522 * Finds in the given list of chains the first one that has as name the given chainID. 
2523 * If no such Chain can be found it returns null. 2524 */ 2525 private static Chain isKnownChain(String chainID, List<Chain> chains){ 2526 2527 for (int i = 0; i< chains.size();i++){ 2528 Chain testchain = chains.get(i); 2529 if (chainID.equals(testchain.getName())) { 2530 return testchain; 2531 } 2532 } 2533 2534 return null; 2535 } 2536 2537 2538 2539 private BufferedReader getBufferedReader(InputStream inStream) 2540 throws IOException { 2541 2542 BufferedReader buf ; 2543 if (inStream == null) { 2544 throw new IOException ("input stream is null!"); 2545 } 2546 2547 buf = new BufferedReader (new InputStreamReader (inStream)); 2548 return buf ; 2549 2550 } 2551 2552 2553 2554 /** 2555 * Parse a PDB file and return a datastructure implementing 2556 * PDBStructure interface. 2557 * 2558 * @param inStream an InputStream object 2559 * @return a Structure object 2560 * @throws IOException 2561 */ 2562 public Structure parsePDBFile(InputStream inStream) 2563 throws IOException 2564 { 2565 2566 BufferedReader buf = getBufferedReader(inStream); 2567 2568 return parsePDBFile(buf); 2569 2570 } 2571 2572 /** 2573 * Parse a PDB file and return a datastructure implementing 2574 * PDBStructure interface. 2575 * 2576 * @param buf a BufferedReader object 2577 * @return the Structure object 2578 * @throws IOException ... 2579 */ 2580 public Structure parsePDBFile(BufferedReader buf) 2581 throws IOException 2582 { 2583 // set the correct max values for parsing... 
2584 loadMaxAtoms = params.getMaxAtoms(); 2585 atomCAThreshold = params.getAtomCaThreshold(); 2586 2587 2588 // (re)set structure 2589 2590 allModels = new ArrayList<>(); 2591 structure = new StructureImpl() ; 2592 currentModel = null; 2593 currentChain = null; 2594 currentGroup = null; 2595 // we initialise to true since at the beginning of the file we are always starting a new molecule 2596 startOfMolecule = true; 2597 startOfModel = true; 2598 2599 seqResChains = new ArrayList<Chain>(); 2600 siteMap = new LinkedHashMap<String, Site>(); 2601 pdbHeader = new PDBHeader(); 2602 connects = new ArrayList<Map<String,Integer>>(); 2603 previousContinuationField = ""; 2604 continuationField = ""; 2605 continuationString = ""; 2606 current_compound = null; 2607 sourceLines.clear(); 2608 compndLines.clear(); 2609 keywordsLines.clear(); 2610 isLastCompndLine = false; 2611 isLastSourceLine = false; 2612 prevMolId = -1; 2613 entities.clear(); 2614 helixList.clear(); 2615 strandList.clear(); 2616 turnList.clear(); 2617 lengthCheck = -1; 2618 atomCount = 0; 2619 atomOverflow = false; 2620 linkRecords = new ArrayList<LinkRecord>(); 2621 siteToResidueMap.clear(); 2622 2623 blankChainIdsPresent = false; 2624 2625 parseCAonly = params.isParseCAOnly(); 2626 2627 String line = null; 2628 2629 while ((line = buf.readLine()) != null) { 2630 2631 // ignore empty lines 2632 if ( line.equals("") || 2633 (line.equals(NEWLINE))){ 2634 continue; 2635 } 2636 2637 2638 // ignore short TER and END lines 2639 if ( line.startsWith("END")) { 2640 continue; 2641 } 2642 2643 if ( line.length() < 6 && !line.startsWith("TER")) { 2644 logger.info("Found line length below 6. 
Ignoring it, line: >" + line +"<" ); 2645 continue; 2646 } 2647 2648 String recordName = null; 2649 if (line.length()<6) 2650 recordName = line.trim(); 2651 else 2652 recordName = line.substring (0, 6).trim (); 2653 2654 try { 2655 if (recordName.equals("ATOM")) 2656 pdb_ATOM_Handler(line); 2657 else if (recordName.equals("SEQRES")) 2658 pdb_SEQRES_Handler(line); 2659 else if (recordName.equals("HETATM")) 2660 pdb_ATOM_Handler(line); 2661 else if (recordName.equals("MODEL")) 2662 pdb_MODEL_Handler(line); 2663 else if (recordName.equals("TER")) 2664 pdb_TER_Handler(); 2665 else if (recordName.equals("HEADER")) 2666 pdb_HEADER_Handler(line); 2667 else if (recordName.equals("AUTHOR")) 2668 pdb_AUTHOR_Handler(line); 2669 else if (recordName.equals("TITLE")) 2670 pdb_TITLE_Handler(line); 2671 else if (recordName.equals("SOURCE")) 2672 sourceLines.add(line); //pdb_SOURCE_Handler 2673 else if (recordName.equals("COMPND")) 2674 compndLines.add(line); //pdb_COMPND_Handler 2675 else if (recordName.equals("KEYWDS")) 2676 keywordsLines.add(line); 2677 else if (recordName.equals("JRNL")) 2678 pdb_JRNL_Handler(line); 2679 else if (recordName.equals("EXPDTA")) 2680 pdb_EXPDTA_Handler(line); 2681 else if (recordName.equals("CRYST1")) 2682 pdb_CRYST1_Handler(line); 2683 else if (recordName.startsWith("MTRIX")) 2684 pdb_MTRIXn_Handler(line); 2685 else if (recordName.equals("REMARK")) 2686 pdb_REMARK_Handler(line); 2687 else if (recordName.equals("CONECT")) 2688 pdb_CONECT_Handler(line); 2689 else if (recordName.equals("REVDAT")) 2690 pdb_REVDAT_Handler(line); 2691 else if (recordName.equals("DBREF")) 2692 pdb_DBREF_Handler(line); 2693 else if (recordName.equals("SITE")) 2694 pdb_SITE_Handler(line); 2695 else if (recordName.equals("SSBOND")) 2696 pdb_SSBOND_Handler(line); 2697 else if (recordName.equals("LINK")) 2698 pdb_LINK_Handler(line); 2699 else if ( params.isParseSecStruc()) { 2700 if ( recordName.equals("HELIX") ) pdb_HELIX_Handler ( line ) ; 2701 else if 
(recordName.equals("SHEET")) pdb_SHEET_Handler(line ) ; 2702 else if (recordName.equals("TURN")) pdb_TURN_Handler( line ) ; 2703 } 2704 } catch (StringIndexOutOfBoundsException | NullPointerException ex) { 2705 logger.info("Unable to parse [" + line + "]"); 2706 } 2707 } 2708 2709 makeCompounds(compndLines, sourceLines); 2710 2711 handlePDBKeywords(keywordsLines); 2712 2713 triggerEndFileChecks(); 2714 2715 if (params.shouldCreateAtomBonds()) { 2716 formBonds(); 2717 } 2718 2719 if ( params.shouldCreateAtomCharges()) { 2720 addCharges(); 2721 } 2722 2723 if ( params.isParseSecStruc() && !params.isHeaderOnly()) 2724 setSecStruc(); 2725 2726 // Now correct the alternate location group 2727 StructureTools.cleanUpAltLocs(structure); 2728 2729 return structure; 2730 2731 } 2732 2733 2734 /** 2735 * Add the charges to the Structure 2736 */ 2737 private void addCharges() { 2738 ChargeAdder.addCharges(structure); 2739 } 2740 2741 /** 2742 * This is the new method for building the COMPND and SOURCE records. Now each method is self-contained. 
2743 * @author Jules Jacobsen 2744 * @param compoundList 2745 * @param sourceList 2746 */ 2747 private void makeCompounds(List<String> compoundList, 2748 List<String> sourceList) { 2749 // System.out.println("[makeCompounds] making compounds from compoundLines"); 2750 2751 for (String line : compoundList) { 2752 if (compoundList.indexOf(line) + 1 == compoundList.size()) { 2753 // System.out.println("[makeCompounds] Final line in compoundLines."); 2754 isLastCompndLine = true; 2755 } 2756 pdb_COMPND_Handler(line); 2757 2758 } 2759 // System.out.println("[makeCompounds] adding sources to compounds from sourceLines"); 2760 // since we're starting again from the first compound, reset it here 2761 if ( entities.size() == 0){ 2762 current_compound = new EntityInfo(); 2763 } else { 2764 current_compound = entities.get(0); 2765 } 2766 for (String line : sourceList) { 2767 if (sourceList.indexOf(line) + 1 == sourceList.size()) { 2768 // System.out.println("[makeCompounds] Final line in sourceLines."); 2769 isLastSourceLine = true; 2770 } 2771 pdb_SOURCE_Handler(line); 2772 } 2773 2774 } 2775 2776 /**Parse KEYWODS record of the PDB file.<br> 2777 * A keyword may be split over two lines. whether a keyword ends by the end 2778 * of a line or it is aplit over two lines, a <code>space</code> is added 2779 * between the 2 lines's contents, unless the first line ends in 2780 * a '-' character. 2781 * <pre> 2782 * Record Format 2783 * COLUMNS DATA TYPE FIELD DEFINITION 2784 * --------------------------------------------------------------------------------- 2785 * 1 - 6 Record name "KEYWDS" 2786 * 9 - 10 Continuation continuation Allows concatenation of records if necessary. 2787 * 11 - 79 List keywds Comma-separated list of keywords relevant 2788 * to the entry. 
2789 * Example 2790 * 1 2 3 4 5 6 7 8 2791 * 12345678901234567890123456789012345678901234567890123456789012345678901234567890 2792 * KEYWDS LYASE, TRICARBOXYLIC ACID CYCLE, MITOCHONDRION, OXIDATIVE 2793 * KEYWDS 2 METABOLISM 2794 * </pre> 2795 * @param lines The KEWODS record lines. 2796 * @author Amr ALHOSSARY 2797 */ 2798 private void handlePDBKeywords(List<String> lines) { 2799 StringBuilder fullList = new StringBuilder(); 2800 for (String line : lines) { 2801 String kwList = line.substring(10).trim(); 2802 if(kwList.length() > 0) { 2803 if(fullList.length() > 0 && fullList.indexOf("-", fullList.length()-1) < 0) { 2804 fullList.append(' '); 2805 } 2806 fullList.append(kwList); 2807 } 2808 } 2809 String fulllengthList = fullList.toString(); 2810 keywords = fulllengthList.split("( )*,( )*"); 2811 ArrayList<String> lst = new ArrayList<String>(keywords.length); 2812 for (String keyword : keywords) { 2813 if(keyword.length() == 0) { 2814 logger.debug("Keyword empty in structure {}", structure.getIdentifier().toString()); 2815 continue; 2816 } 2817 lst.add(keyword); 2818 } 2819 pdbHeader.setKeywords(lst); 2820 } 2821 2822 /** 2823 * Handles creation of all bonds. Looks at LINK records, SSBOND (Disulfide 2824 * bonds), peptide bonds, and intra-residue bonds. 2825 * <p> 2826 * Note: the current implementation only looks at the first model of each 2827 * structure. This may need to be fixed in the future. 2828 */ 2829 private void formBonds() { 2830 2831 BondMaker maker = new BondMaker(structure, params); 2832 2833 // LINK records should be preserved, they are the way that 2834 // inter-residue bonds are created for ligands such as trisaccharides, unusual polymers. 2835 // The analogy in mmCIF is the _struct_conn record. 
2836 for (LinkRecord linkRecord : linkRecords) { 2837 maker.formLinkRecordBond(linkRecord); 2838 } 2839 2840 maker.formDisulfideBonds(ssbonds); 2841 2842 maker.makeBonds(); 2843 } 2844 2845 2846 2847 private void triggerEndFileChecks(){ 2848 2849 // we need to add the last chain and model, checking for nulls (e.g. the file could be completely empty of ATOM lines) 2850 if (currentChain!=null && currentGroup!=null) { 2851 currentChain.addGroup(currentGroup); 2852 } 2853 if (currentModel!=null && currentChain!=null) { 2854 currentModel.add(currentChain); 2855 } 2856 if (currentModel!=null) { 2857 allModels.add(currentModel); 2858 } 2859 2860 if (blankChainIdsPresent) { 2861 // from biojava 5.0 there's limited support for old pdb files with blank chain ids 2862 logger.warn("Found some blank chain ids in PDB file. Please note that support for them has been discontinued and things might not work properly."); 2863 } 2864 2865 // reordering chains following the mmcif model and assigning entities 2866 assignChainsAndEntities(); 2867 structure.setEntityInfos(entities); 2868 2869 2870 2871 // header data 2872 2873 Date modDate = pdbHeader.getModDate(); 2874 if ( modDate.equals(new Date(0)) ) { 2875 // modification date = deposition date 2876 Date depositionDate = pdbHeader.getDepDate(); 2877 2878 if (! depositionDate.equals(modDate)){ 2879 // depDate is 0000-00-00 2880 pdbHeader.setDepDate(depositionDate); 2881 } 2882 2883 } 2884 2885 structure.setPDBHeader(pdbHeader); 2886 structure.setCrystallographicInfo(crystallographicInfo); 2887 2888 //set the JournalArticle, if there is one 2889 if (!journalLines.isEmpty()) { 2890 buildjournalArticle(); 2891 pdbHeader.setJournalArticle(journalArticle); 2892 } 2893 2894 structure.setDBRefs(dbrefs); 2895 2896 // Only align if requested (default) and not when headerOnly mode with no Atoms. 2897 // Otherwise, we store the empty SeqRes Groups unchanged in the right chains. 
2898 if ( params.isAlignSeqRes() && !params.isHeaderOnly() && !seqResChains.isEmpty()){ 2899 logger.debug("Parsing mode align_seqres, will parse SEQRES and align to ATOM sequence"); 2900 SeqRes2AtomAligner aligner = new SeqRes2AtomAligner(); 2901 aligner.align(structure,seqResChains); 2902 2903 } else { 2904 logger.debug("Parsing mode unalign_seqres, will parse SEQRES but not align it to ATOM sequence"); 2905 SeqRes2AtomAligner.storeUnAlignedSeqRes(structure, seqResChains, params.isHeaderOnly()); 2906 } 2907 2908 2909 2910 //associate the temporary Groups in the siteMap to the ones 2911 if (!params.isHeaderOnly()) { 2912 // Only can link SITES if Atom Groups were parsed. 2913 linkSitesToGroups(); // will work now that setSites is called 2914 } 2915 2916 if ( bioAssemblyParser != null){ 2917 bioAssemblyParser.setMacromolecularSizes(); 2918 pdbHeader.setBioAssemblies(bioAssemblyParser.getTransformationMap()); 2919 } 2920 2921 if (ncsOperators !=null && ncsOperators.size()>0) { 2922 crystallographicInfo.setNcsOperators( 2923 ncsOperators.toArray(new Matrix4d[ncsOperators.size()])); 2924 } 2925 2926 2927 // rfree end file check 2928 // Rfree annotation is not very consistent in PDB format, it varies depending on the software 2929 // Here we follow this strategy: 2930 // a) take the '(NO CUTOFF)' value if the only one available (shelx software, e.g. 1x7q) 2931 // b) don't take it if also a line without '(NO CUTOFF)' is present (CNX software, e.g. 
3lak) 2932 2933 if (rfreeNoCutoffLine>0 && rfreeStandardLine<0) { 2934 pdbHeader.setRfree(rfreeNoCutoffLine); 2935 } else if (rfreeNoCutoffLine>0 && rfreeStandardLine>0) { 2936 pdbHeader.setRfree(rfreeStandardLine); 2937 } else if (rfreeNoCutoffLine<0 && rfreeStandardLine>0) { 2938 pdbHeader.setRfree(rfreeStandardLine); 2939 } // otherwise it remains default value: PDBHeader.DEFAULT_RFREE 2940 2941 2942 2943 } 2944 2945 private void setSecStruc(){ 2946 2947 setSecElement(helixList, SecStrucInfo.PDB_AUTHOR_ASSIGNMENT, 2948 SecStrucType.helix4); 2949 setSecElement(strandList, SecStrucInfo.PDB_AUTHOR_ASSIGNMENT, 2950 SecStrucType.extended); 2951 setSecElement(turnList, SecStrucInfo.PDB_AUTHOR_ASSIGNMENT, 2952 SecStrucType.turn); 2953 2954 //Now insert random coil to the Groups that did not have SS information 2955 GroupIterator gi = new GroupIterator(structure); 2956 while (gi.hasNext()){ 2957 Group g = gi.next(); 2958 if (g.hasAminoAtoms()){ 2959 if (g.getProperty(Group.SEC_STRUC) == null){ 2960 SecStrucInfo ss = new SecStrucInfo(g, 2961 SecStrucInfo.PDB_AUTHOR_ASSIGNMENT, 2962 SecStrucType.coil); 2963 g.setProperty(Group.SEC_STRUC, ss); 2964 } 2965 } 2966 } 2967 2968 } 2969 2970 private void setSecElement(List<Map<String,String>> secList, String assignment, SecStrucType type){ 2971 2972 2973 Iterator<Map<String,String>> iter = secList.iterator(); 2974 nextElement: 2975 while (iter.hasNext()){ 2976 Map<String,String> m = iter.next(); 2977 2978 // assign all residues in this range to this secondary structure type 2979 // String initResName = (String)m.get("initResName"); 2980 String initChainId = m.get("initChainId"); 2981 String initSeqNum = m.get("initSeqNum" ); 2982 String initICode = m.get("initICode" ); 2983 // String endResName = (String)m.get("endResName" ); 2984 String endChainId = m.get("endChainId" ); 2985 String endSeqNum = m.get("endSeqNum"); 2986 String endICode = m.get("endICode"); 2987 2988 if (initICode.equals(" ")) 2989 initICode = ""; 2990 if 
(endICode.equals(" ")) 2991 endICode = ""; 2992 2993 GroupIterator gi = new GroupIterator(structure); 2994 boolean inRange = false; 2995 while (gi.hasNext()){ 2996 Group g = gi.next(); 2997 Chain c = g.getChain(); 2998 2999 if (c.getName().equals(initChainId)){ 3000 3001 String pdbCode = initSeqNum + initICode; 3002 if ( g.getResidueNumber().toString().equals(pdbCode) ) { 3003 inRange = true; 3004 } 3005 } 3006 if ( inRange){ 3007 if (g.hasAminoAtoms()) { 3008 SecStrucInfo ss = new SecStrucInfo(g, assignment, type); 3009 g.setProperty(Group.SEC_STRUC, ss); 3010 } 3011 3012 } 3013 if ( c.getName().equals(endChainId)){ 3014 String pdbCode = endSeqNum + endICode; 3015 if (pdbCode.equals(g.getResidueNumber().toString())){ 3016 inRange = false; 3017 continue nextElement; 3018 } 3019 } 3020 } 3021 } 3022 } 3023 3024 /** 3025 * Gets all chains with given chainName from given models list 3026 * @param chainName 3027 * @param polyModels 3028 * @return 3029 */ 3030 private static List<List<Chain>> findChains(String chainName, List<List<Chain>> polyModels) { 3031 List<List<Chain>> models = new ArrayList<>(); 3032 3033 for (List<Chain> chains:polyModels) { 3034 List<Chain> matchingChains = new ArrayList<>(); 3035 models.add(matchingChains); 3036 for (Chain c:chains) { 3037 if (c.getName().equals(chainName)) { 3038 matchingChains.add(c); 3039 } 3040 } 3041 } 3042 return models; 3043 } 3044 3045 /** 3046 * Split the given chain (containing non-polymer groups and water groups only) 3047 * into individual chains per non-polymer group and individual chains per contiguous sets of water groups. 
3048 * @param chain 3049 * @return a list of lists of size 2: first list is the split non-poly chains, second list is the split water chains 3050 */ 3051 private static List<List<Chain>> splitNonPolyChain(Chain chain) { 3052 List<Chain> splitNonPolys = new ArrayList<>(); 3053 List<Chain> waterChains = new ArrayList<>(); 3054 3055 Chain split = null; 3056 boolean previousGroupIsWater = false; 3057 3058 for (Group g:chain.getAtomGroups()){ 3059 3060 if (!previousGroupIsWater) { 3061 // add last one if there's one 3062 if (split!=null) { 3063 splitNonPolys.add(split); 3064 } 3065 split = new ChainImpl(); 3066 split.setName(chain.getName()); 3067 } else if (!g.isWater()) { 3068 // previous group is water and this group is not water: we change from a water chain to a non-poly 3069 // we'll need to add now the water chain to the list of water chains 3070 waterChains.add(split); 3071 split = new ChainImpl(); 3072 split.setName(chain.getName()); 3073 } 3074 3075 if (g.isWater()) { 3076 previousGroupIsWater = true; 3077 } else { 3078 previousGroupIsWater = false; 3079 3080 } 3081 3082 // this should include alt locs (referenced from the main group) 3083 split.addGroup(g); 3084 3085 } 3086 3087 // adding the last split chain: either to water or non-poly depending on what was the last seen group 3088 if (split!=null) { 3089 if (previousGroupIsWater) 3090 waterChains.add(split); 3091 else 3092 splitNonPolys.add(split); 3093 } 3094 3095 3096 List<List<Chain>> all = new ArrayList<>(2); 3097 all.add(splitNonPolys); 3098 all.add(waterChains); 3099 3100 return all; 3101 } 3102 3103 /** 3104 * Assign asym ids following the rules used by the PDB to assign asym ids in mmCIF files 3105 * @param polys 3106 * @param nonPolys 3107 * @param waters 3108 */ 3109 private void assignAsymIds(List<List<Chain>> polys, List<List<Chain>> nonPolys, List<List<Chain>> waters) { 3110 3111 for (int i=0; i<polys.size(); i++) { 3112 String asymId = "A"; 3113 3114 for (Chain poly:polys.get(i)) { 3115 
poly.setId(asymId); 3116 asymId = getNextAsymId(asymId); 3117 } 3118 for (Chain nonPoly:nonPolys.get(i)) { 3119 nonPoly.setId(asymId); 3120 asymId = getNextAsymId(asymId); 3121 } 3122 for (Chain water:waters.get(i)) { 3123 water.setId(asymId); 3124 asymId = getNextAsymId(asymId); 3125 } 3126 } 3127 } 3128 3129 /** 3130 * Gets the next asym id given an asymId, according to the convention followed by 3131 * mmCIF files produced by the PDB 3132 * i.e.: A,B,...,Z,AA,BA,CA,...,ZA,AB,BB,CB,...,ZB,.......,ZZ,AAA,BAA,CAA,... 3133 * @param asymId 3134 * @return 3135 */ 3136 private String getNextAsymId(String asymId) { 3137 if (asymId.length()==1) { 3138 if (!asymId.equals("Z")) { 3139 return Character.toString(getNextChar(asymId.charAt(0))); 3140 } else { 3141 return "AA"; 3142 } 3143 } else if (asymId.length()==2) { 3144 if (asymId.equals("ZZ")) { 3145 return "AAA"; 3146 } 3147 char[] c = new char[2]; 3148 asymId.getChars(0, 2, c, 0); 3149 c[0] = getNextChar(c[0]); 3150 if (c[0]=='A') { 3151 c[1] = getNextChar(c[1]); 3152 } 3153 return new String(c); 3154 } else if (asymId.length()==3) { 3155 char[] c = new char[3]; 3156 asymId.getChars(0, 3, c, 0); 3157 c[0] = getNextChar(c[0]); 3158 if (c[0]=='A') { 3159 c[1] = getNextChar(c[1]); 3160 if (c[1]=='A') { 3161 c[2] = getNextChar(c[2]); 3162 } 3163 } 3164 return new String(c); 3165 } 3166 return null; 3167 } 3168 3169 private char getNextChar(char c) { 3170 if (c!='Z') { 3171 return ((char)(c+1)); 3172 } else { 3173 return 'A'; 3174 } 3175 } 3176 3177 /** 3178 * Here we assign chains following the mmCIF data model: 3179 * one chain per polymer, one chain per non-polymer group and 3180 * several water chains. 
3181 * <p> 3182 * Subsequently we assign entities for them: either from those read from 3183 * COMPOUND records or from those found heuristically through {@link EntityFinder} 3184 * 3185 */ 3186 private void assignChainsAndEntities(){ 3187 3188 List<List<Chain>> polyModels = new ArrayList<>(); 3189 List<List<Chain>> nonPolyModels = new ArrayList<>(); 3190 List<List<Chain>> waterModels = new ArrayList<>(); 3191 3192 for (List<Chain> model:allModels) { 3193 3194 List<Chain> polyChains = new ArrayList<>(); 3195 List<Chain> nonPolyChains = new ArrayList<>(); 3196 List<Chain> waterChains = new ArrayList<>(); 3197 3198 polyModels.add(polyChains); 3199 nonPolyModels.add(nonPolyChains); 3200 waterModels.add(waterChains); 3201 3202 for (Chain c:model) { 3203 3204 // we only have entities for polymeric chains, all others are ignored for assigning entities 3205 if (c.isWaterOnly()) { 3206 waterChains.add(c); 3207 3208 } else if (c.isPureNonPolymer()) { 3209 nonPolyChains.add(c); 3210 3211 } else { 3212 polyChains.add(c); 3213 } 3214 } 3215 } 3216 3217 List<List<Chain>> splitNonPolyModels = new ArrayList<>(); 3218 for (int i=0; i<nonPolyModels.size(); i++) { 3219 List<Chain> nonPolyModel = nonPolyModels.get(i); 3220 List<Chain> waterModel = waterModels.get(i); 3221 3222 List<Chain> splitNonPolys = new ArrayList<>(); 3223 splitNonPolyModels.add(splitNonPolys); 3224 3225 for (Chain nonPoly:nonPolyModel) { 3226 List<List<Chain>> splits = splitNonPolyChain(nonPoly); 3227 splitNonPolys.addAll(splits.get(0)); 3228 waterModel.addAll(splits.get(1)); 3229 } 3230 } 3231 3232 3233 // now we have all chains as in mmcif, let's assign ids following the mmcif rules 3234 assignAsymIds(polyModels, splitNonPolyModels, waterModels); 3235 3236 3237 if (!entities.isEmpty()) { 3238 // if the file contained COMPOUND records then we can assign entities to the poly chains 3239 for (EntityInfo comp : entities){ 3240 List<String> chainIds = compoundMolIds2chainIds.get(comp.getMolId()); 3241 if ( 
chainIds == null) 3242 continue; 3243 for ( String chainId : chainIds) { 3244 3245 List<List<Chain>> models = findChains(chainId, polyModels); 3246 3247 for (List<Chain> matchingChains:models) { 3248 for (Chain chain:matchingChains) { 3249 comp.addChain(chain); 3250 chain.setEntityInfo(comp); 3251 } 3252 3253 if (matchingChains.isEmpty()) { 3254 // usually if this happens something is wrong with the PDB header 3255 // e.g. 2brd - there is no Chain A, although it is specified in the header 3256 // Some bona-fide cases exist, e.g. 2ja5, chain N is described in SEQRES 3257 // but the authors didn't observe in the density so it's completely missing 3258 // from the ATOM lines 3259 logger.warn("Could not find polymeric chain {} to link to entity {}. The chain will be missing in the entity.", chainId, comp.getMolId()); 3260 } 3261 } 3262 } 3263 } 3264 3265 } else { 3266 3267 logger.info("Entity information (COMPOUND record) not found in file. Will assign entities heuristically"); 3268 // if no entity information was present in file we then go and find the entities heuristically with EntityFinder 3269 entities = EntityFinder.findPolyEntities(polyModels); 3270 3271 } 3272 3273 // now we assign entities to the nonpoly and water chains 3274 EntityFinder.createPurelyNonPolyEntities(splitNonPolyModels, waterModels, entities); 3275 3276 3277 // in some rare cases purely non-polymer or purely water chain are present in pdb files 3278 // see https://github.com/biojava/biojava/pull/394 3279 // these case should be covered by the above 3280 3281 3282 // now that we have entities in chains we add the chains to the structure 3283 3284 for (int i=0;i<allModels.size();i++) { 3285 List<Chain> model = new ArrayList<>(); 3286 model.addAll(polyModels.get(i)); 3287 model.addAll(splitNonPolyModels.get(i)); 3288 model.addAll(waterModels.get(i)); 3289 structure.addModel(model); 3290 } 3291 3292 3293 } 3294 3295 /** 3296 * Links the Sites in the siteMap to the Groups in the Structure via the 
3297 * siteToResidueMap ResidueNumber. 3298 * @author Jules Jacobsen 3299 * @return 3300 */ 3301 private void linkSitesToGroups() { 3302 3303 //System.out.println("LINK SITES TO GROUPS:" + siteToResidueMap.keySet().size()); 3304 3305 //link the map of siteIds : <ResidueNumber> with the sites by using ResidueNumber to get the correct group back. 3306 //the return list 3307 3308 if ( siteMap == null || siteToResidueMap == null){ 3309 logger.info("Sites can not be linked to residues!"); 3310 3311 return; 3312 } 3313 3314 List<Site> sites = null; 3315 //check that there are chains with which to associate the groups 3316 if (structure.getChains().isEmpty()) { 3317 sites = new ArrayList<Site>(siteMap.values()); 3318 logger.info("No chains to link Site Groups with - Sites will not be present in the Structure"); 3319 return; 3320 } 3321 3322 //check that the keys in the siteMap and SiteToResidueMap are equal 3323 if (! siteMap.keySet().equals(siteToResidueMap.keySet())) { 3324 logger.info("Not all sites have been properly described in the PDB " + pdbId + " header - some Sites will not be present in the Structure"); 3325 logger.debug(siteMap.keySet() + " | " + siteToResidueMap.keySet()); 3326 //return; 3327 } 3328 3329 //so we have chains - associate the siteResidues-related groups with the ones 3330 //already in in the chains 3331 for (String key : siteMap.keySet()) { 3332 Site currentSite = siteMap.get(key); 3333 List<ResidueNumber> linkedGroups = siteToResidueMap.get(key); 3334 if ( linkedGroups == null) 3335 continue; 3336 for (ResidueNumber residueNumber : linkedGroups) { 3337 3338 String pdbCode = residueNumber.toString(); 3339 String chain = residueNumber.getChainName(); 3340 // System.out.println("chain: '" + chain + "'"); 3341 // String resNum = resNum.getSeqNum().toString(); 3342 // System.out.println("resNum: '" + resNum + "'"); 3343 3344 Group linkedGroup = null; 3345 try { 3346 //TODO: implement findGroup(ResidueNumber resNum) 3347 linkedGroup = 
structure.findGroup(chain, pdbCode); 3348 } catch (StructureException ex) { 3349 logger.info("Can't find group " + pdbCode + " in chain " + chain + " in order to link up SITE records (PDB ID " + pdbId +")"); 3350 continue; 3351 } 3352 3353 // System.out.println("Adding group: " + linkedGroup.getSeqNum() + " to site " + site.getSiteID()); 3354 currentSite.getGroups().add(linkedGroup); 3355 } 3356 } 3357 3358 //System.out.println("SITEMAP: " + siteMap); 3359 3360 sites = new ArrayList<Site>(siteMap.values()); 3361 structure.setSites(sites); 3362 //System.out.println("STRUCTURE SITES: " + structure.getSites().size()); 3363 // for (Site site : structure.getSites()) { 3364 // System.out.println(site); 3365 // } 3366 // System.out.println("Linked Site Groups with Chains"); 3367 3368 } 3369 3370 private void buildjournalArticle() { 3371 3372 logger.debug("building new JournalArticle"); 3373 // for (String line : journalLines) { 3374 // System.out.println(line); 3375 // } 3376 3377 this.journalArticle = new JournalArticle(); 3378 // JRNL AUTH M.HAMMEL,G.SFYROERA,D.RICKLIN,P.MAGOTTI, 3379 // JRNL AUTH 2 J.D.LAMBRIS,B.V.GEISBRECHT 3380 // JRNL TITL A STRUCTURAL BASIS FOR COMPLEMENT INHIBITION BY 3381 // JRNL TITL 2 STAPHYLOCOCCUS AUREUS. 3382 // JRNL REF NAT.IMMUNOL. V. 
8 430 2007 3383 // JRNL REFN ISSN 1529-2908 3384 // JRNL PMID 17351618 3385 // JRNL DOI 10.1038/NI1450 3386 StringBuffer auth = new StringBuffer(); 3387 StringBuffer titl = new StringBuffer(); 3388 StringBuffer edit = new StringBuffer(); 3389 StringBuffer ref = new StringBuffer(); 3390 StringBuffer publ = new StringBuffer(); 3391 StringBuffer refn = new StringBuffer(); 3392 StringBuffer pmid = new StringBuffer(); 3393 StringBuffer doi = new StringBuffer(); 3394 3395 for (String line : journalLines) { 3396 if ( line.length() < 19 ) { 3397 logger.info("can not process Journal line: " + line); 3398 continue; 3399 } 3400 // System.out.println("'" + line + "'"); 3401 String subField = line.substring(12, 16); 3402 // System.out.println("'" + subField + "'"); 3403 if (subField.equals("AUTH")) { 3404 auth.append(line.substring(19, line.length()).trim()); 3405 3406 logger.debug("AUTH '" + auth.toString() + "'"); 3407 3408 } 3409 if (subField.equals("TITL")) { 3410 //add a space to the end of a line so that when wrapped the 3411 //words on the join won't be concatenated 3412 titl.append(line.substring(19, line.length()).trim()).append(" "); 3413 3414 logger.debug("TITL '" + titl.toString() + "'"); 3415 3416 } 3417 if (subField.equals("EDIT")) { 3418 edit.append(line.substring(19, line.length()).trim()); 3419 3420 logger.debug("EDIT '" + edit.toString() + "'"); 3421 3422 } 3423 // JRNL REF NAT.IMMUNOL. V. 
8 430 2007 3424 if (subField.equals("REF ")) { 3425 ref.append(line.substring(19, line.length()).trim()).append(" "); 3426 3427 logger.debug("REF '" + ref.toString() + "'"); 3428 3429 } 3430 if (subField.equals("PUBL")) { 3431 publ.append(line.substring(19, line.length()).trim()).append(" "); 3432 3433 logger.debug("PUBL '" + publ.toString() + "'"); 3434 3435 } 3436 // JRNL REFN ISSN 1529-2908 3437 if (subField.equals("REFN")) { 3438 if ( line.length() < 35 ) { 3439 logger.info("can not process Journal REFN line: " + line); 3440 continue; 3441 } 3442 refn.append(line.substring(35, line.length()).trim()); 3443 3444 logger.debug("REFN '" + refn.toString() + "'"); 3445 3446 } 3447 // JRNL PMID 17351618 3448 if (subField.equals("PMID")) { 3449 pmid.append(line.substring(19, line.length()).trim()); 3450 3451 logger.debug("PMID '" + pmid.toString() + "'"); 3452 3453 } 3454 // JRNL DOI 10.1038/NI1450 3455 if (subField.equals("DOI ")) { 3456 doi.append(line.substring(19, line.length()).trim()); 3457 3458 logger.debug("DOI '" + doi.toString() + "'"); 3459 3460 } 3461 } 3462 3463 //now set the parts of the JournalArticle 3464 journalArticle.setAuthorList(authorBuilder(auth.toString())); 3465 journalArticle.setEditorList(authorBuilder(edit.toString())); 3466 journalArticle.setRef(ref.toString()); 3467 JournalParser journalParser = new JournalParser(ref.toString()); 3468 journalArticle.setJournalName(journalParser.getJournalName()); 3469 if (!journalArticle.getJournalName().equals("TO BE PUBLISHED")) { 3470 journalArticle.setIsPublished(true); 3471 } 3472 journalArticle.setVolume(journalParser.getVolume()); 3473 journalArticle.setStartPage(journalParser.getStartPage()); 3474 journalArticle.setPublicationDate(journalParser.getPublicationDate()); 3475 journalArticle.setPublisher(publ.toString().trim()); 3476 journalArticle.setTitle(titl.toString().trim()); 3477 journalArticle.setRefn(refn.toString().trim()); 3478 journalArticle.setPmid(pmid.toString().trim()); 3479 
journalArticle.setDoi(doi.toString().trim()); 3480 3481 3482 logger.debug("Made JournalArticle:"); 3483 logger.debug(journalArticle.toString()); 3484 3485 } 3486 3487 //inner class to deal with all the journal info 3488 private class JournalParser { 3489 3490 private String journalName; 3491 private String volume; 3492 private String startPage; 3493 private int publicationDate; 3494 3495 3496 public JournalParser(String ref) { 3497 3498 logger.debug("JournalParser init '" + ref + "'"); 3499 3500 3501 if (ref.equals("TO BE PUBLISHED ")) { 3502 journalName = ref.trim(); 3503 3504 logger.debug(String.format("JournalParser found journalString '%s'", journalName)); 3505 3506 return; 3507 } 3508 3509 if (ref.length() < 48) { 3510 logger.info("REF line too short - must be at least 48 characters to be valid for parsing."); 3511 journalName = ""; 3512 volume = ""; 3513 startPage = ""; 3514 publicationDate = 0; 3515 return; 3516 } 3517 //can be multi line: 3518 //REF PHILOS.TRANS.R.SOC.LONDON, V. 293 53 1981 3519 //REF 2 SER.B 3520 3521 //or 3522 3523 //REF GLYCOGEN PHOSPHORYLASE B: 1 1991 3524 //REF 2 DESCRIPTION OF THE PROTEIN 3525 //REF 3 STRUCTURE 3526 3527 //but usually single line 3528 //REF NUCLEIC ACIDS RES. 2009 3529 //REF MOL.CELL 2009 3530 //REF NAT.STRUCT.MOL.BIOL. V. 16 238 2009 3531 //REF ACTA CRYSTALLOGR.,SECT.F V. 65 199 2009 3532 //check if the date is present at the end of the line. 3533 // 09876543210987654321 3534 //'J.BIOL.CHEM. V. 280 23000 2005 ' 3535 //'J.AM.CHEM.SOC. V. 130 16011 2008 ' 3536 //'NAT.STRUCT.MOL.BIOL. V. 16 238 2009' 3537 String volumeInformation = ref.substring(30, 48); 3538 3539 logger.debug(String.format("Parsing volumeInformation: '%s'", volumeInformation)); 3540 3541 //volumeInformation: 'V. 
293 53 1981 ' 3542 // String dateString = ref.substring(ref.length() - 5 , ref.length() - 1).trim(); 3543 // String startPageString = ref.substring(ref.length() - 11 , ref.length() - 6).trim(); 3544 // String volumeString = ref.substring(ref.length() - 16 , ref.length() - 12).trim(); 3545 // String journalString = ref.substring(0 , ref.length() - 18).trim(); 3546 String dateString = volumeInformation.substring(volumeInformation.length() - 5 , volumeInformation.length() - 1).trim(); 3547 String startPageString = volumeInformation.substring(volumeInformation.length() - 11 , volumeInformation.length() - 6).trim(); 3548 String volumeString = volumeInformation.substring(volumeInformation.length() - 16 , volumeInformation.length() - 12).trim(); 3549 //for the journal string we need to remove the volume information which might be in the middle of the string (e.g. 1gpb, 3pfk) 3550 String journalString = ref.substring(0 , 29).trim() + " " + ref.substring(30, ref.length() - 1).replace(volumeInformation.trim(), "").trim(); 3551 journalString = journalString.trim(); 3552 // System.out.println("journalString: " + journalString); 3553 3554 logger.debug(String.format("JournalParser found volumeString '%s'", volumeString)); 3555 logger.debug(String.format("JournalParser found startPageString '%s'", startPageString)); 3556 logger.debug(String.format("JournalParser found dateString '%s'", dateString)); 3557 logger.debug(String.format("JournalParser found journalString '%s'", journalString)); 3558 3559 3560 if (!dateString.equals(" ")) { 3561 try { 3562 publicationDate = Integer.valueOf(dateString); 3563 } catch (NumberFormatException nfe) { 3564 logger.info(dateString + " is not a valid integer for a date in JRNL sub-section REF line 1"); 3565 } 3566 // if (DEBUG) { 3567 // System.out.println("JournalParser set date " + publicationDate); 3568 // } 3569 } 3570 3571 if (!startPageString.equals(" ")) { 3572 startPage = startPageString; 3573 // if (DEBUG) { 3574 // 
System.out.println("JournalParser set startPage " + startPage); 3575 // } 3576 } 3577 3578 if (!volumeString.equals(" ")) { 3579 volume = volumeString; 3580 // if (DEBUG) { 3581 // System.out.println("JournalParser set volume " + volume); 3582 // } 3583 } 3584 3585 if (!journalString.equals(" ")) { 3586 journalName = journalString; 3587 3588 logger.debug("JournalParser set journalName " + journalName); 3589 3590 } 3591 } 3592 3593 private String getJournalName() { 3594 return journalName; 3595 } 3596 3597 private int getPublicationDate() { 3598 return publicationDate; 3599 } 3600 3601 private String getStartPage() { 3602 return startPage; 3603 } 3604 3605 private String getVolume() { 3606 return volume; 3607 } 3608 } 3609 3610 private List<Author> authorBuilder(String authorString) { 3611 ArrayList<Author> authorList = new ArrayList<Author>(); 3612 3613 if (authorString.equals("")) { 3614 return authorList; 3615 } 3616 3617 String[] authors = authorString.split(","); 3618 // if (DEBUG) { 3619 // for (int i = 0; i < authors.length; i++) { 3620 // String string = authors[i]; 3621 // System.out.println("authorBuilder author: '" + string + "'"); 3622 // } 3623 // } 3624 // AUTH SEATTLE STRUCTURAL GENOMICS CENTER FOR INFECTIOUS 3625 // AUTH 2 DISEASE (SSGCID) 3626 // or 3627 // AUTH E.DOBROVETSKY,A.DONG,A.SEITOVA,B.DUNCAN,L.CROMBET, 3628 // AUTH 2 M.SUNDSTROM,C.H.ARROWSMITH,A.M.EDWARDS,C.BOUNTRA, 3629 // AUTH 3 A.BOCHKAREV,D.COSSAR, 3630 // AUTH 4 STRUCTURAL GENOMICS CONSORTIUM (SGC) 3631 // or 3632 // AUTH T.-C.MOU,S.R.SPRANG,N.MASADA,D.M.F.COOPER 3633 if (authors.length == 1) { 3634 //only one element means it's a consortium only 3635 Author author = new Author(); 3636 author.setSurname(authors[0]); 3637 3638 logger.debug("Set consortium author name " + author.getSurname()); 3639 3640 authorList.add(author); 3641 } else { 3642 for (int i = 0; i < authors.length; i++) { 3643 String authorFullName = authors[i]; 3644 3645 logger.debug("Building author " + 
authorFullName); 3646 3647 Author author = new Author(); 3648 String regex = "\\."; 3649 String[] authorNames = authorFullName.split(regex); 3650 // if (DEBUG) { 3651 // System.out.println("authorNames size " + authorNames.length); 3652 // for (int j = 0; j < authorNames.length; j++) { 3653 // String name = authorNames[j]; 3654 // System.out.println("split authName '" + name + "'"); 3655 // 3656 // } 3657 // } 3658 if (authorNames.length == 0) { 3659 author.setSurname(authorFullName); 3660 3661 logger.debug("Unable to split using '" + regex + "' Setting whole name " + author.getSurname()); 3662 3663 } 3664 //again there might be a consortium name so there may be no elements 3665 else if (authorNames.length == 1) { 3666 author.setSurname(authorNames[0]); 3667 3668 logger.debug("Set consortium author name in multiple author block " + author.getSurname 3669 ()); 3670 3671 } else { 3672 String initials = ""; 3673 for (int j = 0; j < authorNames.length - 1; j++) { 3674 String initial = authorNames[j]; 3675 // if (DEBUG) { 3676 // System.out.println("adding initial '" + initial + "'"); 3677 // } 3678 //build the initials back up again 3679 initials += initial + "."; 3680 } 3681 3682 logger.debug("built initials '" + initials + "'"); 3683 3684 author.setInitials(initials); 3685 //surname is always last 3686 int lastName = authorNames.length - 1; 3687 String surname = authorNames[lastName]; 3688 3689 logger.debug("built author surname " + surname); 3690 3691 author.setSurname(surname); 3692 3693 } 3694 authorList.add(author); 3695 } 3696 } 3697 return authorList; 3698 } 3699 3700 public void setFileParsingParameters(FileParsingParameters params) 3701 { 3702 this.params= params; 3703 3704 // set the correct max values for parsing... 3705 loadMaxAtoms = params.getMaxAtoms(); 3706 atomCAThreshold = params.getAtomCaThreshold(); 3707 3708 3709 } 3710 3711 public FileParsingParameters getFileParsingParameters(){ 3712 return params; 3713 } 3714 3715 3716}