001/* 002 * 003 * This code may be freely distributed and modified under the 004 * terms of the GNU Lesser General Public Licence. This should 005 * be distributed with the code. If you do not have a copy, 006 * see: 007 * 008 * http://www.gnu.org/copyleft/lesser.html 009 * 010 * Copyright for this code is held jointly by the individual 011 * authors. These should be listed in @author doc comments. 012 * 013 * For more information on the BioJava project and its aims, 014 * or to join the biojava-l mailing list, visit the home page 015 * at: 016 * 017 * http://www.biojava.org/ 018 * 019 * Created on 16.03.2004 020 * 021 */ 022package org.biojava.nbio.structure.io; 023 024import static java.lang.Math.min; 025 026import java.io.BufferedReader; 027import java.io.IOException; 028import java.io.InputStream; 029import java.io.InputStreamReader; 030import java.text.DateFormat; 031import java.text.ParseException; 032import java.text.SimpleDateFormat; 033import java.util.ArrayList; 034import java.util.Arrays; 035import java.util.Date; 036import java.util.HashMap; 037import java.util.Iterator; 038import java.util.LinkedHashMap; 039import java.util.List; 040import java.util.Locale; 041import java.util.Map; 042import java.util.StringTokenizer; 043import java.util.regex.Matcher; 044import java.util.regex.Pattern; 045 046import javax.vecmath.Matrix4d; 047 048import org.biojava.nbio.structure.AminoAcid; 049import org.biojava.nbio.structure.AminoAcidImpl; 050import org.biojava.nbio.structure.Atom; 051import org.biojava.nbio.structure.AtomImpl; 052import org.biojava.nbio.structure.Author; 053import org.biojava.nbio.structure.Chain; 054import org.biojava.nbio.structure.ChainImpl; 055import org.biojava.nbio.structure.DBRef; 056import org.biojava.nbio.structure.Element; 057import org.biojava.nbio.structure.EntityInfo; 058import org.biojava.nbio.structure.EntityType; 059import org.biojava.nbio.structure.Group; 060import org.biojava.nbio.structure.GroupIterator; 061import 
org.biojava.nbio.structure.HetatomImpl; 062import org.biojava.nbio.structure.JournalArticle; 063import org.biojava.nbio.structure.NucleotideImpl; 064import org.biojava.nbio.structure.PDBCrystallographicInfo; 065import org.biojava.nbio.structure.PDBHeader; 066import org.biojava.nbio.structure.PdbId; 067import org.biojava.nbio.structure.ResidueNumber; 068import org.biojava.nbio.structure.Site; 069import org.biojava.nbio.structure.Structure; 070import org.biojava.nbio.structure.StructureException; 071import org.biojava.nbio.structure.StructureImpl; 072import org.biojava.nbio.structure.StructureTools; 073import org.biojava.nbio.structure.chem.ChemCompAtom; 074import org.biojava.nbio.structure.chem.ChemCompGroupFactory; 075import org.biojava.nbio.structure.io.util.PDBTemporaryStorageUtils.LinkRecord; 076import org.biojava.nbio.structure.secstruc.SecStrucInfo; 077import org.biojava.nbio.structure.secstruc.SecStrucType; 078import org.biojava.nbio.structure.xtal.CrystalCell; 079import org.biojava.nbio.structure.xtal.SpaceGroup; 080import org.biojava.nbio.structure.xtal.SymoplibParser; 081import org.slf4j.Logger; 082import org.slf4j.LoggerFactory; 083 084 085/** 086 * This class implements the actual PDB file parsing. Do not access it directly, but 087 * via the PDBFileReader class. 088 * 089 * <h2>Parsing</h2> 090 * 091 * During the PDBfile parsing several Flags can be set. See the {@link #setFileParsingParameters(FileParsingParameters)} methods. 092 * 093 * 094 * <p> 095 * To prevent excessive memory usage for large PDB files, there is the ATOM_CA_THRESHOLD. 096 * If more Atoms than this threshold are being parsed in a PDB file, the parser will automatically 097 * switch to a C-alpha only representation. 098 * 099 * <p> 100 * The result of the parsing of the PDB file is a new {@link Structure} object. 
101 * 102 * <p> 103 * For more documentation on how to work with the Structure API please 104 * see <a href="http://biojava.org/wiki/BioJava:CookBook#Protein_Structure" target="_top"> 105 * http://biojava.org/wiki/BioJava:CookBook#Protein_Structure</a> 106 * 107 * 108 * 109 * 110 * <h2>Example</h2> 111 * <p> 112 * Q: How can I get a Structure object from a PDB file? 113 * <p> 114 * A: 115 * <pre> 116 * public {@link Structure} loadStructure(String pathToPDBFile){ 117 * // The PDBFileParser is wrapped by the PDBFileReader 118 * {@link PDBFileReader} pdbreader = new {@link PDBFileReader}(); 119 * 120 * {@link Structure} structure = null; 121 * try{ 122 * structure = pdbreader.getStructure(pathToPDBFile); 123 * System.out.println(structure); 124 * } catch (IOException e) { 125 * e.printStackTrace(); 126 * } 127 * return structure; 128 * } 129 * </pre> 130 * 131 * 132 * @author Andreas Prlic 133 * @author Jules Jacobsen 134 * @author Jose Duarte 135 * @since 1.4 136 */ 137public class PDBFileParser { 138 139 140 141 private static final Logger logger = LoggerFactory.getLogger(PDBFileParser.class); 142 143 // for printing 144 private static final String NEWLINE = System.getProperty("line.separator"); 145 146 147 // required for parsing: 148 private String pdbId; //the actual id of the entry 149 private Structure structure; 150 private List<List<Chain>> allModels; // a temp data structure to keep all models 151 private List<Chain> currentModel; // contains the ATOM records for each model 152 private Chain currentChain; 153 private Group currentGroup; 154 155 private List<Chain> seqResChains; // contains all the chains for the SEQRES records 156 //we're going to work on the assumption that the files are current - 157 //if the pdb_HEADER_Handler detects a legacy format, this will be changed to true. 
158 //if true then lines will be truncated at 72 characters in certain cases 159 //(pdb_COMPOUND_handler for example) 160 private boolean isLegacyFormat = false; 161 162 private boolean blankChainIdsPresent = false; 163 164 // for re-creating the biological assembly 165 private PDBBioAssemblyParser bioAssemblyParser = null; 166 167 private PDBHeader pdbHeader; 168 private PDBCrystallographicInfo crystallographicInfo; 169 private JournalArticle journalArticle; 170 private List<Map<String, Integer>> connects ; 171 private List<Map<String,String>> helixList; 172 private List<Map<String,String>> strandList; 173 private List<Map<String,String>> turnList; 174 175 private int lengthCheck ; 176 177 private boolean isLastCompndLine = false; 178 private boolean isLastSourceLine = false; 179 private EntityInfo current_compound; 180 private List<EntityInfo> entities = new ArrayList<>(); 181 private HashMap<Integer,List<String>> compoundMolIds2chainIds = new HashMap<>(); 182 private List<String> compndLines = new ArrayList<>(); 183 private List<String> sourceLines = new ArrayList<>(); 184 private List<String> journalLines = new ArrayList<>(); 185 private List<String> keywordsLines = new ArrayList<>(); 186 private List<DBRef> dbrefs; 187 private Map<String, Site> siteMap = new LinkedHashMap<>(); 188 private Map<String, List<ResidueNumber>> siteToResidueMap = new LinkedHashMap<>(); 189 190 private List<SSBondImpl> ssbonds = new ArrayList<>(); 191 192 // for storing LINK until we have all the atoms parsed 193 private List<LinkRecord> linkRecords; 194 195 private Matrix4d currentNcsOp; 196 private List<Matrix4d> ncsOperators; 197 198 // for parsing COMPOUND and SOURCE Header lines 199 private int prevMolId; 200 private String previousContinuationField; 201 private String continuationField; 202 private String continuationString; 203 204 private DateFormat dateFormat; 205 206 // for rfree parsing 207 private float rfreeStandardLine = -1; 208 private float rfreeNoCutoffLine = -1; 209 
/**
 * Field names that {@code pdb_COMPND_Handler} accepts as the start of a
 * COMPND field (each one ends with a colon).
 */
private static final List<String> compndFieldValues = new ArrayList<>(
        Arrays.asList(
                "MOL_ID:",
                "MOLECULE:",
                "CHAIN:",
                "SYNONYM:",
                "EC:",
                "FRAGMENT:",
                "ENGINEERED:",
                "MUTATION:",
                "BIOLOGICAL_UNIT:",
                "OTHER_DETAILS:"));

/**
 * COMPND tokens that are looked up by {@code pdb_COMPND_Handler} as values to ignore.
 * The misspelt ENGINEEREED occurs in pdb219d.
 */
private static final List<String> ignoreCompndFieldValues = new ArrayList<>(
        Arrays.asList(
                "HETEROGEN:",
                "ENGINEEREED:",
                "FRAGMENT,",
                "MUTANT:",
                "SYNTHETIC:"));

/**
 * Field names for SOURCE header lines (presumably consumed by the SOURCE
 * handler, which is outside this part of the file).
 */
private static final List<String> sourceFieldValues = new ArrayList<>(
        Arrays.asList(
                "ENGINEERED:",
                "MOL_ID:",
                "SYNTHETIC:",
                "FRAGMENT:",
                "ORGANISM_SCIENTIFIC:",
                "ORGANISM_COMMON:",
                "ORGANISM_TAXID:",
                "STRAIN:",
                "VARIANT:",
                "CELL_LINE:",
                "ATCC:",
                "ORGAN:",
                "TISSUE:",
                "CELL:",
                "ORGANELLE:",
                "SECRETION:",
                "GENE:",
                "CELLULAR_LOCATION:",
                "EXPRESSION_SYSTEM:",
                "EXPRESSION_SYSTEM_TAXID:",
                "EXPRESSION_SYSTEM_STRAIN:",
                "EXPRESSION_SYSTEM_VARIANT:",
                "EXPRESSION_SYSTEM_CELL_LINE:",
                "EXPRESSION_SYSTEM_ATCC_NUMBER:",
                "EXPRESSION_SYSTEM_ORGAN:",
                "EXPRESSION_SYSTEM_TISSUE:",
                "EXPRESSION_SYSTEM_CELL:",
                "EXPRESSION_SYSTEM_ORGANELLE:",
                "EXPRESSION_SYSTEM_CELLULAR_LOCATION:",
                "EXPRESSION_SYSTEM_VECTOR_TYPE:",
                "EXPRESSION_SYSTEM_VECTOR:",
                "EXPRESSION_SYSTEM_PLASMID:",
                "EXPRESSION_SYSTEM_GENE:",
                "OTHER_DETAILS:"));

private int atomCount;

// parsing options:

private int atomCAThreshold;

private int loadMaxAtoms;

private boolean atomOverflow;

/** flag to tell parser to only read Calpha coordinates **/
private boolean parseCAonly;


private FileParsingParameters params;

private boolean startOfMolecule;
private boolean startOfModel;

/**
 * Creates a parser with default {@link FileParsingParameters} and empty
 * per-file state.
 */
public PDBFileParser() {
    // Options first: the two atom limits below are read straight from them.
    // Note that setFileParsingParameters(params) is deliberately NOT called here
    // (the original carries a "DONOT" warning about it).
    params = new FileParsingParameters();
    loadMaxAtoms = params.getMaxAtoms();
    atomCAThreshold = params.getAtomCaThreshold();

    // structure under construction
    structure = null;
    allModels = new ArrayList<>();
    currentModel = null;
    currentChain = null;
    currentGroup = null;

    // we initialise to true since at the beginning of the file we are always starting a new molecule
    startOfMolecule = true;
    startOfModel = true;

    // header-level containers
    pdbHeader = new PDBHeader();
    crystallographicInfo = new PDBCrystallographicInfo();
    connects = new ArrayList<>();
    helixList = new ArrayList<>();
    strandList = new ArrayList<>();
    turnList = new ArrayList<>();
    dbrefs = new ArrayList<>();
    linkRecords = new ArrayList<>();
    current_compound = null;
    siteMap = null;

    dateFormat = new SimpleDateFormat("dd-MMM-yy", Locale.US);

    // counters and flags
    atomCount = 0;
    atomOverflow = false;
    parseCAonly = false;
    blankChainIdsPresent = false;
}

/**
 * Creates the group object for a residue: the chemical component dictionary
 * is asked first, and only if it yields nothing usable is a plain Hetatom,
 * Nucleotide or AminoAcid created.
 *
 * @param recordName record type of the calling line (not used by this method)
 * @param aminoCode1 one-letter code of the residue, may be null
 * @param aminoCode3 three-letter residue name
 * @return a new group, never null
 */
private Group getNewGroup(String recordName,Character aminoCode1, String aminoCode3) {

    Group fromDictionary = ChemCompGroupFactory.getGroupFromChemCompDictionary(aminoCode3);
    if (fromDictionary != null && !fromDictionary.getChemComp().isEmpty()) {
        return fromDictionary;
    }

    // no one-letter code, or the "unknown" label: treat as hetero group
    if (aminoCode1 == null || StructureTools.UNKNOWN_GROUP_LABEL == aminoCode1) {
        return new HetatomImpl();
    }

    if (StructureTools.isNucleotide(aminoCode3)) {
        return new NucleotideImpl();
    }

    AminoAcidImpl aminoAcid = new AminoAcidImpl();
    aminoAcid.setAminoType(aminoCode1);
    return aminoAcid;
}



// Handler methods to deal with PDB file records properly.
333 /** 334 Handler for 335 HEADER Record Format 336 <pre> 337 COLUMNS DATA TYPE FIELD DEFINITION 338 ---------------------------------------------------------------------------------- 339 1 - 6 Record name "HEADER" 340 11 - 50 String(40) classification Classifies the molecule(s) 341 51 - 59 Date depDate Deposition date. This is the date 342 the coordinates were received by 343 the PDB 344 63 - 66 IDcode idCode This identifier is unique within PDB 345 </pre> 346 */ 347 private void pdb_HEADER_Handler(String line) { 348 349 String classification = null; 350 String deposition_date = null; 351 String pdbCode = null; 352 353 int len = line.trim().length(); 354 if(len > 10) { 355 classification = line.substring (10, min(len,50)).trim() ; 356 pdbHeader.setClassification(classification); 357 } 358 if(len > 50) { 359 deposition_date = line.substring (50, min(len,59)).trim() ; 360 try { 361 Date dep = dateFormat.parse(deposition_date); 362 pdbHeader.setDepDate(dep); 363 364 } catch (ParseException e){ 365 logger.info("Could not parse deposition date string '"+deposition_date+"'. Will continue without deposition date"); 366 } 367 } 368 if(len > 62) { 369 pdbCode = line.substring (62, min(len,66)).trim() ; 370 pdbId = pdbCode; 371 372 logger.debug("Parsing entry {}", pdbId); 373 374 PdbId pdbIdToSet; 375 if(pdbCode.isBlank()) { 376 pdbIdToSet = null; 377 } else { 378 try { 379 pdbIdToSet = new PdbId(pdbCode); 380 } catch (IllegalArgumentException e) { 381 logger.warn("Malformed PDB ID {}. setting PdbId to null", pdbCode); 382 pdbIdToSet = null; 383 } 384 } 385 structure.setPdbId(pdbIdToSet); 386 pdbHeader.setPdbId(pdbIdToSet); 387 } 388 389 //*really* old files (you'll need to hunt to find these as they 390 //should have been remediated) have headers like below. 
Plus the 391 //pdbId at positions 72-76 is present in every line 392 393 //HEADER PROTEINASE INHIBITOR (TRYPSIN) 05-OCT-84 5PTI 5PTI 3 394 //HEADER TRANSFERASE (ACYLTRANSFERASE) 02-SEP-92 1LAC 1LAC 2 395 if (len > 66) { 396 if (pdbId.equals(line.substring (72, 76))){ 397 isLegacyFormat = true; 398 logger.warn(pdbId + " is a LEGACY entry - this will most likely not parse correctly."); 399 } 400 } 401 402 } 403 404 405 /** 406 * Parses the following record: 407 * <pre> 408 * COLUMNS DATA TYPE FIELD DEFINITION 409 * ------------------------------------------------------------------------------------ 410 * 1 - 6 Record name "AUTHOR" 411 * 9 - 10 Continuation continuation Allows concatenation of multiple records. 412 * 11 - 79 List authorList List of the author names, separated 413 * by commas. 414 * 415 * </pre> 416 * @param line 417 */ 418 private void pdb_AUTHOR_Handler(String line) { 419 420 String authors = line.substring(10).trim(); 421 422 String auth = pdbHeader.getAuthors(); 423 if (auth == null){ 424 pdbHeader.setAuthors(authors); 425 } else { 426 auth += authors; 427 pdbHeader.setAuthors(auth); 428 } 429 430 } 431 432 433 434 /** 435 * Parses the following record: 436 * 437 * <pre> 438 * COLUMNS DATA TYPE FIELD DEFINITION 439 * -------------------------------------------------------------------- 440 * 1 - 6 Record name "HELIX " 441 * 8 - 10 Integer serNum Serial number of the helix. 442 * This starts at 1 and increases 443 * incrementally. 444 * 12 - 14 LString(3) helixID Helix identifier. In addition 445 * to a serial number, each helix is 446 * given an alphanumeric character 447 * helix identifier. 448 * 16 - 18 Residue name initResName Name of the initial residue. 449 * 20 Character initChainID Chain identifier for the chain 450 * containing this helix. 451 * 22 - 25 Integer initSeqNum Sequence number of the initial 452 * residue. 453 * 26 AChar initICode Insertion code of the initial 454 * residue. 
455 * 28 - 30 Residue name endResName Name of the terminal residue of 456 * the helix. 457 * 32 Character endChainID Chain identifier for the chain 458 * containing this helix. 459 * 34 - 37 Integer endSeqNum Sequence number of the terminal 460 * residue. 461 * 38 AChar endICode Insertion code of the terminal 462 * residue. 463 * 39 - 40 Integer helixClass Helix class (see below). 464 * 41 - 70 String comment Comment about this helix. 465 * 72 - 76 Integer length Length of this helix. 466 * </pre> 467 */ 468 private void pdb_HELIX_Handler(String line){ 469 470 if (params.isHeaderOnly()) return; 471 472 if (line.length()<38) { 473 logger.info("HELIX line has length under 38. Ignoring it."); 474 return; 475 } 476 477 String initResName = line.substring(15,18).trim(); 478 String initChainId = line.substring(19,20); 479 String initSeqNum = line.substring(21,25).trim(); 480 String initICode = line.substring(25,26); 481 String endResName = line.substring(27,30).trim(); 482 String endChainId = line.substring(31,32); 483 String endSeqNum = line.substring(33,37).trim(); 484 String endICode = line.substring(37,38); 485 486 //System.out.println(initResName + " " + initChainId + " " + initSeqNum + " " + initICode + " " + 487 // endResName + " " + endChainId + " " + endSeqNum + " " + endICode); 488 489 Map<String,String> m = new HashMap<>(); 490 491 m.put("initResName",initResName); 492 m.put("initChainId", initChainId); 493 m.put("initSeqNum", initSeqNum); 494 m.put("initICode", initICode); 495 m.put("endResName", endResName); 496 m.put("endChainId", endChainId); 497 m.put("endSeqNum",endSeqNum); 498 m.put("endICode",endICode); 499 500 helixList.add(m); 501 502 } 503 504 /** 505 * Handler for 506 * <pre> 507 * COLUMNS DATA TYPE FIELD DEFINITION 508 * -------------------------------------------------------------- 509 * 1 - 6 Record name "SHEET " 510 * 8 - 10 Integer strand Strand number which starts at 1 511 * for each strand within a sheet 512 * and increases by one. 
513 * 12 - 14 LString(3) sheetID Sheet identifier. 514 * 15 - 16 Integer numStrands Number of strands in sheet. 515 * 18 - 20 Residue name initResName Residue name of initial residue. 516 * 22 Character initChainID Chain identifier of initial 517 * residue in strand. 518 * 23 - 26 Integer initSeqNum Sequence number of initial 519 * residue in strand. 520 * 27 AChar initICode Insertion code of initial residue 521 * in strand. 522 * 29 - 31 Residue name endResName Residue name of terminal residue. 523 * 33 Character endChainID Chain identifier of terminal 524 * residue. 525 * 34 - 37 Integer endSeqNum Sequence number of terminal 526 * residue. 527 * 38 AChar endICode Insertion code of terminal 528 * residue. 529 * 39 - 40 Integer sense Sense of strand with respect to 530 * previous strand in the sheet. 0 531 * if first strand, 1 if parallel, 532 * -1 if anti-parallel. 533 * 42 - 45 Atom curAtom Registration. Atom name in 534 * current strand. 535 * 46 - 48 Residue name curResName Registration. Residue name in 536 * current strand. 537 * 50 Character curChainId Registration. Chain identifier in 538 * current strand. 539 * 51 - 54 Integer curResSeq Registration. Residue sequence 540 * number in current strand. 541 * 55 AChar curICode Registration. Insertion code in 542 * current strand. 543 * 57 - 60 Atom prevAtom Registration. Atom name in 544 * previous strand. 545 * 61 - 63 Residue name prevResName Registration. Residue name in 546 * previous strand. 547 * 65 Character prevChainId Registration. Chain identifier in 548 * previous strand. 549 * 66 - 69 Integer prevResSeq Registration. Residue sequence 550 * number in previous strand. 551 * 70 AChar prevICode Registration. Insertion code in 552 * previous strand. 553 * </pre> 554 */ 555 private void pdb_SHEET_Handler( String line){ 556 557 if (params.isHeaderOnly()) return; 558 559 if (line.length()<38) { 560 logger.info("SHEET line has length under 38. 
Ignoring it."); 561 return; 562 } 563 564 String initResName = line.substring(17,20).trim(); 565 String initChainId = line.substring(21,22); 566 String initSeqNum = line.substring(22,26).trim(); 567 String initICode = line.substring(26,27); 568 String endResName = line.substring(28,31).trim(); 569 String endChainId = line.substring(32,33); 570 String endSeqNum = line.substring(33,37).trim(); 571 String endICode = line.substring(37,38); 572 573 //System.out.println(initResName + " " + initChainId + " " + initSeqNum + " " + initICode + " " + 574 // endResName + " " + endChainId + " " + endSeqNum + " " + endICode); 575 576 Map<String,String> m = new HashMap<>(); 577 578 m.put("initResName",initResName); 579 m.put("initChainId", initChainId); 580 m.put("initSeqNum", initSeqNum); 581 m.put("initICode", initICode); 582 m.put("endResName", endResName); 583 m.put("endChainId", endChainId); 584 m.put("endSeqNum",endSeqNum); 585 m.put("endICode",endICode); 586 587 strandList.add(m); 588 } 589 590 591 /** 592 * Handler for TURN lines 593 * <pre> 594 * COLUMNS DATA TYPE FIELD DEFINITION 595 * -------------------------------------------------------------------- 596 * 1 - 6 Record name "TURN " 597 * 8 - 10 Integer seq Turn number; starts with 1 and 598 * increments by one. 599 * 12 - 14 LString(3) turnId Turn identifier 600 * 16 - 18 Residue name initResName Residue name of initial residue in 601 * turn. 602 * 20 Character initChainId Chain identifier for the chain 603 * containing this turn. 604 * 21 - 24 Integer initSeqNum Sequence number of initial residue 605 * in turn. 606 * 25 AChar initICode Insertion code of initial residue 607 * in turn. 608 * 27 - 29 Residue name endResName Residue name of terminal residue 609 * of turn. 610 * 31 Character endChainId Chain identifier for the chain 611 * containing this turn. 612 * 32 - 35 Integer endSeqNum Sequence number of terminal 613 * residue of turn. 614 * 36 AChar endICode Insertion code of terminal residue 615 * of turn. 
616 * 41 - 70 String comment Associated comment. 617 * </pre> 618 * @param line 619 */ 620 private void pdb_TURN_Handler( String line){ 621 622 if (params.isHeaderOnly()) return; 623 624 if (line.length()<36) { 625 logger.info("TURN line has length under 36. Ignoring it."); 626 return; 627 } 628 629 String initResName = line.substring(15,18).trim(); 630 String initChainId = line.substring(19,20); 631 String initSeqNum = line.substring(20,24).trim(); 632 String initICode = line.substring(24,25); 633 String endResName = line.substring(26,29).trim(); 634 String endChainId = line.substring(30,31); 635 String endSeqNum = line.substring(31,35).trim(); 636 String endICode = line.substring(35,36); 637 638 //System.out.println(initResName + " " + initChainId + " " + initSeqNum + " " + initICode + " " + 639 // endResName + " " + endChainId + " " + endSeqNum + " " + endICode); 640 641 Map<String,String> m = new HashMap<>(); 642 643 m.put("initResName",initResName); 644 m.put("initChainId", initChainId); 645 m.put("initSeqNum", initSeqNum); 646 m.put("initICode", initICode); 647 m.put("endResName", endResName); 648 m.put("endChainId", endChainId); 649 m.put("endSeqNum",endSeqNum); 650 m.put("endICode",endICode); 651 652 turnList.add(m); 653 } 654 655 /** 656 * Handler for 657 * REVDAT Record format: 658 * <pre> 659 * 660 * COLUMNS DATA TYPE FIELD DEFINITION 661 * ---------------------------------------------------------------------------------- 662 * 1 - 6 Record name "REVDAT" 663 * 8 - 10 Integer modNum Modification number. 664 * 11 - 12 Continuation continuation Allows concatenation of multiple 665 * records. 666 * 14 - 22 Date modDate Date of modification (or release for 667 * new entries). This is not repeated 668 * on continuation lines. 669 * 24 - 28 String(5) modId Identifies this particular 670 * modification. It links to the 671 * archive used internally by PDB. 672 * This is not repeated on continuation 673 * lines. 
674 * 32 Integer modType An integer identifying the type of 675 * modification. In case of revisions 676 * with more than one possible modType, 677 * the highest value applicable will be 678 * assigned. 679 * 40 - 45 LString(6) record Name of the modified record. 680 * 47 - 52 LString(6) record Name of the modified record. 681 * 54 - 59 LString(6) record Name of the modified record. 682 * 61 - 66 LString(6) record Name of the modified record. 683 * </pre> 684 */ 685 private void pdb_REVDAT_Handler(String line) { 686 687 // keep the first as latest modified date and the last as release date 688 Date modDate = pdbHeader.getModDate(); 689 690 if ( modDate==null || modDate.equals(new Date(0)) ) { 691 692 // modified date is still uninitialized 693 String modificationDate = line.substring (13, 22).trim() ; 694 695 try { 696 Date dep = dateFormat.parse(modificationDate); 697 pdbHeader.setModDate(dep); 698 pdbHeader.setRelDate(dep); 699 } catch (ParseException e){ 700 logger.info("Could not parse revision date string '"+modificationDate+"'. "); 701 } 702 703 } else { 704 705 // set as the release date 706 String releaseDate = line.substring (13, 22).trim() ; 707 708 try { 709 Date dep = dateFormat.parse(releaseDate); 710 pdbHeader.setRelDate(dep); 711 } catch (ParseException e){ 712 logger.info("Could not parse revision date string '"+releaseDate+"'. "); 713 } 714 } 715 } 716 717 /** 718 * Handler for 719 * SEQRES record format 720 * SEQRES records contain the amino acid or nucleic acid sequence of residues in each chain of the macromolecule that was studied. 721 * <p> 722 * Record Format: 723 * <p> 724 * <pre> 725 * COLUMNS DATA TYPE FIELD DEFINITION 726 * --------------------------------------------------------------------------------- 727 * 1 - 6 Record name "SEQRES" 728 * 9 - 10 Integer serNum Serial number of the SEQRES record 729 * for the current chain. Starts at 1 730 * and increments by one each line. 731 * Reset to 1 for each chain. 
732 * 12 Character chainID Chain identifier. This may be any 733 * single legal character, including a 734 * blank which is used if there is 735 * only one chain. 736 * 14 - 17 Integer numRes Number of residues in the chain. 737 * This value is repeated on every 738 * record. 739 * 20 - 22 Residue name resName Residue name. 740 * 24 - 26 Residue name resName Residue name. 741 * 28 - 30 Residue name resName Residue name. 742 * 32 - 34 Residue name resName Residue name. 743 * 36 - 38 Residue name resName Residue name. 744 * 40 - 42 Residue name resName Residue name. 745 * 44 - 46 Residue name resName Residue name. 746 * 48 - 50 Residue name resName Residue name. 747 * 52 - 54 Residue name resName Residue name. 748 * 56 - 58 Residue name resName Residue name. 749 * 60 - 62 Residue name resName Residue name. 750 * 64 - 66 Residue name resName Residue name. 751 * 68 - 70 Residue name resName Residue name. 752 * </pre> 753 * @author Jules Jacobsen 754 */ 755 private void pdb_SEQRES_Handler(String line) { 756 757 /* 758 * 1 2 3 4 5 6 7 759 * 1234567890123456789012345678901234567890123456789012345678901234567890 760 * SEQRES 1 A 376 LYS PRO VAL THR VAL LYS LEU VAL ASP SER GLN ALA THR 761 * SEQRES 1 A 21 GLY ILE VAL GLU GLN CYS CYS THR SER ILE CYS SER LEU 762 * SEQRES 2 A 21 TYR GLN LEU GLU ASN TYR CYS ASN 763 * SEQRES 1 B 30 PHE VAL ASN GLN HIS LEU CYS GLY SER HIS LEU VAL GLU 764 * SEQRES 2 B 30 ALA LEU TYR LEU VAL CYS GLY GLU ARG GLY PHE PHE TYR 765 * SEQRES 3 B 30 THR PRO LYS ALA 766 * SEQRES 1 C 21 GLY ILE VAL GLU GLN CYS CYS THR SER ILE CYS SER LEU 767 * SEQRES 2 C 21 TYR GLN LEU GLU ASN TYR CYS ASN 768 * SEQRES 1 D 30 PHE VAL ASN GLN HIS LEU CYS GLY SER HIS LEU VAL GLU 769 * SEQRES 2 D 30 ALA LEU TYR LEU VAL CYS GLY GLU ARG GLY PHE PHE TYR 770 * SEQRES 3 D 30 THR PRO LYS ALA 771 */ 772 773 String recordName = line.substring(0, 6).trim(); 774 String chainID = line.substring(11, 12); 775 String newLength = line.substring(13,17).trim(); 776 String subSequence = 
line.substring(18); 777 778 if ( lengthCheck == -1 ){ 779 lengthCheck = Integer.parseInt(newLength); 780 } 781 782 StringTokenizer subSequenceResidues = new StringTokenizer(subSequence); 783 784 Character aminoCode1 = null; 785 if (! recordName.equals(AminoAcid.SEQRESRECORD)) { 786 // should not have been called 787 return; 788 } 789 790 currentChain = isKnownChain(chainID, seqResChains); 791 if ( currentChain == null) { 792 793 currentChain = new ChainImpl(); 794 currentChain.setId(chainID); 795 currentChain.setName(chainID); 796 797 } 798 799 while (subSequenceResidues.hasMoreTokens()) { 800 801 String threeLetter = subSequenceResidues.nextToken(); 802 803 aminoCode1 = StructureTools.get1LetterCode(threeLetter); 804 805 //if (aminoCode1 == null) { 806 // could be a nucleotide... 807 // but getNewGroup takes care of that and converts ATOM records with aminoCode1 == nnull to nucleotide... 808 //} 809 currentGroup = getNewGroup("ATOM", aminoCode1, threeLetter); 810 811 currentGroup.setPDBName(threeLetter); 812 813 if ( currentGroup instanceof AminoAcid){ 814 AminoAcid aa = (AminoAcid)currentGroup; 815 aa.setRecordType(AminoAcid.SEQRESRECORD); 816 } 817 // add the current resNum to the new chain. 818 currentChain.addGroup(currentGroup); 819 820 } 821 Chain test = isKnownChain(chainID, seqResChains); 822 823 if ( test == null) 824 seqResChains.add(currentChain); 825 826 if (currentGroup != null) 827 currentGroup.trimToSize(); 828 829 currentGroup = null; 830 currentChain = null; 831 832 // the current chain is finished! 
833 //if ( current_chain.getLength() != lengthCheck ){ 834 // System.err.println("the length of chain " + current_chain.getName() + "(" + 835 // current_chain.getLength() + ") does not match the expected " + lengthCheck); 836 //} 837 838 lengthCheck = Integer.parseInt(newLength); 839 840 } 841 842 843 844 /** 845 * Handler for 846 * TITLE Record Format 847 * <pre> 848 COLUMNS DATA TYPE FIELD DEFINITION 849 ---------------------------------------------------------------------------------- 850 1 - 6 Record name "TITLE " 851 9 - 10 Continuation continuation Allows concatenation of multiple 852 records. 853 11 - 70 String title Title of the experiment. 854 * </pre> 855 * 856 */ 857 private void pdb_TITLE_Handler(String line) { 858 String title; 859 if ( line.length() > 79) 860 title = line.substring(10,80).trim(); 861 else 862 title = line.substring(10,line.length()).trim(); 863 864 String t = pdbHeader.getTitle(); 865 if ( (t != null) && (! "".equals(t)) ){ 866 if (t.endsWith("-")) 867 t += ""; // if last line ends with a hyphen then we don't add space 868 else 869 t += " "; 870 } 871 else t = ""; 872 873 t += title; 874 875 pdbHeader.setTitle(t); 876 } 877 878 /** 879 * JRNL handler. 880 * The JRNL record contains the primary literature citation that describes the experiment which resulted 881 * in the deposited coordinate set. There is at most one JRNL reference per entry. If there is no primary 882 * reference, then there is no JRNL reference. Other references are given in REMARK 1. 883 * 884 * Record Format 885 * <pre> 886 * COLUMNS DATA TYPE FIELD DEFINITION 887 * ----------------------------------------------------------------------- 888 * 1 - 6 Record name "JRNL " 889 * 890 * 13 - 70 LString text See Details below. 
891 * </pre> 892 */ 893 private void pdb_JRNL_Handler(String line) { 894 //add the strings to the journalLines 895 //the actual JournalArticle is then built when the whole entry is being 896 //finalized with triggerEndFileChecks() 897 //JRNL TITL NMR SOLUTION STRUCTURE OF RECOMBINANT TICK 1TAP 10 898 if (line.substring(line.length() - 8, line.length() - 4).equals(pdbId)) { 899 //trim off the trailing PDB id from legacy files. 900 //are we really trying to still cater for these museum pieces? 901 902 logger.debug("trimming legacy PDB id from end of JRNL section line"); 903 904 line = line.substring(0, line.length() - 8); 905 journalLines.add(line); 906 } else { 907 journalLines.add(line); 908 } 909 } 910 911 /** 912 * This should not be accessed directly, other than by </code>makeCompounds</code>. It still deals with the same 913 * lines in a similar manner but if not accessed from </code>makeCompounds</code> the last element will be 914 * missing. Don't say I didn't warn you. 915 * 916 * @param line 917 */ 918 private void pdb_COMPND_Handler(String line) { 919 920 logger.debug("previousContinuationField is {}", previousContinuationField); 921 logger.debug("current continuationField is {}", continuationField); 922 logger.debug("current continuationString is {}", continuationString); 923 logger.debug("current compound is {}", current_compound); 924 925 926 // In legacy PDB files the line ends with the PDB code and a serial number, chop those off! 
927 //format version 3.0 onwards will have 80 characters in a line 928 // if (line.length() > 72) { 929 if (isLegacyFormat) { 930 // if (DEBUG) { 931 // System.out.println("We have a legacy file - truncating line length to 71 characters:"); 932 // System.out.println(line); 933 // } 934 line = line.substring(0, 72); 935 } 936 937 line = line.substring(10, line.length()); 938 939 940 String[] fieldList = line.trim().split("\\s+"); 941 int fl = fieldList.length; 942 if (fl > 0) { 943 String field0 = fieldList[0]; 944 if (compndFieldValues.contains(field0)) { 945 continuationField = field0; 946 if ("".equals(previousContinuationField)) { 947 previousContinuationField = continuationField; 948 } 949 } else if (field0.endsWith(";") && compndFieldValues.contains(field0.substring(0, field0.length()-1)) ) { 950 // the ':' character indicates the end of a field name and should be invalid as part the first data token 951 // e.g. obsolete file 1hhb has a malformed COMPND line that can only be caught with this kind of check 952 // UPDATE: There is no harm of having a ':' in the first data token. e.g. 3fdj contains a ':'. 953 // The intended case occurs only if the token is a key followed by a colon and a semicolon without spaces, e.g. "COMPND 2 MOLECULE:;" 954 logger.info("COMPND line does not follow the PDB 3.0 format. 
Note that COMPND parsing is not supported any longer in format 2.3 or earlier"); 955 return; 956 } 957 } else { 958 // the line will be added as data to the previous field 959 } 960 961 962 line = line.replace(continuationField, "").trim(); 963 964 StringTokenizer compndTokens = new StringTokenizer(line); 965 966 // System.out.println("PDBFileParser.pdb_COMPND_Handler: Tokenizing '" + line + "'"); 967 968 while (compndTokens.hasMoreTokens()) { 969 String token = compndTokens.nextToken(); 970 971 if ("".equals(previousContinuationField)) { 972 previousContinuationField = continuationField; 973 } 974 975 if (previousContinuationField.equals(continuationField) 976 && compndFieldValues.contains(continuationField)) { 977 978 logger.debug("Still in field {}", continuationField); 979 logger.debug("token = {}", token); 980 981 continuationString = continuationString.concat(token + " "); 982 983 logger.debug("continuationString = {}", continuationString); 984 985 } 986 if (!continuationField.equals(previousContinuationField)) { 987 988 if ("".equals(continuationString)) { 989 continuationString = token; 990 991 } else { 992 993 compndValueSetter(previousContinuationField, 994 continuationString); 995 previousContinuationField = continuationField; 996 continuationString = token + " "; 997 } 998 } else if (ignoreCompndFieldValues.contains(token)) { 999 // this field shall be ignored 1000 //continuationField = token; 1001 } 1002 } 1003 if (isLastCompndLine) { 1004 // final line in the section - finish off the compound 1005 // System.out.println("[pdb_COMPND_Handler] Final COMPND line - Finishing off final MolID header."); 1006 compndValueSetter(continuationField, continuationString); 1007 continuationString = ""; 1008 if (current_compound!=null) entities.add(current_compound); 1009 } 1010 } 1011 1012 /** 1013 * Set the value in the current molId object 1014 * @param field 1015 * @param value 1016 */ 1017 private void compndValueSetter(String field, String value) { 1018 1019 
value = value.trim().replace(";", ""); 1020 if ("MOL_ID:".equals(field)) { 1021 1022 int i = -1; 1023 try { 1024 i = Integer.valueOf(value); 1025 } catch (NumberFormatException e){ 1026 logger.warn("Value '{}' does not look like a number, while trying to parse COMPND MOL_ID line.",value); 1027 } 1028 if (i>0 && prevMolId!=i) { 1029 1030 if (current_compound!=null) entities.add(current_compound); 1031 1032 logger.debug("Initialising new Compound with mol_id {}", i); 1033 1034 current_compound = new EntityInfo(); 1035 1036 current_compound.setMolId(i); 1037 1038 // we will set polymer for all defined compounds in PDB file (non-polymer compounds are not defined in header) - JD 2016-03-25 1039 current_compound.setType(EntityType.POLYMER); 1040 1041 prevMolId = i; 1042 } 1043 1044 } 1045 1046 // if for some reason (e.g. missing mol_id line) the current_compound is null we can't add anything to it, return 1047 if (current_compound==null) { 1048 return; 1049 } 1050 1051 if ("MOLECULE:".equals(field)) { 1052 current_compound.setDescription(value); 1053 1054 } 1055 if ("CHAIN:".equals(field)) { 1056 //System.out.println(value); 1057 StringTokenizer chainTokens = new StringTokenizer(value, ","); 1058 List<String> chains = new ArrayList<>(); 1059 1060 while (chainTokens.hasMoreTokens()) { 1061 String chainID = chainTokens.nextToken().trim(); 1062 // NULL is used in old PDB files to represent empty chain DI 1063 if ("NULL".equals(chainID)) 1064 chainID = " "; 1065 chains.add(chainID); 1066 } 1067 compoundMolIds2chainIds.put(current_compound.getMolId(),chains); 1068 1069 } 1070 if ("SYNONYM:".equals(field)) { 1071 1072 StringTokenizer synonyms = new StringTokenizer(value, ","); 1073 List<String> names = new ArrayList<>(); 1074 1075 while (synonyms.hasMoreTokens()) { 1076 names.add(synonyms.nextToken()); 1077 1078 current_compound.setSynonyms(names); 1079 } 1080 1081 } 1082 1083 if ("EC:".equals(field)) { 1084 1085 StringTokenizer ecNumTokens = new StringTokenizer(value, ","); 
1086 List<String> ecNums = new ArrayList<>(); 1087 1088 while (ecNumTokens.hasMoreTokens()) { 1089 ecNums.add(ecNumTokens.nextToken()); 1090 1091 current_compound.setEcNums(ecNums); 1092 } 1093 1094 } 1095 if ("FRAGMENT:".equals(field)) { 1096 1097 current_compound.setFragment(value); 1098 1099 } 1100 if ("ENGINEERED:".equals(field)) { 1101 1102 current_compound.setEngineered(value); 1103 1104 } 1105 if ("MUTATION:".equals(field)) { 1106 1107 current_compound.setMutation(value); 1108 1109 } 1110 if ("BIOLOGICAL_UNIT:".equals(field)) { 1111 1112 current_compound.setBiologicalUnit(value); 1113 1114 } 1115 if ("OTHER_DETAILS:".equals(field)) { 1116 1117 current_compound.setDetails(value); 1118 1119 } 1120 1121 } 1122 1123 1124 /** 1125 * Handler for 1126 * SOURCE Record format 1127 * 1128 * The SOURCE record specifies the biological and/or chemical source of each biological molecule in the entry. Sources are described by both the common name and the scientific name, e.g., genus and species. Strain and/or cell-line for immortalized cells are given when they help to uniquely identify the biological entity studied. 1129 * Record Format 1130 * <pre> 1131 * COLUMNS DATA TYPE FIELD DEFINITION 1132 * ------------------------------------------------------------------------------- 1133 * 1 - 6 Record name "SOURCE" 1134 * 9 - 10 Continuation continuation Allows concatenation of multiple records. 1135 * 11 - 70 Specification srcName Identifies the source of the macromolecule in 1136 * list a token: value format. 1137 * </pre> 1138 * @param line the line to be parsed 1139 */ 1140 private void pdb_SOURCE_Handler(String line) { 1141 // works in the same way as the pdb_COMPND_Handler. 
1142 String continuationNr = line.substring(9, 10).trim(); 1143 1144 1145 1146 logger.debug("current continuationNo is {}", continuationNr); 1147 logger.debug("previousContinuationField is {}", previousContinuationField); 1148 logger.debug("current continuationField is {}", continuationField); 1149 logger.debug("current continuationString is {}", continuationString); 1150 logger.debug("current compound is {}", current_compound); 1151 1152 1153 // following the docs, the last valid character should be 79, chop off the rest 1154 if (line.length() > 79) { 1155 line = line.substring(0, 79); 1156 } 1157 1158 line = line.substring(10, line.length()); 1159 1160 logger.debug("LINE: >{}<", line); 1161 1162 String[] fieldList = line.split("\\s+"); 1163 1164 if (!"".equals(fieldList[0]) 1165 && sourceFieldValues.contains(fieldList[0])) { 1166 // System.out.println("[PDBFileParser.pdb_COMPND_Handler] Setting continuationField to '" + fieldList[0] + "'"); 1167 continuationField = fieldList[0]; 1168 if ("".equals(previousContinuationField)) { 1169 previousContinuationField = continuationField; 1170 } 1171 1172 } else if ((fieldList.length > 1) && ( sourceFieldValues.contains(fieldList[1]))) { 1173 // System.out.println("[PDBFileParser.pdb_COMPND_Handler] Setting continuationField to '" + fieldList[1] + "'"); 1174 continuationField = fieldList[1]; 1175 if ("".equals(previousContinuationField)) { 1176 previousContinuationField = continuationField; 1177 } 1178 1179 } else { 1180 if ("".equals(continuationNr)) { 1181 1182 logger.debug("looks like an old PDB file"); 1183 1184 continuationField = "MOLECULE:"; 1185 if ("".equals(previousContinuationField)) { 1186 previousContinuationField = continuationField; 1187 } 1188 } 1189 1190 } 1191 1192 line = line.replace(continuationField, "").trim(); 1193 1194 StringTokenizer compndTokens = new StringTokenizer(line); 1195 1196 // System.out.println("PDBFileParser.pdb_COMPND_Handler: Tokenizing '" + line + "'"); 1197 1198 while 
(compndTokens.hasMoreTokens()) { 1199 String token = compndTokens.nextToken(); 1200 1201 if ("".equals(previousContinuationField)) { 1202 // System.out.println("previousContinuationField is empty. Setting to : " + continuationField); 1203 previousContinuationField = continuationField; 1204 } 1205 1206 if (previousContinuationField.equals(continuationField) 1207 && sourceFieldValues.contains(continuationField)) { 1208 1209 logger.debug("Still in field {}", continuationField); 1210 1211 continuationString = continuationString.concat(token + " "); 1212 1213 logger.debug("continuationString = {}", continuationString); 1214 } 1215 if (!continuationField.equals(previousContinuationField)) { 1216 1217 if ("".equals(continuationString)) { 1218 continuationString = token; 1219 1220 } else { 1221 1222 sourceValueSetter(previousContinuationField, 1223 continuationString); 1224 previousContinuationField = continuationField; 1225 continuationString = token + " "; 1226 } 1227 } else if (ignoreCompndFieldValues.contains(token)) { 1228 // this field shall be ignored 1229 //continuationField = token; 1230 } 1231 } 1232 if (isLastSourceLine) { 1233 // final line in the section - finish off the compound 1234 // System.out.println("[pdb_SOURCE_Handler] Final SOURCE line - Finishing off final MolID header."); 1235 sourceValueSetter(continuationField, continuationString); 1236 continuationString = ""; 1237 //compounds.add(current_compound); 1238 } 1239 1240 } 1241 1242 1243 /** 1244 * Set the value in the current molId object 1245 * 1246 * @param field 1247 * @param value 1248 */ 1249 private void sourceValueSetter(String field, String value) { 1250 1251 value = value.trim().replace(";", ""); 1252 // System.out.println("[sourceValueSetter] " + field); 1253 if ("MOL_ID:".equals(field)) { 1254 1255 try { 1256 current_compound = entities.get(Integer.valueOf(value) - 1); 1257 } catch (NumberFormatException e){ 1258 logger.info("could not process SOURCE MOL_ID record correctly:" + 
e.getMessage()); 1259 return; 1260 } 1261 1262 1263 // System.out.println("[sourceValueSetter] Fetching compound " + value + " " + current_compound.getMolId()); 1264 1265 } 1266 if ("SYNTHETIC:".equals(field)) { 1267 current_compound.setSynthetic(value); 1268 } else if ("FRAGMENT:".equals(field)) { 1269 current_compound.setFragment(value); 1270 } else if ("ORGANISM_SCIENTIFIC:".equals(field)) { 1271 current_compound.setOrganismScientific(value); 1272 } else if ("ORGANISM_TAXID:".equals(field)) { 1273 current_compound.setOrganismTaxId(value); 1274 } else if ("ORGANISM_COMMON:".equals(field)) { 1275 current_compound.setOrganismCommon(value); 1276 } else if ("STRAIN:".equals(field)) { 1277 current_compound.setStrain(value); 1278 } else if ("VARIANT:".equals(field)) { 1279 current_compound.setVariant(value); 1280 } else if ("CELL_LINE:".equals(field)) { 1281 current_compound.setCellLine(value); 1282 } else if ("ATCC:".equals(field)) { 1283 current_compound.setAtcc(value); 1284 } else if ("ORGAN:".equals(field)) { 1285 current_compound.setOrgan(value); 1286 } else if ("TISSUE:".equals(field)) { 1287 current_compound.setTissue(value); 1288 } else if ("CELL:".equals(field)) { 1289 current_compound.setCell(value); 1290 } else if ("ORGANELLE:".equals(field)) { 1291 current_compound.setOrganelle(value); 1292 } else if ("SECRETION:".equals(field)) { 1293 current_compound.setSecretion(value); 1294 } else if ("GENE:".equals(field)) { 1295 current_compound.setGene(value); 1296 } else if ("CELLULAR_LOCATION:".equals(field)) { 1297 current_compound.setCellularLocation(value); 1298 } else if ("EXPRESSION_SYSTEM:".equals(field)) { 1299 current_compound.setExpressionSystem(value); 1300 } else if ("EXPRESSION_SYSTEM_TAXID:".equals(field)) { 1301 current_compound.setExpressionSystemTaxId(value); 1302 } else if ("EXPRESSION_SYSTEM_STRAIN:".equals(field)) { 1303 current_compound.setExpressionSystemStrain(value); 1304 } else if ("EXPRESSION_SYSTEM_VARIANT:".equals(field)) { 1305 
current_compound.setExpressionSystemVariant(value); 1306 } else if ("EXPRESSION_SYSTEM_CELL_LINE:".equals(field)) { 1307 current_compound.setExpressionSystemCellLine(value); 1308 } else if ("EXPRESSION_SYSTEM_ATCC_NUMBER:".equals(field)) { 1309 current_compound.setExpressionSystemAtccNumber(value); 1310 } else if ("EXPRESSION_SYSTEM_ORGAN:".equals(field)) { 1311 current_compound.setExpressionSystemOrgan(value); 1312 } else if ("EXPRESSION_SYSTEM_TISSUE:".equals(field)) { 1313 current_compound.setExpressionSystemTissue(value); 1314 } else if ("EXPRESSION_SYSTEM_CELL:".equals(field)) { 1315 current_compound.setExpressionSystemCell(value); 1316 } else if ("EXPRESSION_SYSTEM_ORGANELLE:".equals(field)) { 1317 current_compound.setExpressionSystemOrganelle(value); 1318 } else if ("EXPRESSION_SYSTEM_CELLULAR_LOCATION:".equals(field)) { 1319 current_compound.setExpressionSystemCellularLocation(value); 1320 } else if ("EXPRESSION_SYSTEM_VECTOR_TYPE:".equals(field)) { 1321 current_compound.setExpressionSystemVectorType(value); 1322 } else if ("EXPRESSION_SYSTEM_VECTOR:".equals(field)) { 1323 current_compound.setExpressionSystemVector(value); 1324 } else if ("EXPRESSION_SYSTEM_PLASMID:".equals(field)) { 1325 current_compound.setExpressionSystemPlasmid(value); 1326 } else if ("EXPRESSION_SYSTEM_GENE:".equals(field)) { 1327 current_compound.setExpressionSystemGene(value); 1328 } else if ("OTHER_DETAILS:".equals(field)) { 1329 current_compound.setExpressionSystemOtherDetails(value); 1330 } 1331 1332 } 1333 1334 /** 1335 * Handler for REMARK lines 1336 */ 1337 private void pdb_REMARK_Handler(String line) { 1338 1339 if ( line == null || line.length() < 11) 1340 return; 1341 1342 1343 if (line.startsWith("REMARK 800")) { 1344 pdb_REMARK_800_Handler(line); 1345 1346 } else if ( line.startsWith("REMARK 350")){ 1347 1348 if ( params.isParseBioAssembly()) { 1349 1350 if (bioAssemblyParser == null){ 1351 bioAssemblyParser = new PDBBioAssemblyParser(); 1352 } 1353 1354 
bioAssemblyParser.pdb_REMARK_350_Handler(line); 1355 } 1356 } else if (line.startsWith("REMARK 2")) { 1357 //REMARK 2 RESOLUTION. 1358 Pattern pR = Pattern.compile("^REMARK 2 RESOLUTION.\\s+(\\d+\\.\\d+)\\s+ANGSTROMS\\..*"); 1359 handleResolutionLine(line, pR); 1360 1361 // REMARK 3 (for R free) 1362 // note: if more than 1 value present (occurring in hybrid experimental technique entries, e.g. 3ins, 4n9m) 1363 // then last one encountered will be taken 1364 } else if (line.startsWith("REMARK 3 FREE R VALUE")) { 1365 1366 // Rfree annotation is not very consistent in PDB format, it varies depending on the software 1367 // Here we follow this strategy: 1368 // a) take the '(NO CUTOFF)' value if the only one available (shelx software, e.g. 1x7q) 1369 // b) don't take it if also a line without '(NO CUTOFF)' is present (CNX software, e.g. 3lak) 1370 1371 Pattern pR = Pattern.compile("^REMARK 3 FREE R VALUE\\s+(?:\\(NO CUTOFF\\))?\\s+:\\s+(\\d?\\.\\d+).*"); 1372 Matcher mR = pR.matcher(line); 1373 if (mR.matches()) { 1374 try { 1375 rfreeNoCutoffLine = Float.parseFloat(mR.group(1)); 1376 } catch (NumberFormatException e) { 1377 logger.info("Rfree value "+mR.group(1)+" does not look like a number, will ignore it"); 1378 } 1379 } 1380 pR = Pattern.compile("^REMARK 3 FREE R VALUE\\s+:\\s+(\\d?\\.\\d+).*"); 1381 mR = pR.matcher(line); 1382 if (mR.matches()) { 1383 try { 1384 rfreeStandardLine = Float.parseFloat(mR.group(1)); 1385 } catch (NumberFormatException e) { 1386 logger.info("Rfree value '{}' does not look like a number, will ignore it", mR.group(1)); 1387 } 1388 } 1389 1390 // REMARK 3 RESOLUTION (contains more info than REMARK 2, for instance multiple resolutions in hybrid experimental technique entries) 1391 // note: if more than 1 value present (occurring in hybrid experimental technique entries, e.g. 
3ins, 4n9m) 1392 // then last one encountered will be taken 1393 } else if (line.startsWith("REMARK 3 RESOLUTION RANGE HIGH")){ 1394 Pattern pR = Pattern.compile("^REMARK 3 RESOLUTION RANGE HIGH \\(ANGSTROMS\\) :\\s+(\\d+\\.\\d+).*"); 1395 handleResolutionLine(line, pR); 1396 } else if (line.startsWith("REMARK 3 EFFECTIVE RESOLUTION")){ 1397 Pattern pR = Pattern.compile("^REMARK 3 EFFECTIVE RESOLUTION \\(ANGSTROMS\\)\\s+:\\s+(\\d+\\.\\d+).*"); 1398 handleResolutionLine(line, pR); 1399 } 1400 } 1401 1402 public void handleResolutionLine(String line, Pattern pR) { 1403 Matcher mR = pR.matcher(line); 1404 if (mR.matches()) { 1405 final String resString = mR.group(1); 1406 try { 1407 float res = Float.parseFloat(resString); 1408 final float resInHeader = pdbHeader.getResolution(); 1409 if (resInHeader!=PDBHeader.DEFAULT_RESOLUTION && resInHeader != res) { 1410 logger.warn("More than 1 resolution value present, will use last one {} and discard previous {} " 1411 ,resString, String.format("%4.2f",resInHeader)); 1412 } 1413 pdbHeader.setResolution(res); 1414 } catch (NumberFormatException e) { 1415 logger.info("Could not parse resolution '{}', ignoring it",resString); 1416 } 1417 } 1418 } 1419 1420 1421 1422 1423 1424 1425 /** 1426 * Handler for 1427 * EXPDTA Record Format 1428 <pre> 1429 COLUMNS DATA TYPE FIELD DEFINITION 1430 ------------------------------------------------------------------------------- 1431 1 - 6 Record name "EXPDTA" 1432 9 - 10 Continuation continuation Allows concatenation of multiple 1433 records. 1434 11 - 70 SList technique The experimental technique(s) with 1435 optional comment describing the 1436 sample or experiment. 
1437 1438 allowed techniques are: 1439 ELECTRON DIFFRACTION 1440 FIBER DIFFRACTION 1441 FLUORESCENCE TRANSFER 1442 NEUTRON DIFFRACTION 1443 NMR 1444 THEORETICAL MODEL 1445 X-RAY DIFFRACTION 1446 </pre> 1447 */ 1448 private void pdb_EXPDTA_Handler(String line) { 1449 1450 String technique ; 1451 if (line.length() > 69) 1452 technique = line.substring (10, 70).trim() ; 1453 else 1454 technique = line.substring(10).trim(); 1455 1456 for (String singleTechnique: technique.split(";\\s+")) { 1457 pdbHeader.setExperimentalTechnique(singleTechnique); 1458 } 1459 1460 1461 } 1462 1463 /** 1464 * Handler for 1465 * CRYST1 Record Format 1466 * The CRYST1 record presents the unit cell parameters, space group, and Z value. 1467 * If the entry describes a structure determined by a technique other than X-ray crystallography, 1468 * CRYST1 contains a = b = c = 1.0, alpha = beta = gamma = 90 degrees, space group = P 1, and Z =1. 1469 * <pre> 1470 * COLUMNS DATA TYPE FIELD DEFINITION 1471 * ------------------------------------------------------------- 1472 * 1 - 6 Record name "CRYST1" 1473 * 7 - 15 Real(9.3) a a (Angstroms). 1474 * 16 - 24 Real(9.3) b b (Angstroms). 1475 * 25 - 33 Real(9.3) c c (Angstroms). 1476 * 34 - 40 Real(7.2) alpha alpha (degrees). 1477 * 41 - 47 Real(7.2) beta beta (degrees). 1478 * 48 - 54 Real(7.2) gamma gamma (degrees). 1479 * 56 - 66 LString sGroup Space group. 1480 * 67 - 70 Integer z Z value. 1481 * </pre> 1482 */ 1483 private void pdb_CRYST1_Handler(String line) { 1484 // for badly formatted files (e.g. phenix-produced ones), there's no z and the min length is 58 (e.g. 
for SG 'P 1') 1485 if (line.length() < 58) { 1486 logger.warn("CRYST1 record has fewer than 58 columns: will ignore it"); 1487 return; 1488 } 1489 1490 float a; 1491 float b; 1492 float c; 1493 float alpha; 1494 float beta; 1495 float gamma; 1496 String spaceGroup = ""; 1497 1498 try { 1499 a = Float.parseFloat(line.substring(6,15).trim()); 1500 b = Float.parseFloat(line.substring(15,24).trim()); 1501 c = Float.parseFloat(line.substring(24,33).trim()); 1502 alpha = Float.parseFloat(line.substring(33,40).trim()); 1503 beta = Float.parseFloat(line.substring(40,47).trim()); 1504 gamma = Float.parseFloat(line.substring(47,54).trim()); 1505 } catch (NumberFormatException e) { 1506 logger.info("could not parse CRYST1 record ("+e.getMessage()+") from line and ignoring it " + line); 1507 return ; 1508 } 1509 if (line.length()>=66) { 1510 // for well formatted files 1511 spaceGroup = line.substring(55,66).trim(); 1512 } else { 1513 // for not-so-well formatted files, e.g. phenix-produced ones: they lack a Z value 1514 spaceGroup = line.substring(55,line.length()).trim(); 1515 } 1516 1517 CrystalCell xtalCell = new CrystalCell(); 1518 xtalCell.setA(a); 1519 xtalCell.setB(b); 1520 xtalCell.setC(c); 1521 xtalCell.setAlpha(alpha); 1522 xtalCell.setBeta(beta); 1523 xtalCell.setGamma(gamma); 1524 1525 if (!xtalCell.isCellReasonable()) { 1526 // If the entry describes a structure determined by a technique other than X-ray crystallography, 1527 // CRYST1 contains a = b = c = 1.0, alpha = beta = gamma = 90 degrees, space group = P 1, and Z =1. 
1528 // if so we don't add the crystal cell and it remains null 1529 logger.debug("The crystal cell read from file does not have reasonable dimensions (at least one dimension is below {}), discarding it.", 1530 CrystalCell.MIN_VALID_CELL_SIZE); 1531 } else { 1532 crystallographicInfo.setCrystalCell(xtalCell); 1533 } 1534 1535 SpaceGroup sg = SymoplibParser.getSpaceGroup(spaceGroup); 1536 if (sg==null) { 1537 logger.warn("Space group '"+spaceGroup+"' not recognised as a standard space group"); 1538 crystallographicInfo.setNonStandardSg(true); 1539 } else { 1540 crystallographicInfo.setSpaceGroup(sg); 1541 crystallographicInfo.setNonStandardSg(false); 1542 } 1543 } 1544 1545 /** 1546 * Handler for MTRIXn records. They specify extra NCS operators (usually in virus entries) 1547 * 1548 * See http://www.wwpdb.org/documentation/format33/sect8.html#MTRIXn 1549 * <pre> 1550 * COLUMNS DATA TYPE FIELD DEFINITION 1551 * ------------------------------------------------------------- 1552 * 1553 * 1 - 6 Record name "MTRIXn" n=1, 2, or 3 1554 * 8 - 10 Integer serial Serial number. 
1555 * 11 - 20 Real(10.6) m[n][1] Mn1 1556 * 21 - 30 Real(10.6) m[n][2] Mn2 1557 * 31 - 40 Real(10.6) m[n][3] Mn3 1558 * 46 - 55 Real(10.5) v[n] Vn 1559 * 60 Integer iGiven 1 1560 * 1561 * </pre> 1562 * Note that we ignore operators with iGiven==1 1563 * 1564 * @param line 1565 */ 1566 private void pdb_MTRIXn_Handler(String line) { 1567 1568 // don't process incomplete records 1569 if (line.length() < 55) { 1570 logger.info("MTRIXn record has fewer than 55 columns: will ignore it"); 1571 return; 1572 } 1573 1574 1575 try { 1576 1577 int rowIndex = Integer.parseInt(line.substring(5,6)); 1578 double col1Value = Double.parseDouble(line.substring(10,20)); 1579 double col2Value = Double.parseDouble(line.substring(20,30)); 1580 double col3Value = Double.parseDouble(line.substring(30,40)); 1581 double translValue = Double.parseDouble(line.substring(45,55)); 1582 int iGiven = 0; 1583 if (line.length()>=60 && !line.substring(59,60).trim().isEmpty()) { 1584 iGiven = Integer.parseInt(line.substring(59,60)); 1585 } 1586 1587 if (iGiven == 1) return; 1588 1589 if (ncsOperators==null) { 1590 // we initialise on first pass 1591 ncsOperators = new ArrayList<Matrix4d>(); 1592 } 1593 1594 if (currentNcsOp==null) { 1595 currentNcsOp = new Matrix4d(1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1); // initialised to identity 1596 } 1597 1598 currentNcsOp.setElement(rowIndex-1, 0, col1Value); 1599 currentNcsOp.setElement(rowIndex-1, 1, col2Value); 1600 currentNcsOp.setElement(rowIndex-1, 2, col3Value); 1601 currentNcsOp.setElement(rowIndex-1, 3, translValue); 1602 1603 1604 if (rowIndex==3) { 1605 ncsOperators.add(currentNcsOp); 1606 // we initialise for next matrix to come 1607 currentNcsOp = new Matrix4d(1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1); // initialised to identity 1608 } 1609 1610 } catch (NumberFormatException e) { 1611 logger.info("Could not parse a number in MTRIXn record ("+e.getMessage()+") from line: >" + line+"<"); 1612 } 1613 } 1614 1615 /** 1616 * Handler for ATOM. 
1617 * Record Format: 1618 * 1619 * <pre> 1620 * ATOM 1 N ASP A 15 110.964 24.941 59.191 1.00 83.44 N 1621 * 1622 * COLUMNS DATA TYPE FIELD DEFINITION 1623 * --------------------------------------------------------------------------------- 1624 * 1 - 6 Record name "ATOM " 1625 * 7 - 11 Integer serial Atom serial number. 1626 * 13 - 16 Atom name Atom name. 1627 * 17 Character altLoc Alternate location indicator. 1628 * 18 - 20 Residue name resName Residue name. 1629 * 22 Character chainID Chain identifier. 1630 * 23 - 26 Integer resSeq Residue sequence number. 1631 * 27 AChar iCode Code for insertion of residues. 1632 * 31 - 38 Real(8.3) x Orthogonal coordinates for X in Angstroms. 1633 * 39 - 46 Real(8.3) y Orthogonal coordinates for Y in Angstroms. 1634 * 47 - 54 Real(8.3) z Orthogonal coordinates for Z in Angstroms. 1635 * 55 - 60 Real(6.2) occupancy Occupancy. 1636 * 61 - 66 Real(6.2) tempFactor Temperature factor. 1637 * 73 - 76 LString(4) segID Segment identifier, left-justified. 1638 * 77 - 78 LString(2) element Element symbol, right-justified. 1639 * 79 - 80 LString(2) charge Charge on the atom. 
* </pre>
     * <p>
     * Lines whose record name is HETATM are handled here as well, the resulting group is
     * then flagged with <code>setHetAtomInFile(true)</code>.
     * <p>
     * The handler keeps track of the chain, model and group being built through the
     * <code>currentChain</code>, <code>currentModel</code> and <code>currentGroup</code> fields:
     * a change of chain name finishes the current chain, a change of residue number finishes
     * the current group. Nothing is done if the parsing parameters ask for the header only.
     *
     * @param line the ATOM or HETATM line to be parsed
     */
    private void pdb_ATOM_Handler(String line) {

        if ( params.isHeaderOnly())
            return;

        // let's first get the chain name which will serve to identify if we are starting a new molecule
        String chainName = line.substring(21,22);

        if (" ".equals(chainName)) {
            blankChainIdsPresent = true;
        }

        if (currentChain!=null && !currentChain.getName().equals(chainName)) {
            // new chain name: another molecule coming
            startOfMolecule = true;
        }

        if (startOfMolecule) {
            // we add last chain if there was one
            if (currentChain!=null) {
                currentModel.add(currentChain);
                // let's not forget adding the last group to the finishing chain
                if (currentGroup!=null) {
                    currentChain.addGroup(currentGroup);
                }
            }
            // we initialise the new molecule to come
            currentChain = new ChainImpl();
            // note that the chainId (asym id) is set properly later in assignAsymIds
            currentChain.setId(chainName);
            currentChain.setName(chainName);

        }

        // NOTE: this has to come after the block above, which still adds the finishing
        // chain to the model that is being closed here
        if (startOfModel) {
            // we add last model if there was one
            if (currentModel!=null) {
                allModels.add(currentModel);
            }
            // we initialise the model to come
            currentModel = new ArrayList<>();
        }


        // let's get the residue number and see if we need to start a new group

        String groupCode3 = line.substring(17,20).trim();
        String resNum = line.substring(22,26).trim();
        Character iCode = line.substring(26,27).charAt(0);
        // a blank insertion code means that there is none
        if ( iCode == ' ')
            iCode = null;
        ResidueNumber residueNumber = new ResidueNumber(chainName, Integer.valueOf(resNum), iCode);

        // Fields read so far, on two example lines (the second one has the insertion code 'A'):
        //
        //recordName       groupCode3
        //|                |   chainName
        //|                |   |resNum
        //|                |   ||   iCode
        //|                |   ||   |
        //ATOM      1  N   ASP A  15     110.964  24.941  59.191  1.00 83.44           N
        //ATOM   1964  N   ARG H 221A      5.963 -16.715  27.669  1.00 28.59           N

        Character aminoCode1 = StructureTools.get1LetterCode(groupCode3);

        String recordName = line.substring (0, 6).trim ();

        boolean isHetAtomInFile = false;

        if ("HETATM".equals(recordName) ){
            // HETATOM RECORDS are treated slightly differently
            // some modified amino acids that we want to treat as amino acids
            // can be found as HETATOM records
            if ( aminoCode1 != null && aminoCode1.equals(StructureTools.UNKNOWN_GROUP_LABEL))
                aminoCode1 = null;

            isHetAtomInFile = true;
        }

        if ( startOfMolecule) {

            // first group of the new chain
            currentGroup = getNewGroup(recordName, aminoCode1, groupCode3);

            currentGroup.setPDBName(groupCode3);
            currentGroup.setResidueNumber(residueNumber);
            currentGroup.setHetAtomInFile(isHetAtomInFile);

        }

        // resetting states
        startOfModel = false;
        startOfMolecule = false;


        Character altLoc = line.substring (16, 17).charAt(0);
        // stays null unless the atom belongs to an alternate location of the current group
        Group altGroup = null;


        // check if residue number is the same ...
        if ( ! residueNumber.equals(currentGroup.getResidueNumber())) {

            // a new residue starts: the finished group goes to the chain and a new one is created
            currentChain.addGroup(currentGroup);
            currentGroup.trimToSize();

            currentGroup = getNewGroup(recordName, aminoCode1, groupCode3);

            currentGroup.setPDBName(groupCode3);
            currentGroup.setResidueNumber(residueNumber);
            currentGroup.setHetAtomInFile(isHetAtomInFile);

        } else {
            // same residueNumber, but altLocs...

            // test altLoc
            if ( ! altLoc.equals(' ')) {
                // NOTE(review): altGroup is always null at this point, so the message only shows the current group
                logger.debug("found altLoc! " + currentGroup + " " + altGroup);
                altGroup = getCorrectAltLocGroup( altLoc,recordName,aminoCode1,groupCode3);
                if ( altGroup.getChain() == null) {
                    // need to set current chain
                    altGroup.setChain(currentChain);
                }

            }
        }

        // the count is taken back further down if the atom turns out to be filtered out
        atomCount++;

        if ( atomCount == atomCAThreshold ) {
            // throw away the SEQRES lines - too much to deal with...
            logger.warn("more than " + atomCAThreshold + " atoms in this structure, ignoring the SEQRES lines");
            seqResChains.clear();

            switchCAOnly();

        }



        // the warning is logged only once, when the limit is reached
        if ( atomCount == loadMaxAtoms){
            logger.warn("File has more atoms than max specified in parsing parameters ({}). Ignoring atoms after line: {}", loadMaxAtoms, line);
            return;
        }
        if ( atomCount > loadMaxAtoms){
            return;
        }


        // Examples of lines that have to be dealt with, some of them lack the last fields
        // (the column spacing is not preserved in these examples):
        //ATOM 1 N MET 1 20.154 29.699 5.276 1.0
        //ATOM 112 CA ASP 112 41.017 33.527 28.371 1.00 0.00
        //ATOM 53 CA MET 7 23.772 33.989 -21.600 1.00 0.00 C
        //ATOM 112 CA ASP 112 37.613 26.621 33.571 0 0


        // the atom name with its blanks, as it is in the file
        String fullname = line.substring (12, 16);

        // check for CA only if requested
        if ( parseCAonly ){
            // yes , user wants to get CA only
            // only parse CA atoms...
            if (! " CA ".equals(fullname)){
                atomCount--;
                return;
            }
        }

        // only the atoms with one of the accepted names are kept, if such names were given
        if ( params.getAcceptedAtomNames() != null) {

            boolean found = false;
            for (String ok : params.getAcceptedAtomNames()){

                if ( ok.equals(fullname.trim())) {
                    found = true;
                    break;
                }
            }
            if ( ! found) {
                atomCount--;
                return;
            }
        }
        // create new atom

        int pdbnumber = Integer.parseInt (line.substring (6, 11).trim ());
        AtomImpl atom = new AtomImpl() ;
        atom.setPDBserial(pdbnumber) ;

        atom.setAltLoc(altLoc);
        atom.setName(fullname.trim());

        double x = Double.parseDouble (line.substring (30, 38).trim());
        double y = Double.parseDouble (line.substring (38, 46).trim());
        double z = Double.parseDouble (line.substring (46, 54).trim());

        double[] coords = new double[3];
        coords[0] = x ;
        coords[1] = y ;
        coords[2] = z ;
        atom.setCoords(coords);

        // occupancy and temperature factor keep their defaults (1.0 and 0.0)
        // if the columns are missing or can't be parsed
        float occu = 1.0f;
        if ( line.length() > 59 ) {
            try {
                // occu and tempf are sometimes not used :-/
                occu = Float.parseFloat (line.substring (54, 60).trim());
            } catch (NumberFormatException e){}
        }

        float tempf = 0.0f;
        if ( line.length() > 65) {
            try {
                tempf = Float.parseFloat (line.substring (60, 66).trim());
            } catch (NumberFormatException e){}
        }

        atom.setOccupancy( occu );
        atom.setTempFactor( tempf );




        // Parse element from the element field. If this field is
        // missing (i.e. misformatted PDB file), then parse the
        // element from the chemical component.
        Element element = Element.R;
        boolean guessElement = true;
        if ( line.length() > 77 ) {
            // parse element from element field
            String elementSymbol = line.substring(76, 78).trim();
            if (elementSymbol.isEmpty()) {
                logger.info("Element column was empty for atom {} {}. Assigning atom element "
                        + "from Chemical Component Dictionary information", fullname.trim(), pdbnumber);
            } else {

                try {
                    element = Element.valueOfIgnoreCase(elementSymbol);
                    guessElement = false;
                } catch (IllegalArgumentException e){
                    logger.info("Element {} of atom {} {} was not recognised. Assigning atom element "
                            + "from Chemical Component Dictionary information", elementSymbol,
                            fullname.trim(), pdbnumber);
                }
            }
        } else {
            logger.info("Missformatted PDB file: element column of atom {} {} is not present. "
                    + "Assigning atom element from Chemical Component Dictionary information",
                    fullname.trim(), pdbnumber);
        }
        if (guessElement) {
            // look for an atom with the same name in the chemical component of the current group
            String elementSymbol = null;
            if (currentGroup.getChemComp() != null) {
                for (ChemCompAtom a : currentGroup.getChemComp().getAtoms()) {
                    if (a.getAtomId().equals(fullname.trim())) {
                        elementSymbol = a.getTypeSymbol();
                        break;
                    }
                }
                if (elementSymbol == null) {
                    logger.info("Atom name {} was not found in the Chemical Component Dictionary information of {}. "
                            + "Assigning generic element R to it", fullname.trim(), currentGroup.getPDBName());
                } else {
                    try {
                        element = Element.valueOfIgnoreCase(elementSymbol);
                    } catch (IllegalArgumentException e) {
                        // this can still happen for cases like UNK
                        logger.info("Element symbol {} found in chemical component dictionary for Atom {} {} could not be recognised as a known element. "
                                + "Assigning generic element R to it", elementSymbol, fullname.trim(), pdbnumber);
                    }
                }
            } else {
                logger.warn("Chemical Component Dictionary information was not found for Atom name {}. "
                        + "Assigning generic element R to it", fullname.trim());
            }

        }
        atom.setElement(element);


        // the atom goes to the alternate location group if one was found above,
        // to the current group otherwise
        if ( altGroup != null) {
            altGroup.addAtom(atom);
            altGroup = null;
        }
        else {
            currentGroup.addAtom(atom);
        }


        // make sure that main group has all atoms
        // GitHub issue: #76
        if ( ! currentGroup.hasAtom(atom.getName())) {
            currentGroup.addAtom(atom);
        }



    }


    /**
     * Finds the group that the atoms of the given alternate location of the current
     * residue have to be added to, creating that group if there is none yet.
     * <p>
     * The current group itself is returned if its first atom has the given alternate
     * location, or if it has the same name as <code>groupCode3</code> and no atoms yet.
     * Otherwise the alternate location groups of the current group are searched for an atom
     * with the given alternate location. If none is found a new group is added to the
     * alternate locations of the current group: an atom-less clone of the current group if
     * the names are the same, a group created with <code>getNewGroup</code> if not.
     *
     * @param altLoc the alternate location indicator of the atom being parsed
     * @param recordName the record name of the line being parsed, passed on to <code>getNewGroup</code>
     * @param aminoCode1 the one letter code of the group, passed on to <code>getNewGroup</code>
     * @param groupCode3 the name of the group as read from the line being parsed
     * @return the group to add the atom to
     */
    private Group getCorrectAltLocGroup( Character altLoc,
            String recordName, Character aminoCode1, String groupCode3) {

        // see if we know this altLoc already;
        List<Atom> atoms = currentGroup.getAtoms();
        if ( atoms.size() > 0) {
            Atom a1 = atoms.get(0);
            // we are just adding atoms to the current group
            // probably there is a second group following later...
            if (a1.getAltLoc().equals(altLoc)) {

                return currentGroup;
            }
        }

        List<Group> altLocs = currentGroup.getAltLocs();
        for ( Group altLocG : altLocs ){
            atoms = altLocG.getAtoms();
            if ( atoms.size() > 0) {
                for ( Atom a1 : atoms) {
                    if (a1.getAltLoc().equals( altLoc)) {

                        return altLocG;
                    }
                }
            }
        }

        // no matching altLoc group found.
        // build it up.

        if ( groupCode3.equals(currentGroup.getPDBName())) {
            if ( currentGroup.getAtoms().size() == 0) {
                // the current group is still empty, it can be used as it is
                return currentGroup;
            }
            Group altLocG = (Group) currentGroup.clone();
            // drop atoms from cloned group...
            // https://redmine.open-bio.org/issues/3307
            altLocG.setAtoms(new ArrayList<Atom>());
            altLocG.getAltLocs().clear();
            currentGroup.addAltLoc(altLocG);
            return altLocG;
        }

        // the alternate location is a group with a different name: create a new group for it
        Group altLocG = getNewGroup(recordName,aminoCode1,groupCode3);


        altLocG.setPDBName(groupCode3);

        altLocG.setResidueNumber(currentGroup.getResidueNumber());
        currentGroup.addAltLoc(altLocG);
        return altLocG;
    }

    /**
     * Switches the parser to the C-alpha only mode.
     * <p>
     * Sets <code>parseCAonly</code> and replaces what has been parsed so far (the current
     * model, all the models of <code>structure</code> and the current chain) with the result
     * of <code>CAConverter.getRepresentativeAtomsOnly</code>.
     */
    private void switchCAOnly(){
        parseCAonly = true;


        currentModel = CAConverter.getRepresentativeAtomsOnly(currentModel);

        for ( int i =0; i< structure.nrModels() ; i++){
            // iterate over all known models ...
            List<Chain> model = structure.getModel(i);
            model = CAConverter.getRepresentativeAtomsOnly(model);
            structure.setModel(i,model);
        }

        currentChain = CAConverter.getRepresentativeAtomsOnly(currentChain);

    }


    /** safes repeating a few lines ... */
    private Integer conect_helper (String line,int start,int end) {
        if (line.length() < end) return null;

        String sbond = line.substring(start,end).trim();
        int bond = -1 ;
        Integer b = null ;

        if ( !
"".equals(sbond)) { 2025 bond = Integer.parseInt(sbond); 2026 b = bond; 2027 } 2028 2029 return b ; 2030 } 2031 2032 /** 2033 * Handler for CONECT Record Format 2034 <pre> 2035 COLUMNS DATA TYPE FIELD DEFINITION 2036 --------------------------------------------------------------------------------- 2037 1 - 6 Record name "CONECT" 2038 7 - 11 Integer serial Atom serial number 2039 12 - 16 Integer serial Serial number of bonded atom 2040 17 - 21 Integer serial Serial number of bonded atom 2041 22 - 26 Integer serial Serial number of bonded atom 2042 27 - 31 Integer serial Serial number of bonded atom 2043 32 - 36 Integer serial Serial number of hydrogen bonded 2044 atom 2045 37 - 41 Integer serial Serial number of hydrogen bonded 2046 atom 2047 42 - 46 Integer serial Serial number of salt bridged 2048 atom 2049 47 - 51 Integer serial Serial number of hydrogen bonded 2050 atom 2051 52 - 56 Integer serial Serial number of hydrogen bonded 2052 atom 2053 57 - 61 Integer serial Serial number of salt bridged 2054 atom 2055 </pre> 2056 */ 2057 private void pdb_CONECT_Handler(String line) { 2058 2059 if ( atomOverflow) { 2060 return ; 2061 } 2062 if (params.isHeaderOnly()) { 2063 return; 2064 } 2065 2066 // this try .. catch is e.g. to catch 1gte which has wrongly formatted lines... 
2067 try { 2068 int atomserial = Integer.parseInt (line.substring(6 ,11).trim()); 2069 Integer bond1 = conect_helper(line,11,16); 2070 Integer bond2 = conect_helper(line,16,21); 2071 Integer bond3 = conect_helper(line,21,26); 2072 Integer bond4 = conect_helper(line,26,31); 2073 Integer hyd1 = conect_helper(line,31,36); 2074 Integer hyd2 = conect_helper(line,36,41); 2075 Integer salt1 = conect_helper(line,41,46); 2076 Integer hyd3 = conect_helper(line,46,51); 2077 Integer hyd4 = conect_helper(line,51,56); 2078 Integer salt2 = conect_helper(line,56,61); 2079 2080 //System.out.println(atomserial+ " "+ bond1 +" "+bond2+ " " +bond3+" "+bond4+" "+ 2081 // hyd1+" "+hyd2 +" "+salt1+" "+hyd3+" "+hyd4+" "+salt2); 2082 HashMap<String, Integer> cons = new HashMap<>(); 2083 cons.put("atomserial",atomserial); 2084 2085 if ( bond1 != null) cons.put("bond1",bond1); 2086 if ( bond2 != null) cons.put("bond2",bond2); 2087 if ( bond3 != null) cons.put("bond3",bond3); 2088 if ( bond4 != null) cons.put("bond4",bond4); 2089 if ( hyd1 != null) cons.put("hydrogen1",hyd1); 2090 if ( hyd2 != null) cons.put("hydrogen2",hyd2); 2091 if ( salt1 != null) cons.put("salt1",salt1); 2092 if ( hyd3 != null) cons.put("hydrogen3",hyd3); 2093 if ( hyd4 != null) cons.put("hydrogen4",hyd4); 2094 if ( salt2 != null) cons.put("salt2",salt2); 2095 2096 connects.add(cons); 2097 } catch (NumberFormatException e){ 2098 logger.info("could not parse CONECT line correctly ("+e.getMessage()+"), at line : " + line); 2099 return; 2100 } 2101 } 2102 2103 /** 2104 * Handler for MODEL Record Format 2105 * <pre> 2106 * COLUMNS DATA TYPE FIELD DEFINITION 2107 * ---------------------------------------------------------------------- 2108 * 1 - 6 Record name "MODEL " 2109 * 11 - 14 Integer serial Model serial number. 
2110 * </pre> 2111 */ 2112 private void pdb_MODEL_Handler(String line) { 2113 2114 if (params.isHeaderOnly()) return; 2115 2116 // new model: we start a new molecule 2117 startOfMolecule = true; 2118 startOfModel = true; 2119 2120 } 2121 2122 /** 2123 * Handler for TER record. The record is used in deposited PDB files and many others, 2124 * but it's often forgotten by some softwares. In any case it helps identifying the 2125 * start of ligand molecules so we use it for that. 2126 */ 2127 private void pdb_TER_Handler() { 2128 startOfMolecule = true; 2129 } 2130 2131 2132 /** 2133 * DBREF handler 2134 * <pre> 2135 * COLUMNS DATA TYPE FIELD DEFINITION 2136 * ---------------------------------------------------------------- 2137 * 1 - 6 Record name "DBREF " 2138 * 8 - 11 IDcode idCode ID code of this entry. 2139 * 13 Character chainID Chain identifier. 2140 * 15 - 18 Integer seqBegin Initial sequence number 2141 * of the PDB sequence segment. 2142 * 19 AChar insertBegin Initial insertion code 2143 * of the PDB sequence segment. 2144 * 21 - 24 Integer seqEnd Ending sequence number 2145 * of the PDB sequence segment. 2146 * 25 AChar insertEnd Ending insertion code 2147 * of the PDB sequence segment. 2148 * 27 - 32 LString database Sequence database name. 2149 * 34 - 41 LString dbAccession Sequence database accession code. 2150 * 43 - 54 LString dbIdCode Sequence database 2151 * identification code. 2152 * 56 - 60 Integer dbseqBegin Initial sequence number of the 2153 * database seqment. 2154 * 61 AChar idbnsBeg Insertion code of initial residue 2155 * of the segment, if PDB is the 2156 * reference. 2157 * 63 - 67 Integer dbseqEnd Ending sequence number of the 2158 * database segment. 2159 * 68 AChar dbinsEnd Insertion code of the ending 2160 * residue of the segment, if PDB is 2161 * the reference. 
2162 * </pre> 2163 */ 2164 private void pdb_DBREF_Handler(String line){ 2165 2166 logger.debug("Parsing DBREF {}", line); 2167 2168 DBRef dbref = new DBRef(); 2169 String idCode = line.substring(7,11); 2170 String chainName = line.substring(12,13); 2171 String seqBegin = line.substring(14,18); 2172 String insertBegin = line.substring(18,19); 2173 String seqEnd = line.substring(20,24); 2174 String insertEnd = line.substring(24,25); 2175 String database = line.substring(26,32); 2176 String dbAccession = line.substring(33,41); 2177 String dbIdCode = line.substring(42,54); 2178 String dbseqBegin = line.substring(55,60); 2179 String idbnsBeg = line.substring(60,61); 2180 String dbseqEnd = line.substring(62,67); 2181 // Support implicit space character at end 2182 String dbinsEnd; 2183 if(line.length() >= 68) 2184 dbinsEnd = line.substring(67,68); 2185 else 2186 dbinsEnd = " "; 2187 2188 dbref.setIdCode(idCode); 2189 dbref.setChainName(chainName); 2190 dbref.setSeqBegin(intFromString(seqBegin)); 2191 dbref.setInsertBegin(insertBegin.charAt(0)); 2192 dbref.setSeqEnd(intFromString(seqEnd)); 2193 dbref.setInsertEnd(insertEnd.charAt(0)); 2194 dbref.setDatabase(database.trim()); 2195 dbref.setDbAccession(dbAccession.trim()); 2196 dbref.setDbIdCode(dbIdCode.trim()); 2197 dbref.setDbSeqBegin(intFromString(dbseqBegin)); 2198 dbref.setIdbnsBegin(idbnsBeg.charAt(0)); 2199 dbref.setDbSeqEnd(intFromString(dbseqEnd)); 2200 dbref.setIdbnsEnd(dbinsEnd.charAt(0)); 2201 2202 //System.out.println(dbref.toPDB()); 2203 dbrefs.add(dbref); 2204 } 2205 2206 2207 /** 2208 * Process the disulfide bond info provided by an SSBOND record 2209 * 2210 * <pre> 2211 COLUMNS DATA TYPE FIELD DEFINITION 2212 ------------------------------------------------------------------- 2213 1 - 6 Record name "SSBOND" 2214 8 - 10 Integer serNum Serial number. 2215 12 - 14 LString(3) "CYS" Residue name. 2216 16 Character chainID1 Chain identifier. 2217 18 - 21 Integer seqNum1 Residue sequence number. 
2218 22 AChar icode1 Insertion code. 2219 26 - 28 LString(3) "CYS" Residue name. 2220 30 Character chainID2 Chain identifier. 2221 32 - 35 Integer seqNum2 Residue sequence number. 2222 36 AChar icode2 Insertion code. 2223 60 - 65 SymOP sym1 Symmetry oper for 1st resid 2224 67 - 72 SymOP sym2 Symmetry oper for 2nd resid 2225 * </pre> 2226 */ 2227 private void pdb_SSBOND_Handler(String line){ 2228 2229 if (params.isHeaderOnly()) return; 2230 2231 if (line.length()<36) { 2232 logger.info("SSBOND line has length under 36. Ignoring it."); 2233 return; 2234 } 2235 2236 String chain1 = line.substring(15,16); 2237 String seqNum1 = line.substring(17,21).trim(); 2238 String icode1 = line.substring(21,22); 2239 String chain2 = line.substring(29,30); 2240 String seqNum2 = line.substring(31,35).trim(); 2241 String icode2 = line.substring(35,36); 2242 2243 if (line.length()>=72) { 2244 String symop1 = line.substring(59, 65).trim(); 2245 String symop2 = line.substring(66, 72).trim(); 2246 2247 // until we implement proper treatment of symmetry in biojava #220, we can't deal with sym-related parteners properly, skipping them 2248 if (!"".equals(symop1) && !"".equals(symop2) && // in case the field is missing 2249 (!"1555".equals(symop1) || !"1555".equals(symop2)) ) { 2250 logger.info("Skipping ss bond between groups {} and {} belonging to different symmetry partners, because it is not supported yet", seqNum1+icode1, seqNum2+icode2); 2251 return; 2252 } 2253 } 2254 2255 if (" ".equals(icode1)) 2256 icode1 = ""; 2257 if (" ".equals(icode2)) 2258 icode2 = ""; 2259 2260 SSBondImpl ssbond = new SSBondImpl(); 2261 2262 ssbond.setChainID1(chain1); 2263 ssbond.setResnum1(seqNum1); 2264 ssbond.setChainID2(chain2); 2265 ssbond.setResnum2(seqNum2); 2266 ssbond.setInsCode1(icode1); 2267 ssbond.setInsCode2(icode2); 2268 ssbonds.add(ssbond); 2269 } 2270 2271 2272 /** 2273 * Takes care of LINK records. 
These take the format of: 2274 * 2275 * <pre> 2276 * COLUMNS DATA TYPE FIELD DEFINITION 2277 * -------------------------------------------------------------------------------- 2278 * 1 - 6 Record name "LINK " 2279 * 13 - 16 Atom name1 Atom name. 2280 * 17 Character altLoc1 Alternate location indicator. 2281 * 18 - 20 Residue name resName1 Residue name. 2282 * 22 Character chainID1 Chain identifier. 2283 * 23 - 26 Integer resSeq1 Residue sequence number. 2284 * 27 AChar iCode1 Insertion code. 2285 * 43 - 46 Atom name2 Atom name. 2286 * 47 Character altLoc2 Alternate location indicator. 2287 * 48 - 50 Residue name resName2 Residue name. 2288 * 52 Character chainID2 Chain identifier. 2289 * 53 - 56 Integer resSeq2 Residue sequence number. 2290 * 57 AChar iCode2 Insertion code. 2291 * 60 - 65 SymOP sym1 Symmetry operator for 1st atom. 2292 * 67 - 72 SymOP sym2 Symmetry operator for 2nd atom. 2293 * </pre> 2294 * 2295 * (From http://www.wwpdb.org/documentation/format32/sect6.html#LINK) 2296 * 2297 * @param line the LINK record line to parse. 2298 */ 2299 private void pdb_LINK_Handler(String line) { 2300 2301 if (params.isHeaderOnly()) return; 2302 2303 // Check for the minimal set of fields. 2304 if (line.length()<56) { 2305 logger.info("LINK line has length under 56. 
Ignoring it."); 2306 return; 2307 } 2308 2309 int len = line.length(); 2310 2311 String name1 = line.substring(12, 16).trim(); 2312 String altLoc1 = line.substring(16, 17).trim(); 2313 String resName1 = line.substring(17, 20).trim(); 2314 String chainID1 = line.substring(21, 22).trim(); 2315 String resSeq1 = line.substring(22, 26).trim(); 2316 String iCode1 = line.substring(26, 27).trim(); 2317 2318 String name2 = line.substring(42, 46).trim(); 2319 String altLoc2 = line.substring(46, 47).trim(); 2320 String resName2 = line.substring(47, 50).trim(); 2321 String chainID2 = line.substring(51, 52).trim(); 2322 String resSeq2 = line.substring(52, 56).trim(); 2323 String iCode2 = null; // Might get trimmed if blank. 2324 if (len > 56) iCode2 = line.substring(56, 57).trim(); 2325 2326 String sym1 = null; 2327 if (len > 64) sym1 = line.substring(59, 65).trim(); 2328 String sym2 = null; 2329 if (len > 71) sym2 = line.substring(66, 72).trim(); 2330 2331 linkRecords.add(new LinkRecord( 2332 name1, altLoc1, resName1, chainID1, resSeq1, iCode1, 2333 name2, altLoc2, resName2, chainID2, resSeq2, iCode2, 2334 sym1, sym2)); 2335 } 2336 2337 /** 2338 * Handler for the SITE records. <br> 2339 * 2340 * <pre> 2341 * 2342 * COLUMNS DATA TYPE FIELD DEFINITION 2343 * --------------------------------------------------------------------------------- 2344 * 1 - 6 Record name "SITE " 2345 * 8 - 10 Integer seqNum Sequence number. 2346 * 12 - 14 LString(3) siteID Site name. 2347 * 16 - 17 Integer numRes Number of residues that compose the siteResidues. 2348 * 19 - 21 Residue name resName1 Residue name for first residue that 2349 * creates the siteResidues. 2350 * 23 Character chainID1 Chain identifier for first residue of siteResidues. 2351 * 24 - 27 Integer seq1 Residue sequence number for first residue 2352 * of the siteResidues. 2353 * 28 AChar iCode1 Insertion code for first residue of the siteResidues. 
2354 * 2355 * example: 2356 * 1 2 3 4 5 6 7 8 2357 * 12345678901234567890123456789012345678901234567890123456789012345678901234567890 2358 * SITE 1 AC1 3 HIS A 94 HIS A 96 HIS A 119 2359 * SITE 1 AC2 5 ASN A 62 GLY A 63 HIS A 64 HOH A 328 2360 * SITE 2 AC2 5 HOH A 634 2361 * SITE 1 AC3 5 GLN A 136 GLN A 137 PRO A 138 GLU A 205 2362 * SITE 2 AC3 5 CYS A 206 2363 * SITE 1 AC4 11 HIS A 64 HIS A 94 HIS A 96 HIS A 119 2364 * SITE 2 AC4 11 LEU A 198 THR A 199 THR A 200 TRP A 209 2365 * SITE 3 AC4 11 HOH A 572 HOH A 582 HOH A 635 2366 * </pre> 2367 * @param line the SITE line record being currently read 2368 * @author Amr ALHOSSARY 2369 * @author Jules Jacobsen 2370 */ 2371 private void pdb_SITE_Handler(String line){ 2372 2373 if (params.isHeaderOnly()) return; 2374 2375 // make a map of: SiteId to List<ResidueNumber> 2376 2377 logger.debug("Site Line:{}", line); 2378 2379 2380 String siteID = line.substring(11, 14); 2381 //fetch the siteResidues from the map 2382 List<ResidueNumber> siteResidues = siteToResidueMap.get(siteID); 2383 2384 //if the siteResidues doesn't yet exist, make a new one. 2385 if (siteResidues == null || ! siteToResidueMap.containsKey(siteID.trim())){ 2386 siteResidues = new ArrayList<>(); 2387 siteToResidueMap.put(siteID.trim(), siteResidues); 2388 2389 logger.debug(String.format("New Site made: %s %s", siteID, siteResidues)); 2390 logger.debug("Now made {} sites", siteMap.size()); 2391 2392 } 2393 2394 logger.debug(String.format("SiteId: %s", siteID)); 2395 2396 2397 //line = 'SITE 1 AC1 6 ARG H 221A LYS H 224 HOH H 403 HOH H 460' 2398 //line.substring(18) = 'ARG H 221A LYS H 224 HOH H 403 HOH H 460' 2399 line = line.substring(18); 2400 String groupString = null; 2401 //groupString = 'ARG H 221A' 2402 //keep iterating through chunks of 10 characters - these are the groups in the siteResidues 2403 while (!" 
".equals((groupString = line.substring(0, 10)))) { 2404 //groupstring: 'ARG H 221A' 2405 2406 logger.debug("groupString: '{}'", groupString); 2407 2408 //set the residue name 2409 //residueName = 'ARG' 2410 String residueName = groupString.substring(0, 3); 2411 Character aminoCode1 = StructureTools.get1LetterCode(residueName); 2412 if (aminoCode1 != null) { 2413 if (aminoCode1.equals(StructureTools.UNKNOWN_GROUP_LABEL)) { 2414 aminoCode1 = null; 2415 } 2416 } 2417 2418 //this is already in the right format, so no need to fiddle with it... 2419 //pdbCode = 'H 221A' 2420 // String pdbCode = groupString.substring(4, 10).trim(); 2421 String chainId = groupString.substring(4, 5); 2422 Integer resNum = Integer.valueOf(groupString.substring(5, 9).trim()); 2423 Character insCode = groupString.substring(9, 10).charAt(0); 2424 //set insCode to null as a measure to prevent storing thousands of empty Strings 2425 //- the empty value is returned using Group.getInsCode() 2426 // if (insCode.equals(" ")) { 2427 // insCode = null; 2428 // } 2429 2430 logger.debug(String.format("Site: %s: 'resName:%s resNum:%s insCode:%s'", siteID, residueName, resNum, insCode)); 2431 2432 //make a new resNum with the data - this will be linked up with a site later 2433 ResidueNumber residueNumber = new ResidueNumber(); 2434 2435 2436 logger.debug("pdbCode: '{}{}'", resNum, insCode); 2437 2438 residueNumber.setChainName(chainId); 2439 residueNumber.setSeqNum(resNum); 2440 residueNumber.setInsCode(insCode); 2441 //add the resNum to the groups 2442 siteResidues.add(residueNumber); 2443 2444 logger.debug("Adding residueNumber " + residueNumber + " to site " + siteID); 2445 2446 line = line.substring(11); 2447 } 2448 2449 logger.debug("Current SiteMap (contains {} sites):", siteToResidueMap.keySet().size()); 2450 for (String key : siteToResidueMap.keySet()) { 2451 logger.debug(key + " : " + siteToResidueMap.get(key)); 2452 } 2453 2454 } 2455 2456 //Site variable related to parsing the REMARK 800 
records. 2457 Site site; 2458 2459 private String[] keywords; 2460 private void pdb_REMARK_800_Handler(String line){ 2461 2462 if (params.isHeaderOnly()) return; 2463 2464 // 'REMARK 800 SITE_IDENTIFIER: CAT ' 2465 line = line.substring(11); 2466 String[] fields = line.split(": "); 2467 2468 if (fields.length == 2) { 2469 if ("SITE_IDENTIFIER".equals(fields[0])) { 2470 // remark800Counter++; 2471 String siteID = fields[1].trim(); 2472 2473 logger.debug("siteID: '{}'", siteID); 2474 2475 //fetch the siteResidues from the map 2476 site = siteMap.get(siteID); 2477 2478 //if the siteResidues doesn't yet exist, make a new one. 2479 if (site == null || !siteID.equals(site.getSiteID())) { 2480 site = new Site(siteID, new ArrayList<Group>()); 2481 siteMap.put(site.getSiteID(), site); 2482 2483 logger.debug("New Site made: {}", site); 2484 logger.debug("Now made {} sites", siteMap.size()); 2485 2486 } 2487 } 2488 if ("EVIDENCE_CODE".equals(fields[0])) { 2489 // remark800Counter++; 2490 String evCode = fields[1].trim(); 2491 2492 logger.debug("evCode: '{}'", evCode); 2493 2494 //fetch the siteResidues from the map 2495 site.setEvCode(evCode); 2496 } 2497 if ("SITE_DESCRIPTION".equals(fields[0])) { 2498 // remark800Counter++; 2499 String desc = fields[1].trim(); 2500 2501 logger.debug("desc: '{}'", desc); 2502 2503 //fetch the siteResidues from the map 2504 site.setDescription(desc); 2505 2506 logger.debug("Finished making REMARK 800 for site {}", site.getSiteID()); 2507 logger.debug(site.remark800toPDB()); 2508 2509 } 2510 } 2511 } 2512 2513 private int intFromString(String intString){ 2514 int val = Integer.MIN_VALUE; 2515 try { 2516 val = Integer.parseInt(intString.trim()); 2517 } catch (NumberFormatException ex){ 2518 logger.info("Could not parse a number: " + ex.getMessage()); 2519 } 2520 return val; 2521 } 2522 2523 2524 2525 /** 2526 * Finds in the given list of chains the first one that has as name the given chainID. 
2527 * If no such Chain can be found it returns null. 2528 */ 2529 private static Chain isKnownChain(String chainID, List<Chain> chains){ 2530 2531 for (int i = 0; i< chains.size();i++){ 2532 Chain testchain = chains.get(i); 2533 if (chainID.equals(testchain.getName())) { 2534 return testchain; 2535 } 2536 } 2537 2538 return null; 2539 } 2540 2541 2542 2543 private BufferedReader getBufferedReader(InputStream inStream) 2544 throws IOException { 2545 2546 BufferedReader buf ; 2547 if (inStream == null) { 2548 throw new IOException ("input stream is null!"); 2549 } 2550 2551 buf = new BufferedReader (new InputStreamReader (inStream)); 2552 return buf ; 2553 2554 } 2555 2556 2557 2558 /** 2559 * Parse a PDB file and return a datastructure implementing 2560 * PDBStructure interface. 2561 * 2562 * @param inStream an InputStream object 2563 * @return a Structure object 2564 * @throws IOException 2565 */ 2566 public Structure parsePDBFile(InputStream inStream) 2567 throws IOException 2568 { 2569 2570 BufferedReader buf = getBufferedReader(inStream); 2571 2572 return parsePDBFile(buf); 2573 2574 } 2575 2576 /** 2577 * Parse a PDB file and return a datastructure implementing 2578 * PDBStructure interface. 2579 * 2580 * @param buf a BufferedReader object 2581 * @return the Structure object 2582 * @throws IOException ... 2583 */ 2584 public Structure parsePDBFile(BufferedReader buf) 2585 throws IOException 2586 { 2587 // set the correct max values for parsing... 
2588 loadMaxAtoms = params.getMaxAtoms(); 2589 atomCAThreshold = params.getAtomCaThreshold(); 2590 2591 2592 // (re)set structure 2593 2594 allModels = new ArrayList<>(); 2595 structure = new StructureImpl() ; 2596 currentModel = null; 2597 currentChain = null; 2598 currentGroup = null; 2599 // we initialise to true since at the beginning of the file we are always starting a new molecule 2600 startOfMolecule = true; 2601 startOfModel = true; 2602 2603 seqResChains = new ArrayList<>(); 2604 siteMap = new LinkedHashMap<>(); 2605 pdbHeader = new PDBHeader(); 2606 connects = new ArrayList<>(); 2607 previousContinuationField = ""; 2608 continuationField = ""; 2609 continuationString = ""; 2610 current_compound = null; 2611 sourceLines.clear(); 2612 compndLines.clear(); 2613 keywordsLines.clear(); 2614 isLastCompndLine = false; 2615 isLastSourceLine = false; 2616 prevMolId = -1; 2617 entities.clear(); 2618 helixList.clear(); 2619 strandList.clear(); 2620 turnList.clear(); 2621 lengthCheck = -1; 2622 atomCount = 0; 2623 atomOverflow = false; 2624 linkRecords = new ArrayList<>(); 2625 siteToResidueMap.clear(); 2626 2627 blankChainIdsPresent = false; 2628 2629 parseCAonly = params.isParseCAOnly(); 2630 2631 String line = null; 2632 2633 while ((line = buf.readLine()) != null) { 2634 2635 // ignore empty lines 2636 if ( "".equals(line) || 2637 (line.equals(NEWLINE))){ 2638 continue; 2639 } 2640 2641 2642 // ignore short TER and END lines 2643 if ( line.startsWith("END")) { 2644 continue; 2645 } 2646 2647 if ( line.length() < 6 && !line.startsWith("TER")) { 2648 logger.info("Found line length below 6. 
Ignoring it, line: >" + line +"<" ); 2649 continue; 2650 } 2651 2652 String recordName = null; 2653 if (line.length()<6) 2654 recordName = line.trim(); 2655 else 2656 recordName = line.substring (0, 6).trim (); 2657 2658 try { 2659 if ("ATOM".equals(recordName)) 2660 pdb_ATOM_Handler(line); 2661 else if ("SEQRES".equals(recordName)) 2662 pdb_SEQRES_Handler(line); 2663 else if ("HETATM".equals(recordName)) 2664 pdb_ATOM_Handler(line); 2665 else if ("MODEL".equals(recordName)) 2666 pdb_MODEL_Handler(line); 2667 else if ("TER".equals(recordName)) 2668 pdb_TER_Handler(); 2669 else if ("HEADER".equals(recordName)) 2670 pdb_HEADER_Handler(line); 2671 else if ("AUTHOR".equals(recordName)) 2672 pdb_AUTHOR_Handler(line); 2673 else if ("TITLE".equals(recordName)) 2674 pdb_TITLE_Handler(line); 2675 else if ("SOURCE".equals(recordName)) 2676 sourceLines.add(line); //pdb_SOURCE_Handler 2677 else if ("COMPND".equals(recordName)) 2678 compndLines.add(line); //pdb_COMPND_Handler 2679 else if ("KEYWDS".equals(recordName)) 2680 keywordsLines.add(line); 2681 else if ("JRNL".equals(recordName)) 2682 pdb_JRNL_Handler(line); 2683 else if ("EXPDTA".equals(recordName)) 2684 pdb_EXPDTA_Handler(line); 2685 else if ("CRYST1".equals(recordName)) 2686 pdb_CRYST1_Handler(line); 2687 else if (recordName.startsWith("MTRIX")) 2688 pdb_MTRIXn_Handler(line); 2689 else if ("REMARK".equals(recordName)) 2690 pdb_REMARK_Handler(line); 2691 else if ("CONECT".equals(recordName)) 2692 pdb_CONECT_Handler(line); 2693 else if ("REVDAT".equals(recordName)) 2694 pdb_REVDAT_Handler(line); 2695 else if ("DBREF".equals(recordName)) 2696 pdb_DBREF_Handler(line); 2697 else if ("SITE".equals(recordName)) 2698 pdb_SITE_Handler(line); 2699 else if ("SSBOND".equals(recordName)) 2700 pdb_SSBOND_Handler(line); 2701 else if ("LINK".equals(recordName)) 2702 pdb_LINK_Handler(line); 2703 else if ( params.isParseSecStruc()) { 2704 if ( "HELIX".equals(recordName) ) pdb_HELIX_Handler ( line ) ; 2705 else if 
("SHEET".equals(recordName)) pdb_SHEET_Handler(line ) ; 2706 else if ("TURN".equals(recordName)) pdb_TURN_Handler( line ) ; 2707 } 2708 } catch (StringIndexOutOfBoundsException | NullPointerException ex) { 2709 logger.info("Unable to parse [" + line + "]"); 2710 } 2711 } 2712 2713 makeCompounds(compndLines, sourceLines); 2714 2715 handlePDBKeywords(keywordsLines); 2716 2717 triggerEndFileChecks(); 2718 2719 if (params.shouldCreateAtomBonds()) { 2720 formBonds(); 2721 } 2722 2723 if ( params.shouldCreateAtomCharges()) { 2724 addCharges(); 2725 } 2726 2727 if ( params.isParseSecStruc() && !params.isHeaderOnly()) 2728 setSecStruc(); 2729 2730 // Now correct the alternate location group 2731 StructureTools.cleanUpAltLocs(structure); 2732 2733 return structure; 2734 2735 } 2736 2737 2738 /** 2739 * Add the charges to the Structure 2740 */ 2741 private void addCharges() { 2742 ChargeAdder.addCharges(structure); 2743 } 2744 2745 /** 2746 * This is the new method for building the COMPND and SOURCE records. Now each method is self-contained. 
2747 * @author Jules Jacobsen 2748 * @param compoundList 2749 * @param sourceList 2750 */ 2751 private void makeCompounds(List<String> compoundList, 2752 List<String> sourceList) { 2753 // System.out.println("[makeCompounds] making compounds from compoundLines"); 2754 2755 for (String line : compoundList) { 2756 if (compoundList.indexOf(line) + 1 == compoundList.size()) { 2757 // System.out.println("[makeCompounds] Final line in compoundLines."); 2758 isLastCompndLine = true; 2759 } 2760 pdb_COMPND_Handler(line); 2761 2762 } 2763 // System.out.println("[makeCompounds] adding sources to compounds from sourceLines"); 2764 // since we're starting again from the first compound, reset it here 2765 if ( entities.size() == 0){ 2766 current_compound = new EntityInfo(); 2767 } else { 2768 current_compound = entities.get(0); 2769 } 2770 for (String line : sourceList) { 2771 if (sourceList.indexOf(line) + 1 == sourceList.size()) { 2772 // System.out.println("[makeCompounds] Final line in sourceLines."); 2773 isLastSourceLine = true; 2774 } 2775 pdb_SOURCE_Handler(line); 2776 } 2777 2778 } 2779 2780 /**Parse KEYWODS record of the PDB file.<br> 2781 * A keyword may be split over two lines. whether a keyword ends by the end 2782 * of a line or it is aplit over two lines, a <code>space</code> is added 2783 * between the 2 lines's contents, unless the first line ends in 2784 * a '-' character. 2785 * <pre> 2786 * Record Format 2787 * COLUMNS DATA TYPE FIELD DEFINITION 2788 * --------------------------------------------------------------------------------- 2789 * 1 - 6 Record name "KEYWDS" 2790 * 9 - 10 Continuation continuation Allows concatenation of records if necessary. 2791 * 11 - 79 List keywds Comma-separated list of keywords relevant 2792 * to the entry. 
2793 * Example 2794 * 1 2 3 4 5 6 7 8 2795 * 12345678901234567890123456789012345678901234567890123456789012345678901234567890 2796 * KEYWDS LYASE, TRICARBOXYLIC ACID CYCLE, MITOCHONDRION, OXIDATIVE 2797 * KEYWDS 2 METABOLISM 2798 * </pre> 2799 * @param lines The KEWODS record lines. 2800 * @author Amr ALHOSSARY 2801 */ 2802 private void handlePDBKeywords(List<String> lines) { 2803 StringBuilder fullList = new StringBuilder(); 2804 for (String line : lines) { 2805 String kwList = line.substring(10).trim(); 2806 if(kwList.length() > 0) { 2807 if(fullList.length() > 0 && fullList.indexOf("-", fullList.length()-1) < 0) { 2808 fullList.append(' '); 2809 } 2810 fullList.append(kwList); 2811 } 2812 } 2813 String fulllengthList = fullList.toString(); 2814 keywords = fulllengthList.split("( )*,( )*"); 2815 ArrayList<String> lst = new ArrayList<>(keywords.length); 2816 for (String keyword : keywords) { 2817 if(keyword.length() == 0) { 2818 logger.debug("Keyword empty in structure {}", structure.getIdentifier().toString()); 2819 continue; 2820 } 2821 lst.add(keyword); 2822 } 2823 pdbHeader.setKeywords(lst); 2824 } 2825 2826 /** 2827 * Handles creation of all bonds. Looks at LINK records, SSBOND (Disulfide 2828 * bonds), peptide bonds, and intra-residue bonds. 2829 * <p> 2830 * Note: the current implementation only looks at the first model of each 2831 * structure. This may need to be fixed in the future. 2832 */ 2833 private void formBonds() { 2834 2835 BondMaker maker = new BondMaker(structure, params); 2836 2837 // LINK records should be preserved, they are the way that 2838 // inter-residue bonds are created for ligands such as trisaccharides, unusual polymers. 2839 // The analogy in mmCIF is the _struct_conn record. 
2840 for (LinkRecord linkRecord : linkRecords) { 2841 maker.formLinkRecordBond(linkRecord); 2842 } 2843 2844 maker.formDisulfideBonds(ssbonds); 2845 2846 maker.makeBonds(); 2847 } 2848 2849 2850 2851 private void triggerEndFileChecks(){ 2852 2853 // we need to add the last chain and model, checking for nulls (e.g. the file could be completely empty of ATOM lines) 2854 if (currentChain!=null && currentGroup!=null) { 2855 currentChain.addGroup(currentGroup); 2856 } 2857 if (currentModel!=null && currentChain!=null) { 2858 currentModel.add(currentChain); 2859 } 2860 if (currentModel!=null) { 2861 allModels.add(currentModel); 2862 } 2863 2864 if (blankChainIdsPresent) { 2865 // from biojava 5.0 there's limited support for old pdb files with blank chain ids 2866 logger.warn("Found some blank chain ids in PDB file. Please note that support for them has been discontinued and things might not work properly."); 2867 } 2868 2869 // reordering chains following the mmcif model and assigning entities 2870 assignChainsAndEntities(); 2871 structure.setEntityInfos(entities); 2872 2873 2874 2875 // header data 2876 2877 Date modDate = pdbHeader.getModDate(); 2878 if ( modDate.equals(new Date(0)) ) { 2879 // modification date = deposition date 2880 Date depositionDate = pdbHeader.getDepDate(); 2881 2882 if (! depositionDate.equals(modDate)){ 2883 // depDate is 0000-00-00 2884 pdbHeader.setModDate(depositionDate); 2885 } 2886 } 2887 2888 structure.setPDBHeader(pdbHeader); 2889 structure.setCrystallographicInfo(crystallographicInfo); 2890 2891 //set the JournalArticle, if there is one 2892 if (!journalLines.isEmpty()) { 2893 buildjournalArticle(); 2894 pdbHeader.setJournalArticle(journalArticle); 2895 } 2896 2897 structure.setDBRefs(dbrefs); 2898 2899 // Only align if requested (default) and not when headerOnly mode with no Atoms. 2900 // Otherwise, we store the empty SeqRes Groups unchanged in the right chains. 
2901 if ( params.isAlignSeqRes() && !params.isHeaderOnly() && !seqResChains.isEmpty()){ 2902 logger.debug("Parsing mode align_seqres, will parse SEQRES and align to ATOM sequence"); 2903 SeqRes2AtomAligner aligner = new SeqRes2AtomAligner(); 2904 aligner.align(structure,seqResChains); 2905 2906 } else { 2907 logger.debug("Parsing mode unalign_seqres, will parse SEQRES but not align it to ATOM sequence"); 2908 SeqRes2AtomAligner.storeUnAlignedSeqRes(structure, seqResChains, params.isHeaderOnly()); 2909 } 2910 2911 2912 2913 //associate the temporary Groups in the siteMap to the ones 2914 if (!params.isHeaderOnly()) { 2915 // Only can link SITES if Atom Groups were parsed. 2916 linkSitesToGroups(); // will work now that setSites is called 2917 } 2918 2919 if ( bioAssemblyParser != null){ 2920 bioAssemblyParser.setMacromolecularSizes(); 2921 pdbHeader.setBioAssemblies(bioAssemblyParser.getTransformationMap()); 2922 } 2923 2924 if (ncsOperators !=null && ncsOperators.size()>0) { 2925 crystallographicInfo.setNcsOperators( 2926 ncsOperators.toArray(new Matrix4d[ncsOperators.size()])); 2927 } 2928 2929 2930 // rfree end file check 2931 // Rfree annotation is not very consistent in PDB format, it varies depending on the software 2932 // Here we follow this strategy: 2933 // a) take the '(NO CUTOFF)' value if the only one available (shelx software, e.g. 1x7q) 2934 // b) don't take it if also a line without '(NO CUTOFF)' is present (CNX software, e.g. 
3lak) 2935 2936 if (rfreeNoCutoffLine>0 && rfreeStandardLine<0) { 2937 pdbHeader.setRfree(rfreeNoCutoffLine); 2938 } else if (rfreeNoCutoffLine>0 && rfreeStandardLine>0) { 2939 pdbHeader.setRfree(rfreeStandardLine); 2940 } else if (rfreeNoCutoffLine<0 && rfreeStandardLine>0) { 2941 pdbHeader.setRfree(rfreeStandardLine); 2942 } // otherwise it remains default value: PDBHeader.DEFAULT_RFREE 2943 2944 2945 2946 } 2947 2948 private void setSecStruc(){ 2949 2950 setSecElement(helixList, SecStrucInfo.PDB_AUTHOR_ASSIGNMENT, 2951 SecStrucType.helix4); 2952 setSecElement(strandList, SecStrucInfo.PDB_AUTHOR_ASSIGNMENT, 2953 SecStrucType.extended); 2954 setSecElement(turnList, SecStrucInfo.PDB_AUTHOR_ASSIGNMENT, 2955 SecStrucType.turn); 2956 2957 //Now insert random coil to the Groups that did not have SS information 2958 GroupIterator gi = new GroupIterator(structure); 2959 while (gi.hasNext()){ 2960 Group g = gi.next(); 2961 if (g.hasAminoAtoms()){ 2962 if (g.getProperty(Group.SEC_STRUC) == null){ 2963 SecStrucInfo ss = new SecStrucInfo(g, 2964 SecStrucInfo.PDB_AUTHOR_ASSIGNMENT, 2965 SecStrucType.coil); 2966 g.setProperty(Group.SEC_STRUC, ss); 2967 } 2968 } 2969 } 2970 2971 } 2972 2973 private void setSecElement(List<Map<String,String>> secList, String assignment, SecStrucType type){ 2974 2975 2976 Iterator<Map<String,String>> iter = secList.iterator(); 2977 nextElement: 2978 while (iter.hasNext()){ 2979 Map<String,String> m = iter.next(); 2980 2981 // assign all residues in this range to this secondary structure type 2982 // String initResName = (String)m.get("initResName"); 2983 String initChainId = m.get("initChainId"); 2984 String initSeqNum = m.get("initSeqNum" ); 2985 String initICode = m.get("initICode" ); 2986 // String endResName = (String)m.get("endResName" ); 2987 String endChainId = m.get("endChainId" ); 2988 String endSeqNum = m.get("endSeqNum"); 2989 String endICode = m.get("endICode"); 2990 2991 if (" ".equals(initICode)) 2992 initICode = ""; 2993 if (" 
".equals(endICode)) 2994 endICode = ""; 2995 2996 GroupIterator gi = new GroupIterator(structure); 2997 boolean inRange = false; 2998 while (gi.hasNext()){ 2999 Group g = gi.next(); 3000 Chain c = g.getChain(); 3001 3002 if (c.getName().equals(initChainId)){ 3003 3004 String pdbCode = initSeqNum + initICode; 3005 if ( g.getResidueNumber().toString().equals(pdbCode) ) { 3006 inRange = true; 3007 } 3008 } 3009 if ( inRange){ 3010 if (g.hasAminoAtoms()) { 3011 SecStrucInfo ss = new SecStrucInfo(g, assignment, type); 3012 g.setProperty(Group.SEC_STRUC, ss); 3013 } 3014 3015 } 3016 if ( c.getName().equals(endChainId)){ 3017 String pdbCode = endSeqNum + endICode; 3018 if (pdbCode.equals(g.getResidueNumber().toString())){ 3019 inRange = false; 3020 continue nextElement; 3021 } 3022 } 3023 } 3024 } 3025 } 3026 3027 /** 3028 * Gets all chains with given chainName from given models list 3029 * @param chainName 3030 * @param polyModels 3031 * @return 3032 */ 3033 private static List<List<Chain>> findChains(String chainName, List<List<Chain>> polyModels) { 3034 List<List<Chain>> models = new ArrayList<>(); 3035 3036 for (List<Chain> chains:polyModels) { 3037 List<Chain> matchingChains = new ArrayList<>(); 3038 models.add(matchingChains); 3039 for (Chain c:chains) { 3040 if (c.getName().equals(chainName)) { 3041 matchingChains.add(c); 3042 } 3043 } 3044 } 3045 return models; 3046 } 3047 3048 /** 3049 * Split the given chain (containing non-polymer groups and water groups only) 3050 * into individual chains per non-polymer group and individual chains per contiguous sets of water groups. 
3051 * @param chain 3052 * @return a list of lists of size 2: first list is the split non-poly chains, second list is the split water chains 3053 */ 3054 private static List<List<Chain>> splitNonPolyChain(Chain chain) { 3055 List<Chain> splitNonPolys = new ArrayList<>(); 3056 List<Chain> waterChains = new ArrayList<>(); 3057 3058 Chain split = null; 3059 boolean previousGroupIsWater = false; 3060 3061 for (Group g:chain.getAtomGroups()){ 3062 3063 if (!previousGroupIsWater) { 3064 // add last one if there's one 3065 if (split!=null) { 3066 splitNonPolys.add(split); 3067 } 3068 split = new ChainImpl(); 3069 split.setName(chain.getName()); 3070 } else if (!g.isWater()) { 3071 // previous group is water and this group is not water: we change from a water chain to a non-poly 3072 // we'll need to add now the water chain to the list of water chains 3073 waterChains.add(split); 3074 split = new ChainImpl(); 3075 split.setName(chain.getName()); 3076 } 3077 3078 if (g.isWater()) { 3079 previousGroupIsWater = true; 3080 } else { 3081 previousGroupIsWater = false; 3082 3083 } 3084 3085 // this should include alt locs (referenced from the main group) 3086 split.addGroup(g); 3087 3088 } 3089 3090 // adding the last split chain: either to water or non-poly depending on what was the last seen group 3091 if (split!=null) { 3092 if (previousGroupIsWater) 3093 waterChains.add(split); 3094 else 3095 splitNonPolys.add(split); 3096 } 3097 3098 3099 List<List<Chain>> all = new ArrayList<>(2); 3100 all.add(splitNonPolys); 3101 all.add(waterChains); 3102 3103 return all; 3104 } 3105 3106 /** 3107 * Assign asym ids following the rules used by the PDB to assign asym ids in mmCIF files 3108 * @param polys 3109 * @param nonPolys 3110 * @param waters 3111 */ 3112 private void assignAsymIds(List<List<Chain>> polys, List<List<Chain>> nonPolys, List<List<Chain>> waters) { 3113 3114 for (int i=0; i<polys.size(); i++) { 3115 String asymId = "A"; 3116 3117 for (Chain poly:polys.get(i)) { 3118 
poly.setId(asymId); 3119 asymId = getNextAsymId(asymId); 3120 } 3121 for (Chain nonPoly:nonPolys.get(i)) { 3122 nonPoly.setId(asymId); 3123 asymId = getNextAsymId(asymId); 3124 } 3125 for (Chain water:waters.get(i)) { 3126 water.setId(asymId); 3127 asymId = getNextAsymId(asymId); 3128 } 3129 } 3130 } 3131 3132 /** 3133 * Gets the next asym id given an asymId, according to the convention followed by 3134 * mmCIF files produced by the PDB 3135 * i.e.: A,B,...,Z,AA,BA,CA,...,ZA,AB,BB,CB,...,ZB,.......,ZZ,AAA,BAA,CAA,... 3136 * @param asymId 3137 * @return 3138 */ 3139 private String getNextAsymId(String asymId) { 3140 if (asymId.length()==1) { 3141 if (!"Z".equals(asymId)) { 3142 return Character.toString(getNextChar(asymId.charAt(0))); 3143 } else { 3144 return "AA"; 3145 } 3146 } else if (asymId.length()==2) { 3147 if ("ZZ".equals(asymId)) { 3148 return "AAA"; 3149 } 3150 char[] c = new char[2]; 3151 asymId.getChars(0, 2, c, 0); 3152 c[0] = getNextChar(c[0]); 3153 if (c[0]=='A') { 3154 c[1] = getNextChar(c[1]); 3155 } 3156 return String.valueOf(c); 3157 } else if (asymId.length()==3) { 3158 char[] c = new char[3]; 3159 asymId.getChars(0, 3, c, 0); 3160 c[0] = getNextChar(c[0]); 3161 if (c[0]=='A') { 3162 c[1] = getNextChar(c[1]); 3163 if (c[1]=='A') { 3164 c[2] = getNextChar(c[2]); 3165 } 3166 } 3167 return String.valueOf(c); 3168 } 3169 return null; 3170 } 3171 3172 private char getNextChar(char c) { 3173 if (c!='Z') { 3174 return ((char)(c+1)); 3175 } else { 3176 return 'A'; 3177 } 3178 } 3179 3180 /** 3181 * Here we assign chains following the mmCIF data model: 3182 * one chain per polymer, one chain per non-polymer group and 3183 * several water chains. 
3184 * <p> 3185 * Subsequently we assign entities for them: either from those read from 3186 * COMPOUND records or from those found heuristically through {@link EntityFinder} 3187 * 3188 */ 3189 private void assignChainsAndEntities(){ 3190 3191 List<List<Chain>> polyModels = new ArrayList<>(); 3192 List<List<Chain>> nonPolyModels = new ArrayList<>(); 3193 List<List<Chain>> waterModels = new ArrayList<>(); 3194 3195 for (List<Chain> model:allModels) { 3196 3197 List<Chain> polyChains = new ArrayList<>(); 3198 List<Chain> nonPolyChains = new ArrayList<>(); 3199 List<Chain> waterChains = new ArrayList<>(); 3200 3201 polyModels.add(polyChains); 3202 nonPolyModels.add(nonPolyChains); 3203 waterModels.add(waterChains); 3204 3205 for (Chain c:model) { 3206 3207 // we only have entities for polymeric chains, all others are ignored for assigning entities 3208 if (c.isWaterOnly()) { 3209 waterChains.add(c); 3210 3211 } else if (c.isPureNonPolymer()) { 3212 nonPolyChains.add(c); 3213 3214 } else { 3215 polyChains.add(c); 3216 } 3217 } 3218 } 3219 3220 List<List<Chain>> splitNonPolyModels = new ArrayList<>(); 3221 for (int i=0; i<nonPolyModels.size(); i++) { 3222 List<Chain> nonPolyModel = nonPolyModels.get(i); 3223 List<Chain> waterModel = waterModels.get(i); 3224 3225 List<Chain> splitNonPolys = new ArrayList<>(); 3226 splitNonPolyModels.add(splitNonPolys); 3227 3228 for (Chain nonPoly:nonPolyModel) { 3229 List<List<Chain>> splits = splitNonPolyChain(nonPoly); 3230 splitNonPolys.addAll(splits.get(0)); 3231 waterModel.addAll(splits.get(1)); 3232 } 3233 } 3234 3235 3236 // now we have all chains as in mmcif, let's assign ids following the mmcif rules 3237 assignAsymIds(polyModels, splitNonPolyModels, waterModels); 3238 3239 3240 if (!entities.isEmpty()) { 3241 // if the file contained COMPOUND records then we can assign entities to the poly chains 3242 for (EntityInfo comp : entities){ 3243 List<String> chainIds = compoundMolIds2chainIds.get(comp.getMolId()); 3244 if ( 
chainIds == null) 3245 continue; 3246 for ( String chainId : chainIds) { 3247 3248 List<List<Chain>> models = findChains(chainId, polyModels); 3249 3250 for (List<Chain> matchingChains:models) { 3251 for (Chain chain:matchingChains) { 3252 comp.addChain(chain); 3253 chain.setEntityInfo(comp); 3254 } 3255 3256 if (matchingChains.isEmpty()) { 3257 // usually if this happens something is wrong with the PDB header 3258 // e.g. 2brd - there is no Chain A, although it is specified in the header 3259 // Some bona-fide cases exist, e.g. 2ja5, chain N is described in SEQRES 3260 // but the authors didn't observe in the density so it's completely missing 3261 // from the ATOM lines 3262 logger.warn("Could not find polymeric chain {} to link to entity {}. The chain will be missing in the entity.", chainId, comp.getMolId()); 3263 } 3264 } 3265 } 3266 } 3267 3268 } else { 3269 3270 logger.info("Entity information (COMPOUND record) not found in file. Will assign entities heuristically"); 3271 // if no entity information was present in file we then go and find the entities heuristically with EntityFinder 3272 entities = EntityFinder.findPolyEntities(polyModels); 3273 3274 } 3275 3276 // now we assign entities to the nonpoly and water chains 3277 EntityFinder.createPurelyNonPolyEntities(splitNonPolyModels, waterModels, entities); 3278 3279 3280 // in some rare cases purely non-polymer or purely water chain are present in pdb files 3281 // see https://github.com/biojava/biojava/pull/394 3282 // these case should be covered by the above 3283 3284 3285 // now that we have entities in chains we add the chains to the structure 3286 3287 for (int i=0;i<allModels.size();i++) { 3288 List<Chain> model = new ArrayList<>(); 3289 model.addAll(polyModels.get(i)); 3290 model.addAll(splitNonPolyModels.get(i)); 3291 model.addAll(waterModels.get(i)); 3292 structure.addModel(model); 3293 } 3294 3295 3296 } 3297 3298 /** 3299 * Links the Sites in the siteMap to the Groups in the Structure via the 
3300 * siteToResidueMap ResidueNumber. 3301 * @author Jules Jacobsen 3302 * @return 3303 */ 3304 private void linkSitesToGroups() { 3305 3306 //System.out.println("LINK SITES TO GROUPS:" + siteToResidueMap.keySet().size()); 3307 3308 //link the map of siteIds : <ResidueNumber> with the sites by using ResidueNumber to get the correct group back. 3309 //the return list 3310 3311 if ( siteMap == null || siteToResidueMap == null){ 3312 logger.info("Sites can not be linked to residues!"); 3313 3314 return; 3315 } 3316 3317 List<Site> sites = null; 3318 //check that there are chains with which to associate the groups 3319 if (structure.getChains().isEmpty()) { 3320 sites = new ArrayList<>(siteMap.values()); 3321 logger.info("No chains to link Site Groups with - Sites will not be present in the Structure"); 3322 return; 3323 } 3324 3325 //check that the keys in the siteMap and SiteToResidueMap are equal 3326 if (! siteMap.keySet().equals(siteToResidueMap.keySet())) { 3327 logger.info("Not all sites have been properly described in the PDB " + pdbId + " header - some Sites will not be present in the Structure"); 3328 logger.debug(siteMap.keySet() + " | " + siteToResidueMap.keySet()); 3329 //return; 3330 } 3331 3332 //so we have chains - associate the siteResidues-related groups with the ones 3333 //already in in the chains 3334 for (String key : siteMap.keySet()) { 3335 Site currentSite = siteMap.get(key); 3336 List<ResidueNumber> linkedGroups = siteToResidueMap.get(key); 3337 if ( linkedGroups == null) 3338 continue; 3339 for (ResidueNumber residueNumber : linkedGroups) { 3340 3341 String pdbCode = residueNumber.toString(); 3342 String chain = residueNumber.getChainName(); 3343 // System.out.println("chain: '" + chain + "'"); 3344 // String resNum = resNum.getSeqNum().toString(); 3345 // System.out.println("resNum: '" + resNum + "'"); 3346 3347 Group linkedGroup = null; 3348 try { 3349 //TODO: implement findGroup(ResidueNumber resNum) 3350 linkedGroup = 
structure.findGroup(chain, pdbCode); 3351 } catch (StructureException ex) { 3352 logger.info("Can't find group " + pdbCode + " in chain " + chain + " in order to link up SITE records (PDB ID " + pdbId +")"); 3353 continue; 3354 } 3355 3356 // System.out.println("Adding group: " + linkedGroup.getSeqNum() + " to site " + site.getSiteID()); 3357 currentSite.getGroups().add(linkedGroup); 3358 } 3359 } 3360 3361 //System.out.println("SITEMAP: " + siteMap); 3362 3363 sites = new ArrayList<>(siteMap.values()); 3364 structure.setSites(sites); 3365 //System.out.println("STRUCTURE SITES: " + structure.getSites().size()); 3366 // for (Site site : structure.getSites()) { 3367 // System.out.println(site); 3368 // } 3369 // System.out.println("Linked Site Groups with Chains"); 3370 3371 } 3372 3373 private void buildjournalArticle() { 3374 3375 logger.debug("building new JournalArticle"); 3376 // for (String line : journalLines) { 3377 // System.out.println(line); 3378 // } 3379 3380 this.journalArticle = new JournalArticle(); 3381 // JRNL AUTH M.HAMMEL,G.SFYROERA,D.RICKLIN,P.MAGOTTI, 3382 // JRNL AUTH 2 J.D.LAMBRIS,B.V.GEISBRECHT 3383 // JRNL TITL A STRUCTURAL BASIS FOR COMPLEMENT INHIBITION BY 3384 // JRNL TITL 2 STAPHYLOCOCCUS AUREUS. 3385 // JRNL REF NAT.IMMUNOL. V. 
8 430 2007 3386 // JRNL REFN ISSN 1529-2908 3387 // JRNL PMID 17351618 3388 // JRNL DOI 10.1038/NI1450 3389 StringBuffer auth = new StringBuffer(); 3390 StringBuffer titl = new StringBuffer(); 3391 StringBuffer edit = new StringBuffer(); 3392 StringBuffer ref = new StringBuffer(); 3393 StringBuffer publ = new StringBuffer(); 3394 StringBuffer refn = new StringBuffer(); 3395 StringBuffer pmid = new StringBuffer(); 3396 StringBuffer doi = new StringBuffer(); 3397 3398 for (String line : journalLines) { 3399 if ( line.length() < 19 ) { 3400 logger.info("can not process Journal line: " + line); 3401 continue; 3402 } 3403 // System.out.println("'" + line + "'"); 3404 String subField = line.substring(12, 16); 3405 // System.out.println("'" + subField + "'"); 3406 if ("AUTH".equals(subField)) { 3407 auth.append(line.substring(19, line.length()).trim()); 3408 3409 logger.debug("AUTH '{}'", auth.toString()); 3410 3411 } 3412 if ("TITL".equals(subField)) { 3413 //add a space to the end of a line so that when wrapped the 3414 //words on the join won't be concatenated 3415 titl.append(line.substring(19, line.length()).trim()).append(" "); 3416 3417 logger.debug("TITL '{}'", titl.toString()); 3418 3419 } 3420 if ("EDIT".equals(subField)) { 3421 edit.append(line.substring(19, line.length()).trim()); 3422 3423 logger.debug("EDIT '{}'", edit.toString()); 3424 3425 } 3426 // JRNL REF NAT.IMMUNOL. V. 
8 430 2007 3427 if ("REF ".equals(subField)) { 3428 ref.append(line.substring(19, line.length()).trim()).append(" "); 3429 3430 logger.debug("REF '{}'", ref.toString()); 3431 3432 } 3433 if ("PUBL".equals(subField)) { 3434 publ.append(line.substring(19, line.length()).trim()).append(" "); 3435 3436 logger.debug("PUBL '{}'", publ.toString()); 3437 3438 } 3439 // JRNL REFN ISSN 1529-2908 3440 if ("REFN".equals(subField)) { 3441 if ( line.length() < 35 ) { 3442 logger.info("can not process Journal REFN line: " + line); 3443 continue; 3444 } 3445 refn.append(line.substring(35, line.length()).trim()); 3446 3447 logger.debug("REFN '{}'", refn.toString()); 3448 3449 } 3450 // JRNL PMID 17351618 3451 if ("PMID".equals(subField)) { 3452 pmid.append(line.substring(19, line.length()).trim()); 3453 3454 logger.debug("PMID '{}'", pmid.toString()); 3455 3456 } 3457 // JRNL DOI 10.1038/NI1450 3458 if ("DOI ".equals(subField)) { 3459 doi.append(line.substring(19, line.length()).trim()); 3460 3461 logger.debug("DOI '{}'", doi.toString()); 3462 3463 } 3464 } 3465 3466 //now set the parts of the JournalArticle 3467 journalArticle.setAuthorList(authorBuilder(auth.toString())); 3468 journalArticle.setEditorList(authorBuilder(edit.toString())); 3469 journalArticle.setRef(ref.toString()); 3470 JournalParser journalParser = new JournalParser(ref.toString()); 3471 journalArticle.setJournalName(journalParser.getJournalName()); 3472 if (!"TO BE PUBLISHED".equals(journalArticle.getJournalName())) { 3473 journalArticle.setIsPublished(true); 3474 } 3475 journalArticle.setVolume(journalParser.getVolume()); 3476 journalArticle.setStartPage(journalParser.getStartPage()); 3477 journalArticle.setPublicationDate(journalParser.getPublicationDate()); 3478 journalArticle.setPublisher(publ.toString().trim()); 3479 journalArticle.setTitle(titl.toString().trim()); 3480 journalArticle.setRefn(refn.toString().trim()); 3481 journalArticle.setPmid(pmid.toString().trim()); 3482 
journalArticle.setDoi(doi.toString().trim()); 3483 3484 3485 logger.debug("Made JournalArticle:"); 3486 logger.debug(journalArticle.toString()); 3487 3488 } 3489 3490 //inner class to deal with all the journal info 3491 private class JournalParser { 3492 3493 private String journalName; 3494 private String volume; 3495 private String startPage; 3496 private int publicationDate; 3497 3498 3499 public JournalParser(String ref) { 3500 3501 logger.debug("JournalParser init '{}'", ref); 3502 3503 3504 if ("TO BE PUBLISHED ".equals(ref)) { 3505 journalName = ref.trim(); 3506 3507 logger.debug(String.format("JournalParser found journalString '%s'", journalName)); 3508 3509 return; 3510 } 3511 3512 if (ref.length() < 48) { 3513 logger.info("REF line too short - must be at least 48 characters to be valid for parsing."); 3514 journalName = ""; 3515 volume = ""; 3516 startPage = ""; 3517 publicationDate = 0; 3518 return; 3519 } 3520 //can be multi line: 3521 //REF PHILOS.TRANS.R.SOC.LONDON, V. 293 53 1981 3522 //REF 2 SER.B 3523 3524 //or 3525 3526 //REF GLYCOGEN PHOSPHORYLASE B: 1 1991 3527 //REF 2 DESCRIPTION OF THE PROTEIN 3528 //REF 3 STRUCTURE 3529 3530 //but usually single line 3531 //REF NUCLEIC ACIDS RES. 2009 3532 //REF MOL.CELL 2009 3533 //REF NAT.STRUCT.MOL.BIOL. V. 16 238 2009 3534 //REF ACTA CRYSTALLOGR.,SECT.F V. 65 199 2009 3535 //check if the date is present at the end of the line. 3536 // 09876543210987654321 3537 //'J.BIOL.CHEM. V. 280 23000 2005 ' 3538 //'J.AM.CHEM.SOC. V. 130 16011 2008 ' 3539 //'NAT.STRUCT.MOL.BIOL. V. 16 238 2009' 3540 String volumeInformation = ref.substring(30, 48); 3541 3542 logger.debug(String.format("Parsing volumeInformation: '%s'", volumeInformation)); 3543 3544 //volumeInformation: 'V. 
293 53 1981 ' 3545 // String dateString = ref.substring(ref.length() - 5 , ref.length() - 1).trim(); 3546 // String startPageString = ref.substring(ref.length() - 11 , ref.length() - 6).trim(); 3547 // String volumeString = ref.substring(ref.length() - 16 , ref.length() - 12).trim(); 3548 // String journalString = ref.substring(0 , ref.length() - 18).trim(); 3549 String dateString = volumeInformation.substring(volumeInformation.length() - 5 , volumeInformation.length() - 1).trim(); 3550 String startPageString = volumeInformation.substring(volumeInformation.length() - 11 , volumeInformation.length() - 6).trim(); 3551 String volumeString = volumeInformation.substring(volumeInformation.length() - 16 , volumeInformation.length() - 12).trim(); 3552 //for the journal string we need to remove the volume information which might be in the middle of the string (e.g. 1gpb, 3pfk) 3553 String journalString = ref.substring(0 , 29).trim() + " " + ref.substring(30, ref.length() - 1).replace(volumeInformation.trim(), "").trim(); 3554 journalString = journalString.trim(); 3555 // System.out.println("journalString: " + journalString); 3556 3557 logger.debug(String.format("JournalParser found volumeString '%s'", volumeString)); 3558 logger.debug(String.format("JournalParser found startPageString '%s'", startPageString)); 3559 logger.debug(String.format("JournalParser found dateString '%s'", dateString)); 3560 logger.debug(String.format("JournalParser found journalString '%s'", journalString)); 3561 3562 3563 if (!" ".equals(dateString)) { 3564 try { 3565 publicationDate = Integer.valueOf(dateString); 3566 } catch (NumberFormatException nfe) { 3567 logger.info(dateString + " is not a valid integer for a date in JRNL sub-section REF line 1"); 3568 } 3569 // if (DEBUG) { 3570 // System.out.println("JournalParser set date " + publicationDate); 3571 // } 3572 } 3573 3574 if (!" 
".equals(startPageString)) { 3575 startPage = startPageString; 3576 // if (DEBUG) { 3577 // System.out.println("JournalParser set startPage " + startPage); 3578 // } 3579 } 3580 3581 if (!" ".equals(volumeString)) { 3582 volume = volumeString; 3583 // if (DEBUG) { 3584 // System.out.println("JournalParser set volume " + volume); 3585 // } 3586 } 3587 3588 if (!" ".equals(journalString)) { 3589 journalName = journalString; 3590 3591 logger.debug("JournalParser set journalName {}", journalName); 3592 3593 } 3594 } 3595 3596 private String getJournalName() { 3597 return journalName; 3598 } 3599 3600 private int getPublicationDate() { 3601 return publicationDate; 3602 } 3603 3604 private String getStartPage() { 3605 return startPage; 3606 } 3607 3608 private String getVolume() { 3609 return volume; 3610 } 3611 } 3612 3613 private List<Author> authorBuilder(String authorString) { 3614 ArrayList<Author> authorList = new ArrayList<>(); 3615 3616 if ("".equals(authorString)) { 3617 return authorList; 3618 } 3619 3620 String[] authors = authorString.split(","); 3621 // if (DEBUG) { 3622 // for (int i = 0; i < authors.length; i++) { 3623 // String string = authors[i]; 3624 // System.out.println("authorBuilder author: '" + string + "'"); 3625 // } 3626 // } 3627 // AUTH SEATTLE STRUCTURAL GENOMICS CENTER FOR INFECTIOUS 3628 // AUTH 2 DISEASE (SSGCID) 3629 // or 3630 // AUTH E.DOBROVETSKY,A.DONG,A.SEITOVA,B.DUNCAN,L.CROMBET, 3631 // AUTH 2 M.SUNDSTROM,C.H.ARROWSMITH,A.M.EDWARDS,C.BOUNTRA, 3632 // AUTH 3 A.BOCHKAREV,D.COSSAR, 3633 // AUTH 4 STRUCTURAL GENOMICS CONSORTIUM (SGC) 3634 // or 3635 // AUTH T.-C.MOU,S.R.SPRANG,N.MASADA,D.M.F.COOPER 3636 if (authors.length == 1) { 3637 //only one element means it's a consortium only 3638 Author author = new Author(); 3639 author.setSurname(authors[0]); 3640 3641 logger.debug("Set consortium author name {}", author.getSurname()); 3642 3643 authorList.add(author); 3644 } else { 3645 for (int i = 0; i < authors.length; i++) { 3646 String 
authorFullName = authors[i]; 3647 3648 logger.debug("Building author {}", authorFullName); 3649 3650 Author author = new Author(); 3651 String regex = "\\."; 3652 String[] authorNames = authorFullName.split(regex); 3653 // if (DEBUG) { 3654 // System.out.println("authorNames size " + authorNames.length); 3655 // for (int j = 0; j < authorNames.length; j++) { 3656 // String name = authorNames[j]; 3657 // System.out.println("split authName '" + name + "'"); 3658 // 3659 // } 3660 // } 3661 if (authorNames.length == 0) { 3662 author.setSurname(authorFullName); 3663 3664 logger.debug("Unable to split using '" + regex + "' Setting whole name " + author.getSurname()); 3665 3666 } 3667 //again there might be a consortium name so there may be no elements 3668 else if (authorNames.length == 1) { 3669 author.setSurname(authorNames[0]); 3670 3671 logger.debug("Set consortium author name in multiple author block {}", author.getSurname 3672 ()); 3673 3674 } else { 3675 String initials = ""; 3676 for (int j = 0; j < authorNames.length - 1; j++) { 3677 String initial = authorNames[j]; 3678 // if (DEBUG) { 3679 // System.out.println("adding initial '" + initial + "'"); 3680 // } 3681 //build the initials back up again 3682 initials += initial + "."; 3683 } 3684 3685 logger.debug("built initials '{}'", initials); 3686 3687 author.setInitials(initials); 3688 //surname is always last 3689 int lastName = authorNames.length - 1; 3690 String surname = authorNames[lastName]; 3691 3692 logger.debug("built author surname {}", surname); 3693 3694 author.setSurname(surname); 3695 3696 } 3697 authorList.add(author); 3698 } 3699 } 3700 return authorList; 3701 } 3702 3703 public void setFileParsingParameters(FileParsingParameters params) 3704 { 3705 this.params= params; 3706 3707 // set the correct max values for parsing... 
3708 loadMaxAtoms = params.getMaxAtoms(); 3709 atomCAThreshold = params.getAtomCaThreshold(); 3710 3711 3712 } 3713 3714 public FileParsingParameters getFileParsingParameters(){ 3715 return params; 3716 } 3717 3718 3719}