001/* 002 * 003 * This code may be freely distributed and modified under the 004 * terms of the GNU Lesser General Public Licence. This should 005 * be distributed with the code. If you do not have a copy, 006 * see: 007 * 008 * http://www.gnu.org/copyleft/lesser.html 009 * 010 * Copyright for this code is held jointly by the individual 011 * authors. These should be listed in @author doc comments. 012 * 013 * For more information on the BioJava project and its aims, 014 * or to join the biojava-l mailing list, visit the home page 015 * at: 016 * 017 * http://www.biojava.org/ 018 * 019 * Created on 16.03.2004 020 * 021 */ 022package org.biojava.nbio.structure.io; 023 024import static java.lang.Math.min; 025 026import java.io.BufferedReader; 027import java.io.IOException; 028import java.io.InputStream; 029import java.io.InputStreamReader; 030import java.text.DateFormat; 031import java.text.ParseException; 032import java.text.SimpleDateFormat; 033import java.util.ArrayList; 034import java.util.Arrays; 035import java.util.Date; 036import java.util.HashMap; 037import java.util.Iterator; 038import java.util.LinkedHashMap; 039import java.util.List; 040import java.util.Locale; 041import java.util.Map; 042import java.util.StringTokenizer; 043import java.util.regex.Matcher; 044import java.util.regex.Pattern; 045 046import javax.vecmath.Matrix4d; 047 048import org.biojava.nbio.structure.AminoAcid; 049import org.biojava.nbio.structure.AminoAcidImpl; 050import org.biojava.nbio.structure.Atom; 051import org.biojava.nbio.structure.AtomImpl; 052import org.biojava.nbio.structure.Author; 053import org.biojava.nbio.structure.Chain; 054import org.biojava.nbio.structure.ChainImpl; 055import org.biojava.nbio.structure.DBRef; 056import org.biojava.nbio.structure.Element; 057import org.biojava.nbio.structure.EntityInfo; 058import org.biojava.nbio.structure.EntityType; 059import org.biojava.nbio.structure.Group; 060import org.biojava.nbio.structure.GroupIterator; 061import 
org.biojava.nbio.structure.HetatomImpl; 062import org.biojava.nbio.structure.JournalArticle; 063import org.biojava.nbio.structure.NucleotideImpl; 064import org.biojava.nbio.structure.PDBCrystallographicInfo; 065import org.biojava.nbio.structure.PDBHeader; 066import org.biojava.nbio.structure.PdbId; 067import org.biojava.nbio.structure.ResidueNumber; 068import org.biojava.nbio.structure.Site; 069import org.biojava.nbio.structure.Structure; 070import org.biojava.nbio.structure.StructureException; 071import org.biojava.nbio.structure.StructureImpl; 072import org.biojava.nbio.structure.StructureTools; 073import org.biojava.nbio.structure.chem.ChemCompAtom; 074import org.biojava.nbio.structure.chem.ChemCompGroupFactory; 075import org.biojava.nbio.structure.io.util.PDBTemporaryStorageUtils.LinkRecord; 076import org.biojava.nbio.structure.secstruc.SecStrucInfo; 077import org.biojava.nbio.structure.secstruc.SecStrucType; 078import org.biojava.nbio.structure.xtal.CrystalCell; 079import org.biojava.nbio.structure.xtal.SpaceGroup; 080import org.biojava.nbio.structure.xtal.SymoplibParser; 081import org.slf4j.Logger; 082import org.slf4j.LoggerFactory; 083 084 085/** 086 * This class implements the actual PDB file parsing. Do not access it directly, but 087 * via the PDBFileReader class. 088 * 089 * <h2>Parsing</h2> 090 * 091 * During the PDB file parsing several Flags can be set. See the {@link #setFileParsingParameters(FileParsingParameters)} methods. 092 * 093 * 094 * <p> 095 * To prevent excessive memory usage for large PDB files, there is the ATOM_CA_THRESHOLD. 096 * If more Atoms than this threshold are being parsed in a PDB file, the parser will automatically 097 * switch to a C-alpha only representation. 098 * 099 * <p> 100 * The result of the parsing of the PDB file is a new {@link Structure} object. 
101 * 102 * <p> 103 * For more documentation on how to work with the Structure API please 104 * see <a href="http://biojava.org/wiki/BioJava:CookBook#Protein_Structure" target="_top"> 105 * http://biojava.org/wiki/BioJava:CookBook#Protein_Structure</a> 106 * 107 * 108 * 109 * 110 * <h2>Example</h2> 111 * <p> 112 * Q: How can I get a Structure object from a PDB file? 113 * <p> 114 * A: 115 * <pre> 116 * public {@link Structure} loadStructure(String pathToPDBFile){ 117 * // The PDBFileParser is wrapped by the PDBFileReader 118 * {@link PDBFileReader} pdbreader = new {@link PDBFileReader}(); 119 * 120 * {@link Structure} structure = null; 121 * try{ 122 * structure = pdbreader.getStructure(pathToPDBFile); 123 * System.out.println(structure); 124 * } catch (IOException e) { 125 * e.printStackTrace(); 126 * } 127 * return structure; 128 * } 129 * </pre> 130 * 131 * 132 * @author Andreas Prlic 133 * @author Jules Jacobsen 134 * @author Jose Duarte 135 * @since 1.4 136 */ 137public class PDBFileParser { 138 139 140 141 private static final Logger logger = LoggerFactory.getLogger(PDBFileParser.class); 142 143 // for printing 144 private static final String NEWLINE = System.getProperty("line.separator"); 145 146 147 // required for parsing: 148 private String pdbId; //the actual id of the entry 149 private Structure structure; 150 private List<List<Chain>> allModels; // a temp data structure to keep all models 151 private List<Chain> currentModel; // contains the ATOM records for each model 152 private Chain currentChain; 153 private Group currentGroup; 154 155 private List<Chain> seqResChains; // contains all the chains for the SEQRES records 156 //we're going to work on the assumption that the files are current - 157 //if the pdb_HEADER_Handler detects a legacy format, this will be changed to true. 
158 //if true then lines will be truncated at 72 characters in certain cases 159 //(pdb_COMPOUND_handler for example) 160 private boolean isLegacyFormat = false; 161 162 private boolean blankChainIdsPresent = false; 163 164 // for re-creating the biological assembly 165 private PDBBioAssemblyParser bioAssemblyParser = null; 166 167 private PDBHeader pdbHeader; 168 private PDBCrystallographicInfo crystallographicInfo; 169 private JournalArticle journalArticle; 170 private List<Map<String, Integer>> connects ; 171 private List<Map<String,String>> helixList; 172 private List<Map<String,String>> strandList; 173 private List<Map<String,String>> turnList; 174 175 private int lengthCheck ; 176 177 private boolean isLastCompndLine = false; 178 private boolean isLastSourceLine = false; 179 private EntityInfo current_compound; 180 private List<EntityInfo> entities = new ArrayList<EntityInfo>(); 181 private HashMap<Integer,List<String>> compoundMolIds2chainIds = new HashMap<Integer, List<String>>(); 182 private List<String> compndLines = new ArrayList<String>(); 183 private List<String> sourceLines = new ArrayList<String>(); 184 private List<String> journalLines = new ArrayList<String>(); 185 private List<String> keywordsLines = new ArrayList<String>(); 186 private List<DBRef> dbrefs; 187 private Map<String, Site> siteMap = new LinkedHashMap<String, Site>(); 188 private Map<String, List<ResidueNumber>> siteToResidueMap = new LinkedHashMap<String, List<ResidueNumber>>(); 189 190 private List<SSBondImpl> ssbonds = new ArrayList<>(); 191 192 // for storing LINK until we have all the atoms parsed 193 private List<LinkRecord> linkRecords; 194 195 private Matrix4d currentNcsOp; 196 private List<Matrix4d> ncsOperators; 197 198 // for parsing COMPOUND and SOURCE Header lines 199 private int prevMolId; 200 private String previousContinuationField; 201 private String continuationField; 202 private String continuationString; 203 204 private DateFormat dateFormat; 205 206 // for rfree 
parsing 207 private float rfreeStandardLine = -1; 208 private float rfreeNoCutoffLine = -1; 209 210 private static final List<String> compndFieldValues = new ArrayList<String>( 211 Arrays.asList( 212 "MOL_ID:", "MOLECULE:", "CHAIN:", "SYNONYM:", 213 "EC:", "FRAGMENT:", "ENGINEERED:", "MUTATION:", 214 "BIOLOGICAL_UNIT:", "OTHER_DETAILS:" 215 )); 216 217 218 private static final List<String> ignoreCompndFieldValues = new ArrayList<String>( 219 Arrays.asList( 220 "HETEROGEN:","ENGINEEREED:","FRAGMENT,", 221 "MUTANT:","SYNTHETIC:" 222 )); 223 // ENGINEEREED in pdb219d 224 225 private static final List<String> sourceFieldValues = new ArrayList<String>( 226 Arrays.asList("ENGINEERED:", "MOL_ID:", "SYNTHETIC:", "FRAGMENT:", 227 "ORGANISM_SCIENTIFIC:", "ORGANISM_COMMON:", 228 "ORGANISM_TAXID:","STRAIN:", 229 "VARIANT:", "CELL_LINE:", "ATCC:", "ORGAN:", "TISSUE:", 230 "CELL:", "ORGANELLE:", "SECRETION:", "GENE:", 231 "CELLULAR_LOCATION:", "EXPRESSION_SYSTEM:", 232 "EXPRESSION_SYSTEM_TAXID:", 233 "EXPRESSION_SYSTEM_STRAIN:", "EXPRESSION_SYSTEM_VARIANT:", 234 "EXPRESSION_SYSTEM_CELL_LINE:", 235 "EXPRESSION_SYSTEM_ATCC_NUMBER:", 236 "EXPRESSION_SYSTEM_ORGAN:", "EXPRESSION_SYSTEM_TISSUE:", 237 "EXPRESSION_SYSTEM_CELL:", "EXPRESSION_SYSTEM_ORGANELLE:", 238 "EXPRESSION_SYSTEM_CELLULAR_LOCATION:", 239 "EXPRESSION_SYSTEM_VECTOR_TYPE:", 240 "EXPRESSION_SYSTEM_VECTOR:", "EXPRESSION_SYSTEM_PLASMID:", 241 "EXPRESSION_SYSTEM_GENE:", "OTHER_DETAILS:")); 242 243 private int atomCount; 244 245 // parsing options: 246 247 private int atomCAThreshold ; 248 249 private int loadMaxAtoms; 250 251 private boolean atomOverflow; 252 253 /** flag to tell parser to only read Calpha coordinates **/ 254 private boolean parseCAonly; 255 256 257 private FileParsingParameters params; 258 259 private boolean startOfMolecule; 260 private boolean startOfModel; 261 262 public PDBFileParser() { 263 params = new FileParsingParameters(); 264 265 allModels = new ArrayList<>(); 266 structure = null ; 267 
currentModel = null;
currentChain = null;
currentGroup = null;
// we initialise to true since at the beginning of the file we are always starting a new molecule
startOfMolecule = true;
startOfModel = true;


pdbHeader = new PDBHeader();
crystallographicInfo = new PDBCrystallographicInfo();
connects = new ArrayList<Map<String,Integer>>() ;


helixList = new ArrayList<Map<String,String>>();
strandList = new ArrayList<Map<String,String>>();
turnList = new ArrayList<Map<String,String>>();
current_compound = null;
dbrefs = new ArrayList<DBRef>();
siteMap = null;
dateFormat = new SimpleDateFormat("dd-MMM-yy", Locale.US);
atomCount = 0;
atomOverflow = false;
parseCAonly = false;

// this SHOULD not be done
// DONOT:setFileParsingParameters(params);
// set the correct max values for parsing...
loadMaxAtoms = params.getMaxAtoms();
atomCAThreshold = params.getAtomCaThreshold();

linkRecords = new ArrayList<LinkRecord>();

blankChainIdsPresent = false;

}

/**
 * Creates the (still empty) group for a residue: a Hetatom, a Nucleotide or an AminoAcid.
 * <p>
 * The chemical component dictionary is consulted first; a group obtained from it is
 * returned as-is when its chemical component is not empty. Otherwise the type is chosen
 * from the one- and three-letter codes.
 *
 * @param recordName the PDB record name; not used by this method
 * @param aminoCode1 one-letter code of the residue, may be null
 * @param aminoCode3 three-letter code of the residue
 * @return the new group, never null
 */
private Group getNewGroup(String recordName,Character aminoCode1, String aminoCode3) {

    Group fromDictionary = ChemCompGroupFactory.getGroupFromChemCompDictionary(aminoCode3);
    if (fromDictionary != null && !fromDictionary.getChemComp().isEmpty()) {
        return fromDictionary;
    }

    // no usable one-letter code: treat the residue as a hetero group
    if (aminoCode1 == null || StructureTools.UNKNOWN_GROUP_LABEL == aminoCode1) {
        return new HetatomImpl();
    }

    if (StructureTools.isNucleotide(aminoCode3)) {
        return new NucleotideImpl();
    }

    AminoAcidImpl aminoAcid = new AminoAcidImpl();
    aminoAcid.setAminoType(aminoCode1);
    return aminoAcid;
}



// Handler methods to deal with PDB file records properly.
333 /** 334 Handler for 335 HEADER Record Format 336 <pre> 337 COLUMNS DATA TYPE FIELD DEFINITION 338 ---------------------------------------------------------------------------------- 339 1 - 6 Record name "HEADER" 340 11 - 50 String(40) classification Classifies the molecule(s) 341 51 - 59 Date depDate Deposition date. This is the date 342 the coordinates were received by 343 the PDB 344 63 - 66 IDcode idCode This identifier is unique within PDB 345 </pre> 346 */ 347 private void pdb_HEADER_Handler(String line) { 348 349 String classification = null; 350 String deposition_date = null; 351 String pdbCode = null; 352 353 int len = line.trim().length(); 354 if(len > 10) { 355 classification = line.substring (10, min(len,50)).trim() ; 356 pdbHeader.setClassification(classification); 357 } 358 if(len > 50) { 359 deposition_date = line.substring (50, min(len,59)).trim() ; 360 try { 361 Date dep = dateFormat.parse(deposition_date); 362 pdbHeader.setDepDate(dep); 363 364 } catch (ParseException e){ 365 logger.info("Could not parse deposition date string '"+deposition_date+"'. Will continue without deposition date"); 366 } 367 } 368 if(len > 62) { 369 pdbCode = line.substring (62, min(len,66)).trim() ; 370 pdbId = pdbCode; 371 372 logger.debug("Parsing entry " + pdbId); 373 374 375 PdbId pdbIdToSet; 376 try { 377 pdbIdToSet = new PdbId(pdbCode); 378 } catch (IllegalArgumentException e) { 379 logger.info("Malformed (or null) PDB ID {}. setting PdbId to null", pdbCode); 380 pdbIdToSet = null; 381 } 382 structure.setPdbId(pdbIdToSet); 383 pdbHeader.setPdbId(pdbIdToSet); 384 } 385 386 //*really* old files (you'll need to hunt to find these as they 387 //should have been remediated) have headers like below. 
Plus the 388 //pdbId at positions 72-76 is present in every line 389 390 //HEADER PROTEINASE INHIBITOR (TRYPSIN) 05-OCT-84 5PTI 5PTI 3 391 //HEADER TRANSFERASE (ACYLTRANSFERASE) 02-SEP-92 1LAC 1LAC 2 392 if (len > 66) { 393 if (pdbId.equals(line.substring (72, 76))){ 394 isLegacyFormat = true; 395 logger.warn(pdbId + " is a LEGACY entry - this will most likely not parse correctly."); 396 } 397 } 398 399 } 400 401 402 /** 403 * Parses the following record: 404 * <pre> 405 * COLUMNS DATA TYPE FIELD DEFINITION 406 * ------------------------------------------------------------------------------------ 407 * 1 - 6 Record name "AUTHOR" 408 * 9 - 10 Continuation continuation Allows concatenation of multiple records. 409 * 11 - 79 List authorList List of the author names, separated 410 * by commas. 411 * 412 * </pre> 413 * @param line 414 */ 415 private void pdb_AUTHOR_Handler(String line) { 416 417 String authors = line.substring(10).trim(); 418 419 String auth = pdbHeader.getAuthors(); 420 if (auth == null){ 421 pdbHeader.setAuthors(authors); 422 } else { 423 auth += authors; 424 pdbHeader.setAuthors(auth); 425 } 426 427 } 428 429 430 431 /** 432 * Parses the following record: 433 * 434 * <pre> 435 * COLUMNS DATA TYPE FIELD DEFINITION 436 * -------------------------------------------------------------------- 437 * 1 - 6 Record name "HELIX " 438 * 8 - 10 Integer serNum Serial number of the helix. 439 * This starts at 1 and increases 440 * incrementally. 441 * 12 - 14 LString(3) helixID Helix identifier. In addition 442 * to a serial number, each helix is 443 * given an alphanumeric character 444 * helix identifier. 445 * 16 - 18 Residue name initResName Name of the initial residue. 446 * 20 Character initChainID Chain identifier for the chain 447 * containing this helix. 448 * 22 - 25 Integer initSeqNum Sequence number of the initial 449 * residue. 450 * 26 AChar initICode Insertion code of the initial 451 * residue. 
452 * 28 - 30 Residue name endResName Name of the terminal residue of 453 * the helix. 454 * 32 Character endChainID Chain identifier for the chain 455 * containing this helix. 456 * 34 - 37 Integer endSeqNum Sequence number of the terminal 457 * residue. 458 * 38 AChar endICode Insertion code of the terminal 459 * residue. 460 * 39 - 40 Integer helixClass Helix class (see below). 461 * 41 - 70 String comment Comment about this helix. 462 * 72 - 76 Integer length Length of this helix. 463 * </pre> 464 */ 465 private void pdb_HELIX_Handler(String line){ 466 467 if (params.isHeaderOnly()) return; 468 469 if (line.length()<38) { 470 logger.info("HELIX line has length under 38. Ignoring it."); 471 return; 472 } 473 474 String initResName = line.substring(15,18).trim(); 475 String initChainId = line.substring(19,20); 476 String initSeqNum = line.substring(21,25).trim(); 477 String initICode = line.substring(25,26); 478 String endResName = line.substring(27,30).trim(); 479 String endChainId = line.substring(31,32); 480 String endSeqNum = line.substring(33,37).trim(); 481 String endICode = line.substring(37,38); 482 483 //System.out.println(initResName + " " + initChainId + " " + initSeqNum + " " + initICode + " " + 484 // endResName + " " + endChainId + " " + endSeqNum + " " + endICode); 485 486 Map<String,String> m = new HashMap<String,String>(); 487 488 m.put("initResName",initResName); 489 m.put("initChainId", initChainId); 490 m.put("initSeqNum", initSeqNum); 491 m.put("initICode", initICode); 492 m.put("endResName", endResName); 493 m.put("endChainId", endChainId); 494 m.put("endSeqNum",endSeqNum); 495 m.put("endICode",endICode); 496 497 helixList.add(m); 498 499 } 500 501 /** 502 * Handler for 503 * <pre> 504 * COLUMNS DATA TYPE FIELD DEFINITION 505 * -------------------------------------------------------------- 506 * 1 - 6 Record name "SHEET " 507 * 8 - 10 Integer strand Strand number which starts at 1 508 * for each strand within a sheet 509 * and increases by 
one. 510 * 12 - 14 LString(3) sheetID Sheet identifier. 511 * 15 - 16 Integer numStrands Number of strands in sheet. 512 * 18 - 20 Residue name initResName Residue name of initial residue. 513 * 22 Character initChainID Chain identifier of initial 514 * residue in strand. 515 * 23 - 26 Integer initSeqNum Sequence number of initial 516 * residue in strand. 517 * 27 AChar initICode Insertion code of initial residue 518 * in strand. 519 * 29 - 31 Residue name endResName Residue name of terminal residue. 520 * 33 Character endChainID Chain identifier of terminal 521 * residue. 522 * 34 - 37 Integer endSeqNum Sequence number of terminal 523 * residue. 524 * 38 AChar endICode Insertion code of terminal 525 * residue. 526 * 39 - 40 Integer sense Sense of strand with respect to 527 * previous strand in the sheet. 0 528 * if first strand, 1 if parallel, 529 * -1 if anti-parallel. 530 * 42 - 45 Atom curAtom Registration. Atom name in 531 * current strand. 532 * 46 - 48 Residue name curResName Registration. Residue name in 533 * current strand. 534 * 50 Character curChainId Registration. Chain identifier in 535 * current strand. 536 * 51 - 54 Integer curResSeq Registration. Residue sequence 537 * number in current strand. 538 * 55 AChar curICode Registration. Insertion code in 539 * current strand. 540 * 57 - 60 Atom prevAtom Registration. Atom name in 541 * previous strand. 542 * 61 - 63 Residue name prevResName Registration. Residue name in 543 * previous strand. 544 * 65 Character prevChainId Registration. Chain identifier in 545 * previous strand. 546 * 66 - 69 Integer prevResSeq Registration. Residue sequence 547 * number in previous strand. 548 * 70 AChar prevICode Registration. Insertion code in 549 * previous strand. 550 * </pre> 551 */ 552 private void pdb_SHEET_Handler( String line){ 553 554 if (params.isHeaderOnly()) return; 555 556 if (line.length()<38) { 557 logger.info("SHEET line has length under 38. 
Ignoring it."); 558 return; 559 } 560 561 String initResName = line.substring(17,20).trim(); 562 String initChainId = line.substring(21,22); 563 String initSeqNum = line.substring(22,26).trim(); 564 String initICode = line.substring(26,27); 565 String endResName = line.substring(28,31).trim(); 566 String endChainId = line.substring(32,33); 567 String endSeqNum = line.substring(33,37).trim(); 568 String endICode = line.substring(37,38); 569 570 //System.out.println(initResName + " " + initChainId + " " + initSeqNum + " " + initICode + " " + 571 // endResName + " " + endChainId + " " + endSeqNum + " " + endICode); 572 573 Map<String,String> m = new HashMap<String,String>(); 574 575 m.put("initResName",initResName); 576 m.put("initChainId", initChainId); 577 m.put("initSeqNum", initSeqNum); 578 m.put("initICode", initICode); 579 m.put("endResName", endResName); 580 m.put("endChainId", endChainId); 581 m.put("endSeqNum",endSeqNum); 582 m.put("endICode",endICode); 583 584 strandList.add(m); 585 } 586 587 588 /** 589 * Handler for TURN lines 590 * <pre> 591 * COLUMNS DATA TYPE FIELD DEFINITION 592 * -------------------------------------------------------------------- 593 * 1 - 6 Record name "TURN " 594 * 8 - 10 Integer seq Turn number; starts with 1 and 595 * increments by one. 596 * 12 - 14 LString(3) turnId Turn identifier 597 * 16 - 18 Residue name initResName Residue name of initial residue in 598 * turn. 599 * 20 Character initChainId Chain identifier for the chain 600 * containing this turn. 601 * 21 - 24 Integer initSeqNum Sequence number of initial residue 602 * in turn. 603 * 25 AChar initICode Insertion code of initial residue 604 * in turn. 605 * 27 - 29 Residue name endResName Residue name of terminal residue 606 * of turn. 607 * 31 Character endChainId Chain identifier for the chain 608 * containing this turn. 609 * 32 - 35 Integer endSeqNum Sequence number of terminal 610 * residue of turn. 
611 * 36 AChar endICode Insertion code of terminal residue 612 * of turn. 613 * 41 - 70 String comment Associated comment. 614 * </pre> 615 * @param line 616 */ 617 private void pdb_TURN_Handler( String line){ 618 619 if (params.isHeaderOnly()) return; 620 621 if (line.length()<36) { 622 logger.info("TURN line has length under 36. Ignoring it."); 623 return; 624 } 625 626 String initResName = line.substring(15,18).trim(); 627 String initChainId = line.substring(19,20); 628 String initSeqNum = line.substring(20,24).trim(); 629 String initICode = line.substring(24,25); 630 String endResName = line.substring(26,29).trim(); 631 String endChainId = line.substring(30,31); 632 String endSeqNum = line.substring(31,35).trim(); 633 String endICode = line.substring(35,36); 634 635 //System.out.println(initResName + " " + initChainId + " " + initSeqNum + " " + initICode + " " + 636 // endResName + " " + endChainId + " " + endSeqNum + " " + endICode); 637 638 Map<String,String> m = new HashMap<String,String>(); 639 640 m.put("initResName",initResName); 641 m.put("initChainId", initChainId); 642 m.put("initSeqNum", initSeqNum); 643 m.put("initICode", initICode); 644 m.put("endResName", endResName); 645 m.put("endChainId", endChainId); 646 m.put("endSeqNum",endSeqNum); 647 m.put("endICode",endICode); 648 649 turnList.add(m); 650 } 651 652 /** 653 * Handler for 654 * REVDAT Record format: 655 * <pre> 656 * 657 * COLUMNS DATA TYPE FIELD DEFINITION 658 * ---------------------------------------------------------------------------------- 659 * 1 - 6 Record name "REVDAT" 660 * 8 - 10 Integer modNum Modification number. 661 * 11 - 12 Continuation continuation Allows concatenation of multiple 662 * records. 663 * 14 - 22 Date modDate Date of modification (or release for 664 * new entries). This is not repeated 665 * on continuation lines. 666 * 24 - 28 String(5) modId Identifies this particular 667 * modification. It links to the 668 * archive used internally by PDB. 
669 * This is not repeated on continuation 670 * lines. 671 * 32 Integer modType An integer identifying the type of 672 * modification. In case of revisions 673 * with more than one possible modType, 674 * the highest value applicable will be 675 * assigned. 676 * 40 - 45 LString(6) record Name of the modified record. 677 * 47 - 52 LString(6) record Name of the modified record. 678 * 54 - 59 LString(6) record Name of the modified record. 679 * 61 - 66 LString(6) record Name of the modified record. 680 * </pre> 681 */ 682 private void pdb_REVDAT_Handler(String line) { 683 684 // keep the first as latest modified date and the last as release date 685 Date modDate = pdbHeader.getModDate(); 686 687 if ( modDate==null || modDate.equals(new Date(0)) ) { 688 689 // modified date is still uninitialized 690 String modificationDate = line.substring (13, 22).trim() ; 691 692 try { 693 Date dep = dateFormat.parse(modificationDate); 694 pdbHeader.setModDate(dep); 695 pdbHeader.setRelDate(dep); 696 } catch (ParseException e){ 697 logger.info("Could not parse revision date string '"+modificationDate+"'. "); 698 } 699 700 } else { 701 702 // set as the release date 703 String releaseDate = line.substring (13, 22).trim() ; 704 705 try { 706 Date dep = dateFormat.parse(releaseDate); 707 pdbHeader.setRelDate(dep); 708 } catch (ParseException e){ 709 logger.info("Could not parse revision date string '"+releaseDate+"'. "); 710 } 711 } 712 } 713 714 /** 715 * Handler for 716 * SEQRES record format 717 * SEQRES records contain the amino acid or nucleic acid sequence of residues in each chain of the macromolecule that was studied. 718 * <p> 719 * Record Format: 720 * <p> 721 * <pre> 722 * COLUMNS DATA TYPE FIELD DEFINITION 723 * --------------------------------------------------------------------------------- 724 * 1 - 6 Record name "SEQRES" 725 * 9 - 10 Integer serNum Serial number of the SEQRES record 726 * for the current chain. Starts at 1 727 * and increments by one each line. 
728 * Reset to 1 for each chain. 729 * 12 Character chainID Chain identifier. This may be any 730 * single legal character, including a 731 * blank which is used if there is 732 * only one chain. 733 * 14 - 17 Integer numRes Number of residues in the chain. 734 * This value is repeated on every 735 * record. 736 * 20 - 22 Residue name resName Residue name. 737 * 24 - 26 Residue name resName Residue name. 738 * 28 - 30 Residue name resName Residue name. 739 * 32 - 34 Residue name resName Residue name. 740 * 36 - 38 Residue name resName Residue name. 741 * 40 - 42 Residue name resName Residue name. 742 * 44 - 46 Residue name resName Residue name. 743 * 48 - 50 Residue name resName Residue name. 744 * 52 - 54 Residue name resName Residue name. 745 * 56 - 58 Residue name resName Residue name. 746 * 60 - 62 Residue name resName Residue name. 747 * 64 - 66 Residue name resName Residue name. 748 * 68 - 70 Residue name resName Residue name. 749 * </pre> 750 * @author Jules Jacobsen 751 */ 752 private void pdb_SEQRES_Handler(String line) { 753 754 /* 755 * 1 2 3 4 5 6 7 756 * 1234567890123456789012345678901234567890123456789012345678901234567890 757 * SEQRES 1 A 376 LYS PRO VAL THR VAL LYS LEU VAL ASP SER GLN ALA THR 758 * SEQRES 1 A 21 GLY ILE VAL GLU GLN CYS CYS THR SER ILE CYS SER LEU 759 * SEQRES 2 A 21 TYR GLN LEU GLU ASN TYR CYS ASN 760 * SEQRES 1 B 30 PHE VAL ASN GLN HIS LEU CYS GLY SER HIS LEU VAL GLU 761 * SEQRES 2 B 30 ALA LEU TYR LEU VAL CYS GLY GLU ARG GLY PHE PHE TYR 762 * SEQRES 3 B 30 THR PRO LYS ALA 763 * SEQRES 1 C 21 GLY ILE VAL GLU GLN CYS CYS THR SER ILE CYS SER LEU 764 * SEQRES 2 C 21 TYR GLN LEU GLU ASN TYR CYS ASN 765 * SEQRES 1 D 30 PHE VAL ASN GLN HIS LEU CYS GLY SER HIS LEU VAL GLU 766 * SEQRES 2 D 30 ALA LEU TYR LEU VAL CYS GLY GLU ARG GLY PHE PHE TYR 767 * SEQRES 3 D 30 THR PRO LYS ALA 768 */ 769 770 String recordName = line.substring(0, 6).trim(); 771 String chainID = line.substring(11, 12); 772 String newLength = line.substring(13,17).trim(); 
773 String subSequence = line.substring(18); 774 775 if ( lengthCheck == -1 ){ 776 lengthCheck = Integer.parseInt(newLength); 777 } 778 779 StringTokenizer subSequenceResidues = new StringTokenizer(subSequence); 780 781 Character aminoCode1 = null; 782 if (! recordName.equals(AminoAcid.SEQRESRECORD)) { 783 // should not have been called 784 return; 785 } 786 787 currentChain = isKnownChain(chainID, seqResChains); 788 if ( currentChain == null) { 789 790 currentChain = new ChainImpl(); 791 currentChain.setId(chainID); 792 currentChain.setName(chainID); 793 794 } 795 796 while (subSequenceResidues.hasMoreTokens()) { 797 798 String threeLetter = subSequenceResidues.nextToken(); 799 800 aminoCode1 = StructureTools.get1LetterCode(threeLetter); 801 802 //if (aminoCode1 == null) { 803 // could be a nucleotide... 804 // but getNewGroup takes care of that and converts ATOM records with aminoCode1 == nnull to nucleotide... 805 //} 806 currentGroup = getNewGroup("ATOM", aminoCode1, threeLetter); 807 808 currentGroup.setPDBName(threeLetter); 809 810 if ( currentGroup instanceof AminoAcid){ 811 AminoAcid aa = (AminoAcid)currentGroup; 812 aa.setRecordType(AminoAcid.SEQRESRECORD); 813 } 814 // add the current resNum to the new chain. 815 currentChain.addGroup(currentGroup); 816 817 } 818 Chain test = isKnownChain(chainID, seqResChains); 819 820 if ( test == null) 821 seqResChains.add(currentChain); 822 823 if (currentGroup != null) 824 currentGroup.trimToSize(); 825 826 currentGroup = null; 827 currentChain = null; 828 829 // the current chain is finished! 
830 //if ( current_chain.getLength() != lengthCheck ){ 831 // System.err.println("the length of chain " + current_chain.getName() + "(" + 832 // current_chain.getLength() + ") does not match the expected " + lengthCheck); 833 //} 834 835 lengthCheck = Integer.parseInt(newLength); 836 837 } 838 839 840 841 /** 842 * Handler for 843 * TITLE Record Format 844 * <pre> 845 COLUMNS DATA TYPE FIELD DEFINITION 846 ---------------------------------------------------------------------------------- 847 1 - 6 Record name "TITLE " 848 9 - 10 Continuation continuation Allows concatenation of multiple 849 records. 850 11 - 70 String title Title of the experiment. 851 * </pre> 852 * 853 */ 854 private void pdb_TITLE_Handler(String line) { 855 String title; 856 if ( line.length() > 79) 857 title = line.substring(10,80).trim(); 858 else 859 title = line.substring(10,line.length()).trim(); 860 861 String t = pdbHeader.getTitle(); 862 if ( (t != null) && (! t.equals("")) ){ 863 if (t.endsWith("-")) 864 t += ""; // if last line ends with a hyphen then we don't add space 865 else 866 t += " "; 867 } 868 else t = ""; 869 870 t += title; 871 872 pdbHeader.setTitle(t); 873 } 874 875 /** 876 * JRNL handler. 877 * The JRNL record contains the primary literature citation that describes the experiment which resulted 878 * in the deposited coordinate set. There is at most one JRNL reference per entry. If there is no primary 879 * reference, then there is no JRNL reference. Other references are given in REMARK 1. 880 * 881 * Record Format 882 * <pre> 883 * COLUMNS DATA TYPE FIELD DEFINITION 884 * ----------------------------------------------------------------------- 885 * 1 - 6 Record name "JRNL " 886 * 887 * 13 - 70 LString text See Details below. 
888 * </pre> 889 */ 890 private void pdb_JRNL_Handler(String line) { 891 //add the strings to the journalLines 892 //the actual JournalArticle is then built when the whole entry is being 893 //finalized with triggerEndFileChecks() 894 //JRNL TITL NMR SOLUTION STRUCTURE OF RECOMBINANT TICK 1TAP 10 895 if (line.substring(line.length() - 8, line.length() - 4).equals(pdbId)) { 896 //trim off the trailing PDB id from legacy files. 897 //are we really trying to still cater for these museum pieces? 898 899 logger.debug("trimming legacy PDB id from end of JRNL section line"); 900 901 line = line.substring(0, line.length() - 8); 902 journalLines.add(line); 903 } else { 904 journalLines.add(line); 905 } 906 } 907 908 /** 909 * This should not be accessed directly, other than by </code>makeCompounds</code>. It still deals with the same 910 * lines in a similar manner but if not accessed from </code>makeCompounds</code> the last element will be 911 * missing. Don't say I didn't warn you. 912 * 913 * @param line 914 */ 915 private void pdb_COMPND_Handler(String line) { 916 917 logger.debug("previousContinuationField is " 918 + previousContinuationField); 919 logger.debug("current continuationField is " 920 + continuationField); 921 logger.debug("current continuationString is " 922 + continuationString); 923 logger.debug("current compound is " 924 + current_compound); 925 926 927 // In legacy PDB files the line ends with the PDB code and a serial number, chop those off! 
928 //format version 3.0 onwards will have 80 characters in a line 929 // if (line.length() > 72) { 930 if (isLegacyFormat) { 931 // if (DEBUG) { 932 // System.out.println("We have a legacy file - truncating line length to 71 characters:"); 933 // System.out.println(line); 934 // } 935 line = line.substring(0, 72); 936 } 937 938 line = line.substring(10, line.length()); 939 940 941 String[] fieldList = line.trim().split("\\s+"); 942 int fl = fieldList.length; 943 if (fl > 0) { 944 String field0 = fieldList[0]; 945 if (compndFieldValues.contains(field0)) { 946 continuationField = field0; 947 if (previousContinuationField.equals("")) { 948 previousContinuationField = continuationField; 949 } 950 } else if (field0.endsWith(";") && compndFieldValues.contains(field0.substring(0, field0.length()-1)) ) { 951 // the ':' character indicates the end of a field name and should be invalid as part the first data token 952 // e.g. obsolete file 1hhb has a malformed COMPND line that can only be caught with this kind of check 953 // UPDATE: There is no harm of having a ':' in the first data token. e.g. 3fdj contains a ':'. 954 // The intended case occurs only if the token is a key followed by a colon and a semicolon without spaces, e.g. "COMPND 2 MOLECULE:;" 955 logger.info("COMPND line does not follow the PDB 3.0 format. 
Note that COMPND parsing is not supported any longer in format 2.3 or earlier"); 956 return; 957 } 958 } else { 959 // the line will be added as data to the previous field 960 } 961 962 963 line = line.replace(continuationField, "").trim(); 964 965 StringTokenizer compndTokens = new StringTokenizer(line); 966 967 // System.out.println("PDBFileParser.pdb_COMPND_Handler: Tokenizing '" + line + "'"); 968 969 while (compndTokens.hasMoreTokens()) { 970 String token = compndTokens.nextToken(); 971 972 if (previousContinuationField.equals("")) { 973 previousContinuationField = continuationField; 974 } 975 976 if (previousContinuationField.equals(continuationField) 977 && compndFieldValues.contains(continuationField)) { 978 979 logger.debug("Still in field " + continuationField); 980 logger.debug("token = " + token); 981 982 continuationString = continuationString.concat(token + " "); 983 984 logger.debug("continuationString = " 985 + continuationString); 986 987 } 988 if (!continuationField.equals(previousContinuationField)) { 989 990 if (continuationString.equals("")) { 991 continuationString = token; 992 993 } else { 994 995 compndValueSetter(previousContinuationField, 996 continuationString); 997 previousContinuationField = continuationField; 998 continuationString = token + " "; 999 } 1000 } else if (ignoreCompndFieldValues.contains(token)) { 1001 // this field shall be ignored 1002 //continuationField = token; 1003 } 1004 } 1005 if (isLastCompndLine) { 1006 // final line in the section - finish off the compound 1007 // System.out.println("[pdb_COMPND_Handler] Final COMPND line - Finishing off final MolID header."); 1008 compndValueSetter(continuationField, continuationString); 1009 continuationString = ""; 1010 if (current_compound!=null) entities.add(current_compound); 1011 } 1012 } 1013 1014 /** 1015 * Set the value in the current molId object 1016 * @param field 1017 * @param value 1018 */ 1019 private void compndValueSetter(String field, String value) { 1020 1021 
value = value.trim().replace(";", ""); 1022 if (field.equals("MOL_ID:")) { 1023 1024 int i = -1; 1025 try { 1026 i = Integer.valueOf(value); 1027 } catch (NumberFormatException e){ 1028 logger.warn("Value '{}' does not look like a number, while trying to parse COMPND MOL_ID line.",value); 1029 } 1030 if (i>0 && prevMolId!=i) { 1031 1032 if (current_compound!=null) entities.add(current_compound); 1033 1034 logger.debug("Initialising new Compound with mol_id {}", i); 1035 1036 current_compound = new EntityInfo(); 1037 1038 current_compound.setMolId(i); 1039 1040 // we will set polymer for all defined compounds in PDB file (non-polymer compounds are not defined in header) - JD 2016-03-25 1041 current_compound.setType(EntityType.POLYMER); 1042 1043 prevMolId = i; 1044 } 1045 1046 } 1047 1048 // if for some reason (e.g. missing mol_id line) the current_compound is null we can't add anything to it, return 1049 if (current_compound==null) { 1050 return; 1051 } 1052 1053 if (field.equals("MOLECULE:")) { 1054 current_compound.setDescription(value); 1055 1056 } 1057 if (field.equals("CHAIN:")) { 1058 //System.out.println(value); 1059 StringTokenizer chainTokens = new StringTokenizer(value, ","); 1060 List<String> chains = new ArrayList<String>(); 1061 1062 while (chainTokens.hasMoreTokens()) { 1063 String chainID = chainTokens.nextToken().trim(); 1064 // NULL is used in old PDB files to represent empty chain DI 1065 if (chainID.equals("NULL")) 1066 chainID = " "; 1067 chains.add(chainID); 1068 } 1069 compoundMolIds2chainIds.put(current_compound.getMolId(),chains); 1070 1071 } 1072 if (field.equals("SYNONYM:")) { 1073 1074 StringTokenizer synonyms = new StringTokenizer(value, ","); 1075 List<String> names = new ArrayList<String>(); 1076 1077 while (synonyms.hasMoreTokens()) { 1078 names.add(synonyms.nextToken()); 1079 1080 current_compound.setSynonyms(names); 1081 } 1082 1083 } 1084 1085 if (field.equals("EC:")) { 1086 1087 StringTokenizer ecNumTokens = new 
StringTokenizer(value, ","); 1088 List<String> ecNums = new ArrayList<String>(); 1089 1090 while (ecNumTokens.hasMoreTokens()) { 1091 ecNums.add(ecNumTokens.nextToken()); 1092 1093 current_compound.setEcNums(ecNums); 1094 } 1095 1096 } 1097 if (field.equals("FRAGMENT:")) { 1098 1099 current_compound.setFragment(value); 1100 1101 } 1102 if (field.equals("ENGINEERED:")) { 1103 1104 current_compound.setEngineered(value); 1105 1106 } 1107 if (field.equals("MUTATION:")) { 1108 1109 current_compound.setMutation(value); 1110 1111 } 1112 if (field.equals("BIOLOGICAL_UNIT:")) { 1113 1114 current_compound.setBiologicalUnit(value); 1115 1116 } 1117 if (field.equals("OTHER_DETAILS:")) { 1118 1119 current_compound.setDetails(value); 1120 1121 } 1122 1123 } 1124 1125 1126 /** 1127 * Handler for 1128 * SOURCE Record format 1129 * 1130 * The SOURCE record specifies the biological and/or chemical source of each biological molecule in the entry. Sources are described by both the common name and the scientific name, e.g., genus and species. Strain and/or cell-line for immortalized cells are given when they help to uniquely identify the biological entity studied. 1131 * Record Format 1132 * <pre> 1133 * COLUMNS DATA TYPE FIELD DEFINITION 1134 * ------------------------------------------------------------------------------- 1135 * 1 - 6 Record name "SOURCE" 1136 * 9 - 10 Continuation continuation Allows concatenation of multiple records. 1137 * 11 - 70 Specification srcName Identifies the source of the macromolecule in 1138 * list a token: value format. 1139 * </pre> 1140 * @param line the line to be parsed 1141 */ 1142 private void pdb_SOURCE_Handler(String line) { 1143 // works in the same way as the pdb_COMPND_Handler. 
1144 String continuationNr = line.substring(9, 10).trim(); 1145 1146 1147 1148 logger.debug("current continuationNo is " 1149 + continuationNr); 1150 logger.debug("previousContinuationField is " 1151 + previousContinuationField); 1152 logger.debug("current continuationField is " 1153 + continuationField); 1154 logger.debug("current continuationString is " 1155 + continuationString); 1156 logger.debug("current compound is " 1157 + current_compound); 1158 1159 1160 // following the docs, the last valid character should be 79, chop off the rest 1161 if (line.length() > 79) { 1162 line = line.substring(0, 79); 1163 } 1164 1165 line = line.substring(10, line.length()); 1166 1167 logger.debug("LINE: >" + line + "<"); 1168 1169 String[] fieldList = line.split("\\s+"); 1170 1171 if (!fieldList[0].equals("") 1172 && sourceFieldValues.contains(fieldList[0])) { 1173 // System.out.println("[PDBFileParser.pdb_COMPND_Handler] Setting continuationField to '" + fieldList[0] + "'"); 1174 continuationField = fieldList[0]; 1175 if (previousContinuationField.equals("")) { 1176 previousContinuationField = continuationField; 1177 } 1178 1179 } else if ((fieldList.length > 1) && ( sourceFieldValues.contains(fieldList[1]))) { 1180 // System.out.println("[PDBFileParser.pdb_COMPND_Handler] Setting continuationField to '" + fieldList[1] + "'"); 1181 continuationField = fieldList[1]; 1182 if (previousContinuationField.equals("")) { 1183 previousContinuationField = continuationField; 1184 } 1185 1186 } else { 1187 if (continuationNr.equals("")) { 1188 1189 logger.debug("looks like an old PDB file"); 1190 1191 continuationField = "MOLECULE:"; 1192 if (previousContinuationField.equals("")) { 1193 previousContinuationField = continuationField; 1194 } 1195 } 1196 1197 } 1198 1199 line = line.replace(continuationField, "").trim(); 1200 1201 StringTokenizer compndTokens = new StringTokenizer(line); 1202 1203 // System.out.println("PDBFileParser.pdb_COMPND_Handler: Tokenizing '" + line + "'"); 1204 
1205 while (compndTokens.hasMoreTokens()) { 1206 String token = compndTokens.nextToken(); 1207 1208 if (previousContinuationField.equals("")) { 1209 // System.out.println("previousContinuationField is empty. Setting to : " + continuationField); 1210 previousContinuationField = continuationField; 1211 } 1212 1213 if (previousContinuationField.equals(continuationField) 1214 && sourceFieldValues.contains(continuationField)) { 1215 1216 logger.debug("Still in field " + continuationField); 1217 1218 continuationString = continuationString.concat(token + " "); 1219 1220 logger.debug("continuationString = " 1221 + continuationString); 1222 } 1223 if (!continuationField.equals(previousContinuationField)) { 1224 1225 if (continuationString.equals("")) { 1226 continuationString = token; 1227 1228 } else { 1229 1230 sourceValueSetter(previousContinuationField, 1231 continuationString); 1232 previousContinuationField = continuationField; 1233 continuationString = token + " "; 1234 } 1235 } else if (ignoreCompndFieldValues.contains(token)) { 1236 // this field shall be ignored 1237 //continuationField = token; 1238 } 1239 } 1240 if (isLastSourceLine) { 1241 // final line in the section - finish off the compound 1242 // System.out.println("[pdb_SOURCE_Handler] Final SOURCE line - Finishing off final MolID header."); 1243 sourceValueSetter(continuationField, continuationString); 1244 continuationString = ""; 1245 //compounds.add(current_compound); 1246 } 1247 1248 } 1249 1250 1251 /** 1252 * Set the value in the current molId object 1253 * 1254 * @param field 1255 * @param value 1256 */ 1257 private void sourceValueSetter(String field, String value) { 1258 1259 value = value.trim().replace(";", ""); 1260 // System.out.println("[sourceValueSetter] " + field); 1261 if (field.equals("MOL_ID:")) { 1262 1263 try { 1264 current_compound = entities.get(Integer.valueOf(value) - 1); 1265 } catch (NumberFormatException e){ 1266 logger.info("could not process SOURCE MOL_ID record 
correctly:" + e.getMessage()); 1267 return; 1268 } 1269 1270 1271 // System.out.println("[sourceValueSetter] Fetching compound " + value + " " + current_compound.getMolId()); 1272 1273 } 1274 if (field.equals("SYNTHETIC:")) { 1275 current_compound.setSynthetic(value); 1276 } else if (field.equals("FRAGMENT:")) { 1277 current_compound.setFragment(value); 1278 } else if (field.equals("ORGANISM_SCIENTIFIC:")) { 1279 current_compound.setOrganismScientific(value); 1280 } else if (field.equals("ORGANISM_TAXID:")) { 1281 current_compound.setOrganismTaxId(value); 1282 } else if (field.equals("ORGANISM_COMMON:")) { 1283 current_compound.setOrganismCommon(value); 1284 } else if (field.equals("STRAIN:")) { 1285 current_compound.setStrain(value); 1286 } else if (field.equals("VARIANT:")) { 1287 current_compound.setVariant(value); 1288 } else if (field.equals("CELL_LINE:")) { 1289 current_compound.setCellLine(value); 1290 } else if (field.equals("ATCC:")) { 1291 current_compound.setAtcc(value); 1292 } else if (field.equals("ORGAN:")) { 1293 current_compound.setOrgan(value); 1294 } else if (field.equals("TISSUE:")) { 1295 current_compound.setTissue(value); 1296 } else if (field.equals("CELL:")) { 1297 current_compound.setCell(value); 1298 } else if (field.equals("ORGANELLE:")) { 1299 current_compound.setOrganelle(value); 1300 } else if (field.equals("SECRETION:")) { 1301 current_compound.setSecretion(value); 1302 } else if (field.equals("GENE:")) { 1303 current_compound.setGene(value); 1304 } else if (field.equals("CELLULAR_LOCATION:")) { 1305 current_compound.setCellularLocation(value); 1306 } else if (field.equals("EXPRESSION_SYSTEM:")) { 1307 current_compound.setExpressionSystem(value); 1308 } else if (field.equals("EXPRESSION_SYSTEM_TAXID:")) { 1309 current_compound.setExpressionSystemTaxId(value); 1310 } else if (field.equals("EXPRESSION_SYSTEM_STRAIN:")) { 1311 current_compound.setExpressionSystemStrain(value); 1312 } else if (field.equals("EXPRESSION_SYSTEM_VARIANT:")) { 
1313 current_compound.setExpressionSystemVariant(value); 1314 } else if (field.equals("EXPRESSION_SYSTEM_CELL_LINE:")) { 1315 current_compound.setExpressionSystemCellLine(value); 1316 } else if (field.equals("EXPRESSION_SYSTEM_ATCC_NUMBER:")) { 1317 current_compound.setExpressionSystemAtccNumber(value); 1318 } else if (field.equals("EXPRESSION_SYSTEM_ORGAN:")) { 1319 current_compound.setExpressionSystemOrgan(value); 1320 } else if (field.equals("EXPRESSION_SYSTEM_TISSUE:")) { 1321 current_compound.setExpressionSystemTissue(value); 1322 } else if (field.equals("EXPRESSION_SYSTEM_CELL:")) { 1323 current_compound.setExpressionSystemCell(value); 1324 } else if (field.equals("EXPRESSION_SYSTEM_ORGANELLE:")) { 1325 current_compound.setExpressionSystemOrganelle(value); 1326 } else if (field.equals("EXPRESSION_SYSTEM_CELLULAR_LOCATION:")) { 1327 current_compound.setExpressionSystemCellularLocation(value); 1328 } else if (field.equals("EXPRESSION_SYSTEM_VECTOR_TYPE:")) { 1329 current_compound.setExpressionSystemVectorType(value); 1330 } else if (field.equals("EXPRESSION_SYSTEM_VECTOR:")) { 1331 current_compound.setExpressionSystemVector(value); 1332 } else if (field.equals("EXPRESSION_SYSTEM_PLASMID:")) { 1333 current_compound.setExpressionSystemPlasmid(value); 1334 } else if (field.equals("EXPRESSION_SYSTEM_GENE:")) { 1335 current_compound.setExpressionSystemGene(value); 1336 } else if (field.equals("OTHER_DETAILS:")) { 1337 current_compound.setExpressionSystemOtherDetails(value); 1338 } 1339 1340 } 1341 1342 /** 1343 * Handler for REMARK lines 1344 */ 1345 private void pdb_REMARK_Handler(String line) { 1346 1347 if ( line == null || line.length() < 11) 1348 return; 1349 1350 1351 if (line.startsWith("REMARK 800")) { 1352 pdb_REMARK_800_Handler(line); 1353 1354 } else if ( line.startsWith("REMARK 350")){ 1355 1356 if ( params.isParseBioAssembly()) { 1357 1358 if (bioAssemblyParser == null){ 1359 bioAssemblyParser = new PDBBioAssemblyParser(); 1360 } 1361 1362 
bioAssemblyParser.pdb_REMARK_350_Handler(line); 1363 } 1364 } else if (line.startsWith("REMARK 2")) { 1365 //REMARK 2 RESOLUTION. 1366 Pattern pR = Pattern.compile("^REMARK 2 RESOLUTION.\\s+(\\d+\\.\\d+)\\s+ANGSTROMS\\..*"); 1367 handleResolutionLine(line, pR); 1368 1369 // REMARK 3 (for R free) 1370 // note: if more than 1 value present (occurring in hybrid experimental technique entries, e.g. 3ins, 4n9m) 1371 // then last one encountered will be taken 1372 } else if (line.startsWith("REMARK 3 FREE R VALUE")) { 1373 1374 // Rfree annotation is not very consistent in PDB format, it varies depending on the software 1375 // Here we follow this strategy: 1376 // a) take the '(NO CUTOFF)' value if the only one available (shelx software, e.g. 1x7q) 1377 // b) don't take it if also a line without '(NO CUTOFF)' is present (CNX software, e.g. 3lak) 1378 1379 Pattern pR = Pattern.compile("^REMARK 3 FREE R VALUE\\s+(?:\\(NO CUTOFF\\))?\\s+:\\s+(\\d?\\.\\d+).*"); 1380 Matcher mR = pR.matcher(line); 1381 if (mR.matches()) { 1382 try { 1383 rfreeNoCutoffLine = Float.parseFloat(mR.group(1)); 1384 } catch (NumberFormatException e) { 1385 logger.info("Rfree value "+mR.group(1)+" does not look like a number, will ignore it"); 1386 } 1387 } 1388 pR = Pattern.compile("^REMARK 3 FREE R VALUE\\s+:\\s+(\\d?\\.\\d+).*"); 1389 mR = pR.matcher(line); 1390 if (mR.matches()) { 1391 try { 1392 rfreeStandardLine = Float.parseFloat(mR.group(1)); 1393 } catch (NumberFormatException e) { 1394 logger.info("Rfree value '{}' does not look like a number, will ignore it", mR.group(1)); 1395 } 1396 } 1397 1398 // REMARK 3 RESOLUTION (contains more info than REMARK 2, for instance multiple resolutions in hybrid experimental technique entries) 1399 // note: if more than 1 value present (occurring in hybrid experimental technique entries, e.g. 
3ins, 4n9m) 1400 // then last one encountered will be taken 1401 } else if (line.startsWith("REMARK 3 RESOLUTION RANGE HIGH")){ 1402 Pattern pR = Pattern.compile("^REMARK 3 RESOLUTION RANGE HIGH \\(ANGSTROMS\\) :\\s+(\\d+\\.\\d+).*"); 1403 handleResolutionLine(line, pR); 1404 } else if (line.startsWith("REMARK 3 EFFECTIVE RESOLUTION")){ 1405 Pattern pR = Pattern.compile("^REMARK 3 EFFECTIVE RESOLUTION \\(ANGSTROMS\\)\\s+:\\s+(\\d+\\.\\d+).*"); 1406 handleResolutionLine(line, pR); 1407 } 1408 } 1409 1410 public void handleResolutionLine(String line, Pattern pR) { 1411 Matcher mR = pR.matcher(line); 1412 if (mR.matches()) { 1413 final String resString = mR.group(1); 1414 try { 1415 float res = Float.parseFloat(resString); 1416 final float resInHeader = pdbHeader.getResolution(); 1417 if (resInHeader!=PDBHeader.DEFAULT_RESOLUTION && resInHeader != res) { 1418 logger.warn("More than 1 resolution value present, will use last one {} and discard previous {} " 1419 ,resString, String.format("%4.2f",resInHeader)); 1420 } 1421 pdbHeader.setResolution(res); 1422 } catch (NumberFormatException e) { 1423 logger.info("Could not parse resolution '{}', ignoring it",resString); 1424 } 1425 } 1426 } 1427 1428 1429 1430 1431 1432 1433 /** 1434 * Handler for 1435 * EXPDTA Record Format 1436 <pre> 1437 COLUMNS DATA TYPE FIELD DEFINITION 1438 ------------------------------------------------------------------------------- 1439 1 - 6 Record name "EXPDTA" 1440 9 - 10 Continuation continuation Allows concatenation of multiple 1441 records. 1442 11 - 70 SList technique The experimental technique(s) with 1443 optional comment describing the 1444 sample or experiment. 
1445 1446 allowed techniques are: 1447 ELECTRON DIFFRACTION 1448 FIBER DIFFRACTION 1449 FLUORESCENCE TRANSFER 1450 NEUTRON DIFFRACTION 1451 NMR 1452 THEORETICAL MODEL 1453 X-RAY DIFFRACTION 1454 </pre> 1455 */ 1456 private void pdb_EXPDTA_Handler(String line) { 1457 1458 String technique ; 1459 if (line.length() > 69) 1460 technique = line.substring (10, 70).trim() ; 1461 else 1462 technique = line.substring(10).trim(); 1463 1464 for (String singleTechnique: technique.split(";\\s+")) { 1465 pdbHeader.setExperimentalTechnique(singleTechnique); 1466 } 1467 1468 1469 } 1470 1471 /** 1472 * Handler for 1473 * CRYST1 Record Format 1474 * The CRYST1 record presents the unit cell parameters, space group, and Z value. 1475 * If the entry describes a structure determined by a technique other than X-ray crystallography, 1476 * CRYST1 contains a = b = c = 1.0, alpha = beta = gamma = 90 degrees, space group = P 1, and Z =1. 1477 * <pre> 1478 * COLUMNS DATA TYPE FIELD DEFINITION 1479 * ------------------------------------------------------------- 1480 * 1 - 6 Record name "CRYST1" 1481 * 7 - 15 Real(9.3) a a (Angstroms). 1482 * 16 - 24 Real(9.3) b b (Angstroms). 1483 * 25 - 33 Real(9.3) c c (Angstroms). 1484 * 34 - 40 Real(7.2) alpha alpha (degrees). 1485 * 41 - 47 Real(7.2) beta beta (degrees). 1486 * 48 - 54 Real(7.2) gamma gamma (degrees). 1487 * 56 - 66 LString sGroup Space group. 1488 * 67 - 70 Integer z Z value. 1489 * </pre> 1490 */ 1491 private void pdb_CRYST1_Handler(String line) { 1492 // for badly formatted files (e.g. phenix-produced ones), there's no z and the min length is 58 (e.g. 
for SG 'P 1') 1493 if (line.length() < 58) { 1494 logger.warn("CRYST1 record has fewer than 58 columns: will ignore it"); 1495 return; 1496 } 1497 1498 float a; 1499 float b; 1500 float c; 1501 float alpha; 1502 float beta; 1503 float gamma; 1504 String spaceGroup = ""; 1505 1506 try { 1507 a = Float.parseFloat(line.substring(6,15).trim()); 1508 b = Float.parseFloat(line.substring(15,24).trim()); 1509 c = Float.parseFloat(line.substring(24,33).trim()); 1510 alpha = Float.parseFloat(line.substring(33,40).trim()); 1511 beta = Float.parseFloat(line.substring(40,47).trim()); 1512 gamma = Float.parseFloat(line.substring(47,54).trim()); 1513 } catch (NumberFormatException e) { 1514 logger.info("could not parse CRYST1 record ("+e.getMessage()+") from line and ignoring it " + line); 1515 return ; 1516 } 1517 if (line.length()>=66) { 1518 // for well formatted files 1519 spaceGroup = line.substring(55,66).trim(); 1520 } else { 1521 // for not-so-well formatted files, e.g. phenix-produced ones: they lack a Z value 1522 spaceGroup = line.substring(55,line.length()).trim(); 1523 } 1524 1525 CrystalCell xtalCell = new CrystalCell(); 1526 xtalCell.setA(a); 1527 xtalCell.setB(b); 1528 xtalCell.setC(c); 1529 xtalCell.setAlpha(alpha); 1530 xtalCell.setBeta(beta); 1531 xtalCell.setGamma(gamma); 1532 1533 if (!xtalCell.isCellReasonable()) { 1534 // If the entry describes a structure determined by a technique other than X-ray crystallography, 1535 // CRYST1 contains a = b = c = 1.0, alpha = beta = gamma = 90 degrees, space group = P 1, and Z =1. 
1536 // if so we don't add the crystal cell and it remains null 1537 logger.debug("The crystal cell read from file does not have reasonable dimensions (at least one dimension is below {}), discarding it.", 1538 CrystalCell.MIN_VALID_CELL_SIZE); 1539 } else { 1540 crystallographicInfo.setCrystalCell(xtalCell); 1541 } 1542 1543 SpaceGroup sg = SymoplibParser.getSpaceGroup(spaceGroup); 1544 if (sg==null) { 1545 logger.warn("Space group '"+spaceGroup+"' not recognised as a standard space group"); 1546 crystallographicInfo.setNonStandardSg(true); 1547 } else { 1548 crystallographicInfo.setSpaceGroup(sg); 1549 crystallographicInfo.setNonStandardSg(false); 1550 } 1551 } 1552 1553 /** 1554 * Handler for MTRIXn records. They specify extra NCS operators (usually in virus entries) 1555 * 1556 * See http://www.wwpdb.org/documentation/format33/sect8.html#MTRIXn 1557 * <pre> 1558 * COLUMNS DATA TYPE FIELD DEFINITION 1559 * ------------------------------------------------------------- 1560 * 1561 * 1 - 6 Record name "MTRIXn" n=1, 2, or 3 1562 * 8 - 10 Integer serial Serial number. 
1563 * 11 - 20 Real(10.6) m[n][1] Mn1 1564 * 21 - 30 Real(10.6) m[n][2] Mn2 1565 * 31 - 40 Real(10.6) m[n][3] Mn3 1566 * 46 - 55 Real(10.5) v[n] Vn 1567 * 60 Integer iGiven 1 1568 * 1569 * </pre> 1570 * Note that we ignore operators with iGiven==1 1571 * 1572 * @param line 1573 */ 1574 private void pdb_MTRIXn_Handler(String line) { 1575 1576 // don't process incomplete records 1577 if (line.length() < 55) { 1578 logger.info("MTRIXn record has fewer than 55 columns: will ignore it"); 1579 return; 1580 } 1581 1582 1583 try { 1584 1585 int rowIndex = Integer.parseInt(line.substring(5,6)); 1586 double col1Value = Double.parseDouble(line.substring(10,20)); 1587 double col2Value = Double.parseDouble(line.substring(20,30)); 1588 double col3Value = Double.parseDouble(line.substring(30,40)); 1589 double translValue = Double.parseDouble(line.substring(45,55)); 1590 int iGiven = 0; 1591 if (line.length()>=60 && !line.substring(59,60).trim().isEmpty()) { 1592 iGiven = Integer.parseInt(line.substring(59,60)); 1593 } 1594 1595 if (iGiven == 1) return; 1596 1597 if (ncsOperators==null) { 1598 // we initialise on first pass 1599 ncsOperators = new ArrayList<Matrix4d>(); 1600 } 1601 1602 if (currentNcsOp==null) { 1603 currentNcsOp = new Matrix4d(1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1); // initialised to identity 1604 } 1605 1606 currentNcsOp.setElement(rowIndex-1, 0, col1Value); 1607 currentNcsOp.setElement(rowIndex-1, 1, col2Value); 1608 currentNcsOp.setElement(rowIndex-1, 2, col3Value); 1609 currentNcsOp.setElement(rowIndex-1, 3, translValue); 1610 1611 1612 if (rowIndex==3) { 1613 ncsOperators.add(currentNcsOp); 1614 // we initialise for next matrix to come 1615 currentNcsOp = new Matrix4d(1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1); // initialised to identity 1616 } 1617 1618 } catch (NumberFormatException e) { 1619 logger.info("Could not parse a number in MTRIXn record ("+e.getMessage()+") from line: >" + line+"<"); 1620 } 1621 } 1622 1623 /** 1624 * Handler for ATOM. 
1625 * Record Format: 1626 * 1627 * <pre> 1628 * ATOM 1 N ASP A 15 110.964 24.941 59.191 1.00 83.44 N 1629 * 1630 * COLUMNS DATA TYPE FIELD DEFINITION 1631 * --------------------------------------------------------------------------------- 1632 * 1 - 6 Record name "ATOM " 1633 * 7 - 11 Integer serial Atom serial number. 1634 * 13 - 16 Atom name Atom name. 1635 * 17 Character altLoc Alternate location indicator. 1636 * 18 - 20 Residue name resName Residue name. 1637 * 22 Character chainID Chain identifier. 1638 * 23 - 26 Integer resSeq Residue sequence number. 1639 * 27 AChar iCode Code for insertion of residues. 1640 * 31 - 38 Real(8.3) x Orthogonal coordinates for X in Angstroms. 1641 * 39 - 46 Real(8.3) y Orthogonal coordinates for Y in Angstroms. 1642 * 47 - 54 Real(8.3) z Orthogonal coordinates for Z in Angstroms. 1643 * 55 - 60 Real(6.2) occupancy Occupancy. 1644 * 61 - 66 Real(6.2) tempFactor Temperature factor. 1645 * 73 - 76 LString(4) segID Segment identifier, left-justified. 1646 * 77 - 78 LString(2) element Element symbol, right-justified. 1647 * 79 - 80 LString(2) charge Charge on the atom. 
1648 * </pre> 1649 */ 1650 private void pdb_ATOM_Handler(String line) { 1651 1652 if ( params.isHeaderOnly()) 1653 return; 1654 1655 // let's first get the chain name which will serve to identify if we are starting a new molecule 1656 String chainName = line.substring(21,22); 1657 1658 if (chainName.equals(" ")) { 1659 blankChainIdsPresent = true; 1660 } 1661 1662 if (currentChain!=null && !currentChain.getName().equals(chainName)) { 1663 // new chain name: another molecule coming 1664 startOfMolecule = true; 1665 } 1666 1667 if (startOfMolecule) { 1668 // we add last chain if there was one 1669 if (currentChain!=null) { 1670 currentModel.add(currentChain); 1671 // let's not forget adding the last group to the finishing chain 1672 if (currentGroup!=null) { 1673 currentChain.addGroup(currentGroup); 1674 } 1675 } 1676 // we initialise the new molecule to come 1677 currentChain = new ChainImpl(); 1678 // note that the chainId (asym id) is set properly later in assignAsymIds 1679 currentChain.setId(chainName); 1680 currentChain.setName(chainName); 1681 1682 } 1683 1684 if (startOfModel) { 1685 // we add last model if there was one 1686 if (currentModel!=null) { 1687 allModels.add(currentModel); 1688 } 1689 // we initialise the model to come 1690 currentModel = new ArrayList<>(); 1691 } 1692 1693 1694 // let's get the residue number and see if we need to start a new group 1695 1696 String groupCode3 = line.substring(17,20).trim(); 1697 String resNum = line.substring(22,26).trim(); 1698 Character iCode = line.substring(26,27).charAt(0); 1699 if ( iCode == ' ') 1700 iCode = null; 1701 ResidueNumber residueNumber = new ResidueNumber(chainName, Integer.valueOf(resNum), iCode); 1702 1703 //recordName groupCode3 1704 //| | resNum 1705 //| | | iCode 1706 //| | | | | || 1707 //ATOM 1 N ASP A 15 110.964 24.941 59.191 1.00 83.44 N 1708 //ATOM 1964 N ARG H 221A 5.963 -16.715 27.669 1.00 28.59 N 1709 1710 Character aminoCode1 = StructureTools.get1LetterCode(groupCode3); 1711 1712 
String recordName = line.substring (0, 6).trim (); 1713 1714 boolean isHetAtomInFile = false; 1715 1716 if (recordName.equals("HETATM") ){ 1717 // HETATOM RECORDS are treated slightly differently 1718 // some modified amino acids that we want to treat as amino acids 1719 // can be found as HETATOM records 1720 if ( aminoCode1 != null && aminoCode1.equals(StructureTools.UNKNOWN_GROUP_LABEL)) 1721 aminoCode1 = null; 1722 1723 isHetAtomInFile = true; 1724 } 1725 1726 if ( startOfMolecule) { 1727 1728 currentGroup = getNewGroup(recordName, aminoCode1, groupCode3); 1729 1730 currentGroup.setPDBName(groupCode3); 1731 currentGroup.setResidueNumber(residueNumber); 1732 currentGroup.setHetAtomInFile(isHetAtomInFile); 1733 1734 } 1735 1736 // resetting states 1737 startOfModel = false; 1738 startOfMolecule = false; 1739 1740 1741 Character altLoc = line.substring (16, 17).charAt(0); 1742 Group altGroup = null; 1743 1744 1745 // check if residue number is the same ... 1746 if ( ! residueNumber.equals(currentGroup.getResidueNumber())) { 1747 1748 currentChain.addGroup(currentGroup); 1749 currentGroup.trimToSize(); 1750 1751 currentGroup = getNewGroup(recordName, aminoCode1, groupCode3); 1752 1753 currentGroup.setPDBName(groupCode3); 1754 currentGroup.setResidueNumber(residueNumber); 1755 currentGroup.setHetAtomInFile(isHetAtomInFile); 1756 1757 } else { 1758 // same residueNumber, but altLocs... 1759 1760 // test altLoc 1761 if ( ! altLoc.equals(' ')) { 1762 logger.debug("found altLoc! " + currentGroup + " " + altGroup); 1763 altGroup = getCorrectAltLocGroup( altLoc,recordName,aminoCode1,groupCode3); 1764 if ( altGroup.getChain() == null) { 1765 // need to set current chain 1766 altGroup.setChain(currentChain); 1767 } 1768 1769 } 1770 } 1771 1772 atomCount++; 1773 1774 if ( atomCount == atomCAThreshold ) { 1775 // throw away the SEQRES lines - too much to deal with... 
1776 logger.warn("more than " + atomCAThreshold + " atoms in this structure, ignoring the SEQRES lines"); 1777 seqResChains.clear(); 1778 1779 switchCAOnly(); 1780 1781 } 1782 1783 1784 1785 if ( atomCount == loadMaxAtoms){ 1786 logger.warn("File has more atoms than max specified in parsing parameters ({}). Ignoring atoms after line: {}", loadMaxAtoms, line); 1787 return; 1788 } 1789 if ( atomCount > loadMaxAtoms){ 1790 return; 1791 } 1792 1793 1794 // 1 2 3 4 5 6 1795 //012345678901234567890123456789012345678901234567890123456789 1796 //ATOM 1 N MET 1 20.154 29.699 5.276 1.0 1797 //ATOM 112 CA ASP 112 41.017 33.527 28.371 1.00 0.00 1798 //ATOM 53 CA MET 7 23.772 33.989 -21.600 1.00 0.00 C 1799 //ATOM 112 CA ASP 112 37.613 26.621 33.571 0 0 1800 1801 1802 String fullname = line.substring (12, 16); 1803 1804 // check for CA only if requested 1805 if ( parseCAonly ){ 1806 // yes , user wants to get CA only 1807 // only parse CA atoms... 1808 if (! fullname.equals(" CA ")){ 1809 //System.out.println("ignoring " + line); 1810 atomCount--; 1811 return; 1812 } 1813 } 1814 1815 if ( params.getAcceptedAtomNames() != null) { 1816 1817 boolean found = false; 1818 for (String ok : params.getAcceptedAtomNames()){ 1819 //System.out.println(ok + "< >" + fullname +"<"); 1820 1821 if ( ok.equals(fullname.trim())) { 1822 found = true; 1823 break; 1824 } 1825 } 1826 if ( ! 
found) { 1827 atomCount--; 1828 return; 1829 } 1830 } 1831 // create new atom 1832 1833 int pdbnumber = Integer.parseInt (line.substring (6, 11).trim ()); 1834 AtomImpl atom = new AtomImpl() ; 1835 atom.setPDBserial(pdbnumber) ; 1836 1837 atom.setAltLoc(altLoc); 1838 atom.setName(fullname.trim()); 1839 1840 double x = Double.parseDouble (line.substring (30, 38).trim()); 1841 double y = Double.parseDouble (line.substring (38, 46).trim()); 1842 double z = Double.parseDouble (line.substring (46, 54).trim()); 1843 1844 double[] coords = new double[3]; 1845 coords[0] = x ; 1846 coords[1] = y ; 1847 coords[2] = z ; 1848 atom.setCoords(coords); 1849 1850 float occu = 1.0f; 1851 if ( line.length() > 59 ) { 1852 try { 1853 // occu and tempf are sometimes not used :-/ 1854 occu = Float.parseFloat (line.substring (54, 60).trim()); 1855 } catch (NumberFormatException e){} 1856 } 1857 1858 float tempf = 0.0f; 1859 if ( line.length() > 65) { 1860 try { 1861 tempf = Float.parseFloat (line.substring (60, 66).trim()); 1862 } catch (NumberFormatException e){} 1863 } 1864 1865 atom.setOccupancy( occu ); 1866 atom.setTempFactor( tempf ); 1867 1868 1869 1870 1871 // Parse element from the element field. If this field is 1872 // missing (i.e. misformatted PDB file), then parse the 1873 // element from the chemical component. 1874 Element element = Element.R; 1875 boolean guessElement = true; 1876 if ( line.length() > 77 ) { 1877 // parse element from element field 1878 String elementSymbol = line.substring(76, 78).trim(); 1879 if (elementSymbol.isEmpty()) { 1880 logger.info("Element column was empty for atom {} {}. Assigning atom element " 1881 + "from Chemical Component Dictionary information", fullname.trim(), pdbnumber); 1882 } else { 1883 1884 try { 1885 element = Element.valueOfIgnoreCase(elementSymbol); 1886 guessElement = false; 1887 } catch (IllegalArgumentException e){ 1888 logger.info("Element {} of atom {} {} was not recognised. 
Assigning atom element " 1889 + "from Chemical Component Dictionary information", elementSymbol, 1890 fullname.trim(), pdbnumber); 1891 } 1892 } 1893 } else { 1894 logger.info("Missformatted PDB file: element column of atom {} {} is not present. " 1895 + "Assigning atom element from Chemical Component Dictionary information", 1896 fullname.trim(), pdbnumber); 1897 } 1898 if (guessElement) { 1899 String elementSymbol = null; 1900 if (currentGroup.getChemComp() != null) { 1901 for (ChemCompAtom a : currentGroup.getChemComp().getAtoms()) { 1902 if (a.getAtomId().equals(fullname.trim())) { 1903 elementSymbol = a.getTypeSymbol(); 1904 break; 1905 } 1906 } 1907 if (elementSymbol == null) { 1908 logger.info("Atom name {} was not found in the Chemical Component Dictionary information of {}. " 1909 + "Assigning generic element R to it", fullname.trim(), currentGroup.getPDBName()); 1910 } else { 1911 try { 1912 element = Element.valueOfIgnoreCase(elementSymbol); 1913 } catch (IllegalArgumentException e) { 1914 // this can still happen for cases like UNK 1915 logger.info("Element symbol {} found in chemical component dictionary for Atom {} {} could not be recognised as a known element. " 1916 + "Assigning generic element R to it", elementSymbol, fullname.trim(), pdbnumber); 1917 } 1918 } 1919 } else { 1920 logger.warn("Chemical Component Dictionary information was not found for Atom name {}. " 1921 + "Assigning generic element R to it", fullname.trim()); 1922 } 1923 1924 } 1925 atom.setElement(element); 1926 1927 1928 //see if chain_id is one of the previous chains ... 1929 if ( altGroup != null) { 1930 altGroup.addAtom(atom); 1931 altGroup = null; 1932 } 1933 else { 1934 currentGroup.addAtom(atom); 1935 } 1936 1937 1938 // make sure that main group has all atoms 1939 // GitHub issue: #76 1940 if ( ! 
currentGroup.hasAtom(atom.getName())) { 1941 currentGroup.addAtom(atom); 1942 } 1943 1944 1945 1946 } 1947 1948 1949 private Group getCorrectAltLocGroup( Character altLoc, 1950 String recordName, Character aminoCode1, String groupCode3) { 1951 1952 // see if we know this altLoc already; 1953 List<Atom> atoms = currentGroup.getAtoms(); 1954 if ( atoms.size() > 0) { 1955 Atom a1 = atoms.get(0); 1956 // we are just adding atoms to the current group 1957 // probably there is a second group following later... 1958 if (a1.getAltLoc().equals(altLoc)) { 1959 1960 return currentGroup; 1961 } 1962 } 1963 1964 List<Group> altLocs = currentGroup.getAltLocs(); 1965 for ( Group altLocG : altLocs ){ 1966 atoms = altLocG.getAtoms(); 1967 if ( atoms.size() > 0) { 1968 for ( Atom a1 : atoms) { 1969 if (a1.getAltLoc().equals( altLoc)) { 1970 1971 return altLocG; 1972 } 1973 } 1974 } 1975 } 1976 1977 // no matching altLoc group found. 1978 // build it up. 1979 1980 if ( groupCode3.equals(currentGroup.getPDBName())) { 1981 if ( currentGroup.getAtoms().size() == 0) { 1982 //System.out.println("current group is empty " + current_group + " " + altLoc); 1983 return currentGroup; 1984 } 1985 //System.out.println("cloning current group " + current_group + " " + current_group.getAtoms().get(0).getAltLoc() + " altLoc " + altLoc); 1986 Group altLocG = (Group) currentGroup.clone(); 1987 // drop atoms from cloned group... 
1988 // https://redmine.open-bio.org/issues/3307 1989 altLocG.setAtoms(new ArrayList<Atom>()); 1990 altLocG.getAltLocs().clear(); 1991 currentGroup.addAltLoc(altLocG); 1992 return altLocG; 1993 } 1994 1995 // System.out.println("new group " + recordName + " " + aminoCode1 + " " +groupCode3); 1996 Group altLocG = getNewGroup(recordName,aminoCode1,groupCode3); 1997 1998 1999 altLocG.setPDBName(groupCode3); 2000 2001 altLocG.setResidueNumber(currentGroup.getResidueNumber()); 2002 currentGroup.addAltLoc(altLocG); 2003 return altLocG; 2004 } 2005 2006 private void switchCAOnly(){ 2007 parseCAonly = true; 2008 2009 2010 currentModel = CAConverter.getRepresentativeAtomsOnly(currentModel); 2011 2012 for ( int i =0; i< structure.nrModels() ; i++){ 2013 // iterate over all known models ... 2014 List<Chain> model = structure.getModel(i); 2015 model = CAConverter.getRepresentativeAtomsOnly(model); 2016 structure.setModel(i,model); 2017 } 2018 2019 currentChain = CAConverter.getRepresentativeAtomsOnly(currentChain); 2020 2021 } 2022 2023 2024 /** safes repeating a few lines ... */ 2025 private Integer conect_helper (String line,int start,int end) { 2026 if (line.length() < end) return null; 2027 2028 String sbond = line.substring(start,end).trim(); 2029 int bond = -1 ; 2030 Integer b = null ; 2031 2032 if ( ! 
sbond.equals("")) { 2033 bond = Integer.parseInt(sbond); 2034 b = new Integer(bond); 2035 } 2036 2037 return b ; 2038 } 2039 2040 /** 2041 * Handler for CONECT Record Format 2042 <pre> 2043 COLUMNS DATA TYPE FIELD DEFINITION 2044 --------------------------------------------------------------------------------- 2045 1 - 6 Record name "CONECT" 2046 7 - 11 Integer serial Atom serial number 2047 12 - 16 Integer serial Serial number of bonded atom 2048 17 - 21 Integer serial Serial number of bonded atom 2049 22 - 26 Integer serial Serial number of bonded atom 2050 27 - 31 Integer serial Serial number of bonded atom 2051 32 - 36 Integer serial Serial number of hydrogen bonded 2052 atom 2053 37 - 41 Integer serial Serial number of hydrogen bonded 2054 atom 2055 42 - 46 Integer serial Serial number of salt bridged 2056 atom 2057 47 - 51 Integer serial Serial number of hydrogen bonded 2058 atom 2059 52 - 56 Integer serial Serial number of hydrogen bonded 2060 atom 2061 57 - 61 Integer serial Serial number of salt bridged 2062 atom 2063 </pre> 2064 */ 2065 private void pdb_CONECT_Handler(String line) { 2066 2067 if ( atomOverflow) { 2068 return ; 2069 } 2070 if (params.isHeaderOnly()) { 2071 return; 2072 } 2073 2074 // this try .. catch is e.g. to catch 1gte which has wrongly formatted lines... 
2075 try { 2076 int atomserial = Integer.parseInt (line.substring(6 ,11).trim()); 2077 Integer bond1 = conect_helper(line,11,16); 2078 Integer bond2 = conect_helper(line,16,21); 2079 Integer bond3 = conect_helper(line,21,26); 2080 Integer bond4 = conect_helper(line,26,31); 2081 Integer hyd1 = conect_helper(line,31,36); 2082 Integer hyd2 = conect_helper(line,36,41); 2083 Integer salt1 = conect_helper(line,41,46); 2084 Integer hyd3 = conect_helper(line,46,51); 2085 Integer hyd4 = conect_helper(line,51,56); 2086 Integer salt2 = conect_helper(line,56,61); 2087 2088 //System.out.println(atomserial+ " "+ bond1 +" "+bond2+ " " +bond3+" "+bond4+" "+ 2089 // hyd1+" "+hyd2 +" "+salt1+" "+hyd3+" "+hyd4+" "+salt2); 2090 HashMap<String, Integer> cons = new HashMap<String, Integer>(); 2091 cons.put("atomserial",new Integer(atomserial)); 2092 2093 if ( bond1 != null) cons.put("bond1",bond1); 2094 if ( bond2 != null) cons.put("bond2",bond2); 2095 if ( bond3 != null) cons.put("bond3",bond3); 2096 if ( bond4 != null) cons.put("bond4",bond4); 2097 if ( hyd1 != null) cons.put("hydrogen1",hyd1); 2098 if ( hyd2 != null) cons.put("hydrogen2",hyd2); 2099 if ( salt1 != null) cons.put("salt1",salt1); 2100 if ( hyd3 != null) cons.put("hydrogen3",hyd3); 2101 if ( hyd4 != null) cons.put("hydrogen4",hyd4); 2102 if ( salt2 != null) cons.put("salt2",salt2); 2103 2104 connects.add(cons); 2105 } catch (NumberFormatException e){ 2106 logger.info("could not parse CONECT line correctly ("+e.getMessage()+"), at line : " + line); 2107 return; 2108 } 2109 } 2110 2111 /** 2112 * Handler for MODEL Record Format 2113 * <pre> 2114 * COLUMNS DATA TYPE FIELD DEFINITION 2115 * ---------------------------------------------------------------------- 2116 * 1 - 6 Record name "MODEL " 2117 * 11 - 14 Integer serial Model serial number. 
2118 * </pre> 2119 */ 2120 private void pdb_MODEL_Handler(String line) { 2121 2122 if (params.isHeaderOnly()) return; 2123 2124 // new model: we start a new molecule 2125 startOfMolecule = true; 2126 startOfModel = true; 2127 2128 } 2129 2130 /** 2131 * Handler for TER record. The record is used in deposited PDB files and many others, 2132 * but it's often forgotten by some softwares. In any case it helps identifying the 2133 * start of ligand molecules so we use it for that. 2134 */ 2135 private void pdb_TER_Handler() { 2136 startOfMolecule = true; 2137 } 2138 2139 2140 /** 2141 * DBREF handler 2142 * <pre> 2143 * COLUMNS DATA TYPE FIELD DEFINITION 2144 * ---------------------------------------------------------------- 2145 * 1 - 6 Record name "DBREF " 2146 * 8 - 11 IDcode idCode ID code of this entry. 2147 * 13 Character chainID Chain identifier. 2148 * 15 - 18 Integer seqBegin Initial sequence number 2149 * of the PDB sequence segment. 2150 * 19 AChar insertBegin Initial insertion code 2151 * of the PDB sequence segment. 2152 * 21 - 24 Integer seqEnd Ending sequence number 2153 * of the PDB sequence segment. 2154 * 25 AChar insertEnd Ending insertion code 2155 * of the PDB sequence segment. 2156 * 27 - 32 LString database Sequence database name. 2157 * 34 - 41 LString dbAccession Sequence database accession code. 2158 * 43 - 54 LString dbIdCode Sequence database 2159 * identification code. 2160 * 56 - 60 Integer dbseqBegin Initial sequence number of the 2161 * database seqment. 2162 * 61 AChar idbnsBeg Insertion code of initial residue 2163 * of the segment, if PDB is the 2164 * reference. 2165 * 63 - 67 Integer dbseqEnd Ending sequence number of the 2166 * database segment. 2167 * 68 AChar dbinsEnd Insertion code of the ending 2168 * residue of the segment, if PDB is 2169 * the reference. 
2170 * </pre> 2171 */ 2172 private void pdb_DBREF_Handler(String line){ 2173 2174 logger.debug("Parsing DBREF " + line); 2175 2176 DBRef dbref = new DBRef(); 2177 String idCode = line.substring(7,11); 2178 String chainName = line.substring(12,13); 2179 String seqBegin = line.substring(14,18); 2180 String insertBegin = line.substring(18,19); 2181 String seqEnd = line.substring(20,24); 2182 String insertEnd = line.substring(24,25); 2183 String database = line.substring(26,32); 2184 String dbAccession = line.substring(33,41); 2185 String dbIdCode = line.substring(42,54); 2186 String dbseqBegin = line.substring(55,60); 2187 String idbnsBeg = line.substring(60,61); 2188 String dbseqEnd = line.substring(62,67); 2189 // Support implicit space character at end 2190 String dbinsEnd; 2191 if(line.length() >= 68) 2192 dbinsEnd = line.substring(67,68); 2193 else 2194 dbinsEnd = " "; 2195 2196 dbref.setIdCode(idCode); 2197 dbref.setChainName(chainName); 2198 dbref.setSeqBegin(intFromString(seqBegin)); 2199 dbref.setInsertBegin(insertBegin.charAt(0)); 2200 dbref.setSeqEnd(intFromString(seqEnd)); 2201 dbref.setInsertEnd(insertEnd.charAt(0)); 2202 dbref.setDatabase(database.trim()); 2203 dbref.setDbAccession(dbAccession.trim()); 2204 dbref.setDbIdCode(dbIdCode.trim()); 2205 dbref.setDbSeqBegin(intFromString(dbseqBegin)); 2206 dbref.setIdbnsBegin(idbnsBeg.charAt(0)); 2207 dbref.setDbSeqEnd(intFromString(dbseqEnd)); 2208 dbref.setIdbnsEnd(dbinsEnd.charAt(0)); 2209 2210 //System.out.println(dbref.toPDB()); 2211 dbrefs.add(dbref); 2212 } 2213 2214 2215 /** 2216 * Process the disulfide bond info provided by an SSBOND record 2217 * 2218 * <pre> 2219 COLUMNS DATA TYPE FIELD DEFINITION 2220 ------------------------------------------------------------------- 2221 1 - 6 Record name "SSBOND" 2222 8 - 10 Integer serNum Serial number. 2223 12 - 14 LString(3) "CYS" Residue name. 2224 16 Character chainID1 Chain identifier. 2225 18 - 21 Integer seqNum1 Residue sequence number. 
2226 22 AChar icode1 Insertion code. 2227 26 - 28 LString(3) "CYS" Residue name. 2228 30 Character chainID2 Chain identifier. 2229 32 - 35 Integer seqNum2 Residue sequence number. 2230 36 AChar icode2 Insertion code. 2231 60 - 65 SymOP sym1 Symmetry oper for 1st resid 2232 67 - 72 SymOP sym2 Symmetry oper for 2nd resid 2233 * </pre> 2234 */ 2235 private void pdb_SSBOND_Handler(String line){ 2236 2237 if (params.isHeaderOnly()) return; 2238 2239 if (line.length()<36) { 2240 logger.info("SSBOND line has length under 36. Ignoring it."); 2241 return; 2242 } 2243 2244 String chain1 = line.substring(15,16); 2245 String seqNum1 = line.substring(17,21).trim(); 2246 String icode1 = line.substring(21,22); 2247 String chain2 = line.substring(29,30); 2248 String seqNum2 = line.substring(31,35).trim(); 2249 String icode2 = line.substring(35,36); 2250 2251 if (line.length()>=72) { 2252 String symop1 = line.substring(59, 65).trim(); 2253 String symop2 = line.substring(66, 72).trim(); 2254 2255 // until we implement proper treatment of symmetry in biojava #220, we can't deal with sym-related parteners properly, skipping them 2256 if (!symop1.equals("") && !symop2.equals("") && // in case the field is missing 2257 (!symop1.equals("1555") || !symop2.equals("1555")) ) { 2258 logger.info("Skipping ss bond between groups {} and {} belonging to different symmetry partners, because it is not supported yet", seqNum1+icode1, seqNum2+icode2); 2259 return; 2260 } 2261 } 2262 2263 if (icode1.equals(" ")) 2264 icode1 = ""; 2265 if (icode2.equals(" ")) 2266 icode2 = ""; 2267 2268 SSBondImpl ssbond = new SSBondImpl(); 2269 2270 ssbond.setChainID1(chain1); 2271 ssbond.setResnum1(seqNum1); 2272 ssbond.setChainID2(chain2); 2273 ssbond.setResnum2(seqNum2); 2274 ssbond.setInsCode1(icode1); 2275 ssbond.setInsCode2(icode2); 2276 ssbonds.add(ssbond); 2277 } 2278 2279 2280 /** 2281 * Takes care of LINK records. 
These take the format of: 2282 * 2283 * <pre> 2284 * COLUMNS DATA TYPE FIELD DEFINITION 2285 * -------------------------------------------------------------------------------- 2286 * 1 - 6 Record name "LINK " 2287 * 13 - 16 Atom name1 Atom name. 2288 * 17 Character altLoc1 Alternate location indicator. 2289 * 18 - 20 Residue name resName1 Residue name. 2290 * 22 Character chainID1 Chain identifier. 2291 * 23 - 26 Integer resSeq1 Residue sequence number. 2292 * 27 AChar iCode1 Insertion code. 2293 * 43 - 46 Atom name2 Atom name. 2294 * 47 Character altLoc2 Alternate location indicator. 2295 * 48 - 50 Residue name resName2 Residue name. 2296 * 52 Character chainID2 Chain identifier. 2297 * 53 - 56 Integer resSeq2 Residue sequence number. 2298 * 57 AChar iCode2 Insertion code. 2299 * 60 - 65 SymOP sym1 Symmetry operator for 1st atom. 2300 * 67 - 72 SymOP sym2 Symmetry operator for 2nd atom. 2301 * </pre> 2302 * 2303 * (From http://www.wwpdb.org/documentation/format32/sect6.html#LINK) 2304 * 2305 * @param line the LINK record line to parse. 2306 */ 2307 private void pdb_LINK_Handler(String line) { 2308 2309 if (params.isHeaderOnly()) return; 2310 2311 // Check for the minimal set of fields. 2312 if (line.length()<56) { 2313 logger.info("LINK line has length under 56. 
Ignoring it."); 2314 return; 2315 } 2316 2317 int len = line.length(); 2318 2319 String name1 = line.substring(12, 16).trim(); 2320 String altLoc1 = line.substring(16, 17).trim(); 2321 String resName1 = line.substring(17, 20).trim(); 2322 String chainID1 = line.substring(21, 22).trim(); 2323 String resSeq1 = line.substring(22, 26).trim(); 2324 String iCode1 = line.substring(26, 27).trim(); 2325 2326 String name2 = line.substring(42, 46).trim(); 2327 String altLoc2 = line.substring(46, 47).trim(); 2328 String resName2 = line.substring(47, 50).trim(); 2329 String chainID2 = line.substring(51, 52).trim(); 2330 String resSeq2 = line.substring(52, 56).trim(); 2331 String iCode2 = null; // Might get trimmed if blank. 2332 if (len > 56) iCode2 = line.substring(56, 57).trim(); 2333 2334 String sym1 = null; 2335 if (len > 64) sym1 = line.substring(59, 65).trim(); 2336 String sym2 = null; 2337 if (len > 71) sym2 = line.substring(66, 72).trim(); 2338 2339 linkRecords.add(new LinkRecord( 2340 name1, altLoc1, resName1, chainID1, resSeq1, iCode1, 2341 name2, altLoc2, resName2, chainID2, resSeq2, iCode2, 2342 sym1, sym2)); 2343 } 2344 2345 /** 2346 * Handler for the SITE records. <br> 2347 * 2348 * <pre> 2349 * 2350 * COLUMNS DATA TYPE FIELD DEFINITION 2351 * --------------------------------------------------------------------------------- 2352 * 1 - 6 Record name "SITE " 2353 * 8 - 10 Integer seqNum Sequence number. 2354 * 12 - 14 LString(3) siteID Site name. 2355 * 16 - 17 Integer numRes Number of residues that compose the siteResidues. 2356 * 19 - 21 Residue name resName1 Residue name for first residue that 2357 * creates the siteResidues. 2358 * 23 Character chainID1 Chain identifier for first residue of siteResidues. 2359 * 24 - 27 Integer seq1 Residue sequence number for first residue 2360 * of the siteResidues. 2361 * 28 AChar iCode1 Insertion code for first residue of the siteResidues. 
2362 * 2363 * example: 2364 * 1 2 3 4 5 6 7 8 2365 * 12345678901234567890123456789012345678901234567890123456789012345678901234567890 2366 * SITE 1 AC1 3 HIS A 94 HIS A 96 HIS A 119 2367 * SITE 1 AC2 5 ASN A 62 GLY A 63 HIS A 64 HOH A 328 2368 * SITE 2 AC2 5 HOH A 634 2369 * SITE 1 AC3 5 GLN A 136 GLN A 137 PRO A 138 GLU A 205 2370 * SITE 2 AC3 5 CYS A 206 2371 * SITE 1 AC4 11 HIS A 64 HIS A 94 HIS A 96 HIS A 119 2372 * SITE 2 AC4 11 LEU A 198 THR A 199 THR A 200 TRP A 209 2373 * SITE 3 AC4 11 HOH A 572 HOH A 582 HOH A 635 2374 * </pre> 2375 * @param line the SITE line record being currently read 2376 * @author Amr ALHOSSARY 2377 * @author Jules Jacobsen 2378 */ 2379 private void pdb_SITE_Handler(String line){ 2380 2381 if (params.isHeaderOnly()) return; 2382 2383 // make a map of: SiteId to List<ResidueNumber> 2384 2385 logger.debug("Site Line:"+line); 2386 2387 2388 String siteID = line.substring(11, 14); 2389 //fetch the siteResidues from the map 2390 List<ResidueNumber> siteResidues = siteToResidueMap.get(siteID); 2391 2392 //if the siteResidues doesn't yet exist, make a new one. 2393 if (siteResidues == null || ! 
siteToResidueMap.containsKey(siteID.trim())){ 2394 siteResidues = new ArrayList<ResidueNumber>(); 2395 siteToResidueMap.put(siteID.trim(), siteResidues); 2396 2397 logger.debug(String.format("New Site made: %s %s", siteID, siteResidues)); 2398 logger.debug("Now made " + siteMap.size() + " sites"); 2399 2400 } 2401 2402 logger.debug(String.format("SiteId: %s", siteID)); 2403 2404 2405 //line = 'SITE 1 AC1 6 ARG H 221A LYS H 224 HOH H 403 HOH H 460' 2406 //line.substring(18) = 'ARG H 221A LYS H 224 HOH H 403 HOH H 460' 2407 line = line.substring(18); 2408 String groupString = null; 2409 //groupString = 'ARG H 221A' 2410 //keep iterating through chunks of 10 characters - these are the groups in the siteResidues 2411 while (!(groupString = line.substring(0, 10)).equals(" ")) { 2412 //groupstring: 'ARG H 221A' 2413 2414 logger.debug("groupString: '" + groupString + "'"); 2415 2416 //set the residue name 2417 //residueName = 'ARG' 2418 String residueName = groupString.substring(0, 3); 2419 Character aminoCode1 = StructureTools.get1LetterCode(residueName); 2420 if (aminoCode1 != null) { 2421 if (aminoCode1.equals(StructureTools.UNKNOWN_GROUP_LABEL)) { 2422 aminoCode1 = null; 2423 } 2424 } 2425 2426 //this is already in the right format, so no need to fiddle with it... 
2427 //pdbCode = 'H 221A' 2428 // String pdbCode = groupString.substring(4, 10).trim(); 2429 String chainId = groupString.substring(4, 5); 2430 Integer resNum = Integer.valueOf(groupString.substring(5, 9).trim()); 2431 Character insCode = groupString.substring(9, 10).charAt(0); 2432 //set insCode to null as a measure to prevent storing thousands of empty Strings 2433 //- the empty value is returned using Group.getInsCode() 2434 // if (insCode.equals(" ")) { 2435 // insCode = null; 2436 // } 2437 2438 logger.debug(String.format("Site: %s: 'resName:%s resNum:%s insCode:%s'", siteID, residueName, resNum, insCode)); 2439 2440 //make a new resNum with the data - this will be linked up with a site later 2441 ResidueNumber residueNumber = new ResidueNumber(); 2442 2443 2444 logger.debug("pdbCode: '" + resNum + insCode + "'"); 2445 2446 residueNumber.setChainName(chainId); 2447 residueNumber.setSeqNum(resNum); 2448 residueNumber.setInsCode(insCode); 2449 //add the resNum to the groups 2450 siteResidues.add(residueNumber); 2451 2452 logger.debug("Adding residueNumber " + residueNumber + " to site " + siteID); 2453 2454 line = line.substring(11); 2455 } 2456 2457 logger.debug("Current SiteMap (contains "+ siteToResidueMap.keySet().size() + " sites):"); 2458 for (String key : siteToResidueMap.keySet()) { 2459 logger.debug(key + " : " + siteToResidueMap.get(key)); 2460 } 2461 2462 } 2463 2464 //Site variable related to parsing the REMARK 800 records. 
2465 Site site; 2466 2467 private String[] keywords; 2468 private void pdb_REMARK_800_Handler(String line){ 2469 2470 if (params.isHeaderOnly()) return; 2471 2472 // 'REMARK 800 SITE_IDENTIFIER: CAT ' 2473 line = line.substring(11); 2474 String[] fields = line.split(": "); 2475 2476 if (fields.length == 2) { 2477 if (fields[0].equals("SITE_IDENTIFIER")) { 2478 // remark800Counter++; 2479 String siteID = fields[1].trim(); 2480 2481 logger.debug("siteID: '" + siteID +"'"); 2482 2483 //fetch the siteResidues from the map 2484 site = siteMap.get(siteID); 2485 2486 //if the siteResidues doesn't yet exist, make a new one. 2487 if (site == null || !siteID.equals(site.getSiteID())) { 2488 site = new Site(siteID, new ArrayList<Group>()); 2489 siteMap.put(site.getSiteID(), site); 2490 2491 logger.debug("New Site made: " + site); 2492 logger.debug("Now made " + siteMap.size() + " sites"); 2493 2494 } 2495 } 2496 if (fields[0].equals("EVIDENCE_CODE")) { 2497 // remark800Counter++; 2498 String evCode = fields[1].trim(); 2499 2500 logger.debug("evCode: '" + evCode +"'"); 2501 2502 //fetch the siteResidues from the map 2503 site.setEvCode(evCode); 2504 } 2505 if (fields[0].equals("SITE_DESCRIPTION")) { 2506 // remark800Counter++; 2507 String desc = fields[1].trim(); 2508 2509 logger.debug("desc: '" + desc +"'"); 2510 2511 //fetch the siteResidues from the map 2512 site.setDescription(desc); 2513 2514 logger.debug("Finished making REMARK 800 for site " + site.getSiteID()); 2515 logger.debug(site.remark800toPDB()); 2516 2517 } 2518 } 2519 } 2520 2521 private int intFromString(String intString){ 2522 int val = Integer.MIN_VALUE; 2523 try { 2524 val = Integer.parseInt(intString.trim()); 2525 } catch (NumberFormatException ex){ 2526 logger.info("Could not parse a number: " + ex.getMessage()); 2527 } 2528 return val; 2529 } 2530 2531 2532 2533 /** 2534 * Finds in the given list of chains the first one that has as name the given chainID. 
2535 * If no such Chain can be found it returns null. 2536 */ 2537 private static Chain isKnownChain(String chainID, List<Chain> chains){ 2538 2539 for (int i = 0; i< chains.size();i++){ 2540 Chain testchain = chains.get(i); 2541 if (chainID.equals(testchain.getName())) { 2542 return testchain; 2543 } 2544 } 2545 2546 return null; 2547 } 2548 2549 2550 2551 private BufferedReader getBufferedReader(InputStream inStream) 2552 throws IOException { 2553 2554 BufferedReader buf ; 2555 if (inStream == null) { 2556 throw new IOException ("input stream is null!"); 2557 } 2558 2559 buf = new BufferedReader (new InputStreamReader (inStream)); 2560 return buf ; 2561 2562 } 2563 2564 2565 2566 /** 2567 * Parse a PDB file and return a datastructure implementing 2568 * PDBStructure interface. 2569 * 2570 * @param inStream an InputStream object 2571 * @return a Structure object 2572 * @throws IOException 2573 */ 2574 public Structure parsePDBFile(InputStream inStream) 2575 throws IOException 2576 { 2577 2578 BufferedReader buf = getBufferedReader(inStream); 2579 2580 return parsePDBFile(buf); 2581 2582 } 2583 2584 /** 2585 * Parse a PDB file and return a datastructure implementing 2586 * PDBStructure interface. 2587 * 2588 * @param buf a BufferedReader object 2589 * @return the Structure object 2590 * @throws IOException ... 2591 */ 2592 public Structure parsePDBFile(BufferedReader buf) 2593 throws IOException 2594 { 2595 // set the correct max values for parsing... 
2596 loadMaxAtoms = params.getMaxAtoms(); 2597 atomCAThreshold = params.getAtomCaThreshold(); 2598 2599 2600 // (re)set structure 2601 2602 allModels = new ArrayList<>(); 2603 structure = new StructureImpl() ; 2604 currentModel = null; 2605 currentChain = null; 2606 currentGroup = null; 2607 // we initialise to true since at the beginning of the file we are always starting a new molecule 2608 startOfMolecule = true; 2609 startOfModel = true; 2610 2611 seqResChains = new ArrayList<Chain>(); 2612 siteMap = new LinkedHashMap<String, Site>(); 2613 pdbHeader = new PDBHeader(); 2614 connects = new ArrayList<Map<String,Integer>>(); 2615 previousContinuationField = ""; 2616 continuationField = ""; 2617 continuationString = ""; 2618 current_compound = null; 2619 sourceLines.clear(); 2620 compndLines.clear(); 2621 keywordsLines.clear(); 2622 isLastCompndLine = false; 2623 isLastSourceLine = false; 2624 prevMolId = -1; 2625 entities.clear(); 2626 helixList.clear(); 2627 strandList.clear(); 2628 turnList.clear(); 2629 lengthCheck = -1; 2630 atomCount = 0; 2631 atomOverflow = false; 2632 linkRecords = new ArrayList<LinkRecord>(); 2633 siteToResidueMap.clear(); 2634 2635 blankChainIdsPresent = false; 2636 2637 parseCAonly = params.isParseCAOnly(); 2638 2639 String line = null; 2640 2641 while ((line = buf.readLine()) != null) { 2642 2643 // ignore empty lines 2644 if ( line.equals("") || 2645 (line.equals(NEWLINE))){ 2646 continue; 2647 } 2648 2649 2650 // ignore short TER and END lines 2651 if ( line.startsWith("END")) { 2652 continue; 2653 } 2654 2655 if ( line.length() < 6 && !line.startsWith("TER")) { 2656 logger.info("Found line length below 6. 
Ignoring it, line: >" + line +"<" ); 2657 continue; 2658 } 2659 2660 String recordName = null; 2661 if (line.length()<6) 2662 recordName = line.trim(); 2663 else 2664 recordName = line.substring (0, 6).trim (); 2665 2666 try { 2667 if (recordName.equals("ATOM")) 2668 pdb_ATOM_Handler(line); 2669 else if (recordName.equals("SEQRES")) 2670 pdb_SEQRES_Handler(line); 2671 else if (recordName.equals("HETATM")) 2672 pdb_ATOM_Handler(line); 2673 else if (recordName.equals("MODEL")) 2674 pdb_MODEL_Handler(line); 2675 else if (recordName.equals("TER")) 2676 pdb_TER_Handler(); 2677 else if (recordName.equals("HEADER")) 2678 pdb_HEADER_Handler(line); 2679 else if (recordName.equals("AUTHOR")) 2680 pdb_AUTHOR_Handler(line); 2681 else if (recordName.equals("TITLE")) 2682 pdb_TITLE_Handler(line); 2683 else if (recordName.equals("SOURCE")) 2684 sourceLines.add(line); //pdb_SOURCE_Handler 2685 else if (recordName.equals("COMPND")) 2686 compndLines.add(line); //pdb_COMPND_Handler 2687 else if (recordName.equals("KEYWDS")) 2688 keywordsLines.add(line); 2689 else if (recordName.equals("JRNL")) 2690 pdb_JRNL_Handler(line); 2691 else if (recordName.equals("EXPDTA")) 2692 pdb_EXPDTA_Handler(line); 2693 else if (recordName.equals("CRYST1")) 2694 pdb_CRYST1_Handler(line); 2695 else if (recordName.startsWith("MTRIX")) 2696 pdb_MTRIXn_Handler(line); 2697 else if (recordName.equals("REMARK")) 2698 pdb_REMARK_Handler(line); 2699 else if (recordName.equals("CONECT")) 2700 pdb_CONECT_Handler(line); 2701 else if (recordName.equals("REVDAT")) 2702 pdb_REVDAT_Handler(line); 2703 else if (recordName.equals("DBREF")) 2704 pdb_DBREF_Handler(line); 2705 else if (recordName.equals("SITE")) 2706 pdb_SITE_Handler(line); 2707 else if (recordName.equals("SSBOND")) 2708 pdb_SSBOND_Handler(line); 2709 else if (recordName.equals("LINK")) 2710 pdb_LINK_Handler(line); 2711 else if ( params.isParseSecStruc()) { 2712 if ( recordName.equals("HELIX") ) pdb_HELIX_Handler ( line ) ; 2713 else if 
(recordName.equals("SHEET")) pdb_SHEET_Handler(line ) ; 2714 else if (recordName.equals("TURN")) pdb_TURN_Handler( line ) ; 2715 } 2716 } catch (StringIndexOutOfBoundsException | NullPointerException ex) { 2717 logger.info("Unable to parse [" + line + "]"); 2718 } 2719 } 2720 2721 makeCompounds(compndLines, sourceLines); 2722 2723 handlePDBKeywords(keywordsLines); 2724 2725 triggerEndFileChecks(); 2726 2727 if (params.shouldCreateAtomBonds()) { 2728 formBonds(); 2729 } 2730 2731 if ( params.shouldCreateAtomCharges()) { 2732 addCharges(); 2733 } 2734 2735 if ( params.isParseSecStruc() && !params.isHeaderOnly()) 2736 setSecStruc(); 2737 2738 // Now correct the alternate location group 2739 StructureTools.cleanUpAltLocs(structure); 2740 2741 return structure; 2742 2743 } 2744 2745 2746 /** 2747 * Add the charges to the Structure 2748 */ 2749 private void addCharges() { 2750 ChargeAdder.addCharges(structure); 2751 } 2752 2753 /** 2754 * This is the new method for building the COMPND and SOURCE records. Now each method is self-contained. 
2755 * @author Jules Jacobsen 2756 * @param compoundList 2757 * @param sourceList 2758 */ 2759 private void makeCompounds(List<String> compoundList, 2760 List<String> sourceList) { 2761 // System.out.println("[makeCompounds] making compounds from compoundLines"); 2762 2763 for (String line : compoundList) { 2764 if (compoundList.indexOf(line) + 1 == compoundList.size()) { 2765 // System.out.println("[makeCompounds] Final line in compoundLines."); 2766 isLastCompndLine = true; 2767 } 2768 pdb_COMPND_Handler(line); 2769 2770 } 2771 // System.out.println("[makeCompounds] adding sources to compounds from sourceLines"); 2772 // since we're starting again from the first compound, reset it here 2773 if ( entities.size() == 0){ 2774 current_compound = new EntityInfo(); 2775 } else { 2776 current_compound = entities.get(0); 2777 } 2778 for (String line : sourceList) { 2779 if (sourceList.indexOf(line) + 1 == sourceList.size()) { 2780 // System.out.println("[makeCompounds] Final line in sourceLines."); 2781 isLastSourceLine = true; 2782 } 2783 pdb_SOURCE_Handler(line); 2784 } 2785 2786 } 2787 2788 /**Parse KEYWODS record of the PDB file.<br> 2789 * A keyword may be split over two lines. whether a keyword ends by the end 2790 * of a line or it is aplit over two lines, a <code>space</code> is added 2791 * between the 2 lines's contents, unless the first line ends in 2792 * a '-' character. 2793 * <pre> 2794 * Record Format 2795 * COLUMNS DATA TYPE FIELD DEFINITION 2796 * --------------------------------------------------------------------------------- 2797 * 1 - 6 Record name "KEYWDS" 2798 * 9 - 10 Continuation continuation Allows concatenation of records if necessary. 2799 * 11 - 79 List keywds Comma-separated list of keywords relevant 2800 * to the entry. 
2801 * Example 2802 * 1 2 3 4 5 6 7 8 2803 * 12345678901234567890123456789012345678901234567890123456789012345678901234567890 2804 * KEYWDS LYASE, TRICARBOXYLIC ACID CYCLE, MITOCHONDRION, OXIDATIVE 2805 * KEYWDS 2 METABOLISM 2806 * </pre> 2807 * @param lines The KEWODS record lines. 2808 * @author Amr ALHOSSARY 2809 */ 2810 private void handlePDBKeywords(List<String> lines) { 2811 StringBuilder fullList = new StringBuilder(); 2812 for (String line : lines) { 2813 String kwList = line.substring(10).trim(); 2814 if(kwList.length() > 0) { 2815 if(fullList.length() > 0 && fullList.indexOf("-", fullList.length()-1) < 0) { 2816 fullList.append(' '); 2817 } 2818 fullList.append(kwList); 2819 } 2820 } 2821 String fulllengthList = fullList.toString(); 2822 keywords = fulllengthList.split("( )*,( )*"); 2823 ArrayList<String> lst = new ArrayList<String>(keywords.length); 2824 for (String keyword : keywords) { 2825 if(keyword.length() == 0) { 2826 logger.debug("Keyword empty in structure {}", structure.getIdentifier().toString()); 2827 continue; 2828 } 2829 lst.add(keyword); 2830 } 2831 pdbHeader.setKeywords(lst); 2832 } 2833 2834 /** 2835 * Handles creation of all bonds. Looks at LINK records, SSBOND (Disulfide 2836 * bonds), peptide bonds, and intra-residue bonds. 2837 * <p> 2838 * Note: the current implementation only looks at the first model of each 2839 * structure. This may need to be fixed in the future. 2840 */ 2841 private void formBonds() { 2842 2843 BondMaker maker = new BondMaker(structure, params); 2844 2845 // LINK records should be preserved, they are the way that 2846 // inter-residue bonds are created for ligands such as trisaccharides, unusual polymers. 2847 // The analogy in mmCIF is the _struct_conn record. 
2848 for (LinkRecord linkRecord : linkRecords) { 2849 maker.formLinkRecordBond(linkRecord); 2850 } 2851 2852 maker.formDisulfideBonds(ssbonds); 2853 2854 maker.makeBonds(); 2855 } 2856 2857 2858 2859 private void triggerEndFileChecks(){ 2860 2861 // we need to add the last chain and model, checking for nulls (e.g. the file could be completely empty of ATOM lines) 2862 if (currentChain!=null && currentGroup!=null) { 2863 currentChain.addGroup(currentGroup); 2864 } 2865 if (currentModel!=null && currentChain!=null) { 2866 currentModel.add(currentChain); 2867 } 2868 if (currentModel!=null) { 2869 allModels.add(currentModel); 2870 } 2871 2872 if (blankChainIdsPresent) { 2873 // from biojava 5.0 there's limited support for old pdb files with blank chain ids 2874 logger.warn("Found some blank chain ids in PDB file. Please note that support for them has been discontinued and things might not work properly."); 2875 } 2876 2877 // reordering chains following the mmcif model and assigning entities 2878 assignChainsAndEntities(); 2879 structure.setEntityInfos(entities); 2880 2881 2882 2883 // header data 2884 2885 Date modDate = pdbHeader.getModDate(); 2886 if ( modDate.equals(new Date(0)) ) { 2887 // modification date = deposition date 2888 Date depositionDate = pdbHeader.getDepDate(); 2889 2890 if (! depositionDate.equals(modDate)){ 2891 // depDate is 0000-00-00 2892 pdbHeader.setModDate(depositionDate); 2893 } 2894 } 2895 2896 structure.setPDBHeader(pdbHeader); 2897 structure.setCrystallographicInfo(crystallographicInfo); 2898 2899 //set the JournalArticle, if there is one 2900 if (!journalLines.isEmpty()) { 2901 buildjournalArticle(); 2902 pdbHeader.setJournalArticle(journalArticle); 2903 } 2904 2905 structure.setDBRefs(dbrefs); 2906 2907 // Only align if requested (default) and not when headerOnly mode with no Atoms. 2908 // Otherwise, we store the empty SeqRes Groups unchanged in the right chains. 
2909 if ( params.isAlignSeqRes() && !params.isHeaderOnly() && !seqResChains.isEmpty()){ 2910 logger.debug("Parsing mode align_seqres, will parse SEQRES and align to ATOM sequence"); 2911 SeqRes2AtomAligner aligner = new SeqRes2AtomAligner(); 2912 aligner.align(structure,seqResChains); 2913 2914 } else { 2915 logger.debug("Parsing mode unalign_seqres, will parse SEQRES but not align it to ATOM sequence"); 2916 SeqRes2AtomAligner.storeUnAlignedSeqRes(structure, seqResChains, params.isHeaderOnly()); 2917 } 2918 2919 2920 2921 //associate the temporary Groups in the siteMap to the ones 2922 if (!params.isHeaderOnly()) { 2923 // Only can link SITES if Atom Groups were parsed. 2924 linkSitesToGroups(); // will work now that setSites is called 2925 } 2926 2927 if ( bioAssemblyParser != null){ 2928 bioAssemblyParser.setMacromolecularSizes(); 2929 pdbHeader.setBioAssemblies(bioAssemblyParser.getTransformationMap()); 2930 } 2931 2932 if (ncsOperators !=null && ncsOperators.size()>0) { 2933 crystallographicInfo.setNcsOperators( 2934 ncsOperators.toArray(new Matrix4d[ncsOperators.size()])); 2935 } 2936 2937 2938 // rfree end file check 2939 // Rfree annotation is not very consistent in PDB format, it varies depending on the software 2940 // Here we follow this strategy: 2941 // a) take the '(NO CUTOFF)' value if the only one available (shelx software, e.g. 1x7q) 2942 // b) don't take it if also a line without '(NO CUTOFF)' is present (CNX software, e.g. 
3lak) 2943 2944 if (rfreeNoCutoffLine>0 && rfreeStandardLine<0) { 2945 pdbHeader.setRfree(rfreeNoCutoffLine); 2946 } else if (rfreeNoCutoffLine>0 && rfreeStandardLine>0) { 2947 pdbHeader.setRfree(rfreeStandardLine); 2948 } else if (rfreeNoCutoffLine<0 && rfreeStandardLine>0) { 2949 pdbHeader.setRfree(rfreeStandardLine); 2950 } // otherwise it remains default value: PDBHeader.DEFAULT_RFREE 2951 2952 2953 2954 } 2955 2956 private void setSecStruc(){ 2957 2958 setSecElement(helixList, SecStrucInfo.PDB_AUTHOR_ASSIGNMENT, 2959 SecStrucType.helix4); 2960 setSecElement(strandList, SecStrucInfo.PDB_AUTHOR_ASSIGNMENT, 2961 SecStrucType.extended); 2962 setSecElement(turnList, SecStrucInfo.PDB_AUTHOR_ASSIGNMENT, 2963 SecStrucType.turn); 2964 2965 //Now insert random coil to the Groups that did not have SS information 2966 GroupIterator gi = new GroupIterator(structure); 2967 while (gi.hasNext()){ 2968 Group g = gi.next(); 2969 if (g.hasAminoAtoms()){ 2970 if (g.getProperty(Group.SEC_STRUC) == null){ 2971 SecStrucInfo ss = new SecStrucInfo(g, 2972 SecStrucInfo.PDB_AUTHOR_ASSIGNMENT, 2973 SecStrucType.coil); 2974 g.setProperty(Group.SEC_STRUC, ss); 2975 } 2976 } 2977 } 2978 2979 } 2980 2981 private void setSecElement(List<Map<String,String>> secList, String assignment, SecStrucType type){ 2982 2983 2984 Iterator<Map<String,String>> iter = secList.iterator(); 2985 nextElement: 2986 while (iter.hasNext()){ 2987 Map<String,String> m = iter.next(); 2988 2989 // assign all residues in this range to this secondary structure type 2990 // String initResName = (String)m.get("initResName"); 2991 String initChainId = m.get("initChainId"); 2992 String initSeqNum = m.get("initSeqNum" ); 2993 String initICode = m.get("initICode" ); 2994 // String endResName = (String)m.get("endResName" ); 2995 String endChainId = m.get("endChainId" ); 2996 String endSeqNum = m.get("endSeqNum"); 2997 String endICode = m.get("endICode"); 2998 2999 if (initICode.equals(" ")) 3000 initICode = ""; 3001 if 
(endICode.equals(" ")) 3002 endICode = ""; 3003 3004 GroupIterator gi = new GroupIterator(structure); 3005 boolean inRange = false; 3006 while (gi.hasNext()){ 3007 Group g = gi.next(); 3008 Chain c = g.getChain(); 3009 3010 if (c.getName().equals(initChainId)){ 3011 3012 String pdbCode = initSeqNum + initICode; 3013 if ( g.getResidueNumber().toString().equals(pdbCode) ) { 3014 inRange = true; 3015 } 3016 } 3017 if ( inRange){ 3018 if (g.hasAminoAtoms()) { 3019 SecStrucInfo ss = new SecStrucInfo(g, assignment, type); 3020 g.setProperty(Group.SEC_STRUC, ss); 3021 } 3022 3023 } 3024 if ( c.getName().equals(endChainId)){ 3025 String pdbCode = endSeqNum + endICode; 3026 if (pdbCode.equals(g.getResidueNumber().toString())){ 3027 inRange = false; 3028 continue nextElement; 3029 } 3030 } 3031 } 3032 } 3033 } 3034 3035 /** 3036 * Gets all chains with given chainName from given models list 3037 * @param chainName 3038 * @param polyModels 3039 * @return 3040 */ 3041 private static List<List<Chain>> findChains(String chainName, List<List<Chain>> polyModels) { 3042 List<List<Chain>> models = new ArrayList<>(); 3043 3044 for (List<Chain> chains:polyModels) { 3045 List<Chain> matchingChains = new ArrayList<>(); 3046 models.add(matchingChains); 3047 for (Chain c:chains) { 3048 if (c.getName().equals(chainName)) { 3049 matchingChains.add(c); 3050 } 3051 } 3052 } 3053 return models; 3054 } 3055 3056 /** 3057 * Split the given chain (containing non-polymer groups and water groups only) 3058 * into individual chains per non-polymer group and individual chains per contiguous sets of water groups. 
3059 * @param chain 3060 * @return a list of lists of size 2: first list is the split non-poly chains, second list is the split water chains 3061 */ 3062 private static List<List<Chain>> splitNonPolyChain(Chain chain) { 3063 List<Chain> splitNonPolys = new ArrayList<>(); 3064 List<Chain> waterChains = new ArrayList<>(); 3065 3066 Chain split = null; 3067 boolean previousGroupIsWater = false; 3068 3069 for (Group g:chain.getAtomGroups()){ 3070 3071 if (!previousGroupIsWater) { 3072 // add last one if there's one 3073 if (split!=null) { 3074 splitNonPolys.add(split); 3075 } 3076 split = new ChainImpl(); 3077 split.setName(chain.getName()); 3078 } else if (!g.isWater()) { 3079 // previous group is water and this group is not water: we change from a water chain to a non-poly 3080 // we'll need to add now the water chain to the list of water chains 3081 waterChains.add(split); 3082 split = new ChainImpl(); 3083 split.setName(chain.getName()); 3084 } 3085 3086 if (g.isWater()) { 3087 previousGroupIsWater = true; 3088 } else { 3089 previousGroupIsWater = false; 3090 3091 } 3092 3093 // this should include alt locs (referenced from the main group) 3094 split.addGroup(g); 3095 3096 } 3097 3098 // adding the last split chain: either to water or non-poly depending on what was the last seen group 3099 if (split!=null) { 3100 if (previousGroupIsWater) 3101 waterChains.add(split); 3102 else 3103 splitNonPolys.add(split); 3104 } 3105 3106 3107 List<List<Chain>> all = new ArrayList<>(2); 3108 all.add(splitNonPolys); 3109 all.add(waterChains); 3110 3111 return all; 3112 } 3113 3114 /** 3115 * Assign asym ids following the rules used by the PDB to assign asym ids in mmCIF files 3116 * @param polys 3117 * @param nonPolys 3118 * @param waters 3119 */ 3120 private void assignAsymIds(List<List<Chain>> polys, List<List<Chain>> nonPolys, List<List<Chain>> waters) { 3121 3122 for (int i=0; i<polys.size(); i++) { 3123 String asymId = "A"; 3124 3125 for (Chain poly:polys.get(i)) { 3126 
poly.setId(asymId); 3127 asymId = getNextAsymId(asymId); 3128 } 3129 for (Chain nonPoly:nonPolys.get(i)) { 3130 nonPoly.setId(asymId); 3131 asymId = getNextAsymId(asymId); 3132 } 3133 for (Chain water:waters.get(i)) { 3134 water.setId(asymId); 3135 asymId = getNextAsymId(asymId); 3136 } 3137 } 3138 } 3139 3140 /** 3141 * Gets the next asym id given an asymId, according to the convention followed by 3142 * mmCIF files produced by the PDB 3143 * i.e.: A,B,...,Z,AA,BA,CA,...,ZA,AB,BB,CB,...,ZB,.......,ZZ,AAA,BAA,CAA,... 3144 * @param asymId 3145 * @return 3146 */ 3147 private String getNextAsymId(String asymId) { 3148 if (asymId.length()==1) { 3149 if (!asymId.equals("Z")) { 3150 return Character.toString(getNextChar(asymId.charAt(0))); 3151 } else { 3152 return "AA"; 3153 } 3154 } else if (asymId.length()==2) { 3155 if (asymId.equals("ZZ")) { 3156 return "AAA"; 3157 } 3158 char[] c = new char[2]; 3159 asymId.getChars(0, 2, c, 0); 3160 c[0] = getNextChar(c[0]); 3161 if (c[0]=='A') { 3162 c[1] = getNextChar(c[1]); 3163 } 3164 return new String(c); 3165 } else if (asymId.length()==3) { 3166 char[] c = new char[3]; 3167 asymId.getChars(0, 3, c, 0); 3168 c[0] = getNextChar(c[0]); 3169 if (c[0]=='A') { 3170 c[1] = getNextChar(c[1]); 3171 if (c[1]=='A') { 3172 c[2] = getNextChar(c[2]); 3173 } 3174 } 3175 return new String(c); 3176 } 3177 return null; 3178 } 3179 3180 private char getNextChar(char c) { 3181 if (c!='Z') { 3182 return ((char)(c+1)); 3183 } else { 3184 return 'A'; 3185 } 3186 } 3187 3188 /** 3189 * Here we assign chains following the mmCIF data model: 3190 * one chain per polymer, one chain per non-polymer group and 3191 * several water chains. 
3192 * <p> 3193 * Subsequently we assign entities for them: either from those read from 3194 * COMPOUND records or from those found heuristically through {@link EntityFinder} 3195 * 3196 */ 3197 private void assignChainsAndEntities(){ 3198 3199 List<List<Chain>> polyModels = new ArrayList<>(); 3200 List<List<Chain>> nonPolyModels = new ArrayList<>(); 3201 List<List<Chain>> waterModels = new ArrayList<>(); 3202 3203 for (List<Chain> model:allModels) { 3204 3205 List<Chain> polyChains = new ArrayList<>(); 3206 List<Chain> nonPolyChains = new ArrayList<>(); 3207 List<Chain> waterChains = new ArrayList<>(); 3208 3209 polyModels.add(polyChains); 3210 nonPolyModels.add(nonPolyChains); 3211 waterModels.add(waterChains); 3212 3213 for (Chain c:model) { 3214 3215 // we only have entities for polymeric chains, all others are ignored for assigning entities 3216 if (c.isWaterOnly()) { 3217 waterChains.add(c); 3218 3219 } else if (c.isPureNonPolymer()) { 3220 nonPolyChains.add(c); 3221 3222 } else { 3223 polyChains.add(c); 3224 } 3225 } 3226 } 3227 3228 List<List<Chain>> splitNonPolyModels = new ArrayList<>(); 3229 for (int i=0; i<nonPolyModels.size(); i++) { 3230 List<Chain> nonPolyModel = nonPolyModels.get(i); 3231 List<Chain> waterModel = waterModels.get(i); 3232 3233 List<Chain> splitNonPolys = new ArrayList<>(); 3234 splitNonPolyModels.add(splitNonPolys); 3235 3236 for (Chain nonPoly:nonPolyModel) { 3237 List<List<Chain>> splits = splitNonPolyChain(nonPoly); 3238 splitNonPolys.addAll(splits.get(0)); 3239 waterModel.addAll(splits.get(1)); 3240 } 3241 } 3242 3243 3244 // now we have all chains as in mmcif, let's assign ids following the mmcif rules 3245 assignAsymIds(polyModels, splitNonPolyModels, waterModels); 3246 3247 3248 if (!entities.isEmpty()) { 3249 // if the file contained COMPOUND records then we can assign entities to the poly chains 3250 for (EntityInfo comp : entities){ 3251 List<String> chainIds = compoundMolIds2chainIds.get(comp.getMolId()); 3252 if ( 
chainIds == null) 3253 continue; 3254 for ( String chainId : chainIds) { 3255 3256 List<List<Chain>> models = findChains(chainId, polyModels); 3257 3258 for (List<Chain> matchingChains:models) { 3259 for (Chain chain:matchingChains) { 3260 comp.addChain(chain); 3261 chain.setEntityInfo(comp); 3262 } 3263 3264 if (matchingChains.isEmpty()) { 3265 // usually if this happens something is wrong with the PDB header 3266 // e.g. 2brd - there is no Chain A, although it is specified in the header 3267 // Some bona-fide cases exist, e.g. 2ja5, chain N is described in SEQRES 3268 // but the authors didn't observe in the density so it's completely missing 3269 // from the ATOM lines 3270 logger.warn("Could not find polymeric chain {} to link to entity {}. The chain will be missing in the entity.", chainId, comp.getMolId()); 3271 } 3272 } 3273 } 3274 } 3275 3276 } else { 3277 3278 logger.info("Entity information (COMPOUND record) not found in file. Will assign entities heuristically"); 3279 // if no entity information was present in file we then go and find the entities heuristically with EntityFinder 3280 entities = EntityFinder.findPolyEntities(polyModels); 3281 3282 } 3283 3284 // now we assign entities to the nonpoly and water chains 3285 EntityFinder.createPurelyNonPolyEntities(splitNonPolyModels, waterModels, entities); 3286 3287 3288 // in some rare cases purely non-polymer or purely water chain are present in pdb files 3289 // see https://github.com/biojava/biojava/pull/394 3290 // these case should be covered by the above 3291 3292 3293 // now that we have entities in chains we add the chains to the structure 3294 3295 for (int i=0;i<allModels.size();i++) { 3296 List<Chain> model = new ArrayList<>(); 3297 model.addAll(polyModels.get(i)); 3298 model.addAll(splitNonPolyModels.get(i)); 3299 model.addAll(waterModels.get(i)); 3300 structure.addModel(model); 3301 } 3302 3303 3304 } 3305 3306 /** 3307 * Links the Sites in the siteMap to the Groups in the Structure via the 
3308 * siteToResidueMap ResidueNumber. 3309 * @author Jules Jacobsen 3310 * @return 3311 */ 3312 private void linkSitesToGroups() { 3313 3314 //System.out.println("LINK SITES TO GROUPS:" + siteToResidueMap.keySet().size()); 3315 3316 //link the map of siteIds : <ResidueNumber> with the sites by using ResidueNumber to get the correct group back. 3317 //the return list 3318 3319 if ( siteMap == null || siteToResidueMap == null){ 3320 logger.info("Sites can not be linked to residues!"); 3321 3322 return; 3323 } 3324 3325 List<Site> sites = null; 3326 //check that there are chains with which to associate the groups 3327 if (structure.getChains().isEmpty()) { 3328 sites = new ArrayList<Site>(siteMap.values()); 3329 logger.info("No chains to link Site Groups with - Sites will not be present in the Structure"); 3330 return; 3331 } 3332 3333 //check that the keys in the siteMap and SiteToResidueMap are equal 3334 if (! siteMap.keySet().equals(siteToResidueMap.keySet())) { 3335 logger.info("Not all sites have been properly described in the PDB " + pdbId + " header - some Sites will not be present in the Structure"); 3336 logger.debug(siteMap.keySet() + " | " + siteToResidueMap.keySet()); 3337 //return; 3338 } 3339 3340 //so we have chains - associate the siteResidues-related groups with the ones 3341 //already in in the chains 3342 for (String key : siteMap.keySet()) { 3343 Site currentSite = siteMap.get(key); 3344 List<ResidueNumber> linkedGroups = siteToResidueMap.get(key); 3345 if ( linkedGroups == null) 3346 continue; 3347 for (ResidueNumber residueNumber : linkedGroups) { 3348 3349 String pdbCode = residueNumber.toString(); 3350 String chain = residueNumber.getChainName(); 3351 // System.out.println("chain: '" + chain + "'"); 3352 // String resNum = resNum.getSeqNum().toString(); 3353 // System.out.println("resNum: '" + resNum + "'"); 3354 3355 Group linkedGroup = null; 3356 try { 3357 //TODO: implement findGroup(ResidueNumber resNum) 3358 linkedGroup = 
structure.findGroup(chain, pdbCode); 3359 } catch (StructureException ex) { 3360 logger.info("Can't find group " + pdbCode + " in chain " + chain + " in order to link up SITE records (PDB ID " + pdbId +")"); 3361 continue; 3362 } 3363 3364 // System.out.println("Adding group: " + linkedGroup.getSeqNum() + " to site " + site.getSiteID()); 3365 currentSite.getGroups().add(linkedGroup); 3366 } 3367 } 3368 3369 //System.out.println("SITEMAP: " + siteMap); 3370 3371 sites = new ArrayList<Site>(siteMap.values()); 3372 structure.setSites(sites); 3373 //System.out.println("STRUCTURE SITES: " + structure.getSites().size()); 3374 // for (Site site : structure.getSites()) { 3375 // System.out.println(site); 3376 // } 3377 // System.out.println("Linked Site Groups with Chains"); 3378 3379 } 3380 3381 private void buildjournalArticle() { 3382 3383 logger.debug("building new JournalArticle"); 3384 // for (String line : journalLines) { 3385 // System.out.println(line); 3386 // } 3387 3388 this.journalArticle = new JournalArticle(); 3389 // JRNL AUTH M.HAMMEL,G.SFYROERA,D.RICKLIN,P.MAGOTTI, 3390 // JRNL AUTH 2 J.D.LAMBRIS,B.V.GEISBRECHT 3391 // JRNL TITL A STRUCTURAL BASIS FOR COMPLEMENT INHIBITION BY 3392 // JRNL TITL 2 STAPHYLOCOCCUS AUREUS. 3393 // JRNL REF NAT.IMMUNOL. V. 
8 430 2007 3394 // JRNL REFN ISSN 1529-2908 3395 // JRNL PMID 17351618 3396 // JRNL DOI 10.1038/NI1450 3397 StringBuffer auth = new StringBuffer(); 3398 StringBuffer titl = new StringBuffer(); 3399 StringBuffer edit = new StringBuffer(); 3400 StringBuffer ref = new StringBuffer(); 3401 StringBuffer publ = new StringBuffer(); 3402 StringBuffer refn = new StringBuffer(); 3403 StringBuffer pmid = new StringBuffer(); 3404 StringBuffer doi = new StringBuffer(); 3405 3406 for (String line : journalLines) { 3407 if ( line.length() < 19 ) { 3408 logger.info("can not process Journal line: " + line); 3409 continue; 3410 } 3411 // System.out.println("'" + line + "'"); 3412 String subField = line.substring(12, 16); 3413 // System.out.println("'" + subField + "'"); 3414 if (subField.equals("AUTH")) { 3415 auth.append(line.substring(19, line.length()).trim()); 3416 3417 logger.debug("AUTH '" + auth.toString() + "'"); 3418 3419 } 3420 if (subField.equals("TITL")) { 3421 //add a space to the end of a line so that when wrapped the 3422 //words on the join won't be concatenated 3423 titl.append(line.substring(19, line.length()).trim()).append(" "); 3424 3425 logger.debug("TITL '" + titl.toString() + "'"); 3426 3427 } 3428 if (subField.equals("EDIT")) { 3429 edit.append(line.substring(19, line.length()).trim()); 3430 3431 logger.debug("EDIT '" + edit.toString() + "'"); 3432 3433 } 3434 // JRNL REF NAT.IMMUNOL. V. 
8 430 2007 3435 if (subField.equals("REF ")) { 3436 ref.append(line.substring(19, line.length()).trim()).append(" "); 3437 3438 logger.debug("REF '" + ref.toString() + "'"); 3439 3440 } 3441 if (subField.equals("PUBL")) { 3442 publ.append(line.substring(19, line.length()).trim()).append(" "); 3443 3444 logger.debug("PUBL '" + publ.toString() + "'"); 3445 3446 } 3447 // JRNL REFN ISSN 1529-2908 3448 if (subField.equals("REFN")) { 3449 if ( line.length() < 35 ) { 3450 logger.info("can not process Journal REFN line: " + line); 3451 continue; 3452 } 3453 refn.append(line.substring(35, line.length()).trim()); 3454 3455 logger.debug("REFN '" + refn.toString() + "'"); 3456 3457 } 3458 // JRNL PMID 17351618 3459 if (subField.equals("PMID")) { 3460 pmid.append(line.substring(19, line.length()).trim()); 3461 3462 logger.debug("PMID '" + pmid.toString() + "'"); 3463 3464 } 3465 // JRNL DOI 10.1038/NI1450 3466 if (subField.equals("DOI ")) { 3467 doi.append(line.substring(19, line.length()).trim()); 3468 3469 logger.debug("DOI '" + doi.toString() + "'"); 3470 3471 } 3472 } 3473 3474 //now set the parts of the JournalArticle 3475 journalArticle.setAuthorList(authorBuilder(auth.toString())); 3476 journalArticle.setEditorList(authorBuilder(edit.toString())); 3477 journalArticle.setRef(ref.toString()); 3478 JournalParser journalParser = new JournalParser(ref.toString()); 3479 journalArticle.setJournalName(journalParser.getJournalName()); 3480 if (!journalArticle.getJournalName().equals("TO BE PUBLISHED")) { 3481 journalArticle.setIsPublished(true); 3482 } 3483 journalArticle.setVolume(journalParser.getVolume()); 3484 journalArticle.setStartPage(journalParser.getStartPage()); 3485 journalArticle.setPublicationDate(journalParser.getPublicationDate()); 3486 journalArticle.setPublisher(publ.toString().trim()); 3487 journalArticle.setTitle(titl.toString().trim()); 3488 journalArticle.setRefn(refn.toString().trim()); 3489 journalArticle.setPmid(pmid.toString().trim()); 3490 
journalArticle.setDoi(doi.toString().trim()); 3491 3492 3493 logger.debug("Made JournalArticle:"); 3494 logger.debug(journalArticle.toString()); 3495 3496 } 3497 3498 //inner class to deal with all the journal info 3499 private class JournalParser { 3500 3501 private String journalName; 3502 private String volume; 3503 private String startPage; 3504 private int publicationDate; 3505 3506 3507 public JournalParser(String ref) { 3508 3509 logger.debug("JournalParser init '" + ref + "'"); 3510 3511 3512 if (ref.equals("TO BE PUBLISHED ")) { 3513 journalName = ref.trim(); 3514 3515 logger.debug(String.format("JournalParser found journalString '%s'", journalName)); 3516 3517 return; 3518 } 3519 3520 if (ref.length() < 48) { 3521 logger.info("REF line too short - must be at least 48 characters to be valid for parsing."); 3522 journalName = ""; 3523 volume = ""; 3524 startPage = ""; 3525 publicationDate = 0; 3526 return; 3527 } 3528 //can be multi line: 3529 //REF PHILOS.TRANS.R.SOC.LONDON, V. 293 53 1981 3530 //REF 2 SER.B 3531 3532 //or 3533 3534 //REF GLYCOGEN PHOSPHORYLASE B: 1 1991 3535 //REF 2 DESCRIPTION OF THE PROTEIN 3536 //REF 3 STRUCTURE 3537 3538 //but usually single line 3539 //REF NUCLEIC ACIDS RES. 2009 3540 //REF MOL.CELL 2009 3541 //REF NAT.STRUCT.MOL.BIOL. V. 16 238 2009 3542 //REF ACTA CRYSTALLOGR.,SECT.F V. 65 199 2009 3543 //check if the date is present at the end of the line. 3544 // 09876543210987654321 3545 //'J.BIOL.CHEM. V. 280 23000 2005 ' 3546 //'J.AM.CHEM.SOC. V. 130 16011 2008 ' 3547 //'NAT.STRUCT.MOL.BIOL. V. 16 238 2009' 3548 String volumeInformation = ref.substring(30, 48); 3549 3550 logger.debug(String.format("Parsing volumeInformation: '%s'", volumeInformation)); 3551 3552 //volumeInformation: 'V. 
293 53 1981 ' 3553 // String dateString = ref.substring(ref.length() - 5 , ref.length() - 1).trim(); 3554 // String startPageString = ref.substring(ref.length() - 11 , ref.length() - 6).trim(); 3555 // String volumeString = ref.substring(ref.length() - 16 , ref.length() - 12).trim(); 3556 // String journalString = ref.substring(0 , ref.length() - 18).trim(); 3557 String dateString = volumeInformation.substring(volumeInformation.length() - 5 , volumeInformation.length() - 1).trim(); 3558 String startPageString = volumeInformation.substring(volumeInformation.length() - 11 , volumeInformation.length() - 6).trim(); 3559 String volumeString = volumeInformation.substring(volumeInformation.length() - 16 , volumeInformation.length() - 12).trim(); 3560 //for the journal string we need to remove the volume information which might be in the middle of the string (e.g. 1gpb, 3pfk) 3561 String journalString = ref.substring(0 , 29).trim() + " " + ref.substring(30, ref.length() - 1).replace(volumeInformation.trim(), "").trim(); 3562 journalString = journalString.trim(); 3563 // System.out.println("journalString: " + journalString); 3564 3565 logger.debug(String.format("JournalParser found volumeString '%s'", volumeString)); 3566 logger.debug(String.format("JournalParser found startPageString '%s'", startPageString)); 3567 logger.debug(String.format("JournalParser found dateString '%s'", dateString)); 3568 logger.debug(String.format("JournalParser found journalString '%s'", journalString)); 3569 3570 3571 if (!dateString.equals(" ")) { 3572 try { 3573 publicationDate = Integer.valueOf(dateString); 3574 } catch (NumberFormatException nfe) { 3575 logger.info(dateString + " is not a valid integer for a date in JRNL sub-section REF line 1"); 3576 } 3577 // if (DEBUG) { 3578 // System.out.println("JournalParser set date " + publicationDate); 3579 // } 3580 } 3581 3582 if (!startPageString.equals(" ")) { 3583 startPage = startPageString; 3584 // if (DEBUG) { 3585 // 
System.out.println("JournalParser set startPage " + startPage); 3586 // } 3587 } 3588 3589 if (!volumeString.equals(" ")) { 3590 volume = volumeString; 3591 // if (DEBUG) { 3592 // System.out.println("JournalParser set volume " + volume); 3593 // } 3594 } 3595 3596 if (!journalString.equals(" ")) { 3597 journalName = journalString; 3598 3599 logger.debug("JournalParser set journalName " + journalName); 3600 3601 } 3602 } 3603 3604 private String getJournalName() { 3605 return journalName; 3606 } 3607 3608 private int getPublicationDate() { 3609 return publicationDate; 3610 } 3611 3612 private String getStartPage() { 3613 return startPage; 3614 } 3615 3616 private String getVolume() { 3617 return volume; 3618 } 3619 } 3620 3621 private List<Author> authorBuilder(String authorString) { 3622 ArrayList<Author> authorList = new ArrayList<Author>(); 3623 3624 if (authorString.equals("")) { 3625 return authorList; 3626 } 3627 3628 String[] authors = authorString.split(","); 3629 // if (DEBUG) { 3630 // for (int i = 0; i < authors.length; i++) { 3631 // String string = authors[i]; 3632 // System.out.println("authorBuilder author: '" + string + "'"); 3633 // } 3634 // } 3635 // AUTH SEATTLE STRUCTURAL GENOMICS CENTER FOR INFECTIOUS 3636 // AUTH 2 DISEASE (SSGCID) 3637 // or 3638 // AUTH E.DOBROVETSKY,A.DONG,A.SEITOVA,B.DUNCAN,L.CROMBET, 3639 // AUTH 2 M.SUNDSTROM,C.H.ARROWSMITH,A.M.EDWARDS,C.BOUNTRA, 3640 // AUTH 3 A.BOCHKAREV,D.COSSAR, 3641 // AUTH 4 STRUCTURAL GENOMICS CONSORTIUM (SGC) 3642 // or 3643 // AUTH T.-C.MOU,S.R.SPRANG,N.MASADA,D.M.F.COOPER 3644 if (authors.length == 1) { 3645 //only one element means it's a consortium only 3646 Author author = new Author(); 3647 author.setSurname(authors[0]); 3648 3649 logger.debug("Set consortium author name " + author.getSurname()); 3650 3651 authorList.add(author); 3652 } else { 3653 for (int i = 0; i < authors.length; i++) { 3654 String authorFullName = authors[i]; 3655 3656 logger.debug("Building author " + 
authorFullName); 3657 3658 Author author = new Author(); 3659 String regex = "\\."; 3660 String[] authorNames = authorFullName.split(regex); 3661 // if (DEBUG) { 3662 // System.out.println("authorNames size " + authorNames.length); 3663 // for (int j = 0; j < authorNames.length; j++) { 3664 // String name = authorNames[j]; 3665 // System.out.println("split authName '" + name + "'"); 3666 // 3667 // } 3668 // } 3669 if (authorNames.length == 0) { 3670 author.setSurname(authorFullName); 3671 3672 logger.debug("Unable to split using '" + regex + "' Setting whole name " + author.getSurname()); 3673 3674 } 3675 //again there might be a consortium name so there may be no elements 3676 else if (authorNames.length == 1) { 3677 author.setSurname(authorNames[0]); 3678 3679 logger.debug("Set consortium author name in multiple author block " + author.getSurname 3680 ()); 3681 3682 } else { 3683 String initials = ""; 3684 for (int j = 0; j < authorNames.length - 1; j++) { 3685 String initial = authorNames[j]; 3686 // if (DEBUG) { 3687 // System.out.println("adding initial '" + initial + "'"); 3688 // } 3689 //build the initials back up again 3690 initials += initial + "."; 3691 } 3692 3693 logger.debug("built initials '" + initials + "'"); 3694 3695 author.setInitials(initials); 3696 //surname is always last 3697 int lastName = authorNames.length - 1; 3698 String surname = authorNames[lastName]; 3699 3700 logger.debug("built author surname " + surname); 3701 3702 author.setSurname(surname); 3703 3704 } 3705 authorList.add(author); 3706 } 3707 } 3708 return authorList; 3709 } 3710 3711 public void setFileParsingParameters(FileParsingParameters params) 3712 { 3713 this.params= params; 3714 3715 // set the correct max values for parsing... 3716 loadMaxAtoms = params.getMaxAtoms(); 3717 atomCAThreshold = params.getAtomCaThreshold(); 3718 3719 3720 } 3721 3722 public FileParsingParameters getFileParsingParameters(){ 3723 return params; 3724 } 3725 3726 3727}