001/* 002 * 003 * This code may be freely distributed and modified under the 004 * terms of the GNU Lesser General Public Licence. This should 005 * be distributed with the code. If you do not have a copy, 006 * see: 007 * 008 * http://www.gnu.org/copyleft/lesser.html 009 * 010 * Copyright for this code is held jointly by the individual 011 * authors. These should be listed in @author doc comments. 012 * 013 * For more information on the BioJava project and its aims, 014 * or to join the biojava-l mailing list, visit the home page 015 * at: 016 * 017 * http://www.biojava.org/ 018 * 019 * Created on 16.03.2004 020 * 021 */ 022package org.biojava.nbio.structure.io; 023 024import static java.lang.Math.min; 025 026import java.io.BufferedReader; 027import java.io.IOException; 028import java.io.InputStream; 029import java.io.InputStreamReader; 030import java.text.DateFormat; 031import java.text.ParseException; 032import java.text.SimpleDateFormat; 033import java.util.ArrayList; 034import java.util.Arrays; 035import java.util.Collections; 036import java.util.Comparator; 037import java.util.Date; 038import java.util.HashMap; 039import java.util.Iterator; 040import java.util.LinkedHashMap; 041import java.util.List; 042import java.util.Locale; 043import java.util.Map; 044import java.util.StringTokenizer; 045import java.util.regex.Matcher; 046import java.util.regex.Pattern; 047 048import javax.vecmath.Matrix4d; 049 050import org.biojava.nbio.structure.AminoAcid; 051import org.biojava.nbio.structure.AminoAcidImpl; 052import org.biojava.nbio.structure.Atom; 053import org.biojava.nbio.structure.AtomImpl; 054import org.biojava.nbio.structure.Author; 055import org.biojava.nbio.structure.Chain; 056import org.biojava.nbio.structure.ChainImpl; 057import org.biojava.nbio.structure.Compound; 058import org.biojava.nbio.structure.DBRef; 059import org.biojava.nbio.structure.Element; 060import org.biojava.nbio.structure.Group; 061import org.biojava.nbio.structure.GroupIterator; 062import 
org.biojava.nbio.structure.GroupType; 063import org.biojava.nbio.structure.HetatomImpl; 064import org.biojava.nbio.structure.JournalArticle; 065import org.biojava.nbio.structure.NucleotideImpl; 066import org.biojava.nbio.structure.PDBCrystallographicInfo; 067import org.biojava.nbio.structure.PDBHeader; 068import org.biojava.nbio.structure.ResidueNumber; 069import org.biojava.nbio.structure.Site; 070import org.biojava.nbio.structure.Structure; 071import org.biojava.nbio.structure.StructureException; 072import org.biojava.nbio.structure.StructureImpl; 073import org.biojava.nbio.structure.StructureTools; 074import org.biojava.nbio.structure.io.mmcif.ChemCompGroupFactory; 075import org.biojava.nbio.structure.io.util.PDBTemporaryStorageUtils.LinkRecord; 076import org.biojava.nbio.structure.secstruc.SecStrucInfo; 077import org.biojava.nbio.structure.secstruc.SecStrucType; 078import org.biojava.nbio.structure.xtal.CrystalCell; 079import org.biojava.nbio.structure.xtal.SpaceGroup; 080import org.biojava.nbio.structure.xtal.SymoplibParser; 081import org.slf4j.Logger; 082import org.slf4j.LoggerFactory; 083 084 085 086/** 087 * This class implements the actual PDB file parsing. Do not access it directly, but 088 * via the PDBFileReader class. 089 * 090 * <h2>Parsing</h2> 091 * 092 * During the PDB file parsing several flags can be set. See the {@link #setFileParsingParameters(FileParsingParameters)} method. 093 * 094 * 095 * <p> 096 * To prevent excessive memory usage for large PDB files, there is the ATOM_CA_THRESHOLD. 097 * If more Atoms than this threshold are being parsed in a PDB file, the parser will automatically 098 * switch to a C-alpha only representation. 099 * </p> 100 * 101 * <p> 102 * The result of the parsing of the PDB file is a new {@link Structure} object. 
103 * </p> 104 * 105 * 106 * For more documentation on how to work with the Structure API please 107 * see <a href="http://biojava.org/wiki/BioJava:CookBook#Protein_Structure" target="_top"> 108 * http://biojava.org/wiki/BioJava:CookBook#Protein_Structure</a> 109 * 110 * 111 * 112 * 113 * <h2>Example</h2> 114 * <p> 115 * Q: How can I get a Structure object from a PDB file? 116 * </p> 117 * <p> 118 * A: 119 * <pre> 120 * public {@link Structure} loadStructure(String pathToPDBFile){ 121 * // The PDBFileParser is wrapped by the PDBFileReader 122 * {@link PDBFileReader} pdbreader = new {@link PDBFileReader}(); 123 * 124 * {@link Structure} structure = null; 125 * try{ 126 * structure = pdbreader.getStructure(pathToPDBFile); 127 * System.out.println(structure); 128 * } catch (IOException e) { 129 * e.printStackTrace(); 130 * } 131 * return structure; 132 * } 133 * </pre> 134 * 135 * 136 * @author Andreas Prlic 137 * @author Jules Jacobsen 138 * @author Jose Duarte 139 * @since 1.4 140 */ 141public class PDBFileParser { 142 143 144 145 private static final Logger logger = LoggerFactory.getLogger(PDBFileParser.class); 146 147 // for printing 148 private static final String NEWLINE = System.getProperty("line.separator"); 149 150 151 // required for parsing: 152 private String pdbId; //the actual id of the entry 153 private Structure structure; 154 private List<Chain> current_model; // contains the ATOM records for each model 155 private Chain current_chain; 156 private Group current_group; 157 158 private List<Chain> seqResChains; // contains all the chains for the SEQRES records 159 //we're going to work on the assumption that the files are current - 160 //if the pdb_HEADER_Handler detects a legacy format, this will be changed to true. 
161 //if true then lines will be truncated at 72 characters in certain cases 162 //(pdb_COMPOUND_handler for example) 163 private boolean isLegacyFormat = false; 164 165 166 // for re-creating the biological assembly 167 168 private PDBBioAssemblyParser bioAssemblyParser = null; 169 170 private PDBHeader pdbHeader; 171 private PDBCrystallographicInfo crystallographicInfo; 172 private JournalArticle journalArticle; 173 private List<Map<String, Integer>> connects ; 174 private List<Map<String,String>> helixList; 175 private List<Map<String,String>> strandList; 176 private List<Map<String,String>> turnList; 177 178 private int lengthCheck ; 179 180 private boolean isLastCompndLine = false; 181 private boolean isLastSourceLine = false; 182 private Compound current_compound; 183 private List<Compound> compounds = new ArrayList<Compound>(); 184 private HashMap<Integer,List<String>> compoundMolIds2chainIds = new HashMap<Integer, List<String>>(); 185 private List<String> compndLines = new ArrayList<String>(); 186 private List<String> sourceLines = new ArrayList<String>(); 187 private List<String> journalLines = new ArrayList<String>(); 188 private List<DBRef> dbrefs; 189 private Map<String, Site> siteMap = new LinkedHashMap<String, Site>(); 190 private Map<String, List<ResidueNumber>> siteToResidueMap = new LinkedHashMap<String, List<ResidueNumber>>(); 191 192 private List<SSBondImpl> ssbonds = new ArrayList<>(); 193 194 private Matrix4d currentNcsOp; 195 private List<Matrix4d> ncsOperators; 196 197 // for storing LINK until we have all the atoms parsed 198 private List<LinkRecord> linkRecords; 199 200 // for parsing COMPOUND and SOURCE Header lines 201 private int prevMolId; 202 private String previousContinuationField; 203 private String continuationField; 204 private String continuationString; 205 206 private DateFormat dateFormat; 207 208 // for rfree parsing 209 private float rfreeStandardLine = -1; 210 private float rfreeNoCutoffLine = -1; 211 212 private static 
final List<String> compndFieldValues = new ArrayList<String>( 213 Arrays.asList( 214 "MOL_ID:", "MOLECULE:", "CHAIN:", "SYNONYM:", 215 "EC:", "FRAGMENT:", "ENGINEERED:", "MUTATION:", 216 "BIOLOGICAL_UNIT:", "OTHER_DETAILS:" 217 )); 218 219 220 private static final List<String> ignoreCompndFieldValues = new ArrayList<String>( 221 Arrays.asList( 222 "HETEROGEN:","ENGINEEREED:","FRAGMENT,", 223 "MUTANT:","SYNTHETIC:" 224 )); 225 // ENGINEEREED in pdb219d 226 227 private static final List<String> sourceFieldValues = new ArrayList<String>( 228 Arrays.asList("ENGINEERED:", "MOL_ID:", "SYNTHETIC:", "FRAGMENT:", 229 "ORGANISM_SCIENTIFIC:", "ORGANISM_COMMON:", 230 "ORGANISM_TAXID:","STRAIN:", 231 "VARIANT:", "CELL_LINE:", "ATCC:", "ORGAN:", "TISSUE:", 232 "CELL:", "ORGANELLE:", "SECRETION:", "GENE:", 233 "CELLULAR_LOCATION:", "EXPRESSION_SYSTEM:", 234 "EXPRESSION_SYSTEM_TAXID:", 235 "EXPRESSION_SYSTEM_STRAIN:", "EXPRESSION_SYSTEM_VARIANT:", 236 "EXPRESSION_SYSTEM_CELL_LINE:", 237 "EXPRESSION_SYSTEM_ATCC_NUMBER:", 238 "EXPRESSION_SYSTEM_ORGAN:", "EXPRESSION_SYSTEM_TISSUE:", 239 "EXPRESSION_SYSTEM_CELL:", "EXPRESSION_SYSTEM_ORGANELLE:", 240 "EXPRESSION_SYSTEM_CELLULAR_LOCATION:", 241 "EXPRESSION_SYSTEM_VECTOR_TYPE:", 242 "EXPRESSION_SYSTEM_VECTOR:", "EXPRESSION_SYSTEM_PLASMID:", 243 "EXPRESSION_SYSTEM_GENE:", "OTHER_DETAILS:")); 244 245 private int atomCount; 246 247 // parsing options: 248 249 private int my_ATOM_CA_THRESHOLD ; 250 251 private int load_max_atoms; 252 253 private boolean atomOverflow; 254 255 /** flag to tell parser to only read Calpha coordinates **/ 256 private boolean parseCAonly; 257 258 259 private FileParsingParameters params; 260 261 public PDBFileParser() { 262 params = new FileParsingParameters(); 263 264 structure = null ; 265 current_model = new ArrayList<Chain>(); 266 current_chain = null ; 267 current_group = null ; 268 pdbHeader = new PDBHeader(); 269 crystallographicInfo = new PDBCrystallographicInfo(); 270 connects = new 
ArrayList<Map<String,Integer>>() ; 271 272 273 helixList = new ArrayList<Map<String,String>>(); 274 strandList = new ArrayList<Map<String,String>>(); 275 turnList = new ArrayList<Map<String,String>>(); 276 current_compound = null; 277 dbrefs = new ArrayList<DBRef>(); 278 siteMap = null; 279 dateFormat = new SimpleDateFormat("dd-MMM-yy", Locale.US); 280 atomCount = 0; 281 atomOverflow = false; 282 parseCAonly = false; 283 284 // this SHOULD not be done 285 // DONOT:setFileParsingParameters(params); 286 // set the correct max values for parsing... 287 load_max_atoms = params.getMaxAtoms(); 288 my_ATOM_CA_THRESHOLD = params.getAtomCaThreshold(); 289 290 linkRecords = new ArrayList<LinkRecord>(); 291 } 292 293 /** initiate new resNum, either Hetatom, Nucleotide, or AminoAcid */ 294 private Group getNewGroup(String recordName,Character aminoCode1, String aminoCode3) { 295 296 Group g = ChemCompGroupFactory.getGroupFromChemCompDictionary(aminoCode3); 297 if ( g != null && !g.getChemComp().isEmpty()) 298 return g; 299 300 301 Group group; 302 if (aminoCode1 == null || StructureTools.UNKNOWN_GROUP_LABEL == aminoCode1 ){ 303 group = new HetatomImpl(); 304 305 } else if(StructureTools.isNucleotide(aminoCode3)) { 306 // it is a nucleotide 307 NucleotideImpl nu = new NucleotideImpl(); 308 group = nu; 309 310 } else { 311 AminoAcidImpl aa = new AminoAcidImpl() ; 312 aa.setAminoType(aminoCode1); 313 group = aa ; 314 } 315 316 // System.out.println("new resNum type: "+ resNum.getType() ); 317 return group ; 318 } 319 320 321 322 // Handler methods to deal with PDB file records properly. 323 /** 324 Handler for 325 HEADER Record Format 326 327 COLUMNS DATA TYPE FIELD DEFINITION 328 ---------------------------------------------------------------------------------- 329 1 - 6 Record name "HEADER" 330 11 - 50 String(40) classification Classifies the molecule(s) 331 51 - 59 Date depDate Deposition date. 
This is the date 332 the coordinates were received by 333 the PDB 334 63 - 66 IDcode idCode This identifier is unique within PDB 335 336 */ 337 private void pdb_HEADER_Handler(String line) { 338 //System.out.println(line); 339 340 String classification = null; 341 String deposition_date = null; 342 String pdbCode = null; 343 344 int len = line.trim().length(); 345 if(len > 10) { 346 classification = line.substring (10, min(len,50)).trim() ; 347 pdbHeader.setClassification(classification); 348 } 349 if(len > 50) { 350 deposition_date = line.substring (50, min(len,59)).trim() ; 351 try { 352 Date dep = dateFormat.parse(deposition_date); 353 pdbHeader.setDepDate(dep); 354 355 } catch (ParseException e){ 356 logger.info("Could not parse deposition date string '"+deposition_date+"'. Will continue without deposition date"); 357 } 358 } 359 if(len > 62) { 360 pdbCode = line.substring (62, min(len,66)).trim() ; 361 pdbId = pdbCode; 362 363 logger.debug("Parsing entry " + pdbId); 364 365 366 structure.setPDBCode(pdbCode); 367 pdbHeader.setIdCode(pdbCode); 368 } 369 370 //*really* old files (you'll need to hunt to find these as they 371 //should have been remediated) have headers like below. Plus the 372 //pdbId at positions 72-76 is present in every line 373 374 //HEADER PROTEINASE INHIBITOR (TRYPSIN) 05-OCT-84 5PTI 5PTI 3 375 //HEADER TRANSFERASE (ACYLTRANSFERASE) 02-SEP-92 1LAC 1LAC 2 376 if (len > 66) { 377 if (pdbId.equals(line.substring (72, 76))){ 378 isLegacyFormat = true; 379 System.out.println(pdbId + " is a LEGACY entry - this will most likely not parse correctly."); 380 } 381 } 382 383 } 384 385 386 /** parses the following record: 387 * <pre> 388 * COLUMNS DATA TYPE FIELD DEFINITION 389 * ------------------------------------------------------------------------------------ 390 * 1 - 6 Record name "AUTHOR" 391 * 9 - 10 Continuation continuation Allows concatenation of multiple records. 
392 * 11 - 79 List authorList List of the author names, separated 393 * by commas. 394 * 395 * </pre> 396 * @param line 397 */ 398 private void pdb_AUTHOR_Handler(String line) { 399 400 String authors = line.substring(10).trim(); 401 402 String auth = pdbHeader.getAuthors(); 403 if (auth == null){ 404 pdbHeader.setAuthors(authors); 405 } else { 406 auth += authors; 407 pdbHeader.setAuthors(auth); 408 } 409 410 } 411 412 413 414 /** parses the following record: 415 * 416 * <pre> 417 * COLUMNS DATA TYPE FIELD DEFINITION 418 * -------------------------------------------------------------------- 419 * 1 - 6 Record name "HELIX " 420 * 8 - 10 Integer serNum Serial number of the helix. 421 * This starts at 1 and increases 422 * incrementally. 423 * 12 - 14 LString(3) helixID Helix identifier. In addition 424 * to a serial number, each helix is 425 * given an alphanumeric character 426 * helix identifier. 427 * 16 - 18 Residue name initResName Name of the initial residue. 428 * 20 Character initChainID Chain identifier for the chain 429 * containing this helix. 430 * 22 - 25 Integer initSeqNum Sequence number of the initial 431 * residue. 432 * 26 AChar initICode Insertion code of the initial 433 * residue. 434 * 28 - 30 Residue name endResName Name of the terminal residue of 435 * the helix. 436 * 32 Character endChainID Chain identifier for the chain 437 * containing this helix. 438 * 34 - 37 Integer endSeqNum Sequence number of the terminal 439 * residue. 440 * 38 AChar endICode Insertion code of the terminal 441 * residue. 442 * 39 - 40 Integer helixClass Helix class (see below). 443 * 41 - 70 String comment Comment about this helix. 444 * 72 - 76 Integer length Length of this helix. 445 * </pre> 446 */ 447 448 private void pdb_HELIX_Handler(String line){ 449 450 if (params.isHeaderOnly()) return; 451 452 if (line.length()<38) { 453 logger.info("HELIX line has length under 38. 
Ignoring it."); 454 return; 455 } 456 457 String initResName = line.substring(15,18).trim(); 458 String initChainId = line.substring(19,20); 459 String initSeqNum = line.substring(21,25).trim(); 460 String initICode = line.substring(25,26); 461 String endResName = line.substring(27,30).trim(); 462 String endChainId = line.substring(31,32); 463 String endSeqNum = line.substring(33,37).trim(); 464 String endICode = line.substring(37,38); 465 466 //System.out.println(initResName + " " + initChainId + " " + initSeqNum + " " + initICode + " " + 467 // endResName + " " + endChainId + " " + endSeqNum + " " + endICode); 468 469 Map<String,String> m = new HashMap<String,String>(); 470 471 m.put("initResName",initResName); 472 m.put("initChainId", initChainId); 473 m.put("initSeqNum", initSeqNum); 474 m.put("initICode", initICode); 475 m.put("endResName", endResName); 476 m.put("endChainId", endChainId); 477 m.put("endSeqNum",endSeqNum); 478 m.put("endICode",endICode); 479 480 helixList.add(m); 481 482 } 483 484 /** 485 * Handler for 486 * <pre> 487 * COLUMNS DATA TYPE FIELD DEFINITION 488 * -------------------------------------------------------------- 489 * 1 - 6 Record name "SHEET " 490 * 8 - 10 Integer strand Strand number which starts at 1 491 * for each strand within a sheet 492 * and increases by one. 493 * 12 - 14 LString(3) sheetID Sheet identifier. 494 * 15 - 16 Integer numStrands Number of strands in sheet. 495 * 18 - 20 Residue name initResName Residue name of initial residue. 496 * 22 Character initChainID Chain identifier of initial 497 * residue in strand. 498 * 23 - 26 Integer initSeqNum Sequence number of initial 499 * residue in strand. 500 * 27 AChar initICode Insertion code of initial residue 501 * in strand. 502 * 29 - 31 Residue name endResName Residue name of terminal residue. 503 * 33 Character endChainID Chain identifier of terminal 504 * residue. 505 * 34 - 37 Integer endSeqNum Sequence number of terminal 506 * residue. 
507 * 38 AChar endICode Insertion code of terminal 508 * residue. 509 * 39 - 40 Integer sense Sense of strand with respect to 510 * previous strand in the sheet. 0 511 * if first strand, 1 if parallel, 512 * -1 if anti-parallel. 513 * 42 - 45 Atom curAtom Registration. Atom name in 514 * current strand. 515 * 46 - 48 Residue name curResName Registration. Residue name in 516 * current strand. 517 * 50 Character curChainId Registration. Chain identifier in 518 * current strand. 519 * 51 - 54 Integer curResSeq Registration. Residue sequence 520 * number in current strand. 521 * 55 AChar curICode Registration. Insertion code in 522 * current strand. 523 * 57 - 60 Atom prevAtom Registration. Atom name in 524 * previous strand. 525 * 61 - 63 Residue name prevResName Registration. Residue name in 526 * previous strand. 527 * 65 Character prevChainId Registration. Chain identifier in 528 * previous strand. 529 * 66 - 69 Integer prevResSeq Registration. Residue sequence 530 * number in previous strand. 531 * 70 AChar prevICode Registration. Insertion code in 532 * previous strand. 533 * </pre> 534 */ 535 private void pdb_SHEET_Handler( String line){ 536 537 if (params.isHeaderOnly()) return; 538 539 if (line.length()<38) { 540 logger.info("SHEET line has length under 38. 
Ignoring it."); 541 return; 542 } 543 544 String initResName = line.substring(17,20).trim(); 545 String initChainId = line.substring(21,22); 546 String initSeqNum = line.substring(22,26).trim(); 547 String initICode = line.substring(26,27); 548 String endResName = line.substring(28,31).trim(); 549 String endChainId = line.substring(32,33); 550 String endSeqNum = line.substring(33,37).trim(); 551 String endICode = line.substring(37,38); 552 553 //System.out.println(initResName + " " + initChainId + " " + initSeqNum + " " + initICode + " " + 554 // endResName + " " + endChainId + " " + endSeqNum + " " + endICode); 555 556 Map<String,String> m = new HashMap<String,String>(); 557 558 m.put("initResName",initResName); 559 m.put("initChainId", initChainId); 560 m.put("initSeqNum", initSeqNum); 561 m.put("initICode", initICode); 562 m.put("endResName", endResName); 563 m.put("endChainId", endChainId); 564 m.put("endSeqNum",endSeqNum); 565 m.put("endICode",endICode); 566 567 strandList.add(m); 568 } 569 570 571 /** 572 * Handler for TURN lines 573 * <pre> 574 * COLUMNS DATA TYPE FIELD DEFINITION 575 * -------------------------------------------------------------------- 576 * 1 - 6 Record name "TURN " 577 * 8 - 10 Integer seq Turn number; starts with 1 and 578 * increments by one. 579 * 12 - 14 LString(3) turnId Turn identifier 580 * 16 - 18 Residue name initResName Residue name of initial residue in 581 * turn. 582 * 20 Character initChainId Chain identifier for the chain 583 * containing this turn. 584 * 21 - 24 Integer initSeqNum Sequence number of initial residue 585 * in turn. 586 * 25 AChar initICode Insertion code of initial residue 587 * in turn. 588 * 27 - 29 Residue name endResName Residue name of terminal residue 589 * of turn. 590 * 31 Character endChainId Chain identifier for the chain 591 * containing this turn. 592 * 32 - 35 Integer endSeqNum Sequence number of terminal 593 * residue of turn. 
594 * 36 AChar endICode Insertion code of terminal residue 595 * of turn. 596 * 41 - 70 String comment Associated comment. 597 * </pre> 598 * @param line 599 */ 600 private void pdb_TURN_Handler( String line){ 601 602 if (params.isHeaderOnly()) return; 603 604 if (line.length()<36) { 605 logger.info("TURN line has length under 36. Ignoring it."); 606 return; 607 } 608 609 String initResName = line.substring(15,18).trim(); 610 String initChainId = line.substring(19,20); 611 String initSeqNum = line.substring(20,24).trim(); 612 String initICode = line.substring(24,25); 613 String endResName = line.substring(26,29).trim(); 614 String endChainId = line.substring(30,31); 615 String endSeqNum = line.substring(31,35).trim(); 616 String endICode = line.substring(35,36); 617 618 //System.out.println(initResName + " " + initChainId + " " + initSeqNum + " " + initICode + " " + 619 // endResName + " " + endChainId + " " + endSeqNum + " " + endICode); 620 621 Map<String,String> m = new HashMap<String,String>(); 622 623 m.put("initResName",initResName); 624 m.put("initChainId", initChainId); 625 m.put("initSeqNum", initSeqNum); 626 m.put("initICode", initICode); 627 m.put("endResName", endResName); 628 m.put("endChainId", endChainId); 629 m.put("endSeqNum",endSeqNum); 630 m.put("endICode",endICode); 631 632 turnList.add(m); 633 } 634 635 /** 636 * Handler for 637 * REVDAT Record format: 638 * 639 * COLUMNS DATA TYPE FIELD DEFINITION 640 * ---------------------------------------------------------------------------------- 641 * 1 - 6 Record name "REVDAT" 642 * 8 - 10 Integer modNum Modification number. 643 * 11 - 12 Continuation continuation Allows concatenation of multiple 644 * records. 645 * 14 - 22 Date modDate Date of modification (or release for 646 * new entries). This is not repeated 647 * on continuation lines. 648 * 24 - 28 String(5) modId Identifies this particular 649 * modification. It links to the 650 * archive used internally by PDB. 
651 * This is not repeated on continuation 652 * lines. 653 * 32 Integer modType An integer identifying the type of 654 * modification. In case of revisions 655 * with more than one possible modType, 656 * the highest value applicable will be 657 * assigned. 658 * 40 - 45 LString(6) record Name of the modified record. 659 * 47 - 52 LString(6) record Name of the modified record. 660 * 54 - 59 LString(6) record Name of the modified record. 661 * 61 - 66 LString(6) record Name of the modified record. 662 */ 663 private void pdb_REVDAT_Handler(String line) { 664 665 // only keep the first... 666 Date modDate = pdbHeader.getModDate(); 667 668 if ( modDate==null || modDate.equals(new Date(0)) ) { 669 // modDate is still uninitialized 670 String modificationDate = line.substring (13, 22).trim() ; 671 672 try { 673 Date dep = dateFormat.parse(modificationDate); 674 pdbHeader.setModDate(dep); 675 } catch (ParseException e){ 676 logger.info("Could not parse modification date string '"+modificationDate+"'. Will continue without modification date"); 677 } 678 679 } 680 } 681 682 /** @author Jules Jacobsen 683 * Handler for 684 * SEQRES record format 685 * SEQRES records contain the amino acid or nucleic acid sequence of residues in each chain of the macromolecule that was studied. 686 * <p/> 687 * Record Format 688 * <p/> 689 * COLUMNS DATA TYPE FIELD DEFINITION 690 * --------------------------------------------------------------------------------- 691 * 1 - 6 Record name "SEQRES" 692 * <p/> 693 * 9 - 10 Integer serNum Serial number of the SEQRES record 694 * for the current chain. Starts at 1 695 * and increments by one each line. 696 * Reset to 1 for each chain. 697 * <p/> 698 * 12 Character chainID Chain identifier. This may be any 699 * single legal character, including a 700 * blank which is used if there is 701 * only one chain. 702 * <p/> 703 * 14 - 17 Integer numRes Number of residues in the chain. 704 * This value is repeated on every 705 * record. 
706 * <p/> 707 * 20 - 22 Residue name resName Residue name. 708 * <p/> 709 * 24 - 26 Residue name resName Residue name. 710 * <p/> 711 * 28 - 30 Residue name resName Residue name. 712 * <p/> 713 * 32 - 34 Residue name resName Residue name. 714 * <p/> 715 * 36 - 38 Residue name resName Residue name. 716 * <p/> 717 * 40 - 42 Residue name resName Residue name. 718 * <p/> 719 * 44 - 46 Residue name resName Residue name. 720 * <p/> 721 * 48 - 50 Residue name resName Residue name. 722 * <p/> 723 * 52 - 54 Residue name resName Residue name. 724 * <p/> 725 * 56 - 58 Residue name resName Residue name. 726 * <p/> 727 * 60 - 62 Residue name resName Residue name. 728 * <p/> 729 * 64 - 66 Residue name resName Residue name. 730 * <p/> 731 * 68 - 70 Residue name resName Residue name. 732 */ 733 private void pdb_SEQRES_Handler(String line) { 734 735 /* 736 * 1 2 3 4 5 6 7 737 * 1234567890123456789012345678901234567890123456789012345678901234567890 738 * SEQRES 1 A 376 LYS PRO VAL THR VAL LYS LEU VAL ASP SER GLN ALA THR 739 * SEQRES 1 A 21 GLY ILE VAL GLU GLN CYS CYS THR SER ILE CYS SER LEU 740 * SEQRES 2 A 21 TYR GLN LEU GLU ASN TYR CYS ASN 741 * SEQRES 1 B 30 PHE VAL ASN GLN HIS LEU CYS GLY SER HIS LEU VAL GLU 742 * SEQRES 2 B 30 ALA LEU TYR LEU VAL CYS GLY GLU ARG GLY PHE PHE TYR 743 * SEQRES 3 B 30 THR PRO LYS ALA 744 * SEQRES 1 C 21 GLY ILE VAL GLU GLN CYS CYS THR SER ILE CYS SER LEU 745 * SEQRES 2 C 21 TYR GLN LEU GLU ASN TYR CYS ASN 746 * SEQRES 1 D 30 PHE VAL ASN GLN HIS LEU CYS GLY SER HIS LEU VAL GLU 747 * SEQRES 2 D 30 ALA LEU TYR LEU VAL CYS GLY GLU ARG GLY PHE PHE TYR 748 * SEQRES 3 D 30 THR PRO LYS ALA 749 */ 750 751 String recordName = line.substring(0, 6).trim(); 752 String chainID = line.substring(11, 12); 753 String newLength = line.substring(13,17).trim(); 754 String subSequence = line.substring(18); 755 756 if ( lengthCheck == -1 ){ 757 lengthCheck = Integer.parseInt(newLength); 758 } 759 760 StringTokenizer subSequenceResidues = new 
StringTokenizer(subSequence); 761 762 Character aminoCode1 = null; 763 if (! recordName.equals(AminoAcid.SEQRESRECORD)) { 764 // should not have been called 765 return; 766 } 767 768 current_chain = isKnownChain(chainID, seqResChains); 769 if ( current_chain == null) { 770 771 current_chain = new ChainImpl(); 772 current_chain.setChainID(chainID); 773 774 } 775 776 while (subSequenceResidues.hasMoreTokens()) { 777 778 String threeLetter = subSequenceResidues.nextToken(); 779 780 aminoCode1 = StructureTools.get1LetterCode(threeLetter); 781 782 //if (aminoCode1 == null) { 783 // could be a nucleotide... 784 // but getNewGroup takes care of that and converts ATOM records with aminoCode1 == nnull to nucleotide... 785 //} 786 current_group = getNewGroup("ATOM", aminoCode1, threeLetter); 787 788 current_group.setPDBName(threeLetter); 789 790 if ( current_group instanceof AminoAcid){ 791 AminoAcid aa = (AminoAcid)current_group; 792 aa.setRecordType(AminoAcid.SEQRESRECORD); 793 } 794 // add the current resNum to the new chain. 795 current_chain.addGroup(current_group); 796 797 } 798 Chain test = isKnownChain(chainID, seqResChains); 799 800 if ( test == null) 801 seqResChains.add(current_chain); 802 803 if (current_group != null) 804 current_group.trimToSize(); 805 806 current_group = null; 807 current_chain = null; 808 809 // the current chain is finished! 810 //if ( current_chain.getLength() != lengthCheck ){ 811 // System.err.println("the length of chain " + current_chain.getName() + "(" + 812 // current_chain.getLength() + ") does not match the expected " + lengthCheck); 813 //} 814 815 lengthCheck = Integer.parseInt(newLength); 816 817 } 818 819 820 821 /** Handler for 822 TITLE Record Format 823 824 COLUMNS DATA TYPE FIELD DEFINITION 825 ---------------------------------------------------------------------------------- 826 1 - 6 Record name "TITLE " 827 9 - 10 Continuation continuation Allows concatenation of multiple 828 records. 
829 11 - 70 String title Title of the experiment. 830 831 832 */ 833 private void pdb_TITLE_Handler(String line) { 834 String title; 835 if ( line.length() > 79) 836 title = line.substring(10,80).trim(); 837 else 838 title = line.substring(10,line.length()).trim(); 839 840 String t = pdbHeader.getTitle(); 841 if ( (t != null) && (! t.equals("")) ){ 842 if (t.endsWith("-")) 843 t += ""; // if last line ends with a hyphen then we don't add space 844 else 845 t += " "; 846 } 847 else t = ""; 848 849 t += title; 850 851 pdbHeader.setTitle(t); 852 } 853 854 /** 855 * JRNL handler. 856 * The JRNL record contains the primary literature citation that describes the experiment which resulted 857 * in the deposited coordinate set. There is at most one JRNL reference per entry. If there is no primary 858 * reference, then there is no JRNL reference. Other references are given in REMARK 1. 859 * 860 * Record Format 861 * 862 * COLUMNS DATA TYPE FIELD DEFINITION 863 * ----------------------------------------------------------------------- 864 * 1 - 6 Record name "JRNL " 865 * 866 * 13 - 70 LString text See Details below. 867 * 868 */ 869 private void pdb_JRNL_Handler(String line) { 870 //add the strings to the journalLines 871 //the actual JournalArticle is then built when the whole entry is being 872 //finalized with triggerEndFileChecks() 873 //JRNL TITL NMR SOLUTION STRUCTURE OF RECOMBINANT TICK 1TAP 10 874 if (line.substring(line.length() - 8, line.length() - 4).equals(pdbId)) { 875 //trim off the trailing PDB id from legacy files. 876 //are we really trying to still cater for these museum pieces? 877 878 logger.debug("trimming legacy PDB id from end of JRNL section line"); 879 880 line = line.substring(0, line.length() - 8); 881 journalLines.add(line); 882 } else { 883 journalLines.add(line); 884 } 885 } 886 887 /** 888 * This should not be accessed directly, other than by </code>makeCompounds</code>. 
It still deals with the same 889 * lines in a similar manner but if not accessed from </code>makeCompounds</code> the last element will be 890 * missing. Don't say I didn't warn you. 891 * 892 * @param line 893 */ 894 private void pdb_COMPND_Handler(String line) { 895 896 logger.debug("previousContinuationField is " 897 + previousContinuationField); 898 logger.debug("current continuationField is " 899 + continuationField); 900 logger.debug("current continuationString is " 901 + continuationString); 902 logger.debug("current compound is " 903 + current_compound); 904 905 906 // In legacy PDB files the line ends with the PDB code and a serial number, chop those off! 907 //format version 3.0 onwards will have 80 characters in a line 908 // if (line.length() > 72) { 909 if (isLegacyFormat) { 910 // if (DEBUG) { 911 // System.out.println("We have a legacy file - truncating line length to 71 characters:"); 912 // System.out.println(line); 913 // } 914 line = line.substring(0, 72); 915 } 916 917 line = line.substring(10, line.length()); 918 919 920 String[] fieldList = line.trim().split("\\s+"); 921 int fl = fieldList.length; 922 if ((fl >0 ) && compndFieldValues.contains(fieldList[0])) { 923 924 continuationField = fieldList[0]; 925 if (previousContinuationField.equals("")) { 926 previousContinuationField = continuationField; 927 } 928 929 } else if (fl>0) { 930 // the ':' character indicates the end of a field name and should be invalid as part the first data token 931 // e.g. obsolete file 1hhb has a malformed COMPND line that can only be caught with this kind of check 932 if (fieldList[0].contains(":") ) { 933 logger.info("COMPND line does not follow the PDB 3.0 format. 
Note that COMPND parsing is not supported any longer in format 2.3 or earlier"); 934 return; 935 } 936 937 } else { 938 939 // the line will be added as data to the previous field 940 } 941 942 line = line.replace(continuationField, "").trim(); 943 944 StringTokenizer compndTokens = new StringTokenizer(line); 945 946 // System.out.println("PDBFileParser.pdb_COMPND_Handler: Tokenizing '" + line + "'"); 947 948 while (compndTokens.hasMoreTokens()) { 949 String token = compndTokens.nextToken(); 950 951 if (previousContinuationField.equals("")) { 952 previousContinuationField = continuationField; 953 } 954 955 if (previousContinuationField.equals(continuationField) 956 && compndFieldValues.contains(continuationField)) { 957 958 logger.debug("Still in field " + continuationField); 959 logger.debug("token = " + token); 960 961 continuationString = continuationString.concat(token + " "); 962 963 logger.debug("continuationString = " 964 + continuationString); 965 966 } 967 if (!continuationField.equals(previousContinuationField)) { 968 969 if (continuationString.equals("")) { 970 continuationString = token; 971 972 } else { 973 974 compndValueSetter(previousContinuationField, 975 continuationString); 976 previousContinuationField = continuationField; 977 continuationString = token + " "; 978 } 979 } else if (ignoreCompndFieldValues.contains(token)) { 980 // this field shall be ignored 981 //continuationField = token; 982 } 983 } 984 if (isLastCompndLine) { 985 // final line in the section - finish off the compound 986 // System.out.println("[pdb_COMPND_Handler] Final COMPND line - Finishing off final MolID header."); 987 compndValueSetter(continuationField, continuationString); 988 continuationString = ""; 989 if (current_compound!=null) compounds.add(current_compound); 990 } 991 } 992 993 /** 994 * Set the value in the currrent molId object 995 * @param field 996 * @param value 997 */ 998 private void compndValueSetter(String field, String value) { 999 1000 value = 
value.trim().replace(";", ""); 1001 if (field.equals("MOL_ID:")) { 1002 1003 int i = -1; 1004 try { 1005 i = Integer.valueOf(value); 1006 } catch (NumberFormatException e){ 1007 logger.warn("Value '{}' does not look like a number, while trying to parse COMPND MOL_ID line.",value); 1008 } 1009 if (i>0 && prevMolId!=i) { 1010 1011 if (current_compound!=null) compounds.add(current_compound); 1012 1013 logger.debug("Initialising new Compound with mol_id {}", i); 1014 1015 current_compound = new Compound(); 1016 1017 current_compound.setMolId(i); 1018 1019 prevMolId = i; 1020 } 1021 1022 } 1023 1024 // if for some reason (e.g. missing mol_id line) the current_compound is null we can't add anything to it, return 1025 if (current_compound==null) { 1026 return; 1027 } 1028 1029 if (field.equals("MOLECULE:")) { 1030 current_compound.setMolName(value); 1031 1032 } 1033 if (field.equals("CHAIN:")) { 1034 //System.out.println(value); 1035 StringTokenizer chainTokens = new StringTokenizer(value, ","); 1036 List<String> chains = new ArrayList<String>(); 1037 1038 while (chainTokens.hasMoreTokens()) { 1039 String chainID = chainTokens.nextToken().trim(); 1040 // NULL is used in old PDB files to represent empty chain DI 1041 if (chainID.equals("NULL")) 1042 chainID = " "; 1043 chains.add(chainID); 1044 } 1045 compoundMolIds2chainIds.put(current_compound.getMolId(),chains); 1046 1047 } 1048 if (field.equals("SYNONYM:")) { 1049 1050 StringTokenizer synonyms = new StringTokenizer(value, ","); 1051 List<String> names = new ArrayList<String>(); 1052 1053 while (synonyms.hasMoreTokens()) { 1054 names.add(synonyms.nextToken()); 1055 1056 current_compound.setSynonyms(names); 1057 } 1058 1059 } 1060 1061 if (field.equals("EC:")) { 1062 1063 StringTokenizer ecNumTokens = new StringTokenizer(value, ","); 1064 List<String> ecNums = new ArrayList<String>(); 1065 1066 while (ecNumTokens.hasMoreTokens()) { 1067 ecNums.add(ecNumTokens.nextToken()); 1068 1069 current_compound.setEcNums(ecNums); 
1070 } 1071 1072 } 1073 if (field.equals("FRAGMENT:")) { 1074 1075 current_compound.setFragment(value); 1076 1077 } 1078 if (field.equals("ENGINEERED:")) { 1079 1080 current_compound.setEngineered(value); 1081 1082 } 1083 if (field.equals("MUTATION:")) { 1084 1085 current_compound.setMutation(value); 1086 1087 } 1088 if (field.equals("BIOLOGICAL_UNIT:")) { 1089 1090 current_compound.setBiologicalUnit(value); 1091 1092 } 1093 if (field.equals("OTHER_DETAILS:")) { 1094 1095 current_compound.setDetails(value); 1096 1097 } 1098 1099 } 1100 1101 1102 /** Handler for 1103 * SOURCE Record format 1104 * 1105 * The SOURCE record specifies the biological and/or chemical source of each biological molecule in the entry. Sources are described by both the common name and the scientific name, e.g., genus and species. Strain and/or cell-line for immortalized cells are given when they help to uniquely identify the biological entity studied. 1106 * Record Format 1107 * 1108 * COLUMNS DATA TYPE FIELD DEFINITION 1109 * ------------------------------------------------------------------------------- 1110 * 1 - 6 Record name "SOURCE" 1111 * 9 - 10 Continuation continuation Allows concatenation of multiple records. 1112 * 11 - 70 Specification srcName Identifies the source of the macromolecule in 1113 * list a token: value format. 1114 * @param line the line to be parsed 1115 */ 1116 private void pdb_SOURCE_Handler(String line) { 1117 // works in the same way as the pdb_COMPND_Handler. 
1118 String continuationNr = line.substring(9, 10).trim(); 1119 1120 1121 1122 logger.debug("current continuationNo is " 1123 + continuationNr); 1124 logger.debug("previousContinuationField is " 1125 + previousContinuationField); 1126 logger.debug("current continuationField is " 1127 + continuationField); 1128 logger.debug("current continuationString is " 1129 + continuationString); 1130 logger.debug("current compound is " 1131 + current_compound); 1132 1133 1134 // following the docs, the last valid character should be 79, chop off the rest 1135 if (line.length() > 79) { 1136 line = line.substring(0, 79); 1137 } 1138 1139 line = line.substring(10, line.length()); 1140 1141 logger.debug("LINE: >" + line + "<"); 1142 1143 String[] fieldList = line.split("\\s+"); 1144 1145 if (!fieldList[0].equals("") 1146 && sourceFieldValues.contains(fieldList[0])) { 1147 // System.out.println("[PDBFileParser.pdb_COMPND_Handler] Setting continuationField to '" + fieldList[0] + "'"); 1148 continuationField = fieldList[0]; 1149 if (previousContinuationField.equals("")) { 1150 previousContinuationField = continuationField; 1151 } 1152 1153 } else if ((fieldList.length > 1) && ( sourceFieldValues.contains(fieldList[1]))) { 1154 // System.out.println("[PDBFileParser.pdb_COMPND_Handler] Setting continuationField to '" + fieldList[1] + "'"); 1155 continuationField = fieldList[1]; 1156 if (previousContinuationField.equals("")) { 1157 previousContinuationField = continuationField; 1158 } 1159 1160 } else { 1161 if (continuationNr.equals("")) { 1162 1163 logger.debug("looks like an old PDB file"); 1164 1165 continuationField = "MOLECULE:"; 1166 if (previousContinuationField.equals("")) { 1167 previousContinuationField = continuationField; 1168 } 1169 } 1170 1171 } 1172 1173 line = line.replace(continuationField, "").trim(); 1174 1175 StringTokenizer compndTokens = new StringTokenizer(line); 1176 1177 // System.out.println("PDBFileParser.pdb_COMPND_Handler: Tokenizing '" + line + "'"); 1178 
1179 while (compndTokens.hasMoreTokens()) { 1180 String token = compndTokens.nextToken(); 1181 1182 if (previousContinuationField.equals("")) { 1183 // System.out.println("previousContinuationField is empty. Setting to : " + continuationField); 1184 previousContinuationField = continuationField; 1185 } 1186 1187 if (previousContinuationField.equals(continuationField) 1188 && sourceFieldValues.contains(continuationField)) { 1189 1190 logger.debug("Still in field " + continuationField); 1191 1192 continuationString = continuationString.concat(token + " "); 1193 1194 logger.debug("continuationString = " 1195 + continuationString); 1196 } 1197 if (!continuationField.equals(previousContinuationField)) { 1198 1199 if (continuationString.equals("")) { 1200 continuationString = token; 1201 1202 } else { 1203 1204 sourceValueSetter(previousContinuationField, 1205 continuationString); 1206 previousContinuationField = continuationField; 1207 continuationString = token + " "; 1208 } 1209 } else if (ignoreCompndFieldValues.contains(token)) { 1210 // this field shall be ignored 1211 //continuationField = token; 1212 } 1213 } 1214 if (isLastSourceLine) { 1215 // final line in the section - finish off the compound 1216 // System.out.println("[pdb_SOURCE_Handler] Final SOURCE line - Finishing off final MolID header."); 1217 sourceValueSetter(continuationField, continuationString); 1218 continuationString = ""; 1219 //compounds.add(current_compound); 1220 } 1221 1222 } 1223 1224 1225 /** set the value in the currrent molId object 1226 * 1227 * @param field 1228 * @param value 1229 */ 1230 private void sourceValueSetter(String field, String value) { 1231 1232 value = value.trim().replace(";", ""); 1233 // System.out.println("[sourceValueSetter] " + field); 1234 if (field.equals("MOL_ID:")) { 1235 1236 try { 1237 current_compound = compounds.get(Integer.valueOf(value) - 1); 1238 } catch (NumberFormatException e){ 1239 logger.info("could not process SOURCE MOL_ID record correctly:" + 
e.getMessage()); 1240 return; 1241 } 1242 1243 1244 // System.out.println("[sourceValueSetter] Fetching compound " + value + " " + current_compound.getMolId()); 1245 1246 } 1247 if (field.equals("SYNTHETIC:")) { 1248 current_compound.setSynthetic(value); 1249 } else if (field.equals("FRAGMENT:")) { 1250 current_compound.setFragment(value); 1251 } else if (field.equals("ORGANISM_SCIENTIFIC:")) { 1252 current_compound.setOrganismScientific(value); 1253 } else if (field.equals("ORGANISM_TAXID:")) { 1254 current_compound.setOrganismTaxId(value); 1255 } else if (field.equals("ORGANISM_COMMON:")) { 1256 current_compound.setOrganismCommon(value); 1257 } else if (field.equals("STRAIN:")) { 1258 current_compound.setStrain(value); 1259 } else if (field.equals("VARIANT:")) { 1260 current_compound.setVariant(value); 1261 } else if (field.equals("CELL_LINE:")) { 1262 current_compound.setCellLine(value); 1263 } else if (field.equals("ATCC:")) { 1264 current_compound.setAtcc(value); 1265 } else if (field.equals("ORGAN:")) { 1266 current_compound.setOrgan(value); 1267 } else if (field.equals("TISSUE:")) { 1268 current_compound.setTissue(value); 1269 } else if (field.equals("CELL:")) { 1270 current_compound.setCell(value); 1271 } else if (field.equals("ORGANELLE:")) { 1272 current_compound.setOrganelle(value); 1273 } else if (field.equals("SECRETION:")) { 1274 current_compound.setSecretion(value); 1275 } else if (field.equals("GENE:")) { 1276 current_compound.setGene(value); 1277 } else if (field.equals("CELLULAR_LOCATION:")) { 1278 current_compound.setCellularLocation(value); 1279 } else if (field.equals("EXPRESSION_SYSTEM:")) { 1280 current_compound.setExpressionSystem(value); 1281 } else if (field.equals("EXPRESSION_SYSTEM_TAXID:")) { 1282 current_compound.setExpressionSystemTaxId(value); 1283 } else if (field.equals("EXPRESSION_SYSTEM_STRAIN:")) { 1284 current_compound.setExpressionSystemStrain(value); 1285 } else if (field.equals("EXPRESSION_SYSTEM_VARIANT:")) { 1286 
current_compound.setExpressionSystemVariant(value); 1287 } else if (field.equals("EXPRESSION_SYSTEM_CELL_LINE:")) { 1288 current_compound.setExpressionSystemCellLine(value); 1289 } else if (field.equals("EXPRESSION_SYSTEM_ATCC_NUMBER:")) { 1290 current_compound.setExpressionSystemAtccNumber(value); 1291 } else if (field.equals("EXPRESSION_SYSTEM_ORGAN:")) { 1292 current_compound.setExpressionSystemOrgan(value); 1293 } else if (field.equals("EXPRESSION_SYSTEM_TISSUE:")) { 1294 current_compound.setExpressionSystemTissue(value); 1295 } else if (field.equals("EXPRESSION_SYSTEM_CELL:")) { 1296 current_compound.setExpressionSystemCell(value); 1297 } else if (field.equals("EXPRESSION_SYSTEM_ORGANELLE:")) { 1298 current_compound.setExpressionSystemOrganelle(value); 1299 } else if (field.equals("EXPRESSION_SYSTEM_CELLULAR_LOCATION:")) { 1300 current_compound.setExpressionSystemCellularLocation(value); 1301 } else if (field.equals("EXPRESSION_SYSTEM_VECTOR_TYPE:")) { 1302 current_compound.setExpressionSystemVectorType(value); 1303 } else if (field.equals("EXPRESSION_SYSTEM_VECTOR:")) { 1304 current_compound.setExpressionSystemVector(value); 1305 } else if (field.equals("EXPRESSION_SYSTEM_PLASMID:")) { 1306 current_compound.setExpressionSystemPlasmid(value); 1307 } else if (field.equals("EXPRESSION_SYSTEM_GENE:")) { 1308 current_compound.setExpressionSystemGene(value); 1309 } else if (field.equals("OTHER_DETAILS:")) { 1310 current_compound.setExpressionSystemOtherDetails(value); 1311 } 1312 1313 } 1314 1315 /** 1316 * Handler for REMARK lines 1317 */ 1318 private void pdb_REMARK_Handler(String line) { 1319 1320 if ( line == null || line.length() < 11) 1321 return; 1322 1323 1324 if (line.startsWith("REMARK 800")) { 1325 pdb_REMARK_800_Handler(line); 1326 1327 } else if ( line.startsWith("REMARK 350")){ 1328 1329 if ( params.isParseBioAssembly()) { 1330 1331 if (bioAssemblyParser == null){ 1332 bioAssemblyParser = new PDBBioAssemblyParser(); 1333 } 1334 1335 
bioAssemblyParser.pdb_REMARK_350_Handler(line); 1336 } 1337 1338 // REMARK 3 (for R free) 1339 // note: if more than 1 value present (occurring in hybrid experimental technique entries, e.g. 3ins, 4n9m) 1340 // then last one encountered will be taken 1341 } else if (line.startsWith("REMARK 3 FREE R VALUE")) { 1342 1343 // Rfree annotation is not very consistent in PDB format, it varies depending on the software 1344 // Here we follow this strategy: 1345 // a) take the '(NO CUTOFF)' value if the only one available (shelx software, e.g. 1x7q) 1346 // b) don't take it if also a line without '(NO CUTOFF)' is present (CNX software, e.g. 3lak) 1347 1348 Pattern pR = Pattern.compile("^REMARK 3 FREE R VALUE\\s+(?:\\(NO CUTOFF\\))?\\s+:\\s+(\\d?\\.\\d+).*"); 1349 Matcher mR = pR.matcher(line); 1350 if (mR.matches()) { 1351 try { 1352 rfreeNoCutoffLine = Float.parseFloat(mR.group(1)); 1353 } catch (NumberFormatException e) { 1354 logger.info("Rfree value "+mR.group(1)+" does not look like a number, will ignore it"); 1355 } 1356 } 1357 pR = Pattern.compile("^REMARK 3 FREE R VALUE\\s+:\\s+(\\d?\\.\\d+).*"); 1358 mR = pR.matcher(line); 1359 if (mR.matches()) { 1360 try { 1361 rfreeStandardLine = Float.parseFloat(mR.group(1)); 1362 } catch (NumberFormatException e) { 1363 logger.info("Rfree value '{}' does not look like a number, will ignore it", mR.group(1)); 1364 } 1365 } 1366 1367 // REMARK 3 RESOLUTION (contains more info than REMARK 2, for instance multiple resolutions in hybrid experimental technique entries) 1368 // note: if more than 1 value present (occurring in hybrid experimental technique entries, e.g. 
3ins, 4n9m) 1369 // then last one encountered will be taken 1370 } else if (line.startsWith("REMARK 3 RESOLUTION RANGE HIGH")){ 1371 Pattern pR = Pattern.compile("^REMARK 3 RESOLUTION RANGE HIGH \\(ANGSTROMS\\) :\\s+(\\d+\\.\\d+).*"); 1372 Matcher mR = pR.matcher(line); 1373 if (mR.matches()) { 1374 try { 1375 float res = Float.parseFloat(mR.group(1)); 1376 if (pdbHeader.getResolution()!=PDBHeader.DEFAULT_RESOLUTION) { 1377 logger.warn("More than 1 resolution value present, will use last one {} and discard previous {} " 1378 ,mR.group(1), String.format("%4.2f",pdbHeader.getResolution())); 1379 } 1380 pdbHeader.setResolution(res); 1381 } catch (NumberFormatException e) { 1382 logger.info("Could not parse resolution '{}', ignoring it",mR.group(1)); 1383 } 1384 } 1385 } 1386 1387 } 1388 1389 1390 1391 1392 1393 1394 /** Handler for 1395 EXPDTA Record Format 1396 1397 COLUMNS DATA TYPE FIELD DEFINITION 1398 ------------------------------------------------------------------------------- 1399 1 - 6 Record name "EXPDTA" 1400 9 - 10 Continuation continuation Allows concatenation of multiple 1401 records. 1402 11 - 70 SList technique The experimental technique(s) with 1403 optional comment describing the 1404 sample or experiment. 1405 1406 allowed techniques are: 1407 ELECTRON DIFFRACTION 1408 FIBER DIFFRACTION 1409 FLUORESCENCE TRANSFER 1410 NEUTRON DIFFRACTION 1411 NMR 1412 THEORETICAL MODEL 1413 X-RAY DIFFRACTION 1414 1415 */ 1416 1417 private void pdb_EXPDTA_Handler(String line) { 1418 1419 String technique ; 1420 if (line.length() > 69) 1421 technique = line.substring (10, 70).trim() ; 1422 else 1423 technique = line.substring(10).trim(); 1424 1425 for (String singleTechnique: technique.split(";\\s+")) { 1426 pdbHeader.setExperimentalTechnique(singleTechnique); 1427 } 1428 1429 1430 } 1431 1432 /** Handler for 1433 * CRYST1 Record Format 1434 * The CRYST1 record presents the unit cell parameters, space group, and Z value. 
1435 * If the entry describes a structure determined by a technique other than X-ray crystallography, 1436 * CRYST1 contains a = b = c = 1.0, alpha = beta = gamma = 90 degrees, space group = P 1, and Z =1. 1437 * 1438 * COLUMNS DATA TYPE FIELD DEFINITION 1439 * ------------------------------------------------------------- 1440 * 1 - 6 Record name "CRYST1" 1441 * 7 - 15 Real(9.3) a a (Angstroms). 1442 * 16 - 24 Real(9.3) b b (Angstroms). 1443 * 25 - 33 Real(9.3) c c (Angstroms). 1444 * 34 - 40 Real(7.2) alpha alpha (degrees). 1445 * 41 - 47 Real(7.2) beta beta (degrees). 1446 * 48 - 54 Real(7.2) gamma gamma (degrees). 1447 * 56 - 66 LString sGroup Space group. 1448 * 67 - 70 Integer z Z value. 1449 * 1450 */ 1451 1452 private void pdb_CRYST1_Handler(String line) { 1453 // for badly formatted files (e.g. phenix-produced ones), there's no z and the min length is 63 1454 if (line.length() < 63) { 1455 logger.warn("CRYST1 record has fewer than 63 columns: will ignore it"); 1456 return; 1457 } 1458 1459 float a; 1460 float b; 1461 float c; 1462 float alpha; 1463 float beta; 1464 float gamma; 1465 String spaceGroup = ""; 1466 1467 try { 1468 a = Float.parseFloat(line.substring(6,15).trim()); 1469 b = Float.parseFloat(line.substring(15,24).trim()); 1470 c = Float.parseFloat(line.substring(24,33).trim()); 1471 alpha = Float.parseFloat(line.substring(33,40).trim()); 1472 beta = Float.parseFloat(line.substring(40,47).trim()); 1473 gamma = Float.parseFloat(line.substring(47,54).trim()); 1474 } catch (NumberFormatException e) { 1475 logger.info("could not parse CRYST1 record ("+e.getMessage()+") from line and ignoring it " + line); 1476 return ; 1477 } 1478 if (line.length()>=66) { 1479 // for well formatted files 1480 spaceGroup = line.substring(55,66).trim(); 1481 } else { 1482 // for not-so-well formatted files, e.g. 
phenix-produced ones: they lack a Z value 1483 spaceGroup = line.substring(55,line.length()).trim(); 1484 } 1485 1486 CrystalCell xtalCell = new CrystalCell(); 1487 xtalCell.setA(a); 1488 xtalCell.setB(b); 1489 xtalCell.setC(c); 1490 xtalCell.setAlpha(alpha); 1491 xtalCell.setBeta(beta); 1492 xtalCell.setGamma(gamma); 1493 1494 if (!xtalCell.isCellReasonable()) { 1495 // If the entry describes a structure determined by a technique other than X-ray crystallography, 1496 // CRYST1 contains a = b = c = 1.0, alpha = beta = gamma = 90 degrees, space group = P 1, and Z =1. 1497 // if so we don't add the crystal cell and it remains null 1498 logger.debug("The crystal cell read from file does not have reasonable dimensions (at least one dimension is below {}), discarding it.", 1499 CrystalCell.MIN_VALID_CELL_SIZE); 1500 } else { 1501 crystallographicInfo.setCrystalCell(xtalCell); 1502 } 1503 1504 SpaceGroup sg = SymoplibParser.getSpaceGroup(spaceGroup); 1505 if (sg==null) { 1506 logger.warn("Space group '"+spaceGroup+"' not recognised as a standard space group"); 1507 crystallographicInfo.setNonStandardSg(true); 1508 } else { 1509 crystallographicInfo.setSpaceGroup(sg); 1510 crystallographicInfo.setNonStandardSg(false); 1511 } 1512 } 1513 1514 /** 1515 * Handler for MTRIXn records. They specify extra NCS operators (usually in virus entries) 1516 * 1517 * See http://www.wwpdb.org/documentation/format33/sect8.html#MTRIXn 1518 * 1519 * COLUMNS DATA TYPE FIELD DEFINITION 1520 * ------------------------------------------------------------- 1521 * 1522 * 1 - 6 Record name "MTRIXn" n=1, 2, or 3 1523 * 8 - 10 Integer serial Serial number. 
1524 * 11 - 20 Real(10.6) m[n][1] Mn1 1525 * 21 - 30 Real(10.6) m[n][2] Mn2 1526 * 31 - 40 Real(10.6) m[n][3] Mn3 1527 * 46 - 55 Real(10.5) v[n] Vn 1528 * 60 Integer iGiven 1 1529 * 1530 * Note that we ignore operators with iGiven==1 1531 * 1532 * @param line 1533 */ 1534 private void pdb_MTRIXn_Handler(String line) { 1535 1536 // don't process incomplete records 1537 if (line.length() < 60) { 1538 logger.info("MTRIXn record has fewer than 60 columns: will ignore it"); 1539 return; 1540 } 1541 1542 1543 try { 1544 1545 int rowIndex = Integer.parseInt(line.substring(5,6)); 1546 double col1Value = Double.parseDouble(line.substring(10,20)); 1547 double col2Value = Double.parseDouble(line.substring(20,30)); 1548 double col3Value = Double.parseDouble(line.substring(30,40)); 1549 double translValue = Double.parseDouble(line.substring(45,55)); 1550 int iGiven = 0; 1551 if (!line.substring(59,60).trim().equals("")) { 1552 iGiven = Integer.parseInt(line.substring(59,60)); 1553 } 1554 1555 if (iGiven == 1) return; 1556 1557 if (ncsOperators==null) { 1558 // we initialise on first pass 1559 ncsOperators = new ArrayList<Matrix4d>(); 1560 } 1561 1562 if (currentNcsOp==null) { 1563 currentNcsOp = new Matrix4d(1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1); // initialised to identity 1564 } 1565 1566 currentNcsOp.setElement(rowIndex-1, 0, col1Value); 1567 currentNcsOp.setElement(rowIndex-1, 1, col2Value); 1568 currentNcsOp.setElement(rowIndex-1, 2, col3Value); 1569 currentNcsOp.setElement(rowIndex-1, 3, translValue); 1570 1571 1572 if (rowIndex==3) { 1573 ncsOperators.add(currentNcsOp); 1574 // we initialise for next matrix to come 1575 currentNcsOp = new Matrix4d(1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1); // initialised to identity 1576 } 1577 1578 } catch (NumberFormatException e) { 1579 logger.info("Could not parse a number in MTRIXn record ("+e.getMessage()+") from line: >" + line+"<"); 1580 } 1581 } 1582 1583 /** 1584 * Decides whether or not a Group is qualified to be added to the 1585 * 
Structure.hetGroups list. If it likes it, it adds it. 1586 * @param group 1587 */ 1588 private void addTohetGroupsDecider(Group group) { 1589 boolean wanted = false; 1590 //these are HET groups, but they are usually less interesting 1591 //than other types 1592 if (group.getPDBName().equals("HOH")) 1593 return; 1594 if (group.getChemComp() == null) { 1595 if (group.getType().equals(GroupType.HETATM)) { 1596 wanted = true; 1597 } 1598 } else if (!group.getChemComp().isStandard()) { 1599 //also want to add modified amino acids e.g. TYS 1600 //these are GroupType.AMINOACID, so we need to check the ChemComp 1601 wanted = true; 1602 } 1603 1604 if (wanted) { 1605 if (! structure.getHetGroups().contains(group)) { 1606 // System.out.println("Added " + group + " to structure.hetgroups"); 1607 structure.getHetGroups().add(group); 1608 } 1609 } 1610 } 1611 1612 /** 1613 Handler for 1614 ATOM Record Format 1615 * 1616 * <pre> 1617 * ATOM 1 N ASP A 15 110.964 24.941 59.191 1.00 83.44 N 1618 * 1619 * COLUMNS DATA TYPE FIELD DEFINITION 1620 * --------------------------------------------------------------------------------- 1621 * 1 - 6 Record name "ATOM " 1622 * 7 - 11 Integer serial Atom serial number. 1623 * 13 - 16 Atom name Atom name. 1624 * 17 Character altLoc Alternate location indicator. 1625 * 18 - 20 Residue name resName Residue name. 1626 * 22 Character chainID Chain identifier. 1627 * 23 - 26 Integer resSeq Residue sequence number. 1628 * 27 AChar iCode Code for insertion of residues. 1629 * 31 - 38 Real(8.3) x Orthogonal coordinates for X in Angstroms. 1630 * 39 - 46 Real(8.3) y Orthogonal coordinates for Y in Angstroms. 1631 * 47 - 54 Real(8.3) z Orthogonal coordinates for Z in Angstroms. 1632 * 55 - 60 Real(6.2) occupancy Occupancy. 1633 * 61 - 66 Real(6.2) tempFactor Temperature factor. 1634 * 73 - 76 LString(4) segID Segment identifier, left-justified. 1635 * 77 - 78 LString(2) element Element symbol, right-justified. 
* 79 - 80        LString(2)      charge        Charge on the atom.
     * </pre>
     *
     * <p>Outline of what this method does with one ATOM/HETATM line:
     * <ol>
     * <li>returns immediately if only the header is to be parsed;</li>
     * <li>selects the chain for the chain identifier in column 22, creating it or re-using
     *     one already present in the current model, and closes the previous chain by adding
     *     the current group to it;</li>
     * <li>starts a new group when there is none yet, when a new chain starts or when the
     *     residue number (including insertion code) changes; otherwise, for a non-blank
     *     alternate location indicator, selects or creates the matching altLoc group;</li>
     * <li>applies the atom-count limits (switch to the CA-only representation, maximum
     *     number of atoms) and the CA-only / accepted-atom-name filters;</li>
     * <li>builds the atom (serial, name, altLoc, coordinates, occupancy, temperature
     *     factor, element) and adds it to the altLoc group or to the current group.</li>
     * </ol>
     *
     * @param line the complete ATOM or HETATM record line
     */
    private void pdb_ATOM_Handler(String line) {
        // build up chains first.
        // headerOnly just goes down to chain resolution.

        if ( params.isHeaderOnly())
            return;

        boolean startOfNewChain = false;

        // column 22: chain identifier
        String chain_id = line.substring(21,22);

        if (current_chain == null) {
            // very first atom of the model: open the first chain
            current_chain = new ChainImpl();
            current_chain.setChainID(chain_id);
            startOfNewChain = true;
            current_model.add(current_chain);
        }


        if ( ! chain_id.equals(current_chain.getChainID()) ) {

            startOfNewChain = true;

            // end up old chain...
            current_chain.addGroup(current_group);

            // see if old chain is known ...
            Chain testchain ;
            testchain = isKnownChain(current_chain.getChainID(),current_model);

            if ( testchain != null && testchain.getChainID().equals(chain_id)){
                // re-using the known chain

            } else {

                // look for a chain with the new identifier in the current model
                testchain = isKnownChain(chain_id,current_model);
            }

            if ( testchain == null) {
                // unknown chain: create a new one

                current_chain = new ChainImpl();
                current_chain.setChainID(chain_id);

            } else {
                current_chain = testchain;
            }

            if ( ! current_model.contains(current_chain))
                current_model.add(current_chain);


        }

        // process group data:
        // join residue numbers and insertion codes together
        String recordName = line.substring (0, 6).trim ();

        String groupCode3 = line.substring(17,20).trim();
        // pdbCode is the old way of doing things...it's a concatenation
        // of resNum and iCode which are now defined explicitly
        String resNum = line.substring(22,26).trim();
        Character iCode = line.substring(26,27).charAt(0);
        // a blank insertion code column means "no insertion code"
        if ( iCode == ' ')
            iCode = null;
        // NOTE(review): Integer.valueOf throws NumberFormatException for a non-numeric residue
        // number; it is not caught in this method - confirm the caller deals with it
        ResidueNumber residueNumber = new ResidueNumber(chain_id, Integer.valueOf(resNum), iCode);

        //recordName      groupCode3
        //|                |    resNum
        //|                |    |   iCode
        //|     |          | |  |   ||
        //ATOM      1  N   ASP A  15     110.964  24.941  59.191  1.00 83.44           N
        //ATOM   1964  N   ARG H 221A      5.963 -16.715  27.669  1.00 28.59           N

        Character aminoCode1 = null;

        if ( recordName.equals("ATOM") ){
            aminoCode1 = StructureTools.get1LetterCode(groupCode3);
        } else {
            // HETATOM RECORDS are treated slightly differently
            // some modified amino acids that we want to treat as amino acids
            // can be found as HETATOM records
            aminoCode1 = StructureTools.get1LetterCode(groupCode3);
            if ( aminoCode1 != null)
                if ( aminoCode1.equals(StructureTools.UNKNOWN_GROUP_LABEL))
                    aminoCode1 = null;
        }

        if (current_group == null) {

            // first group of the parse
            current_group = getNewGroup(recordName,aminoCode1,groupCode3);

            //if ((current_group instanceof AminoAcidImpl) && groupCode3.length()!=3) {
            //	throw new PDBParseException("amino acid name is not of length 3! (" + groupCode3 +")");
            //}
            current_group.setPDBName(groupCode3);
            current_group.setResidueNumber(residueNumber);
            addTohetGroupsDecider(current_group);
        }


        if ( startOfNewChain) {

            // the first atom of a chain always starts a new group
            current_group = getNewGroup(recordName,aminoCode1,groupCode3);

            //if ((current_group instanceof AminoAcidImpl) && groupCode3.length()!=3) {
            //	throw new PDBParseException("amino acid name is not of length 3! (" + groupCode3 +")");
            //}
            current_group.setPDBName(groupCode3);
            current_group.setResidueNumber(residueNumber);
            addTohetGroupsDecider(current_group);
        }


        // column 17: alternate location indicator
        Character altLoc = new Character(line.substring (16, 17).charAt(0));
        Group altGroup = null;

        // check if residue number is the same ...
        // insertion code is part of residue number
        if ( ! residueNumber.equals(current_group.getResidueNumber())) {

            // residue changed: store the finished group and start the next one
            current_chain.addGroup(current_group);
            current_group.trimToSize();

            current_group = getNewGroup(recordName,aminoCode1,groupCode3);

            //if ((current_group instanceof AminoAcidImpl) && groupCode3.length()!=3) {
            //	throw new PDBParseException("amino acid name is not of length 3! (" + groupCode3 +")");
            //}
            current_group.setPDBName(groupCode3);
            current_group.setResidueNumber(residueNumber);
            addTohetGroupsDecider(current_group);

        } else {
            // same residueNumber, but altLocs...

            // test altLoc
            if ( ! altLoc.equals(' ')) {
                // altGroup is still null when this message is logged
                logger.debug("found altLoc! " + current_group + " " + altGroup);
                altGroup = getCorrectAltLocGroup( altLoc,recordName,aminoCode1,groupCode3);
                if ( altGroup.getChain() == null) {
                    // need to set current chain
                    altGroup.setChain(current_chain);
                }

            }
        }

        atomCount++;

        if ( atomCount == my_ATOM_CA_THRESHOLD ) {
            // throw away the SEQRES lines - too much to deal with...
            logger.warn("more than " + my_ATOM_CA_THRESHOLD + " atoms in this structure, ignoring the SEQRES lines");
            seqResChains.clear();

            // from here on only " CA " atoms are kept (see the parseCAonly check below)
            switchCAOnly();

        }



        if ( atomCount == load_max_atoms){
            // NOTE(review): the message below lacks a blank and the closing ')' after the number
            logger.warn("too many atoms (>"+load_max_atoms+"in this protein structure.");
            logger.warn("ignoring lines after: " + line);
            return;
        }
        if ( atomCount > load_max_atoms){
            // limit already reported: silently skip the line
            return;
        }


        //          1         2         3         4         5         6
        //012345678901234567890123456789012345678901234567890123456789
        //ATOM      1  N   MET     1      20.154  29.699   5.276   1.0
        //ATOM    112  CA  ASP   112      41.017  33.527  28.371  1.00  0.00
        //ATOM     53  CA  MET     7      23.772  33.989 -21.600  1.00  0.00           C
        //ATOM    112  CA  ASP   112      37.613  26.621  33.571     0     0


        // columns 13-16: atom name including its padding blanks
        String fullname = line.substring (12, 16);

        // check for CA only if requested
        if ( parseCAonly ){
            // yes , user wants to get CA only
            // only parse CA atoms...
            if (! fullname.equals(" CA ")){
                // skipped atoms do not count towards the atom limits
                atomCount--;
                return;
            }
        }

        if ( params.getAcceptedAtomNames() != null) {

            // keep the atom only if its trimmed name is one of the accepted names
            boolean found = false;
            for (String ok : params.getAcceptedAtomNames()){

                if ( ok.equals(fullname.trim())) {
                    found = true;
                    break;
                }
            }
            if ( ! found) {
                // skipped atoms do not count towards the atom limits
                atomCount--;
                return;
            }
        }
        // create new atom

        int pdbnumber = Integer.parseInt (line.substring (6, 11).trim ());
        AtomImpl atom = new AtomImpl() ;
        atom.setPDBserial(pdbnumber) ;

        atom.setAltLoc(altLoc);
        atom.setName(fullname.trim());

        double x = Double.parseDouble (line.substring (30, 38).trim());
        double y = Double.parseDouble (line.substring (38, 46).trim());
        double z = Double.parseDouble (line.substring (46, 54).trim());

        double[] coords = new double[3];
        coords[0] = x ;
        coords[1] = y ;
        coords[2] = z ;
        atom.setCoords(coords);

        // occupancy defaults to 1.0 when the column is missing or not a number
        float occu = 1.0f;
        if ( line.length() > 59 ) {
            try {
                // occu and tempf are sometimes not used :-/
                occu = Float.parseFloat (line.substring (54, 60).trim());
            } catch (NumberFormatException e){}
        }

        // temperature factor defaults to 0.0 when the column is missing or not a number
        float tempf = 0.0f;
        if ( line.length() > 65) {
            try {
                tempf = Float.parseFloat (line.substring (60, 66).trim());
            } catch (NumberFormatException e){}
        }

        atom.setOccupancy( occu );
        atom.setTempFactor( tempf );




        // Parse element from the element field. If this field is
        // missing (i.e. misformatted PDB file), then parse the
        // name from the atom name.
        // Element.R is kept when the symbol cannot be resolved.
        Element element = Element.R;
        if ( line.length() > 77 ) {
            // parse element from element field
            try {
                element = Element.valueOfIgnoreCase(line.substring (76, 78).trim());
            } catch (IllegalArgumentException e){}
        } else {
            // parse the name from the atom name
            String elementSymbol = null;
            // for atom names with 4 characters, the element is
            // at the first position, example HG23 in Valine
            if (fullname.trim().length() == 4) {
                elementSymbol = fullname.substring(0, 1);
            } else if ( fullname.trim().length() > 1){
                elementSymbol = fullname.substring(0, 2).trim();
            } else {
                // unknown element...
                elementSymbol = "R";
            }

            try {
                element = Element.valueOfIgnoreCase(elementSymbol);
            } catch (IllegalArgumentException e){}
        }
        atom.setElement(element);


        // atoms with an alternate location go to their altLoc group, all others to the current group
        if ( altGroup != null) {
            altGroup.addAtom(atom);
            altGroup = null;
        }
        else {
            current_group.addAtom(atom);
        }


        // make sure that main group has all atoms
        // GitHub issue: #76
        if ( ! current_group.hasAtom(atom.getName())) {
            current_group.addAtom(atom);
        }

    }


    /**
     * Returns the group to which an atom with the given alternate location indicator
     * belongs, creating a new altLoc group of the current group if none matches.
     *
     * <p>Lookup order: the current group itself (if its first atom carries the same
     * indicator), then the existing altLoc groups of the current group (if any of their
     * atoms carries it). If nothing matches and the residue name equals that of the current
     * group, the current group is returned while it is still empty, otherwise an atom-less
     * clone of it is registered as new altLoc group. For a different residue name a new
     * group is created with the residue number of the current group.
     *
     * @param altLoc the alternate location indicator of the atom
     * @param recordName the record name of the line, passed on to getNewGroup
     * @param aminoCode1 the one letter code of the residue (may be null), passed on to getNewGroup
     * @param groupCode3 the residue name of the line
     * @return the current group or one of its altLoc groups
     */
    private Group getCorrectAltLocGroup( Character altLoc,
            String recordName, Character aminoCode1, String groupCode3) {

        // see if we know this altLoc already;
        List<Atom> atoms = current_group.getAtoms();
        if ( atoms.size() > 0) {
            Atom a1 = atoms.get(0);
            // we are just adding atoms to the current group
            // probably there is a second group following later...
            if (a1.getAltLoc().equals(altLoc)) {

                return current_group;
            }
        }

        // search the altLoc groups that were created earlier for this residue
        List<Group> altLocs = current_group.getAltLocs();
        for ( Group altLocG : altLocs ){
            atoms = altLocG.getAtoms();
            if ( atoms.size() > 0) {
                for ( Atom a1 : atoms) {
                    if (a1.getAltLoc().equals( altLoc)) {

                        return altLocG;
                    }
                }
            }
        }

        // no matching altLoc group found.
        // build it up.

        if ( groupCode3.equals(current_group.getPDBName())) {
            if ( current_group.getAtoms().size() == 0) {
                // the current group is still empty: use it directly
                return current_group;
            }
            Group altLocG = (Group) current_group.clone();
            // drop atoms from cloned group...
            // https://redmine.open-bio.org/issues/3307
            altLocG.setAtoms(new ArrayList<Atom>());
            altLocG.getAltLocs().clear();
            current_group.addAltLoc(altLocG);
            return altLocG;
        }

        // the alternate location holds a different residue type: create a group of that type
        Group altLocG = getNewGroup(recordName,aminoCode1,groupCode3);


        altLocG.setPDBName(groupCode3);

        altLocG.setResidueNumber(current_group.getResidueNumber());
        current_group.addAltLoc(altLocG);
        return altLocG;
    }

    /**
     * Switches the parser to the CA-only mode: sets the parseCAonly flag and replaces the
     * current model, all models already stored in the structure and the current chain by
     * the result of CAConverter.getRepresentativeAtomsOnly.
     */
    private void switchCAOnly(){
        parseCAonly = true;


        current_model = CAConverter.getRepresentativeAtomsOnly(current_model);

        for ( int i =0; i< structure.nrModels() ; i++){
            //  iterate over all known models ...
            List<Chain> model = structure.getModel(i);
            model = CAConverter.getRepresentativeAtomsOnly(model);
            structure.setModel(i,model);
        }

        current_chain = CAConverter.getRepresentativeAtomsOnly(current_chain);

    }


    /**
     * Saves repeating a few lines: reads one integer field of a CONECT record.
     *
     * @param line the CONECT record line
     * @param start start index of the field (0-based, inclusive)
     * @param end end index of the field (exclusive)
     * @return the number found in the field, or null if the line is shorter than
     *         <code>end</code> or the field is blank
     * @throws NumberFormatException if the field is neither blank nor an integer
     */
    private Integer conect_helper (String line,int start,int end) {
        // the field lies (partly) beyond the end of the line
        if (line.length() < end) return null;

        String sbond = line.substring(start,end).trim();
        int bond = -1 ;
        Integer b = null ;

        if ( ! sbond.equals("")) {
            bond = Integer.parseInt(sbond);
            b = new Integer(bond);
        }

        return b ;
    }

    /**
     * Handler for
     * CONECT Record Format
     * <pre>
     * COLUMNS         DATA TYPE        FIELD           DEFINITION
     * ---------------------------------------------------------------------------------
     *  1 -  6         Record name      "CONECT"
     *  7 - 11         Integer          serial          Atom serial number
     * 12 - 16         Integer          serial          Serial number of bonded atom
     * 17 - 21         Integer          serial          Serial number of bonded atom
     * 22 - 26         Integer          serial          Serial number of bonded atom
     * 27 - 31         Integer          serial          Serial number of bonded atom
     * 32 - 36         Integer          serial          Serial number of hydrogen bonded
     *                                                  atom
     * 37 - 41         Integer          serial          Serial number of hydrogen bonded
     *                                                  atom
     * 42 - 46         Integer          serial          Serial number of salt bridged
     *                                                  atom
     * 47 - 51         Integer          serial          Serial number of hydrogen bonded
     *                                                  atom
     * 52 - 56         Integer          serial          Serial number of hydrogen bonded
     *                                                  atom
     * 57 - 61         Integer          serial          Serial number of salt bridged
     *                                                  atom
     * </pre>
     */
    private void pdb_CONECT_Handler(String line) {
        //System.out.println(line);
        // this try .. catch is e.g. to catch 1gte which has wrongly formatted lines...
2067 if ( atomOverflow) { 2068 return ; 2069 } 2070 if (params.isHeaderOnly()) { 2071 return; 2072 } 2073 try { 2074 int atomserial = Integer.parseInt (line.substring(6 ,11).trim()); 2075 Integer bond1 = conect_helper(line,11,16); 2076 Integer bond2 = conect_helper(line,16,21); 2077 Integer bond3 = conect_helper(line,21,26); 2078 Integer bond4 = conect_helper(line,26,31); 2079 Integer hyd1 = conect_helper(line,31,36); 2080 Integer hyd2 = conect_helper(line,36,41); 2081 Integer salt1 = conect_helper(line,41,46); 2082 Integer hyd3 = conect_helper(line,46,51); 2083 Integer hyd4 = conect_helper(line,51,56); 2084 Integer salt2 = conect_helper(line,56,61); 2085 2086 //System.out.println(atomserial+ " "+ bond1 +" "+bond2+ " " +bond3+" "+bond4+" "+ 2087 // hyd1+" "+hyd2 +" "+salt1+" "+hyd3+" "+hyd4+" "+salt2); 2088 HashMap<String, Integer> cons = new HashMap<String, Integer>(); 2089 cons.put("atomserial",new Integer(atomserial)); 2090 2091 if ( bond1 != null) cons.put("bond1",bond1); 2092 if ( bond2 != null) cons.put("bond2",bond2); 2093 if ( bond3 != null) cons.put("bond3",bond3); 2094 if ( bond4 != null) cons.put("bond4",bond4); 2095 if ( hyd1 != null) cons.put("hydrogen1",hyd1); 2096 if ( hyd2 != null) cons.put("hydrogen2",hyd2); 2097 if ( salt1 != null) cons.put("salt1",salt1); 2098 if ( hyd3 != null) cons.put("hydrogen3",hyd3); 2099 if ( hyd4 != null) cons.put("hydrogen4",hyd4); 2100 if ( salt2 != null) cons.put("salt2",salt2); 2101 2102 connects.add(cons); 2103 } catch (NumberFormatException e){ 2104 logger.info("could not parse CONECT line correctly ("+e.getMessage()+"), at line : " + line); 2105 return; 2106 } 2107 } 2108 2109 /** 2110 Handler for 2111 MODEL Record Format 2112 2113 COLUMNS DATA TYPE FIELD DEFINITION 2114 ---------------------------------------------------------------------- 2115 1 - 6 Record name "MODEL " 2116 11 - 14 Integer serial Model serial number. 
2117 */ 2118 private void pdb_MODEL_Handler(String line) { 2119 2120 if (params.isHeaderOnly()) return; 2121 2122 // check beginning of file ... 2123 if (current_chain != null) { 2124 if (current_group != null) { 2125 current_chain.addGroup(current_group); 2126 current_group.trimToSize(); 2127 } 2128 2129 Chain ch = isKnownChain(current_chain.getChainID(),current_model) ; 2130 if ( ch == null ) { 2131 current_model.add(current_chain); 2132 } 2133 2134 structure.addModel(current_model); 2135 current_model = new ArrayList<Chain>(); 2136 current_chain = null; 2137 current_group = null; 2138 } 2139 2140 } 2141 2142 2143 /** 2144 * COLUMNS DATA TYPE FIELD DEFINITION 2145 * ---------------------------------------------------------------- 2146 * 1 - 6 Record name "DBREF " 2147 * 8 - 11 IDcode idCode ID code of this entry. 2148 * 13 Character chainID Chain identifier. 2149 * 15 - 18 Integer seqBegin Initial sequence number 2150 * of the PDB sequence segment. 2151 * 19 AChar insertBegin Initial insertion code 2152 * of the PDB sequence segment. 2153 * 21 - 24 Integer seqEnd Ending sequence number 2154 * of the PDB sequence segment. 2155 * 25 AChar insertEnd Ending insertion code 2156 * of the PDB sequence segment. 2157 * 27 - 32 LString database Sequence database name. 2158 * 34 - 41 LString dbAccession Sequence database accession code. 2159 * 43 - 54 LString dbIdCode Sequence database 2160 * identification code. 2161 * 56 - 60 Integer dbseqBegin Initial sequence number of the 2162 * database seqment. 2163 * 61 AChar idbnsBeg Insertion code of initial residue 2164 * of the segment, if PDB is the 2165 * reference. 2166 * 63 - 67 Integer dbseqEnd Ending sequence number of the 2167 * database segment. 2168 * 68 AChar dbinsEnd Insertion code of the ending 2169 * residue of the segment, if PDB is 2170 * the reference. 
2171 */ 2172 private void pdb_DBREF_Handler(String line){ 2173 2174 logger.debug("Parsing DBREF " + line); 2175 2176 DBRef dbref = new DBRef(); 2177 String idCode = line.substring(7,11); 2178 String chainId = line.substring(12,13); 2179 String seqBegin = line.substring(14,18); 2180 String insertBegin = line.substring(18,19); 2181 String seqEnd = line.substring(20,24); 2182 String insertEnd = line.substring(24,25); 2183 String database = line.substring(26,32); 2184 String dbAccession = line.substring(33,41); 2185 String dbIdCode = line.substring(42,54); 2186 String dbseqBegin = line.substring(55,60); 2187 String idbnsBeg = line.substring(60,61); 2188 String dbseqEnd = line.substring(62,67); 2189 // Support implicit space character at end 2190 String dbinsEnd; 2191 if(line.length() >= 68) 2192 dbinsEnd = line.substring(67,68); 2193 else 2194 dbinsEnd = " "; 2195 2196 dbref.setIdCode(idCode); 2197 dbref.setChainId(chainId); 2198 dbref.setSeqBegin(intFromString(seqBegin)); 2199 dbref.setInsertBegin(insertBegin.charAt(0)); 2200 dbref.setSeqEnd(intFromString(seqEnd)); 2201 dbref.setInsertEnd(insertEnd.charAt(0)); 2202 dbref.setDatabase(database.trim()); 2203 dbref.setDbAccession(dbAccession.trim()); 2204 dbref.setDbIdCode(dbIdCode.trim()); 2205 dbref.setDbSeqBegin(intFromString(dbseqBegin)); 2206 dbref.setIdbnsBegin(idbnsBeg.charAt(0)); 2207 dbref.setDbSeqEnd(intFromString(dbseqEnd)); 2208 dbref.setIdbnsEnd(dbinsEnd.charAt(0)); 2209 2210 //System.out.println(dbref.toPDB()); 2211 dbrefs.add(dbref); 2212 } 2213 2214 /* 2215 * For each het group that appears in the entry, the wwPDB checks that the corresponding HET, HETNAM, HETSYN, FORMUL, HETATM, and CONECT records appear, if applicable. The HET record is generated automatically using the Chemical Component Dictionary and information from the HETATM records. 
2216 2217 * Record Format 2218 * 2219 * <pre> 2220 * COLUMNS DATA TYPE FIELD DEFINITION 2221 * --------------------------------------------------------------------------------- 2222 * 1 - 6 Record name "HET " 2223 * 8 - 10 LString(3) hetID Het identifier, right-justified. 2224 * 13 Character ChainID Chain identifier. 2225 * 14 - 17 Integer seqNum Sequence number. 2226 * 18 AChar iCode Insertion code. 2227 * 21 - 25 Integer numHetAtoms Number of HETATM records for the group 2228 * present in the entry. 2229 * 31 - 70 String text Text describing Het group. 2230 * 2231 * Each unique hetID represents a unique molecule. 2232 * 2233 * Relationships to Other Record Types 2234 * 2235 * For each het group that appears in the entry, there must be corresponding HET, HETNAM, HETSYN, FORMUL,HETATM, and CONECT records. LINK records may also be created. 2236 * 2237 * Example 2238 * 2239 * 1 2 3 4 5 6 7 8 2240 * 12345678901234567890123456789012345678901234567890123456789012345678901234567890 2241 * HET TRS 975 8 2242 * 2243 * HET UDP A1457 25 2244 * HET B3P A1458 19 2245 * 2246 * HET NAG Y 3 15 2247 * HET FUC Y 4 10 2248 * HET NON Y 5 12 2249 * HET UNK A 161 1 2250 * </pre> 2251 * 2252 * Heterogen sections are HET, HETNAM, HETSYN, FORMUL 2253 * @see http://www.wwpdb.org/documentation/format32/sect4.html 2254 */ 2255 //private void pdb_HET_handler(String line) { 2256 2257 //} 2258 2259 /** 2260 * Process the disulfide bond info provided by an SSBOND record 2261 * 2262 * 2263 COLUMNS DATA TYPE FIELD DEFINITION 2264 ------------------------------------------------------------------- 2265 1 - 6 Record name "SSBOND" 2266 8 - 10 Integer serNum Serial number. 2267 12 - 14 LString(3) "CYS" Residue name. 2268 16 Character chainID1 Chain identifier. 2269 18 - 21 Integer seqNum1 Residue sequence number. 2270 22 AChar icode1 Insertion code. 2271 26 - 28 LString(3) "CYS" Residue name. 2272 30 Character chainID2 Chain identifier. 2273 32 - 35 Integer seqNum2 Residue sequence number. 
2274 36 AChar icode2 Insertion code. 2275 60 - 65 SymOP sym1 Symmetry oper for 1st resid 2276 67 - 72 SymOP sym2 Symmetry oper for 2nd resid 2277 */ 2278 private void pdb_SSBOND_Handler(String line){ 2279 2280 if (params.isHeaderOnly()) return; 2281 2282 if (line.length()<36) { 2283 logger.info("SSBOND line has length under 36. Ignoring it."); 2284 return; 2285 } 2286 2287 String chain1 = line.substring(15,16); 2288 String seqNum1 = line.substring(17,21).trim(); 2289 String icode1 = line.substring(21,22); 2290 String chain2 = line.substring(29,30); 2291 String seqNum2 = line.substring(31,35).trim(); 2292 String icode2 = line.substring(35,36); 2293 2294 if (line.length()>=72) { 2295 String symop1 = line.substring(59, 65).trim(); 2296 String symop2 = line.substring(66, 72).trim(); 2297 2298 // until we implement proper treatment of symmetry in biojava #220, we can't deal with sym-related parteners properly, skipping them 2299 if (!symop1.equals("") && !symop2.equals("") && // in case the field is missing 2300 (!symop1.equals("1555") || !symop2.equals("1555")) ) { 2301 logger.info("Skipping ss bond between groups {} and {} belonging to different symmetry partners, because it is not supported yet", seqNum1+icode1, seqNum2+icode2); 2302 return; 2303 } 2304 } 2305 2306 if (icode1.equals(" ")) 2307 icode1 = ""; 2308 if (icode2.equals(" ")) 2309 icode2 = ""; 2310 2311 SSBondImpl ssbond = new SSBondImpl(); 2312 2313 ssbond.setChainID1(chain1); 2314 ssbond.setResnum1(seqNum1); 2315 ssbond.setChainID2(chain2); 2316 ssbond.setResnum2(seqNum2); 2317 ssbond.setInsCode1(icode1); 2318 ssbond.setInsCode2(icode2); 2319 ssbonds.add(ssbond); 2320 } 2321 2322 2323 /** 2324 * Takes care of LINK records. These take the format of: 2325 * 2326 * <pre> 2327 * COLUMNS DATA TYPE FIELD DEFINITION 2328 * -------------------------------------------------------------------------------- 2329 * 1 - 6 Record name "LINK " 2330 * 13 - 16 Atom name1 Atom name. 
2331 * 17 Character altLoc1 Alternate location indicator. 2332 * 18 - 20 Residue name resName1 Residue name. 2333 * 22 Character chainID1 Chain identifier. 2334 * 23 - 26 Integer resSeq1 Residue sequence number. 2335 * 27 AChar iCode1 Insertion code. 2336 * 43 - 46 Atom name2 Atom name. 2337 * 47 Character altLoc2 Alternate location indicator. 2338 * 48 - 50 Residue name resName2 Residue name. 2339 * 52 Character chainID2 Chain identifier. 2340 * 53 - 56 Integer resSeq2 Residue sequence number. 2341 * 57 AChar iCode2 Insertion code. 2342 * 60 - 65 SymOP sym1 Symmetry operator for 1st atom. 2343 * 67 - 72 SymOP sym2 Symmetry operator for 2nd atom. 2344 * </pre> 2345 * 2346 * (From http://www.wwpdb.org/documentation/format32/sect6.html#LINK) 2347 * 2348 * @param line the LINK record line to parse. 2349 */ 2350 private void pdb_LINK_Handler(String line) { 2351 2352 if (params.isHeaderOnly()) return; 2353 2354 // Check for the minimal set of fields. 2355 if (line.length()<56) { 2356 logger.info("LINK line has length under 56. Ignoring it."); 2357 return; 2358 } 2359 2360 int len = line.length(); 2361 2362 String name1 = line.substring(12, 16).trim(); 2363 String altLoc1 = line.substring(16, 17).trim(); 2364 String resName1 = line.substring(17, 20).trim(); 2365 String chainID1 = line.substring(21, 22).trim(); 2366 String resSeq1 = line.substring(22, 26).trim(); 2367 String iCode1 = line.substring(26, 27).trim(); 2368 2369 String name2 = line.substring(42, 46).trim(); 2370 String altLoc2 = line.substring(46, 47).trim(); 2371 String resName2 = line.substring(47, 50).trim(); 2372 String chainID2 = line.substring(51, 52).trim(); 2373 String resSeq2 = line.substring(52, 56).trim(); 2374 String iCode2 = null; // Might get trimmed if blank. 
2375 if (len > 56) iCode2 = line.substring(56, 57).trim(); 2376 2377 String sym1 = null; 2378 if (len > 64) sym1 = line.substring(59, 65).trim(); 2379 String sym2 = null; 2380 if (len > 71) sym2 = line.substring(66, 72).trim(); 2381 2382// System.err.println("LINK"); 2383// System.err.println("\tName: " + name1); 2384// System.err.println("\tAlt Loc: " + altLoc1); 2385// System.err.println("\tRes name: " + resName1); 2386// System.err.println("\tChain ID: " + chainID1); 2387// System.err.println("\tRes Seq: " + resSeq1); 2388// System.err.println("\tIns Code: " + iCode1); 2389// System.err.println(name1 + "." + altLoc1 + "." + resName1 + "." + chainID1 + "." + resSeq1 + "." + iCode1); 2390// System.err.println(name2 + "." + altLoc2 + "." + resName2 + "." + chainID2 + "." + resSeq2 + "." + iCode2); 2391// System.err.println(sym1 + "." + sym2); 2392// System.err.println(); 2393 2394 linkRecords.add(new LinkRecord( 2395 name1, altLoc1, resName1, chainID1, resSeq1, iCode1, 2396 name2, altLoc2, resName2, chainID2, resSeq2, iCode2, 2397 sym1, sym2)); 2398 } 2399 2400 /** 2401 * Handler for the SITE records. <br> 2402 * 2403 * <pre> 2404 * 2405 * COLUMNS DATA TYPE FIELD DEFINITION 2406 * --------------------------------------------------------------------------------- 2407 * 1 - 6 Record name "SITE " 2408 * 8 - 10 Integer seqNum Sequence number. 2409 * 12 - 14 LString(3) siteID Site name. 2410 * 16 - 17 Integer numRes Number of residues that compose the siteResidues. 2411 * 19 - 21 Residue name resName1 Residue name for first residue that 2412 * creates the siteResidues. 2413 * 23 Character chainID1 Chain identifier for first residue of siteResidues. 2414 * 24 - 27 Integer seq1 Residue sequence number for first residue 2415 * of the siteResidues. 2416 * 28 AChar iCode1 Insertion code for first residue of the siteResidues. 
2417 * 2418 * example: 2419 * 1 2 3 4 5 6 7 8 2420 * 12345678901234567890123456789012345678901234567890123456789012345678901234567890 2421 * SITE 1 AC1 3 HIS A 94 HIS A 96 HIS A 119 2422 * SITE 1 AC2 5 ASN A 62 GLY A 63 HIS A 64 HOH A 328 2423 * SITE 2 AC2 5 HOH A 634 2424 * SITE 1 AC3 5 GLN A 136 GLN A 137 PRO A 138 GLU A 205 2425 * SITE 2 AC3 5 CYS A 206 2426 * SITE 1 AC4 11 HIS A 64 HIS A 94 HIS A 96 HIS A 119 2427 * SITE 2 AC4 11 LEU A 198 THR A 199 THR A 200 TRP A 209 2428 * SITE 3 AC4 11 HOH A 572 HOH A 582 HOH A 635 2429 * </pre> 2430 * @param line the SITE line record being currently read 2431 * @author Amr AL-Hossary 2432 * @author Jules Jacobsen 2433 */ 2434 private void pdb_SITE_Handler(String line){ 2435 2436 if (params.isHeaderOnly()) return; 2437 2438 // make a map of: SiteId to List<ResidueNumber> 2439 2440 logger.debug("Site Line:"+line); 2441 2442 2443 String siteID = line.substring(11, 14); 2444 //fetch the siteResidues from the map 2445 List<ResidueNumber> siteResidues = siteToResidueMap.get(siteID); 2446 2447 //if the siteResidues doesn't yet exist, make a new one. 2448 if (siteResidues == null |! 
siteToResidueMap.containsKey(siteID.trim())){ 2449 siteResidues = new ArrayList<ResidueNumber>(); 2450 siteToResidueMap.put(siteID.trim(), siteResidues); 2451 2452 logger.debug(String.format("New Site made: %s %s", siteID, siteResidues)); 2453 logger.debug("Now made " + siteMap.size() + " sites"); 2454 2455 } 2456 2457 logger.debug(String.format("SiteId: %s", siteID)); 2458 2459 2460 //line = 'SITE 1 AC1 6 ARG H 221A LYS H 224 HOH H 403 HOH H 460' 2461 //line.substring(18) = 'ARG H 221A LYS H 224 HOH H 403 HOH H 460' 2462 line = line.substring(18); 2463 String groupString = null; 2464 //groupString = 'ARG H 221A' 2465 //keep iterating through chunks of 10 characters - these are the groups in the siteResidues 2466 while (!(groupString = line.substring(0, 10)).equals(" ")) { 2467 //groupstring: 'ARG H 221A' 2468 2469 logger.debug("groupString: '" + groupString + "'"); 2470 2471 //set the residue name 2472 //residueName = 'ARG' 2473 String residueName = groupString.substring(0, 3); 2474 Character aminoCode1 = StructureTools.get1LetterCode(residueName); 2475 if (aminoCode1 != null) { 2476 if (aminoCode1.equals(StructureTools.UNKNOWN_GROUP_LABEL)) { 2477 aminoCode1 = null; 2478 } 2479 } 2480 2481 //this is already in the right format, so no need to fiddle with it... 
2482 //pdbCode = 'H 221A' 2483 // String pdbCode = groupString.substring(4, 10).trim(); 2484 String chainId = groupString.substring(4, 5); 2485 Integer resNum = Integer.valueOf(groupString.substring(5, 9).trim()); 2486 Character insCode = groupString.substring(9, 10).charAt(0); 2487 //set insCode to null as a measure to prevent storing thousands of empty Strings 2488 //- the empty value is returned using Group.getInsCode() 2489 // if (insCode.equals(" ")) { 2490 // insCode = null; 2491 // } 2492 2493 logger.debug(String.format("Site: %s: 'resName:%s resNum:%s insCode:%s'", siteID, residueName, resNum, insCode)); 2494 2495 //make a new resNum with the data - this will be linked up with a site later 2496 ResidueNumber residueNumber = new ResidueNumber(); 2497 2498 2499 logger.debug("pdbCode: '" + resNum + insCode + "'"); 2500 2501 residueNumber.setChainId(chainId); 2502 residueNumber.setSeqNum(resNum); 2503 residueNumber.setInsCode(insCode); 2504 //add the resNum to the groups 2505 siteResidues.add(residueNumber); 2506 2507 logger.debug("Adding residueNumber " + residueNumber + " to site " + siteID); 2508 2509 line = line.substring(11); 2510 } 2511 2512 logger.debug("Current SiteMap (contains "+ siteToResidueMap.keySet().size() + " sites):"); 2513 for (String key : siteToResidueMap.keySet()) { 2514 logger.debug(key + " : " + siteToResidueMap.get(key)); 2515 } 2516 2517 } 2518 2519 //Site variable related to parsing the REMARK 800 records. 
2520 Site site; 2521 private void pdb_REMARK_800_Handler(String line){ 2522 2523 if (params.isHeaderOnly()) return; 2524 2525 // 'REMARK 800 SITE_IDENTIFIER: CAT ' 2526 line = line.substring(11); 2527 String[] fields = line.split(": "); 2528 2529 if (fields.length == 2) { 2530 if (fields[0].equals("SITE_IDENTIFIER")) { 2531 // remark800Counter++; 2532 String siteID = fields[1].trim(); 2533 2534 logger.debug("siteID: '" + siteID +"'"); 2535 2536 //fetch the siteResidues from the map 2537 site = siteMap.get(siteID); 2538 2539 //if the siteResidues doesn't yet exist, make a new one. 2540 if (site == null || !siteID.equals(site.getSiteID())) { 2541 site = new Site(siteID, new ArrayList<Group>()); 2542 siteMap.put(site.getSiteID(), site); 2543 2544 logger.debug("New Site made: " + site); 2545 logger.debug("Now made " + siteMap.size() + " sites"); 2546 2547 } 2548 } 2549 if (fields[0].equals("EVIDENCE_CODE")) { 2550 // remark800Counter++; 2551 String evCode = fields[1].trim(); 2552 2553 logger.debug("evCode: '" + evCode +"'"); 2554 2555 //fetch the siteResidues from the map 2556 site.setEvCode(evCode); 2557 } 2558 if (fields[0].equals("SITE_DESCRIPTION")) { 2559 // remark800Counter++; 2560 String desc = fields[1].trim(); 2561 2562 logger.debug("desc: '" + desc +"'"); 2563 2564 //fetch the siteResidues from the map 2565 site.setDescription(desc); 2566 2567 logger.debug("Finished making REMARK 800 for site " + site.getSiteID()); 2568 logger.debug(site.remark800toPDB()); 2569 2570 } 2571 } 2572 } 2573 2574 private int intFromString(String intString){ 2575 int val = Integer.MIN_VALUE; 2576 try { 2577 val = Integer.parseInt(intString.trim()); 2578 } catch (NumberFormatException ex){ 2579 logger.info("Could not parse a number: " + ex.getMessage()); 2580 } 2581 return val; 2582 } 2583 2584 2585 2586 /** test if the chain is already known (is in current_model 2587 * ArrayList) and if yes, returns the chain 2588 * if no -> returns null 2589 */ 2590 private Chain 
isKnownChain(String chainID, List<Chain> chains){ 2591 2592 for (int i = 0; i< chains.size();i++){ 2593 Chain testchain = chains.get(i); 2594 //System.out.println("comparing chainID >"+chainID+"< against testchain " + i+" >" +testchain.getName()+"<"); 2595 if (chainID.equals(testchain.getChainID())) { 2596 //System.out.println("chain "+ chainID+" already known ..."); 2597 return testchain; 2598 } 2599 } 2600 2601 return null; 2602 } 2603 2604 2605 2606 private BufferedReader getBufferedReader(InputStream inStream) 2607 throws IOException { 2608 2609 BufferedReader buf ; 2610 if (inStream == null) { 2611 throw new IOException ("input stream is null!"); 2612 } 2613 2614 buf = new BufferedReader (new InputStreamReader (inStream)); 2615 return buf ; 2616 2617 } 2618 2619 2620 2621 /** 2622 * Parse a PDB file and return a datastructure implementing 2623 * PDBStructure interface. 2624 * 2625 * @param inStream an InputStream object 2626 * @return a Structure object 2627 * @throws IOException 2628 */ 2629 public Structure parsePDBFile(InputStream inStream) 2630 throws IOException 2631 { 2632 2633 BufferedReader buf = getBufferedReader(inStream); 2634 2635 return parsePDBFile(buf); 2636 2637 } 2638 2639 /** 2640 * Parse a PDB file and return a datastructure implementing 2641 * PDBStructure interface. 2642 * 2643 * @param buf a BufferedReader object 2644 * @return the Structure object 2645 * @throws IOException ... 2646 */ 2647 2648 public Structure parsePDBFile(BufferedReader buf) 2649 throws IOException 2650 { 2651 // set the correct max values for parsing... 
2652 load_max_atoms = params.getMaxAtoms(); 2653 my_ATOM_CA_THRESHOLD = params.getAtomCaThreshold(); 2654 2655 2656 // (re)set structure 2657 2658 structure = new StructureImpl() ; 2659 current_model = new ArrayList<Chain>(); 2660 seqResChains = new ArrayList<Chain>(); 2661 siteMap = new LinkedHashMap<String, Site>(); 2662 current_chain = null ; 2663 current_group = null ; 2664 pdbHeader = new PDBHeader(); 2665 connects = new ArrayList<Map<String,Integer>>(); 2666 previousContinuationField = ""; 2667 continuationField = ""; 2668 continuationString = ""; 2669 current_compound = null; 2670 sourceLines.clear(); 2671 compndLines.clear(); 2672 isLastCompndLine = false; 2673 isLastSourceLine = false; 2674 prevMolId = -1; 2675 compounds.clear(); 2676 helixList.clear(); 2677 strandList.clear(); 2678 turnList.clear(); 2679 lengthCheck = -1; 2680 atomCount = 0; 2681 atomOverflow = false; 2682 linkRecords = new ArrayList<LinkRecord>(); 2683 siteToResidueMap.clear(); 2684 2685 parseCAonly = params.isParseCAOnly(); 2686 2687 String line = null; 2688 2689 while ((line = buf.readLine()) != null) { 2690 2691 // ignore empty lines 2692 if ( line.equals("") || 2693 (line.equals(NEWLINE))){ 2694 continue; 2695 } 2696 2697 2698 // ignore short TER and END lines 2699 if ( (line.startsWith("TER")) || 2700 (line.startsWith("END"))) { 2701 continue; 2702 } 2703 2704 if ( line.length() < 6) { 2705 logger.info("Found line length below 6. 
Ignoring it, line: >" + line +"<" ); 2706 continue; 2707 } 2708 2709 String recordName = line.substring (0, 6).trim (); 2710 2711 try { 2712 if (recordName.equals("ATOM")) 2713 pdb_ATOM_Handler(line); 2714 else if (recordName.equals("SEQRES")) 2715 pdb_SEQRES_Handler(line); 2716 else if (recordName.equals("HETATM")) 2717 pdb_ATOM_Handler(line); 2718 else if (recordName.equals("MODEL")) 2719 pdb_MODEL_Handler(line); 2720 else if (recordName.equals("HEADER")) 2721 pdb_HEADER_Handler(line); 2722 else if (recordName.equals("AUTHOR")) 2723 pdb_AUTHOR_Handler(line); 2724 else if (recordName.equals("TITLE")) 2725 pdb_TITLE_Handler(line); 2726 else if (recordName.equals("SOURCE")) 2727 sourceLines.add(line); //pdb_SOURCE_Handler 2728 else if (recordName.equals("COMPND")) 2729 compndLines.add(line); //pdb_COMPND_Handler 2730 else if (recordName.equals("JRNL")) 2731 pdb_JRNL_Handler(line); 2732 else if (recordName.equals("EXPDTA")) 2733 pdb_EXPDTA_Handler(line); 2734 else if (recordName.equals("CRYST1")) 2735 pdb_CRYST1_Handler(line); 2736 else if (recordName.startsWith("MTRIX")) 2737 pdb_MTRIXn_Handler(line); 2738 else if (recordName.equals("REMARK")) 2739 pdb_REMARK_Handler(line); 2740 else if (recordName.equals("CONECT")) 2741 pdb_CONECT_Handler(line); 2742 else if (recordName.equals("REVDAT")) 2743 pdb_REVDAT_Handler(line); 2744 else if (recordName.equals("DBREF")) 2745 pdb_DBREF_Handler(line); 2746 else if (recordName.equals("SITE")) 2747 pdb_SITE_Handler(line); 2748 else if (recordName.equals("SSBOND")) 2749 pdb_SSBOND_Handler(line); 2750 else if (recordName.equals("LINK")) 2751 pdb_LINK_Handler(line); 2752 else if ( params.isParseSecStruc()) { 2753 if ( recordName.equals("HELIX") ) pdb_HELIX_Handler ( line ) ; 2754 else if (recordName.equals("SHEET")) pdb_SHEET_Handler(line ) ; 2755 else if (recordName.equals("TURN")) pdb_TURN_Handler( line ) ; 2756 } 2757 } catch (StringIndexOutOfBoundsException ex) { 2758 logger.warn("Unable to parse [" + line + "]"); 2759 } 2760 
2761 2762 } 2763 2764 makeCompounds(compndLines, sourceLines); 2765 2766 triggerEndFileChecks(); 2767 2768 if (params.shouldCreateAtomBonds()) { 2769 formBonds(); 2770 } 2771 2772 if ( params.shouldCreateAtomCharges()) { 2773 addCharges(); 2774 } 2775 2776 if ( params.isParseSecStruc() && !params.isHeaderOnly()) 2777 setSecStruc(); 2778 2779 2780 return structure; 2781 2782 } 2783 2784 private void addCharges() { 2785 ChargeAdder.addCharges(structure); 2786 } 2787 2788 /** 2789 * This is the new method for building the COMPND and SOURCE records. Now each method is self-contained. 2790 * @author Jules Jacobsen 2791 * @param compoundList 2792 * @param sourceList 2793 */ 2794 private void makeCompounds(List<String> compoundList, 2795 List<String> sourceList) { 2796 // System.out.println("[makeCompounds] making compounds from compoundLines"); 2797 2798 for (String line : compoundList) { 2799 if (compoundList.indexOf(line) + 1 == compoundList.size()) { 2800 // System.out.println("[makeCompounds] Final line in compoundLines."); 2801 isLastCompndLine = true; 2802 } 2803 pdb_COMPND_Handler(line); 2804 2805 } 2806 // System.out.println("[makeCompounds] adding sources to compounds from sourceLines"); 2807 // since we're starting again from the first compound, reset it here 2808 if ( compounds.size() == 0){ 2809 current_compound = new Compound(); 2810 } else { 2811 current_compound = compounds.get(0); 2812 } 2813 for (String line : sourceList) { 2814 if (sourceList.indexOf(line) + 1 == sourceList.size()) { 2815 // System.out.println("[makeCompounds] Final line in sourceLines."); 2816 isLastSourceLine = true; 2817 } 2818 pdb_SOURCE_Handler(line); 2819 } 2820 2821 } 2822 2823 /** 2824 * Handles creation of all bonds. Looks at LINK records, SSBOND (Disulfide 2825 * bonds), peptide bonds, and intra-residue bonds. 2826 * <p> 2827 * Note: the current implementation only looks at the first model of each 2828 * structure. This may need to be fixed in the future. 
2829 */ 2830 private void formBonds() { 2831 2832 BondMaker maker = new BondMaker(structure, params); 2833 2834 // TODO do we want link records at all? aren't they overlapping with other bonds that we infer (peptide/nucleotide bonds) or get from chemical components (intra-molecule bonds) - JD 2016-03-03 2835 for (LinkRecord linkRecord : linkRecords) { 2836 maker.formLinkRecordBond(linkRecord); 2837 } 2838 2839 maker.formDisulfideBonds(ssbonds); 2840 2841 maker.makeBonds(); 2842 } 2843 2844 2845 2846 private void triggerEndFileChecks(){ 2847 // finish and add ... 2848 2849 Date modDate = pdbHeader.getModDate(); 2850 if ( modDate.equals(new Date(0)) ) { 2851 // modification date = deposition date 2852 Date depositionDate = pdbHeader.getDepDate(); 2853 2854 if (! depositionDate.equals(modDate)){ 2855 // depDate is 0000-00-00 2856 pdbHeader.setDepDate(depositionDate); 2857 } 2858 2859 } 2860 2861 // a problem occurred earlier so current_chain = null ... 2862 // most likely the buffered reader did not provide data ... 2863 if ( current_chain != null ) { 2864 current_chain.addGroup(current_group); 2865 2866 if (isKnownChain(current_chain.getChainID(),current_model) == null) { 2867 current_model.add(current_chain); 2868 } 2869 } 2870 2871 //set the JournalArticle, if there is one 2872 if (!journalLines.isEmpty()) { 2873 buildjournalArticle(); 2874 pdbHeader.setJournalArticle(journalArticle); 2875 } 2876 2877 2878 structure.addModel(current_model); 2879 structure.setPDBHeader(pdbHeader); 2880 structure.setCrystallographicInfo(crystallographicInfo); 2881 2882 // TODO after 4.2 release we should remove setConnections/getConnections and rely only on Atom.getBonds/setBonds - JD 2016-03-03 2883 structure.setConnections(connects); 2884 2885 structure.setDBRefs(dbrefs); 2886 2887 // Only align if requested (default) and not when headerOnly mode with no Atoms. 2888 // Otherwise, we store the empty SeqRes Groups unchanged in the right chains. 
2889 if ( params.isAlignSeqRes() && !params.isHeaderOnly() ){ 2890 logger.debug("Parsing mode align_seqres, will parse SEQRES and align to ATOM sequence"); 2891 SeqRes2AtomAligner aligner = new SeqRes2AtomAligner(); 2892 aligner.align(structure,seqResChains); 2893 2894 } else { 2895 logger.debug("Parsing mode unalign_seqres, will parse SEQRES but not align it to ATOM sequence"); 2896 SeqRes2AtomAligner.storeUnAlignedSeqRes(structure, seqResChains, params.isHeaderOnly()); 2897 } 2898 2899 2900 linkChains2Compound(structure); 2901 structure.setCompounds(compounds); 2902 2903 //associate the temporary Groups in the siteMap to the ones 2904 2905 if (!params.isHeaderOnly()) { 2906 // Only can link SITES if Atom Groups were parsed. 2907 linkSitesToGroups(); // will work now that setSites is called 2908 } 2909 2910 if ( bioAssemblyParser != null){ 2911 pdbHeader.setBioAssemblies(bioAssemblyParser.getTransformationMap()); 2912 //System.out.println("setting nr bioAssemblies: " + pdbHeader.getNrBioAssemblies()); 2913 //System.out.println(pdbHeader.getBioUnitTranformationMap().keySet()); 2914 } 2915 2916 if (ncsOperators !=null && ncsOperators.size()>0) { 2917 crystallographicInfo.setNcsOperators( 2918 ncsOperators.toArray(new Matrix4d[ncsOperators.size()])); 2919 } 2920 2921 2922 // rfree end file check 2923 // Rfree annotation is not very consistent in PDB format, it varies depending on the software 2924 // Here we follow this strategy: 2925 // a) take the '(NO CUTOFF)' value if the only one available (shelx software, e.g. 1x7q) 2926 // b) don't take it if also a line without '(NO CUTOFF)' is present (CNX software, e.g. 
3lak) 2927 2928 if (rfreeNoCutoffLine>0 && rfreeStandardLine<0) { 2929 pdbHeader.setRfree(rfreeNoCutoffLine); 2930 } else if (rfreeNoCutoffLine>0 && rfreeStandardLine>0) { 2931 pdbHeader.setRfree(rfreeStandardLine); 2932 } else if (rfreeNoCutoffLine<0 && rfreeStandardLine>0) { 2933 pdbHeader.setRfree(rfreeStandardLine); 2934 } // otherwise it remains default value: PDBHeader.DEFAULT_RFREE 2935 2936 2937 // to make sure we have Compounds linked to chains, we call getCompounds() which will lazily initialise the 2938 // compounds using heuristics (see CompoundFinder) in the case that they were not explicitly present in the file 2939 structure.getCompounds(); 2940 } 2941 2942 private void setSecStruc(){ 2943 2944 setSecElement(helixList, SecStrucInfo.PDB_AUTHOR_ASSIGNMENT, 2945 SecStrucType.helix4); 2946 setSecElement(strandList, SecStrucInfo.PDB_AUTHOR_ASSIGNMENT, 2947 SecStrucType.extended); 2948 setSecElement(turnList, SecStrucInfo.PDB_AUTHOR_ASSIGNMENT, 2949 SecStrucType.turn); 2950 2951 //Now insert random coil to the Groups that did not have SS information 2952 GroupIterator gi = new GroupIterator(structure); 2953 while (gi.hasNext()){ 2954 Group g = gi.next(); 2955 if (g.hasAminoAtoms()){ 2956 if (g.getProperty(Group.SEC_STRUC) == null){ 2957 SecStrucInfo ss = new SecStrucInfo(g, 2958 SecStrucInfo.PDB_AUTHOR_ASSIGNMENT, 2959 SecStrucType.coil); 2960 g.setProperty(Group.SEC_STRUC, ss); 2961 } 2962 } 2963 } 2964 2965 } 2966 2967 private void setSecElement(List<Map<String,String>> secList, String assignment, SecStrucType type){ 2968 2969 2970 Iterator<Map<String,String>> iter = secList.iterator(); 2971 nextElement: 2972 while (iter.hasNext()){ 2973 Map<String,String> m = iter.next(); 2974 2975 // assign all residues in this range to this secondary structure type 2976 // String initResName = (String)m.get("initResName"); 2977 String initChainId = m.get("initChainId"); 2978 String initSeqNum = m.get("initSeqNum" ); 2979 String initICode = m.get("initICode" ); 2980 // 
String endResName = (String)m.get("endResName" ); 2981 String endChainId = m.get("endChainId" ); 2982 String endSeqNum = m.get("endSeqNum"); 2983 String endICode = m.get("endICode"); 2984 2985 if (initICode.equals(" ")) 2986 initICode = ""; 2987 if (endICode.equals(" ")) 2988 endICode = ""; 2989 2990 GroupIterator gi = new GroupIterator(structure); 2991 boolean inRange = false; 2992 while (gi.hasNext()){ 2993 Group g = gi.next(); 2994 Chain c = g.getChain(); 2995 2996 if (c.getChainID().equals(initChainId)){ 2997 2998 String pdbCode = initSeqNum + initICode; 2999 if ( g.getResidueNumber().toString().equals(pdbCode) ) { 3000 inRange = true; 3001 } 3002 } 3003 if ( inRange){ 3004 if (g.hasAminoAtoms()) { 3005 SecStrucInfo ss = new SecStrucInfo(g, assignment, type); 3006 g.setProperty(Group.SEC_STRUC, ss); 3007 } 3008 3009 } 3010 if ( c.getChainID().equals(endChainId)){ 3011 String pdbCode = endSeqNum + endICode; 3012 if (pdbCode.equals(g.getResidueNumber().toString())){ 3013 inRange = false; 3014 continue nextElement; 3015 } 3016 } 3017 } 3018 } 3019 } 3020 3021 3022 /** After the parsing of a PDB file the {@link Chain} and {@link Compound} 3023 * objects need to be linked to each other. 3024 * 3025 * @param s the structure 3026 */ 3027 public void linkChains2Compound(Structure s){ 3028 3029 3030 for(Compound comp : compounds){ 3031 List<Chain> chains = new ArrayList<Chain>(); 3032 List<String> chainIds = compoundMolIds2chainIds.get(comp.getMolId()); 3033 if ( chainIds == null) 3034 continue; 3035 for ( String chainId : chainIds) { 3036 if ( chainId.equals("NULL")) 3037 chainId = " "; 3038 try { 3039 3040 Chain c = s.findChain(chainId); 3041 chains.add(c); 3042 3043 } catch (StructureException e){ 3044 // usually if this happens something is wrong with the PDB header 3045 // e.g. 2brd - there is no Chain A, although it is specified in the header 3046 // Some bona-fide cases exist, e.g. 
2ja5, chain N is described in SEQRES 3047 // but the authors didn't observe in the density so it's completely missing 3048 // from the ATOM lines 3049 logger.warn("Could not find chain {} to link to compound (entity) {}. The chain will be missing in the compound.", chainId, comp.getMolId()); 3050 } 3051 } 3052 comp.setChains(chains); 3053 } 3054 3055 if ( compounds.size() == 1) { 3056 Compound comp = compounds.get(0); 3057 if ( compoundMolIds2chainIds.get(comp.getMolId()) == null){ 3058 List<Chain> chains = s.getChains(0); 3059 if ( chains.size() == 1) { 3060 // this is an old style PDB file - add the ChainI 3061 Chain ch = chains.get(0); 3062 comp.addChain(ch); 3063 } 3064 } 3065 } 3066 3067 for (Compound comp: compounds){ 3068 if ( compoundMolIds2chainIds.get(comp.getMolId()) == null) { 3069 // could not link to chain 3070 // TODO: should this be allowed to happen? 3071 continue; 3072 } 3073 for ( String chainId : compoundMolIds2chainIds.get(comp.getMolId())){ 3074 if ( chainId.equals("NULL")) 3075 continue; 3076 try { 3077 Chain c = s.getChainByPDB(chainId); 3078 c.setCompound(comp); 3079 } catch (StructureException e){ 3080 logger.warn("Chain {} was not found, can't assign a compound (entity) to it.",chainId); 3081 } 3082 } 3083 } 3084 3085 // in rare cases where a purely non-polymer or purely water chain is present we have missed it above 3086 // we need now to assign a new compound to it so that at least the structure is consistent 3087 // see https://github.com/biojava/biojava/pull/394 3088 3089 if (compounds!=null && !compounds.isEmpty()) { 3090 for (Chain c: s.getChains()) { 3091 if (c.getCompound() == null) { 3092 3093 Compound compound = new Compound(); 3094 compound.addChain(c); 3095 compound.setMolId(findMaxCompoundId(compounds)+1); 3096 c.setCompound(compound); 3097 compounds.add(compound); 3098 3099 logger.warn("No compound (entity) found in file for chain {}. 
Creating new compound {} for it.", c.getChainID(), compound.getMolId()); 3100 } 3101 } 3102 } 3103 } 3104 3105 private static int findMaxCompoundId(List<Compound> compounds) { 3106 3107 return 3108 3109 Collections.max(compounds, new Comparator<Compound>() { 3110 @Override 3111 public int compare(Compound o1, Compound o2) { 3112 return new Integer(o1.getMolId()).compareTo(o2.getMolId()); 3113 } 3114 }).getMolId(); 3115 } 3116 3117 /** 3118 * Links the Sites in the siteMap to the Groups in the Structure via the 3119 * siteToResidueMap ResidueNumber. 3120 * @author Jules Jacobsen 3121 * @return 3122 */ 3123 private void linkSitesToGroups() { 3124 3125 //System.out.println("LINK SITES TO GROUPS:" + siteToResidueMap.keySet().size()); 3126 3127 //link the map of siteIds : <ResidueNumber> with the sites by using ResidueNumber to get the correct group back. 3128 //the return list 3129 3130 if ( siteMap == null || siteToResidueMap == null){ 3131 logger.info("Sites can not be linked to residues!"); 3132 3133 return; 3134 } 3135 3136 List<Site> sites = null; 3137 //check that there are chains with which to associate the groups 3138 if (structure.getChains().isEmpty()) { 3139 sites = new ArrayList<Site>(siteMap.values()); 3140 logger.info("No chains to link Site Groups with - Sites will not be present in the Structure"); 3141 return; 3142 } 3143 3144 //check that the keys in the siteMap and SiteToResidueMap are equal 3145 if (! 
siteMap.keySet().equals(siteToResidueMap.keySet())) { 3146 logger.info("Not all sites have been properly described in the PDB " + pdbId + " header - some Sites will not be present in the Structure"); 3147 logger.debug(siteMap.keySet() + " | " + siteToResidueMap.keySet()); 3148 //return; 3149 } 3150 3151 //so we have chains - associate the siteResidues-related groups with the ones 3152 //already in in the chains 3153 for (String key : siteMap.keySet()) { 3154 Site currentSite = siteMap.get(key); 3155 List<ResidueNumber> linkedGroups = siteToResidueMap.get(key); 3156 if ( linkedGroups == null) 3157 continue; 3158 for (ResidueNumber residueNumber : linkedGroups) { 3159 3160 String pdbCode = residueNumber.toString(); 3161 String chain = residueNumber.getChainId(); 3162 // System.out.println("chain: '" + chain + "'"); 3163 // String resNum = resNum.getSeqNum().toString(); 3164 // System.out.println("resNum: '" + resNum + "'"); 3165 3166 Group linkedGroup = null; 3167 try { 3168 //TODO: implement findGroup(ResidueNumber resNum) 3169 linkedGroup = structure.findGroup(chain, pdbCode); 3170 } catch (StructureException ex) { 3171 logger.info("Can't find group " + pdbCode + " in chain " + chain + " in order to link up SITE records (PDB ID " + pdbId +")"); 3172 continue; 3173 } 3174 3175 // System.out.println("Adding group: " + linkedGroup.getSeqNum() + " to site " + site.getSiteID()); 3176 currentSite.getGroups().add(linkedGroup); 3177 } 3178 } 3179 3180 //System.out.println("SITEMAP: " + siteMap); 3181 3182 sites = new ArrayList<Site>(siteMap.values()); 3183 structure.setSites(sites); 3184 //System.out.println("STRUCTURE SITES: " + structure.getSites().size()); 3185 // for (Site site : structure.getSites()) { 3186 // System.out.println(site); 3187 // } 3188 // System.out.println("Linked Site Groups with Chains"); 3189 3190 } 3191 3192 private void buildjournalArticle() { 3193 3194 logger.debug("building new JournalArticle"); 3195 // for (String line : journalLines) { 3196 // 
System.out.println(line); 3197 // } 3198 3199 this.journalArticle = new JournalArticle(); 3200 // JRNL AUTH M.HAMMEL,G.SFYROERA,D.RICKLIN,P.MAGOTTI, 3201 // JRNL AUTH 2 J.D.LAMBRIS,B.V.GEISBRECHT 3202 // JRNL TITL A STRUCTURAL BASIS FOR COMPLEMENT INHIBITION BY 3203 // JRNL TITL 2 STAPHYLOCOCCUS AUREUS. 3204 // JRNL REF NAT.IMMUNOL. V. 8 430 2007 3205 // JRNL REFN ISSN 1529-2908 3206 // JRNL PMID 17351618 3207 // JRNL DOI 10.1038/NI1450 3208 StringBuffer auth = new StringBuffer(); 3209 StringBuffer titl = new StringBuffer(); 3210 StringBuffer edit = new StringBuffer(); 3211 StringBuffer ref = new StringBuffer(); 3212 StringBuffer publ = new StringBuffer(); 3213 StringBuffer refn = new StringBuffer(); 3214 StringBuffer pmid = new StringBuffer(); 3215 StringBuffer doi = new StringBuffer(); 3216 3217 for (String line : journalLines) { 3218 if ( line.length() < 19 ) { 3219 logger.info("can not process Journal line: " + line); 3220 continue; 3221 } 3222 // System.out.println("'" + line + "'"); 3223 String subField = line.substring(12, 16); 3224 // System.out.println("'" + subField + "'"); 3225 if (subField.equals("AUTH")) { 3226 auth.append(line.substring(19, line.length()).trim()); 3227 3228 logger.debug("AUTH '" + auth.toString() + "'"); 3229 3230 } 3231 if (subField.equals("TITL")) { 3232 //add a space to the end of a line so that when wrapped the 3233 //words on the join won't be concatenated 3234 titl.append(line.substring(19, line.length()).trim()).append(" "); 3235 3236 logger.debug("TITL '" + titl.toString() + "'"); 3237 3238 } 3239 if (subField.equals("EDIT")) { 3240 edit.append(line.substring(19, line.length()).trim()); 3241 3242 logger.debug("EDIT '" + edit.toString() + "'"); 3243 3244 } 3245 // JRNL REF NAT.IMMUNOL. V. 
8 430 2007 3246 if (subField.equals("REF ")) { 3247 ref.append(line.substring(19, line.length()).trim()).append(" "); 3248 3249 logger.debug("REF '" + ref.toString() + "'"); 3250 3251 } 3252 if (subField.equals("PUBL")) { 3253 publ.append(line.substring(19, line.length()).trim()).append(" "); 3254 3255 logger.debug("PUBL '" + publ.toString() + "'"); 3256 3257 } 3258 // JRNL REFN ISSN 1529-2908 3259 if (subField.equals("REFN")) { 3260 if ( line.length() < 35 ) { 3261 logger.info("can not process Journal REFN line: " + line); 3262 continue; 3263 } 3264 refn.append(line.substring(35, line.length()).trim()); 3265 3266 logger.debug("REFN '" + refn.toString() + "'"); 3267 3268 } 3269 // JRNL PMID 17351618 3270 if (subField.equals("PMID")) { 3271 pmid.append(line.substring(19, line.length()).trim()); 3272 3273 logger.debug("PMID '" + pmid.toString() + "'"); 3274 3275 } 3276 // JRNL DOI 10.1038/NI1450 3277 if (subField.equals("DOI ")) { 3278 doi.append(line.substring(19, line.length()).trim()); 3279 3280 logger.debug("DOI '" + doi.toString() + "'"); 3281 3282 } 3283 } 3284 3285 //now set the parts of the JournalArticle 3286 journalArticle.setAuthorList(authorBuilder(auth.toString())); 3287 journalArticle.setEditorList(authorBuilder(edit.toString())); 3288 journalArticle.setRef(ref.toString()); 3289 JournalParser journalParser = new JournalParser(ref.toString()); 3290 journalArticle.setJournalName(journalParser.getJournalName()); 3291 if (!journalArticle.getJournalName().equals("TO BE PUBLISHED")) { 3292 journalArticle.setIsPublished(true); 3293 } 3294 journalArticle.setVolume(journalParser.getVolume()); 3295 journalArticle.setStartPage(journalParser.getStartPage()); 3296 journalArticle.setPublicationDate(journalParser.getPublicationDate()); 3297 journalArticle.setPublisher(publ.toString().trim()); 3298 journalArticle.setTitle(titl.toString().trim()); 3299 journalArticle.setRefn(refn.toString().trim()); 3300 journalArticle.setPmid(pmid.toString().trim()); 3301 
journalArticle.setDoi(doi.toString().trim()); 3302 3303 3304 logger.debug("Made JournalArticle:"); 3305 logger.debug(journalArticle.toString()); 3306 3307 } 3308 3309 //inner class to deal with all the journal info 3310 private class JournalParser { 3311 3312 private String journalName; 3313 private String volume; 3314 private String startPage; 3315 private int publicationDate; 3316 3317 3318 public JournalParser(String ref) { 3319 3320 logger.debug("JournalParser init '" + ref + "'"); 3321 3322 3323 if (ref.equals("TO BE PUBLISHED ")) { 3324 journalName = ref.trim(); 3325 3326 logger.debug(String.format("JournalParser found journalString '%s'", journalName)); 3327 3328 return; 3329 } 3330 3331 if (ref.length() < 48) { 3332 logger.info("REF line too short - must be at least 48 characters to be valid for parsing."); 3333 journalName = ""; 3334 volume = ""; 3335 startPage = ""; 3336 publicationDate = 0; 3337 return; 3338 } 3339 //can be multi line: 3340 //REF PHILOS.TRANS.R.SOC.LONDON, V. 293 53 1981 3341 //REF 2 SER.B 3342 3343 //or 3344 3345 //REF GLYCOGEN PHOSPHORYLASE B: 1 1991 3346 //REF 2 DESCRIPTION OF THE PROTEIN 3347 //REF 3 STRUCTURE 3348 3349 //but usually single line 3350 //REF NUCLEIC ACIDS RES. 2009 3351 //REF MOL.CELL 2009 3352 //REF NAT.STRUCT.MOL.BIOL. V. 16 238 2009 3353 //REF ACTA CRYSTALLOGR.,SECT.F V. 65 199 2009 3354 //check if the date is present at the end of the line. 3355 // 09876543210987654321 3356 //'J.BIOL.CHEM. V. 280 23000 2005 ' 3357 //'J.AM.CHEM.SOC. V. 130 16011 2008 ' 3358 //'NAT.STRUCT.MOL.BIOL. V. 16 238 2009' 3359 String volumeInformation = ref.substring(30, 48); 3360 3361 logger.debug(String.format("Parsing volumeInformation: '%s'", volumeInformation)); 3362 3363 //volumeInformation: 'V. 
293 53 1981 ' 3364 // String dateString = ref.substring(ref.length() - 5 , ref.length() - 1).trim(); 3365 // String startPageString = ref.substring(ref.length() - 11 , ref.length() - 6).trim(); 3366 // String volumeString = ref.substring(ref.length() - 16 , ref.length() - 12).trim(); 3367 // String journalString = ref.substring(0 , ref.length() - 18).trim(); 3368 String dateString = volumeInformation.substring(volumeInformation.length() - 5 , volumeInformation.length() - 1).trim(); 3369 String startPageString = volumeInformation.substring(volumeInformation.length() - 11 , volumeInformation.length() - 6).trim(); 3370 String volumeString = volumeInformation.substring(volumeInformation.length() - 16 , volumeInformation.length() - 12).trim(); 3371 //for the journal string we need to remove the volume information which might be in the middle of the string (e.g. 1gpb, 3pfk) 3372 String journalString = ref.substring(0 , 29).trim() + " " + ref.substring(30, ref.length() - 1).replace(volumeInformation.trim(), "").trim(); 3373 journalString = journalString.trim(); 3374 // System.out.println("journalString: " + journalString); 3375 3376 logger.debug(String.format("JournalParser found volumeString '%s'", volumeString)); 3377 logger.debug(String.format("JournalParser found startPageString '%s'", startPageString)); 3378 logger.debug(String.format("JournalParser found dateString '%s'", dateString)); 3379 logger.debug(String.format("JournalParser found journalString '%s'", journalString)); 3380 3381 3382 if (!dateString.equals(" ")) { 3383 try { 3384 publicationDate = Integer.valueOf(dateString); 3385 } catch (NumberFormatException nfe) { 3386 logger.info(dateString + " is not a valid integer for a date in JRNL sub-section REF line 1"); 3387 } 3388 // if (DEBUG) { 3389 // System.out.println("JournalParser set date " + publicationDate); 3390 // } 3391 } 3392 3393 if (!startPageString.equals(" ")) { 3394 startPage = startPageString; 3395 // if (DEBUG) { 3396 // 
System.out.println("JournalParser set startPage " + startPage); 3397 // } 3398 } 3399 3400 if (!volumeString.equals(" ")) { 3401 volume = volumeString; 3402 // if (DEBUG) { 3403 // System.out.println("JournalParser set volume " + volume); 3404 // } 3405 } 3406 3407 if (!journalString.equals(" ")) { 3408 journalName = journalString; 3409 3410 logger.debug("JournalParser set journalName " + journalName); 3411 3412 } 3413 } 3414 3415 private String getJournalName() { 3416 return journalName; 3417 } 3418 3419 private int getPublicationDate() { 3420 return publicationDate; 3421 } 3422 3423 private String getStartPage() { 3424 return startPage; 3425 } 3426 3427 private String getVolume() { 3428 return volume; 3429 } 3430 } 3431 3432 private List<Author> authorBuilder(String authorString) { 3433 ArrayList<Author> authorList = new ArrayList<Author>(); 3434 3435 if (authorString.equals("")) { 3436 return authorList; 3437 } 3438 3439 String[] authors = authorString.split(","); 3440 // if (DEBUG) { 3441 // for (int i = 0; i < authors.length; i++) { 3442 // String string = authors[i]; 3443 // System.out.println("authorBuilder author: '" + string + "'"); 3444 // } 3445 // } 3446 // AUTH SEATTLE STRUCTURAL GENOMICS CENTER FOR INFECTIOUS 3447 // AUTH 2 DISEASE (SSGCID) 3448 // or 3449 // AUTH E.DOBROVETSKY,A.DONG,A.SEITOVA,B.DUNCAN,L.CROMBET, 3450 // AUTH 2 M.SUNDSTROM,C.H.ARROWSMITH,A.M.EDWARDS,C.BOUNTRA, 3451 // AUTH 3 A.BOCHKAREV,D.COSSAR, 3452 // AUTH 4 STRUCTURAL GENOMICS CONSORTIUM (SGC) 3453 // or 3454 // AUTH T.-C.MOU,S.R.SPRANG,N.MASADA,D.M.F.COOPER 3455 if (authors.length == 1) { 3456 //only one element means it's a consortium only 3457 Author author = new Author(); 3458 author.setSurname(authors[0]); 3459 3460 logger.debug("Set consortium author name " + author.getSurname()); 3461 3462 authorList.add(author); 3463 } else { 3464 for (int i = 0; i < authors.length; i++) { 3465 String authorFullName = authors[i]; 3466 3467 logger.debug("Building author " + 
authorFullName); 3468 3469 Author author = new Author(); 3470 String regex = "\\."; 3471 String[] authorNames = authorFullName.split(regex); 3472 // if (DEBUG) { 3473 // System.out.println("authorNames size " + authorNames.length); 3474 // for (int j = 0; j < authorNames.length; j++) { 3475 // String name = authorNames[j]; 3476 // System.out.println("split authName '" + name + "'"); 3477 // 3478 // } 3479 // } 3480 if (authorNames.length == 0) { 3481 author.setSurname(authorFullName); 3482 3483 logger.debug("Unable to split using '" + regex + "' Setting whole name " + author.getSurname()); 3484 3485 } 3486 //again there might be a consortium name so there may be no elements 3487 else if (authorNames.length == 1) { 3488 author.setSurname(authorNames[0]); 3489 3490 logger.debug("Set consortium author name in multiple author block " + author.getSurname 3491 ()); 3492 3493 } else { 3494 String initials = ""; 3495 for (int j = 0; j < authorNames.length - 1; j++) { 3496 String initial = authorNames[j]; 3497 // if (DEBUG) { 3498 // System.out.println("adding initial '" + initial + "'"); 3499 // } 3500 //build the initials back up again 3501 initials += initial + "."; 3502 } 3503 3504 logger.debug("built initials '" + initials + "'"); 3505 3506 author.setInitials(initials); 3507 //surname is always last 3508 int lastName = authorNames.length - 1; 3509 String surname = authorNames[lastName]; 3510 3511 logger.debug("built author surname " + surname); 3512 3513 author.setSurname(surname); 3514 3515 } 3516 authorList.add(author); 3517 } 3518 } 3519 return authorList; 3520 } 3521 3522 public void setFileParsingParameters(FileParsingParameters params) 3523 { 3524 this.params= params; 3525 3526 // set the correct max values for parsing... 3527 load_max_atoms = params.getMaxAtoms(); 3528 my_ATOM_CA_THRESHOLD = params.getAtomCaThreshold(); 3529 3530 } 3531 3532 public FileParsingParameters getFileParsingParameters(){ 3533 return params; 3534 } 3535 3536 3537}