001/* 002 * 003 * This code may be freely distributed and modified under the 004 * terms of the GNU Lesser General Public Licence. This should 005 * be distributed with the code. If you do not have a copy, 006 * see: 007 * 008 * http://www.gnu.org/copyleft/lesser.html 009 * 010 * Copyright for this code is held jointly by the individual 011 * authors. These should be listed in @author doc comments. 012 * 013 * For more information on the BioJava project and its aims, 014 * or to join the biojava-l mailing list, visit the home page 015 * at: 016 * 017 * http://www.biojava.org/ 018 * 019 * Created on 16.03.2004 020 * 021 */ 022package org.biojava.nbio.structure.io; 023 024import static java.lang.Math.min; 025 026import java.io.BufferedReader; 027import java.io.IOException; 028import java.io.InputStream; 029import java.io.InputStreamReader; 030import java.text.DateFormat; 031import java.text.ParseException; 032import java.text.SimpleDateFormat; 033import java.util.ArrayList; 034import java.util.Arrays; 035import java.util.Date; 036import java.util.HashMap; 037import java.util.Iterator; 038import java.util.LinkedHashMap; 039import java.util.List; 040import java.util.Locale; 041import java.util.Map; 042import java.util.StringTokenizer; 043import java.util.regex.Matcher; 044import java.util.regex.Pattern; 045 046import javax.vecmath.Matrix4d; 047 048import org.biojava.nbio.structure.AminoAcid; 049import org.biojava.nbio.structure.AminoAcidImpl; 050import org.biojava.nbio.structure.Atom; 051import org.biojava.nbio.structure.AtomImpl; 052import org.biojava.nbio.structure.Author; 053import org.biojava.nbio.structure.Chain; 054import org.biojava.nbio.structure.ChainImpl; 055import org.biojava.nbio.structure.DBRef; 056import org.biojava.nbio.structure.Element; 057import org.biojava.nbio.structure.EntityInfo; 058import org.biojava.nbio.structure.EntityType; 059import org.biojava.nbio.structure.Group; 060import org.biojava.nbio.structure.GroupIterator; 061import 
org.biojava.nbio.structure.HetatomImpl; 062import org.biojava.nbio.structure.JournalArticle; 063import org.biojava.nbio.structure.NucleotideImpl; 064import org.biojava.nbio.structure.PDBCrystallographicInfo; 065import org.biojava.nbio.structure.PDBHeader; 066import org.biojava.nbio.structure.ResidueNumber; 067import org.biojava.nbio.structure.Site; 068import org.biojava.nbio.structure.Structure; 069import org.biojava.nbio.structure.StructureException; 070import org.biojava.nbio.structure.StructureImpl; 071import org.biojava.nbio.structure.StructureTools; 072import org.biojava.nbio.structure.io.mmcif.ChemCompGroupFactory; 073import org.biojava.nbio.structure.io.mmcif.model.ChemCompAtom; 074import org.biojava.nbio.structure.io.util.PDBTemporaryStorageUtils.LinkRecord; 075import org.biojava.nbio.structure.secstruc.SecStrucInfo; 076import org.biojava.nbio.structure.secstruc.SecStrucType; 077import org.biojava.nbio.structure.xtal.CrystalCell; 078import org.biojava.nbio.structure.xtal.SpaceGroup; 079import org.biojava.nbio.structure.xtal.SymoplibParser; 080import org.slf4j.Logger; 081import org.slf4j.LoggerFactory; 082 083 084/** 085 * This class implements the actual PDB file parsing. Do not access it directly, but 086 * via the PDBFileReader class. 087 * 088 * <h2>Parsing</h2> 089 * 090 * During the PDBfile parsing several Flags can be set. See the {@link #setFileParsingParameters(FileParsingParameters)} methods. 091 * 092 * 093 * <p> 094 * To prevent excessive memory usage for large PDB files, there is the ATOM_CA_THRESHOLD. 095 * If more Atoms than this threshold are being parsed in a PDB file, the parser will automatically 096 * switch to a C-alpha only representation. 097 * 098 * <p> 099 * The result of the parsing of the PDB file is a new {@link Structure} object. 
100 * 101 * <p> 102 * For more documentation on how to work with the Structure API please 103 * see <a href="http://biojava.org/wiki/BioJava:CookBook#Protein_Structure" target="_top"> 104 * http://biojava.org/wiki/BioJava:CookBook#Protein_Structure</a> 105 * 106 * 107 * 108 * 109 * <h2>Example</h2> 110 * <p> 111 * Q: How can I get a Structure object from a PDB file? 112 * <p> 113 * A: 114 * <pre> 115 * public {@link Structure} loadStructure(String pathToPDBFile){ 116 * // The PDBFileParser is wrapped by the PDBFileReader 117 * {@link PDBFileReader} pdbreader = new {@link PDBFileReader}(); 118 * 119 * {@link Structure} structure = null; 120 * try{ 121 * structure = pdbreader.getStructure(pathToPDBFile); 122 * System.out.println(structure); 123 * } catch (IOException e) { 124 * e.printStackTrace(); 125 * } 126 * return structure; 127 * } 128 * </pre> 129 * 130 * 131 * @author Andreas Prlic 132 * @author Jules Jacobsen 133 * @author Jose Duarte 134 * @since 1.4 135 */ 136public class PDBFileParser { 137 138 139 140 private static final Logger logger = LoggerFactory.getLogger(PDBFileParser.class); 141 142 // for printing 143 private static final String NEWLINE = System.getProperty("line.separator"); 144 145 146 // required for parsing: 147 private String pdbId; //the actual id of the entry 148 private Structure structure; 149 private List<List<Chain>> allModels; // a temp data structure to keep all models 150 private List<Chain> currentModel; // contains the ATOM records for each model 151 private Chain currentChain; 152 private Group currentGroup; 153 154 private List<Chain> seqResChains; // contains all the chains for the SEQRES records 155 //we're going to work on the assumption that the files are current - 156 //if the pdb_HEADER_Handler detects a legacy format, this will be changed to true. 
//if true then lines will be truncated at 72 characters in certain cases
//(pdb_COMPOUND_handler for example)
private boolean isLegacyFormat = false;

private boolean blankChainIdsPresent = false;

// for re-creating the biological assembly
private PDBBioAssemblyParser bioAssemblyParser = null;

private PDBHeader pdbHeader;
private PDBCrystallographicInfo crystallographicInfo;
private JournalArticle journalArticle;
private List<Map<String, Integer>> connects ;
private List<Map<String,String>> helixList;
private List<Map<String,String>> strandList;
private List<Map<String,String>> turnList;

private int lengthCheck ;

private boolean isLastCompndLine = false;
private boolean isLastSourceLine = false;
private EntityInfo current_compound;
private List<EntityInfo> entities = new ArrayList<EntityInfo>();
private HashMap<Integer,List<String>> compoundMolIds2chainIds = new HashMap<Integer, List<String>>();
private List<String> compndLines = new ArrayList<String>();
private List<String> sourceLines = new ArrayList<String>();
private List<String> journalLines = new ArrayList<String>();
private List<DBRef> dbrefs;
private Map<String, Site> siteMap = new LinkedHashMap<String, Site>();
private Map<String, List<ResidueNumber>> siteToResidueMap = new LinkedHashMap<String, List<ResidueNumber>>();

private List<SSBondImpl> ssbonds = new ArrayList<>();

// for storing LINK until we have all the atoms parsed
private List<LinkRecord> linkRecords;

private Matrix4d currentNcsOp;
private List<Matrix4d> ncsOperators;

// for parsing COMPOUND and SOURCE Header lines
private int prevMolId;
private String previousContinuationField;
private String continuationField;
private String continuationString;

private DateFormat dateFormat;

// for rfree parsing
private float rfreeStandardLine = -1;
private float rfreeNoCutoffLine = -1;

// field names that pdb_COMPND_Handler recognises as the start of a COMPND field
private static final List<String> compndFieldValues = new ArrayList<String>(
        Arrays.asList(
                "MOL_ID:", "MOLECULE:", "CHAIN:", "SYNONYM:",
                "EC:", "FRAGMENT:", "ENGINEERED:", "MUTATION:",
                "BIOLOGICAL_UNIT:", "OTHER_DETAILS:"
                ));


private static final List<String> ignoreCompndFieldValues = new ArrayList<String>(
        Arrays.asList(
                "HETEROGEN:","ENGINEEREED:","FRAGMENT,",
                "MUTANT:","SYNTHETIC:"
                ));
// ENGINEEREED in pdb219d

private static final List<String> sourceFieldValues = new ArrayList<String>(
        Arrays.asList("ENGINEERED:", "MOL_ID:", "SYNTHETIC:", "FRAGMENT:",
                "ORGANISM_SCIENTIFIC:", "ORGANISM_COMMON:",
                "ORGANISM_TAXID:","STRAIN:",
                "VARIANT:", "CELL_LINE:", "ATCC:", "ORGAN:", "TISSUE:",
                "CELL:", "ORGANELLE:", "SECRETION:", "GENE:",
                "CELLULAR_LOCATION:", "EXPRESSION_SYSTEM:",
                "EXPRESSION_SYSTEM_TAXID:",
                "EXPRESSION_SYSTEM_STRAIN:", "EXPRESSION_SYSTEM_VARIANT:",
                "EXPRESSION_SYSTEM_CELL_LINE:",
                "EXPRESSION_SYSTEM_ATCC_NUMBER:",
                "EXPRESSION_SYSTEM_ORGAN:", "EXPRESSION_SYSTEM_TISSUE:",
                "EXPRESSION_SYSTEM_CELL:", "EXPRESSION_SYSTEM_ORGANELLE:",
                "EXPRESSION_SYSTEM_CELLULAR_LOCATION:",
                "EXPRESSION_SYSTEM_VECTOR_TYPE:",
                "EXPRESSION_SYSTEM_VECTOR:", "EXPRESSION_SYSTEM_PLASMID:",
                "EXPRESSION_SYSTEM_GENE:", "OTHER_DETAILS:"));

private int atomCount;

// parsing options:

private int atomCAThreshold ;

private int loadMaxAtoms;

private boolean atomOverflow;

/** flag to tell parser to only read Calpha coordinates **/
private boolean parseCAonly;


private FileParsingParameters params;

private boolean startOfMolecule;
private boolean startOfModel;

/**
 * Creates a parser with a fresh {@link FileParsingParameters} instance and
 * empty temporary parsing state (models, header, secondary structure lists,
 * DBREFs, LINK records).
 */
public PDBFileParser() {
    params = new FileParsingParameters();

    allModels = new ArrayList<>();
    structure = null ;
    currentModel = null;
    currentChain = null;
    currentGroup = null;
    // we initialise to true since at the beginning of the file we are always starting a new molecule
    startOfMolecule = true;
    startOfModel = true;


    pdbHeader = new PDBHeader();
    crystallographicInfo = new PDBCrystallographicInfo();
    connects = new ArrayList<Map<String,Integer>>() ;


    helixList = new ArrayList<Map<String,String>>();
    strandList = new ArrayList<Map<String,String>>();
    turnList = new ArrayList<Map<String,String>>();
    current_compound = null;
    dbrefs = new ArrayList<DBRef>();
    // NOTE(review): this discards the LinkedHashMap created by the field
    // initializer; presumably siteMap is re-created before SITE records are
    // handled - confirm against the code that reads it.
    siteMap = null;
    dateFormat = new SimpleDateFormat("dd-MMM-yy", Locale.US);
    atomCount = 0;
    atomOverflow = false;
    parseCAonly = false;

    // this SHOULD not be done
    // DONOT:setFileParsingParameters(params);
    // set the correct max values for parsing...
    loadMaxAtoms = params.getMaxAtoms();
    atomCAThreshold = params.getAtomCaThreshold();

    linkRecords = new ArrayList<LinkRecord>();

    blankChainIdsPresent = false;

}

/**
 * Creates a new group for a residue: either a Hetatom, a Nucleotide or an AminoAcid.
 * <p>
 * The chemical component dictionary is asked first; if it returns a group whose
 * chemical component is not empty, that group is used. Otherwise the group type is
 * chosen from the one-letter code and the three-letter name.
 *
 * @param recordName the record name of the line being parsed (not used by this method)
 * @param aminoCode1 the one-letter code of the residue, may be null
 * @param aminoCode3 the three-letter residue name
 * @return the new group, never null
 */
private Group getNewGroup(String recordName,Character aminoCode1, String aminoCode3) {

    Group dictionaryGroup = ChemCompGroupFactory.getGroupFromChemCompDictionary(aminoCode3);
    if (dictionaryGroup != null && !dictionaryGroup.getChemComp().isEmpty()) {
        return dictionaryGroup;
    }

    // equals() instead of ==: aminoCode1 is a boxed Character, so == could end
    // up comparing object references rather than character values
    if (aminoCode1 == null || aminoCode1.equals(StructureTools.UNKNOWN_GROUP_LABEL)) {
        return new HetatomImpl();
    }

    if (StructureTools.isNucleotide(aminoCode3)) {
        // it is a nucleotide
        return new NucleotideImpl();
    }

    AminoAcidImpl aa = new AminoAcidImpl();
    aa.setAminoType(aminoCode1);
    return aa;
}



// Handler methods to deal with PDB file records properly.
331 /** 332 Handler for 333 HEADER Record Format 334 <pre> 335 COLUMNS DATA TYPE FIELD DEFINITION 336 ---------------------------------------------------------------------------------- 337 1 - 6 Record name "HEADER" 338 11 - 50 String(40) classification Classifies the molecule(s) 339 51 - 59 Date depDate Deposition date. This is the date 340 the coordinates were received by 341 the PDB 342 63 - 66 IDcode idCode This identifier is unique within PDB 343 </pre> 344 */ 345 private void pdb_HEADER_Handler(String line) { 346 347 String classification = null; 348 String deposition_date = null; 349 String pdbCode = null; 350 351 int len = line.trim().length(); 352 if(len > 10) { 353 classification = line.substring (10, min(len,50)).trim() ; 354 pdbHeader.setClassification(classification); 355 } 356 if(len > 50) { 357 deposition_date = line.substring (50, min(len,59)).trim() ; 358 try { 359 Date dep = dateFormat.parse(deposition_date); 360 pdbHeader.setDepDate(dep); 361 362 } catch (ParseException e){ 363 logger.info("Could not parse deposition date string '"+deposition_date+"'. Will continue without deposition date"); 364 } 365 } 366 if(len > 62) { 367 pdbCode = line.substring (62, min(len,66)).trim() ; 368 pdbId = pdbCode; 369 370 logger.debug("Parsing entry " + pdbId); 371 372 373 structure.setPDBCode(pdbCode); 374 pdbHeader.setIdCode(pdbCode); 375 } 376 377 //*really* old files (you'll need to hunt to find these as they 378 //should have been remediated) have headers like below. 
Plus the 379 //pdbId at positions 72-76 is present in every line 380 381 //HEADER PROTEINASE INHIBITOR (TRYPSIN) 05-OCT-84 5PTI 5PTI 3 382 //HEADER TRANSFERASE (ACYLTRANSFERASE) 02-SEP-92 1LAC 1LAC 2 383 if (len > 66) { 384 if (pdbId.equals(line.substring (72, 76))){ 385 isLegacyFormat = true; 386 logger.warn(pdbId + " is a LEGACY entry - this will most likely not parse correctly."); 387 } 388 } 389 390 } 391 392 393 /** 394 * Parses the following record: 395 * <pre> 396 * COLUMNS DATA TYPE FIELD DEFINITION 397 * ------------------------------------------------------------------------------------ 398 * 1 - 6 Record name "AUTHOR" 399 * 9 - 10 Continuation continuation Allows concatenation of multiple records. 400 * 11 - 79 List authorList List of the author names, separated 401 * by commas. 402 * 403 * </pre> 404 * @param line 405 */ 406 private void pdb_AUTHOR_Handler(String line) { 407 408 String authors = line.substring(10).trim(); 409 410 String auth = pdbHeader.getAuthors(); 411 if (auth == null){ 412 pdbHeader.setAuthors(authors); 413 } else { 414 auth += authors; 415 pdbHeader.setAuthors(auth); 416 } 417 418 } 419 420 421 422 /** 423 * Parses the following record: 424 * 425 * <pre> 426 * COLUMNS DATA TYPE FIELD DEFINITION 427 * -------------------------------------------------------------------- 428 * 1 - 6 Record name "HELIX " 429 * 8 - 10 Integer serNum Serial number of the helix. 430 * This starts at 1 and increases 431 * incrementally. 432 * 12 - 14 LString(3) helixID Helix identifier. In addition 433 * to a serial number, each helix is 434 * given an alphanumeric character 435 * helix identifier. 436 * 16 - 18 Residue name initResName Name of the initial residue. 437 * 20 Character initChainID Chain identifier for the chain 438 * containing this helix. 439 * 22 - 25 Integer initSeqNum Sequence number of the initial 440 * residue. 441 * 26 AChar initICode Insertion code of the initial 442 * residue. 
443 * 28 - 30 Residue name endResName Name of the terminal residue of 444 * the helix. 445 * 32 Character endChainID Chain identifier for the chain 446 * containing this helix. 447 * 34 - 37 Integer endSeqNum Sequence number of the terminal 448 * residue. 449 * 38 AChar endICode Insertion code of the terminal 450 * residue. 451 * 39 - 40 Integer helixClass Helix class (see below). 452 * 41 - 70 String comment Comment about this helix. 453 * 72 - 76 Integer length Length of this helix. 454 * </pre> 455 */ 456 private void pdb_HELIX_Handler(String line){ 457 458 if (params.isHeaderOnly()) return; 459 460 if (line.length()<38) { 461 logger.info("HELIX line has length under 38. Ignoring it."); 462 return; 463 } 464 465 String initResName = line.substring(15,18).trim(); 466 String initChainId = line.substring(19,20); 467 String initSeqNum = line.substring(21,25).trim(); 468 String initICode = line.substring(25,26); 469 String endResName = line.substring(27,30).trim(); 470 String endChainId = line.substring(31,32); 471 String endSeqNum = line.substring(33,37).trim(); 472 String endICode = line.substring(37,38); 473 474 //System.out.println(initResName + " " + initChainId + " " + initSeqNum + " " + initICode + " " + 475 // endResName + " " + endChainId + " " + endSeqNum + " " + endICode); 476 477 Map<String,String> m = new HashMap<String,String>(); 478 479 m.put("initResName",initResName); 480 m.put("initChainId", initChainId); 481 m.put("initSeqNum", initSeqNum); 482 m.put("initICode", initICode); 483 m.put("endResName", endResName); 484 m.put("endChainId", endChainId); 485 m.put("endSeqNum",endSeqNum); 486 m.put("endICode",endICode); 487 488 helixList.add(m); 489 490 } 491 492 /** 493 * Handler for 494 * <pre> 495 * COLUMNS DATA TYPE FIELD DEFINITION 496 * -------------------------------------------------------------- 497 * 1 - 6 Record name "SHEET " 498 * 8 - 10 Integer strand Strand number which starts at 1 499 * for each strand within a sheet 500 * and increases by 
one. 501 * 12 - 14 LString(3) sheetID Sheet identifier. 502 * 15 - 16 Integer numStrands Number of strands in sheet. 503 * 18 - 20 Residue name initResName Residue name of initial residue. 504 * 22 Character initChainID Chain identifier of initial 505 * residue in strand. 506 * 23 - 26 Integer initSeqNum Sequence number of initial 507 * residue in strand. 508 * 27 AChar initICode Insertion code of initial residue 509 * in strand. 510 * 29 - 31 Residue name endResName Residue name of terminal residue. 511 * 33 Character endChainID Chain identifier of terminal 512 * residue. 513 * 34 - 37 Integer endSeqNum Sequence number of terminal 514 * residue. 515 * 38 AChar endICode Insertion code of terminal 516 * residue. 517 * 39 - 40 Integer sense Sense of strand with respect to 518 * previous strand in the sheet. 0 519 * if first strand, 1 if parallel, 520 * -1 if anti-parallel. 521 * 42 - 45 Atom curAtom Registration. Atom name in 522 * current strand. 523 * 46 - 48 Residue name curResName Registration. Residue name in 524 * current strand. 525 * 50 Character curChainId Registration. Chain identifier in 526 * current strand. 527 * 51 - 54 Integer curResSeq Registration. Residue sequence 528 * number in current strand. 529 * 55 AChar curICode Registration. Insertion code in 530 * current strand. 531 * 57 - 60 Atom prevAtom Registration. Atom name in 532 * previous strand. 533 * 61 - 63 Residue name prevResName Registration. Residue name in 534 * previous strand. 535 * 65 Character prevChainId Registration. Chain identifier in 536 * previous strand. 537 * 66 - 69 Integer prevResSeq Registration. Residue sequence 538 * number in previous strand. 539 * 70 AChar prevICode Registration. Insertion code in 540 * previous strand. 541 * </pre> 542 */ 543 private void pdb_SHEET_Handler( String line){ 544 545 if (params.isHeaderOnly()) return; 546 547 if (line.length()<38) { 548 logger.info("SHEET line has length under 38. 
Ignoring it."); 549 return; 550 } 551 552 String initResName = line.substring(17,20).trim(); 553 String initChainId = line.substring(21,22); 554 String initSeqNum = line.substring(22,26).trim(); 555 String initICode = line.substring(26,27); 556 String endResName = line.substring(28,31).trim(); 557 String endChainId = line.substring(32,33); 558 String endSeqNum = line.substring(33,37).trim(); 559 String endICode = line.substring(37,38); 560 561 //System.out.println(initResName + " " + initChainId + " " + initSeqNum + " " + initICode + " " + 562 // endResName + " " + endChainId + " " + endSeqNum + " " + endICode); 563 564 Map<String,String> m = new HashMap<String,String>(); 565 566 m.put("initResName",initResName); 567 m.put("initChainId", initChainId); 568 m.put("initSeqNum", initSeqNum); 569 m.put("initICode", initICode); 570 m.put("endResName", endResName); 571 m.put("endChainId", endChainId); 572 m.put("endSeqNum",endSeqNum); 573 m.put("endICode",endICode); 574 575 strandList.add(m); 576 } 577 578 579 /** 580 * Handler for TURN lines 581 * <pre> 582 * COLUMNS DATA TYPE FIELD DEFINITION 583 * -------------------------------------------------------------------- 584 * 1 - 6 Record name "TURN " 585 * 8 - 10 Integer seq Turn number; starts with 1 and 586 * increments by one. 587 * 12 - 14 LString(3) turnId Turn identifier 588 * 16 - 18 Residue name initResName Residue name of initial residue in 589 * turn. 590 * 20 Character initChainId Chain identifier for the chain 591 * containing this turn. 592 * 21 - 24 Integer initSeqNum Sequence number of initial residue 593 * in turn. 594 * 25 AChar initICode Insertion code of initial residue 595 * in turn. 596 * 27 - 29 Residue name endResName Residue name of terminal residue 597 * of turn. 598 * 31 Character endChainId Chain identifier for the chain 599 * containing this turn. 600 * 32 - 35 Integer endSeqNum Sequence number of terminal 601 * residue of turn. 
602 * 36 AChar endICode Insertion code of terminal residue 603 * of turn. 604 * 41 - 70 String comment Associated comment. 605 * </pre> 606 * @param line 607 */ 608 private void pdb_TURN_Handler( String line){ 609 610 if (params.isHeaderOnly()) return; 611 612 if (line.length()<36) { 613 logger.info("TURN line has length under 36. Ignoring it."); 614 return; 615 } 616 617 String initResName = line.substring(15,18).trim(); 618 String initChainId = line.substring(19,20); 619 String initSeqNum = line.substring(20,24).trim(); 620 String initICode = line.substring(24,25); 621 String endResName = line.substring(26,29).trim(); 622 String endChainId = line.substring(30,31); 623 String endSeqNum = line.substring(31,35).trim(); 624 String endICode = line.substring(35,36); 625 626 //System.out.println(initResName + " " + initChainId + " " + initSeqNum + " " + initICode + " " + 627 // endResName + " " + endChainId + " " + endSeqNum + " " + endICode); 628 629 Map<String,String> m = new HashMap<String,String>(); 630 631 m.put("initResName",initResName); 632 m.put("initChainId", initChainId); 633 m.put("initSeqNum", initSeqNum); 634 m.put("initICode", initICode); 635 m.put("endResName", endResName); 636 m.put("endChainId", endChainId); 637 m.put("endSeqNum",endSeqNum); 638 m.put("endICode",endICode); 639 640 turnList.add(m); 641 } 642 643 /** 644 * Handler for 645 * REVDAT Record format: 646 * <pre> 647 * 648 * COLUMNS DATA TYPE FIELD DEFINITION 649 * ---------------------------------------------------------------------------------- 650 * 1 - 6 Record name "REVDAT" 651 * 8 - 10 Integer modNum Modification number. 652 * 11 - 12 Continuation continuation Allows concatenation of multiple 653 * records. 654 * 14 - 22 Date modDate Date of modification (or release for 655 * new entries). This is not repeated 656 * on continuation lines. 657 * 24 - 28 String(5) modId Identifies this particular 658 * modification. It links to the 659 * archive used internally by PDB. 
660 * This is not repeated on continuation 661 * lines. 662 * 32 Integer modType An integer identifying the type of 663 * modification. In case of revisions 664 * with more than one possible modType, 665 * the highest value applicable will be 666 * assigned. 667 * 40 - 45 LString(6) record Name of the modified record. 668 * 47 - 52 LString(6) record Name of the modified record. 669 * 54 - 59 LString(6) record Name of the modified record. 670 * 61 - 66 LString(6) record Name of the modified record. 671 * </pre> 672 */ 673 private void pdb_REVDAT_Handler(String line) { 674 675 // keep the first as latest modified date and the last as release date 676 Date modDate = pdbHeader.getModDate(); 677 678 if ( modDate==null || modDate.equals(new Date(0)) ) { 679 680 // modified date is still uninitialized 681 String modificationDate = line.substring (13, 22).trim() ; 682 683 try { 684 Date dep = dateFormat.parse(modificationDate); 685 pdbHeader.setModDate(dep); 686 pdbHeader.setRelDate(dep); 687 } catch (ParseException e){ 688 logger.info("Could not parse revision date string '"+modificationDate+"'. "); 689 } 690 691 } else { 692 693 // set as the release date 694 String releaseDate = line.substring (13, 22).trim() ; 695 696 try { 697 Date dep = dateFormat.parse(releaseDate); 698 pdbHeader.setRelDate(dep); 699 } catch (ParseException e){ 700 logger.info("Could not parse revision date string '"+releaseDate+"'. "); 701 } 702 } 703 } 704 705 /** 706 * Handler for 707 * SEQRES record format 708 * SEQRES records contain the amino acid or nucleic acid sequence of residues in each chain of the macromolecule that was studied. 709 * <p> 710 * Record Format: 711 * <p> 712 * <pre> 713 * COLUMNS DATA TYPE FIELD DEFINITION 714 * --------------------------------------------------------------------------------- 715 * 1 - 6 Record name "SEQRES" 716 * 9 - 10 Integer serNum Serial number of the SEQRES record 717 * for the current chain. Starts at 1 718 * and increments by one each line. 
719 * Reset to 1 for each chain. 720 * 12 Character chainID Chain identifier. This may be any 721 * single legal character, including a 722 * blank which is used if there is 723 * only one chain. 724 * 14 - 17 Integer numRes Number of residues in the chain. 725 * This value is repeated on every 726 * record. 727 * 20 - 22 Residue name resName Residue name. 728 * 24 - 26 Residue name resName Residue name. 729 * 28 - 30 Residue name resName Residue name. 730 * 32 - 34 Residue name resName Residue name. 731 * 36 - 38 Residue name resName Residue name. 732 * 40 - 42 Residue name resName Residue name. 733 * 44 - 46 Residue name resName Residue name. 734 * 48 - 50 Residue name resName Residue name. 735 * 52 - 54 Residue name resName Residue name. 736 * 56 - 58 Residue name resName Residue name. 737 * 60 - 62 Residue name resName Residue name. 738 * 64 - 66 Residue name resName Residue name. 739 * 68 - 70 Residue name resName Residue name. 740 * </pre> 741 * @author Jules Jacobsen 742 */ 743 private void pdb_SEQRES_Handler(String line) { 744 745 /* 746 * 1 2 3 4 5 6 7 747 * 1234567890123456789012345678901234567890123456789012345678901234567890 748 * SEQRES 1 A 376 LYS PRO VAL THR VAL LYS LEU VAL ASP SER GLN ALA THR 749 * SEQRES 1 A 21 GLY ILE VAL GLU GLN CYS CYS THR SER ILE CYS SER LEU 750 * SEQRES 2 A 21 TYR GLN LEU GLU ASN TYR CYS ASN 751 * SEQRES 1 B 30 PHE VAL ASN GLN HIS LEU CYS GLY SER HIS LEU VAL GLU 752 * SEQRES 2 B 30 ALA LEU TYR LEU VAL CYS GLY GLU ARG GLY PHE PHE TYR 753 * SEQRES 3 B 30 THR PRO LYS ALA 754 * SEQRES 1 C 21 GLY ILE VAL GLU GLN CYS CYS THR SER ILE CYS SER LEU 755 * SEQRES 2 C 21 TYR GLN LEU GLU ASN TYR CYS ASN 756 * SEQRES 1 D 30 PHE VAL ASN GLN HIS LEU CYS GLY SER HIS LEU VAL GLU 757 * SEQRES 2 D 30 ALA LEU TYR LEU VAL CYS GLY GLU ARG GLY PHE PHE TYR 758 * SEQRES 3 D 30 THR PRO LYS ALA 759 */ 760 761 String recordName = line.substring(0, 6).trim(); 762 String chainID = line.substring(11, 12); 763 String newLength = line.substring(13,17).trim(); 
764 String subSequence = line.substring(18); 765 766 if ( lengthCheck == -1 ){ 767 lengthCheck = Integer.parseInt(newLength); 768 } 769 770 StringTokenizer subSequenceResidues = new StringTokenizer(subSequence); 771 772 Character aminoCode1 = null; 773 if (! recordName.equals(AminoAcid.SEQRESRECORD)) { 774 // should not have been called 775 return; 776 } 777 778 currentChain = isKnownChain(chainID, seqResChains); 779 if ( currentChain == null) { 780 781 currentChain = new ChainImpl(); 782 currentChain.setId(chainID); 783 currentChain.setName(chainID); 784 785 } 786 787 while (subSequenceResidues.hasMoreTokens()) { 788 789 String threeLetter = subSequenceResidues.nextToken(); 790 791 aminoCode1 = StructureTools.get1LetterCode(threeLetter); 792 793 //if (aminoCode1 == null) { 794 // could be a nucleotide... 795 // but getNewGroup takes care of that and converts ATOM records with aminoCode1 == nnull to nucleotide... 796 //} 797 currentGroup = getNewGroup("ATOM", aminoCode1, threeLetter); 798 799 currentGroup.setPDBName(threeLetter); 800 801 if ( currentGroup instanceof AminoAcid){ 802 AminoAcid aa = (AminoAcid)currentGroup; 803 aa.setRecordType(AminoAcid.SEQRESRECORD); 804 } 805 // add the current resNum to the new chain. 806 currentChain.addGroup(currentGroup); 807 808 } 809 Chain test = isKnownChain(chainID, seqResChains); 810 811 if ( test == null) 812 seqResChains.add(currentChain); 813 814 if (currentGroup != null) 815 currentGroup.trimToSize(); 816 817 currentGroup = null; 818 currentChain = null; 819 820 // the current chain is finished! 
821 //if ( current_chain.getLength() != lengthCheck ){ 822 // System.err.println("the length of chain " + current_chain.getName() + "(" + 823 // current_chain.getLength() + ") does not match the expected " + lengthCheck); 824 //} 825 826 lengthCheck = Integer.parseInt(newLength); 827 828 } 829 830 831 832 /** 833 * Handler for 834 * TITLE Record Format 835 * <pre> 836 COLUMNS DATA TYPE FIELD DEFINITION 837 ---------------------------------------------------------------------------------- 838 1 - 6 Record name "TITLE " 839 9 - 10 Continuation continuation Allows concatenation of multiple 840 records. 841 11 - 70 String title Title of the experiment. 842 * </pre> 843 * 844 */ 845 private void pdb_TITLE_Handler(String line) { 846 String title; 847 if ( line.length() > 79) 848 title = line.substring(10,80).trim(); 849 else 850 title = line.substring(10,line.length()).trim(); 851 852 String t = pdbHeader.getTitle(); 853 if ( (t != null) && (! t.equals("")) ){ 854 if (t.endsWith("-")) 855 t += ""; // if last line ends with a hyphen then we don't add space 856 else 857 t += " "; 858 } 859 else t = ""; 860 861 t += title; 862 863 pdbHeader.setTitle(t); 864 } 865 866 /** 867 * JRNL handler. 868 * The JRNL record contains the primary literature citation that describes the experiment which resulted 869 * in the deposited coordinate set. There is at most one JRNL reference per entry. If there is no primary 870 * reference, then there is no JRNL reference. Other references are given in REMARK 1. 871 * 872 * Record Format 873 * <pre> 874 * COLUMNS DATA TYPE FIELD DEFINITION 875 * ----------------------------------------------------------------------- 876 * 1 - 6 Record name "JRNL " 877 * 878 * 13 - 70 LString text See Details below. 
879 * </pre> 880 */ 881 private void pdb_JRNL_Handler(String line) { 882 //add the strings to the journalLines 883 //the actual JournalArticle is then built when the whole entry is being 884 //finalized with triggerEndFileChecks() 885 //JRNL TITL NMR SOLUTION STRUCTURE OF RECOMBINANT TICK 1TAP 10 886 if (line.substring(line.length() - 8, line.length() - 4).equals(pdbId)) { 887 //trim off the trailing PDB id from legacy files. 888 //are we really trying to still cater for these museum pieces? 889 890 logger.debug("trimming legacy PDB id from end of JRNL section line"); 891 892 line = line.substring(0, line.length() - 8); 893 journalLines.add(line); 894 } else { 895 journalLines.add(line); 896 } 897 } 898 899 /** 900 * This should not be accessed directly, other than by </code>makeCompounds</code>. It still deals with the same 901 * lines in a similar manner but if not accessed from </code>makeCompounds</code> the last element will be 902 * missing. Don't say I didn't warn you. 903 * 904 * @param line 905 */ 906 private void pdb_COMPND_Handler(String line) { 907 908 logger.debug("previousContinuationField is " 909 + previousContinuationField); 910 logger.debug("current continuationField is " 911 + continuationField); 912 logger.debug("current continuationString is " 913 + continuationString); 914 logger.debug("current compound is " 915 + current_compound); 916 917 918 // In legacy PDB files the line ends with the PDB code and a serial number, chop those off! 
919 //format version 3.0 onwards will have 80 characters in a line 920 // if (line.length() > 72) { 921 if (isLegacyFormat) { 922 // if (DEBUG) { 923 // System.out.println("We have a legacy file - truncating line length to 71 characters:"); 924 // System.out.println(line); 925 // } 926 line = line.substring(0, 72); 927 } 928 929 line = line.substring(10, line.length()); 930 931 932 String[] fieldList = line.trim().split("\\s+"); 933 int fl = fieldList.length; 934 if ((fl >0 ) && compndFieldValues.contains(fieldList[0])) { 935 936 continuationField = fieldList[0]; 937 if (previousContinuationField.equals("")) { 938 previousContinuationField = continuationField; 939 } 940 941 } else if (fl>0) { 942 // the ':' character indicates the end of a field name and should be invalid as part the first data token 943 // e.g. obsolete file 1hhb has a malformed COMPND line that can only be caught with this kind of check 944 if (fieldList[0].contains(":") ) { 945 logger.info("COMPND line does not follow the PDB 3.0 format. 
Note that COMPND parsing is not supported any longer in format 2.3 or earlier"); 946 return; 947 } 948 949 } else { 950 951 // the line will be added as data to the previous field 952 } 953 954 line = line.replace(continuationField, "").trim(); 955 956 StringTokenizer compndTokens = new StringTokenizer(line); 957 958 // System.out.println("PDBFileParser.pdb_COMPND_Handler: Tokenizing '" + line + "'"); 959 960 while (compndTokens.hasMoreTokens()) { 961 String token = compndTokens.nextToken(); 962 963 if (previousContinuationField.equals("")) { 964 previousContinuationField = continuationField; 965 } 966 967 if (previousContinuationField.equals(continuationField) 968 && compndFieldValues.contains(continuationField)) { 969 970 logger.debug("Still in field " + continuationField); 971 logger.debug("token = " + token); 972 973 continuationString = continuationString.concat(token + " "); 974 975 logger.debug("continuationString = " 976 + continuationString); 977 978 } 979 if (!continuationField.equals(previousContinuationField)) { 980 981 if (continuationString.equals("")) { 982 continuationString = token; 983 984 } else { 985 986 compndValueSetter(previousContinuationField, 987 continuationString); 988 previousContinuationField = continuationField; 989 continuationString = token + " "; 990 } 991 } else if (ignoreCompndFieldValues.contains(token)) { 992 // this field shall be ignored 993 //continuationField = token; 994 } 995 } 996 if (isLastCompndLine) { 997 // final line in the section - finish off the compound 998 // System.out.println("[pdb_COMPND_Handler] Final COMPND line - Finishing off final MolID header."); 999 compndValueSetter(continuationField, continuationString); 1000 continuationString = ""; 1001 if (current_compound!=null) entities.add(current_compound); 1002 } 1003 } 1004 1005 /** 1006 * Set the value in the current molId object 1007 * @param field 1008 * @param value 1009 */ 1010 private void compndValueSetter(String field, String value) { 1011 1012 
value = value.trim().replace(";", ""); 1013 if (field.equals("MOL_ID:")) { 1014 1015 int i = -1; 1016 try { 1017 i = Integer.valueOf(value); 1018 } catch (NumberFormatException e){ 1019 logger.warn("Value '{}' does not look like a number, while trying to parse COMPND MOL_ID line.",value); 1020 } 1021 if (i>0 && prevMolId!=i) { 1022 1023 if (current_compound!=null) entities.add(current_compound); 1024 1025 logger.debug("Initialising new Compound with mol_id {}", i); 1026 1027 current_compound = new EntityInfo(); 1028 1029 current_compound.setMolId(i); 1030 1031 // we will set polymer for all defined compounds in PDB file (non-polymer compounds are not defined in header) - JD 2016-03-25 1032 current_compound.setType(EntityType.POLYMER); 1033 1034 prevMolId = i; 1035 } 1036 1037 } 1038 1039 // if for some reason (e.g. missing mol_id line) the current_compound is null we can't add anything to it, return 1040 if (current_compound==null) { 1041 return; 1042 } 1043 1044 if (field.equals("MOLECULE:")) { 1045 current_compound.setDescription(value); 1046 1047 } 1048 if (field.equals("CHAIN:")) { 1049 //System.out.println(value); 1050 StringTokenizer chainTokens = new StringTokenizer(value, ","); 1051 List<String> chains = new ArrayList<String>(); 1052 1053 while (chainTokens.hasMoreTokens()) { 1054 String chainID = chainTokens.nextToken().trim(); 1055 // NULL is used in old PDB files to represent empty chain DI 1056 if (chainID.equals("NULL")) 1057 chainID = " "; 1058 chains.add(chainID); 1059 } 1060 compoundMolIds2chainIds.put(current_compound.getMolId(),chains); 1061 1062 } 1063 if (field.equals("SYNONYM:")) { 1064 1065 StringTokenizer synonyms = new StringTokenizer(value, ","); 1066 List<String> names = new ArrayList<String>(); 1067 1068 while (synonyms.hasMoreTokens()) { 1069 names.add(synonyms.nextToken()); 1070 1071 current_compound.setSynonyms(names); 1072 } 1073 1074 } 1075 1076 if (field.equals("EC:")) { 1077 1078 StringTokenizer ecNumTokens = new 
StringTokenizer(value, ","); 1079 List<String> ecNums = new ArrayList<String>(); 1080 1081 while (ecNumTokens.hasMoreTokens()) { 1082 ecNums.add(ecNumTokens.nextToken()); 1083 1084 current_compound.setEcNums(ecNums); 1085 } 1086 1087 } 1088 if (field.equals("FRAGMENT:")) { 1089 1090 current_compound.setFragment(value); 1091 1092 } 1093 if (field.equals("ENGINEERED:")) { 1094 1095 current_compound.setEngineered(value); 1096 1097 } 1098 if (field.equals("MUTATION:")) { 1099 1100 current_compound.setMutation(value); 1101 1102 } 1103 if (field.equals("BIOLOGICAL_UNIT:")) { 1104 1105 current_compound.setBiologicalUnit(value); 1106 1107 } 1108 if (field.equals("OTHER_DETAILS:")) { 1109 1110 current_compound.setDetails(value); 1111 1112 } 1113 1114 } 1115 1116 1117 /** 1118 * Handler for 1119 * SOURCE Record format 1120 * 1121 * The SOURCE record specifies the biological and/or chemical source of each biological molecule in the entry. Sources are described by both the common name and the scientific name, e.g., genus and species. Strain and/or cell-line for immortalized cells are given when they help to uniquely identify the biological entity studied. 1122 * Record Format 1123 * <pre> 1124 * COLUMNS DATA TYPE FIELD DEFINITION 1125 * ------------------------------------------------------------------------------- 1126 * 1 - 6 Record name "SOURCE" 1127 * 9 - 10 Continuation continuation Allows concatenation of multiple records. 1128 * 11 - 70 Specification srcName Identifies the source of the macromolecule in 1129 * list a token: value format. 1130 * </pre> 1131 * @param line the line to be parsed 1132 */ 1133 private void pdb_SOURCE_Handler(String line) { 1134 // works in the same way as the pdb_COMPND_Handler. 
1135 String continuationNr = line.substring(9, 10).trim(); 1136 1137 1138 1139 logger.debug("current continuationNo is " 1140 + continuationNr); 1141 logger.debug("previousContinuationField is " 1142 + previousContinuationField); 1143 logger.debug("current continuationField is " 1144 + continuationField); 1145 logger.debug("current continuationString is " 1146 + continuationString); 1147 logger.debug("current compound is " 1148 + current_compound); 1149 1150 1151 // following the docs, the last valid character should be 79, chop off the rest 1152 if (line.length() > 79) { 1153 line = line.substring(0, 79); 1154 } 1155 1156 line = line.substring(10, line.length()); 1157 1158 logger.debug("LINE: >" + line + "<"); 1159 1160 String[] fieldList = line.split("\\s+"); 1161 1162 if (!fieldList[0].equals("") 1163 && sourceFieldValues.contains(fieldList[0])) { 1164 // System.out.println("[PDBFileParser.pdb_COMPND_Handler] Setting continuationField to '" + fieldList[0] + "'"); 1165 continuationField = fieldList[0]; 1166 if (previousContinuationField.equals("")) { 1167 previousContinuationField = continuationField; 1168 } 1169 1170 } else if ((fieldList.length > 1) && ( sourceFieldValues.contains(fieldList[1]))) { 1171 // System.out.println("[PDBFileParser.pdb_COMPND_Handler] Setting continuationField to '" + fieldList[1] + "'"); 1172 continuationField = fieldList[1]; 1173 if (previousContinuationField.equals("")) { 1174 previousContinuationField = continuationField; 1175 } 1176 1177 } else { 1178 if (continuationNr.equals("")) { 1179 1180 logger.debug("looks like an old PDB file"); 1181 1182 continuationField = "MOLECULE:"; 1183 if (previousContinuationField.equals("")) { 1184 previousContinuationField = continuationField; 1185 } 1186 } 1187 1188 } 1189 1190 line = line.replace(continuationField, "").trim(); 1191 1192 StringTokenizer compndTokens = new StringTokenizer(line); 1193 1194 // System.out.println("PDBFileParser.pdb_COMPND_Handler: Tokenizing '" + line + "'"); 1195 
1196 while (compndTokens.hasMoreTokens()) { 1197 String token = compndTokens.nextToken(); 1198 1199 if (previousContinuationField.equals("")) { 1200 // System.out.println("previousContinuationField is empty. Setting to : " + continuationField); 1201 previousContinuationField = continuationField; 1202 } 1203 1204 if (previousContinuationField.equals(continuationField) 1205 && sourceFieldValues.contains(continuationField)) { 1206 1207 logger.debug("Still in field " + continuationField); 1208 1209 continuationString = continuationString.concat(token + " "); 1210 1211 logger.debug("continuationString = " 1212 + continuationString); 1213 } 1214 if (!continuationField.equals(previousContinuationField)) { 1215 1216 if (continuationString.equals("")) { 1217 continuationString = token; 1218 1219 } else { 1220 1221 sourceValueSetter(previousContinuationField, 1222 continuationString); 1223 previousContinuationField = continuationField; 1224 continuationString = token + " "; 1225 } 1226 } else if (ignoreCompndFieldValues.contains(token)) { 1227 // this field shall be ignored 1228 //continuationField = token; 1229 } 1230 } 1231 if (isLastSourceLine) { 1232 // final line in the section - finish off the compound 1233 // System.out.println("[pdb_SOURCE_Handler] Final SOURCE line - Finishing off final MolID header."); 1234 sourceValueSetter(continuationField, continuationString); 1235 continuationString = ""; 1236 //compounds.add(current_compound); 1237 } 1238 1239 } 1240 1241 1242 /** 1243 * Set the value in the current molId object 1244 * 1245 * @param field 1246 * @param value 1247 */ 1248 private void sourceValueSetter(String field, String value) { 1249 1250 value = value.trim().replace(";", ""); 1251 // System.out.println("[sourceValueSetter] " + field); 1252 if (field.equals("MOL_ID:")) { 1253 1254 try { 1255 current_compound = entities.get(Integer.valueOf(value) - 1); 1256 } catch (NumberFormatException e){ 1257 logger.info("could not process SOURCE MOL_ID record 
correctly:" + e.getMessage()); 1258 return; 1259 } 1260 1261 1262 // System.out.println("[sourceValueSetter] Fetching compound " + value + " " + current_compound.getMolId()); 1263 1264 } 1265 if (field.equals("SYNTHETIC:")) { 1266 current_compound.setSynthetic(value); 1267 } else if (field.equals("FRAGMENT:")) { 1268 current_compound.setFragment(value); 1269 } else if (field.equals("ORGANISM_SCIENTIFIC:")) { 1270 current_compound.setOrganismScientific(value); 1271 } else if (field.equals("ORGANISM_TAXID:")) { 1272 current_compound.setOrganismTaxId(value); 1273 } else if (field.equals("ORGANISM_COMMON:")) { 1274 current_compound.setOrganismCommon(value); 1275 } else if (field.equals("STRAIN:")) { 1276 current_compound.setStrain(value); 1277 } else if (field.equals("VARIANT:")) { 1278 current_compound.setVariant(value); 1279 } else if (field.equals("CELL_LINE:")) { 1280 current_compound.setCellLine(value); 1281 } else if (field.equals("ATCC:")) { 1282 current_compound.setAtcc(value); 1283 } else if (field.equals("ORGAN:")) { 1284 current_compound.setOrgan(value); 1285 } else if (field.equals("TISSUE:")) { 1286 current_compound.setTissue(value); 1287 } else if (field.equals("CELL:")) { 1288 current_compound.setCell(value); 1289 } else if (field.equals("ORGANELLE:")) { 1290 current_compound.setOrganelle(value); 1291 } else if (field.equals("SECRETION:")) { 1292 current_compound.setSecretion(value); 1293 } else if (field.equals("GENE:")) { 1294 current_compound.setGene(value); 1295 } else if (field.equals("CELLULAR_LOCATION:")) { 1296 current_compound.setCellularLocation(value); 1297 } else if (field.equals("EXPRESSION_SYSTEM:")) { 1298 current_compound.setExpressionSystem(value); 1299 } else if (field.equals("EXPRESSION_SYSTEM_TAXID:")) { 1300 current_compound.setExpressionSystemTaxId(value); 1301 } else if (field.equals("EXPRESSION_SYSTEM_STRAIN:")) { 1302 current_compound.setExpressionSystemStrain(value); 1303 } else if (field.equals("EXPRESSION_SYSTEM_VARIANT:")) { 
1304 current_compound.setExpressionSystemVariant(value); 1305 } else if (field.equals("EXPRESSION_SYSTEM_CELL_LINE:")) { 1306 current_compound.setExpressionSystemCellLine(value); 1307 } else if (field.equals("EXPRESSION_SYSTEM_ATCC_NUMBER:")) { 1308 current_compound.setExpressionSystemAtccNumber(value); 1309 } else if (field.equals("EXPRESSION_SYSTEM_ORGAN:")) { 1310 current_compound.setExpressionSystemOrgan(value); 1311 } else if (field.equals("EXPRESSION_SYSTEM_TISSUE:")) { 1312 current_compound.setExpressionSystemTissue(value); 1313 } else if (field.equals("EXPRESSION_SYSTEM_CELL:")) { 1314 current_compound.setExpressionSystemCell(value); 1315 } else if (field.equals("EXPRESSION_SYSTEM_ORGANELLE:")) { 1316 current_compound.setExpressionSystemOrganelle(value); 1317 } else if (field.equals("EXPRESSION_SYSTEM_CELLULAR_LOCATION:")) { 1318 current_compound.setExpressionSystemCellularLocation(value); 1319 } else if (field.equals("EXPRESSION_SYSTEM_VECTOR_TYPE:")) { 1320 current_compound.setExpressionSystemVectorType(value); 1321 } else if (field.equals("EXPRESSION_SYSTEM_VECTOR:")) { 1322 current_compound.setExpressionSystemVector(value); 1323 } else if (field.equals("EXPRESSION_SYSTEM_PLASMID:")) { 1324 current_compound.setExpressionSystemPlasmid(value); 1325 } else if (field.equals("EXPRESSION_SYSTEM_GENE:")) { 1326 current_compound.setExpressionSystemGene(value); 1327 } else if (field.equals("OTHER_DETAILS:")) { 1328 current_compound.setExpressionSystemOtherDetails(value); 1329 } 1330 1331 } 1332 1333 /** 1334 * Handler for REMARK lines 1335 */ 1336 private void pdb_REMARK_Handler(String line) { 1337 1338 if ( line == null || line.length() < 11) 1339 return; 1340 1341 1342 if (line.startsWith("REMARK 800")) { 1343 pdb_REMARK_800_Handler(line); 1344 1345 } else if ( line.startsWith("REMARK 350")){ 1346 1347 if ( params.isParseBioAssembly()) { 1348 1349 if (bioAssemblyParser == null){ 1350 bioAssemblyParser = new PDBBioAssemblyParser(); 1351 } 1352 1353 
bioAssemblyParser.pdb_REMARK_350_Handler(line); 1354 } 1355 1356 // REMARK 3 (for R free) 1357 // note: if more than 1 value present (occurring in hybrid experimental technique entries, e.g. 3ins, 4n9m) 1358 // then last one encountered will be taken 1359 } else if (line.startsWith("REMARK 3 FREE R VALUE")) { 1360 1361 // Rfree annotation is not very consistent in PDB format, it varies depending on the software 1362 // Here we follow this strategy: 1363 // a) take the '(NO CUTOFF)' value if the only one available (shelx software, e.g. 1x7q) 1364 // b) don't take it if also a line without '(NO CUTOFF)' is present (CNX software, e.g. 3lak) 1365 1366 Pattern pR = Pattern.compile("^REMARK 3 FREE R VALUE\\s+(?:\\(NO CUTOFF\\))?\\s+:\\s+(\\d?\\.\\d+).*"); 1367 Matcher mR = pR.matcher(line); 1368 if (mR.matches()) { 1369 try { 1370 rfreeNoCutoffLine = Float.parseFloat(mR.group(1)); 1371 } catch (NumberFormatException e) { 1372 logger.info("Rfree value "+mR.group(1)+" does not look like a number, will ignore it"); 1373 } 1374 } 1375 pR = Pattern.compile("^REMARK 3 FREE R VALUE\\s+:\\s+(\\d?\\.\\d+).*"); 1376 mR = pR.matcher(line); 1377 if (mR.matches()) { 1378 try { 1379 rfreeStandardLine = Float.parseFloat(mR.group(1)); 1380 } catch (NumberFormatException e) { 1381 logger.info("Rfree value '{}' does not look like a number, will ignore it", mR.group(1)); 1382 } 1383 } 1384 1385 // REMARK 3 RESOLUTION (contains more info than REMARK 2, for instance multiple resolutions in hybrid experimental technique entries) 1386 // note: if more than 1 value present (occurring in hybrid experimental technique entries, e.g. 
3ins, 4n9m) 1387 // then last one encountered will be taken 1388 } else if (line.startsWith("REMARK 3 RESOLUTION RANGE HIGH")){ 1389 Pattern pR = Pattern.compile("^REMARK 3 RESOLUTION RANGE HIGH \\(ANGSTROMS\\) :\\s+(\\d+\\.\\d+).*"); 1390 Matcher mR = pR.matcher(line); 1391 if (mR.matches()) { 1392 try { 1393 float res = Float.parseFloat(mR.group(1)); 1394 if (pdbHeader.getResolution()!=PDBHeader.DEFAULT_RESOLUTION) { 1395 logger.warn("More than 1 resolution value present, will use last one {} and discard previous {} " 1396 ,mR.group(1), String.format("%4.2f",pdbHeader.getResolution())); 1397 } 1398 pdbHeader.setResolution(res); 1399 } catch (NumberFormatException e) { 1400 logger.info("Could not parse resolution '{}', ignoring it",mR.group(1)); 1401 } 1402 } 1403 } 1404 1405 } 1406 1407 1408 1409 1410 1411 1412 /** 1413 * Handler for 1414 * EXPDTA Record Format 1415 <pre> 1416 COLUMNS DATA TYPE FIELD DEFINITION 1417 ------------------------------------------------------------------------------- 1418 1 - 6 Record name "EXPDTA" 1419 9 - 10 Continuation continuation Allows concatenation of multiple 1420 records. 1421 11 - 70 SList technique The experimental technique(s) with 1422 optional comment describing the 1423 sample or experiment. 1424 1425 allowed techniques are: 1426 ELECTRON DIFFRACTION 1427 FIBER DIFFRACTION 1428 FLUORESCENCE TRANSFER 1429 NEUTRON DIFFRACTION 1430 NMR 1431 THEORETICAL MODEL 1432 X-RAY DIFFRACTION 1433 </pre> 1434 */ 1435 private void pdb_EXPDTA_Handler(String line) { 1436 1437 String technique ; 1438 if (line.length() > 69) 1439 technique = line.substring (10, 70).trim() ; 1440 else 1441 technique = line.substring(10).trim(); 1442 1443 for (String singleTechnique: technique.split(";\\s+")) { 1444 pdbHeader.setExperimentalTechnique(singleTechnique); 1445 } 1446 1447 1448 } 1449 1450 /** 1451 * Handler for 1452 * CRYST1 Record Format 1453 * The CRYST1 record presents the unit cell parameters, space group, and Z value. 
1454 * If the entry describes a structure determined by a technique other than X-ray crystallography, 1455 * CRYST1 contains a = b = c = 1.0, alpha = beta = gamma = 90 degrees, space group = P 1, and Z =1. 1456 * <pre> 1457 * COLUMNS DATA TYPE FIELD DEFINITION 1458 * ------------------------------------------------------------- 1459 * 1 - 6 Record name "CRYST1" 1460 * 7 - 15 Real(9.3) a a (Angstroms). 1461 * 16 - 24 Real(9.3) b b (Angstroms). 1462 * 25 - 33 Real(9.3) c c (Angstroms). 1463 * 34 - 40 Real(7.2) alpha alpha (degrees). 1464 * 41 - 47 Real(7.2) beta beta (degrees). 1465 * 48 - 54 Real(7.2) gamma gamma (degrees). 1466 * 56 - 66 LString sGroup Space group. 1467 * 67 - 70 Integer z Z value. 1468 * </pre> 1469 */ 1470 private void pdb_CRYST1_Handler(String line) { 1471 // for badly formatted files (e.g. phenix-produced ones), there's no z and the min length is 58 (e.g. for SG 'P 1') 1472 if (line.length() < 58) { 1473 logger.warn("CRYST1 record has fewer than 58 columns: will ignore it"); 1474 return; 1475 } 1476 1477 float a; 1478 float b; 1479 float c; 1480 float alpha; 1481 float beta; 1482 float gamma; 1483 String spaceGroup = ""; 1484 1485 try { 1486 a = Float.parseFloat(line.substring(6,15).trim()); 1487 b = Float.parseFloat(line.substring(15,24).trim()); 1488 c = Float.parseFloat(line.substring(24,33).trim()); 1489 alpha = Float.parseFloat(line.substring(33,40).trim()); 1490 beta = Float.parseFloat(line.substring(40,47).trim()); 1491 gamma = Float.parseFloat(line.substring(47,54).trim()); 1492 } catch (NumberFormatException e) { 1493 logger.info("could not parse CRYST1 record ("+e.getMessage()+") from line and ignoring it " + line); 1494 return ; 1495 } 1496 if (line.length()>=66) { 1497 // for well formatted files 1498 spaceGroup = line.substring(55,66).trim(); 1499 } else { 1500 // for not-so-well formatted files, e.g. 
phenix-produced ones: they lack a Z value 1501 spaceGroup = line.substring(55,line.length()).trim(); 1502 } 1503 1504 CrystalCell xtalCell = new CrystalCell(); 1505 xtalCell.setA(a); 1506 xtalCell.setB(b); 1507 xtalCell.setC(c); 1508 xtalCell.setAlpha(alpha); 1509 xtalCell.setBeta(beta); 1510 xtalCell.setGamma(gamma); 1511 1512 if (!xtalCell.isCellReasonable()) { 1513 // If the entry describes a structure determined by a technique other than X-ray crystallography, 1514 // CRYST1 contains a = b = c = 1.0, alpha = beta = gamma = 90 degrees, space group = P 1, and Z =1. 1515 // if so we don't add the crystal cell and it remains null 1516 logger.debug("The crystal cell read from file does not have reasonable dimensions (at least one dimension is below {}), discarding it.", 1517 CrystalCell.MIN_VALID_CELL_SIZE); 1518 } else { 1519 crystallographicInfo.setCrystalCell(xtalCell); 1520 } 1521 1522 SpaceGroup sg = SymoplibParser.getSpaceGroup(spaceGroup); 1523 if (sg==null) { 1524 logger.warn("Space group '"+spaceGroup+"' not recognised as a standard space group"); 1525 crystallographicInfo.setNonStandardSg(true); 1526 } else { 1527 crystallographicInfo.setSpaceGroup(sg); 1528 crystallographicInfo.setNonStandardSg(false); 1529 } 1530 } 1531 1532 /** 1533 * Handler for MTRIXn records. They specify extra NCS operators (usually in virus entries) 1534 * 1535 * See http://www.wwpdb.org/documentation/format33/sect8.html#MTRIXn 1536 * <pre> 1537 * COLUMNS DATA TYPE FIELD DEFINITION 1538 * ------------------------------------------------------------- 1539 * 1540 * 1 - 6 Record name "MTRIXn" n=1, 2, or 3 1541 * 8 - 10 Integer serial Serial number. 
1542 * 11 - 20 Real(10.6) m[n][1] Mn1 1543 * 21 - 30 Real(10.6) m[n][2] Mn2 1544 * 31 - 40 Real(10.6) m[n][3] Mn3 1545 * 46 - 55 Real(10.5) v[n] Vn 1546 * 60 Integer iGiven 1 1547 * 1548 * </pre> 1549 * Note that we ignore operators with iGiven==1 1550 * 1551 * @param line 1552 */ 1553 private void pdb_MTRIXn_Handler(String line) { 1554 1555 // don't process incomplete records 1556 if (line.length() < 55) { 1557 logger.info("MTRIXn record has fewer than 55 columns: will ignore it"); 1558 return; 1559 } 1560 1561 1562 try { 1563 1564 int rowIndex = Integer.parseInt(line.substring(5,6)); 1565 double col1Value = Double.parseDouble(line.substring(10,20)); 1566 double col2Value = Double.parseDouble(line.substring(20,30)); 1567 double col3Value = Double.parseDouble(line.substring(30,40)); 1568 double translValue = Double.parseDouble(line.substring(45,55)); 1569 int iGiven = 0; 1570 if (line.length()>=60 && !line.substring(59,60).trim().isEmpty()) { 1571 iGiven = Integer.parseInt(line.substring(59,60)); 1572 } 1573 1574 if (iGiven == 1) return; 1575 1576 if (ncsOperators==null) { 1577 // we initialise on first pass 1578 ncsOperators = new ArrayList<Matrix4d>(); 1579 } 1580 1581 if (currentNcsOp==null) { 1582 currentNcsOp = new Matrix4d(1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1); // initialised to identity 1583 } 1584 1585 currentNcsOp.setElement(rowIndex-1, 0, col1Value); 1586 currentNcsOp.setElement(rowIndex-1, 1, col2Value); 1587 currentNcsOp.setElement(rowIndex-1, 2, col3Value); 1588 currentNcsOp.setElement(rowIndex-1, 3, translValue); 1589 1590 1591 if (rowIndex==3) { 1592 ncsOperators.add(currentNcsOp); 1593 // we initialise for next matrix to come 1594 currentNcsOp = new Matrix4d(1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1); // initialised to identity 1595 } 1596 1597 } catch (NumberFormatException e) { 1598 logger.info("Could not parse a number in MTRIXn record ("+e.getMessage()+") from line: >" + line+"<"); 1599 } 1600 } 1601 1602 /** 1603 * Handler for ATOM. 
1604 * Record Format: 1605 * 1606 * <pre> 1607 * ATOM 1 N ASP A 15 110.964 24.941 59.191 1.00 83.44 N 1608 * 1609 * COLUMNS DATA TYPE FIELD DEFINITION 1610 * --------------------------------------------------------------------------------- 1611 * 1 - 6 Record name "ATOM " 1612 * 7 - 11 Integer serial Atom serial number. 1613 * 13 - 16 Atom name Atom name. 1614 * 17 Character altLoc Alternate location indicator. 1615 * 18 - 20 Residue name resName Residue name. 1616 * 22 Character chainID Chain identifier. 1617 * 23 - 26 Integer resSeq Residue sequence number. 1618 * 27 AChar iCode Code for insertion of residues. 1619 * 31 - 38 Real(8.3) x Orthogonal coordinates for X in Angstroms. 1620 * 39 - 46 Real(8.3) y Orthogonal coordinates for Y in Angstroms. 1621 * 47 - 54 Real(8.3) z Orthogonal coordinates for Z in Angstroms. 1622 * 55 - 60 Real(6.2) occupancy Occupancy. 1623 * 61 - 66 Real(6.2) tempFactor Temperature factor. 1624 * 73 - 76 LString(4) segID Segment identifier, left-justified. 1625 * 77 - 78 LString(2) element Element symbol, right-justified. 1626 * 79 - 80 LString(2) charge Charge on the atom. 
1627 * </pre> 1628 */ 1629 private void pdb_ATOM_Handler(String line) { 1630 1631 if ( params.isHeaderOnly()) 1632 return; 1633 1634 // let's first get the chain name which will serve to identify if we are starting a new molecule 1635 String chainName = line.substring(21,22); 1636 1637 if (chainName.equals(" ")) { 1638 blankChainIdsPresent = true; 1639 } 1640 1641 if (currentChain!=null && !currentChain.getName().equals(chainName)) { 1642 // new chain name: another molecule coming 1643 startOfMolecule = true; 1644 } 1645 1646 if (startOfMolecule) { 1647 // we add last chain if there was one 1648 if (currentChain!=null) { 1649 currentModel.add(currentChain); 1650 // let's not forget adding the last group to the finishing chain 1651 if (currentGroup!=null) { 1652 currentChain.addGroup(currentGroup); 1653 } 1654 } 1655 // we initialise the new molecule to come 1656 currentChain = new ChainImpl(); 1657 // note that the chainId (asym id) is set properly later in assignAsymIds 1658 currentChain.setId(chainName); 1659 currentChain.setName(chainName); 1660 1661 } 1662 1663 if (startOfModel) { 1664 // we add last model if there was one 1665 if (currentModel!=null) { 1666 allModels.add(currentModel); 1667 } 1668 // we initialise the model to come 1669 currentModel = new ArrayList<>(); 1670 } 1671 1672 1673 // let's get the residue number and see if we need to start a new group 1674 1675 String groupCode3 = line.substring(17,20).trim(); 1676 String resNum = line.substring(22,26).trim(); 1677 Character iCode = line.substring(26,27).charAt(0); 1678 if ( iCode == ' ') 1679 iCode = null; 1680 ResidueNumber residueNumber = new ResidueNumber(chainName, Integer.valueOf(resNum), iCode); 1681 1682 //recordName groupCode3 1683 //| | resNum 1684 //| | | iCode 1685 //| | | | | || 1686 //ATOM 1 N ASP A 15 110.964 24.941 59.191 1.00 83.44 N 1687 //ATOM 1964 N ARG H 221A 5.963 -16.715 27.669 1.00 28.59 N 1688 1689 Character aminoCode1 = StructureTools.get1LetterCode(groupCode3); 1690 1691 
String recordName = line.substring (0, 6).trim (); 1692 1693 boolean isHetAtomInFile = false; 1694 1695 if (recordName.equals("HETATM") ){ 1696 // HETATOM RECORDS are treated slightly differently 1697 // some modified amino acids that we want to treat as amino acids 1698 // can be found as HETATOM records 1699 if ( aminoCode1 != null && aminoCode1.equals(StructureTools.UNKNOWN_GROUP_LABEL)) 1700 aminoCode1 = null; 1701 1702 isHetAtomInFile = true; 1703 } 1704 1705 if ( startOfMolecule) { 1706 1707 currentGroup = getNewGroup(recordName, aminoCode1, groupCode3); 1708 1709 currentGroup.setPDBName(groupCode3); 1710 currentGroup.setResidueNumber(residueNumber); 1711 currentGroup.setHetAtomInFile(isHetAtomInFile); 1712 1713 } 1714 1715 // resetting states 1716 startOfModel = false; 1717 startOfMolecule = false; 1718 1719 1720 Character altLoc = new Character(line.substring (16, 17).charAt(0)); 1721 Group altGroup = null; 1722 1723 1724 // check if residue number is the same ... 1725 if ( ! residueNumber.equals(currentGroup.getResidueNumber())) { 1726 1727 currentChain.addGroup(currentGroup); 1728 currentGroup.trimToSize(); 1729 1730 currentGroup = getNewGroup(recordName, aminoCode1, groupCode3); 1731 1732 currentGroup.setPDBName(groupCode3); 1733 currentGroup.setResidueNumber(residueNumber); 1734 currentGroup.setHetAtomInFile(isHetAtomInFile); 1735 1736 } else { 1737 // same residueNumber, but altLocs... 1738 1739 // test altLoc 1740 if ( ! altLoc.equals(' ')) { 1741 logger.debug("found altLoc! " + currentGroup + " " + altGroup); 1742 altGroup = getCorrectAltLocGroup( altLoc,recordName,aminoCode1,groupCode3); 1743 if ( altGroup.getChain() == null) { 1744 // need to set current chain 1745 altGroup.setChain(currentChain); 1746 } 1747 1748 } 1749 } 1750 1751 atomCount++; 1752 1753 if ( atomCount == atomCAThreshold ) { 1754 // throw away the SEQRES lines - too much to deal with... 
1755 logger.warn("more than " + atomCAThreshold + " atoms in this structure, ignoring the SEQRES lines"); 1756 seqResChains.clear(); 1757 1758 switchCAOnly(); 1759 1760 } 1761 1762 1763 1764 if ( atomCount == loadMaxAtoms){ 1765 logger.warn("File has more atoms than max specified in parsing parameters ({}). Ignoring atoms after line: {}", loadMaxAtoms, line); 1766 return; 1767 } 1768 if ( atomCount > loadMaxAtoms){ 1769 return; 1770 } 1771 1772 1773 // 1 2 3 4 5 6 1774 //012345678901234567890123456789012345678901234567890123456789 1775 //ATOM 1 N MET 1 20.154 29.699 5.276 1.0 1776 //ATOM 112 CA ASP 112 41.017 33.527 28.371 1.00 0.00 1777 //ATOM 53 CA MET 7 23.772 33.989 -21.600 1.00 0.00 C 1778 //ATOM 112 CA ASP 112 37.613 26.621 33.571 0 0 1779 1780 1781 String fullname = line.substring (12, 16); 1782 1783 // check for CA only if requested 1784 if ( parseCAonly ){ 1785 // yes , user wants to get CA only 1786 // only parse CA atoms... 1787 if (! fullname.equals(" CA ")){ 1788 //System.out.println("ignoring " + line); 1789 atomCount--; 1790 return; 1791 } 1792 } 1793 1794 if ( params.getAcceptedAtomNames() != null) { 1795 1796 boolean found = false; 1797 for (String ok : params.getAcceptedAtomNames()){ 1798 //System.out.println(ok + "< >" + fullname +"<"); 1799 1800 if ( ok.equals(fullname.trim())) { 1801 found = true; 1802 break; 1803 } 1804 } 1805 if ( ! 
found) { 1806 atomCount--; 1807 return; 1808 } 1809 } 1810 // create new atom 1811 1812 int pdbnumber = Integer.parseInt (line.substring (6, 11).trim ()); 1813 AtomImpl atom = new AtomImpl() ; 1814 atom.setPDBserial(pdbnumber) ; 1815 1816 atom.setAltLoc(altLoc); 1817 atom.setName(fullname.trim()); 1818 1819 double x = Double.parseDouble (line.substring (30, 38).trim()); 1820 double y = Double.parseDouble (line.substring (38, 46).trim()); 1821 double z = Double.parseDouble (line.substring (46, 54).trim()); 1822 1823 double[] coords = new double[3]; 1824 coords[0] = x ; 1825 coords[1] = y ; 1826 coords[2] = z ; 1827 atom.setCoords(coords); 1828 1829 float occu = 1.0f; 1830 if ( line.length() > 59 ) { 1831 try { 1832 // occu and tempf are sometimes not used :-/ 1833 occu = Float.parseFloat (line.substring (54, 60).trim()); 1834 } catch (NumberFormatException e){} 1835 } 1836 1837 float tempf = 0.0f; 1838 if ( line.length() > 65) { 1839 try { 1840 tempf = Float.parseFloat (line.substring (60, 66).trim()); 1841 } catch (NumberFormatException e){} 1842 } 1843 1844 atom.setOccupancy( occu ); 1845 atom.setTempFactor( tempf ); 1846 1847 1848 1849 1850 // Parse element from the element field. If this field is 1851 // missing (i.e. misformatted PDB file), then parse the 1852 // element from the chemical component. 1853 Element element = Element.R; 1854 boolean guessElement = true; 1855 if ( line.length() > 77 ) { 1856 // parse element from element field 1857 String elementSymbol = line.substring(76, 78).trim(); 1858 if (elementSymbol.isEmpty()) { 1859 logger.info("Element column was empty for atom {} {}. Assigning atom element " 1860 + "from Chemical Component Dictionary information", fullname.trim(), pdbnumber); 1861 } else { 1862 1863 try { 1864 element = Element.valueOfIgnoreCase(elementSymbol); 1865 guessElement = false; 1866 } catch (IllegalArgumentException e){ 1867 logger.info("Element {} of atom {} {} was not recognised. 
Assigning atom element " 1868 + "from Chemical Component Dictionary information", elementSymbol, 1869 fullname.trim(), pdbnumber); 1870 } 1871 } 1872 } else { 1873 logger.info("Missformatted PDB file: element column of atom {} {} is not present. " 1874 + "Assigning atom element from Chemical Component Dictionary information", 1875 fullname.trim(), pdbnumber); 1876 } 1877 if (guessElement) { 1878 String elementSymbol = null; 1879 if (currentGroup.getChemComp() != null) { 1880 for (ChemCompAtom a : currentGroup.getChemComp().getAtoms()) { 1881 if (a.getAtom_id().equals(fullname.trim())) { 1882 elementSymbol = a.getType_symbol(); 1883 break; 1884 } 1885 } 1886 if (elementSymbol == null) { 1887 logger.info("Atom name {} was not found in the Chemical Component Dictionary information of {}. " 1888 + "Assigning generic element R to it", fullname.trim(), currentGroup.getPDBName()); 1889 } else { 1890 try { 1891 element = Element.valueOfIgnoreCase(elementSymbol); 1892 } catch (IllegalArgumentException e) { 1893 // this can still happen for cases like UNK 1894 logger.info("Element symbol {} found in chemical component dictionary for Atom {} {} could not be recognised as a known element. " 1895 + "Assigning generic element R to it", elementSymbol, fullname.trim(), pdbnumber); 1896 } 1897 } 1898 } else { 1899 logger.warn("Chemical Component Dictionary information was not found for Atom name {}. " 1900 + "Assigning generic element R to it", fullname.trim()); 1901 } 1902 1903 } 1904 atom.setElement(element); 1905 1906 1907 //see if chain_id is one of the previous chains ... 1908 if ( altGroup != null) { 1909 altGroup.addAtom(atom); 1910 altGroup = null; 1911 } 1912 else { 1913 currentGroup.addAtom(atom); 1914 } 1915 1916 1917 // make sure that main group has all atoms 1918 // GitHub issue: #76 1919 if ( ! 
currentGroup.hasAtom(atom.getName())) { 1920 currentGroup.addAtom(atom); 1921 } 1922 1923 1924 1925 } 1926 1927 1928 private Group getCorrectAltLocGroup( Character altLoc, 1929 String recordName, Character aminoCode1, String groupCode3) { 1930 1931 // see if we know this altLoc already; 1932 List<Atom> atoms = currentGroup.getAtoms(); 1933 if ( atoms.size() > 0) { 1934 Atom a1 = atoms.get(0); 1935 // we are just adding atoms to the current group 1936 // probably there is a second group following later... 1937 if (a1.getAltLoc().equals(altLoc)) { 1938 1939 return currentGroup; 1940 } 1941 } 1942 1943 List<Group> altLocs = currentGroup.getAltLocs(); 1944 for ( Group altLocG : altLocs ){ 1945 atoms = altLocG.getAtoms(); 1946 if ( atoms.size() > 0) { 1947 for ( Atom a1 : atoms) { 1948 if (a1.getAltLoc().equals( altLoc)) { 1949 1950 return altLocG; 1951 } 1952 } 1953 } 1954 } 1955 1956 // no matching altLoc group found. 1957 // build it up. 1958 1959 if ( groupCode3.equals(currentGroup.getPDBName())) { 1960 if ( currentGroup.getAtoms().size() == 0) { 1961 //System.out.println("current group is empty " + current_group + " " + altLoc); 1962 return currentGroup; 1963 } 1964 //System.out.println("cloning current group " + current_group + " " + current_group.getAtoms().get(0).getAltLoc() + " altLoc " + altLoc); 1965 Group altLocG = (Group) currentGroup.clone(); 1966 // drop atoms from cloned group... 
1967 // https://redmine.open-bio.org/issues/3307 1968 altLocG.setAtoms(new ArrayList<Atom>()); 1969 altLocG.getAltLocs().clear(); 1970 currentGroup.addAltLoc(altLocG); 1971 return altLocG; 1972 } 1973 1974 // System.out.println("new group " + recordName + " " + aminoCode1 + " " +groupCode3); 1975 Group altLocG = getNewGroup(recordName,aminoCode1,groupCode3); 1976 1977 1978 altLocG.setPDBName(groupCode3); 1979 1980 altLocG.setResidueNumber(currentGroup.getResidueNumber()); 1981 currentGroup.addAltLoc(altLocG); 1982 return altLocG; 1983 } 1984 1985 private void switchCAOnly(){ 1986 parseCAonly = true; 1987 1988 1989 currentModel = CAConverter.getRepresentativeAtomsOnly(currentModel); 1990 1991 for ( int i =0; i< structure.nrModels() ; i++){ 1992 // iterate over all known models ... 1993 List<Chain> model = structure.getModel(i); 1994 model = CAConverter.getRepresentativeAtomsOnly(model); 1995 structure.setModel(i,model); 1996 } 1997 1998 currentChain = CAConverter.getRepresentativeAtomsOnly(currentChain); 1999 2000 } 2001 2002 2003 /** safes repeating a few lines ... */ 2004 private Integer conect_helper (String line,int start,int end) { 2005 if (line.length() < end) return null; 2006 2007 String sbond = line.substring(start,end).trim(); 2008 int bond = -1 ; 2009 Integer b = null ; 2010 2011 if ( ! 
sbond.equals("")) { 2012 bond = Integer.parseInt(sbond); 2013 b = new Integer(bond); 2014 } 2015 2016 return b ; 2017 } 2018 2019 /** 2020 * Handler for CONECT Record Format 2021 <pre> 2022 COLUMNS DATA TYPE FIELD DEFINITION 2023 --------------------------------------------------------------------------------- 2024 1 - 6 Record name "CONECT" 2025 7 - 11 Integer serial Atom serial number 2026 12 - 16 Integer serial Serial number of bonded atom 2027 17 - 21 Integer serial Serial number of bonded atom 2028 22 - 26 Integer serial Serial number of bonded atom 2029 27 - 31 Integer serial Serial number of bonded atom 2030 32 - 36 Integer serial Serial number of hydrogen bonded 2031 atom 2032 37 - 41 Integer serial Serial number of hydrogen bonded 2033 atom 2034 42 - 46 Integer serial Serial number of salt bridged 2035 atom 2036 47 - 51 Integer serial Serial number of hydrogen bonded 2037 atom 2038 52 - 56 Integer serial Serial number of hydrogen bonded 2039 atom 2040 57 - 61 Integer serial Serial number of salt bridged 2041 atom 2042 </pre> 2043 */ 2044 private void pdb_CONECT_Handler(String line) { 2045 2046 if ( atomOverflow) { 2047 return ; 2048 } 2049 if (params.isHeaderOnly()) { 2050 return; 2051 } 2052 2053 // this try .. catch is e.g. to catch 1gte which has wrongly formatted lines... 
2054 try { 2055 int atomserial = Integer.parseInt (line.substring(6 ,11).trim()); 2056 Integer bond1 = conect_helper(line,11,16); 2057 Integer bond2 = conect_helper(line,16,21); 2058 Integer bond3 = conect_helper(line,21,26); 2059 Integer bond4 = conect_helper(line,26,31); 2060 Integer hyd1 = conect_helper(line,31,36); 2061 Integer hyd2 = conect_helper(line,36,41); 2062 Integer salt1 = conect_helper(line,41,46); 2063 Integer hyd3 = conect_helper(line,46,51); 2064 Integer hyd4 = conect_helper(line,51,56); 2065 Integer salt2 = conect_helper(line,56,61); 2066 2067 //System.out.println(atomserial+ " "+ bond1 +" "+bond2+ " " +bond3+" "+bond4+" "+ 2068 // hyd1+" "+hyd2 +" "+salt1+" "+hyd3+" "+hyd4+" "+salt2); 2069 HashMap<String, Integer> cons = new HashMap<String, Integer>(); 2070 cons.put("atomserial",new Integer(atomserial)); 2071 2072 if ( bond1 != null) cons.put("bond1",bond1); 2073 if ( bond2 != null) cons.put("bond2",bond2); 2074 if ( bond3 != null) cons.put("bond3",bond3); 2075 if ( bond4 != null) cons.put("bond4",bond4); 2076 if ( hyd1 != null) cons.put("hydrogen1",hyd1); 2077 if ( hyd2 != null) cons.put("hydrogen2",hyd2); 2078 if ( salt1 != null) cons.put("salt1",salt1); 2079 if ( hyd3 != null) cons.put("hydrogen3",hyd3); 2080 if ( hyd4 != null) cons.put("hydrogen4",hyd4); 2081 if ( salt2 != null) cons.put("salt2",salt2); 2082 2083 connects.add(cons); 2084 } catch (NumberFormatException e){ 2085 logger.info("could not parse CONECT line correctly ("+e.getMessage()+"), at line : " + line); 2086 return; 2087 } 2088 } 2089 2090 /** 2091 * Handler for MODEL Record Format 2092 * <pre> 2093 * COLUMNS DATA TYPE FIELD DEFINITION 2094 * ---------------------------------------------------------------------- 2095 * 1 - 6 Record name "MODEL " 2096 * 11 - 14 Integer serial Model serial number. 
2097 * </pre> 2098 */ 2099 private void pdb_MODEL_Handler(String line) { 2100 2101 if (params.isHeaderOnly()) return; 2102 2103 // new model: we start a new molecule 2104 startOfMolecule = true; 2105 startOfModel = true; 2106 2107 } 2108 2109 /** 2110 * Handler for TER record. The record is used in deposited PDB files and many others, 2111 * but it's often forgotten by some softwares. In any case it helps identifying the 2112 * start of ligand molecules so we use it for that. 2113 */ 2114 private void pdb_TER_Handler() { 2115 startOfMolecule = true; 2116 } 2117 2118 2119 /** 2120 * DBREF handler 2121 * <pre> 2122 * COLUMNS DATA TYPE FIELD DEFINITION 2123 * ---------------------------------------------------------------- 2124 * 1 - 6 Record name "DBREF " 2125 * 8 - 11 IDcode idCode ID code of this entry. 2126 * 13 Character chainID Chain identifier. 2127 * 15 - 18 Integer seqBegin Initial sequence number 2128 * of the PDB sequence segment. 2129 * 19 AChar insertBegin Initial insertion code 2130 * of the PDB sequence segment. 2131 * 21 - 24 Integer seqEnd Ending sequence number 2132 * of the PDB sequence segment. 2133 * 25 AChar insertEnd Ending insertion code 2134 * of the PDB sequence segment. 2135 * 27 - 32 LString database Sequence database name. 2136 * 34 - 41 LString dbAccession Sequence database accession code. 2137 * 43 - 54 LString dbIdCode Sequence database 2138 * identification code. 2139 * 56 - 60 Integer dbseqBegin Initial sequence number of the 2140 * database seqment. 2141 * 61 AChar idbnsBeg Insertion code of initial residue 2142 * of the segment, if PDB is the 2143 * reference. 2144 * 63 - 67 Integer dbseqEnd Ending sequence number of the 2145 * database segment. 2146 * 68 AChar dbinsEnd Insertion code of the ending 2147 * residue of the segment, if PDB is 2148 * the reference. 
2149 * </pre> 2150 */ 2151 private void pdb_DBREF_Handler(String line){ 2152 2153 logger.debug("Parsing DBREF " + line); 2154 2155 DBRef dbref = new DBRef(); 2156 String idCode = line.substring(7,11); 2157 String chainName = line.substring(12,13); 2158 String seqBegin = line.substring(14,18); 2159 String insertBegin = line.substring(18,19); 2160 String seqEnd = line.substring(20,24); 2161 String insertEnd = line.substring(24,25); 2162 String database = line.substring(26,32); 2163 String dbAccession = line.substring(33,41); 2164 String dbIdCode = line.substring(42,54); 2165 String dbseqBegin = line.substring(55,60); 2166 String idbnsBeg = line.substring(60,61); 2167 String dbseqEnd = line.substring(62,67); 2168 // Support implicit space character at end 2169 String dbinsEnd; 2170 if(line.length() >= 68) 2171 dbinsEnd = line.substring(67,68); 2172 else 2173 dbinsEnd = " "; 2174 2175 dbref.setIdCode(idCode); 2176 dbref.setChainName(chainName); 2177 dbref.setSeqBegin(intFromString(seqBegin)); 2178 dbref.setInsertBegin(insertBegin.charAt(0)); 2179 dbref.setSeqEnd(intFromString(seqEnd)); 2180 dbref.setInsertEnd(insertEnd.charAt(0)); 2181 dbref.setDatabase(database.trim()); 2182 dbref.setDbAccession(dbAccession.trim()); 2183 dbref.setDbIdCode(dbIdCode.trim()); 2184 dbref.setDbSeqBegin(intFromString(dbseqBegin)); 2185 dbref.setIdbnsBegin(idbnsBeg.charAt(0)); 2186 dbref.setDbSeqEnd(intFromString(dbseqEnd)); 2187 dbref.setIdbnsEnd(dbinsEnd.charAt(0)); 2188 2189 //System.out.println(dbref.toPDB()); 2190 dbrefs.add(dbref); 2191 } 2192 2193 2194 /** 2195 * Process the disulfide bond info provided by an SSBOND record 2196 * 2197 * <pre> 2198 COLUMNS DATA TYPE FIELD DEFINITION 2199 ------------------------------------------------------------------- 2200 1 - 6 Record name "SSBOND" 2201 8 - 10 Integer serNum Serial number. 2202 12 - 14 LString(3) "CYS" Residue name. 2203 16 Character chainID1 Chain identifier. 2204 18 - 21 Integer seqNum1 Residue sequence number. 
2205 22 AChar icode1 Insertion code. 2206 26 - 28 LString(3) "CYS" Residue name. 2207 30 Character chainID2 Chain identifier. 2208 32 - 35 Integer seqNum2 Residue sequence number. 2209 36 AChar icode2 Insertion code. 2210 60 - 65 SymOP sym1 Symmetry oper for 1st resid 2211 67 - 72 SymOP sym2 Symmetry oper for 2nd resid 2212 * </pre> 2213 */ 2214 private void pdb_SSBOND_Handler(String line){ 2215 2216 if (params.isHeaderOnly()) return; 2217 2218 if (line.length()<36) { 2219 logger.info("SSBOND line has length under 36. Ignoring it."); 2220 return; 2221 } 2222 2223 String chain1 = line.substring(15,16); 2224 String seqNum1 = line.substring(17,21).trim(); 2225 String icode1 = line.substring(21,22); 2226 String chain2 = line.substring(29,30); 2227 String seqNum2 = line.substring(31,35).trim(); 2228 String icode2 = line.substring(35,36); 2229 2230 if (line.length()>=72) { 2231 String symop1 = line.substring(59, 65).trim(); 2232 String symop2 = line.substring(66, 72).trim(); 2233 2234 // until we implement proper treatment of symmetry in biojava #220, we can't deal with sym-related parteners properly, skipping them 2235 if (!symop1.equals("") && !symop2.equals("") && // in case the field is missing 2236 (!symop1.equals("1555") || !symop2.equals("1555")) ) { 2237 logger.info("Skipping ss bond between groups {} and {} belonging to different symmetry partners, because it is not supported yet", seqNum1+icode1, seqNum2+icode2); 2238 return; 2239 } 2240 } 2241 2242 if (icode1.equals(" ")) 2243 icode1 = ""; 2244 if (icode2.equals(" ")) 2245 icode2 = ""; 2246 2247 SSBondImpl ssbond = new SSBondImpl(); 2248 2249 ssbond.setChainID1(chain1); 2250 ssbond.setResnum1(seqNum1); 2251 ssbond.setChainID2(chain2); 2252 ssbond.setResnum2(seqNum2); 2253 ssbond.setInsCode1(icode1); 2254 ssbond.setInsCode2(icode2); 2255 ssbonds.add(ssbond); 2256 } 2257 2258 2259 /** 2260 * Takes care of LINK records. 
These take the format of: 2261 * 2262 * <pre> 2263 * COLUMNS DATA TYPE FIELD DEFINITION 2264 * -------------------------------------------------------------------------------- 2265 * 1 - 6 Record name "LINK " 2266 * 13 - 16 Atom name1 Atom name. 2267 * 17 Character altLoc1 Alternate location indicator. 2268 * 18 - 20 Residue name resName1 Residue name. 2269 * 22 Character chainID1 Chain identifier. 2270 * 23 - 26 Integer resSeq1 Residue sequence number. 2271 * 27 AChar iCode1 Insertion code. 2272 * 43 - 46 Atom name2 Atom name. 2273 * 47 Character altLoc2 Alternate location indicator. 2274 * 48 - 50 Residue name resName2 Residue name. 2275 * 52 Character chainID2 Chain identifier. 2276 * 53 - 56 Integer resSeq2 Residue sequence number. 2277 * 57 AChar iCode2 Insertion code. 2278 * 60 - 65 SymOP sym1 Symmetry operator for 1st atom. 2279 * 67 - 72 SymOP sym2 Symmetry operator for 2nd atom. 2280 * </pre> 2281 * 2282 * (From http://www.wwpdb.org/documentation/format32/sect6.html#LINK) 2283 * 2284 * @param line the LINK record line to parse. 2285 */ 2286 private void pdb_LINK_Handler(String line) { 2287 2288 if (params.isHeaderOnly()) return; 2289 2290 // Check for the minimal set of fields. 2291 if (line.length()<56) { 2292 logger.info("LINK line has length under 56. 
Ignoring it."); 2293 return; 2294 } 2295 2296 int len = line.length(); 2297 2298 String name1 = line.substring(12, 16).trim(); 2299 String altLoc1 = line.substring(16, 17).trim(); 2300 String resName1 = line.substring(17, 20).trim(); 2301 String chainID1 = line.substring(21, 22).trim(); 2302 String resSeq1 = line.substring(22, 26).trim(); 2303 String iCode1 = line.substring(26, 27).trim(); 2304 2305 String name2 = line.substring(42, 46).trim(); 2306 String altLoc2 = line.substring(46, 47).trim(); 2307 String resName2 = line.substring(47, 50).trim(); 2308 String chainID2 = line.substring(51, 52).trim(); 2309 String resSeq2 = line.substring(52, 56).trim(); 2310 String iCode2 = null; // Might get trimmed if blank. 2311 if (len > 56) iCode2 = line.substring(56, 57).trim(); 2312 2313 String sym1 = null; 2314 if (len > 64) sym1 = line.substring(59, 65).trim(); 2315 String sym2 = null; 2316 if (len > 71) sym2 = line.substring(66, 72).trim(); 2317 2318 linkRecords.add(new LinkRecord( 2319 name1, altLoc1, resName1, chainID1, resSeq1, iCode1, 2320 name2, altLoc2, resName2, chainID2, resSeq2, iCode2, 2321 sym1, sym2)); 2322 } 2323 2324 /** 2325 * Handler for the SITE records. <br> 2326 * 2327 * <pre> 2328 * 2329 * COLUMNS DATA TYPE FIELD DEFINITION 2330 * --------------------------------------------------------------------------------- 2331 * 1 - 6 Record name "SITE " 2332 * 8 - 10 Integer seqNum Sequence number. 2333 * 12 - 14 LString(3) siteID Site name. 2334 * 16 - 17 Integer numRes Number of residues that compose the siteResidues. 2335 * 19 - 21 Residue name resName1 Residue name for first residue that 2336 * creates the siteResidues. 2337 * 23 Character chainID1 Chain identifier for first residue of siteResidues. 2338 * 24 - 27 Integer seq1 Residue sequence number for first residue 2339 * of the siteResidues. 2340 * 28 AChar iCode1 Insertion code for first residue of the siteResidues. 
2341 * 2342 * example: 2343 * 1 2 3 4 5 6 7 8 2344 * 12345678901234567890123456789012345678901234567890123456789012345678901234567890 2345 * SITE 1 AC1 3 HIS A 94 HIS A 96 HIS A 119 2346 * SITE 1 AC2 5 ASN A 62 GLY A 63 HIS A 64 HOH A 328 2347 * SITE 2 AC2 5 HOH A 634 2348 * SITE 1 AC3 5 GLN A 136 GLN A 137 PRO A 138 GLU A 205 2349 * SITE 2 AC3 5 CYS A 206 2350 * SITE 1 AC4 11 HIS A 64 HIS A 94 HIS A 96 HIS A 119 2351 * SITE 2 AC4 11 LEU A 198 THR A 199 THR A 200 TRP A 209 2352 * SITE 3 AC4 11 HOH A 572 HOH A 582 HOH A 635 2353 * </pre> 2354 * @param line the SITE line record being currently read 2355 * @author Amr AL-Hossary 2356 * @author Jules Jacobsen 2357 */ 2358 private void pdb_SITE_Handler(String line){ 2359 2360 if (params.isHeaderOnly()) return; 2361 2362 // make a map of: SiteId to List<ResidueNumber> 2363 2364 logger.debug("Site Line:"+line); 2365 2366 2367 String siteID = line.substring(11, 14); 2368 //fetch the siteResidues from the map 2369 List<ResidueNumber> siteResidues = siteToResidueMap.get(siteID); 2370 2371 //if the siteResidues doesn't yet exist, make a new one. 2372 if (siteResidues == null || ! 
siteToResidueMap.containsKey(siteID.trim())){ 2373 siteResidues = new ArrayList<ResidueNumber>(); 2374 siteToResidueMap.put(siteID.trim(), siteResidues); 2375 2376 logger.debug(String.format("New Site made: %s %s", siteID, siteResidues)); 2377 logger.debug("Now made " + siteMap.size() + " sites"); 2378 2379 } 2380 2381 logger.debug(String.format("SiteId: %s", siteID)); 2382 2383 2384 //line = 'SITE 1 AC1 6 ARG H 221A LYS H 224 HOH H 403 HOH H 460' 2385 //line.substring(18) = 'ARG H 221A LYS H 224 HOH H 403 HOH H 460' 2386 line = line.substring(18); 2387 String groupString = null; 2388 //groupString = 'ARG H 221A' 2389 //keep iterating through chunks of 10 characters - these are the groups in the siteResidues 2390 while (!(groupString = line.substring(0, 10)).equals(" ")) { 2391 //groupstring: 'ARG H 221A' 2392 2393 logger.debug("groupString: '" + groupString + "'"); 2394 2395 //set the residue name 2396 //residueName = 'ARG' 2397 String residueName = groupString.substring(0, 3); 2398 Character aminoCode1 = StructureTools.get1LetterCode(residueName); 2399 if (aminoCode1 != null) { 2400 if (aminoCode1.equals(StructureTools.UNKNOWN_GROUP_LABEL)) { 2401 aminoCode1 = null; 2402 } 2403 } 2404 2405 //this is already in the right format, so no need to fiddle with it... 
2406 //pdbCode = 'H 221A' 2407 // String pdbCode = groupString.substring(4, 10).trim(); 2408 String chainId = groupString.substring(4, 5); 2409 Integer resNum = Integer.valueOf(groupString.substring(5, 9).trim()); 2410 Character insCode = groupString.substring(9, 10).charAt(0); 2411 //set insCode to null as a measure to prevent storing thousands of empty Strings 2412 //- the empty value is returned using Group.getInsCode() 2413 // if (insCode.equals(" ")) { 2414 // insCode = null; 2415 // } 2416 2417 logger.debug(String.format("Site: %s: 'resName:%s resNum:%s insCode:%s'", siteID, residueName, resNum, insCode)); 2418 2419 //make a new resNum with the data - this will be linked up with a site later 2420 ResidueNumber residueNumber = new ResidueNumber(); 2421 2422 2423 logger.debug("pdbCode: '" + resNum + insCode + "'"); 2424 2425 residueNumber.setChainName(chainId); 2426 residueNumber.setSeqNum(resNum); 2427 residueNumber.setInsCode(insCode); 2428 //add the resNum to the groups 2429 siteResidues.add(residueNumber); 2430 2431 logger.debug("Adding residueNumber " + residueNumber + " to site " + siteID); 2432 2433 line = line.substring(11); 2434 } 2435 2436 logger.debug("Current SiteMap (contains "+ siteToResidueMap.keySet().size() + " sites):"); 2437 for (String key : siteToResidueMap.keySet()) { 2438 logger.debug(key + " : " + siteToResidueMap.get(key)); 2439 } 2440 2441 } 2442 2443 //Site variable related to parsing the REMARK 800 records. 
2444 Site site; 2445 private void pdb_REMARK_800_Handler(String line){ 2446 2447 if (params.isHeaderOnly()) return; 2448 2449 // 'REMARK 800 SITE_IDENTIFIER: CAT ' 2450 line = line.substring(11); 2451 String[] fields = line.split(": "); 2452 2453 if (fields.length == 2) { 2454 if (fields[0].equals("SITE_IDENTIFIER")) { 2455 // remark800Counter++; 2456 String siteID = fields[1].trim(); 2457 2458 logger.debug("siteID: '" + siteID +"'"); 2459 2460 //fetch the siteResidues from the map 2461 site = siteMap.get(siteID); 2462 2463 //if the siteResidues doesn't yet exist, make a new one. 2464 if (site == null || !siteID.equals(site.getSiteID())) { 2465 site = new Site(siteID, new ArrayList<Group>()); 2466 siteMap.put(site.getSiteID(), site); 2467 2468 logger.debug("New Site made: " + site); 2469 logger.debug("Now made " + siteMap.size() + " sites"); 2470 2471 } 2472 } 2473 if (fields[0].equals("EVIDENCE_CODE")) { 2474 // remark800Counter++; 2475 String evCode = fields[1].trim(); 2476 2477 logger.debug("evCode: '" + evCode +"'"); 2478 2479 //fetch the siteResidues from the map 2480 site.setEvCode(evCode); 2481 } 2482 if (fields[0].equals("SITE_DESCRIPTION")) { 2483 // remark800Counter++; 2484 String desc = fields[1].trim(); 2485 2486 logger.debug("desc: '" + desc +"'"); 2487 2488 //fetch the siteResidues from the map 2489 site.setDescription(desc); 2490 2491 logger.debug("Finished making REMARK 800 for site " + site.getSiteID()); 2492 logger.debug(site.remark800toPDB()); 2493 2494 } 2495 } 2496 } 2497 2498 private int intFromString(String intString){ 2499 int val = Integer.MIN_VALUE; 2500 try { 2501 val = Integer.parseInt(intString.trim()); 2502 } catch (NumberFormatException ex){ 2503 logger.info("Could not parse a number: " + ex.getMessage()); 2504 } 2505 return val; 2506 } 2507 2508 2509 2510 /** 2511 * Finds in the given list of chains the first one that has as name the given chainID. 2512 * If no such Chain can be found it returns null. 
2513 */ 2514 private static Chain isKnownChain(String chainID, List<Chain> chains){ 2515 2516 for (int i = 0; i< chains.size();i++){ 2517 Chain testchain = chains.get(i); 2518 if (chainID.equals(testchain.getName())) { 2519 return testchain; 2520 } 2521 } 2522 2523 return null; 2524 } 2525 2526 2527 2528 private BufferedReader getBufferedReader(InputStream inStream) 2529 throws IOException { 2530 2531 BufferedReader buf ; 2532 if (inStream == null) { 2533 throw new IOException ("input stream is null!"); 2534 } 2535 2536 buf = new BufferedReader (new InputStreamReader (inStream)); 2537 return buf ; 2538 2539 } 2540 2541 2542 2543 /** 2544 * Parse a PDB file and return a datastructure implementing 2545 * PDBStructure interface. 2546 * 2547 * @param inStream an InputStream object 2548 * @return a Structure object 2549 * @throws IOException 2550 */ 2551 public Structure parsePDBFile(InputStream inStream) 2552 throws IOException 2553 { 2554 2555 BufferedReader buf = getBufferedReader(inStream); 2556 2557 return parsePDBFile(buf); 2558 2559 } 2560 2561 /** 2562 * Parse a PDB file and return a datastructure implementing 2563 * PDBStructure interface. 2564 * 2565 * @param buf a BufferedReader object 2566 * @return the Structure object 2567 * @throws IOException ... 2568 */ 2569 public Structure parsePDBFile(BufferedReader buf) 2570 throws IOException 2571 { 2572 // set the correct max values for parsing... 
2573 loadMaxAtoms = params.getMaxAtoms(); 2574 atomCAThreshold = params.getAtomCaThreshold(); 2575 2576 2577 // (re)set structure 2578 2579 allModels = new ArrayList<>(); 2580 structure = new StructureImpl() ; 2581 currentModel = null; 2582 currentChain = null; 2583 currentGroup = null; 2584 // we initialise to true since at the beginning of the file we are always starting a new molecule 2585 startOfMolecule = true; 2586 startOfModel = true; 2587 2588 seqResChains = new ArrayList<Chain>(); 2589 siteMap = new LinkedHashMap<String, Site>(); 2590 pdbHeader = new PDBHeader(); 2591 connects = new ArrayList<Map<String,Integer>>(); 2592 previousContinuationField = ""; 2593 continuationField = ""; 2594 continuationString = ""; 2595 current_compound = null; 2596 sourceLines.clear(); 2597 compndLines.clear(); 2598 isLastCompndLine = false; 2599 isLastSourceLine = false; 2600 prevMolId = -1; 2601 entities.clear(); 2602 helixList.clear(); 2603 strandList.clear(); 2604 turnList.clear(); 2605 lengthCheck = -1; 2606 atomCount = 0; 2607 atomOverflow = false; 2608 linkRecords = new ArrayList<LinkRecord>(); 2609 siteToResidueMap.clear(); 2610 2611 blankChainIdsPresent = false; 2612 2613 parseCAonly = params.isParseCAOnly(); 2614 2615 String line = null; 2616 2617 while ((line = buf.readLine()) != null) { 2618 2619 // ignore empty lines 2620 if ( line.equals("") || 2621 (line.equals(NEWLINE))){ 2622 continue; 2623 } 2624 2625 2626 // ignore short TER and END lines 2627 if ( line.startsWith("END")) { 2628 continue; 2629 } 2630 2631 if ( line.length() < 6 && !line.startsWith("TER")) { 2632 logger.info("Found line length below 6. 
Ignoring it, line: >" + line +"<" ); 2633 continue; 2634 } 2635 2636 String recordName = null; 2637 if (line.length()<6) 2638 recordName = line.trim(); 2639 else 2640 recordName = line.substring (0, 6).trim (); 2641 2642 try { 2643 if (recordName.equals("ATOM")) 2644 pdb_ATOM_Handler(line); 2645 else if (recordName.equals("SEQRES")) 2646 pdb_SEQRES_Handler(line); 2647 else if (recordName.equals("HETATM")) 2648 pdb_ATOM_Handler(line); 2649 else if (recordName.equals("MODEL")) 2650 pdb_MODEL_Handler(line); 2651 else if (recordName.equals("TER")) 2652 pdb_TER_Handler(); 2653 else if (recordName.equals("HEADER")) 2654 pdb_HEADER_Handler(line); 2655 else if (recordName.equals("AUTHOR")) 2656 pdb_AUTHOR_Handler(line); 2657 else if (recordName.equals("TITLE")) 2658 pdb_TITLE_Handler(line); 2659 else if (recordName.equals("SOURCE")) 2660 sourceLines.add(line); //pdb_SOURCE_Handler 2661 else if (recordName.equals("COMPND")) 2662 compndLines.add(line); //pdb_COMPND_Handler 2663 else if (recordName.equals("JRNL")) 2664 pdb_JRNL_Handler(line); 2665 else if (recordName.equals("EXPDTA")) 2666 pdb_EXPDTA_Handler(line); 2667 else if (recordName.equals("CRYST1")) 2668 pdb_CRYST1_Handler(line); 2669 else if (recordName.startsWith("MTRIX")) 2670 pdb_MTRIXn_Handler(line); 2671 else if (recordName.equals("REMARK")) 2672 pdb_REMARK_Handler(line); 2673 else if (recordName.equals("CONECT")) 2674 pdb_CONECT_Handler(line); 2675 else if (recordName.equals("REVDAT")) 2676 pdb_REVDAT_Handler(line); 2677 else if (recordName.equals("DBREF")) 2678 pdb_DBREF_Handler(line); 2679 else if (recordName.equals("SITE")) 2680 pdb_SITE_Handler(line); 2681 else if (recordName.equals("SSBOND")) 2682 pdb_SSBOND_Handler(line); 2683 else if (recordName.equals("LINK")) 2684 pdb_LINK_Handler(line); 2685 else if ( params.isParseSecStruc()) { 2686 if ( recordName.equals("HELIX") ) pdb_HELIX_Handler ( line ) ; 2687 else if (recordName.equals("SHEET")) pdb_SHEET_Handler(line ) ; 2688 else if 
(recordName.equals("TURN")) pdb_TURN_Handler( line ) ; 2689 } 2690 } catch (StringIndexOutOfBoundsException | NullPointerException ex) { 2691 logger.info("Unable to parse [" + line + "]"); 2692 } 2693 } 2694 2695 makeCompounds(compndLines, sourceLines); 2696 2697 triggerEndFileChecks(); 2698 2699 if (params.shouldCreateAtomBonds()) { 2700 formBonds(); 2701 } 2702 2703 if ( params.shouldCreateAtomCharges()) { 2704 addCharges(); 2705 } 2706 2707 if ( params.isParseSecStruc() && !params.isHeaderOnly()) 2708 setSecStruc(); 2709 2710 // Now correct the alternate location group 2711 StructureTools.cleanUpAltLocs(structure); 2712 2713 return structure; 2714 2715 } 2716 2717 2718 /** 2719 * Add the charges to the Structure 2720 */ 2721 private void addCharges() { 2722 ChargeAdder.addCharges(structure); 2723 } 2724 2725 /** 2726 * This is the new method for building the COMPND and SOURCE records. Now each method is self-contained. 2727 * @author Jules Jacobsen 2728 * @param compoundList 2729 * @param sourceList 2730 */ 2731 private void makeCompounds(List<String> compoundList, 2732 List<String> sourceList) { 2733 // System.out.println("[makeCompounds] making compounds from compoundLines"); 2734 2735 for (String line : compoundList) { 2736 if (compoundList.indexOf(line) + 1 == compoundList.size()) { 2737 // System.out.println("[makeCompounds] Final line in compoundLines."); 2738 isLastCompndLine = true; 2739 } 2740 pdb_COMPND_Handler(line); 2741 2742 } 2743 // System.out.println("[makeCompounds] adding sources to compounds from sourceLines"); 2744 // since we're starting again from the first compound, reset it here 2745 if ( entities.size() == 0){ 2746 current_compound = new EntityInfo(); 2747 } else { 2748 current_compound = entities.get(0); 2749 } 2750 for (String line : sourceList) { 2751 if (sourceList.indexOf(line) + 1 == sourceList.size()) { 2752 // System.out.println("[makeCompounds] Final line in sourceLines."); 2753 isLastSourceLine = true; 2754 } 2755 
pdb_SOURCE_Handler(line); 2756 } 2757 2758 } 2759 2760 /** 2761 * Handles creation of all bonds. Looks at LINK records, SSBOND (Disulfide 2762 * bonds), peptide bonds, and intra-residue bonds. 2763 * <p> 2764 * Note: the current implementation only looks at the first model of each 2765 * structure. This may need to be fixed in the future. 2766 */ 2767 private void formBonds() { 2768 2769 BondMaker maker = new BondMaker(structure, params); 2770 2771 // LINK records should be preserved, they are the way that 2772 // inter-residue bonds are created for ligands such as trisaccharides, unusual polymers. 2773 // The analogy in mmCIF is the _struct_conn record. 2774 for (LinkRecord linkRecord : linkRecords) { 2775 maker.formLinkRecordBond(linkRecord); 2776 } 2777 2778 maker.formDisulfideBonds(ssbonds); 2779 2780 maker.makeBonds(); 2781 } 2782 2783 2784 2785 private void triggerEndFileChecks(){ 2786 2787 // we need to add the last chain and model, checking for nulls (e.g. the file could be completely empty of ATOM lines) 2788 if (currentChain!=null && currentGroup!=null) { 2789 currentChain.addGroup(currentGroup); 2790 } 2791 if (currentModel!=null && currentChain!=null) { 2792 currentModel.add(currentChain); 2793 } 2794 if (currentModel!=null) { 2795 allModels.add(currentModel); 2796 } 2797 2798 if (blankChainIdsPresent) { 2799 // from biojava 5.0 there's limited support for old pdb files with blank chain ids 2800 logger.warn("Found some blank chain ids in PDB file. Please note that support for them has been discontinued and things might not work properly."); 2801 } 2802 2803 // reordering chains following the mmcif model and assigning entities 2804 assignChainsAndEntities(); 2805 structure.setEntityInfos(entities); 2806 2807 2808 2809 // header data 2810 2811 Date modDate = pdbHeader.getModDate(); 2812 if ( modDate.equals(new Date(0)) ) { 2813 // modification date = deposition date 2814 Date depositionDate = pdbHeader.getDepDate(); 2815 2816 if (! 
depositionDate.equals(modDate)){ 2817 // depDate is 0000-00-00 2818 pdbHeader.setDepDate(depositionDate); 2819 } 2820 2821 } 2822 2823 structure.setPDBHeader(pdbHeader); 2824 structure.setCrystallographicInfo(crystallographicInfo); 2825 2826 //set the JournalArticle, if there is one 2827 if (!journalLines.isEmpty()) { 2828 buildjournalArticle(); 2829 pdbHeader.setJournalArticle(journalArticle); 2830 } 2831 2832 structure.setDBRefs(dbrefs); 2833 2834 // Only align if requested (default) and not when headerOnly mode with no Atoms. 2835 // Otherwise, we store the empty SeqRes Groups unchanged in the right chains. 2836 if ( params.isAlignSeqRes() && !params.isHeaderOnly() && !seqResChains.isEmpty()){ 2837 logger.debug("Parsing mode align_seqres, will parse SEQRES and align to ATOM sequence"); 2838 SeqRes2AtomAligner aligner = new SeqRes2AtomAligner(); 2839 aligner.align(structure,seqResChains); 2840 2841 } else { 2842 logger.debug("Parsing mode unalign_seqres, will parse SEQRES but not align it to ATOM sequence"); 2843 SeqRes2AtomAligner.storeUnAlignedSeqRes(structure, seqResChains, params.isHeaderOnly()); 2844 } 2845 2846 2847 2848 //associate the temporary Groups in the siteMap to the ones 2849 if (!params.isHeaderOnly()) { 2850 // Only can link SITES if Atom Groups were parsed. 2851 linkSitesToGroups(); // will work now that setSites is called 2852 } 2853 2854 if ( bioAssemblyParser != null){ 2855 bioAssemblyParser.setMacromolecularSizes(); 2856 pdbHeader.setBioAssemblies(bioAssemblyParser.getTransformationMap()); 2857 } 2858 2859 if (ncsOperators !=null && ncsOperators.size()>0) { 2860 crystallographicInfo.setNcsOperators( 2861 ncsOperators.toArray(new Matrix4d[ncsOperators.size()])); 2862 } 2863 2864 2865 // rfree end file check 2866 // Rfree annotation is not very consistent in PDB format, it varies depending on the software 2867 // Here we follow this strategy: 2868 // a) take the '(NO CUTOFF)' value if the only one available (shelx software, e.g. 
1x7q) 2869 // b) don't take it if also a line without '(NO CUTOFF)' is present (CNX software, e.g. 3lak) 2870 2871 if (rfreeNoCutoffLine>0 && rfreeStandardLine<0) { 2872 pdbHeader.setRfree(rfreeNoCutoffLine); 2873 } else if (rfreeNoCutoffLine>0 && rfreeStandardLine>0) { 2874 pdbHeader.setRfree(rfreeStandardLine); 2875 } else if (rfreeNoCutoffLine<0 && rfreeStandardLine>0) { 2876 pdbHeader.setRfree(rfreeStandardLine); 2877 } // otherwise it remains default value: PDBHeader.DEFAULT_RFREE 2878 2879 2880 2881 } 2882 2883 private void setSecStruc(){ 2884 2885 setSecElement(helixList, SecStrucInfo.PDB_AUTHOR_ASSIGNMENT, 2886 SecStrucType.helix4); 2887 setSecElement(strandList, SecStrucInfo.PDB_AUTHOR_ASSIGNMENT, 2888 SecStrucType.extended); 2889 setSecElement(turnList, SecStrucInfo.PDB_AUTHOR_ASSIGNMENT, 2890 SecStrucType.turn); 2891 2892 //Now insert random coil to the Groups that did not have SS information 2893 GroupIterator gi = new GroupIterator(structure); 2894 while (gi.hasNext()){ 2895 Group g = gi.next(); 2896 if (g.hasAminoAtoms()){ 2897 if (g.getProperty(Group.SEC_STRUC) == null){ 2898 SecStrucInfo ss = new SecStrucInfo(g, 2899 SecStrucInfo.PDB_AUTHOR_ASSIGNMENT, 2900 SecStrucType.coil); 2901 g.setProperty(Group.SEC_STRUC, ss); 2902 } 2903 } 2904 } 2905 2906 } 2907 2908 private void setSecElement(List<Map<String,String>> secList, String assignment, SecStrucType type){ 2909 2910 2911 Iterator<Map<String,String>> iter = secList.iterator(); 2912 nextElement: 2913 while (iter.hasNext()){ 2914 Map<String,String> m = iter.next(); 2915 2916 // assign all residues in this range to this secondary structure type 2917 // String initResName = (String)m.get("initResName"); 2918 String initChainId = m.get("initChainId"); 2919 String initSeqNum = m.get("initSeqNum" ); 2920 String initICode = m.get("initICode" ); 2921 // String endResName = (String)m.get("endResName" ); 2922 String endChainId = m.get("endChainId" ); 2923 String endSeqNum = m.get("endSeqNum"); 2924 String 
endICode = m.get("endICode"); 2925 2926 if (initICode.equals(" ")) 2927 initICode = ""; 2928 if (endICode.equals(" ")) 2929 endICode = ""; 2930 2931 GroupIterator gi = new GroupIterator(structure); 2932 boolean inRange = false; 2933 while (gi.hasNext()){ 2934 Group g = gi.next(); 2935 Chain c = g.getChain(); 2936 2937 if (c.getName().equals(initChainId)){ 2938 2939 String pdbCode = initSeqNum + initICode; 2940 if ( g.getResidueNumber().toString().equals(pdbCode) ) { 2941 inRange = true; 2942 } 2943 } 2944 if ( inRange){ 2945 if (g.hasAminoAtoms()) { 2946 SecStrucInfo ss = new SecStrucInfo(g, assignment, type); 2947 g.setProperty(Group.SEC_STRUC, ss); 2948 } 2949 2950 } 2951 if ( c.getName().equals(endChainId)){ 2952 String pdbCode = endSeqNum + endICode; 2953 if (pdbCode.equals(g.getResidueNumber().toString())){ 2954 inRange = false; 2955 continue nextElement; 2956 } 2957 } 2958 } 2959 } 2960 } 2961 2962 /** 2963 * Gets all chains with given chainName from given models list 2964 * @param chainName 2965 * @param polyModels 2966 * @return 2967 */ 2968 private static List<List<Chain>> findChains(String chainName, List<List<Chain>> polyModels) { 2969 List<List<Chain>> models = new ArrayList<>(); 2970 2971 for (List<Chain> chains:polyModels) { 2972 List<Chain> matchingChains = new ArrayList<>(); 2973 models.add(matchingChains); 2974 for (Chain c:chains) { 2975 if (c.getName().equals(chainName)) { 2976 matchingChains.add(c); 2977 } 2978 } 2979 } 2980 return models; 2981 } 2982 2983 /** 2984 * Split the given chain (containing non-polymer groups and water groups only) 2985 * into individual chains per non-polymer group and individual chains per contiguous sets of water groups. 
2986 * @param chain 2987 * @return a list of lists of size 2: first list is the split non-poly chains, second list is the split water chains 2988 */ 2989 private static List<List<Chain>> splitNonPolyChain(Chain chain) { 2990 List<Chain> splitNonPolys = new ArrayList<>(); 2991 List<Chain> waterChains = new ArrayList<>(); 2992 2993 Chain split = null; 2994 boolean previousGroupIsWater = false; 2995 2996 for (Group g:chain.getAtomGroups()){ 2997 2998 if (!previousGroupIsWater) { 2999 // add last one if there's one 3000 if (split!=null) { 3001 splitNonPolys.add(split); 3002 } 3003 split = new ChainImpl(); 3004 split.setName(chain.getName()); 3005 } else if (!g.isWater()) { 3006 // previous group is water and this group is not water: we change from a water chain to a non-poly 3007 // we'll need to add now the water chain to the list of water chains 3008 waterChains.add(split); 3009 split = new ChainImpl(); 3010 split.setName(chain.getName()); 3011 } 3012 3013 if (g.isWater()) { 3014 previousGroupIsWater = true; 3015 } else { 3016 previousGroupIsWater = false; 3017 3018 } 3019 3020 // this should include alt locs (referenced from the main group) 3021 split.addGroup(g); 3022 3023 } 3024 3025 // adding the last split chain: either to water or non-poly depending on what was the last seen group 3026 if (split!=null) { 3027 if (previousGroupIsWater) 3028 waterChains.add(split); 3029 else 3030 splitNonPolys.add(split); 3031 } 3032 3033 3034 List<List<Chain>> all = new ArrayList<>(2); 3035 all.add(splitNonPolys); 3036 all.add(waterChains); 3037 3038 return all; 3039 } 3040 3041 /** 3042 * Assign asym ids following the rules used by the PDB to assign asym ids in mmCIF files 3043 * @param polys 3044 * @param nonPolys 3045 * @param waters 3046 */ 3047 private void assignAsymIds(List<List<Chain>> polys, List<List<Chain>> nonPolys, List<List<Chain>> waters) { 3048 3049 for (int i=0; i<polys.size(); i++) { 3050 String asymId = "A"; 3051 3052 for (Chain poly:polys.get(i)) { 3053 
poly.setId(asymId); 3054 asymId = getNextAsymId(asymId); 3055 } 3056 for (Chain nonPoly:nonPolys.get(i)) { 3057 nonPoly.setId(asymId); 3058 asymId = getNextAsymId(asymId); 3059 } 3060 for (Chain water:waters.get(i)) { 3061 water.setId(asymId); 3062 asymId = getNextAsymId(asymId); 3063 } 3064 } 3065 } 3066 3067 /** 3068 * Gets the next asym id given an asymId, according to the convention followed by 3069 * mmCIF files produced by the PDB 3070 * i.e.: A,B,...,Z,AA,BA,CA,...,ZA,AB,BB,CB,...,ZB,.......,ZZ,AAA,BAA,CAA,... 3071 * @param asymId 3072 * @return 3073 */ 3074 private String getNextAsymId(String asymId) { 3075 if (asymId.length()==1) { 3076 if (!asymId.equals("Z")) { 3077 return Character.toString(getNextChar(asymId.charAt(0))); 3078 } else { 3079 return "AA"; 3080 } 3081 } else if (asymId.length()==2) { 3082 if (asymId.equals("ZZ")) { 3083 return "AAA"; 3084 } 3085 char[] c = new char[2]; 3086 asymId.getChars(0, 2, c, 0); 3087 c[0] = getNextChar(c[0]); 3088 if (c[0]=='A') { 3089 c[1] = getNextChar(c[1]); 3090 } 3091 return new String(c); 3092 } else if (asymId.length()==3) { 3093 char[] c = new char[3]; 3094 asymId.getChars(0, 3, c, 0); 3095 c[0] = getNextChar(c[0]); 3096 if (c[0]=='A') { 3097 c[1] = getNextChar(c[1]); 3098 if (c[1]=='A') { 3099 c[2] = getNextChar(c[2]); 3100 } 3101 } 3102 return new String(c); 3103 } 3104 return null; 3105 } 3106 3107 private char getNextChar(char c) { 3108 if (c!='Z') { 3109 return ((char)(c+1)); 3110 } else { 3111 return 'A'; 3112 } 3113 } 3114 3115 /** 3116 * Here we assign chains following the mmCIF data model: 3117 * one chain per polymer, one chain per non-polymer group and 3118 * several water chains. 
3119 * <p> 3120 * Subsequently we assign entities for them: either from those read from 3121 * COMPOUND records or from those found heuristically through {@link EntityFinder} 3122 * 3123 */ 3124 private void assignChainsAndEntities(){ 3125 3126 List<List<Chain>> polyModels = new ArrayList<>(); 3127 List<List<Chain>> nonPolyModels = new ArrayList<>(); 3128 List<List<Chain>> waterModels = new ArrayList<>(); 3129 3130 for (List<Chain> model:allModels) { 3131 3132 List<Chain> polyChains = new ArrayList<>(); 3133 List<Chain> nonPolyChains = new ArrayList<>(); 3134 List<Chain> waterChains = new ArrayList<>(); 3135 3136 polyModels.add(polyChains); 3137 nonPolyModels.add(nonPolyChains); 3138 waterModels.add(waterChains); 3139 3140 for (Chain c:model) { 3141 3142 // we only have entities for polymeric chains, all others are ignored for assigning entities 3143 if (c.isWaterOnly()) { 3144 waterChains.add(c); 3145 3146 } else if (c.isPureNonPolymer()) { 3147 nonPolyChains.add(c); 3148 3149 } else { 3150 polyChains.add(c); 3151 } 3152 } 3153 } 3154 3155 List<List<Chain>> splitNonPolyModels = new ArrayList<>(); 3156 for (int i=0; i<nonPolyModels.size(); i++) { 3157 List<Chain> nonPolyModel = nonPolyModels.get(i); 3158 List<Chain> waterModel = waterModels.get(i); 3159 3160 List<Chain> splitNonPolys = new ArrayList<>(); 3161 splitNonPolyModels.add(splitNonPolys); 3162 3163 for (Chain nonPoly:nonPolyModel) { 3164 List<List<Chain>> splits = splitNonPolyChain(nonPoly); 3165 splitNonPolys.addAll(splits.get(0)); 3166 waterModel.addAll(splits.get(1)); 3167 } 3168 } 3169 3170 3171 // now we have all chains as in mmcif, let's assign ids following the mmcif rules 3172 assignAsymIds(polyModels, splitNonPolyModels, waterModels); 3173 3174 3175 if (!entities.isEmpty()) { 3176 // if the file contained COMPOUND records then we can assign entities to the poly chains 3177 for (EntityInfo comp : entities){ 3178 List<String> chainIds = compoundMolIds2chainIds.get(comp.getMolId()); 3179 if ( 
chainIds == null) 3180 continue; 3181 for ( String chainId : chainIds) { 3182 3183 List<List<Chain>> models = findChains(chainId, polyModels); 3184 3185 for (List<Chain> matchingChains:models) { 3186 for (Chain chain:matchingChains) { 3187 comp.addChain(chain); 3188 chain.setEntityInfo(comp); 3189 } 3190 3191 if (matchingChains.isEmpty()) { 3192 // usually if this happens something is wrong with the PDB header 3193 // e.g. 2brd - there is no Chain A, although it is specified in the header 3194 // Some bona-fide cases exist, e.g. 2ja5, chain N is described in SEQRES 3195 // but the authors didn't observe in the density so it's completely missing 3196 // from the ATOM lines 3197 logger.warn("Could not find polymeric chain {} to link to entity {}. The chain will be missing in the entity.", chainId, comp.getMolId()); 3198 } 3199 } 3200 } 3201 } 3202 3203 } else { 3204 3205 logger.info("Entity information (COMPOUND record) not found in file. Will assign entities heuristically"); 3206 // if no entity information was present in file we then go and find the entities heuristically with EntityFinder 3207 entities = EntityFinder.findPolyEntities(polyModels); 3208 3209 } 3210 3211 // now we assign entities to the nonpoly and water chains 3212 EntityFinder.createPurelyNonPolyEntities(splitNonPolyModels, waterModels, entities); 3213 3214 3215 // in some rare cases purely non-polymer or purely water chain are present in pdb files 3216 // see https://github.com/biojava/biojava/pull/394 3217 // these case should be covered by the above 3218 3219 3220 // now that we have entities in chains we add the chains to the structure 3221 3222 for (int i=0;i<allModels.size();i++) { 3223 List<Chain> model = new ArrayList<>(); 3224 model.addAll(polyModels.get(i)); 3225 model.addAll(splitNonPolyModels.get(i)); 3226 model.addAll(waterModels.get(i)); 3227 structure.addModel(model); 3228 } 3229 3230 3231 } 3232 3233 /** 3234 * Links the Sites in the siteMap to the Groups in the Structure via the 
3235 * siteToResidueMap ResidueNumber. 3236 * @author Jules Jacobsen 3237 * @return 3238 */ 3239 private void linkSitesToGroups() { 3240 3241 //System.out.println("LINK SITES TO GROUPS:" + siteToResidueMap.keySet().size()); 3242 3243 //link the map of siteIds : <ResidueNumber> with the sites by using ResidueNumber to get the correct group back. 3244 //the return list 3245 3246 if ( siteMap == null || siteToResidueMap == null){ 3247 logger.info("Sites can not be linked to residues!"); 3248 3249 return; 3250 } 3251 3252 List<Site> sites = null; 3253 //check that there are chains with which to associate the groups 3254 if (structure.getChains().isEmpty()) { 3255 sites = new ArrayList<Site>(siteMap.values()); 3256 logger.info("No chains to link Site Groups with - Sites will not be present in the Structure"); 3257 return; 3258 } 3259 3260 //check that the keys in the siteMap and SiteToResidueMap are equal 3261 if (! siteMap.keySet().equals(siteToResidueMap.keySet())) { 3262 logger.info("Not all sites have been properly described in the PDB " + pdbId + " header - some Sites will not be present in the Structure"); 3263 logger.debug(siteMap.keySet() + " | " + siteToResidueMap.keySet()); 3264 //return; 3265 } 3266 3267 //so we have chains - associate the siteResidues-related groups with the ones 3268 //already in in the chains 3269 for (String key : siteMap.keySet()) { 3270 Site currentSite = siteMap.get(key); 3271 List<ResidueNumber> linkedGroups = siteToResidueMap.get(key); 3272 if ( linkedGroups == null) 3273 continue; 3274 for (ResidueNumber residueNumber : linkedGroups) { 3275 3276 String pdbCode = residueNumber.toString(); 3277 String chain = residueNumber.getChainName(); 3278 // System.out.println("chain: '" + chain + "'"); 3279 // String resNum = resNum.getSeqNum().toString(); 3280 // System.out.println("resNum: '" + resNum + "'"); 3281 3282 Group linkedGroup = null; 3283 try { 3284 //TODO: implement findGroup(ResidueNumber resNum) 3285 linkedGroup = 
structure.findGroup(chain, pdbCode); 3286 } catch (StructureException ex) { 3287 logger.info("Can't find group " + pdbCode + " in chain " + chain + " in order to link up SITE records (PDB ID " + pdbId +")"); 3288 continue; 3289 } 3290 3291 // System.out.println("Adding group: " + linkedGroup.getSeqNum() + " to site " + site.getSiteID()); 3292 currentSite.getGroups().add(linkedGroup); 3293 } 3294 } 3295 3296 //System.out.println("SITEMAP: " + siteMap); 3297 3298 sites = new ArrayList<Site>(siteMap.values()); 3299 structure.setSites(sites); 3300 //System.out.println("STRUCTURE SITES: " + structure.getSites().size()); 3301 // for (Site site : structure.getSites()) { 3302 // System.out.println(site); 3303 // } 3304 // System.out.println("Linked Site Groups with Chains"); 3305 3306 } 3307 3308 private void buildjournalArticle() { 3309 3310 logger.debug("building new JournalArticle"); 3311 // for (String line : journalLines) { 3312 // System.out.println(line); 3313 // } 3314 3315 this.journalArticle = new JournalArticle(); 3316 // JRNL AUTH M.HAMMEL,G.SFYROERA,D.RICKLIN,P.MAGOTTI, 3317 // JRNL AUTH 2 J.D.LAMBRIS,B.V.GEISBRECHT 3318 // JRNL TITL A STRUCTURAL BASIS FOR COMPLEMENT INHIBITION BY 3319 // JRNL TITL 2 STAPHYLOCOCCUS AUREUS. 3320 // JRNL REF NAT.IMMUNOL. V. 
8 430 2007 3321 // JRNL REFN ISSN 1529-2908 3322 // JRNL PMID 17351618 3323 // JRNL DOI 10.1038/NI1450 3324 StringBuffer auth = new StringBuffer(); 3325 StringBuffer titl = new StringBuffer(); 3326 StringBuffer edit = new StringBuffer(); 3327 StringBuffer ref = new StringBuffer(); 3328 StringBuffer publ = new StringBuffer(); 3329 StringBuffer refn = new StringBuffer(); 3330 StringBuffer pmid = new StringBuffer(); 3331 StringBuffer doi = new StringBuffer(); 3332 3333 for (String line : journalLines) { 3334 if ( line.length() < 19 ) { 3335 logger.info("can not process Journal line: " + line); 3336 continue; 3337 } 3338 // System.out.println("'" + line + "'"); 3339 String subField = line.substring(12, 16); 3340 // System.out.println("'" + subField + "'"); 3341 if (subField.equals("AUTH")) { 3342 auth.append(line.substring(19, line.length()).trim()); 3343 3344 logger.debug("AUTH '" + auth.toString() + "'"); 3345 3346 } 3347 if (subField.equals("TITL")) { 3348 //add a space to the end of a line so that when wrapped the 3349 //words on the join won't be concatenated 3350 titl.append(line.substring(19, line.length()).trim()).append(" "); 3351 3352 logger.debug("TITL '" + titl.toString() + "'"); 3353 3354 } 3355 if (subField.equals("EDIT")) { 3356 edit.append(line.substring(19, line.length()).trim()); 3357 3358 logger.debug("EDIT '" + edit.toString() + "'"); 3359 3360 } 3361 // JRNL REF NAT.IMMUNOL. V. 
8 430 2007 3362 if (subField.equals("REF ")) { 3363 ref.append(line.substring(19, line.length()).trim()).append(" "); 3364 3365 logger.debug("REF '" + ref.toString() + "'"); 3366 3367 } 3368 if (subField.equals("PUBL")) { 3369 publ.append(line.substring(19, line.length()).trim()).append(" "); 3370 3371 logger.debug("PUBL '" + publ.toString() + "'"); 3372 3373 } 3374 // JRNL REFN ISSN 1529-2908 3375 if (subField.equals("REFN")) { 3376 if ( line.length() < 35 ) { 3377 logger.info("can not process Journal REFN line: " + line); 3378 continue; 3379 } 3380 refn.append(line.substring(35, line.length()).trim()); 3381 3382 logger.debug("REFN '" + refn.toString() + "'"); 3383 3384 } 3385 // JRNL PMID 17351618 3386 if (subField.equals("PMID")) { 3387 pmid.append(line.substring(19, line.length()).trim()); 3388 3389 logger.debug("PMID '" + pmid.toString() + "'"); 3390 3391 } 3392 // JRNL DOI 10.1038/NI1450 3393 if (subField.equals("DOI ")) { 3394 doi.append(line.substring(19, line.length()).trim()); 3395 3396 logger.debug("DOI '" + doi.toString() + "'"); 3397 3398 } 3399 } 3400 3401 //now set the parts of the JournalArticle 3402 journalArticle.setAuthorList(authorBuilder(auth.toString())); 3403 journalArticle.setEditorList(authorBuilder(edit.toString())); 3404 journalArticle.setRef(ref.toString()); 3405 JournalParser journalParser = new JournalParser(ref.toString()); 3406 journalArticle.setJournalName(journalParser.getJournalName()); 3407 if (!journalArticle.getJournalName().equals("TO BE PUBLISHED")) { 3408 journalArticle.setIsPublished(true); 3409 } 3410 journalArticle.setVolume(journalParser.getVolume()); 3411 journalArticle.setStartPage(journalParser.getStartPage()); 3412 journalArticle.setPublicationDate(journalParser.getPublicationDate()); 3413 journalArticle.setPublisher(publ.toString().trim()); 3414 journalArticle.setTitle(titl.toString().trim()); 3415 journalArticle.setRefn(refn.toString().trim()); 3416 journalArticle.setPmid(pmid.toString().trim()); 3417 
journalArticle.setDoi(doi.toString().trim()); 3418 3419 3420 logger.debug("Made JournalArticle:"); 3421 logger.debug(journalArticle.toString()); 3422 3423 } 3424 3425 //inner class to deal with all the journal info 3426 private class JournalParser { 3427 3428 private String journalName; 3429 private String volume; 3430 private String startPage; 3431 private int publicationDate; 3432 3433 3434 public JournalParser(String ref) { 3435 3436 logger.debug("JournalParser init '" + ref + "'"); 3437 3438 3439 if (ref.equals("TO BE PUBLISHED ")) { 3440 journalName = ref.trim(); 3441 3442 logger.debug(String.format("JournalParser found journalString '%s'", journalName)); 3443 3444 return; 3445 } 3446 3447 if (ref.length() < 48) { 3448 logger.info("REF line too short - must be at least 48 characters to be valid for parsing."); 3449 journalName = ""; 3450 volume = ""; 3451 startPage = ""; 3452 publicationDate = 0; 3453 return; 3454 } 3455 //can be multi line: 3456 //REF PHILOS.TRANS.R.SOC.LONDON, V. 293 53 1981 3457 //REF 2 SER.B 3458 3459 //or 3460 3461 //REF GLYCOGEN PHOSPHORYLASE B: 1 1991 3462 //REF 2 DESCRIPTION OF THE PROTEIN 3463 //REF 3 STRUCTURE 3464 3465 //but usually single line 3466 //REF NUCLEIC ACIDS RES. 2009 3467 //REF MOL.CELL 2009 3468 //REF NAT.STRUCT.MOL.BIOL. V. 16 238 2009 3469 //REF ACTA CRYSTALLOGR.,SECT.F V. 65 199 2009 3470 //check if the date is present at the end of the line. 3471 // 09876543210987654321 3472 //'J.BIOL.CHEM. V. 280 23000 2005 ' 3473 //'J.AM.CHEM.SOC. V. 130 16011 2008 ' 3474 //'NAT.STRUCT.MOL.BIOL. V. 16 238 2009' 3475 String volumeInformation = ref.substring(30, 48); 3476 3477 logger.debug(String.format("Parsing volumeInformation: '%s'", volumeInformation)); 3478 3479 //volumeInformation: 'V. 
293 53 1981 ' 3480 // String dateString = ref.substring(ref.length() - 5 , ref.length() - 1).trim(); 3481 // String startPageString = ref.substring(ref.length() - 11 , ref.length() - 6).trim(); 3482 // String volumeString = ref.substring(ref.length() - 16 , ref.length() - 12).trim(); 3483 // String journalString = ref.substring(0 , ref.length() - 18).trim(); 3484 String dateString = volumeInformation.substring(volumeInformation.length() - 5 , volumeInformation.length() - 1).trim(); 3485 String startPageString = volumeInformation.substring(volumeInformation.length() - 11 , volumeInformation.length() - 6).trim(); 3486 String volumeString = volumeInformation.substring(volumeInformation.length() - 16 , volumeInformation.length() - 12).trim(); 3487 //for the journal string we need to remove the volume information which might be in the middle of the string (e.g. 1gpb, 3pfk) 3488 String journalString = ref.substring(0 , 29).trim() + " " + ref.substring(30, ref.length() - 1).replace(volumeInformation.trim(), "").trim(); 3489 journalString = journalString.trim(); 3490 // System.out.println("journalString: " + journalString); 3491 3492 logger.debug(String.format("JournalParser found volumeString '%s'", volumeString)); 3493 logger.debug(String.format("JournalParser found startPageString '%s'", startPageString)); 3494 logger.debug(String.format("JournalParser found dateString '%s'", dateString)); 3495 logger.debug(String.format("JournalParser found journalString '%s'", journalString)); 3496 3497 3498 if (!dateString.equals(" ")) { 3499 try { 3500 publicationDate = Integer.valueOf(dateString); 3501 } catch (NumberFormatException nfe) { 3502 logger.info(dateString + " is not a valid integer for a date in JRNL sub-section REF line 1"); 3503 } 3504 // if (DEBUG) { 3505 // System.out.println("JournalParser set date " + publicationDate); 3506 // } 3507 } 3508 3509 if (!startPageString.equals(" ")) { 3510 startPage = startPageString; 3511 // if (DEBUG) { 3512 // 
System.out.println("JournalParser set startPage " + startPage); 3513 // } 3514 } 3515 3516 if (!volumeString.equals(" ")) { 3517 volume = volumeString; 3518 // if (DEBUG) { 3519 // System.out.println("JournalParser set volume " + volume); 3520 // } 3521 } 3522 3523 if (!journalString.equals(" ")) { 3524 journalName = journalString; 3525 3526 logger.debug("JournalParser set journalName " + journalName); 3527 3528 } 3529 } 3530 3531 private String getJournalName() { 3532 return journalName; 3533 } 3534 3535 private int getPublicationDate() { 3536 return publicationDate; 3537 } 3538 3539 private String getStartPage() { 3540 return startPage; 3541 } 3542 3543 private String getVolume() { 3544 return volume; 3545 } 3546 } 3547 3548 private List<Author> authorBuilder(String authorString) { 3549 ArrayList<Author> authorList = new ArrayList<Author>(); 3550 3551 if (authorString.equals("")) { 3552 return authorList; 3553 } 3554 3555 String[] authors = authorString.split(","); 3556 // if (DEBUG) { 3557 // for (int i = 0; i < authors.length; i++) { 3558 // String string = authors[i]; 3559 // System.out.println("authorBuilder author: '" + string + "'"); 3560 // } 3561 // } 3562 // AUTH SEATTLE STRUCTURAL GENOMICS CENTER FOR INFECTIOUS 3563 // AUTH 2 DISEASE (SSGCID) 3564 // or 3565 // AUTH E.DOBROVETSKY,A.DONG,A.SEITOVA,B.DUNCAN,L.CROMBET, 3566 // AUTH 2 M.SUNDSTROM,C.H.ARROWSMITH,A.M.EDWARDS,C.BOUNTRA, 3567 // AUTH 3 A.BOCHKAREV,D.COSSAR, 3568 // AUTH 4 STRUCTURAL GENOMICS CONSORTIUM (SGC) 3569 // or 3570 // AUTH T.-C.MOU,S.R.SPRANG,N.MASADA,D.M.F.COOPER 3571 if (authors.length == 1) { 3572 //only one element means it's a consortium only 3573 Author author = new Author(); 3574 author.setSurname(authors[0]); 3575 3576 logger.debug("Set consortium author name " + author.getSurname()); 3577 3578 authorList.add(author); 3579 } else { 3580 for (int i = 0; i < authors.length; i++) { 3581 String authorFullName = authors[i]; 3582 3583 logger.debug("Building author " + 
authorFullName); 3584 3585 Author author = new Author(); 3586 String regex = "\\."; 3587 String[] authorNames = authorFullName.split(regex); 3588 // if (DEBUG) { 3589 // System.out.println("authorNames size " + authorNames.length); 3590 // for (int j = 0; j < authorNames.length; j++) { 3591 // String name = authorNames[j]; 3592 // System.out.println("split authName '" + name + "'"); 3593 // 3594 // } 3595 // } 3596 if (authorNames.length == 0) { 3597 author.setSurname(authorFullName); 3598 3599 logger.debug("Unable to split using '" + regex + "' Setting whole name " + author.getSurname()); 3600 3601 } 3602 //again there might be a consortium name so there may be no elements 3603 else if (authorNames.length == 1) { 3604 author.setSurname(authorNames[0]); 3605 3606 logger.debug("Set consortium author name in multiple author block " + author.getSurname 3607 ()); 3608 3609 } else { 3610 String initials = ""; 3611 for (int j = 0; j < authorNames.length - 1; j++) { 3612 String initial = authorNames[j]; 3613 // if (DEBUG) { 3614 // System.out.println("adding initial '" + initial + "'"); 3615 // } 3616 //build the initials back up again 3617 initials += initial + "."; 3618 } 3619 3620 logger.debug("built initials '" + initials + "'"); 3621 3622 author.setInitials(initials); 3623 //surname is always last 3624 int lastName = authorNames.length - 1; 3625 String surname = authorNames[lastName]; 3626 3627 logger.debug("built author surname " + surname); 3628 3629 author.setSurname(surname); 3630 3631 } 3632 authorList.add(author); 3633 } 3634 } 3635 return authorList; 3636 } 3637 3638 public void setFileParsingParameters(FileParsingParameters params) 3639 { 3640 this.params= params; 3641 3642 // set the correct max values for parsing... 3643 loadMaxAtoms = params.getMaxAtoms(); 3644 atomCAThreshold = params.getAtomCaThreshold(); 3645 3646 3647 } 3648 3649 public FileParsingParameters getFileParsingParameters(){ 3650 return params; 3651 } 3652 3653 3654}