Source code

001/*
002 *
003 * This code may be freely distributed and modified under the
004 * terms of the GNU Lesser General Public Licence.  This should
005 * be distributed with the code.  If you do not have a copy,
006 * see:
007 *
008 *      http://www.gnu.org/copyleft/lesser.html
009 *
010 * Copyright for this code is held jointly by the individual
011 * authors.  These should be listed in @author doc comments.
012 *
013 * For more information on the BioJava project and its aims,
014 * or to join the biojava-l mailing list, visit the home page
015 * at:
016 *
017 *      http://www.biojava.org/
018 *
019 * Created on 16.03.2004
020 *
021 */
022package org.biojava.nbio.structure.io;
023
024import static java.lang.Math.min;
025
026import java.io.BufferedReader;
027import java.io.IOException;
028import java.io.InputStream;
029import java.io.InputStreamReader;
030import java.text.DateFormat;
031import java.text.ParseException;
032import java.text.SimpleDateFormat;
033import java.util.ArrayList;
034import java.util.Arrays;
035import java.util.Collections;
036import java.util.Comparator;
037import java.util.Date;
038import java.util.HashMap;
039import java.util.Iterator;
040import java.util.LinkedHashMap;
041import java.util.List;
042import java.util.Locale;
043import java.util.Map;
044import java.util.StringTokenizer;
045import java.util.regex.Matcher;
046import java.util.regex.Pattern;
047
048import javax.vecmath.Matrix4d;
049
050import org.biojava.nbio.structure.AminoAcid;
051import org.biojava.nbio.structure.AminoAcidImpl;
052import org.biojava.nbio.structure.Atom;
053import org.biojava.nbio.structure.AtomImpl;
054import org.biojava.nbio.structure.Author;
055import org.biojava.nbio.structure.Chain;
056import org.biojava.nbio.structure.ChainImpl;
057import org.biojava.nbio.structure.Compound;
058import org.biojava.nbio.structure.DBRef;
059import org.biojava.nbio.structure.Element;
060import org.biojava.nbio.structure.Group;
061import org.biojava.nbio.structure.GroupIterator;
062import org.biojava.nbio.structure.GroupType;
063import org.biojava.nbio.structure.HetatomImpl;
064import org.biojava.nbio.structure.JournalArticle;
065import org.biojava.nbio.structure.NucleotideImpl;
066import org.biojava.nbio.structure.PDBCrystallographicInfo;
067import org.biojava.nbio.structure.PDBHeader;
068import org.biojava.nbio.structure.ResidueNumber;
069import org.biojava.nbio.structure.Site;
070import org.biojava.nbio.structure.Structure;
071import org.biojava.nbio.structure.StructureException;
072import org.biojava.nbio.structure.StructureImpl;
073import org.biojava.nbio.structure.StructureTools;
074import org.biojava.nbio.structure.io.mmcif.ChemCompGroupFactory;
075import org.biojava.nbio.structure.io.util.PDBTemporaryStorageUtils.LinkRecord;
076import org.biojava.nbio.structure.secstruc.SecStrucInfo;
077import org.biojava.nbio.structure.secstruc.SecStrucType;
078import org.biojava.nbio.structure.xtal.CrystalCell;
079import org.biojava.nbio.structure.xtal.SpaceGroup;
080import org.biojava.nbio.structure.xtal.SymoplibParser;
081import org.slf4j.Logger;
082import org.slf4j.LoggerFactory;
083
084
085
086/**
087 * This class implements the actual PDB file parsing. Do not access it directly, but
088 * via the PDBFileReader class.
089 *
090 * <h2>Parsing</h2>
091 *
092 * During the PDBfile parsing several Flags can be set. See the {@link #setFileParsingParameters(FileParsingParameters)} methods.
093 *
094 *
095 * <p>
096 * To prevent excessive memory usage for large PDB files, there is the ATOM_CA_THRESHOLD.
097 * If more Atoms than this threshold are being parsed in a PDB file, the parser will automatically
098 * switch to a C-alpha only representation.
099 * </p>
100 *
101 * <p>
102 * The result of the parsing of the PDB file is a new {@link Structure} object.
103 * </p>
104 *
105 *
106 * For more documentation on how to work with the Structure API please
107 * see <a href="http://biojava.org/wiki/BioJava:CookBook#Protein_Structure" target="_top">
108 * http://biojava.org/wiki/BioJava:CookBook#Protein_Structure</a>
109 *
110 *
111 *
112 *
113 * <h2>Example</h2>
114 * <p>
115 * Q: How can I get a Structure object from a PDB file?
116 * </p>
117 * <p>
118 * A:
119 * <pre>
120 * public {@link Structure} loadStructure(String pathToPDBFile){
121 *      // The PDBFileParser is wrapped by the PDBFileReader
122 *      {@link PDBFileReader} pdbreader = new {@link PDBFileReader}();
123 *
124 *      {@link Structure} structure = null;
125 *      try{
126 *              structure = pdbreader.getStructure(pathToPDBFile);
127 *              System.out.println(structure);
128 *      } catch (IOException e) {
129 *              e.printStackTrace();
130 *      }
131 *      return structure;
132 * }
133 * </pre>
134 *
135 *
136 * @author Andreas Prlic
137 * @author Jules Jacobsen
138 * @author Jose Duarte
139 * @since 1.4
140 */
141public class PDBFileParser  {
142
143
144
145        private static final Logger logger = LoggerFactory.getLogger(PDBFileParser.class);
146
147        // for printing
148        private static final String NEWLINE = System.getProperty("line.separator");
149
150
151        // required for parsing:
152        private String pdbId; //the actual id of the entry
153        private Structure     structure;
154        private List<Chain>   current_model; // contains the ATOM records for each model
155        private Chain         current_chain;
156        private Group         current_group;
157
158        private List<Chain>   seqResChains; // contains all the chains for the SEQRES records
159        //we're going to work on the assumption that the files are current -
160        //if the pdb_HEADER_Handler detects a legacy format, this will be changed to true.
161        //if true then lines will be truncated at 72 characters in certain cases
162        //(pdb_COMPOUND_handler for example)
163        private boolean isLegacyFormat = false;
164
165
166        // for re-creating the biological assembly
167
168        private PDBBioAssemblyParser bioAssemblyParser = null;
169
170        private PDBHeader pdbHeader;
171        private PDBCrystallographicInfo crystallographicInfo;
172        private JournalArticle journalArticle;
173        private List<Map<String, Integer>> connects ;
174        private List<Map<String,String>> helixList;
175        private List<Map<String,String>> strandList;
176        private List<Map<String,String>> turnList;
177
178        private int lengthCheck ;
179
180        private boolean isLastCompndLine = false;
181        private boolean isLastSourceLine = false;
182        private Compound current_compound;
183        private List<Compound> compounds = new ArrayList<Compound>();
184        private HashMap<Integer,List<String>> compoundMolIds2chainIds = new HashMap<Integer, List<String>>();
185        private List<String> compndLines = new ArrayList<String>();
186        private List<String> sourceLines = new ArrayList<String>();
187        private List<String> journalLines = new ArrayList<String>();
188        private List<DBRef> dbrefs;
189        private Map<String, Site> siteMap = new LinkedHashMap<String, Site>();
190        private Map<String, List<ResidueNumber>> siteToResidueMap = new LinkedHashMap<String, List<ResidueNumber>>();
191
192        private List<SSBondImpl> ssbonds = new ArrayList<>();
193
194        private Matrix4d currentNcsOp;
195        private List<Matrix4d> ncsOperators;
196
197        // for storing LINK until we have all the atoms parsed
198        private List<LinkRecord> linkRecords;
199
200        // for parsing COMPOUND and SOURCE Header lines
201        private int prevMolId;
202        private String previousContinuationField;
203        private String continuationField;
204        private String continuationString;
205
206        private DateFormat dateFormat;
207
208        // for rfree parsing
209        private float rfreeStandardLine = -1;
210        private float rfreeNoCutoffLine = -1;
211
212        private static  final List<String> compndFieldValues = new ArrayList<String>(
213                        Arrays.asList(
214                                        "MOL_ID:", "MOLECULE:", "CHAIN:", "SYNONYM:",
215                                        "EC:", "FRAGMENT:", "ENGINEERED:", "MUTATION:",
216                                        "BIOLOGICAL_UNIT:", "OTHER_DETAILS:"
217                                        ));
218
219
220        private static final List<String> ignoreCompndFieldValues = new ArrayList<String>(
221                        Arrays.asList(
222                                        "HETEROGEN:","ENGINEEREED:","FRAGMENT,",
223                                        "MUTANT:","SYNTHETIC:"
224                                        ));
225        // ENGINEEREED in pdb219d
226
227        private static final List<String> sourceFieldValues = new ArrayList<String>(
228                        Arrays.asList("ENGINEERED:", "MOL_ID:", "SYNTHETIC:", "FRAGMENT:",
229                                        "ORGANISM_SCIENTIFIC:", "ORGANISM_COMMON:",
230                                        "ORGANISM_TAXID:","STRAIN:",
231                                        "VARIANT:", "CELL_LINE:", "ATCC:", "ORGAN:", "TISSUE:",
232                                        "CELL:", "ORGANELLE:", "SECRETION:", "GENE:",
233                                        "CELLULAR_LOCATION:", "EXPRESSION_SYSTEM:",
234                                        "EXPRESSION_SYSTEM_TAXID:",
235                                        "EXPRESSION_SYSTEM_STRAIN:", "EXPRESSION_SYSTEM_VARIANT:",
236                                        "EXPRESSION_SYSTEM_CELL_LINE:",
237                                        "EXPRESSION_SYSTEM_ATCC_NUMBER:",
238                                        "EXPRESSION_SYSTEM_ORGAN:", "EXPRESSION_SYSTEM_TISSUE:",
239                                        "EXPRESSION_SYSTEM_CELL:", "EXPRESSION_SYSTEM_ORGANELLE:",
240                                        "EXPRESSION_SYSTEM_CELLULAR_LOCATION:",
241                                        "EXPRESSION_SYSTEM_VECTOR_TYPE:",
242                                        "EXPRESSION_SYSTEM_VECTOR:", "EXPRESSION_SYSTEM_PLASMID:",
243                                        "EXPRESSION_SYSTEM_GENE:", "OTHER_DETAILS:"));
244
245        private int atomCount;
246
247        // parsing options:
248
249        private int my_ATOM_CA_THRESHOLD ;
250
251        private int load_max_atoms;
252
253        private boolean atomOverflow;
254
255        /** flag to tell parser to only read Calpha coordinates **/
256        private boolean parseCAonly;
257
258
259        private FileParsingParameters params;
260
public PDBFileParser() {
    // Start from default parsing options and read the atom limits from
    // them directly; setFileParsingParameters(params) is deliberately
    // NOT called here.
    params = new FileParsingParameters();
    load_max_atoms = params.getMaxAtoms();
    my_ATOM_CA_THRESHOLD = params.getAtomCaThreshold();

    // no structure, chain or group is in progress yet
    structure = null;
    current_chain = null;
    current_group = null;
    current_compound = null;
    current_model = new ArrayList<>();

    // header level containers
    pdbHeader = new PDBHeader();
    crystallographicInfo = new PDBCrystallographicInfo();
    dbrefs = new ArrayList<>();
    // note: this discards the map created by the field initializer
    siteMap = null;

    // record collections filled while reading the file
    connects = new ArrayList<>();
    helixList = new ArrayList<>();
    strandList = new ArrayList<>();
    turnList = new ArrayList<>();
    linkRecords = new ArrayList<>();

    // format used for the dates of the HEADER and REVDAT records
    dateFormat = new SimpleDateFormat("dd-MMM-yy", Locale.US);

    // atom bookkeeping
    atomCount = 0;
    atomOverflow = false;
    parseCAonly = false;
}
292
293        /** initiate new resNum, either Hetatom, Nucleotide, or AminoAcid */
294        private Group getNewGroup(String recordName,Character aminoCode1, String aminoCode3) {
295
296                Group g =  ChemCompGroupFactory.getGroupFromChemCompDictionary(aminoCode3);
297                if ( g != null && !g.getChemComp().isEmpty())
298                        return g;
299
300
301                Group group;
302                if (aminoCode1 == null || StructureTools.UNKNOWN_GROUP_LABEL == aminoCode1 ){
303                        group = new HetatomImpl();
304
305                } else if(StructureTools.isNucleotide(aminoCode3))  {
306                        // it is a nucleotide
307                        NucleotideImpl nu = new NucleotideImpl();
308                        group = nu;
309
310                } else {
311                        AminoAcidImpl aa = new AminoAcidImpl() ;
312                        aa.setAminoType(aminoCode1);
313                        group = aa ;
314                }
315
316                //              System.out.println("new resNum type: "+ resNum.getType() );
317                return  group ;
318        }
319
320
321
322        // Handler methods to deal with PDB file records properly.
323        /**
324         Handler for
325         HEADER Record Format
326
327         COLUMNS        DATA TYPE       FIELD           DEFINITION
328         ----------------------------------------------------------------------------------
329         1 -  6        Record name     "HEADER"
330         11 - 50        String(40)      classification  Classifies the molecule(s)
331         51 - 59        Date            depDate         Deposition date.  This is the date
332         the coordinates were received by
333         the PDB
334         63 - 66        IDcode          idCode          This identifier is unique within PDB
335
336         */
337        private void pdb_HEADER_Handler(String line) {
338                //System.out.println(line);
339
340                String classification  = null;
341                String deposition_date = null;
342                String pdbCode         = null;
343
344                int len = line.trim().length();
345                if(len > 10) {
346                        classification  = line.substring (10, min(len,50)).trim() ;
347                        pdbHeader.setClassification(classification);
348                }
349                if(len > 50) {
350                        deposition_date = line.substring (50, min(len,59)).trim() ;
351                        try {
352                                Date dep = dateFormat.parse(deposition_date);
353                                pdbHeader.setDepDate(dep);
354
355                        } catch (ParseException e){
356                                logger.info("Could not parse deposition date string '"+deposition_date+"'. Will continue without deposition date");
357                        }
358                }
359                if(len > 62) {
360                        pdbCode         = line.substring (62, min(len,66)).trim() ;
361                        pdbId = pdbCode;
362
363                        logger.debug("Parsing entry " + pdbId);
364
365
366                        structure.setPDBCode(pdbCode);
367                        pdbHeader.setIdCode(pdbCode);
368                }
369
370                //*really* old files (you'll need to hunt to find these as they
371                //should have been remediated) have headers like below. Plus the
372                //pdbId at positions 72-76 is present in every line
373
374                //HEADER    PROTEINASE INHIBITOR (TRYPSIN)          05-OCT-84   5PTI      5PTI   3
375                //HEADER    TRANSFERASE (ACYLTRANSFERASE)           02-SEP-92   1LAC      1LAC   2
376                if (len > 66) {
377                        if (pdbId.equals(line.substring (72, 76))){
378                                isLegacyFormat = true;
379                                System.out.println(pdbId + " is a LEGACY entry - this will most likely not parse correctly.");
380                        }
381                }
382
383        }
384
385
386        /** parses the following record:
387         * <pre>
388         *  COLUMNS      DATA  TYPE      FIELD         DEFINITION
389         * ------------------------------------------------------------------------------------
390         *  1 -  6      Record name     "AUTHOR"
391         *  9 - 10      Continuation    continuation  Allows concatenation of multiple records.
392         * 11 - 79      List            authorList    List of the author names, separated
393         *                                            by commas.
394         *
395         * </pre>
396         * @param line
397         */
398        private void pdb_AUTHOR_Handler(String line) {
399
400                String authors = line.substring(10).trim();
401
402                String auth = pdbHeader.getAuthors();
403                if (auth == null){
404                        pdbHeader.setAuthors(authors);
405                } else {
406                        auth +=  authors;
407                        pdbHeader.setAuthors(auth);
408                }
409
410        }
411
412
413
414        /** parses the following record:
415         *
416         * <pre>
417         * COLUMNS       DATA TYPE        FIELD        DEFINITION
418         * --------------------------------------------------------------------
419         *  1 -  6       Record name      "HELIX "
420         *  8 - 10       Integer          serNum       Serial number of the helix.
421         *                                             This starts at 1 and increases
422         *                                             incrementally.
423         * 12 - 14       LString(3)       helixID      Helix identifier. In addition
424         *                                             to a serial number, each helix is
425         *                                             given an alphanumeric character
426         *                                             helix identifier.
427         * 16 - 18       Residue name     initResName  Name of the initial residue.
428         * 20            Character        initChainID  Chain identifier for the chain
429         *                                             containing this helix.
430         * 22 - 25       Integer          initSeqNum   Sequence number of the initial
431         *                                             residue.
432         * 26            AChar            initICode    Insertion code of the initial
433         *                                             residue.
434         * 28 - 30       Residue name     endResName   Name of the terminal residue of
435         *                                             the helix.
436         * 32            Character        endChainID   Chain identifier for the chain
437         *                                             containing this helix.
438         * 34 - 37       Integer          endSeqNum    Sequence number of the terminal
439         *                                             residue.
440         * 38            AChar            endICode     Insertion code of the terminal
441         *                                             residue.
442         * 39 - 40       Integer          helixClass   Helix class (see below).
443         * 41 - 70       String           comment      Comment about this helix.
444         * 72 - 76       Integer          length       Length of this helix.
445         * </pre>
446         */
447
448        private void pdb_HELIX_Handler(String line){
449
450                if (params.isHeaderOnly()) return;
451
452                if (line.length()<38) {
453                        logger.info("HELIX line has length under 38. Ignoring it.");
454                        return;
455                }
456
457                String initResName = line.substring(15,18).trim();
458                String initChainId = line.substring(19,20);
459                String initSeqNum  = line.substring(21,25).trim();
460                String initICode   = line.substring(25,26);
461                String endResName  = line.substring(27,30).trim();
462                String endChainId  = line.substring(31,32);
463                String endSeqNum   = line.substring(33,37).trim();
464                String endICode    = line.substring(37,38);
465
466                //System.out.println(initResName + " " + initChainId + " " + initSeqNum + " " + initICode + " " +
467                //        endResName + " " + endChainId + " " + endSeqNum + " " + endICode);
468
469                Map<String,String> m = new HashMap<String,String>();
470
471                m.put("initResName",initResName);
472                m.put("initChainId", initChainId);
473                m.put("initSeqNum", initSeqNum);
474                m.put("initICode", initICode);
475                m.put("endResName", endResName);
476                m.put("endChainId", endChainId);
477                m.put("endSeqNum",endSeqNum);
478                m.put("endICode",endICode);
479
480                helixList.add(m);
481
482        }
483
484        /**
485         * Handler for
486         * <pre>
487         *       COLUMNS     DATA TYPE        FIELD           DEFINITION
488         * --------------------------------------------------------------
489         *  1 -  6     Record name      "SHEET "
490         *  8 - 10     Integer          strand       Strand number which starts at 1
491         *                                           for each strand within a sheet
492         *                                           and increases by one.
493         * 12 - 14     LString(3)       sheetID      Sheet identifier.
494         * 15 - 16     Integer          numStrands   Number of strands in sheet.
495         * 18 - 20     Residue name     initResName  Residue name of initial residue.
496         * 22          Character        initChainID  Chain identifier of initial
497         *                                           residue in strand.
498         * 23 - 26     Integer          initSeqNum   Sequence number of initial
499         *                                           residue in strand.
500         * 27          AChar            initICode    Insertion code of initial residue
501         *                                           in strand.
502         * 29 - 31     Residue name     endResName   Residue name of terminal residue.
503         * 33          Character        endChainID   Chain identifier of terminal
504         *                                           residue.
505         * 34 - 37     Integer          endSeqNum    Sequence number of terminal
506         *                                           residue.
507         * 38          AChar            endICode     Insertion code of terminal
508         *                                           residue.
509         * 39 - 40     Integer          sense        Sense of strand with respect to
510         *                                           previous strand in the sheet. 0
511         *                                           if first strand, 1 if parallel,
512         *                                           -1 if anti-parallel.
513         * 42 - 45     Atom             curAtom      Registration. Atom name in
514         *                                           current strand.
515         * 46 - 48     Residue name     curResName   Registration. Residue name in
516         *                                           current strand.
517         * 50          Character        curChainId   Registration. Chain identifier in
518         *                                           current strand.
519         * 51 - 54     Integer          curResSeq    Registration. Residue sequence
520         *                                           number in current strand.
521         * 55          AChar            curICode     Registration. Insertion code in
522         *                                           current strand.
523         * 57 - 60     Atom             prevAtom     Registration. Atom name in
524         *                                           previous strand.
525         * 61 - 63     Residue name     prevResName  Registration. Residue name in
526         *                                           previous strand.
527         * 65          Character        prevChainId  Registration. Chain identifier in
528         *                                           previous strand.
529         * 66 - 69     Integer          prevResSeq   Registration. Residue sequence
530         *                                           number in previous strand.
531         * 70          AChar            prevICode    Registration. Insertion code in
532         *                                               previous strand.
533         * </pre>
534         */
535        private void pdb_SHEET_Handler( String line){
536
537                if (params.isHeaderOnly()) return;
538
539                if (line.length()<38) {
540                        logger.info("SHEET line has length under 38. Ignoring it.");
541                        return;
542                }
543
544                String initResName = line.substring(17,20).trim();
545                String initChainId = line.substring(21,22);
546                String initSeqNum  = line.substring(22,26).trim();
547                String initICode   = line.substring(26,27);
548                String endResName  = line.substring(28,31).trim();
549                String endChainId  = line.substring(32,33);
550                String endSeqNum   = line.substring(33,37).trim();
551                String endICode    = line.substring(37,38);
552
553                //System.out.println(initResName + " " + initChainId + " " + initSeqNum + " " + initICode + " " +
554                //        endResName + " " + endChainId + " " + endSeqNum + " " + endICode);
555
556                Map<String,String> m = new HashMap<String,String>();
557
558                m.put("initResName",initResName);
559                m.put("initChainId", initChainId);
560                m.put("initSeqNum", initSeqNum);
561                m.put("initICode", initICode);
562                m.put("endResName", endResName);
563                m.put("endChainId", endChainId);
564                m.put("endSeqNum",endSeqNum);
565                m.put("endICode",endICode);
566
567                strandList.add(m);
568        }
569
570
571        /**
572         * Handler for TURN lines
573         * <pre>
574         * COLUMNS      DATA TYPE        FIELD         DEFINITION
575         * --------------------------------------------------------------------
576         *  1 -  6      Record name      "TURN "
577         *  8 - 10      Integer          seq           Turn number; starts with 1 and
578         *                                             increments by one.
579         * 12 - 14      LString(3)       turnId        Turn identifier
580         * 16 - 18      Residue name     initResName   Residue name of initial residue in
581         *                                             turn.
582         * 20           Character        initChainId   Chain identifier for the chain
583         *                                             containing this turn.
584         * 21 - 24      Integer          initSeqNum    Sequence number of initial residue
585         *                                             in turn.
586         * 25           AChar            initICode     Insertion code of initial residue
587         *                                             in turn.
588         * 27 - 29      Residue name     endResName    Residue name of terminal residue
589         *                                             of turn.
590         * 31           Character        endChainId    Chain identifier for the chain
591         *                                             containing this turn.
592         * 32 - 35      Integer          endSeqNum     Sequence number of terminal
593         *                                             residue of turn.
594         * 36           AChar            endICode      Insertion code of terminal residue
595         *                                             of turn.
596         * 41 - 70      String           comment       Associated comment.
597         * </pre>
598         * @param line
599         */
600        private void pdb_TURN_Handler( String line){
601
602                if (params.isHeaderOnly()) return;
603
604                if (line.length()<36) {
605                        logger.info("TURN line has length under 36. Ignoring it.");
606                        return;
607                }
608
609                String initResName = line.substring(15,18).trim();
610                String initChainId = line.substring(19,20);
611                String initSeqNum  = line.substring(20,24).trim();
612                String initICode   = line.substring(24,25);
613                String endResName  = line.substring(26,29).trim();
614                String endChainId  = line.substring(30,31);
615                String endSeqNum   = line.substring(31,35).trim();
616                String endICode    = line.substring(35,36);
617
618                //System.out.println(initResName + " " + initChainId + " " + initSeqNum + " " + initICode + " " +
619                //        endResName + " " + endChainId + " " + endSeqNum + " " + endICode);
620
621                Map<String,String> m = new HashMap<String,String>();
622
623                m.put("initResName",initResName);
624                m.put("initChainId", initChainId);
625                m.put("initSeqNum", initSeqNum);
626                m.put("initICode", initICode);
627                m.put("endResName", endResName);
628                m.put("endChainId", endChainId);
629                m.put("endSeqNum",endSeqNum);
630                m.put("endICode",endICode);
631
632                turnList.add(m);
633        }
634
635        /**
636         * Handler for
637         * REVDAT Record format:
638         *
639         * COLUMNS       DATA TYPE      FIELD         DEFINITION
640         * ----------------------------------------------------------------------------------
641         * 1 -  6       Record name    "REVDAT"
642         * 8 - 10       Integer        modNum        Modification number.
643         * 11 - 12       Continuation   continuation  Allows concatenation of multiple
644         * records.
645         * 14 - 22       Date           modDate       Date of modification (or release for
646         * new entries).  This is not repeated
647         * on continuation lines.
648         * 24 - 28       String(5)      modId         Identifies this particular
649         * modification.  It links to the
650         * archive used internally by PDB.
651         * This is not repeated on continuation
652         * lines.
653         * 32            Integer        modType       An integer identifying the type of
654         * modification.  In case of revisions
655         * with more than one possible modType,
656         * the highest value applicable will be
657         * assigned.
658         * 40 - 45       LString(6)     record        Name of the modified record.
659         * 47 - 52       LString(6)     record        Name of the modified record.
660         * 54 - 59       LString(6)     record        Name of the modified record.
661         * 61 - 66       LString(6)     record        Name of the modified record.
662         */
663        private void pdb_REVDAT_Handler(String line) {
664
665                // only keep the first...
666                Date modDate = pdbHeader.getModDate();
667
668                if ( modDate==null || modDate.equals(new Date(0)) ) {
669                        // modDate is still uninitialized
670                        String modificationDate = line.substring (13, 22).trim() ;
671
672                        try {
673                                Date dep = dateFormat.parse(modificationDate);
674                                pdbHeader.setModDate(dep);
675                        } catch (ParseException e){
676                                logger.info("Could not parse modification date string '"+modificationDate+"'. Will continue without modification date");
677                        }
678
679                }
680        }
681
682        /** @author Jules Jacobsen
683         * Handler for
684         * SEQRES record format
685         * SEQRES records contain the amino acid or nucleic acid sequence of residues in each chain of the macromolecule that was studied.
686         * <p/>
687         * Record Format
688         * <p/>
689         * COLUMNS        DATA TYPE       FIELD         DEFINITION
690         * ---------------------------------------------------------------------------------
691         * 1 -  6        Record name     "SEQRES"
692         * <p/>
693         * 9 - 10        Integer         serNum        Serial number of the SEQRES record
694         * for the current chain.  Starts at 1
695         * and increments by one each line.
696         * Reset to 1 for each chain.
697         * <p/>
698         * 12             Character       chainID       Chain identifier.  This may be any
699         * single legal character, including a
700         * blank which is used if there is
701         * only one chain.
702         * <p/>
703         * 14 - 17        Integer         numRes        Number of residues in the chain.
704         * This value is repeated on every
705         * record.
706         * <p/>
707         * 20 - 22        Residue name    resName       Residue name.
708         * <p/>
709         * 24 - 26        Residue name    resName       Residue name.
710         * <p/>
711         * 28 - 30        Residue name    resName       Residue name.
712         * <p/>
713         * 32 - 34        Residue name    resName       Residue name.
714         * <p/>
715         * 36 - 38        Residue name    resName       Residue name.
716         * <p/>
717         * 40 - 42        Residue name    resName       Residue name.
718         * <p/>
719         * 44 - 46        Residue name    resName       Residue name.
720         * <p/>
721         * 48 - 50        Residue name    resName       Residue name.
722         * <p/>
723         * 52 - 54        Residue name    resName       Residue name.
724         * <p/>
725         * 56 - 58        Residue name    resName       Residue name.
726         * <p/>
727         * 60 - 62        Residue name    resName       Residue name.
728         * <p/>
729         * 64 - 66        Residue name    resName       Residue name.
730         * <p/>
731         * 68 - 70        Residue name    resName       Residue name.
732         */
733        private void pdb_SEQRES_Handler(String line) {
734
735                /*
736                 *          1         2         3         4         5         6         7
737                 * 1234567890123456789012345678901234567890123456789012345678901234567890
738                 * SEQRES   1 A  376  LYS PRO VAL THR VAL LYS LEU VAL ASP SER GLN ALA THR
739                 * SEQRES   1 A   21  GLY ILE VAL GLU GLN CYS CYS THR SER ILE CYS SER LEU
740                 * SEQRES   2 A   21  TYR GLN LEU GLU ASN TYR CYS ASN
741                 * SEQRES   1 B   30  PHE VAL ASN GLN HIS LEU CYS GLY SER HIS LEU VAL GLU
742                 * SEQRES   2 B   30  ALA LEU TYR LEU VAL CYS GLY GLU ARG GLY PHE PHE TYR
743                 * SEQRES   3 B   30  THR PRO LYS ALA
744                 * SEQRES   1 C   21  GLY ILE VAL GLU GLN CYS CYS THR SER ILE CYS SER LEU
745                 * SEQRES   2 C   21  TYR GLN LEU GLU ASN TYR CYS ASN
746                 * SEQRES   1 D   30  PHE VAL ASN GLN HIS LEU CYS GLY SER HIS LEU VAL GLU
747                 * SEQRES   2 D   30  ALA LEU TYR LEU VAL CYS GLY GLU ARG GLY PHE PHE TYR
748                 * SEQRES   3 D   30  THR PRO LYS ALA
749                 */
750
751                String recordName = line.substring(0, 6).trim();
752                String chainID    = line.substring(11, 12);
753                String newLength   = line.substring(13,17).trim();
754                String subSequence = line.substring(18);
755
756                if ( lengthCheck == -1 ){
757                        lengthCheck = Integer.parseInt(newLength);
758                }
759
760                StringTokenizer subSequenceResidues = new StringTokenizer(subSequence);
761
762                Character aminoCode1 = null;
763                if (! recordName.equals(AminoAcid.SEQRESRECORD)) {
764                        // should not have been called
765                        return;
766                }
767
768                current_chain = isKnownChain(chainID, seqResChains);
769                if ( current_chain == null) {
770
771                        current_chain = new ChainImpl();
772                        current_chain.setChainID(chainID);
773
774                }
775
776                while (subSequenceResidues.hasMoreTokens()) {
777
778                        String threeLetter = subSequenceResidues.nextToken();
779
780                        aminoCode1 = StructureTools.get1LetterCode(threeLetter);
781
782                        //if (aminoCode1 == null) {
783                        // could be a nucleotide...
784                        // but getNewGroup takes care of that and converts ATOM records with aminoCode1 == nnull to nucleotide...
785                        //}
786                        current_group = getNewGroup("ATOM", aminoCode1, threeLetter);
787
788                        current_group.setPDBName(threeLetter);
789
790                        if ( current_group instanceof AminoAcid){
791                                AminoAcid aa = (AminoAcid)current_group;
792                                aa.setRecordType(AminoAcid.SEQRESRECORD);
793                        }
794                        // add the current resNum to the new chain.
795                        current_chain.addGroup(current_group);
796
797                }
798                Chain test = isKnownChain(chainID, seqResChains);
799
800                if ( test == null)
801                        seqResChains.add(current_chain);
802
803                if (current_group != null)
804                        current_group.trimToSize();
805
806                current_group = null;
807                current_chain = null;
808
809                //               the current chain is finished!
810                //if ( current_chain.getLength() != lengthCheck ){
811                //      System.err.println("the length of chain " + current_chain.getName() + "(" +
812                //                      current_chain.getLength() + ") does not match the expected " + lengthCheck);
813                //}
814
815                lengthCheck = Integer.parseInt(newLength);
816
817        }
818
819
820
821        /** Handler for
822         TITLE Record Format
823
824         COLUMNS        DATA TYPE       FIELD          DEFINITION
825         ----------------------------------------------------------------------------------
826         1 -  6        Record name     "TITLE "
827         9 - 10        Continuation    continuation   Allows concatenation of multiple
828         records.
829         11 - 70        String          title          Title of the experiment.
830
831
832         */
833        private void pdb_TITLE_Handler(String line) {
834                String title;
835                if ( line.length() > 79)
836                        title = line.substring(10,80).trim();
837                else
838                        title = line.substring(10,line.length()).trim();
839
840                String t = pdbHeader.getTitle();
841                if ( (t != null) && (! t.equals("")) ){
842                        if (t.endsWith("-"))
843                                t += ""; // if last line ends with a hyphen then we don't add space
844                        else
845                                t += " ";
846                }
847                else t = "";
848
849                t += title;
850
851                pdbHeader.setTitle(t);
852        }
853
854        /**
855         * JRNL handler.
856         * The JRNL record contains the primary literature citation that describes the experiment which resulted
857         * in the deposited coordinate set. There is at most one JRNL reference per entry. If there is no primary
858         * reference, then there is no JRNL reference. Other references are given in REMARK 1.
859         *
860         * Record Format
861         *
862         * COLUMNS       DATA TYPE     FIELD         DEFINITION
863         * -----------------------------------------------------------------------
864         * 1 -  6       Record name   "JRNL  "
865         *
866         * 13 - 70       LString        text         See Details below.
867         *
868         */
869        private void pdb_JRNL_Handler(String line) {
870                //add the strings to the journalLines
871                //the actual JournalArticle is then built when the whole entry is being
872                //finalized with triggerEndFileChecks()
873                //JRNL        TITL   NMR SOLUTION STRUCTURE OF RECOMBINANT TICK           1TAP  10
874                if (line.substring(line.length() - 8, line.length() - 4).equals(pdbId)) {
875                        //trim off the trailing PDB id from legacy files.
876                        //are we really trying to still cater for these museum pieces?
877
878                        logger.debug("trimming legacy PDB id from end of JRNL section line");
879
880                        line = line.substring(0, line.length() - 8);
881                        journalLines.add(line);
882                } else {
883                        journalLines.add(line);
884                }
885        }
886
887        /**
888         * This should not be accessed directly, other than by <code>makeCompounds</code>. It still deals with the same
889         * lines in a similar manner but if not accessed from <code>makeCompounds</code> the last element will be
890         * missing. Don't say I didn't warn you.
891         *
892         * @param line
893         */
894        private void pdb_COMPND_Handler(String line) {
895
896                logger.debug("previousContinuationField  is "
897                                        + previousContinuationField);
898                logger.debug("current continuationField  is "
899                                        + continuationField);
900                logger.debug("current continuationString is "
901                                        + continuationString);
902                logger.debug("current compound           is "
903                                        + current_compound);
904
905
906                // In legacy PDB files the line ends with the PDB code and a serial number, chop those off!
907                //format version 3.0 onwards will have 80 characters in a line
908                //              if (line.length() > 72) {
909                if (isLegacyFormat) {
910                        //                    if (DEBUG) {
911                        //                        System.out.println("We have a legacy file - truncating line length to 71 characters:");
912                        //                        System.out.println(line);
913                        //                    }
914                        line = line.substring(0, 72);
915                }
916
917                line = line.substring(10, line.length());
918
919
920                String[] fieldList = line.trim().split("\\s+");
921                int fl = fieldList.length;
922                if ((fl >0 ) && compndFieldValues.contains(fieldList[0])) {
923
924                        continuationField = fieldList[0];
925                        if (previousContinuationField.equals("")) {
926                                previousContinuationField = continuationField;
927                        }
928
929                } else if (fl>0) {
930                        // the ':' character indicates the end of a field name and should be invalid as part the first data token
931                        // e.g. obsolete file 1hhb has a malformed COMPND line that can only be caught with this kind of check
932                        if (fieldList[0].contains(":") ) {
933                                logger.info("COMPND line does not follow the PDB 3.0 format. Note that COMPND parsing is not supported any longer in format 2.3 or earlier");
934                                return;
935                        }
936
937                } else {
938
939                        // the line will be added as data to the previous field
940                }
941
942                line = line.replace(continuationField, "").trim();
943
944                StringTokenizer compndTokens = new StringTokenizer(line);
945
946                //              System.out.println("PDBFileParser.pdb_COMPND_Handler: Tokenizing '" + line + "'");
947
948                while (compndTokens.hasMoreTokens()) {
949                        String token = compndTokens.nextToken();
950
951                        if (previousContinuationField.equals("")) {
952                                previousContinuationField = continuationField;
953                        }
954
955                        if (previousContinuationField.equals(continuationField)
956                                        && compndFieldValues.contains(continuationField)) {
957
958                                logger.debug("Still in field " + continuationField);
959                                logger.debug("token = " + token);
960
961                                continuationString = continuationString.concat(token + " ");
962
963                                logger.debug("continuationString = "
964                                                        + continuationString);
965
966                        }
967                        if (!continuationField.equals(previousContinuationField)) {
968
969                                if (continuationString.equals("")) {
970                                        continuationString = token;
971
972                                } else {
973
974                                        compndValueSetter(previousContinuationField,
975                                                        continuationString);
976                                        previousContinuationField = continuationField;
977                                        continuationString = token + " ";
978                                }
979                        } else if (ignoreCompndFieldValues.contains(token)) {
980                                // this field shall be ignored
981                                //continuationField = token;
982                        }
983                }
984                if (isLastCompndLine) {
985                        // final line in the section - finish off the compound
986                        //                      System.out.println("[pdb_COMPND_Handler] Final COMPND line - Finishing off final MolID header.");
987                        compndValueSetter(continuationField, continuationString);
988                        continuationString = "";
989                        if (current_compound!=null) compounds.add(current_compound);
990                }
991        }
992
993        /**
994         * Set the value in the current molId object
995         * @param field
996         * @param value
997         */
998        private void compndValueSetter(String field, String value) {
999
1000                value = value.trim().replace(";", "");
1001                if (field.equals("MOL_ID:")) {
1002
1003                        int i = -1;
1004                        try {
1005                                i = Integer.valueOf(value);
1006                        } catch (NumberFormatException e){
1007                                logger.warn("Value '{}' does not look like a number, while trying to parse COMPND MOL_ID line.",value);
1008                        }
1009                        if (i>0 && prevMolId!=i) {
1010
1011                                if (current_compound!=null) compounds.add(current_compound);
1012
1013                                logger.debug("Initialising new Compound with mol_id {}", i);
1014
1015                                current_compound = new Compound();
1016
1017                                current_compound.setMolId(i);
1018
1019                                prevMolId = i;
1020                        }
1021
1022                }
1023
1024                // if for some reason (e.g. missing mol_id line) the current_compound is null we can't add anything to it, return
1025                if (current_compound==null) {
1026                        return;
1027                }
1028
1029                if (field.equals("MOLECULE:")) {
1030                        current_compound.setMolName(value);
1031
1032                }
1033                if (field.equals("CHAIN:")) {
1034                        //System.out.println(value);
1035                        StringTokenizer chainTokens = new StringTokenizer(value, ",");
1036                        List<String> chains = new ArrayList<String>();
1037
1038                        while (chainTokens.hasMoreTokens()) {
1039                                String chainID = chainTokens.nextToken().trim();
1040                                // NULL is used in old PDB files to represent empty chain DI
1041                                if (chainID.equals("NULL"))
1042                                        chainID = " ";
1043                                chains.add(chainID);
1044                        }
1045                        compoundMolIds2chainIds.put(current_compound.getMolId(),chains);
1046
1047                }
1048                if (field.equals("SYNONYM:")) {
1049
1050                        StringTokenizer synonyms = new StringTokenizer(value, ",");
1051                        List<String> names = new ArrayList<String>();
1052
1053                        while (synonyms.hasMoreTokens()) {
1054                                names.add(synonyms.nextToken());
1055
1056                                current_compound.setSynonyms(names);
1057                        }
1058
1059                }
1060
1061                if (field.equals("EC:")) {
1062
1063                        StringTokenizer ecNumTokens = new StringTokenizer(value, ",");
1064                        List<String> ecNums = new ArrayList<String>();
1065
1066                        while (ecNumTokens.hasMoreTokens()) {
1067                                ecNums.add(ecNumTokens.nextToken());
1068
1069                                current_compound.setEcNums(ecNums);
1070                        }
1071
1072                }
1073                if (field.equals("FRAGMENT:")) {
1074
1075                        current_compound.setFragment(value);
1076
1077                }
1078                if (field.equals("ENGINEERED:")) {
1079
1080                        current_compound.setEngineered(value);
1081
1082                }
1083                if (field.equals("MUTATION:")) {
1084
1085                        current_compound.setMutation(value);
1086
1087                }
1088                if (field.equals("BIOLOGICAL_UNIT:")) {
1089
1090                        current_compound.setBiologicalUnit(value);
1091
1092                }
1093                if (field.equals("OTHER_DETAILS:")) {
1094
1095                        current_compound.setDetails(value);
1096
1097                }
1098
1099        }
1100
1101
1102        /** Handler for
1103         * SOURCE Record format
1104         *
1105         * The SOURCE record specifies the biological and/or chemical source of each biological molecule in the entry. Sources are described by both the common name and the scientific name, e.g., genus and species. Strain and/or cell-line for immortalized cells are given when they help to uniquely identify the biological entity studied.
1106         * Record Format
1107         *
1108         * COLUMNS   DATA TYPE         FIELD          DEFINITION
1109         * -------------------------------------------------------------------------------
1110         *  1 -  6   Record name       "SOURCE"
1111         *  9 - 10   Continuation      continuation   Allows concatenation of multiple records.
1112         * 11 - 70   Specification     srcName        Identifies the source of the macromolecule in
1113         *            list                            a token: value format.
1114         * @param line the line to be parsed
1115         */
1116        private void pdb_SOURCE_Handler(String line) {
1117                // works in the same way as the pdb_COMPND_Handler.
1118                String continuationNr = line.substring(9, 10).trim();
1119
1120
1121
1122                logger.debug("current continuationNo     is "
1123                                + continuationNr);
1124                logger.debug("previousContinuationField  is "
1125                                + previousContinuationField);
1126                logger.debug("current continuationField  is "
1127                                + continuationField);
1128                logger.debug("current continuationString is "
1129                                + continuationString);
1130                logger.debug("current compound           is "
1131                                + current_compound);
1132
1133
1134                // following the docs, the last valid character should be 79, chop off the rest
1135                if (line.length() > 79) {
1136                        line = line.substring(0, 79);
1137                }
1138
1139                line = line.substring(10, line.length());
1140
1141                logger.debug("LINE: >" + line + "<");
1142
1143                String[] fieldList = line.split("\\s+");
1144
1145                if (!fieldList[0].equals("")
1146                                && sourceFieldValues.contains(fieldList[0])) {
1147                        //                      System.out.println("[PDBFileParser.pdb_COMPND_Handler] Setting continuationField to '" + fieldList[0] + "'");
1148                        continuationField = fieldList[0];
1149                        if (previousContinuationField.equals("")) {
1150                                previousContinuationField = continuationField;
1151                        }
1152
1153                } else if ((fieldList.length > 1) && ( sourceFieldValues.contains(fieldList[1]))) {
1154                        //                      System.out.println("[PDBFileParser.pdb_COMPND_Handler] Setting continuationField to '" + fieldList[1] + "'");
1155                        continuationField = fieldList[1];
1156                        if (previousContinuationField.equals("")) {
1157                                previousContinuationField = continuationField;
1158                        }
1159
1160                } else {
1161                        if (continuationNr.equals("")) {
1162
1163                                logger.debug("looks like an old PDB file");
1164
1165                                continuationField = "MOLECULE:";
1166                                if (previousContinuationField.equals("")) {
1167                                        previousContinuationField = continuationField;
1168                                }
1169                        }
1170
1171                }
1172
1173                line = line.replace(continuationField, "").trim();
1174
1175                StringTokenizer compndTokens = new StringTokenizer(line);
1176
1177                //              System.out.println("PDBFileParser.pdb_COMPND_Handler: Tokenizing '" + line + "'");
1178
1179                while (compndTokens.hasMoreTokens()) {
1180                        String token = compndTokens.nextToken();
1181
1182                        if (previousContinuationField.equals("")) {
1183                                //                              System.out.println("previousContinuationField is empty. Setting to : " + continuationField);
1184                                previousContinuationField = continuationField;
1185                        }
1186
1187                        if (previousContinuationField.equals(continuationField)
1188                                        && sourceFieldValues.contains(continuationField)) {
1189
1190                                logger.debug("Still in field " + continuationField);
1191
1192                                continuationString = continuationString.concat(token + " ");
1193
1194                                logger.debug("continuationString = "
1195                                                        + continuationString);
1196                        }
1197                        if (!continuationField.equals(previousContinuationField)) {
1198
1199                                if (continuationString.equals("")) {
1200                                        continuationString = token;
1201
1202                                } else {
1203
1204                                        sourceValueSetter(previousContinuationField,
1205                                                        continuationString);
1206                                        previousContinuationField = continuationField;
1207                                        continuationString = token + " ";
1208                                }
1209                        } else if (ignoreCompndFieldValues.contains(token)) {
1210                                // this field shall be ignored
1211                                //continuationField = token;
1212                        }
1213                }
1214                if (isLastSourceLine) {
1215                        // final line in the section - finish off the compound
1216                        //                      System.out.println("[pdb_SOURCE_Handler] Final SOURCE line - Finishing off final MolID header.");
1217                        sourceValueSetter(continuationField, continuationString);
1218                        continuationString = "";
1219                        //compounds.add(current_compound);
1220                }
1221
1222        }
1223
1224
1225        /** set the value in the current molId object
1226         *
1227         * @param field
1228         * @param value
1229         */
1230        private void sourceValueSetter(String field, String value) {
1231
1232                value = value.trim().replace(";", "");
1233                //              System.out.println("[sourceValueSetter] " + field);
1234                if (field.equals("MOL_ID:")) {
1235
1236                        try {
1237                                current_compound = compounds.get(Integer.valueOf(value) - 1);
1238                        } catch (NumberFormatException e){
1239                                logger.info("could not process SOURCE MOL_ID record correctly:" + e.getMessage());
1240                                return;
1241                        }
1242
1243
1244                        //                      System.out.println("[sourceValueSetter] Fetching compound " + value + " " + current_compound.getMolId());
1245
1246                }
1247                if (field.equals("SYNTHETIC:")) {
1248                        current_compound.setSynthetic(value);
1249                } else if (field.equals("FRAGMENT:")) {
1250                        current_compound.setFragment(value);
1251                } else if (field.equals("ORGANISM_SCIENTIFIC:")) {
1252                        current_compound.setOrganismScientific(value);
1253                } else if (field.equals("ORGANISM_TAXID:")) {
1254                        current_compound.setOrganismTaxId(value);
1255                } else if (field.equals("ORGANISM_COMMON:")) {
1256                        current_compound.setOrganismCommon(value);
1257                } else if (field.equals("STRAIN:")) {
1258                        current_compound.setStrain(value);
1259                } else if (field.equals("VARIANT:")) {
1260                        current_compound.setVariant(value);
1261                } else if (field.equals("CELL_LINE:")) {
1262                        current_compound.setCellLine(value);
1263                } else if (field.equals("ATCC:")) {
1264                        current_compound.setAtcc(value);
1265                } else if (field.equals("ORGAN:")) {
1266                        current_compound.setOrgan(value);
1267                } else if (field.equals("TISSUE:")) {
1268                        current_compound.setTissue(value);
1269                } else if (field.equals("CELL:")) {
1270                        current_compound.setCell(value);
1271                } else if (field.equals("ORGANELLE:")) {
1272                        current_compound.setOrganelle(value);
1273                } else if (field.equals("SECRETION:")) {
1274                        current_compound.setSecretion(value);
1275                } else if (field.equals("GENE:")) {
1276                        current_compound.setGene(value);
1277                } else if (field.equals("CELLULAR_LOCATION:")) {
1278                        current_compound.setCellularLocation(value);
1279                } else if (field.equals("EXPRESSION_SYSTEM:")) {
1280                        current_compound.setExpressionSystem(value);
1281                } else if (field.equals("EXPRESSION_SYSTEM_TAXID:")) {
1282                        current_compound.setExpressionSystemTaxId(value);
1283                } else if (field.equals("EXPRESSION_SYSTEM_STRAIN:")) {
1284                        current_compound.setExpressionSystemStrain(value);
1285                } else if (field.equals("EXPRESSION_SYSTEM_VARIANT:")) {
1286                        current_compound.setExpressionSystemVariant(value);
1287                } else if (field.equals("EXPRESSION_SYSTEM_CELL_LINE:")) {
1288                        current_compound.setExpressionSystemCellLine(value);
1289                } else if (field.equals("EXPRESSION_SYSTEM_ATCC_NUMBER:")) {
1290                        current_compound.setExpressionSystemAtccNumber(value);
1291                } else if (field.equals("EXPRESSION_SYSTEM_ORGAN:")) {
1292                        current_compound.setExpressionSystemOrgan(value);
1293                } else if (field.equals("EXPRESSION_SYSTEM_TISSUE:")) {
1294                        current_compound.setExpressionSystemTissue(value);
1295                } else if (field.equals("EXPRESSION_SYSTEM_CELL:")) {
1296                        current_compound.setExpressionSystemCell(value);
1297                } else if (field.equals("EXPRESSION_SYSTEM_ORGANELLE:")) {
1298                        current_compound.setExpressionSystemOrganelle(value);
1299                } else if (field.equals("EXPRESSION_SYSTEM_CELLULAR_LOCATION:")) {
1300                        current_compound.setExpressionSystemCellularLocation(value);
1301                } else if (field.equals("EXPRESSION_SYSTEM_VECTOR_TYPE:")) {
1302                        current_compound.setExpressionSystemVectorType(value);
1303                } else if (field.equals("EXPRESSION_SYSTEM_VECTOR:")) {
1304                        current_compound.setExpressionSystemVector(value);
1305                } else if (field.equals("EXPRESSION_SYSTEM_PLASMID:")) {
1306                        current_compound.setExpressionSystemPlasmid(value);
1307                } else if (field.equals("EXPRESSION_SYSTEM_GENE:")) {
1308                        current_compound.setExpressionSystemGene(value);
1309                } else if (field.equals("OTHER_DETAILS:")) {
1310                        current_compound.setExpressionSystemOtherDetails(value);
1311                }
1312
1313        }
1314
1315        /**
1316         * Handler for REMARK lines
1317         */
1318        private void pdb_REMARK_Handler(String line) {
1319
1320                if ( line == null || line.length() < 11)
1321                        return;
1322
1323
1324                if (line.startsWith("REMARK 800")) {
1325                        pdb_REMARK_800_Handler(line);
1326
1327                }  else if ( line.startsWith("REMARK 350")){
1328
1329                        if ( params.isParseBioAssembly()) {
1330
1331                                if (bioAssemblyParser == null){
1332                                        bioAssemblyParser = new PDBBioAssemblyParser();
1333                                }
1334
1335                                bioAssemblyParser.pdb_REMARK_350_Handler(line);
1336                        }
1337
1338                // REMARK 3 (for R free)
1339                // note: if more than 1 value present (occurring in hybrid experimental technique entries, e.g. 3ins, 4n9m)
1340                // then last one encountered will be taken
1341                } else if (line.startsWith("REMARK   3   FREE R VALUE")) {
1342
1343                        // Rfree annotation is not very consistent in PDB format, it varies depending on the software
1344                        // Here we follow this strategy:
1345                        // a) take the '(NO CUTOFF)' value if the only one available (shelx software, e.g. 1x7q)
1346                        // b) don't take it if also a line without '(NO CUTOFF)' is present (CNX software, e.g. 3lak)
1347
1348                        Pattern pR = Pattern.compile("^REMARK   3   FREE R VALUE\\s+(?:\\(NO CUTOFF\\))?\\s+:\\s+(\\d?\\.\\d+).*");
1349                        Matcher mR = pR.matcher(line);
1350                        if (mR.matches()) {
1351                                try {
1352                                        rfreeNoCutoffLine = Float.parseFloat(mR.group(1));
1353                                } catch (NumberFormatException e) {
1354                                        logger.info("Rfree value "+mR.group(1)+" does not look like a number, will ignore it");
1355                                }
1356                        }
1357                        pR = Pattern.compile("^REMARK   3   FREE R VALUE\\s+:\\s+(\\d?\\.\\d+).*");
1358                        mR = pR.matcher(line);
1359                        if (mR.matches()) {
1360                                try {
1361                                        rfreeStandardLine = Float.parseFloat(mR.group(1));
1362                                } catch (NumberFormatException e) {
1363                                        logger.info("Rfree value '{}' does not look like a number, will ignore it", mR.group(1));
1364                                }
1365                        }
1366
1367                // REMARK 3 RESOLUTION (contains more info than REMARK 2, for instance multiple resolutions in hybrid experimental technique entries)
1368                // note: if more than 1 value present (occurring in hybrid experimental technique entries, e.g. 3ins, 4n9m)
1369                // then last one encountered will be taken
1370                } else if (line.startsWith("REMARK   3   RESOLUTION RANGE HIGH")){
1371                        Pattern pR = Pattern.compile("^REMARK   3   RESOLUTION RANGE HIGH \\(ANGSTROMS\\) :\\s+(\\d+\\.\\d+).*");
1372                        Matcher mR = pR.matcher(line);
1373                        if (mR.matches()) {
1374                                try {
1375                                        float res = Float.parseFloat(mR.group(1));
1376                                        if (pdbHeader.getResolution()!=PDBHeader.DEFAULT_RESOLUTION) {
1377                                                logger.warn("More than 1 resolution value present, will use last one {} and discard previous {} "
1378                                                                ,mR.group(1), String.format("%4.2f",pdbHeader.getResolution()));
1379                                        }
1380                                        pdbHeader.setResolution(res);
1381                                } catch (NumberFormatException e) {
1382                                        logger.info("Could not parse resolution '{}', ignoring it",mR.group(1));
1383                                }
1384                        }
1385                }
1386
1387        }
1388
1389
1390
1391
1392
1393
1394        /** Handler for
1395         EXPDTA Record Format
1396
1397         COLUMNS       DATA TYPE      FIELD         DEFINITION
1398         -------------------------------------------------------------------------------
1399         1 -  6       Record name    "EXPDTA"
1400         9 - 10       Continuation   continuation  Allows concatenation of multiple
1401         records.
1402         11 - 70       SList          technique     The experimental technique(s) with
1403         optional comment describing the
1404         sample or experiment.
1405
1406         allowed techniques are:
1407         ELECTRON DIFFRACTION
1408         FIBER DIFFRACTION
1409         FLUORESCENCE TRANSFER
1410         NEUTRON DIFFRACTION
1411         NMR
1412         THEORETICAL MODEL
1413         X-RAY DIFFRACTION
1414
1415         */
1416
1417        private void pdb_EXPDTA_Handler(String line) {
1418
1419                String technique  ;
1420                if (line.length() > 69)
1421                        technique = line.substring (10, 70).trim() ;
1422                else
1423                        technique = line.substring(10).trim();
1424
1425                for (String singleTechnique: technique.split(";\\s+")) {
1426                        pdbHeader.setExperimentalTechnique(singleTechnique);
1427                }
1428
1429
1430        }
1431
1432        /** Handler for
1433         * CRYST1 Record Format
1434         * The CRYST1 record presents the unit cell parameters, space group, and Z value.
1435         * If the entry describes a structure determined by a technique other than X-ray crystallography,
1436         * CRYST1 contains a = b = c = 1.0, alpha = beta = gamma = 90 degrees, space group = P 1, and Z =1.
1437         *
1438         * COLUMNS DATA TYPE    FIELD          DEFINITION
1439         * -------------------------------------------------------------
1440         *  1 - 6  Record name  "CRYST1"
1441         *  7 - 15 Real(9.3)    a              a (Angstroms).
1442         * 16 - 24 Real(9.3)    b              b (Angstroms).
1443         * 25 - 33 Real(9.3)    c              c (Angstroms).
1444         * 34 - 40 Real(7.2)    alpha          alpha (degrees).
1445         * 41 - 47 Real(7.2)    beta           beta (degrees).
1446         * 48 - 54 Real(7.2)    gamma          gamma (degrees).
1447         * 56 - 66 LString      sGroup         Space group.
1448         * 67 - 70 Integer      z              Z value.
1449         *
1450         */
1451
1452        private void pdb_CRYST1_Handler(String line) {
1453                // for badly formatted files (e.g. phenix-produced ones), there's no z and the min length is 63
1454                if (line.length() < 63) {
1455                        logger.warn("CRYST1 record has fewer than 63 columns: will ignore it");
1456                        return;
1457                }
1458
1459                float a;
1460                float b;
1461                float c;
1462                float alpha;
1463                float beta;
1464                float gamma;
1465                String spaceGroup = "";
1466
1467                try {
1468                        a = Float.parseFloat(line.substring(6,15).trim());
1469                        b = Float.parseFloat(line.substring(15,24).trim());
1470                        c = Float.parseFloat(line.substring(24,33).trim());
1471                        alpha = Float.parseFloat(line.substring(33,40).trim());
1472                        beta = Float.parseFloat(line.substring(40,47).trim());
1473                        gamma = Float.parseFloat(line.substring(47,54).trim());
1474                } catch (NumberFormatException e) {
1475                        logger.info("could not parse CRYST1 record ("+e.getMessage()+") from line and ignoring it " + line);
1476                        return ;
1477                }
1478                if (line.length()>=66) {
1479                        // for well formatted files
1480                        spaceGroup = line.substring(55,66).trim();
1481                } else {
1482                        // for not-so-well formatted files, e.g. phenix-produced ones: they lack a Z value
1483                        spaceGroup = line.substring(55,line.length()).trim();
1484                }
1485
1486                CrystalCell xtalCell = new CrystalCell();
1487                xtalCell.setA(a);
1488                xtalCell.setB(b);
1489                xtalCell.setC(c);
1490                xtalCell.setAlpha(alpha);
1491                xtalCell.setBeta(beta);
1492                xtalCell.setGamma(gamma);
1493
1494                if (!xtalCell.isCellReasonable()) {
1495                        // If the entry describes a structure determined by a technique other than X-ray crystallography,
1496                    // CRYST1 contains a = b = c = 1.0, alpha = beta = gamma = 90 degrees, space group = P 1, and Z =1.
1497                        // if so we don't add the crystal cell and it remains null
1498                        logger.debug("The crystal cell read from file does not have reasonable dimensions (at least one dimension is below {}), discarding it.",
1499                                        CrystalCell.MIN_VALID_CELL_SIZE);
1500                } else {
1501                        crystallographicInfo.setCrystalCell(xtalCell);
1502                }
1503
1504                SpaceGroup sg = SymoplibParser.getSpaceGroup(spaceGroup);
1505                if (sg==null) {
1506                        logger.warn("Space group '"+spaceGroup+"' not recognised as a standard space group");
1507                        crystallographicInfo.setNonStandardSg(true);
1508                } else {
1509                        crystallographicInfo.setSpaceGroup(sg);
1510                        crystallographicInfo.setNonStandardSg(false);
1511                }
1512        }
1513
1514        /**
1515         * Handler for MTRIXn records. They specify extra NCS operators (usually in virus entries)
1516         *
1517         * See http://www.wwpdb.org/documentation/format33/sect8.html#MTRIXn
1518         *
1519         * COLUMNS        DATA TYPE     FIELD         DEFINITION
1520         * -------------------------------------------------------------
1521         *
1522         *  1 -  6        Record name   "MTRIXn"      n=1, 2, or 3
1523         *  8 - 10        Integer       serial        Serial number.
1524         * 11 - 20        Real(10.6)    m[n][1]       Mn1
1525         * 21 - 30        Real(10.6)    m[n][2]       Mn2
1526         * 31 - 40        Real(10.6)    m[n][3]       Mn3
1527         * 46 - 55        Real(10.5)    v[n]          Vn
1528         * 60             Integer       iGiven        1
1529         *
1530         * Note that we ignore operators with iGiven==1
1531         *
1532         * @param line
1533         */
1534        private void pdb_MTRIXn_Handler(String line) {
1535
1536                // don't process incomplete records
1537                if (line.length() < 60) {
1538                        logger.info("MTRIXn record has fewer than 60 columns: will ignore it");
1539                        return;
1540                }
1541
1542
1543                try {
1544
1545                        int rowIndex = Integer.parseInt(line.substring(5,6));
1546                        double col1Value = Double.parseDouble(line.substring(10,20));
1547                        double col2Value = Double.parseDouble(line.substring(20,30));
1548                        double col3Value = Double.parseDouble(line.substring(30,40));
1549                        double translValue = Double.parseDouble(line.substring(45,55));
1550                        int iGiven = 0;
1551                        if (!line.substring(59,60).trim().equals("")) {
1552                                iGiven = Integer.parseInt(line.substring(59,60));
1553                        }
1554
1555                        if (iGiven == 1) return;
1556
1557                        if (ncsOperators==null) {
1558                                // we initialise on first pass
1559                                ncsOperators = new ArrayList<Matrix4d>();
1560                        }
1561
1562                        if (currentNcsOp==null) {
1563                                currentNcsOp = new Matrix4d(1,0,0,0,  0,1,0,0,  0,0,1,0,  0,0,0,1); // initialised to identity
1564                        }
1565
1566                        currentNcsOp.setElement(rowIndex-1, 0, col1Value);
1567                        currentNcsOp.setElement(rowIndex-1, 1, col2Value);
1568                        currentNcsOp.setElement(rowIndex-1, 2, col3Value);
1569                        currentNcsOp.setElement(rowIndex-1, 3, translValue);
1570
1571
1572                        if (rowIndex==3) {
1573                                ncsOperators.add(currentNcsOp);
1574                                // we initialise for next matrix to come
1575                                currentNcsOp = new Matrix4d(1,0,0,0,  0,1,0,0,  0,0,1,0,  0,0,0,1); // initialised to identity
1576                        }
1577
1578                } catch (NumberFormatException e) {
1579                        logger.info("Could not parse a number in MTRIXn record ("+e.getMessage()+") from line: >" + line+"<");
1580                }
1581        }
1582
1583        /**
1584         * Decides whether or not a Group is qualified to be added to the
1585         * Structure.hetGroups list. If it likes it, it adds it.
1586         * @param group
1587         */
1588        private void addTohetGroupsDecider(Group group) {
1589                boolean wanted = false;
1590                //these are HET groups, but they are usually less interesting
1591                //than other types
1592                if (group.getPDBName().equals("HOH"))
1593                        return;
1594                if (group.getChemComp() == null) {
1595                        if (group.getType().equals(GroupType.HETATM)) {
1596                                wanted = true;
1597                        }
1598                } else if (!group.getChemComp().isStandard()) {
1599                        //also want to add modified amino acids e.g. TYS
1600                        //these are GroupType.AMINOACID, so we need to check the ChemComp
1601                        wanted = true;
1602                }
1603
1604                if (wanted) {
1605                        if (! structure.getHetGroups().contains(group)) {
1606                                //                    System.out.println("Added " + group + " to structure.hetgroups");
1607                                structure.getHetGroups().add(group);
1608                        }
1609                }
1610        }
1611
1612        /**
1613         Handler for
1614         ATOM Record Format
1615         *
1616         * <pre>
1617         * ATOM      1  N   ASP A  15     110.964  24.941  59.191  1.00 83.44           N
1618         *
1619         * COLUMNS        DATA TYPE       FIELD         DEFINITION
1620         * ---------------------------------------------------------------------------------
1621         * 1 -  6        Record name     "ATOM  "
1622         * 7 - 11        Integer         serial        Atom serial number.
1623         * 13 - 16        Atom            name          Atom name.
1624         * 17             Character       altLoc        Alternate location indicator.
1625         * 18 - 20        Residue name    resName       Residue name.
1626         * 22             Character       chainID       Chain identifier.
1627         * 23 - 26        Integer         resSeq        Residue sequence number.
1628         * 27             AChar           iCode         Code for insertion of residues.
1629         * 31 - 38        Real(8.3)       x             Orthogonal coordinates for X in Angstroms.
1630         * 39 - 46        Real(8.3)       y             Orthogonal coordinates for Y in Angstroms.
1631         * 47 - 54        Real(8.3)       z             Orthogonal coordinates for Z in Angstroms.
1632         * 55 - 60        Real(6.2)       occupancy     Occupancy.
1633         * 61 - 66        Real(6.2)       tempFactor    Temperature factor.
1634         * 73 - 76        LString(4)      segID         Segment identifier, left-justified.
1635         * 77 - 78        LString(2)      element       Element symbol, right-justified.
1636         * 79 - 80        LString(2)      charge        Charge on the atom.
1637         * </pre>
1638         */
1639        private void  pdb_ATOM_Handler(String line)     {
1640                // build up chains first.
1641                // headerOnly just goes down to chain resolution.
1642
1643                if ( params.isHeaderOnly())
1644                        return;
1645
1646                boolean startOfNewChain = false;
1647
1648                String chain_id      = line.substring(21,22);
1649
1650                if (current_chain == null) {
1651                        current_chain = new ChainImpl();
1652                        current_chain.setChainID(chain_id);
1653                        startOfNewChain = true;
1654                        current_model.add(current_chain);
1655                }
1656
1657
1658                if ( ! chain_id.equals(current_chain.getChainID()) ) {
1659
1660                        startOfNewChain = true;
1661
1662                        // end up old chain...
1663                        current_chain.addGroup(current_group);
1664
1665                        // see if old chain is known ...
1666                        Chain testchain ;
1667                        testchain = isKnownChain(current_chain.getChainID(),current_model);
1668
1669                        //System.out.println("trying to re-using known chain " + current_chain.getName() + " " + chain_id);
1670                        if ( testchain != null && testchain.getChainID().equals(chain_id)){
1671                                //System.out.println("re-using known chain " + current_chain.getName() + " " + chain_id);
1672
1673                        } else {
1674
1675                                testchain = isKnownChain(chain_id,current_model);
1676                        }
1677
1678                        if ( testchain == null) {
1679                                //System.out.println("unknown chain. creating new chain.");
1680
1681                                current_chain = new ChainImpl();
1682                                current_chain.setChainID(chain_id);
1683
1684                        }   else {
1685                                current_chain = testchain;
1686                        }
1687
1688                        if ( ! current_model.contains(current_chain))
1689                                current_model.add(current_chain);
1690
1691
1692                }
1693
1694                // process group data:
1695                // join residue numbers and insertion codes together
1696                String recordName     = line.substring (0, 6).trim ();
1697
1698                String groupCode3     = line.substring(17,20).trim();
1699                // pdbCode is the old way of doing things...it's a concatenation
1700                //of resNum and iCode which are now defined explicitly
1701                String resNum  = line.substring(22,26).trim();
1702                Character iCode = line.substring(26,27).charAt(0);
1703                if ( iCode == ' ')
1704                        iCode = null;
1705                ResidueNumber residueNumber = new ResidueNumber(chain_id, Integer.valueOf(resNum), iCode);
1706
1707                //recordName      groupCode3
1708                //|                |    resNum
1709                //|                |    |   iCode
1710                //|     |          | |  |   ||
1711                //ATOM      1  N   ASP A  15     110.964  24.941  59.191  1.00 83.44           N
1712                //ATOM   1964  N   ARG H 221A      5.963 -16.715  27.669  1.00 28.59           N
1713
1714                Character aminoCode1 = null;
1715
1716                if ( recordName.equals("ATOM") ){
1717                        aminoCode1 = StructureTools.get1LetterCode(groupCode3);
1718                } else {
1719                        // HETATOM RECORDS are treated slightly differently
1720                        // some modified amino acids that we want to treat as amino acids
1721                        // can be found as HETATOM records
1722                        aminoCode1 = StructureTools.get1LetterCode(groupCode3);
1723                        if ( aminoCode1 != null)
1724                                if ( aminoCode1.equals(StructureTools.UNKNOWN_GROUP_LABEL))
1725                                        aminoCode1 = null;
1726                }
1727
1728                if (current_group == null) {
1729
1730                        current_group = getNewGroup(recordName,aminoCode1,groupCode3);
1731
1732                        //if ((current_group instanceof AminoAcidImpl) && groupCode3.length()!=3) {
1733                        //      throw new PDBParseException("amino acid name is not of length 3! (" + groupCode3 +")");
1734                        //}
1735                        current_group.setPDBName(groupCode3);
1736                        current_group.setResidueNumber(residueNumber);
1737                        //                                              System.out.println("Made new group: " + groupCode3 + " " + resNum + " " + iCode);
1738                        addTohetGroupsDecider(current_group);
1739                }
1740
1741
1742                if ( startOfNewChain) {
1743                        //System.out.println("end of chain: "+current_chain.getName()+" >"+chain_id+"<");
1744
1745                        current_group = getNewGroup(recordName,aminoCode1,groupCode3);
1746
1747                        //if ((current_group instanceof AminoAcidImpl) && groupCode3.length()!=3) {
1748                        //      throw new PDBParseException("amino acid name is not of length 3! (" + groupCode3 +")");
1749                        //}
1750                        current_group.setPDBName(groupCode3);
1751                        current_group.setResidueNumber(residueNumber);
1752                        addTohetGroupsDecider(current_group);
1753                        //                        System.out.println("Made new start of chain group:  " + groupCode3 + " " + resNum + " " + iCode);
1754                }
1755
1756
1757                Character altLoc   = new Character(line.substring (16, 17).charAt(0));
1758                Group altGroup = null;
1759
1760                //System.out.println(current_group + " " + residueNumber);
1761
1762                // check if residue number is the same ...
1763                // insertion code is part of residue number
1764                if ( ! residueNumber.equals(current_group.getResidueNumber())) {
1765
1766                        current_chain.addGroup(current_group);
1767                        current_group.trimToSize();
1768
1769                        current_group = getNewGroup(recordName,aminoCode1,groupCode3);
1770
1771                        //if ((current_group instanceof AminoAcidImpl) && groupCode3.length()!=3) {
1772                        //      throw new PDBParseException("amino acid name is not of length 3! (" + groupCode3 +")");
1773                        //}
1774                        current_group.setPDBName(groupCode3);
1775                        current_group.setResidueNumber(residueNumber);
1776                        addTohetGroupsDecider(current_group);
1777                        //                        System.out.println("Made new group:  " + groupCode3 + " " + resNum + " " + iCode);
1778
1779                } else {
1780                        // same residueNumber, but altLocs...
1781
1782                        // test altLoc
1783                        if ( ! altLoc.equals(' ')) {
1784                                logger.debug("found altLoc! " + current_group + " " + altGroup);
1785                                altGroup = getCorrectAltLocGroup( altLoc,recordName,aminoCode1,groupCode3);
1786                                if ( altGroup.getChain() == null) {
1787                                        // need to set current chain
1788                                        altGroup.setChain(current_chain);
1789                                }
1790
1791                        }
1792                }
1793
1794                atomCount++;
1795
1796                if ( atomCount == my_ATOM_CA_THRESHOLD ) {
1797                        // throw away the SEQRES lines - too much to deal with...
1798                        logger.warn("more than " + my_ATOM_CA_THRESHOLD + " atoms in this structure, ignoring the SEQRES lines");
1799                        seqResChains.clear();
1800
1801                        switchCAOnly();
1802
1803                }
1804
1805
1806
1807                if ( atomCount == load_max_atoms){
1808                        logger.warn("too many atoms (>"+load_max_atoms+"in this protein structure.");
1809                        logger.warn("ignoring lines after: " + line);
1810                        return;
1811                }
1812                if ( atomCount > load_max_atoms){
1813                        //System.out.println("too many atoms in this protein structure.");
1814                        //System.out.println("ignoring line: " + line);
1815                        return;
1816                }
1817
1818
1819                //          1         2         3         4         5         6
1820                //012345678901234567890123456789012345678901234567890123456789
1821                //ATOM      1  N   MET     1      20.154  29.699   5.276   1.0
1822                //ATOM    112  CA  ASP   112      41.017  33.527  28.371  1.00  0.00
1823                //ATOM     53  CA  MET     7      23.772  33.989 -21.600  1.00  0.00           C
1824                //ATOM    112  CA  ASP   112      37.613  26.621  33.571     0     0
1825
1826
1827                String fullname = line.substring (12, 16);
1828
1829                // check for CA only if requested
1830                if ( parseCAonly ){
1831                        // yes , user wants to get CA only
1832                        // only parse CA atoms...
1833                        if (! fullname.equals(" CA ")){
1834                                //System.out.println("ignoring " + line);
1835                                atomCount--;
1836                                return;
1837                        }
1838                }
1839
1840                if ( params.getAcceptedAtomNames() != null) {
1841
1842                        boolean found = false;
1843                        for (String ok : params.getAcceptedAtomNames()){
1844                                //System.out.println(ok + "< >" + fullname +"<");
1845
1846                                if ( ok.equals(fullname.trim())) {
1847                                        found = true;
1848                                        break;
1849                                }
1850                        }
1851                        if ( ! found) {
1852                                atomCount--;
1853                                return;
1854                        }
1855                }
1856                // create new atom
1857
1858                int pdbnumber = Integer.parseInt (line.substring (6, 11).trim ());
1859                AtomImpl atom = new AtomImpl() ;
1860                atom.setPDBserial(pdbnumber) ;
1861
1862                atom.setAltLoc(altLoc);
1863                atom.setName(fullname.trim());
1864
1865                double x = Double.parseDouble (line.substring (30, 38).trim());
1866                double y = Double.parseDouble (line.substring (38, 46).trim());
1867                double z = Double.parseDouble (line.substring (46, 54).trim());
1868
1869                double[] coords = new double[3];
1870                coords[0] = x ;
1871                coords[1] = y ;
1872                coords[2] = z ;
1873                atom.setCoords(coords);
1874
1875                float occu  = 1.0f;
1876                if ( line.length() > 59 ) {
1877                        try {
1878                                // occu and tempf are sometimes not used :-/
1879                                occu = Float.parseFloat (line.substring (54, 60).trim());
1880                        }  catch (NumberFormatException e){}
1881                }
1882
1883                float tempf = 0.0f;
1884                if ( line.length() > 65) {
1885                        try {
1886                                tempf = Float.parseFloat (line.substring (60, 66).trim());
1887                        }  catch (NumberFormatException e){}
1888                }
1889
1890                atom.setOccupancy(  occu  );
1891                atom.setTempFactor( tempf );
1892
1893
1894
1895
1896                // Parse element from the element field. If this field is
1897                // missing (i.e. misformatted PDB file), then parse the
1898                // name from the atom name.
1899                Element element = Element.R;
1900                if ( line.length() > 77 ) {
1901                        // parse element from element field
1902                        try {
1903                                element = Element.valueOfIgnoreCase(line.substring (76, 78).trim());
1904                        }  catch (IllegalArgumentException e){}
1905                } else {
1906                        // parse the name from the atom name
1907                        String elementSymbol = null;
1908                        // for atom names with 4 characters, the element is
1909                        // at the first position, example HG23 in Valine
1910                        if (fullname.trim().length() == 4) {
1911                                elementSymbol = fullname.substring(0, 1);
1912                        } else if ( fullname.trim().length() > 1){
1913                                elementSymbol = fullname.substring(0, 2).trim();
1914                        } else {
1915                                // unknown element...
1916                                elementSymbol = "R";
1917                        }
1918
1919                        try {
1920                                element = Element.valueOfIgnoreCase(elementSymbol);
1921                        }  catch (IllegalArgumentException e){}
1922                }
1923                atom.setElement(element);
1924
1925
1926                //see if chain_id is one of the previous chains ...
1927                if ( altGroup != null) {
1928                        altGroup.addAtom(atom);
1929                        altGroup = null;
1930                }
1931                else {
1932                        current_group.addAtom(atom);
1933                }
1934
1935
1936                // make sure that main group has all atoms
1937                // GitHub issue: #76
1938                if ( ! current_group.hasAtom(atom.getName())) {
1939                        current_group.addAtom(atom);
1940                }
1941
1942
1943
1944                //System.out.println("current group: " + current_group);
1945                        }
1946
1947
1948        private Group getCorrectAltLocGroup( Character altLoc,
1949                        String recordName, Character aminoCode1, String groupCode3) {
1950
1951                // see if we know this altLoc already;
1952                List<Atom> atoms = current_group.getAtoms();
1953                if ( atoms.size() > 0) {
1954                        Atom a1 = atoms.get(0);
1955                        // we are just adding atoms to the current group
1956                        // probably there is a second group following later...
1957                        if (a1.getAltLoc().equals(altLoc)) {
1958
1959                                return current_group;
1960                        }
1961                }
1962
1963                List<Group> altLocs = current_group.getAltLocs();
1964                for ( Group altLocG : altLocs ){
1965                        atoms = altLocG.getAtoms();
1966                        if ( atoms.size() > 0) {
1967                                for ( Atom a1 : atoms) {
1968                                        if (a1.getAltLoc().equals( altLoc)) {
1969
1970                                                return altLocG;
1971                                        }
1972                                }
1973                        }
1974                }
1975
1976                // no matching altLoc group found.
1977                // build it up.
1978
1979                if ( groupCode3.equals(current_group.getPDBName())) {
1980                        if ( current_group.getAtoms().size() == 0) {
1981                                //System.out.println("current group is empty " + current_group + " " + altLoc);
1982                                return current_group;
1983                        }
1984                        //System.out.println("cloning current group " + current_group + " " + current_group.getAtoms().get(0).getAltLoc() + " altLoc " + altLoc);
1985                        Group altLocG = (Group) current_group.clone();
1986                        // drop atoms from cloned group...
1987                        // https://redmine.open-bio.org/issues/3307
1988                        altLocG.setAtoms(new ArrayList<Atom>());
1989                        altLocG.getAltLocs().clear();
1990                        current_group.addAltLoc(altLocG);
1991                        return altLocG;
1992                }
1993
1994                //      System.out.println("new  group " + recordName + " " + aminoCode1 + " " +groupCode3);
1995                Group altLocG = getNewGroup(recordName,aminoCode1,groupCode3);
1996
1997
1998                altLocG.setPDBName(groupCode3);
1999
2000                altLocG.setResidueNumber(current_group.getResidueNumber());
2001                current_group.addAltLoc(altLocG);
2002                return altLocG;
2003        }
2004
2005        private void switchCAOnly(){
2006                parseCAonly = true;
2007
2008
2009                current_model = CAConverter.getRepresentativeAtomsOnly(current_model);
2010
2011                for ( int i =0; i< structure.nrModels() ; i++){
2012                        //  iterate over all known models ...
2013                        List<Chain> model = structure.getModel(i);
2014                        model = CAConverter.getRepresentativeAtomsOnly(model);
2015                        structure.setModel(i,model);
2016                }
2017
2018                current_chain = CAConverter.getRepresentativeAtomsOnly(current_chain);
2019
2020        }
2021
2022
        /** Saves repeating a few lines: reads one fixed-width integer field of a CONECT record. */
2024        private Integer conect_helper (String line,int start,int end) {
2025                if (line.length() < end) return null;
2026                
2027                String sbond = line.substring(start,end).trim();
2028                int bond  = -1 ;
2029                Integer b = null ;
2030
2031                if ( ! sbond.equals("")) {
2032                        bond = Integer.parseInt(sbond);
2033                        b = new Integer(bond);
2034                }
2035
2036                return b ;
2037        }
2038
2039        /**
2040         Handler for
2041         CONECT Record Format
2042
2043         COLUMNS         DATA TYPE        FIELD           DEFINITION
2044         ---------------------------------------------------------------------------------
2045         1 -  6         Record name      "CONECT"
2046         7 - 11         Integer          serial          Atom serial number
2047         12 - 16         Integer          serial          Serial number of bonded atom
2048         17 - 21         Integer          serial          Serial number of bonded atom
2049         22 - 26         Integer          serial          Serial number of bonded atom
2050         27 - 31         Integer          serial          Serial number of bonded atom
2051         32 - 36         Integer          serial          Serial number of hydrogen bonded
2052         atom
2053         37 - 41         Integer          serial          Serial number of hydrogen bonded
2054         atom
2055         42 - 46         Integer          serial          Serial number of salt bridged
2056         atom
2057         47 - 51         Integer          serial          Serial number of hydrogen bonded
2058         atom
2059         52 - 56         Integer          serial          Serial number of hydrogen bonded
2060         atom
2061         57 - 61         Integer          serial          Serial number of salt bridged
2062         atom
2063         */
2064        private void pdb_CONECT_Handler(String line) {
2065                //System.out.println(line);
2066                // this try .. catch is e.g. to catch 1gte which has wrongly formatted lines...
2067                if ( atomOverflow) {
2068                        return ;
2069                }
2070                if (params.isHeaderOnly()) {
2071                        return;
2072                }
2073                try {
2074                        int atomserial = Integer.parseInt (line.substring(6 ,11).trim());
2075                        Integer bond1      = conect_helper(line,11,16);
2076                        Integer bond2      = conect_helper(line,16,21);
2077                        Integer bond3      = conect_helper(line,21,26);
2078                        Integer bond4      = conect_helper(line,26,31);
2079                        Integer hyd1       = conect_helper(line,31,36);
2080                        Integer hyd2       = conect_helper(line,36,41);
2081                        Integer salt1      = conect_helper(line,41,46);
2082                        Integer hyd3       = conect_helper(line,46,51);
2083                        Integer hyd4       = conect_helper(line,51,56);
2084                        Integer salt2      = conect_helper(line,56,61);
2085
2086                        //System.out.println(atomserial+ " "+ bond1 +" "+bond2+ " " +bond3+" "+bond4+" "+
2087                        //                 hyd1+" "+hyd2 +" "+salt1+" "+hyd3+" "+hyd4+" "+salt2);
2088                        HashMap<String, Integer> cons = new HashMap<String, Integer>();
2089                        cons.put("atomserial",new Integer(atomserial));
2090
2091                        if ( bond1 != null) cons.put("bond1",bond1);
2092                        if ( bond2 != null) cons.put("bond2",bond2);
2093                        if ( bond3 != null) cons.put("bond3",bond3);
2094                        if ( bond4 != null) cons.put("bond4",bond4);
2095                        if ( hyd1  != null) cons.put("hydrogen1",hyd1);
2096                        if ( hyd2  != null) cons.put("hydrogen2",hyd2);
2097                        if ( salt1 != null) cons.put("salt1",salt1);
2098                        if ( hyd3  != null) cons.put("hydrogen3",hyd3);
2099                        if ( hyd4  != null) cons.put("hydrogen4",hyd4);
2100                        if ( salt2 != null) cons.put("salt2",salt2);
2101
2102                        connects.add(cons);
2103                } catch (NumberFormatException e){
2104                        logger.info("could not parse CONECT line correctly ("+e.getMessage()+"), at line : " + line);
2105                        return;
2106                }
2107        }
2108
2109        /**
2110         Handler for
2111         MODEL Record Format
2112
2113         COLUMNS       DATA TYPE      FIELD         DEFINITION
2114         ----------------------------------------------------------------------
2115         1 -  6       Record name    "MODEL "
2116         11 - 14       Integer        serial        Model serial number.
2117         */
2118        private void pdb_MODEL_Handler(String line) {
2119
2120                if (params.isHeaderOnly()) return;
2121
2122                // check beginning of file ...
2123                if (current_chain != null) {
2124                        if (current_group != null) {
2125                                current_chain.addGroup(current_group);
2126                                current_group.trimToSize();
2127                        }
2128
2129                        Chain ch = isKnownChain(current_chain.getChainID(),current_model) ;
2130                        if ( ch == null ) {
2131                                current_model.add(current_chain);
2132                        }
2133
2134                        structure.addModel(current_model);
2135                        current_model = new ArrayList<Chain>();
2136                        current_chain = null;
2137                        current_group = null;
2138                }
2139
2140        }
2141
2142
2143        /**
2144         * COLUMNS       DATA TYPE          FIELD          DEFINITION
2145         * ----------------------------------------------------------------
2146         *  1 - 6        Record name        "DBREF "
2147         *  8 - 11       IDcode             idCode         ID code of this entry.
2148         * 13            Character          chainID        Chain identifier.
2149         * 15 - 18       Integer            seqBegin       Initial sequence number
2150         *                                                 of the PDB sequence segment.
2151         * 19            AChar              insertBegin    Initial insertion code
2152         *                                                 of the PDB sequence segment.
2153         * 21 - 24       Integer            seqEnd         Ending sequence number
2154         *                                                 of the PDB sequence segment.
2155         * 25            AChar              insertEnd      Ending insertion code
2156         *                                                 of the PDB sequence segment.
2157         * 27 - 32       LString            database       Sequence database name.
2158         * 34 - 41       LString            dbAccession    Sequence database accession code.
2159         * 43 - 54      LString            dbIdCode        Sequence database
2160         *                                                 identification code.
2161         * 56 - 60      Integer            dbseqBegin      Initial sequence number of the
2162         *                                                 database seqment.
2163         * 61           AChar              idbnsBeg        Insertion code of initial residue
2164         *                                                 of the segment, if PDB is the
2165         *                                                 reference.
2166         * 63 - 67      Integer            dbseqEnd        Ending sequence number of the
2167         *                                                 database segment.
2168         * 68           AChar              dbinsEnd        Insertion code of the ending
2169         *                                                 residue of the segment, if PDB is
2170         *                                                 the reference.
2171         */
2172        private void pdb_DBREF_Handler(String line){
2173
2174                logger.debug("Parsing DBREF " + line);
2175
2176                DBRef dbref = new DBRef();
2177                String idCode      = line.substring(7,11);
2178                String chainId     = line.substring(12,13);
2179                String seqBegin    = line.substring(14,18);
2180                String insertBegin = line.substring(18,19);
2181                String seqEnd      = line.substring(20,24);
2182                String insertEnd   = line.substring(24,25);
2183                String database    = line.substring(26,32);
2184                String dbAccession = line.substring(33,41);
2185                String dbIdCode    = line.substring(42,54);
2186                String dbseqBegin  = line.substring(55,60);
2187                String idbnsBeg    = line.substring(60,61);
2188                String dbseqEnd    = line.substring(62,67);
2189                // Support implicit space character at end
2190                String dbinsEnd;
2191                if(line.length() >= 68)
2192                        dbinsEnd       = line.substring(67,68);
2193                else
2194                        dbinsEnd       = " ";
2195
2196                dbref.setIdCode(idCode);
2197                dbref.setChainId(chainId);
2198                dbref.setSeqBegin(intFromString(seqBegin));
2199                dbref.setInsertBegin(insertBegin.charAt(0));
2200                dbref.setSeqEnd(intFromString(seqEnd));
2201                dbref.setInsertEnd(insertEnd.charAt(0));
2202                dbref.setDatabase(database.trim());
2203                dbref.setDbAccession(dbAccession.trim());
2204                dbref.setDbIdCode(dbIdCode.trim());
2205                dbref.setDbSeqBegin(intFromString(dbseqBegin));
2206                dbref.setIdbnsBegin(idbnsBeg.charAt(0));
2207                dbref.setDbSeqEnd(intFromString(dbseqEnd));
2208                dbref.setIdbnsEnd(dbinsEnd.charAt(0));
2209
2210                //System.out.println(dbref.toPDB());
2211                dbrefs.add(dbref);
2212        }
2213
2214        /*
2215         * For each het group that appears in the entry, the wwPDB checks that the corresponding HET, HETNAM, HETSYN, FORMUL, HETATM, and CONECT records appear, if applicable. The HET record is generated automatically using the Chemical Component Dictionary and information from the HETATM records.
2216
2217         * Record Format
2218         *
2219         * <pre>
2220         * COLUMNS       DATA  TYPE     FIELD         DEFINITION
2221         * ---------------------------------------------------------------------------------
2222         *  1 -  6       Record name   "HET   "
2223         *  8 - 10       LString(3)    hetID          Het identifier, right-justified.
2224         * 13            Character     ChainID        Chain  identifier.
2225         * 14 - 17       Integer       seqNum         Sequence  number.
2226         * 18            AChar         iCode          Insertion  code.
2227         * 21 - 25       Integer       numHetAtoms    Number of HETATM records for the group
2228         *                                            present in the entry.
2229         * 31 - 70       String        text           Text describing Het group.
2230         *
2231         * Each unique hetID represents a unique molecule.
2232         *
2233         * Relationships to Other Record Types
2234         *
2235         * For each het group that appears in the entry, there must be corresponding HET, HETNAM, HETSYN, FORMUL,HETATM, and CONECT records. LINK records may also be created.
2236         *
2237         * Example
2238         *
2239         *          1         2         3         4         5         6         7         8
2240         * 12345678901234567890123456789012345678901234567890123456789012345678901234567890
2241         * HET    TRS    975       8
2242         *
2243         * HET    UDP  A1457      25
2244         * HET    B3P  A1458      19
2245         *
2246         * HET    NAG  Y   3      15
2247         * HET    FUC  Y   4      10
2248         * HET    NON  Y   5      12
2249         * HET    UNK  A 161       1
2250         * </pre>
2251         *
2252         * Heterogen sections are HET, HETNAM, HETSYN, FORMUL
2253         * @see http://www.wwpdb.org/documentation/format32/sect4.html
2254         */
2255        //private void pdb_HET_handler(String line) {
2256
2257        //}
2258
2259        /**
2260         * Process the disulfide bond info provided by an SSBOND record
2261         *
2262         *
2263        COLUMNS        DATA TYPE       FIELD         DEFINITION
2264        -------------------------------------------------------------------
2265         1 -  6        Record name     "SSBOND"
2266         8 - 10        Integer         serNum       Serial number.
2267        12 - 14        LString(3)      "CYS"        Residue name.
2268        16             Character       chainID1     Chain identifier.
2269        18 - 21        Integer         seqNum1      Residue sequence number.
2270        22             AChar           icode1       Insertion code.
2271        26 - 28        LString(3)      "CYS"        Residue name.
2272        30             Character       chainID2     Chain identifier.
2273        32 - 35        Integer         seqNum2      Residue sequence number.
2274        36             AChar           icode2       Insertion code.
2275        60 - 65        SymOP           sym1         Symmetry oper for 1st resid
2276        67 - 72        SymOP           sym2         Symmetry oper for 2nd resid
2277         */
2278        private void pdb_SSBOND_Handler(String line){
2279
2280                if (params.isHeaderOnly()) return;
2281
2282                if (line.length()<36) {
2283                        logger.info("SSBOND line has length under 36. Ignoring it.");
2284                        return;
2285                }
2286
2287                String chain1      = line.substring(15,16);
2288                String seqNum1     = line.substring(17,21).trim();
2289                String icode1      = line.substring(21,22);
2290                String chain2      = line.substring(29,30);
2291                String seqNum2     = line.substring(31,35).trim();
2292                String icode2      = line.substring(35,36);
2293
2294                if (line.length()>=72) {
2295                        String symop1 = line.substring(59, 65).trim();
2296                        String symop2 = line.substring(66, 72).trim();
2297
2298                        // until we implement proper treatment of symmetry in biojava #220, we can't deal with sym-related parteners properly, skipping them
2299                        if (!symop1.equals("") && !symop2.equals("") && // in case the field is missing
2300                                        (!symop1.equals("1555") || !symop2.equals("1555")) ) {
2301                                logger.info("Skipping ss bond between groups {} and {} belonging to different symmetry partners, because it is not supported yet", seqNum1+icode1, seqNum2+icode2);
2302                                return;
2303                        }
2304                }
2305
2306                if (icode1.equals(" "))
2307                        icode1 = "";
2308                if (icode2.equals(" "))
2309                        icode2 = "";
2310
2311                SSBondImpl ssbond = new SSBondImpl();
2312
2313                ssbond.setChainID1(chain1);
2314                ssbond.setResnum1(seqNum1);
2315                ssbond.setChainID2(chain2);
2316                ssbond.setResnum2(seqNum2);
2317                ssbond.setInsCode1(icode1);
2318                ssbond.setInsCode2(icode2);
2319                ssbonds.add(ssbond);
2320        }
2321
2322
2323        /**
2324         * Takes care of LINK records. These take the format of:
2325         *
2326         * <pre>
2327         * COLUMNS        DATA TYPE       FIELD       DEFINITION
2328         * --------------------------------------------------------------------------------
2329         *  1 -  6        Record name     "LINK  "
2330         * 13 - 16        Atom            name1       Atom name.
2331         * 17             Character       altLoc1     Alternate location indicator.
2332         * 18 - 20        Residue name    resName1    Residue name.
2333         * 22             Character       chainID1    Chain identifier.
2334         * 23 - 26        Integer         resSeq1     Residue sequence number.
2335         * 27             AChar           iCode1      Insertion code.
2336         * 43 - 46        Atom            name2       Atom name.
2337         * 47             Character       altLoc2     Alternate location indicator.
2338         * 48 - 50        Residue name    resName2    Residue name.
2339         * 52             Character       chainID2    Chain identifier.
2340         * 53 - 56        Integer         resSeq2     Residue sequence number.
2341         * 57             AChar           iCode2      Insertion code.
2342         * 60 - 65        SymOP           sym1        Symmetry operator for 1st atom.
2343         * 67 - 72        SymOP           sym2        Symmetry operator for 2nd atom.
2344         * </pre>
2345         *
2346         * (From http://www.wwpdb.org/documentation/format32/sect6.html#LINK)
2347         *
2348         * @param line the LINK record line to parse.
2349         */
2350        private void pdb_LINK_Handler(String line) {
2351
2352                if (params.isHeaderOnly()) return;
2353                
2354                // Check for the minimal set of fields.
2355                if (line.length()<56) {
2356                        logger.info("LINK line has length under 56. Ignoring it.");
2357                        return;
2358                }
2359
2360                int len = line.length();
2361                
2362                String name1 = line.substring(12, 16).trim();
2363                String altLoc1 = line.substring(16, 17).trim();
2364                String resName1 = line.substring(17, 20).trim();
2365                String chainID1 = line.substring(21, 22).trim();
2366                String resSeq1 = line.substring(22, 26).trim();
2367                String iCode1 = line.substring(26, 27).trim();
2368
2369                String name2 = line.substring(42, 46).trim();
2370                String altLoc2 = line.substring(46, 47).trim();
2371                String resName2 = line.substring(47, 50).trim();
2372                String chainID2 = line.substring(51, 52).trim();
2373                String resSeq2 = line.substring(52, 56).trim();
2374                String iCode2 = null;  // Might get trimmed if blank.
2375                if (len > 56) iCode2 = line.substring(56, 57).trim();
2376
2377                String sym1 = null;
2378                if (len > 64) sym1 = line.substring(59, 65).trim();
2379                String sym2 = null;
2380                if (len > 71) sym2 = line.substring(66, 72).trim();
2381
2382//              System.err.println("LINK");
2383//              System.err.println("\tName: " + name1);
2384//              System.err.println("\tAlt Loc: " + altLoc1);
2385//              System.err.println("\tRes name: " + resName1);
2386//              System.err.println("\tChain ID: " + chainID1);
2387//              System.err.println("\tRes Seq: " + resSeq1);
2388//              System.err.println("\tIns Code: " + iCode1);
2389//              System.err.println(name1 + "." + altLoc1 + "." + resName1 + "." + chainID1 + "." + resSeq1 + "." + iCode1);
2390//              System.err.println(name2 + "." + altLoc2 + "." + resName2 + "." + chainID2 + "." + resSeq2 + "." + iCode2);
2391//              System.err.println(sym1 + "." + sym2);
2392//              System.err.println();
2393
2394                linkRecords.add(new LinkRecord(
2395                                name1, altLoc1, resName1, chainID1, resSeq1, iCode1,
2396                                name2, altLoc2, resName2, chainID2, resSeq2, iCode2,
2397                                sym1, sym2));
2398        }
2399
2400        /**
2401         * Handler for the SITE records. <br>
2402         *
2403         * <pre>
2404         *
2405         * COLUMNS      DATA TYPE               FIELD           DEFINITION
2406         * ---------------------------------------------------------------------------------
2407         * 1 - 6        Record name     "SITE "
2408         * 8 - 10       Integer                 seqNum          Sequence number.
2409         * 12 - 14      LString(3)              siteID          Site name.
2410         * 16 - 17      Integer                 numRes          Number of residues that compose the siteResidues.
2411         * 19 - 21      Residue name    resName1        Residue name for first residue that
2412         *                                                                              creates the siteResidues.
2413         * 23           Character               chainID1        Chain identifier for first residue of siteResidues.
2414         * 24 - 27      Integer                 seq1            Residue sequence number for first residue
2415         *                                                                              of the siteResidues.
2416         * 28           AChar                   iCode1          Insertion code for first residue of the siteResidues.
2417         *
2418         * example:
2419         *          1         2         3         4         5         6         7         8
2420         * 12345678901234567890123456789012345678901234567890123456789012345678901234567890
2421         * SITE     1 AC1  3 HIS A  94 HIS A   96  HIS A 119
2422         * SITE     1 AC2  5 ASN A  62 GLY A   63  HIS A  64  HOH A 328
2423         * SITE     2 AC2  5 HOH A 634
2424         * SITE     1 AC3  5 GLN A 136 GLN A  137  PRO A 138  GLU A 205
2425         * SITE     2 AC3  5 CYS A 206
2426         * SITE     1 AC4 11 HIS A  64 HIS A   94  HIS A  96  HIS A 119
2427         * SITE     2 AC4 11 LEU A 198 THR A  199  THR A 200  TRP A 209
2428         * SITE     3 AC4 11 HOH A 572 HOH A  582  HOH A 635
2429         * </pre>
2430         * @param line the SITE line record being currently read
2431         * @author Amr AL-Hossary
2432         * @author Jules Jacobsen
2433         */
2434        private void pdb_SITE_Handler(String line){
2435
2436                if (params.isHeaderOnly()) return;
2437
2438                //  make a map of: SiteId to List<ResidueNumber>
2439
2440                logger.debug("Site Line:"+line);
2441
2442
2443                String siteID = line.substring(11, 14);
2444                //fetch the siteResidues from the map
2445                List<ResidueNumber> siteResidues = siteToResidueMap.get(siteID);
2446
2447                //if the siteResidues doesn't yet exist, make a new one.
2448                if (siteResidues == null |! siteToResidueMap.containsKey(siteID.trim())){
2449                        siteResidues = new ArrayList<ResidueNumber>();
2450                        siteToResidueMap.put(siteID.trim(), siteResidues);
2451
2452                        logger.debug(String.format("New Site made: %s %s", siteID,  siteResidues));
2453                        logger.debug("Now made " + siteMap.size() + " sites");
2454
2455                }
2456
2457                logger.debug(String.format("SiteId: %s", siteID));
2458
2459
2460                //line = 'SITE     1 AC1  6 ARG H 221A LYS H 224  HOH H 403  HOH H 460'
2461                //line.substring(18) = 'ARG H 221A LYS H 224  HOH H 403  HOH H 460'
2462                line = line.substring(18);
2463                String groupString = null;
2464                //groupString = 'ARG H 221A'
2465                //keep iterating through chunks of 10 characters - these are the groups in the siteResidues
2466                while (!(groupString = line.substring(0, 10)).equals("          ")) {
2467                        //groupstring: 'ARG H 221A'
2468
2469                        logger.debug("groupString: '" + groupString + "'");
2470
2471                        //set the residue name
2472                        //residueName = 'ARG'
2473                        String residueName = groupString.substring(0, 3);
2474                        Character aminoCode1 = StructureTools.get1LetterCode(residueName);
2475                        if (aminoCode1 != null) {
2476                                if (aminoCode1.equals(StructureTools.UNKNOWN_GROUP_LABEL)) {
2477                                        aminoCode1 = null;
2478                                }
2479                        }
2480
2481                        //this is already in the right format, so no need to fiddle with it...
2482                        //pdbCode = 'H 221A'
2483                        //                    String pdbCode = groupString.substring(4, 10).trim();
2484                        String chainId = groupString.substring(4, 5);
2485                        Integer resNum = Integer.valueOf(groupString.substring(5, 9).trim());
2486                        Character insCode = groupString.substring(9, 10).charAt(0);
2487                        //set insCode to null as a measure to prevent storing thousands of empty Strings
2488                        //- the empty value is returned using Group.getInsCode()
2489                        //                    if (insCode.equals(" ")) {
2490                        //                        insCode = null;
2491                        //                    }
2492
2493                        logger.debug(String.format("Site: %s: 'resName:%s resNum:%s insCode:%s'", siteID, residueName, resNum, insCode));
2494
2495                        //make a new resNum with the data - this will be linked up with a site later
2496                        ResidueNumber residueNumber = new ResidueNumber();
2497
2498
2499                        logger.debug("pdbCode: '" + resNum + insCode + "'");
2500
2501                        residueNumber.setChainId(chainId);
2502                        residueNumber.setSeqNum(resNum);
2503                        residueNumber.setInsCode(insCode);
2504                        //add the resNum to the groups
2505                        siteResidues.add(residueNumber);
2506
2507                        logger.debug("Adding residueNumber " + residueNumber + " to site " + siteID);
2508
2509                        line = line.substring(11);
2510                }
2511
2512                logger.debug("Current SiteMap (contains "+ siteToResidueMap.keySet().size() + " sites):");
2513                for (String key : siteToResidueMap.keySet()) {
2514                        logger.debug(key + " : " + siteToResidueMap.get(key));
2515                }
2516
2517        }
2518
        // Site currently being filled from REMARK 800 records:
        // pdb_REMARK_800_Handler assigns it when it reads a SITE_IDENTIFIER line
        // and sets the evidence code and the description on it when it reads
        // the EVIDENCE_CODE and SITE_DESCRIPTION lines.
        Site site;
2521        private void pdb_REMARK_800_Handler(String line){
2522
2523                if (params.isHeaderOnly()) return;
2524
2525                // 'REMARK 800 SITE_IDENTIFIER: CAT                                                 '
2526                line = line.substring(11);
2527                String[] fields = line.split(": ");
2528
2529                if (fields.length == 2) {
2530                        if (fields[0].equals("SITE_IDENTIFIER")) {
2531                                //                    remark800Counter++;
2532                                String siteID = fields[1].trim();
2533
2534                                logger.debug("siteID: '" + siteID +"'");
2535
2536                                //fetch the siteResidues from the map
2537                                site = siteMap.get(siteID);
2538
2539                                //if the siteResidues doesn't yet exist, make a new one.
2540                                if (site == null || !siteID.equals(site.getSiteID())) {
2541                                        site = new Site(siteID, new ArrayList<Group>());
2542                                        siteMap.put(site.getSiteID(), site);
2543
2544                                        logger.debug("New Site made: " + site);
2545                                        logger.debug("Now made " + siteMap.size() + " sites");
2546
2547                                }
2548                        }
2549                        if (fields[0].equals("EVIDENCE_CODE")) {
2550                                //                    remark800Counter++;
2551                                String evCode = fields[1].trim();
2552
2553                                logger.debug("evCode: '" + evCode +"'");
2554
2555                                //fetch the siteResidues from the map
2556                                site.setEvCode(evCode);
2557                        }
2558                        if (fields[0].equals("SITE_DESCRIPTION")) {
2559                                //                    remark800Counter++;
2560                                String desc = fields[1].trim();
2561
2562                                logger.debug("desc: '" + desc +"'");
2563
2564                                //fetch the siteResidues from the map
2565                                site.setDescription(desc);
2566
2567                                logger.debug("Finished making REMARK 800 for site " + site.getSiteID());
2568                                logger.debug(site.remark800toPDB());
2569
2570                        }
2571                }
2572        }
2573
2574        private int intFromString(String intString){
2575                int val = Integer.MIN_VALUE;
2576                try {
2577                        val = Integer.parseInt(intString.trim());
2578                } catch (NumberFormatException ex){
2579                        logger.info("Could not parse a number: " + ex.getMessage());
2580                }
2581                return val;
2582        }
2583
2584
2585
2586        /** test if the chain is already known (is in current_model
2587         * ArrayList) and if yes, returns the chain
2588         * if no -> returns null
2589         */
2590        private Chain isKnownChain(String chainID, List<Chain> chains){
2591
2592                for (int i = 0; i< chains.size();i++){
2593                        Chain testchain =  chains.get(i);
2594                        //System.out.println("comparing chainID >"+chainID+"< against testchain " + i+" >" +testchain.getName()+"<");
2595                        if (chainID.equals(testchain.getChainID())) {
2596                                //System.out.println("chain "+ chainID+" already known ...");
2597                                return testchain;
2598                        }
2599                }
2600
2601                return null;
2602        }
2603
2604
2605
2606        private BufferedReader getBufferedReader(InputStream inStream)
2607                        throws IOException {
2608
2609                BufferedReader buf ;
2610                if (inStream == null) {
2611                        throw new IOException ("input stream is null!");
2612                }
2613
2614                buf = new BufferedReader (new InputStreamReader (inStream));
2615                return buf ;
2616
2617        }
2618
2619
2620
2621        /**
2622         * Parse a PDB file and return a datastructure implementing
2623         * PDBStructure interface.
2624         *
2625         * @param inStream  an InputStream object
2626         * @return a Structure object
2627         * @throws IOException
2628         */
2629        public Structure parsePDBFile(InputStream inStream)
2630                        throws IOException
2631        {
2632
2633                BufferedReader buf = getBufferedReader(inStream);
2634
2635                return parsePDBFile(buf);
2636
2637        }
2638
2639        /**
2640         * Parse a PDB file and return a datastructure implementing
2641         * PDBStructure interface.
2642         *
2643         * @param buf  a BufferedReader object
2644         * @return the Structure object
2645         * @throws IOException ...
2646         */
2647
2648        public  Structure parsePDBFile(BufferedReader buf)
2649                        throws IOException
2650                        {
2651                // set the correct max values for parsing...
2652                load_max_atoms = params.getMaxAtoms();
2653                my_ATOM_CA_THRESHOLD = params.getAtomCaThreshold();
2654
2655
2656                // (re)set structure
2657
2658                structure     = new StructureImpl() ;
2659                current_model = new ArrayList<Chain>();
2660                seqResChains  = new ArrayList<Chain>();
2661                siteMap = new LinkedHashMap<String, Site>();
2662                current_chain = null           ;
2663                current_group = null           ;
2664                pdbHeader     = new PDBHeader();
2665                connects      = new ArrayList<Map<String,Integer>>();
2666                previousContinuationField = "";
2667                continuationField = "";
2668                continuationString = "";
2669                current_compound = null;
2670                sourceLines.clear();
2671                compndLines.clear();
2672                isLastCompndLine = false;
2673                isLastSourceLine = false;
2674                prevMolId = -1;
2675                compounds.clear();
2676                helixList.clear();
2677                strandList.clear();
2678                turnList.clear();
2679                lengthCheck = -1;
2680                atomCount = 0;
2681                atomOverflow = false;
2682                linkRecords = new ArrayList<LinkRecord>();
2683                siteToResidueMap.clear();
2684
2685                parseCAonly = params.isParseCAOnly();
2686
2687                String line = null;
2688
2689                while ((line = buf.readLine()) != null) {
2690
2691                        // ignore empty lines
2692                        if ( line.equals("") ||
2693                                        (line.equals(NEWLINE))){
2694                                continue;
2695                        }
2696
2697
2698                        // ignore short TER and END lines
2699                        if ( (line.startsWith("TER")) ||
2700                                        (line.startsWith("END"))) {
2701                                continue;
2702                        }
2703
2704                        if ( line.length() < 6) {
2705                                logger.info("Found line length below 6. Ignoring it, line: >" + line +"<" );
2706                                continue;
2707                        }
2708
2709                        String recordName = line.substring (0, 6).trim ();
2710
2711                        try {
2712                                if (recordName.equals("ATOM"))
2713                                        pdb_ATOM_Handler(line);
2714                                else if (recordName.equals("SEQRES"))
2715                                        pdb_SEQRES_Handler(line);
2716                                else if (recordName.equals("HETATM"))
2717                                        pdb_ATOM_Handler(line);
2718                                else if (recordName.equals("MODEL"))
2719                                        pdb_MODEL_Handler(line);
2720                                else if (recordName.equals("HEADER"))
2721                                        pdb_HEADER_Handler(line);
2722                                else if (recordName.equals("AUTHOR"))
2723                                        pdb_AUTHOR_Handler(line);
2724                                else if (recordName.equals("TITLE"))
2725                                        pdb_TITLE_Handler(line);
2726                                else if (recordName.equals("SOURCE"))
2727                                        sourceLines.add(line); //pdb_SOURCE_Handler
2728                                else if (recordName.equals("COMPND"))
2729                                        compndLines.add(line); //pdb_COMPND_Handler
2730                                else if (recordName.equals("JRNL"))
2731                                        pdb_JRNL_Handler(line);
2732                                else if (recordName.equals("EXPDTA"))
2733                                        pdb_EXPDTA_Handler(line);
2734                                else if (recordName.equals("CRYST1"))
2735                                        pdb_CRYST1_Handler(line);
2736                                else if (recordName.startsWith("MTRIX"))
2737                                        pdb_MTRIXn_Handler(line);
2738                                else if (recordName.equals("REMARK"))
2739                                        pdb_REMARK_Handler(line);
2740                                else if (recordName.equals("CONECT"))
2741                                        pdb_CONECT_Handler(line);
2742                                else if (recordName.equals("REVDAT"))
2743                                        pdb_REVDAT_Handler(line);
2744                                else if (recordName.equals("DBREF"))
2745                                        pdb_DBREF_Handler(line);
2746                                else if (recordName.equals("SITE"))
2747                                        pdb_SITE_Handler(line);
2748                                else if (recordName.equals("SSBOND"))
2749                                        pdb_SSBOND_Handler(line);
2750                                else if (recordName.equals("LINK"))
2751                                        pdb_LINK_Handler(line);
2752                                else if ( params.isParseSecStruc()) {
2753                                        if ( recordName.equals("HELIX") ) pdb_HELIX_Handler (  line ) ;
2754                                        else if (recordName.equals("SHEET")) pdb_SHEET_Handler(line ) ;
2755                                        else if (recordName.equals("TURN")) pdb_TURN_Handler(   line ) ;
2756                                }
2757                        } catch (StringIndexOutOfBoundsException ex) {
2758                                logger.warn("Unable to parse [" + line + "]");
2759                        }
2760
2761
2762                }
2763
2764                makeCompounds(compndLines, sourceLines);
2765
2766                triggerEndFileChecks();
2767
2768                if (params.shouldCreateAtomBonds()) {
2769                        formBonds();
2770                }
2771
2772                if ( params.shouldCreateAtomCharges()) {
2773                        addCharges();
2774                }
2775
2776                if ( params.isParseSecStruc() && !params.isHeaderOnly())
2777                        setSecStruc();
2778
2779
2780                return structure;
2781
2782                        }
2783
2784        private void addCharges() {
2785                ChargeAdder.addCharges(structure);
2786        }
2787
2788        /**
2789         * This is the new method for building the COMPND and SOURCE records. Now each method is self-contained.
2790         * @author Jules Jacobsen
2791         * @param  compoundList
2792         * @param  sourceList
2793         */
2794        private void makeCompounds(List<String> compoundList,
2795                        List<String> sourceList) {
2796                //              System.out.println("[makeCompounds] making compounds from compoundLines");
2797
2798                for (String line : compoundList) {
2799                        if (compoundList.indexOf(line) + 1 == compoundList.size()) {
2800                                //                              System.out.println("[makeCompounds] Final line in compoundLines.");
2801                                isLastCompndLine = true;
2802                        }
2803                        pdb_COMPND_Handler(line);
2804
2805                }
2806                //              System.out.println("[makeCompounds] adding sources to compounds from sourceLines");
2807                // since we're starting again from the first compound, reset it here
2808                if ( compounds.size() == 0){
2809                        current_compound = new Compound();
2810                } else {
2811                        current_compound = compounds.get(0);
2812                }
2813                for (String line : sourceList) {
2814                        if (sourceList.indexOf(line) + 1 == sourceList.size()) {
2815                                //                              System.out.println("[makeCompounds] Final line in sourceLines.");
2816                                isLastSourceLine = true;
2817                        }
2818                        pdb_SOURCE_Handler(line);
2819                }
2820
2821        }
2822
2823        /**
2824         * Handles creation of all bonds. Looks at LINK records, SSBOND (Disulfide
2825         * bonds), peptide bonds, and intra-residue bonds.
2826         * <p>
2827         * Note: the current implementation only looks at the first model of each
2828         * structure. This may need to be fixed in the future.
2829         */
2830        private void formBonds() {
2831
2832                BondMaker maker = new BondMaker(structure, params);
2833
2834                // TODO do we want link records at all? aren't they overlapping with other bonds that we infer (peptide/nucleotide bonds) or get from chemical components (intra-molecule bonds) - JD 2016-03-03
2835                for (LinkRecord linkRecord : linkRecords) {
2836                        maker.formLinkRecordBond(linkRecord);
2837                }
2838
2839                maker.formDisulfideBonds(ssbonds);
2840
2841                maker.makeBonds();
2842        }
2843
2844
2845
2846        private void triggerEndFileChecks(){
2847                // finish and add ...
2848
2849                Date modDate = pdbHeader.getModDate();
2850                if ( modDate.equals(new Date(0)) ) {
2851                        // modification date = deposition date
2852                        Date depositionDate = pdbHeader.getDepDate();
2853
2854                        if (! depositionDate.equals(modDate)){
2855                                // depDate is 0000-00-00
2856                                pdbHeader.setDepDate(depositionDate);
2857                        }
2858
2859                }
2860
2861                // a problem occurred earlier so current_chain = null ...
2862                // most likely the buffered reader did not provide data ...
2863                if ( current_chain != null ) {
2864                        current_chain.addGroup(current_group);
2865
2866                        if (isKnownChain(current_chain.getChainID(),current_model) == null) {
2867                                current_model.add(current_chain);
2868                        }
2869                }
2870
2871                //set the JournalArticle, if there is one
2872                if (!journalLines.isEmpty()) {
2873                        buildjournalArticle();
2874                        pdbHeader.setJournalArticle(journalArticle);
2875                }
2876
2877
2878                structure.addModel(current_model);
2879                structure.setPDBHeader(pdbHeader);
2880                structure.setCrystallographicInfo(crystallographicInfo);
2881
2882                // TODO after 4.2 release we should remove setConnections/getConnections and rely only on Atom.getBonds/setBonds - JD 2016-03-03
2883                structure.setConnections(connects);
2884
2885                structure.setDBRefs(dbrefs);
2886
2887                // Only align if requested (default) and not when headerOnly mode with no Atoms.
2888                // Otherwise, we store the empty SeqRes Groups unchanged in the right chains.
2889                if ( params.isAlignSeqRes() && !params.isHeaderOnly() ){
2890                        logger.debug("Parsing mode align_seqres, will parse SEQRES and align to ATOM sequence");
2891                        SeqRes2AtomAligner aligner = new SeqRes2AtomAligner();
2892                        aligner.align(structure,seqResChains);
2893
2894                } else {
2895                        logger.debug("Parsing mode unalign_seqres, will parse SEQRES but not align it to ATOM sequence");
2896                        SeqRes2AtomAligner.storeUnAlignedSeqRes(structure, seqResChains, params.isHeaderOnly());
2897                }
2898
2899
2900                linkChains2Compound(structure);
2901                structure.setCompounds(compounds);
2902
2903                //associate the temporary Groups in the siteMap to the ones
2904
2905                if (!params.isHeaderOnly()) {
2906                        // Only can link SITES if Atom Groups were parsed.
2907                        linkSitesToGroups(); // will work now that setSites is called
2908                }
2909
2910                if ( bioAssemblyParser != null){
2911                        pdbHeader.setBioAssemblies(bioAssemblyParser.getTransformationMap());
2912                        //System.out.println("setting nr bioAssemblies: " + pdbHeader.getNrBioAssemblies());
2913                        //System.out.println(pdbHeader.getBioUnitTranformationMap().keySet());
2914                }
2915
2916                if (ncsOperators !=null && ncsOperators.size()>0) {
2917                        crystallographicInfo.setNcsOperators(
2918                                ncsOperators.toArray(new Matrix4d[ncsOperators.size()]));
2919                }
2920
2921
2922                // rfree end file check
2923                // Rfree annotation is not very consistent in PDB format, it varies depending on the software
2924                // Here we follow this strategy:
2925                // a) take the '(NO CUTOFF)' value if the only one available (shelx software, e.g. 1x7q)
2926                // b) don't take it if also a line without '(NO CUTOFF)' is present (CNX software, e.g. 3lak)
2927
2928                if (rfreeNoCutoffLine>0 && rfreeStandardLine<0) {
2929                        pdbHeader.setRfree(rfreeNoCutoffLine);
2930                } else if (rfreeNoCutoffLine>0 && rfreeStandardLine>0) {
2931                        pdbHeader.setRfree(rfreeStandardLine);
2932                } else if (rfreeNoCutoffLine<0 && rfreeStandardLine>0) {
2933                        pdbHeader.setRfree(rfreeStandardLine);
2934                } // otherwise it remains default value: PDBHeader.DEFAULT_RFREE
2935
2936
2937                // to make sure we have Compounds linked to chains, we call getCompounds() which will lazily initialise the
2938                // compounds using heuristics (see CompoundFinder) in the case that they were not explicitly present in the file
2939                structure.getCompounds();
2940        }
2941
2942        private void setSecStruc(){
2943
2944                setSecElement(helixList, SecStrucInfo.PDB_AUTHOR_ASSIGNMENT,
2945                                SecStrucType.helix4);
2946                setSecElement(strandList, SecStrucInfo.PDB_AUTHOR_ASSIGNMENT,
2947                                SecStrucType.extended);
2948                setSecElement(turnList, SecStrucInfo.PDB_AUTHOR_ASSIGNMENT,
2949                                SecStrucType.turn);
2950
2951                //Now insert random coil to the Groups that did not have SS information
2952                GroupIterator gi = new GroupIterator(structure);
2953                while (gi.hasNext()){
2954                        Group g = gi.next();
2955                        if (g.hasAminoAtoms()){
2956                                if (g.getProperty(Group.SEC_STRUC) == null){
2957                                        SecStrucInfo ss = new SecStrucInfo(g,
2958                                                        SecStrucInfo.PDB_AUTHOR_ASSIGNMENT,
2959                                                        SecStrucType.coil);
2960                                        g.setProperty(Group.SEC_STRUC, ss);
2961                                }
2962                        }
2963                }
2964
2965        }
2966
2967        private void setSecElement(List<Map<String,String>> secList, String assignment, SecStrucType type){
2968
2969
2970                Iterator<Map<String,String>> iter = secList.iterator();
2971                nextElement:
2972                        while (iter.hasNext()){
2973                                Map<String,String> m = iter.next();
2974
2975                                // assign all residues in this range to this secondary structure type
2976                                // String initResName = (String)m.get("initResName");
2977                                String initChainId = m.get("initChainId");
2978                                String initSeqNum  = m.get("initSeqNum" );
2979                                String initICode   = m.get("initICode" );
2980                                // String endResName  = (String)m.get("endResName" );
2981                                String endChainId  = m.get("endChainId" );
2982                                String endSeqNum   = m.get("endSeqNum");
2983                                String endICode    = m.get("endICode");
2984
2985                                if (initICode.equals(" "))
2986                                        initICode = "";
2987                                if (endICode.equals(" "))
2988                                        endICode = "";
2989
2990                                GroupIterator gi = new GroupIterator(structure);
2991                                boolean inRange = false;
2992                                while (gi.hasNext()){
2993                                        Group g = gi.next();
2994                                        Chain c = g.getChain();
2995
2996                                        if (c.getChainID().equals(initChainId)){
2997
2998                                                String pdbCode = initSeqNum + initICode;
2999                                                if ( g.getResidueNumber().toString().equals(pdbCode)  ) {
3000                                                        inRange = true;
3001                                                }
3002                                        }
3003                                        if ( inRange){
3004                                                if (g.hasAminoAtoms()) {
3005                                                        SecStrucInfo ss = new SecStrucInfo(g, assignment, type);
3006                                                        g.setProperty(Group.SEC_STRUC, ss);
3007                                                }
3008
3009                                        }
3010                                        if ( c.getChainID().equals(endChainId)){
3011                                                String pdbCode = endSeqNum + endICode;
3012                                                if (pdbCode.equals(g.getResidueNumber().toString())){
3013                                                        inRange = false;
3014                                                        continue nextElement;
3015                                                }
3016                                        }
3017                                }
3018                        }
3019        }
3020
3021
3022        /** After the parsing of a PDB file the {@link Chain} and  {@link Compound}
3023         * objects need to be linked to each other.
3024         *
3025         * @param s the structure
3026         */
3027        public void linkChains2Compound(Structure s){
3028
3029
3030                for(Compound comp : compounds){
3031                        List<Chain> chains = new ArrayList<Chain>();
3032                        List<String> chainIds = compoundMolIds2chainIds.get(comp.getMolId());
3033                        if ( chainIds == null)
3034                                continue;
3035                        for ( String chainId : chainIds) {
3036                                if ( chainId.equals("NULL"))
3037                                        chainId = " ";
3038                                try {
3039
3040                                        Chain c = s.findChain(chainId);
3041                                        chains.add(c);
3042
3043                                } catch (StructureException e){
3044                                        // usually if this happens something is wrong with the PDB header
3045                                        // e.g. 2brd - there is no Chain A, although it is specified in the header
3046                                        // Some bona-fide cases exist, e.g. 2ja5, chain N is described in SEQRES
3047                                        // but the authors didn't observe in the density so it's completely missing
3048                                        // from the ATOM lines
3049                                        logger.warn("Could not find chain {} to link to compound (entity) {}. The chain will be missing in the compound.", chainId, comp.getMolId());
3050                                }
3051                        }
3052                        comp.setChains(chains);
3053                }
3054
3055                if ( compounds.size() == 1) {
3056                        Compound comp = compounds.get(0);
3057                        if ( compoundMolIds2chainIds.get(comp.getMolId()) == null){
3058                                List<Chain> chains = s.getChains(0);
3059                                if ( chains.size() == 1) {
3060                                        // this is an old style PDB file - add the ChainI
3061                                        Chain ch = chains.get(0);
3062                                        comp.addChain(ch);
3063                                }
3064                        }
3065                }
3066
3067                for (Compound comp: compounds){
3068                        if ( compoundMolIds2chainIds.get(comp.getMolId()) == null) {
3069                                // could not link to chain
3070                                // TODO: should this be allowed to happen?
3071                                continue;
3072                        }
3073                        for ( String chainId : compoundMolIds2chainIds.get(comp.getMolId())){
3074                                if ( chainId.equals("NULL"))
3075                                        continue;
3076                                try {
3077                                        Chain c = s.getChainByPDB(chainId);
3078                                        c.setCompound(comp);
3079                                } catch (StructureException e){
3080                                        logger.warn("Chain {} was not found, can't assign a compound (entity) to it.",chainId);
3081                                }
3082                        }
3083                }
3084
3085                // in rare cases where a purely non-polymer or purely water chain is present we have missed it above
3086                // we need now to assign a new compound to it so that at least the structure is consistent
3087                // see https://github.com/biojava/biojava/pull/394
3088
3089                if (compounds!=null && !compounds.isEmpty()) {
3090                        for (Chain c: s.getChains()) {
3091                                if (c.getCompound() == null) {
3092
3093                                        Compound compound = new Compound();
3094                                        compound.addChain(c);
3095                                        compound.setMolId(findMaxCompoundId(compounds)+1);
3096                                        c.setCompound(compound);
3097                                        compounds.add(compound);
3098
3099                                        logger.warn("No compound (entity) found in file for chain {}. Creating new compound {} for it.", c.getChainID(), compound.getMolId());
3100                                }
3101                        }
3102                }
3103        }
3104
3105        private static int findMaxCompoundId(List<Compound> compounds) {
3106
3107                return
3108
3109                Collections.max(compounds, new Comparator<Compound>() {
3110                        @Override
3111                        public int compare(Compound o1, Compound o2) {
3112                                return new Integer(o1.getMolId()).compareTo(o2.getMolId());
3113                        }
3114                }).getMolId();
3115        }
3116
3117        /**
3118         * Links the Sites in the siteMap to the Groups in the Structure via the
3119         * siteToResidueMap ResidueNumber.
3120         * @author Jules Jacobsen
3121         * @return
3122         */
3123        private void linkSitesToGroups() {
3124
3125                //System.out.println("LINK SITES TO GROUPS:" + siteToResidueMap.keySet().size());
3126
3127                //link the map of siteIds : <ResidueNumber> with the sites by using ResidueNumber to get the correct group back.
3128                //the return list
3129
3130                if ( siteMap == null || siteToResidueMap == null){
3131                        logger.info("Sites can not be linked to residues!");
3132
3133                        return;
3134                }
3135
3136                List<Site> sites = null;
3137                //check that there are chains with which to associate the groups
3138                if (structure.getChains().isEmpty()) {
3139                        sites = new ArrayList<Site>(siteMap.values());
3140                        logger.info("No chains to link Site Groups with - Sites will not be present in the Structure");
3141                        return;
3142                }
3143
3144                //check that the keys in the siteMap and SiteToResidueMap are equal
3145                if (! siteMap.keySet().equals(siteToResidueMap.keySet())) {
3146                        logger.info("Not all sites have been properly described in the PDB " + pdbId + " header - some Sites will not be present in the Structure");
3147                        logger.debug(siteMap.keySet() + " | " + siteToResidueMap.keySet());
3148                        //return;
3149                }
3150
3151                //so we have chains - associate the siteResidues-related groups with the ones
3152                //already in in the chains
3153                for (String key : siteMap.keySet()) {
3154                        Site currentSite = siteMap.get(key);
3155                        List<ResidueNumber> linkedGroups = siteToResidueMap.get(key);
3156                        if ( linkedGroups == null)
3157                                continue;
3158                        for (ResidueNumber residueNumber : linkedGroups) {
3159
3160                                String pdbCode = residueNumber.toString();
3161                                String chain = residueNumber.getChainId();
3162                                //                    System.out.println("chain: '" + chain + "'");
3163                                //                    String resNum = resNum.getSeqNum().toString();
3164                                //                    System.out.println("resNum: '" + resNum + "'");
3165
3166                                Group linkedGroup = null;
3167                                try {
3168                                        //TODO: implement findGroup(ResidueNumber resNum)
3169                                        linkedGroup = structure.findGroup(chain, pdbCode);
3170                                } catch (StructureException ex) {
3171                                        logger.info("Can't find group " + pdbCode + " in chain " + chain + " in order to link up SITE records (PDB ID " + pdbId +")");
3172                                        continue;
3173                                }
3174
3175                                //                    System.out.println("Adding group: " + linkedGroup.getSeqNum() + " to site " + site.getSiteID());
3176                                currentSite.getGroups().add(linkedGroup);
3177                        }
3178                }
3179
3180                //System.out.println("SITEMAP: " + siteMap);
3181
3182                sites = new ArrayList<Site>(siteMap.values());
3183                structure.setSites(sites);
3184                //System.out.println("STRUCTURE SITES: " + structure.getSites().size());
3185                //            for (Site site : structure.getSites()) {
3186                //                System.out.println(site);
3187                //            }
3188                //            System.out.println("Linked Site Groups with Chains");
3189
3190        }
3191
3192        private void buildjournalArticle() {
3193
3194                logger.debug("building new JournalArticle");
3195                //            for (String line : journalLines) {
3196                //                System.out.println(line);
3197                //            }
3198
3199                this.journalArticle = new JournalArticle();
3200                //        JRNL        AUTH   M.HAMMEL,G.SFYROERA,D.RICKLIN,P.MAGOTTI,
3201                //        JRNL        AUTH 2 J.D.LAMBRIS,B.V.GEISBRECHT
3202                //        JRNL        TITL   A STRUCTURAL BASIS FOR COMPLEMENT INHIBITION BY
3203                //        JRNL        TITL 2 STAPHYLOCOCCUS AUREUS.
3204                //        JRNL        REF    NAT.IMMUNOL.                  V.   8   430 2007
3205                //        JRNL        REFN                   ISSN 1529-2908
3206                //        JRNL        PMID   17351618
3207                //        JRNL        DOI    10.1038/NI1450
3208                StringBuffer auth = new StringBuffer();
3209                StringBuffer titl = new StringBuffer();
3210                StringBuffer edit = new StringBuffer();
3211                StringBuffer ref = new StringBuffer();
3212                StringBuffer publ = new StringBuffer();
3213                StringBuffer refn = new StringBuffer();
3214                StringBuffer pmid = new StringBuffer();
3215                StringBuffer doi = new StringBuffer();
3216
3217                for (String line : journalLines) {
3218                        if ( line.length() < 19 ) {
3219                                logger.info("can not process Journal line: " + line);
3220                                continue;
3221                        }
3222                        //            System.out.println("'" + line + "'");
3223                        String subField = line.substring(12, 16);
3224                        //            System.out.println("'" + subField + "'");
3225                        if (subField.equals("AUTH")) {
3226                                auth.append(line.substring(19, line.length()).trim());
3227
3228                                logger.debug("AUTH '" + auth.toString() + "'");
3229
3230                        }
3231                        if (subField.equals("TITL")) {
3232                                //add a space to the end of a line so that when wrapped the
3233                                //words on the join won't be concatenated
3234                                titl.append(line.substring(19, line.length()).trim()).append(" ");
3235
3236                                logger.debug("TITL '" + titl.toString() + "'");
3237
3238                        }
3239                        if (subField.equals("EDIT")) {
3240                                edit.append(line.substring(19, line.length()).trim());
3241
3242                                logger.debug("EDIT '" + edit.toString() + "'");
3243
3244                        }
3245                        //        JRNL        REF    NAT.IMMUNOL.                  V.   8   430 2007
3246                        if (subField.equals("REF ")) {
3247                                ref.append(line.substring(19, line.length()).trim()).append(" ");
3248
3249                                logger.debug("REF '" + ref.toString() + "'");
3250
3251                        }
3252                        if (subField.equals("PUBL")) {
3253                                publ.append(line.substring(19, line.length()).trim()).append(" ");
3254
3255                                logger.debug("PUBL '" + publ.toString() + "'");
3256
3257                        }
3258                        //        JRNL        REFN                   ISSN 1529-2908
3259                        if (subField.equals("REFN")) {
3260                                if ( line.length() < 35 ) {
3261                                        logger.info("can not process Journal REFN line: " + line);
3262                                        continue;
3263                                }
3264                                refn.append(line.substring(35, line.length()).trim());
3265
3266                                logger.debug("REFN '" + refn.toString() + "'");
3267
3268                        }
3269                        //        JRNL        PMID   17351618
3270                        if (subField.equals("PMID")) {
3271                                pmid.append(line.substring(19, line.length()).trim());
3272
3273                                logger.debug("PMID '" + pmid.toString() + "'");
3274
3275                        }
3276                        //        JRNL        DOI    10.1038/NI1450
3277                        if (subField.equals("DOI ")) {
3278                                doi.append(line.substring(19, line.length()).trim());
3279
3280                                logger.debug("DOI '" + doi.toString() + "'");
3281
3282                        }
3283                }
3284
3285                //now set the parts of the JournalArticle
3286                journalArticle.setAuthorList(authorBuilder(auth.toString()));
3287                journalArticle.setEditorList(authorBuilder(edit.toString()));
3288                journalArticle.setRef(ref.toString());
3289                JournalParser journalParser = new JournalParser(ref.toString());
3290                journalArticle.setJournalName(journalParser.getJournalName());
3291                if (!journalArticle.getJournalName().equals("TO BE PUBLISHED")) {
3292                        journalArticle.setIsPublished(true);
3293                }
3294                journalArticle.setVolume(journalParser.getVolume());
3295                journalArticle.setStartPage(journalParser.getStartPage());
3296                journalArticle.setPublicationDate(journalParser.getPublicationDate());
3297                journalArticle.setPublisher(publ.toString().trim());
3298                journalArticle.setTitle(titl.toString().trim());
3299                journalArticle.setRefn(refn.toString().trim());
3300                journalArticle.setPmid(pmid.toString().trim());
3301                journalArticle.setDoi(doi.toString().trim());
3302
3303
3304                logger.debug("Made JournalArticle:");
3305                logger.debug(journalArticle.toString());
3306
3307        }
3308
3309        //inner class to deal with all the journal info
3310        private class JournalParser {
3311
3312                private String journalName;
3313                private String volume;
3314                private String startPage;
3315                private int publicationDate;
3316
3317
3318                public JournalParser(String ref) {
3319
3320                        logger.debug("JournalParser init '" + ref + "'");
3321
3322
3323                        if (ref.equals("TO BE PUBLISHED ")) {
3324                                journalName = ref.trim();
3325
3326                                logger.debug(String.format("JournalParser found journalString '%s'", journalName));
3327
3328                                return;
3329                        }
3330
3331                        if (ref.length() < 48) {
3332                                logger.info("REF line too short - must be at least 48 characters to be valid for parsing.");
3333                                journalName = "";
3334                                volume = "";
3335                                startPage = "";
3336                                publicationDate = 0;
3337                                return;
3338                        }
3339                        //can be multi line:
3340                        //REF    PHILOS.TRANS.R.SOC.LONDON,    V. 293    53 1981
3341                        //REF  2 SER.B
3342
3343                        //or
3344
3345                        //REF    GLYCOGEN PHOSPHORYLASE B:                1 1991
3346                        //REF  2 DESCRIPTION OF THE PROTEIN
3347                        //REF  3 STRUCTURE
3348
3349                        //but usually single line
3350                        //REF    NUCLEIC ACIDS RES.                         2009
3351                        //REF    MOL.CELL                                   2009
3352                        //REF    NAT.STRUCT.MOL.BIOL.          V.  16   238 2009
3353                        //REF    ACTA CRYSTALLOGR.,SECT.F      V.  65   199 2009
3354                        //check if the date is present at the end of the line.
3355                        //                             09876543210987654321
3356                        //'J.BIOL.CHEM.                  V. 280 23000 2005 '
3357                        //'J.AM.CHEM.SOC.                V. 130 16011 2008 '
3358                        //'NAT.STRUCT.MOL.BIOL.          V.  16   238 2009'
3359                        String volumeInformation = ref.substring(30, 48);
3360
3361                        logger.debug(String.format("Parsing volumeInformation: '%s'", volumeInformation));
3362
3363                        //volumeInformation: 'V. 293    53 1981 '
3364                        //                      String dateString = ref.substring(ref.length() - 5 , ref.length() - 1).trim();
3365                        //                      String startPageString = ref.substring(ref.length() - 11 , ref.length() - 6).trim();
3366                        //                      String volumeString = ref.substring(ref.length() - 16 , ref.length() - 12).trim();
3367                        //                      String journalString = ref.substring(0 , ref.length() - 18).trim();
3368                        String dateString = volumeInformation.substring(volumeInformation.length() - 5 , volumeInformation.length() - 1).trim();
3369                        String startPageString = volumeInformation.substring(volumeInformation.length() - 11 , volumeInformation.length() - 6).trim();
3370                        String volumeString = volumeInformation.substring(volumeInformation.length() - 16 , volumeInformation.length() - 12).trim();
3371                        //for the journal string we need to remove the volume information which might be in the middle of the string (e.g. 1gpb, 3pfk)
3372                        String journalString = ref.substring(0 , 29).trim() + " " + ref.substring(30, ref.length() - 1).replace(volumeInformation.trim(), "").trim();
3373                        journalString = journalString.trim();
3374                        //                        System.out.println("journalString: " + journalString);
3375
3376                        logger.debug(String.format("JournalParser found volumeString '%s'", volumeString));
3377                        logger.debug(String.format("JournalParser found startPageString '%s'", startPageString));
3378                        logger.debug(String.format("JournalParser found dateString '%s'", dateString));
3379                        logger.debug(String.format("JournalParser found journalString '%s'", journalString));
3380
3381
3382                        if (!dateString.equals("    ")) {
3383                                try {
3384                                        publicationDate = Integer.valueOf(dateString);
3385                                } catch (NumberFormatException nfe) {
3386                                        logger.info(dateString + " is not a valid integer for a date in JRNL sub-section REF line 1");
3387                                }
3388                                //                              if (DEBUG) {
3389                                //                                      System.out.println("JournalParser set date " + publicationDate);
3390                                //                              }
3391                        }
3392
3393                        if (!startPageString.equals("    ")) {
3394                                startPage = startPageString;
3395                                //                              if (DEBUG) {
3396                                //                                      System.out.println("JournalParser set startPage " + startPage);
3397                                //                              }
3398                        }
3399
3400                        if (!volumeString.equals("    ")) {
3401                                volume = volumeString;
3402                                //                              if (DEBUG) {
3403                                //                                      System.out.println("JournalParser set volume " + volume);
3404                                //                              }
3405                        }
3406
3407                        if (!journalString.equals("    ")) {
3408                                journalName = journalString;
3409
3410                                logger.debug("JournalParser set journalName " + journalName);
3411
3412                        }
3413                }
3414
3415                private String getJournalName() {
3416                        return journalName;
3417                }
3418
3419                private int getPublicationDate() {
3420                        return publicationDate;
3421                }
3422
3423                private String getStartPage() {
3424                        return startPage;
3425                }
3426
3427                private String getVolume() {
3428                        return volume;
3429                }
3430        }
3431
3432        private List<Author> authorBuilder(String authorString) {
3433                ArrayList<Author> authorList = new ArrayList<Author>();
3434
3435                if (authorString.equals("")) {
3436                        return authorList;
3437                }
3438
3439                String[] authors = authorString.split(",");
3440                //        if (DEBUG) {
3441                //            for (int i = 0; i < authors.length; i++) {
3442                //                String string = authors[i];
3443                //                System.out.println("authorBuilder author: '" + string + "'");
3444                //            }
3445                //        }
3446                //        AUTH   SEATTLE STRUCTURAL GENOMICS CENTER FOR INFECTIOUS
3447                //        AUTH 2 DISEASE (SSGCID)
3448                //        or
3449                //        AUTH   E.DOBROVETSKY,A.DONG,A.SEITOVA,B.DUNCAN,L.CROMBET,
3450                //        AUTH 2 M.SUNDSTROM,C.H.ARROWSMITH,A.M.EDWARDS,C.BOUNTRA,
3451                //        AUTH 3 A.BOCHKAREV,D.COSSAR,
3452                //        AUTH 4 STRUCTURAL GENOMICS CONSORTIUM (SGC)
3453                //        or
3454                //        AUTH   T.-C.MOU,S.R.SPRANG,N.MASADA,D.M.F.COOPER
3455                if (authors.length == 1) {
3456                        //only one element means it's a consortium only
3457                        Author author = new Author();
3458                        author.setSurname(authors[0]);
3459
3460                        logger.debug("Set consortium author name " + author.getSurname());
3461
3462                        authorList.add(author);
3463                } else {
3464                        for (int i = 0; i < authors.length; i++) {
3465                                String authorFullName = authors[i];
3466
3467                                logger.debug("Building author " + authorFullName);
3468
3469                                Author author = new Author();
3470                                String regex = "\\.";
3471                                String[] authorNames = authorFullName.split(regex);
3472                                //                if (DEBUG) {
3473                                //                    System.out.println("authorNames size " + authorNames.length);
3474                                //                    for (int j = 0; j < authorNames.length; j++) {
3475                                //                        String name = authorNames[j];
3476                                //                        System.out.println("split authName '" + name + "'");
3477                                //
3478                                //                    }
3479                                //                }
3480                                if (authorNames.length == 0) {
3481                                        author.setSurname(authorFullName);
3482
3483                                        logger.debug("Unable to split using '" + regex + "' Setting whole name " + author.getSurname());
3484
3485                                }
3486                                //again there might be a consortium name so there may be no elements
3487                                else if (authorNames.length == 1) {
3488                                        author.setSurname(authorNames[0]);
3489
3490                                        logger.debug("Set consortium author name in multiple author block " + author.getSurname
3491                                                                ());
3492
3493                                } else {
3494                                        String initials = "";
3495                                        for (int j = 0; j < authorNames.length - 1; j++) {
3496                                                String initial = authorNames[j];
3497                                                //                        if (DEBUG) {
3498                                                //                            System.out.println("adding initial '" + initial + "'");
3499                                                //                        }
3500                                                //build the initials back up again
3501                                                initials += initial + ".";
3502                                        }
3503
3504                                        logger.debug("built initials '" + initials + "'");
3505
3506                                        author.setInitials(initials);
3507                                        //surname is always last
3508                                        int lastName = authorNames.length - 1;
3509                                        String surname = authorNames[lastName];
3510
3511                                        logger.debug("built author surname " + surname);
3512
3513                                        author.setSurname(surname);
3514
3515                                }
3516                                authorList.add(author);
3517                        }
3518                }
3519                return authorList;
3520        }
3521
3522        public void setFileParsingParameters(FileParsingParameters params)
3523        {
3524                this.params= params;
3525
3526                // set the correct max values for parsing...
3527                load_max_atoms = params.getMaxAtoms();
3528                my_ATOM_CA_THRESHOLD = params.getAtomCaThreshold();
3529
3530        }
3531
3532        public FileParsingParameters getFileParsingParameters(){
3533                return params;
3534        }
3535
3536
3537}