Source code

001/*
002 *
003 * This code may be freely distributed and modified under the
004 * terms of the GNU Lesser General Public Licence.  This should
005 * be distributed with the code.  If you do not have a copy,
006 * see:
007 *
008 *      http://www.gnu.org/copyleft/lesser.html
009 *
010 * Copyright for this code is held jointly by the individual
011 * authors.  These should be listed in @author doc comments.
012 *
013 * For more information on the BioJava project and its aims,
014 * or to join the biojava-l mailing list, visit the home page
015 * at:
016 *
017 *      http://www.biojava.org/
018 *
019 * Created on 16.03.2004
020 *
021 */
022package org.biojava.nbio.structure.io;
023
024import static java.lang.Math.min;
025
026import java.io.BufferedReader;
027import java.io.IOException;
028import java.io.InputStream;
029import java.io.InputStreamReader;
030import java.text.DateFormat;
031import java.text.ParseException;
032import java.text.SimpleDateFormat;
033import java.util.ArrayList;
034import java.util.Arrays;
035import java.util.Date;
036import java.util.HashMap;
037import java.util.Iterator;
038import java.util.LinkedHashMap;
039import java.util.List;
040import java.util.Locale;
041import java.util.Map;
042import java.util.StringTokenizer;
043import java.util.regex.Matcher;
044import java.util.regex.Pattern;
045
046import javax.vecmath.Matrix4d;
047
048import org.biojava.nbio.structure.AminoAcid;
049import org.biojava.nbio.structure.AminoAcidImpl;
050import org.biojava.nbio.structure.Atom;
051import org.biojava.nbio.structure.AtomImpl;
052import org.biojava.nbio.structure.Author;
053import org.biojava.nbio.structure.Chain;
054import org.biojava.nbio.structure.ChainImpl;
055import org.biojava.nbio.structure.DBRef;
056import org.biojava.nbio.structure.Element;
057import org.biojava.nbio.structure.EntityInfo;
058import org.biojava.nbio.structure.EntityType;
059import org.biojava.nbio.structure.Group;
060import org.biojava.nbio.structure.GroupIterator;
061import org.biojava.nbio.structure.HetatomImpl;
062import org.biojava.nbio.structure.JournalArticle;
063import org.biojava.nbio.structure.NucleotideImpl;
064import org.biojava.nbio.structure.PDBCrystallographicInfo;
065import org.biojava.nbio.structure.PDBHeader;
066import org.biojava.nbio.structure.PdbId;
067import org.biojava.nbio.structure.ResidueNumber;
068import org.biojava.nbio.structure.Site;
069import org.biojava.nbio.structure.Structure;
070import org.biojava.nbio.structure.StructureException;
071import org.biojava.nbio.structure.StructureImpl;
072import org.biojava.nbio.structure.StructureTools;
073import org.biojava.nbio.structure.chem.ChemCompAtom;
074import org.biojava.nbio.structure.chem.ChemCompGroupFactory;
075import org.biojava.nbio.structure.io.util.PDBTemporaryStorageUtils.LinkRecord;
076import org.biojava.nbio.structure.secstruc.SecStrucInfo;
077import org.biojava.nbio.structure.secstruc.SecStrucType;
078import org.biojava.nbio.structure.xtal.CrystalCell;
079import org.biojava.nbio.structure.xtal.SpaceGroup;
080import org.biojava.nbio.structure.xtal.SymoplibParser;
081import org.slf4j.Logger;
082import org.slf4j.LoggerFactory;
083
084
085/**
086 * This class implements the actual PDB file parsing. Do not access it directly, but
087 * via the PDBFileReader class.
088 *
089 * <h2>Parsing</h2>
090 *
091 * During PDB file parsing several flags can be set. See the {@link #setFileParsingParameters(FileParsingParameters)} method.
092 *
093 *
094 * <p>
095 * To prevent excessive memory usage for large PDB files, there is the ATOM_CA_THRESHOLD.
096 * If more Atoms than this threshold are being parsed in a PDB file, the parser will automatically
097 * switch to a C-alpha only representation.
098 *
099 * <p>
100 * The result of the parsing of the PDB file is a new {@link Structure} object.
101 *
102 * <p>
103 * For more documentation on how to work with the Structure API please
104 * see <a href="http://biojava.org/wiki/BioJava:CookBook#Protein_Structure" target="_top">
105 * http://biojava.org/wiki/BioJava:CookBook#Protein_Structure</a>
106 *
107 *
108 *
109 *
110 * <h2>Example</h2>
111 * <p>
112 * Q: How can I get a Structure object from a PDB file?
113 * <p>
114 * A:
115 * <pre>
116 * public {@link Structure} loadStructure(String pathToPDBFile){
117 *      // The PDBFileParser is wrapped by the PDBFileReader
118 *      {@link PDBFileReader} pdbreader = new {@link PDBFileReader}();
119 *
120 *      {@link Structure} structure = null;
121 *      try{
122 *              structure = pdbreader.getStructure(pathToPDBFile);
123 *              System.out.println(structure);
124 *      } catch (IOException e) {
125 *              e.printStackTrace();
126 *      }
127 *      return structure;
128 * }
129 * </pre>
130 *
131 *
132 * @author Andreas Prlic
133 * @author Jules Jacobsen
134 * @author Jose Duarte
135 * @since 1.4
136 */
137public class PDBFileParser  {
138
139
140
141        private static final Logger logger = LoggerFactory.getLogger(PDBFileParser.class);
142
143        // for printing
144        private static final String NEWLINE = System.getProperty("line.separator");
145
146
147        // required for parsing:
148        private String pdbId; //the actual id of the entry
149        private Structure     structure;
150        private List<List<Chain>> allModels; // a temp data structure to keep all models
151        private List<Chain>   currentModel; // contains the ATOM records for each model
152        private Chain         currentChain;
153        private Group         currentGroup;
154
155        private List<Chain>   seqResChains; // contains all the chains for the SEQRES records
156        //we're going to work on the assumption that the files are current -
157        //if the pdb_HEADER_Handler detects a legacy format, this will be changed to true.
158        //if true then lines will be truncated at 72 characters in certain cases
159        //(pdb_COMPOUND_handler for example)
160        private boolean isLegacyFormat = false;
161
162        private boolean blankChainIdsPresent = false;
163
164        // for re-creating the biological assembly
165        private PDBBioAssemblyParser bioAssemblyParser = null;
166
167        private PDBHeader pdbHeader;
168        private PDBCrystallographicInfo crystallographicInfo;
169        private JournalArticle journalArticle;
170        private List<Map<String, Integer>> connects ;
171        private List<Map<String,String>> helixList;
172        private List<Map<String,String>> strandList;
173        private List<Map<String,String>> turnList;
174
175        private int lengthCheck ;
176
177        private boolean isLastCompndLine = false;
178        private boolean isLastSourceLine = false;
179        private EntityInfo current_compound;
180        private List<EntityInfo> entities = new ArrayList<EntityInfo>();
181        private HashMap<Integer,List<String>> compoundMolIds2chainIds = new HashMap<Integer, List<String>>();
182        private List<String> compndLines = new ArrayList<String>();
183        private List<String> sourceLines = new ArrayList<String>();
184        private List<String> journalLines = new ArrayList<String>();
185        private List<String> keywordsLines = new ArrayList<String>();
186        private List<DBRef> dbrefs;
187        private Map<String, Site> siteMap = new LinkedHashMap<String, Site>();
188        private Map<String, List<ResidueNumber>> siteToResidueMap = new LinkedHashMap<String, List<ResidueNumber>>();
189
190        private List<SSBondImpl> ssbonds = new ArrayList<>();
191
192        // for storing LINK until we have all the atoms parsed
193        private List<LinkRecord> linkRecords;
194
195        private Matrix4d currentNcsOp;
196        private List<Matrix4d> ncsOperators;
197
198        // for parsing COMPOUND and SOURCE Header lines
199        private int prevMolId;
200        private String previousContinuationField;
201        private String continuationField;
202        private String continuationString;
203
204        private DateFormat dateFormat;
205
206        // for rfree parsing
207        private float rfreeStandardLine = -1;
208        private float rfreeNoCutoffLine = -1;
209
210        private static  final List<String> compndFieldValues = new ArrayList<String>(
211                        Arrays.asList(
212                                        "MOL_ID:", "MOLECULE:", "CHAIN:", "SYNONYM:",
213                                        "EC:", "FRAGMENT:", "ENGINEERED:", "MUTATION:",
214                                        "BIOLOGICAL_UNIT:", "OTHER_DETAILS:"
215                                        ));
216
217
218        private static final List<String> ignoreCompndFieldValues = new ArrayList<String>(
219                        Arrays.asList(
220                                        "HETEROGEN:","ENGINEEREED:","FRAGMENT,",
221                                        "MUTANT:","SYNTHETIC:"
222                                        ));
223        // ENGINEEREED in pdb219d
224
225        private static final List<String> sourceFieldValues = new ArrayList<String>(
226                        Arrays.asList("ENGINEERED:", "MOL_ID:", "SYNTHETIC:", "FRAGMENT:",
227                                        "ORGANISM_SCIENTIFIC:", "ORGANISM_COMMON:",
228                                        "ORGANISM_TAXID:","STRAIN:",
229                                        "VARIANT:", "CELL_LINE:", "ATCC:", "ORGAN:", "TISSUE:",
230                                        "CELL:", "ORGANELLE:", "SECRETION:", "GENE:",
231                                        "CELLULAR_LOCATION:", "EXPRESSION_SYSTEM:",
232                                        "EXPRESSION_SYSTEM_TAXID:",
233                                        "EXPRESSION_SYSTEM_STRAIN:", "EXPRESSION_SYSTEM_VARIANT:",
234                                        "EXPRESSION_SYSTEM_CELL_LINE:",
235                                        "EXPRESSION_SYSTEM_ATCC_NUMBER:",
236                                        "EXPRESSION_SYSTEM_ORGAN:", "EXPRESSION_SYSTEM_TISSUE:",
237                                        "EXPRESSION_SYSTEM_CELL:", "EXPRESSION_SYSTEM_ORGANELLE:",
238                                        "EXPRESSION_SYSTEM_CELLULAR_LOCATION:",
239                                        "EXPRESSION_SYSTEM_VECTOR_TYPE:",
240                                        "EXPRESSION_SYSTEM_VECTOR:", "EXPRESSION_SYSTEM_PLASMID:",
241                                        "EXPRESSION_SYSTEM_GENE:", "OTHER_DETAILS:"));
242
243        private int atomCount;
244
245        // parsing options:
246
247        private int atomCAThreshold ;
248
249        private int loadMaxAtoms;
250
251        private boolean atomOverflow;
252
253        /** flag to tell parser to only read Calpha coordinates **/
254        private boolean parseCAonly;
255
256
257        private FileParsingParameters params;
258
259        private boolean startOfMolecule;
260        private boolean startOfModel;
261
262        public PDBFileParser() {
263                params = new FileParsingParameters();
264
265                allModels = new ArrayList<>();
266                structure     = null           ;
267                currentModel  = null;
268                currentChain  = null;
269                currentGroup  = null;
270                // we initialise to true since at the beginning of the file we are always starting a new molecule
271                startOfMolecule = true;
272                startOfModel = true;
273
274
275                pdbHeader         = new PDBHeader();
276                crystallographicInfo = new PDBCrystallographicInfo();
277                connects      = new ArrayList<Map<String,Integer>>() ;
278
279
280                helixList     = new ArrayList<Map<String,String>>();
281                strandList    = new ArrayList<Map<String,String>>();
282                turnList      = new ArrayList<Map<String,String>>();
283                current_compound = null;
284                dbrefs        = new ArrayList<DBRef>();
285                siteMap = null;
286                dateFormat = new SimpleDateFormat("dd-MMM-yy", Locale.US);
287                atomCount = 0;
288                atomOverflow = false;
289                parseCAonly = false;
290
291                // this SHOULD not be done
292                // DONOT:setFileParsingParameters(params);
293                // set the correct max values for parsing...
294                loadMaxAtoms = params.getMaxAtoms();
295                atomCAThreshold = params.getAtomCaThreshold();
296
297                linkRecords = new ArrayList<LinkRecord>();
298
299                blankChainIdsPresent = false;
300
301        }
302
303        /** initiate new resNum, either Hetatom, Nucleotide, or AminoAcid */
304        private Group getNewGroup(String recordName,Character aminoCode1, String aminoCode3) {
305
306                Group g = ChemCompGroupFactory.getGroupFromChemCompDictionary(aminoCode3);
307                if ( g != null && !g.getChemComp().isEmpty())
308                        return g;
309
310
311                Group group;
312                if (aminoCode1 == null || StructureTools.UNKNOWN_GROUP_LABEL == aminoCode1 ){
313                        group = new HetatomImpl();
314
315                } else if(StructureTools.isNucleotide(aminoCode3))  {
316                        // it is a nucleotide
317                        NucleotideImpl nu = new NucleotideImpl();
318                        group = nu;
319
320                } else {
321                        AminoAcidImpl aa = new AminoAcidImpl() ;
322                        aa.setAminoType(aminoCode1);
323                        group = aa ;
324                }
325
326                //              System.out.println("new resNum type: "+ resNum.getType() );
327                return  group ;
328        }
329
330
331
332        // Handler methods to deal with PDB file records properly.
333        /**
334         Handler for
335         HEADER Record Format
336         <pre>
337         COLUMNS        DATA TYPE       FIELD           DEFINITION
338         ----------------------------------------------------------------------------------
339         1 -  6        Record name     "HEADER"
340         11 - 50        String(40)      classification  Classifies the molecule(s)
341         51 - 59        Date            depDate         Deposition date.  This is the date
342         the coordinates were received by
343         the PDB
344         63 - 66        IDcode          idCode          This identifier is unique within PDB
345        </pre>
346         */
347        private void pdb_HEADER_Handler(String line) {
348
349                String classification  = null;
350                String deposition_date = null;
351                String pdbCode         = null;
352
353                int len = line.trim().length();
354                if(len > 10) {
355                        classification  = line.substring (10, min(len,50)).trim() ;
356                        pdbHeader.setClassification(classification);
357                }
358                if(len > 50) {
359                        deposition_date = line.substring (50, min(len,59)).trim() ;
360                        try {
361                                Date dep = dateFormat.parse(deposition_date);
362                                pdbHeader.setDepDate(dep);
363
364                        } catch (ParseException e){
365                                logger.info("Could not parse deposition date string '"+deposition_date+"'. Will continue without deposition date");
366                        }
367                }
368                if(len > 62) {
369                        pdbCode         = line.substring (62, min(len,66)).trim() ;
370                        pdbId = pdbCode;
371
372                        logger.debug("Parsing entry " + pdbId);
373
374
375                        PdbId pdbIdToSet;
376                        try {
377                                pdbIdToSet = new PdbId(pdbCode);
378                        } catch (IllegalArgumentException e) {
379                                logger.info("Malformed (or null) PDB ID {}. setting PdbId to null", pdbCode);
380                                pdbIdToSet = null;
381                        }
382                        structure.setPdbId(pdbIdToSet);
383                        pdbHeader.setPdbId(pdbIdToSet);
384                }
385
386                //*really* old files (you'll need to hunt to find these as they
387                //should have been remediated) have headers like below. Plus the
388                //pdbId at positions 72-76 is present in every line
389
390                //HEADER    PROTEINASE INHIBITOR (TRYPSIN)          05-OCT-84   5PTI      5PTI   3
391                //HEADER    TRANSFERASE (ACYLTRANSFERASE)           02-SEP-92   1LAC      1LAC   2
392                if (len > 66) {
393                        if (pdbId.equals(line.substring (72, 76))){
394                                isLegacyFormat = true;
395                                logger.warn(pdbId + " is a LEGACY entry - this will most likely not parse correctly.");
396                        }
397                }
398
399        }
400
401
402        /**
403         * Parses the following record:
404         * <pre>
405         *  COLUMNS      DATA  TYPE      FIELD         DEFINITION
406         * ------------------------------------------------------------------------------------
407         *  1 -  6      Record name     "AUTHOR"
408         *  9 - 10      Continuation    continuation  Allows concatenation of multiple records.
409         * 11 - 79      List            authorList    List of the author names, separated
410         *                                            by commas.
411         *
412         * </pre>
413         * @param line
414         */
415        private void pdb_AUTHOR_Handler(String line) {
416
417                String authors = line.substring(10).trim();
418
419                String auth = pdbHeader.getAuthors();
420                if (auth == null){
421                        pdbHeader.setAuthors(authors);
422                } else {
423                        auth +=  authors;
424                        pdbHeader.setAuthors(auth);
425                }
426
427        }
428
429
430
431        /**
432         * Parses the following record:
433         *
434         * <pre>
435         * COLUMNS       DATA TYPE        FIELD        DEFINITION
436         * --------------------------------------------------------------------
437         *  1 -  6       Record name      "HELIX "
438         *  8 - 10       Integer          serNum       Serial number of the helix.
439         *                                             This starts at 1 and increases
440         *                                             incrementally.
441         * 12 - 14       LString(3)       helixID      Helix identifier. In addition
442         *                                             to a serial number, each helix is
443         *                                             given an alphanumeric character
444         *                                             helix identifier.
445         * 16 - 18       Residue name     initResName  Name of the initial residue.
446         * 20            Character        initChainID  Chain identifier for the chain
447         *                                             containing this helix.
448         * 22 - 25       Integer          initSeqNum   Sequence number of the initial
449         *                                             residue.
450         * 26            AChar            initICode    Insertion code of the initial
451         *                                             residue.
452         * 28 - 30       Residue name     endResName   Name of the terminal residue of
453         *                                             the helix.
454         * 32            Character        endChainID   Chain identifier for the chain
455         *                                             containing this helix.
456         * 34 - 37       Integer          endSeqNum    Sequence number of the terminal
457         *                                             residue.
458         * 38            AChar            endICode     Insertion code of the terminal
459         *                                             residue.
460         * 39 - 40       Integer          helixClass   Helix class (see below).
461         * 41 - 70       String           comment      Comment about this helix.
462         * 72 - 76       Integer          length       Length of this helix.
463         * </pre>
464         */
465        private void pdb_HELIX_Handler(String line){
466
467                if (params.isHeaderOnly()) return;
468
469                if (line.length()<38) {
470                        logger.info("HELIX line has length under 38. Ignoring it.");
471                        return;
472                }
473
474                String initResName = line.substring(15,18).trim();
475                String initChainId = line.substring(19,20);
476                String initSeqNum  = line.substring(21,25).trim();
477                String initICode   = line.substring(25,26);
478                String endResName  = line.substring(27,30).trim();
479                String endChainId  = line.substring(31,32);
480                String endSeqNum   = line.substring(33,37).trim();
481                String endICode    = line.substring(37,38);
482
483                //System.out.println(initResName + " " + initChainId + " " + initSeqNum + " " + initICode + " " +
484                //        endResName + " " + endChainId + " " + endSeqNum + " " + endICode);
485
486                Map<String,String> m = new HashMap<String,String>();
487
488                m.put("initResName",initResName);
489                m.put("initChainId", initChainId);
490                m.put("initSeqNum", initSeqNum);
491                m.put("initICode", initICode);
492                m.put("endResName", endResName);
493                m.put("endChainId", endChainId);
494                m.put("endSeqNum",endSeqNum);
495                m.put("endICode",endICode);
496
497                helixList.add(m);
498
499        }
500
501        /**
502         * Handler for
503         * <pre>
504         *       COLUMNS     DATA TYPE        FIELD           DEFINITION
505         * --------------------------------------------------------------
506         *  1 -  6     Record name      "SHEET "
507         *  8 - 10     Integer          strand       Strand number which starts at 1
508         *                                           for each strand within a sheet
509         *                                           and increases by one.
510         * 12 - 14     LString(3)       sheetID      Sheet identifier.
511         * 15 - 16     Integer          numStrands   Number of strands in sheet.
512         * 18 - 20     Residue name     initResName  Residue name of initial residue.
513         * 22          Character        initChainID  Chain identifier of initial
514         *                                           residue in strand.
515         * 23 - 26     Integer          initSeqNum   Sequence number of initial
516         *                                           residue in strand.
517         * 27          AChar            initICode    Insertion code of initial residue
518         *                                           in strand.
519         * 29 - 31     Residue name     endResName   Residue name of terminal residue.
520         * 33          Character        endChainID   Chain identifier of terminal
521         *                                           residue.
522         * 34 - 37     Integer          endSeqNum    Sequence number of terminal
523         *                                           residue.
524         * 38          AChar            endICode     Insertion code of terminal
525         *                                           residue.
526         * 39 - 40     Integer          sense        Sense of strand with respect to
527         *                                           previous strand in the sheet. 0
528         *                                           if first strand, 1 if parallel,
529         *                                           -1 if anti-parallel.
530         * 42 - 45     Atom             curAtom      Registration. Atom name in
531         *                                           current strand.
532         * 46 - 48     Residue name     curResName   Registration. Residue name in
533         *                                           current strand.
534         * 50          Character        curChainId   Registration. Chain identifier in
535         *                                           current strand.
536         * 51 - 54     Integer          curResSeq    Registration. Residue sequence
537         *                                           number in current strand.
538         * 55          AChar            curICode     Registration. Insertion code in
539         *                                           current strand.
540         * 57 - 60     Atom             prevAtom     Registration. Atom name in
541         *                                           previous strand.
542         * 61 - 63     Residue name     prevResName  Registration. Residue name in
543         *                                           previous strand.
544         * 65          Character        prevChainId  Registration. Chain identifier in
545         *                                           previous strand.
546         * 66 - 69     Integer          prevResSeq   Registration. Residue sequence
547         *                                           number in previous strand.
548         * 70          AChar            prevICode    Registration. Insertion code in
549         *                                               previous strand.
550         * </pre>
551         */
552        private void pdb_SHEET_Handler( String line){
553
554                if (params.isHeaderOnly()) return;
555
556                if (line.length()<38) {
557                        logger.info("SHEET line has length under 38. Ignoring it.");
558                        return;
559                }
560
561                String initResName = line.substring(17,20).trim();
562                String initChainId = line.substring(21,22);
563                String initSeqNum  = line.substring(22,26).trim();
564                String initICode   = line.substring(26,27);
565                String endResName  = line.substring(28,31).trim();
566                String endChainId  = line.substring(32,33);
567                String endSeqNum   = line.substring(33,37).trim();
568                String endICode    = line.substring(37,38);
569
570                //System.out.println(initResName + " " + initChainId + " " + initSeqNum + " " + initICode + " " +
571                //        endResName + " " + endChainId + " " + endSeqNum + " " + endICode);
572
573                Map<String,String> m = new HashMap<String,String>();
574
575                m.put("initResName",initResName);
576                m.put("initChainId", initChainId);
577                m.put("initSeqNum", initSeqNum);
578                m.put("initICode", initICode);
579                m.put("endResName", endResName);
580                m.put("endChainId", endChainId);
581                m.put("endSeqNum",endSeqNum);
582                m.put("endICode",endICode);
583
584                strandList.add(m);
585        }
586
587
588        /**
589         * Handler for TURN lines
590         * <pre>
591         * COLUMNS      DATA TYPE        FIELD         DEFINITION
592         * --------------------------------------------------------------------
593         *  1 -  6      Record name      "TURN "
594         *  8 - 10      Integer          seq           Turn number; starts with 1 and
595         *                                             increments by one.
596         * 12 - 14      LString(3)       turnId        Turn identifier
597         * 16 - 18      Residue name     initResName   Residue name of initial residue in
598         *                                             turn.
599         * 20           Character        initChainId   Chain identifier for the chain
600         *                                             containing this turn.
601         * 21 - 24      Integer          initSeqNum    Sequence number of initial residue
602         *                                             in turn.
603         * 25           AChar            initICode     Insertion code of initial residue
604         *                                             in turn.
605         * 27 - 29      Residue name     endResName    Residue name of terminal residue
606         *                                             of turn.
607         * 31           Character        endChainId    Chain identifier for the chain
608         *                                             containing this turn.
609         * 32 - 35      Integer          endSeqNum     Sequence number of terminal
610         *                                             residue of turn.
611         * 36           AChar            endICode      Insertion code of terminal residue
612         *                                             of turn.
613         * 41 - 70      String           comment       Associated comment.
614         * </pre>
615         * @param line
616         */
617        private void pdb_TURN_Handler( String line){
618
619                if (params.isHeaderOnly()) return;
620
621                if (line.length()<36) {
622                        logger.info("TURN line has length under 36. Ignoring it.");
623                        return;
624                }
625
626                String initResName = line.substring(15,18).trim();
627                String initChainId = line.substring(19,20);
628                String initSeqNum  = line.substring(20,24).trim();
629                String initICode   = line.substring(24,25);
630                String endResName  = line.substring(26,29).trim();
631                String endChainId  = line.substring(30,31);
632                String endSeqNum   = line.substring(31,35).trim();
633                String endICode    = line.substring(35,36);
634
635                //System.out.println(initResName + " " + initChainId + " " + initSeqNum + " " + initICode + " " +
636                //        endResName + " " + endChainId + " " + endSeqNum + " " + endICode);
637
638                Map<String,String> m = new HashMap<String,String>();
639
640                m.put("initResName",initResName);
641                m.put("initChainId", initChainId);
642                m.put("initSeqNum", initSeqNum);
643                m.put("initICode", initICode);
644                m.put("endResName", endResName);
645                m.put("endChainId", endChainId);
646                m.put("endSeqNum",endSeqNum);
647                m.put("endICode",endICode);
648
649                turnList.add(m);
650        }
651
652        /**
653         * Handler for
654         * REVDAT Record format:
655         * <pre>
656         *
657         * COLUMNS       DATA TYPE      FIELD         DEFINITION
658         * ----------------------------------------------------------------------------------
659         * 1 -  6       Record name    "REVDAT"
660         * 8 - 10       Integer        modNum        Modification number.
661         * 11 - 12       Continuation   continuation  Allows concatenation of multiple
662         * records.
663         * 14 - 22       Date           modDate       Date of modification (or release for
664         * new entries).  This is not repeated
665         * on continuation lines.
666         * 24 - 28       String(5)      modId         Identifies this particular
667         * modification.  It links to the
668         * archive used internally by PDB.
669         * This is not repeated on continuation
670         * lines.
671         * 32            Integer        modType       An integer identifying the type of
672         * modification.  In case of revisions
673         * with more than one possible modType,
674         * the highest value applicable will be
675         * assigned.
676         * 40 - 45       LString(6)     record        Name of the modified record.
677         * 47 - 52       LString(6)     record        Name of the modified record.
678         * 54 - 59       LString(6)     record        Name of the modified record.
679         * 61 - 66       LString(6)     record        Name of the modified record.
680         * </pre>
681         */
682        private void pdb_REVDAT_Handler(String line) {
683
684                // keep the first as latest modified date and the last as release date
685                Date modDate = pdbHeader.getModDate();
686
687                if ( modDate==null || modDate.equals(new Date(0)) ) {
688
689                        // modified date is still uninitialized
690                        String modificationDate = line.substring (13, 22).trim() ;
691
692                        try {
693                                Date dep = dateFormat.parse(modificationDate);
694                                pdbHeader.setModDate(dep);
695                                pdbHeader.setRelDate(dep);
696                        } catch (ParseException e){
697                                logger.info("Could not parse revision date string '"+modificationDate+"'. ");
698                        }
699
700                } else {
701
702                        // set as the release date
703                        String releaseDate = line.substring (13, 22).trim() ;
704
705                        try {
706                                Date dep = dateFormat.parse(releaseDate);
707                                pdbHeader.setRelDate(dep);
708                        } catch (ParseException e){
709                                logger.info("Could not parse revision date string '"+releaseDate+"'. ");
710                        }
711                }
712        }
713
714        /**
715         * Handler for
716         * SEQRES record format
717         * SEQRES records contain the amino acid or nucleic acid sequence of residues in each chain of the macromolecule that was studied.
718         * <p>
719         * Record Format:
720         * <p>
721         * <pre>
722         * COLUMNS        DATA TYPE       FIELD         DEFINITION
723         * ---------------------------------------------------------------------------------
724         * 1 -  6        Record name     "SEQRES"
725         * 9 - 10        Integer         serNum        Serial number of the SEQRES record
726         * for the current chain.  Starts at 1
727         * and increments by one each line.
728         * Reset to 1 for each chain.
729         * 12             Character       chainID       Chain identifier.  This may be any
730         * single legal character, including a
731         * blank which is used if there is
732         * only one chain.
733         * 14 - 17        Integer         numRes        Number of residues in the chain.
734         * This value is repeated on every
735         * record.
736         * 20 - 22        Residue name    resName       Residue name.
737         * 24 - 26        Residue name    resName       Residue name.
738         * 28 - 30        Residue name    resName       Residue name.
739         * 32 - 34        Residue name    resName       Residue name.
740         * 36 - 38        Residue name    resName       Residue name.
741         * 40 - 42        Residue name    resName       Residue name.
742         * 44 - 46        Residue name    resName       Residue name.
743         * 48 - 50        Residue name    resName       Residue name.
744         * 52 - 54        Residue name    resName       Residue name.
745         * 56 - 58        Residue name    resName       Residue name.
746         * 60 - 62        Residue name    resName       Residue name.
747         * 64 - 66        Residue name    resName       Residue name.
748         * 68 - 70        Residue name    resName       Residue name.
749         * </pre>
750         * @author Jules Jacobsen
751         */
752        private void pdb_SEQRES_Handler(String line) {
753
754                /*
755                 *          1         2         3         4         5         6         7
756                 * 1234567890123456789012345678901234567890123456789012345678901234567890
757                 * SEQRES   1 A  376  LYS PRO VAL THR VAL LYS LEU VAL ASP SER GLN ALA THR
758                 * SEQRES   1 A   21  GLY ILE VAL GLU GLN CYS CYS THR SER ILE CYS SER LEU
759                 * SEQRES   2 A   21  TYR GLN LEU GLU ASN TYR CYS ASN
760                 * SEQRES   1 B   30  PHE VAL ASN GLN HIS LEU CYS GLY SER HIS LEU VAL GLU
761                 * SEQRES   2 B   30  ALA LEU TYR LEU VAL CYS GLY GLU ARG GLY PHE PHE TYR
762                 * SEQRES   3 B   30  THR PRO LYS ALA
763                 * SEQRES   1 C   21  GLY ILE VAL GLU GLN CYS CYS THR SER ILE CYS SER LEU
764                 * SEQRES   2 C   21  TYR GLN LEU GLU ASN TYR CYS ASN
765                 * SEQRES   1 D   30  PHE VAL ASN GLN HIS LEU CYS GLY SER HIS LEU VAL GLU
766                 * SEQRES   2 D   30  ALA LEU TYR LEU VAL CYS GLY GLU ARG GLY PHE PHE TYR
767                 * SEQRES   3 D   30  THR PRO LYS ALA
768                 */
769
770                String recordName = line.substring(0, 6).trim();
771                String chainID    = line.substring(11, 12);
772                String newLength   = line.substring(13,17).trim();
773                String subSequence = line.substring(18);
774
775                if ( lengthCheck == -1 ){
776                        lengthCheck = Integer.parseInt(newLength);
777                }
778
779                StringTokenizer subSequenceResidues = new StringTokenizer(subSequence);
780
781                Character aminoCode1 = null;
782                if (! recordName.equals(AminoAcid.SEQRESRECORD)) {
783                        // should not have been called
784                        return;
785                }
786
787                currentChain = isKnownChain(chainID, seqResChains);
788                if ( currentChain == null) {
789
790                        currentChain = new ChainImpl();
791                        currentChain.setId(chainID);
792                        currentChain.setName(chainID);
793
794                }
795
796                while (subSequenceResidues.hasMoreTokens()) {
797
798                        String threeLetter = subSequenceResidues.nextToken();
799
800                        aminoCode1 = StructureTools.get1LetterCode(threeLetter);
801
802                        //if (aminoCode1 == null) {
803                        // could be a nucleotide...
804                        // but getNewGroup takes care of that and converts ATOM records with aminoCode1 == nnull to nucleotide...
805                        //}
806                        currentGroup = getNewGroup("ATOM", aminoCode1, threeLetter);
807
808                        currentGroup.setPDBName(threeLetter);
809
810                        if ( currentGroup instanceof AminoAcid){
811                                AminoAcid aa = (AminoAcid)currentGroup;
812                                aa.setRecordType(AminoAcid.SEQRESRECORD);
813                        }
814                        // add the current resNum to the new chain.
815                        currentChain.addGroup(currentGroup);
816
817                }
818                Chain test = isKnownChain(chainID, seqResChains);
819
820                if ( test == null)
821                        seqResChains.add(currentChain);
822
823                if (currentGroup != null)
824                        currentGroup.trimToSize();
825
826                currentGroup = null;
827                currentChain = null;
828
829                //               the current chain is finished!
830                //if ( current_chain.getLength() != lengthCheck ){
831                //      System.err.println("the length of chain " + current_chain.getName() + "(" +
832                //                      current_chain.getLength() + ") does not match the expected " + lengthCheck);
833                //}
834
835                lengthCheck = Integer.parseInt(newLength);
836
837        }
838
839
840
841        /**
842         * Handler for
843         * TITLE Record Format
844         * <pre>
845         COLUMNS        DATA TYPE       FIELD          DEFINITION
846         ----------------------------------------------------------------------------------
847         1 -  6        Record name     "TITLE "
848         9 - 10        Continuation    continuation   Allows concatenation of multiple
849         records.
850         11 - 70        String          title          Title of the experiment.
851         * </pre>
852         *
853         */
854        private void pdb_TITLE_Handler(String line) {
855                String title;
856                if ( line.length() > 79)
857                        title = line.substring(10,80).trim();
858                else
859                        title = line.substring(10,line.length()).trim();
860
861                String t = pdbHeader.getTitle();
862                if ( (t != null) && (! t.equals("")) ){
863                        if (t.endsWith("-"))
864                                t += ""; // if last line ends with a hyphen then we don't add space
865                        else
866                                t += " ";
867                }
868                else t = "";
869
870                t += title;
871
872                pdbHeader.setTitle(t);
873        }
874
875        /**
876         * JRNL handler.
877         * The JRNL record contains the primary literature citation that describes the experiment which resulted
878         * in the deposited coordinate set. There is at most one JRNL reference per entry. If there is no primary
879         * reference, then there is no JRNL reference. Other references are given in REMARK 1.
880         *
881         * Record Format
882         * <pre>
883         * COLUMNS       DATA TYPE     FIELD         DEFINITION
884         * -----------------------------------------------------------------------
885         * 1 -  6       Record name   "JRNL  "
886         *
887         * 13 - 70       LString        text         See Details below.
888         * </pre>
889         */
890        private void pdb_JRNL_Handler(String line) {
891                //add the strings to the journalLines
892                //the actual JournalArticle is then built when the whole entry is being
893                //finalized with triggerEndFileChecks()
894                //JRNL        TITL   NMR SOLUTION STRUCTURE OF RECOMBINANT TICK           1TAP  10
895                if (line.substring(line.length() - 8, line.length() - 4).equals(pdbId)) {
896                        //trim off the trailing PDB id from legacy files.
897                        //are we really trying to still cater for these museum pieces?
898
899                        logger.debug("trimming legacy PDB id from end of JRNL section line");
900
901                        line = line.substring(0, line.length() - 8);
902                        journalLines.add(line);
903                } else {
904                        journalLines.add(line);
905                }
906        }
907
908        /**
909         * This should not be accessed directly, other than by </code>makeCompounds</code>. It still deals with the same
910         * lines in a similar manner but if not accessed from </code>makeCompounds</code> the last element will be
911         * missing. Don't say I didn't warn you.
912         *
913         * @param line
914         */
915        private void pdb_COMPND_Handler(String line) {
916
917                logger.debug("previousContinuationField  is "
918                                        + previousContinuationField);
919                logger.debug("current continuationField  is "
920                                        + continuationField);
921                logger.debug("current continuationString is "
922                                        + continuationString);
923                logger.debug("current compound           is "
924                                        + current_compound);
925
926
927                // In legacy PDB files the line ends with the PDB code and a serial number, chop those off!
928                //format version 3.0 onwards will have 80 characters in a line
929                //              if (line.length() > 72) {
930                if (isLegacyFormat) {
931                        //                    if (DEBUG) {
932                        //                        System.out.println("We have a legacy file - truncating line length to 71 characters:");
933                        //                        System.out.println(line);
934                        //                    }
935                        line = line.substring(0, 72);
936                }
937
938                line = line.substring(10, line.length());
939
940
941                String[] fieldList = line.trim().split("\\s+");
942                int fl = fieldList.length;
943                if (fl > 0) {
944                        String field0 = fieldList[0];
945                        if (compndFieldValues.contains(field0)) {
946                                continuationField = field0;
947                                if (previousContinuationField.equals("")) {
948                                        previousContinuationField = continuationField;
949                                }
950                        } else if (field0.endsWith(";") && compndFieldValues.contains(field0.substring(0, field0.length()-1)) ) {
951                                // the ':' character indicates the end of a field name and should be invalid as part the first data token
952                                // e.g. obsolete file 1hhb has a malformed COMPND line that can only be caught with this kind of check
953                                // UPDATE: There is no harm of having a ':' in the first data token. e.g. 3fdj contains a ':'.
954                                //   The intended case occurs only if the token is a key followed by a colon and a semicolon without spaces, e.g. "COMPND   2 MOLECULE:;"
955                                logger.info("COMPND line does not follow the PDB 3.0 format. Note that COMPND parsing is not supported any longer in format 2.3 or earlier");
956                                return;
957                        }
958                } else {
959                        // the line will be added as data to the previous field
960                }
961
962
963                line = line.replace(continuationField, "").trim();
964
965                StringTokenizer compndTokens = new StringTokenizer(line);
966
967                //              System.out.println("PDBFileParser.pdb_COMPND_Handler: Tokenizing '" + line + "'");
968
969                while (compndTokens.hasMoreTokens()) {
970                        String token = compndTokens.nextToken();
971
972                        if (previousContinuationField.equals("")) {
973                                previousContinuationField = continuationField;
974                        }
975
976                        if (previousContinuationField.equals(continuationField)
977                                        && compndFieldValues.contains(continuationField)) {
978
979                                logger.debug("Still in field " + continuationField);
980                                logger.debug("token = " + token);
981
982                                continuationString = continuationString.concat(token + " ");
983
984                                logger.debug("continuationString = "
985                                                        + continuationString);
986
987                        }
988                        if (!continuationField.equals(previousContinuationField)) {
989
990                                if (continuationString.equals("")) {
991                                        continuationString = token;
992
993                                } else {
994
995                                        compndValueSetter(previousContinuationField,
996                                                        continuationString);
997                                        previousContinuationField = continuationField;
998                                        continuationString = token + " ";
999                                }
1000                        } else if (ignoreCompndFieldValues.contains(token)) {
1001                                // this field shall be ignored
1002                                //continuationField = token;
1003                        }
1004                }
1005                if (isLastCompndLine) {
1006                        // final line in the section - finish off the compound
1007                        //                      System.out.println("[pdb_COMPND_Handler] Final COMPND line - Finishing off final MolID header.");
1008                        compndValueSetter(continuationField, continuationString);
1009                        continuationString = "";
1010                        if (current_compound!=null) entities.add(current_compound);
1011                }
1012        }
1013
1014        /**
1015         * Set the value in the current molId object
1016         * @param field
1017         * @param value
1018         */
1019        private void compndValueSetter(String field, String value) {
1020
1021                value = value.trim().replace(";", "");
1022                if (field.equals("MOL_ID:")) {
1023
1024                        int i = -1;
1025                        try {
1026                                i = Integer.valueOf(value);
1027                        } catch (NumberFormatException e){
1028                                logger.warn("Value '{}' does not look like a number, while trying to parse COMPND MOL_ID line.",value);
1029                        }
1030                        if (i>0 && prevMolId!=i) {
1031
1032                                if (current_compound!=null) entities.add(current_compound);
1033
1034                                logger.debug("Initialising new Compound with mol_id {}", i);
1035
1036                                current_compound = new EntityInfo();
1037
1038                                current_compound.setMolId(i);
1039
1040                                // we will set polymer for all defined compounds in PDB file (non-polymer compounds are not defined in header) - JD 2016-03-25
1041                                current_compound.setType(EntityType.POLYMER);
1042
1043                                prevMolId = i;
1044                        }
1045
1046                }
1047
1048                // if for some reason (e.g. missing mol_id line) the current_compound is null we can't add anything to it, return
1049                if (current_compound==null) {
1050                        return;
1051                }
1052
1053                if (field.equals("MOLECULE:")) {
1054                        current_compound.setDescription(value);
1055
1056                }
1057                if (field.equals("CHAIN:")) {
1058                        //System.out.println(value);
1059                        StringTokenizer chainTokens = new StringTokenizer(value, ",");
1060                        List<String> chains = new ArrayList<String>();
1061
1062                        while (chainTokens.hasMoreTokens()) {
1063                                String chainID = chainTokens.nextToken().trim();
1064                                // NULL is used in old PDB files to represent empty chain DI
1065                                if (chainID.equals("NULL"))
1066                                        chainID = " ";
1067                                chains.add(chainID);
1068                        }
1069                        compoundMolIds2chainIds.put(current_compound.getMolId(),chains);
1070
1071                }
1072                if (field.equals("SYNONYM:")) {
1073
1074                        StringTokenizer synonyms = new StringTokenizer(value, ",");
1075                        List<String> names = new ArrayList<String>();
1076
1077                        while (synonyms.hasMoreTokens()) {
1078                                names.add(synonyms.nextToken());
1079
1080                                current_compound.setSynonyms(names);
1081                        }
1082
1083                }
1084
1085                if (field.equals("EC:")) {
1086
1087                        StringTokenizer ecNumTokens = new StringTokenizer(value, ",");
1088                        List<String> ecNums = new ArrayList<String>();
1089
1090                        while (ecNumTokens.hasMoreTokens()) {
1091                                ecNums.add(ecNumTokens.nextToken());
1092
1093                                current_compound.setEcNums(ecNums);
1094                        }
1095
1096                }
1097                if (field.equals("FRAGMENT:")) {
1098
1099                        current_compound.setFragment(value);
1100
1101                }
1102                if (field.equals("ENGINEERED:")) {
1103
1104                        current_compound.setEngineered(value);
1105
1106                }
1107                if (field.equals("MUTATION:")) {
1108
1109                        current_compound.setMutation(value);
1110
1111                }
1112                if (field.equals("BIOLOGICAL_UNIT:")) {
1113
1114                        current_compound.setBiologicalUnit(value);
1115
1116                }
1117                if (field.equals("OTHER_DETAILS:")) {
1118
1119                        current_compound.setDetails(value);
1120
1121                }
1122
1123        }
1124
1125
1126        /**
1127         * Handler for
1128         * SOURCE Record format
1129         *
1130         * The SOURCE record specifies the biological and/or chemical source of each biological molecule in the entry. Sources are described by both the common name and the scientific name, e.g., genus and species. Strain and/or cell-line for immortalized cells are given when they help to uniquely identify the biological entity studied.
1131         * Record Format
1132         * <pre>
1133         * COLUMNS   DATA TYPE         FIELD          DEFINITION
1134         * -------------------------------------------------------------------------------
1135         *  1 -  6   Record name       "SOURCE"
1136         *  9 - 10   Continuation      continuation   Allows concatenation of multiple records.
1137         * 11 - 70   Specification     srcName        Identifies the source of the macromolecule in
1138         *            list                            a token: value format.
1139         * </pre>
1140         * @param line the line to be parsed
1141         */
1142        private void pdb_SOURCE_Handler(String line) {
1143                // works in the same way as the pdb_COMPND_Handler.
1144                String continuationNr = line.substring(9, 10).trim();
1145
1146
1147
1148                logger.debug("current continuationNo     is "
1149                                + continuationNr);
1150                logger.debug("previousContinuationField  is "
1151                                + previousContinuationField);
1152                logger.debug("current continuationField  is "
1153                                + continuationField);
1154                logger.debug("current continuationString is "
1155                                + continuationString);
1156                logger.debug("current compound           is "
1157                                + current_compound);
1158
1159
1160                // following the docs, the last valid character should be 79, chop off the rest
1161                if (line.length() > 79) {
1162                        line = line.substring(0, 79);
1163                }
1164
1165                line = line.substring(10, line.length());
1166
1167                logger.debug("LINE: >" + line + "<");
1168
1169                String[] fieldList = line.split("\\s+");
1170
1171                if (!fieldList[0].equals("")
1172                                && sourceFieldValues.contains(fieldList[0])) {
1173                        //                      System.out.println("[PDBFileParser.pdb_COMPND_Handler] Setting continuationField to '" + fieldList[0] + "'");
1174                        continuationField = fieldList[0];
1175                        if (previousContinuationField.equals("")) {
1176                                previousContinuationField = continuationField;
1177                        }
1178
1179                } else if ((fieldList.length > 1) && ( sourceFieldValues.contains(fieldList[1]))) {
1180                        //                      System.out.println("[PDBFileParser.pdb_COMPND_Handler] Setting continuationField to '" + fieldList[1] + "'");
1181                        continuationField = fieldList[1];
1182                        if (previousContinuationField.equals("")) {
1183                                previousContinuationField = continuationField;
1184                        }
1185
1186                } else {
1187                        if (continuationNr.equals("")) {
1188
1189                                logger.debug("looks like an old PDB file");
1190
1191                                continuationField = "MOLECULE:";
1192                                if (previousContinuationField.equals("")) {
1193                                        previousContinuationField = continuationField;
1194                                }
1195                        }
1196
1197                }
1198
1199                line = line.replace(continuationField, "").trim();
1200
1201                StringTokenizer compndTokens = new StringTokenizer(line);
1202
1203                //              System.out.println("PDBFileParser.pdb_COMPND_Handler: Tokenizing '" + line + "'");
1204
1205                while (compndTokens.hasMoreTokens()) {
1206                        String token = compndTokens.nextToken();
1207
1208                        if (previousContinuationField.equals("")) {
1209                                //                              System.out.println("previousContinuationField is empty. Setting to : " + continuationField);
1210                                previousContinuationField = continuationField;
1211                        }
1212
1213                        if (previousContinuationField.equals(continuationField)
1214                                        && sourceFieldValues.contains(continuationField)) {
1215
1216                                logger.debug("Still in field " + continuationField);
1217
1218                                continuationString = continuationString.concat(token + " ");
1219
1220                                logger.debug("continuationString = "
1221                                                        + continuationString);
1222                        }
1223                        if (!continuationField.equals(previousContinuationField)) {
1224
1225                                if (continuationString.equals("")) {
1226                                        continuationString = token;
1227
1228                                } else {
1229
1230                                        sourceValueSetter(previousContinuationField,
1231                                                        continuationString);
1232                                        previousContinuationField = continuationField;
1233                                        continuationString = token + " ";
1234                                }
1235                        } else if (ignoreCompndFieldValues.contains(token)) {
1236                                // this field shall be ignored
1237                                //continuationField = token;
1238                        }
1239                }
1240                if (isLastSourceLine) {
1241                        // final line in the section - finish off the compound
1242                        //                      System.out.println("[pdb_SOURCE_Handler] Final SOURCE line - Finishing off final MolID header.");
1243                        sourceValueSetter(continuationField, continuationString);
1244                        continuationString = "";
1245                        //compounds.add(current_compound);
1246                }
1247
1248        }
1249
1250
1251        /**
1252         * Set the value in the current molId object
1253         *
1254         * @param field
1255         * @param value
1256         */
1257        private void sourceValueSetter(String field, String value) {
1258
1259                value = value.trim().replace(";", "");
1260                //              System.out.println("[sourceValueSetter] " + field);
1261                if (field.equals("MOL_ID:")) {
1262
1263                        try {
1264                                current_compound = entities.get(Integer.valueOf(value) - 1);
1265                        } catch (NumberFormatException e){
1266                                logger.info("could not process SOURCE MOL_ID record correctly:" + e.getMessage());
1267                                return;
1268                        }
1269
1270
1271                        //                      System.out.println("[sourceValueSetter] Fetching compound " + value + " " + current_compound.getMolId());
1272
1273                }
1274                if (field.equals("SYNTHETIC:")) {
1275                        current_compound.setSynthetic(value);
1276                } else if (field.equals("FRAGMENT:")) {
1277                        current_compound.setFragment(value);
1278                } else if (field.equals("ORGANISM_SCIENTIFIC:")) {
1279                        current_compound.setOrganismScientific(value);
1280                } else if (field.equals("ORGANISM_TAXID:")) {
1281                        current_compound.setOrganismTaxId(value);
1282                } else if (field.equals("ORGANISM_COMMON:")) {
1283                        current_compound.setOrganismCommon(value);
1284                } else if (field.equals("STRAIN:")) {
1285                        current_compound.setStrain(value);
1286                } else if (field.equals("VARIANT:")) {
1287                        current_compound.setVariant(value);
1288                } else if (field.equals("CELL_LINE:")) {
1289                        current_compound.setCellLine(value);
1290                } else if (field.equals("ATCC:")) {
1291                        current_compound.setAtcc(value);
1292                } else if (field.equals("ORGAN:")) {
1293                        current_compound.setOrgan(value);
1294                } else if (field.equals("TISSUE:")) {
1295                        current_compound.setTissue(value);
1296                } else if (field.equals("CELL:")) {
1297                        current_compound.setCell(value);
1298                } else if (field.equals("ORGANELLE:")) {
1299                        current_compound.setOrganelle(value);
1300                } else if (field.equals("SECRETION:")) {
1301                        current_compound.setSecretion(value);
1302                } else if (field.equals("GENE:")) {
1303                        current_compound.setGene(value);
1304                } else if (field.equals("CELLULAR_LOCATION:")) {
1305                        current_compound.setCellularLocation(value);
1306                } else if (field.equals("EXPRESSION_SYSTEM:")) {
1307                        current_compound.setExpressionSystem(value);
1308                } else if (field.equals("EXPRESSION_SYSTEM_TAXID:")) {
1309                        current_compound.setExpressionSystemTaxId(value);
1310                } else if (field.equals("EXPRESSION_SYSTEM_STRAIN:")) {
1311                        current_compound.setExpressionSystemStrain(value);
1312                } else if (field.equals("EXPRESSION_SYSTEM_VARIANT:")) {
1313                        current_compound.setExpressionSystemVariant(value);
1314                } else if (field.equals("EXPRESSION_SYSTEM_CELL_LINE:")) {
1315                        current_compound.setExpressionSystemCellLine(value);
1316                } else if (field.equals("EXPRESSION_SYSTEM_ATCC_NUMBER:")) {
1317                        current_compound.setExpressionSystemAtccNumber(value);
1318                } else if (field.equals("EXPRESSION_SYSTEM_ORGAN:")) {
1319                        current_compound.setExpressionSystemOrgan(value);
1320                } else if (field.equals("EXPRESSION_SYSTEM_TISSUE:")) {
1321                        current_compound.setExpressionSystemTissue(value);
1322                } else if (field.equals("EXPRESSION_SYSTEM_CELL:")) {
1323                        current_compound.setExpressionSystemCell(value);
1324                } else if (field.equals("EXPRESSION_SYSTEM_ORGANELLE:")) {
1325                        current_compound.setExpressionSystemOrganelle(value);
1326                } else if (field.equals("EXPRESSION_SYSTEM_CELLULAR_LOCATION:")) {
1327                        current_compound.setExpressionSystemCellularLocation(value);
1328                } else if (field.equals("EXPRESSION_SYSTEM_VECTOR_TYPE:")) {
1329                        current_compound.setExpressionSystemVectorType(value);
1330                } else if (field.equals("EXPRESSION_SYSTEM_VECTOR:")) {
1331                        current_compound.setExpressionSystemVector(value);
1332                } else if (field.equals("EXPRESSION_SYSTEM_PLASMID:")) {
1333                        current_compound.setExpressionSystemPlasmid(value);
1334                } else if (field.equals("EXPRESSION_SYSTEM_GENE:")) {
1335                        current_compound.setExpressionSystemGene(value);
1336                } else if (field.equals("OTHER_DETAILS:")) {
1337                        current_compound.setExpressionSystemOtherDetails(value);
1338                }
1339
1340        }
1341
1342        /**
1343         * Handler for REMARK lines
1344         */
1345        private void pdb_REMARK_Handler(String line) {
1346
1347                if ( line == null || line.length() < 11)
1348                        return;
1349
1350
1351                if (line.startsWith("REMARK 800")) {
1352                        pdb_REMARK_800_Handler(line);
1353
1354                }  else if ( line.startsWith("REMARK 350")){
1355
1356                        if ( params.isParseBioAssembly()) {
1357
1358                                if (bioAssemblyParser == null){
1359                                        bioAssemblyParser = new PDBBioAssemblyParser();
1360                                }
1361
1362                                bioAssemblyParser.pdb_REMARK_350_Handler(line);
1363                        }
1364
1365                // REMARK 3 (for R free)
1366                // note: if more than 1 value present (occurring in hybrid experimental technique entries, e.g. 3ins, 4n9m)
1367                // then last one encountered will be taken
1368                } else if (line.startsWith("REMARK   3   FREE R VALUE")) {
1369
1370                        // Rfree annotation is not very consistent in PDB format, it varies depending on the software
1371                        // Here we follow this strategy:
1372                        // a) take the '(NO CUTOFF)' value if the only one available (shelx software, e.g. 1x7q)
1373                        // b) don't take it if also a line without '(NO CUTOFF)' is present (CNX software, e.g. 3lak)
1374
1375                        Pattern pR = Pattern.compile("^REMARK   3   FREE R VALUE\\s+(?:\\(NO CUTOFF\\))?\\s+:\\s+(\\d?\\.\\d+).*");
1376                        Matcher mR = pR.matcher(line);
1377                        if (mR.matches()) {
1378                                try {
1379                                        rfreeNoCutoffLine = Float.parseFloat(mR.group(1));
1380                                } catch (NumberFormatException e) {
1381                                        logger.info("Rfree value "+mR.group(1)+" does not look like a number, will ignore it");
1382                                }
1383                        }
1384                        pR = Pattern.compile("^REMARK   3   FREE R VALUE\\s+:\\s+(\\d?\\.\\d+).*");
1385                        mR = pR.matcher(line);
1386                        if (mR.matches()) {
1387                                try {
1388                                        rfreeStandardLine = Float.parseFloat(mR.group(1));
1389                                } catch (NumberFormatException e) {
1390                                        logger.info("Rfree value '{}' does not look like a number, will ignore it", mR.group(1));
1391                                }
1392                        }
1393
1394                // REMARK 3 RESOLUTION (contains more info than REMARK 2, for instance multiple resolutions in hybrid experimental technique entries)
1395                // note: if more than 1 value present (occurring in hybrid experimental technique entries, e.g. 3ins, 4n9m)
1396                // then last one encountered will be taken
1397                } else if (line.startsWith("REMARK   3   RESOLUTION RANGE HIGH")){
1398                        Pattern pR = Pattern.compile("^REMARK   3   RESOLUTION RANGE HIGH \\(ANGSTROMS\\) :\\s+(\\d+\\.\\d+).*");
1399                        Matcher mR = pR.matcher(line);
1400                        if (mR.matches()) {
1401                                try {
1402                                        float res = Float.parseFloat(mR.group(1));
1403                                        if (pdbHeader.getResolution()!=PDBHeader.DEFAULT_RESOLUTION) {
1404                                                logger.warn("More than 1 resolution value present, will use last one {} and discard previous {} "
1405                                                                ,mR.group(1), String.format("%4.2f",pdbHeader.getResolution()));
1406                                        }
1407                                        pdbHeader.setResolution(res);
1408                                } catch (NumberFormatException e) {
1409                                        logger.info("Could not parse resolution '{}', ignoring it",mR.group(1));
1410                                }
1411                        }
1412                }
1413
1414        }
1415
1416
1417
1418
1419
1420
1421        /**
1422         * Handler for
1423         * EXPDTA Record Format
1424        <pre>
1425         COLUMNS       DATA TYPE      FIELD         DEFINITION
1426         -------------------------------------------------------------------------------
1427         1 -  6       Record name    "EXPDTA"
1428         9 - 10       Continuation   continuation  Allows concatenation of multiple
1429         records.
1430         11 - 70       SList          technique     The experimental technique(s) with
1431         optional comment describing the
1432         sample or experiment.
1433
1434         allowed techniques are:
1435         ELECTRON DIFFRACTION
1436         FIBER DIFFRACTION
1437         FLUORESCENCE TRANSFER
1438         NEUTRON DIFFRACTION
1439         NMR
1440         THEORETICAL MODEL
1441         X-RAY DIFFRACTION
1442        </pre>
1443         */
1444        private void pdb_EXPDTA_Handler(String line) {
1445
1446                String technique  ;
1447                if (line.length() > 69)
1448                        technique = line.substring (10, 70).trim() ;
1449                else
1450                        technique = line.substring(10).trim();
1451
1452                for (String singleTechnique: technique.split(";\\s+")) {
1453                        pdbHeader.setExperimentalTechnique(singleTechnique);
1454                }
1455
1456
1457        }
1458
1459        /**
1460         * Handler for
1461         * CRYST1 Record Format
1462         * The CRYST1 record presents the unit cell parameters, space group, and Z value.
1463         * If the entry describes a structure determined by a technique other than X-ray crystallography,
1464         * CRYST1 contains a = b = c = 1.0, alpha = beta = gamma = 90 degrees, space group = P 1, and Z =1.
1465         * <pre>
1466         * COLUMNS DATA TYPE    FIELD          DEFINITION
1467         * -------------------------------------------------------------
1468         *  1 - 6  Record name  "CRYST1"
1469         *  7 - 15 Real(9.3)    a              a (Angstroms).
1470         * 16 - 24 Real(9.3)    b              b (Angstroms).
1471         * 25 - 33 Real(9.3)    c              c (Angstroms).
1472         * 34 - 40 Real(7.2)    alpha          alpha (degrees).
1473         * 41 - 47 Real(7.2)    beta           beta (degrees).
1474         * 48 - 54 Real(7.2)    gamma          gamma (degrees).
1475         * 56 - 66 LString      sGroup         Space group.
1476         * 67 - 70 Integer      z              Z value.
1477         * </pre>
1478         */
1479        private void pdb_CRYST1_Handler(String line) {
1480                // for badly formatted files (e.g. phenix-produced ones), there's no z and the min length is 58 (e.g. for SG 'P 1')
1481                if (line.length() < 58) {
1482                        logger.warn("CRYST1 record has fewer than 58 columns: will ignore it");
1483                        return;
1484                }
1485
1486                float a;
1487                float b;
1488                float c;
1489                float alpha;
1490                float beta;
1491                float gamma;
1492                String spaceGroup = "";
1493
1494                try {
1495                        a = Float.parseFloat(line.substring(6,15).trim());
1496                        b = Float.parseFloat(line.substring(15,24).trim());
1497                        c = Float.parseFloat(line.substring(24,33).trim());
1498                        alpha = Float.parseFloat(line.substring(33,40).trim());
1499                        beta = Float.parseFloat(line.substring(40,47).trim());
1500                        gamma = Float.parseFloat(line.substring(47,54).trim());
1501                } catch (NumberFormatException e) {
1502                        logger.info("could not parse CRYST1 record ("+e.getMessage()+") from line and ignoring it " + line);
1503                        return ;
1504                }
1505                if (line.length()>=66) {
1506                        // for well formatted files
1507                        spaceGroup = line.substring(55,66).trim();
1508                } else {
1509                        // for not-so-well formatted files, e.g. phenix-produced ones: they lack a Z value
1510                        spaceGroup = line.substring(55,line.length()).trim();
1511                }
1512
1513                CrystalCell xtalCell = new CrystalCell();
1514                xtalCell.setA(a);
1515                xtalCell.setB(b);
1516                xtalCell.setC(c);
1517                xtalCell.setAlpha(alpha);
1518                xtalCell.setBeta(beta);
1519                xtalCell.setGamma(gamma);
1520
1521                if (!xtalCell.isCellReasonable()) {
1522                        // If the entry describes a structure determined by a technique other than X-ray crystallography,
1523                        // CRYST1 contains a = b = c = 1.0, alpha = beta = gamma = 90 degrees, space group = P 1, and Z =1.
1524                        // if so we don't add the crystal cell and it remains null
1525                        logger.debug("The crystal cell read from file does not have reasonable dimensions (at least one dimension is below {}), discarding it.",
1526                                        CrystalCell.MIN_VALID_CELL_SIZE);
1527                } else {
1528                        crystallographicInfo.setCrystalCell(xtalCell);
1529                }
1530
1531                SpaceGroup sg = SymoplibParser.getSpaceGroup(spaceGroup);
1532                if (sg==null) {
1533                        logger.warn("Space group '"+spaceGroup+"' not recognised as a standard space group");
1534                        crystallographicInfo.setNonStandardSg(true);
1535                } else {
1536                        crystallographicInfo.setSpaceGroup(sg);
1537                        crystallographicInfo.setNonStandardSg(false);
1538                }
1539        }
1540
1541        /**
1542         * Handler for MTRIXn records. They specify extra NCS operators (usually in virus entries)
1543         *
1544         * See http://www.wwpdb.org/documentation/format33/sect8.html#MTRIXn
1545         * <pre>
1546         * COLUMNS        DATA TYPE     FIELD         DEFINITION
1547         * -------------------------------------------------------------
1548         *
1549         *  1 -  6        Record name   "MTRIXn"      n=1, 2, or 3
1550         *  8 - 10        Integer       serial        Serial number.
1551         * 11 - 20        Real(10.6)    m[n][1]       Mn1
1552         * 21 - 30        Real(10.6)    m[n][2]       Mn2
1553         * 31 - 40        Real(10.6)    m[n][3]       Mn3
1554         * 46 - 55        Real(10.5)    v[n]          Vn
1555         * 60             Integer       iGiven        1
1556         *
1557         * </pre>
1558         * Note that we ignore operators with iGiven==1
1559         *
1560         * @param line
1561         */
1562        private void pdb_MTRIXn_Handler(String line) {
1563
1564                // don't process incomplete records
1565                if (line.length() < 55) {
1566                        logger.info("MTRIXn record has fewer than 55 columns: will ignore it");
1567                        return;
1568                }
1569
1570
1571                try {
1572
1573                        int rowIndex = Integer.parseInt(line.substring(5,6));
1574                        double col1Value = Double.parseDouble(line.substring(10,20));
1575                        double col2Value = Double.parseDouble(line.substring(20,30));
1576                        double col3Value = Double.parseDouble(line.substring(30,40));
1577                        double translValue = Double.parseDouble(line.substring(45,55));
1578                        int iGiven = 0;
1579                        if (line.length()>=60 && !line.substring(59,60).trim().isEmpty()) {
1580                                iGiven = Integer.parseInt(line.substring(59,60));
1581                        }
1582
1583                        if (iGiven == 1) return;
1584
1585                        if (ncsOperators==null) {
1586                                // we initialise on first pass
1587                                ncsOperators = new ArrayList<Matrix4d>();
1588                        }
1589
1590                        if (currentNcsOp==null) {
1591                                currentNcsOp = new Matrix4d(1,0,0,0,  0,1,0,0,  0,0,1,0,  0,0,0,1); // initialised to identity
1592                        }
1593
1594                        currentNcsOp.setElement(rowIndex-1, 0, col1Value);
1595                        currentNcsOp.setElement(rowIndex-1, 1, col2Value);
1596                        currentNcsOp.setElement(rowIndex-1, 2, col3Value);
1597                        currentNcsOp.setElement(rowIndex-1, 3, translValue);
1598
1599
1600                        if (rowIndex==3) {
1601                                ncsOperators.add(currentNcsOp);
1602                                // we initialise for next matrix to come
1603                                currentNcsOp = new Matrix4d(1,0,0,0,  0,1,0,0,  0,0,1,0,  0,0,0,1); // initialised to identity
1604                        }
1605
1606                } catch (NumberFormatException e) {
1607                        logger.info("Could not parse a number in MTRIXn record ("+e.getMessage()+") from line: >" + line+"<");
1608                }
1609        }
1610
1611        /**
1612         * Handler for ATOM.
1613         * Record Format:
1614         *
1615         * <pre>
1616         * ATOM      1  N   ASP A  15     110.964  24.941  59.191  1.00 83.44           N
1617         *
1618         * COLUMNS        DATA TYPE       FIELD         DEFINITION
1619         * ---------------------------------------------------------------------------------
1620         * 1 -  6        Record name     "ATOM  "
1621         * 7 - 11        Integer         serial        Atom serial number.
1622         * 13 - 16        Atom            name          Atom name.
1623         * 17             Character       altLoc        Alternate location indicator.
1624         * 18 - 20        Residue name    resName       Residue name.
1625         * 22             Character       chainID       Chain identifier.
1626         * 23 - 26        Integer         resSeq        Residue sequence number.
1627         * 27             AChar           iCode         Code for insertion of residues.
1628         * 31 - 38        Real(8.3)       x             Orthogonal coordinates for X in Angstroms.
1629         * 39 - 46        Real(8.3)       y             Orthogonal coordinates for Y in Angstroms.
1630         * 47 - 54        Real(8.3)       z             Orthogonal coordinates for Z in Angstroms.
1631         * 55 - 60        Real(6.2)       occupancy     Occupancy.
1632         * 61 - 66        Real(6.2)       tempFactor    Temperature factor.
1633         * 73 - 76        LString(4)      segID         Segment identifier, left-justified.
1634         * 77 - 78        LString(2)      element       Element symbol, right-justified.
1635         * 79 - 80        LString(2)      charge        Charge on the atom.
1636         * </pre>
1637         */
1638        private void  pdb_ATOM_Handler(String line)     {
1639
1640                if ( params.isHeaderOnly())
1641                        return;
1642
1643                // let's first get the chain name which will serve to identify if we are starting a new molecule
1644                String chainName      = line.substring(21,22);
1645
1646                if (chainName.equals(" ")) {
1647                        blankChainIdsPresent = true;
1648                }
1649
1650                if (currentChain!=null && !currentChain.getName().equals(chainName)) {
1651                        // new chain name: another molecule coming
1652                        startOfMolecule = true;
1653                }
1654
1655                if (startOfMolecule) {
1656                        // we add last chain if there was one
1657                        if (currentChain!=null) {
1658                                currentModel.add(currentChain);
1659                                // let's not forget adding the last group to the finishing chain
1660                                if (currentGroup!=null) {
1661                                        currentChain.addGroup(currentGroup);
1662                                }
1663                        }
1664                        // we initialise the new molecule to come
1665                        currentChain = new ChainImpl();
1666                        // note that the chainId (asym id) is set properly later in assignAsymIds
1667                        currentChain.setId(chainName);
1668                        currentChain.setName(chainName);
1669
1670                }
1671
1672                if (startOfModel) {
1673                        // we add last model if there was one
1674                        if (currentModel!=null) {
1675                                allModels.add(currentModel);
1676                        }
1677                        // we initialise the model to come
1678                        currentModel = new ArrayList<>();
1679                }
1680
1681
1682                // let's get the residue number and see if we need to start a new group
1683
1684                String groupCode3     = line.substring(17,20).trim();
1685                String resNum  = line.substring(22,26).trim();
1686                Character iCode = line.substring(26,27).charAt(0);
1687                if ( iCode == ' ')
1688                        iCode = null;
1689                ResidueNumber residueNumber = new ResidueNumber(chainName, Integer.valueOf(resNum), iCode);
1690
1691                //recordName      groupCode3
1692                //|                |    resNum
1693                //|                |    |   iCode
1694                //|     |          | |  |   ||
1695                //ATOM      1  N   ASP A  15     110.964  24.941  59.191  1.00 83.44           N
1696                //ATOM   1964  N   ARG H 221A      5.963 -16.715  27.669  1.00 28.59           N
1697
1698                Character aminoCode1 = StructureTools.get1LetterCode(groupCode3);
1699
1700                String recordName     = line.substring (0, 6).trim ();
1701
1702                boolean isHetAtomInFile = false;
1703
1704                if (recordName.equals("HETATM") ){
1705                        // HETATOM RECORDS are treated slightly differently
1706                        // some modified amino acids that we want to treat as amino acids
1707                        // can be found as HETATOM records
1708                        if ( aminoCode1 != null && aminoCode1.equals(StructureTools.UNKNOWN_GROUP_LABEL))
1709                                        aminoCode1 = null;
1710
1711                        isHetAtomInFile = true;
1712                }
1713
1714                if ( startOfMolecule) {
1715
1716                        currentGroup = getNewGroup(recordName, aminoCode1, groupCode3);
1717
1718                        currentGroup.setPDBName(groupCode3);
1719                        currentGroup.setResidueNumber(residueNumber);
1720                        currentGroup.setHetAtomInFile(isHetAtomInFile);
1721
1722                }
1723
1724                // resetting states
1725                startOfModel = false;
1726                startOfMolecule = false;
1727
1728
1729                Character altLoc   = new Character(line.substring (16, 17).charAt(0));
1730                Group altGroup = null;
1731
1732
1733                // check if residue number is the same ...
1734                if ( ! residueNumber.equals(currentGroup.getResidueNumber())) {
1735
1736                        currentChain.addGroup(currentGroup);
1737                        currentGroup.trimToSize();
1738
1739                        currentGroup = getNewGroup(recordName, aminoCode1, groupCode3);
1740
1741                        currentGroup.setPDBName(groupCode3);
1742                        currentGroup.setResidueNumber(residueNumber);
1743                        currentGroup.setHetAtomInFile(isHetAtomInFile);
1744
1745                } else {
1746                        // same residueNumber, but altLocs...
1747
1748                        // test altLoc
1749                        if ( ! altLoc.equals(' ')) {
1750                                logger.debug("found altLoc! " + currentGroup + " " + altGroup);
1751                                altGroup = getCorrectAltLocGroup( altLoc,recordName,aminoCode1,groupCode3);
1752                                if ( altGroup.getChain() == null) {
1753                                        // need to set current chain
1754                                        altGroup.setChain(currentChain);
1755                                }
1756
1757                        }
1758                }
1759
1760                atomCount++;
1761
1762                if ( atomCount == atomCAThreshold ) {
1763                        // throw away the SEQRES lines - too much to deal with...
1764                        logger.warn("more than " + atomCAThreshold + " atoms in this structure, ignoring the SEQRES lines");
1765                        seqResChains.clear();
1766
1767                        switchCAOnly();
1768
1769                }
1770
1771
1772
1773                if ( atomCount == loadMaxAtoms){
1774                        logger.warn("File has more atoms than max specified in parsing parameters ({}). Ignoring atoms after line: {}", loadMaxAtoms, line);
1775                        return;
1776                }
1777                if ( atomCount > loadMaxAtoms){
1778                        return;
1779                }
1780
1781
1782                //          1         2         3         4         5         6
1783                //012345678901234567890123456789012345678901234567890123456789
1784                //ATOM      1  N   MET     1      20.154  29.699   5.276   1.0
1785                //ATOM    112  CA  ASP   112      41.017  33.527  28.371  1.00  0.00
1786                //ATOM     53  CA  MET     7      23.772  33.989 -21.600  1.00  0.00           C
1787                //ATOM    112  CA  ASP   112      37.613  26.621  33.571     0     0
1788
1789
1790                String fullname = line.substring (12, 16);
1791
1792                // check for CA only if requested
1793                if ( parseCAonly ){
1794                        // yes , user wants to get CA only
1795                        // only parse CA atoms...
1796                        if (! fullname.equals(" CA ")){
1797                                //System.out.println("ignoring " + line);
1798                                atomCount--;
1799                                return;
1800                        }
1801                }
1802
1803                if ( params.getAcceptedAtomNames() != null) {
1804
1805                        boolean found = false;
1806                        for (String ok : params.getAcceptedAtomNames()){
1807                                //System.out.println(ok + "< >" + fullname +"<");
1808
1809                                if ( ok.equals(fullname.trim())) {
1810                                        found = true;
1811                                        break;
1812                                }
1813                        }
1814                        if ( ! found) {
1815                                atomCount--;
1816                                return;
1817                        }
1818                }
1819                // create new atom
1820
1821                int pdbnumber = Integer.parseInt (line.substring (6, 11).trim ());
1822                AtomImpl atom = new AtomImpl() ;
1823                atom.setPDBserial(pdbnumber) ;
1824
1825                atom.setAltLoc(altLoc);
1826                atom.setName(fullname.trim());
1827
1828                double x = Double.parseDouble (line.substring (30, 38).trim());
1829                double y = Double.parseDouble (line.substring (38, 46).trim());
1830                double z = Double.parseDouble (line.substring (46, 54).trim());
1831
1832                double[] coords = new double[3];
1833                coords[0] = x ;
1834                coords[1] = y ;
1835                coords[2] = z ;
1836                atom.setCoords(coords);
1837
1838                float occu  = 1.0f;
1839                if ( line.length() > 59 ) {
1840                        try {
1841                                // occu and tempf are sometimes not used :-/
1842                                occu = Float.parseFloat (line.substring (54, 60).trim());
1843                        }  catch (NumberFormatException e){}
1844                }
1845
1846                float tempf = 0.0f;
1847                if ( line.length() > 65) {
1848                        try {
1849                                tempf = Float.parseFloat (line.substring (60, 66).trim());
1850                        }  catch (NumberFormatException e){}
1851                }
1852
1853                atom.setOccupancy(  occu  );
1854                atom.setTempFactor( tempf );
1855
1856
1857
1858
1859                // Parse element from the element field. If this field is
1860                // missing (i.e. misformatted PDB file), then parse the
1861                // element from the chemical component.
1862                Element element = Element.R;
1863                boolean guessElement = true;
1864                if ( line.length() > 77 ) {
1865                        // parse element from element field
1866                        String elementSymbol = line.substring(76, 78).trim();
1867                        if (elementSymbol.isEmpty()) {
1868                                logger.info("Element column was empty for atom {} {}. Assigning atom element "
1869                                                + "from Chemical Component Dictionary information", fullname.trim(), pdbnumber);
1870                        } else {
1871
1872                        try {
1873                                        element = Element.valueOfIgnoreCase(elementSymbol);
1874                                        guessElement = false;
1875                                }  catch (IllegalArgumentException e){
1876                                        logger.info("Element {} of atom {} {} was not recognised. Assigning atom element "
1877                                                        + "from Chemical Component Dictionary information", elementSymbol,
1878                                                        fullname.trim(), pdbnumber);
1879                                }
1880                        }
1881                } else {
1882                        logger.info("Missformatted PDB file: element column of atom {} {} is not present. "
1883                                        + "Assigning atom element from Chemical Component Dictionary information",
1884                                        fullname.trim(), pdbnumber);
1885                }
1886                if (guessElement) {
1887                        String elementSymbol = null;
1888                        if (currentGroup.getChemComp() != null) {
1889                                for (ChemCompAtom a : currentGroup.getChemComp().getAtoms()) {
1890                                        if (a.getAtomId().equals(fullname.trim())) {
1891                                                elementSymbol = a.getTypeSymbol();
1892                                                break;
1893                                        }
1894                                }
1895                                if (elementSymbol == null) {
1896                                        logger.info("Atom name {} was not found in the Chemical Component Dictionary information of {}. "
1897                                                        + "Assigning generic element R to it", fullname.trim(), currentGroup.getPDBName());
1898                        } else {
1899                        try {
1900                                element = Element.valueOfIgnoreCase(elementSymbol);
1901                                        } catch (IllegalArgumentException e) {
1902                                                // this can still happen for cases like UNK
1903                                                logger.info("Element symbol {} found in chemical component dictionary for Atom {} {} could not be recognised as a known element. "
1904                                                                + "Assigning generic element R to it", elementSymbol, fullname.trim(), pdbnumber);
1905                                        }
1906                                }
1907                        } else {
1908                                logger.warn("Chemical Component Dictionary information was not found for Atom name {}. "
1909                                                + "Assigning generic element R to it", fullname.trim());
1910                        }
1911
1912                }
1913                atom.setElement(element);
1914
1915
1916                //see if chain_id is one of the previous chains ...
1917                if ( altGroup != null) {
1918                        altGroup.addAtom(atom);
1919                        altGroup = null;
1920                }
1921                else {
1922                        currentGroup.addAtom(atom);
1923                }
1924
1925
1926                // make sure that main group has all atoms
1927                // GitHub issue: #76
1928                if ( ! currentGroup.hasAtom(atom.getName())) {
1929                        currentGroup.addAtom(atom);
1930                }
1931
1932
1933
1934                        }
1935
1936
1937        private Group getCorrectAltLocGroup( Character altLoc,
1938                        String recordName, Character aminoCode1, String groupCode3) {
1939
1940                // see if we know this altLoc already;
1941                List<Atom> atoms = currentGroup.getAtoms();
1942                if ( atoms.size() > 0) {
1943                        Atom a1 = atoms.get(0);
1944                        // we are just adding atoms to the current group
1945                        // probably there is a second group following later...
1946                        if (a1.getAltLoc().equals(altLoc)) {
1947
1948                                return currentGroup;
1949                        }
1950                }
1951
1952                List<Group> altLocs = currentGroup.getAltLocs();
1953                for ( Group altLocG : altLocs ){
1954                        atoms = altLocG.getAtoms();
1955                        if ( atoms.size() > 0) {
1956                                for ( Atom a1 : atoms) {
1957                                        if (a1.getAltLoc().equals( altLoc)) {
1958
1959                                                return altLocG;
1960                                        }
1961                                }
1962                        }
1963                }
1964
1965                // no matching altLoc group found.
1966                // build it up.
1967
1968                if ( groupCode3.equals(currentGroup.getPDBName())) {
1969                        if ( currentGroup.getAtoms().size() == 0) {
1970                                //System.out.println("current group is empty " + current_group + " " + altLoc);
1971                                return currentGroup;
1972                        }
1973                        //System.out.println("cloning current group " + current_group + " " + current_group.getAtoms().get(0).getAltLoc() + " altLoc " + altLoc);
1974                        Group altLocG = (Group) currentGroup.clone();
1975                        // drop atoms from cloned group...
1976                        // https://redmine.open-bio.org/issues/3307
1977                        altLocG.setAtoms(new ArrayList<Atom>());
1978                        altLocG.getAltLocs().clear();
1979                        currentGroup.addAltLoc(altLocG);
1980                        return altLocG;
1981                }
1982
1983                //      System.out.println("new  group " + recordName + " " + aminoCode1 + " " +groupCode3);
1984                Group altLocG = getNewGroup(recordName,aminoCode1,groupCode3);
1985
1986
1987                altLocG.setPDBName(groupCode3);
1988
1989                altLocG.setResidueNumber(currentGroup.getResidueNumber());
1990                currentGroup.addAltLoc(altLocG);
1991                return altLocG;
1992        }
1993
1994        private void switchCAOnly(){
1995                parseCAonly = true;
1996
1997
1998                currentModel = CAConverter.getRepresentativeAtomsOnly(currentModel);
1999
2000                for ( int i =0; i< structure.nrModels() ; i++){
2001                        //  iterate over all known models ...
2002                        List<Chain> model = structure.getModel(i);
2003                        model = CAConverter.getRepresentativeAtomsOnly(model);
2004                        structure.setModel(i,model);
2005                }
2006
2007                currentChain = CAConverter.getRepresentativeAtomsOnly(currentChain);
2008
2009        }
2010
2011
2012        /** safes repeating a few lines ... */
2013        private Integer conect_helper (String line,int start,int end) {
2014                if (line.length() < end) return null;
2015
2016                String sbond = line.substring(start,end).trim();
2017                int bond  = -1 ;
2018                Integer b = null ;
2019
2020                if ( ! sbond.equals("")) {
2021                        bond = Integer.parseInt(sbond);
2022                        b = new Integer(bond);
2023                }
2024
2025                return b ;
2026        }
2027
2028        /**
2029         * Handler for CONECT Record Format
2030        <pre>
2031         COLUMNS         DATA TYPE        FIELD           DEFINITION
2032         ---------------------------------------------------------------------------------
2033         1 -  6         Record name      "CONECT"
2034         7 - 11         Integer          serial          Atom serial number
2035         12 - 16         Integer          serial          Serial number of bonded atom
2036         17 - 21         Integer          serial          Serial number of bonded atom
2037         22 - 26         Integer          serial          Serial number of bonded atom
2038         27 - 31         Integer          serial          Serial number of bonded atom
2039         32 - 36         Integer          serial          Serial number of hydrogen bonded
2040         atom
2041         37 - 41         Integer          serial          Serial number of hydrogen bonded
2042         atom
2043         42 - 46         Integer          serial          Serial number of salt bridged
2044         atom
2045         47 - 51         Integer          serial          Serial number of hydrogen bonded
2046         atom
2047         52 - 56         Integer          serial          Serial number of hydrogen bonded
2048         atom
2049         57 - 61         Integer          serial          Serial number of salt bridged
2050         atom
2051         </pre>
2052         */
2053        private void pdb_CONECT_Handler(String line) {
2054
2055                if ( atomOverflow) {
2056                        return ;
2057                }
2058                if (params.isHeaderOnly()) {
2059                        return;
2060                }
2061
2062                // this try .. catch is e.g. to catch 1gte which has wrongly formatted lines...
2063                try {
2064                        int atomserial = Integer.parseInt (line.substring(6 ,11).trim());
2065                        Integer bond1      = conect_helper(line,11,16);
2066                        Integer bond2      = conect_helper(line,16,21);
2067                        Integer bond3      = conect_helper(line,21,26);
2068                        Integer bond4      = conect_helper(line,26,31);
2069                        Integer hyd1       = conect_helper(line,31,36);
2070                        Integer hyd2       = conect_helper(line,36,41);
2071                        Integer salt1      = conect_helper(line,41,46);
2072                        Integer hyd3       = conect_helper(line,46,51);
2073                        Integer hyd4       = conect_helper(line,51,56);
2074                        Integer salt2      = conect_helper(line,56,61);
2075
2076                        //System.out.println(atomserial+ " "+ bond1 +" "+bond2+ " " +bond3+" "+bond4+" "+
2077                        //                 hyd1+" "+hyd2 +" "+salt1+" "+hyd3+" "+hyd4+" "+salt2);
2078                        HashMap<String, Integer> cons = new HashMap<String, Integer>();
2079                        cons.put("atomserial",new Integer(atomserial));
2080
2081                        if ( bond1 != null) cons.put("bond1",bond1);
2082                        if ( bond2 != null) cons.put("bond2",bond2);
2083                        if ( bond3 != null) cons.put("bond3",bond3);
2084                        if ( bond4 != null) cons.put("bond4",bond4);
2085                        if ( hyd1  != null) cons.put("hydrogen1",hyd1);
2086                        if ( hyd2  != null) cons.put("hydrogen2",hyd2);
2087                        if ( salt1 != null) cons.put("salt1",salt1);
2088                        if ( hyd3  != null) cons.put("hydrogen3",hyd3);
2089                        if ( hyd4  != null) cons.put("hydrogen4",hyd4);
2090                        if ( salt2 != null) cons.put("salt2",salt2);
2091
2092                        connects.add(cons);
2093                } catch (NumberFormatException e){
2094                        logger.info("could not parse CONECT line correctly ("+e.getMessage()+"), at line : " + line);
2095                        return;
2096                }
2097        }
2098
2099        /**
2100         * Handler for MODEL Record Format
2101         * <pre>
2102         * COLUMNS       DATA TYPE      FIELD         DEFINITION
2103         * ----------------------------------------------------------------------
2104         * 1 -  6       Record name    "MODEL "
2105         * 11 - 14       Integer        serial        Model serial number.
2106         * </pre>
2107         */
2108        private void pdb_MODEL_Handler(String line) {
2109
2110                if (params.isHeaderOnly()) return;
2111
2112                // new model: we start a new molecule
2113                startOfMolecule = true;
2114                startOfModel = true;
2115
2116        }
2117
2118        /**
2119         * Handler for TER record. The record is used in deposited PDB files and many others,
2120         * but it's often forgotten by some softwares. In any case it helps identifying the
2121         * start of ligand molecules so we use it for that.
2122         */
2123        private void pdb_TER_Handler() {
2124                startOfMolecule = true;
2125        }
2126
2127
2128        /**
2129         * DBREF handler
2130         * <pre>
2131         * COLUMNS       DATA TYPE          FIELD          DEFINITION
2132         * ----------------------------------------------------------------
2133         *  1 - 6        Record name        "DBREF "
2134         *  8 - 11       IDcode             idCode         ID code of this entry.
2135         * 13            Character          chainID        Chain identifier.
2136         * 15 - 18       Integer            seqBegin       Initial sequence number
2137         *                                                 of the PDB sequence segment.
2138         * 19            AChar              insertBegin    Initial insertion code
2139         *                                                 of the PDB sequence segment.
2140         * 21 - 24       Integer            seqEnd         Ending sequence number
2141         *                                                 of the PDB sequence segment.
2142         * 25            AChar              insertEnd      Ending insertion code
2143         *                                                 of the PDB sequence segment.
2144         * 27 - 32       LString            database       Sequence database name.
2145         * 34 - 41       LString            dbAccession    Sequence database accession code.
2146         * 43 - 54      LString            dbIdCode        Sequence database
2147         *                                                 identification code.
2148         * 56 - 60      Integer            dbseqBegin      Initial sequence number of the
2149         *                                                 database seqment.
2150         * 61           AChar              idbnsBeg        Insertion code of initial residue
2151         *                                                 of the segment, if PDB is the
2152         *                                                 reference.
2153         * 63 - 67      Integer            dbseqEnd        Ending sequence number of the
2154         *                                                 database segment.
2155         * 68           AChar              dbinsEnd        Insertion code of the ending
2156         *                                                 residue of the segment, if PDB is
2157         *                                                 the reference.
2158         * </pre>
2159         */
2160        private void pdb_DBREF_Handler(String line){
2161
2162                logger.debug("Parsing DBREF " + line);
2163
2164                DBRef dbref = new DBRef();
2165                String idCode      = line.substring(7,11);
2166                String chainName     = line.substring(12,13);
2167                String seqBegin    = line.substring(14,18);
2168                String insertBegin = line.substring(18,19);
2169                String seqEnd      = line.substring(20,24);
2170                String insertEnd   = line.substring(24,25);
2171                String database    = line.substring(26,32);
2172                String dbAccession = line.substring(33,41);
2173                String dbIdCode    = line.substring(42,54);
2174                String dbseqBegin  = line.substring(55,60);
2175                String idbnsBeg    = line.substring(60,61);
2176                String dbseqEnd    = line.substring(62,67);
2177                // Support implicit space character at end
2178                String dbinsEnd;
2179                if(line.length() >= 68)
2180                        dbinsEnd       = line.substring(67,68);
2181                else
2182                        dbinsEnd       = " ";
2183
2184                dbref.setIdCode(idCode);
2185                dbref.setChainName(chainName);
2186                dbref.setSeqBegin(intFromString(seqBegin));
2187                dbref.setInsertBegin(insertBegin.charAt(0));
2188                dbref.setSeqEnd(intFromString(seqEnd));
2189                dbref.setInsertEnd(insertEnd.charAt(0));
2190                dbref.setDatabase(database.trim());
2191                dbref.setDbAccession(dbAccession.trim());
2192                dbref.setDbIdCode(dbIdCode.trim());
2193                dbref.setDbSeqBegin(intFromString(dbseqBegin));
2194                dbref.setIdbnsBegin(idbnsBeg.charAt(0));
2195                dbref.setDbSeqEnd(intFromString(dbseqEnd));
2196                dbref.setIdbnsEnd(dbinsEnd.charAt(0));
2197
2198                //System.out.println(dbref.toPDB());
2199                dbrefs.add(dbref);
2200        }
2201
2202
2203        /**
2204         * Process the disulfide bond info provided by an SSBOND record
2205         *
2206         * <pre>
2207        COLUMNS        DATA TYPE       FIELD         DEFINITION
2208        -------------------------------------------------------------------
2209         1 -  6        Record name     "SSBOND"
2210         8 - 10        Integer         serNum       Serial number.
2211        12 - 14        LString(3)      "CYS"        Residue name.
2212        16             Character       chainID1     Chain identifier.
2213        18 - 21        Integer         seqNum1      Residue sequence number.
2214        22             AChar           icode1       Insertion code.
2215        26 - 28        LString(3)      "CYS"        Residue name.
2216        30             Character       chainID2     Chain identifier.
2217        32 - 35        Integer         seqNum2      Residue sequence number.
2218        36             AChar           icode2       Insertion code.
2219        60 - 65        SymOP           sym1         Symmetry oper for 1st resid
2220        67 - 72        SymOP           sym2         Symmetry oper for 2nd resid
2221         * </pre>
2222         */
2223        private void pdb_SSBOND_Handler(String line){
2224
2225                if (params.isHeaderOnly()) return;
2226
2227                if (line.length()<36) {
2228                        logger.info("SSBOND line has length under 36. Ignoring it.");
2229                        return;
2230                }
2231
2232                String chain1      = line.substring(15,16);
2233                String seqNum1     = line.substring(17,21).trim();
2234                String icode1      = line.substring(21,22);
2235                String chain2      = line.substring(29,30);
2236                String seqNum2     = line.substring(31,35).trim();
2237                String icode2      = line.substring(35,36);
2238
2239                if (line.length()>=72) {
2240                        String symop1 = line.substring(59, 65).trim();
2241                        String symop2 = line.substring(66, 72).trim();
2242
2243                        // until we implement proper treatment of symmetry in biojava #220, we can't deal with sym-related parteners properly, skipping them
2244                        if (!symop1.equals("") && !symop2.equals("") && // in case the field is missing
2245                                        (!symop1.equals("1555") || !symop2.equals("1555")) ) {
2246                                logger.info("Skipping ss bond between groups {} and {} belonging to different symmetry partners, because it is not supported yet", seqNum1+icode1, seqNum2+icode2);
2247                                return;
2248                        }
2249                }
2250
2251                if (icode1.equals(" "))
2252                        icode1 = "";
2253                if (icode2.equals(" "))
2254                        icode2 = "";
2255
2256                SSBondImpl ssbond = new SSBondImpl();
2257
2258                ssbond.setChainID1(chain1);
2259                ssbond.setResnum1(seqNum1);
2260                ssbond.setChainID2(chain2);
2261                ssbond.setResnum2(seqNum2);
2262                ssbond.setInsCode1(icode1);
2263                ssbond.setInsCode2(icode2);
2264                ssbonds.add(ssbond);
2265        }
2266
2267
2268        /**
2269         * Takes care of LINK records. These take the format of:
2270         *
2271         * <pre>
2272         * COLUMNS        DATA TYPE       FIELD       DEFINITION
2273         * --------------------------------------------------------------------------------
2274         *  1 -  6        Record name     "LINK  "
2275         * 13 - 16        Atom            name1       Atom name.
2276         * 17             Character       altLoc1     Alternate location indicator.
2277         * 18 - 20        Residue name    resName1    Residue name.
2278         * 22             Character       chainID1    Chain identifier.
2279         * 23 - 26        Integer         resSeq1     Residue sequence number.
2280         * 27             AChar           iCode1      Insertion code.
2281         * 43 - 46        Atom            name2       Atom name.
2282         * 47             Character       altLoc2     Alternate location indicator.
2283         * 48 - 50        Residue name    resName2    Residue name.
2284         * 52             Character       chainID2    Chain identifier.
2285         * 53 - 56        Integer         resSeq2     Residue sequence number.
2286         * 57             AChar           iCode2      Insertion code.
2287         * 60 - 65        SymOP           sym1        Symmetry operator for 1st atom.
2288         * 67 - 72        SymOP           sym2        Symmetry operator for 2nd atom.
2289         * </pre>
2290         *
2291         * (From http://www.wwpdb.org/documentation/format32/sect6.html#LINK)
2292         *
2293         * @param line the LINK record line to parse.
2294         */
2295        private void pdb_LINK_Handler(String line) {
2296
2297                if (params.isHeaderOnly()) return;
2298
2299                // Check for the minimal set of fields.
2300                if (line.length()<56) {
2301                        logger.info("LINK line has length under 56. Ignoring it.");
2302                        return;
2303                }
2304
2305                int len = line.length();
2306
2307                String name1 = line.substring(12, 16).trim();
2308                String altLoc1 = line.substring(16, 17).trim();
2309                String resName1 = line.substring(17, 20).trim();
2310                String chainID1 = line.substring(21, 22).trim();
2311                String resSeq1 = line.substring(22, 26).trim();
2312                String iCode1 = line.substring(26, 27).trim();
2313
2314                String name2 = line.substring(42, 46).trim();
2315                String altLoc2 = line.substring(46, 47).trim();
2316                String resName2 = line.substring(47, 50).trim();
2317                String chainID2 = line.substring(51, 52).trim();
2318                String resSeq2 = line.substring(52, 56).trim();
2319                String iCode2 = null;  // Might get trimmed if blank.
2320                if (len > 56) iCode2 = line.substring(56, 57).trim();
2321
2322                String sym1 = null;
2323                if (len > 64) sym1 = line.substring(59, 65).trim();
2324                String sym2 = null;
2325                if (len > 71) sym2 = line.substring(66, 72).trim();
2326
2327                linkRecords.add(new LinkRecord(
2328                                name1, altLoc1, resName1, chainID1, resSeq1, iCode1,
2329                                name2, altLoc2, resName2, chainID2, resSeq2, iCode2,
2330                                sym1, sym2));
2331        }
2332
2333        /**
2334         * Handler for the SITE records. <br>
2335         *
2336         * <pre>
2337         *
2338         * COLUMNS      DATA TYPE               FIELD           DEFINITION
2339         * ---------------------------------------------------------------------------------
2340         * 1 - 6        Record name     "SITE "
2341         * 8 - 10       Integer                 seqNum          Sequence number.
2342         * 12 - 14      LString(3)              siteID          Site name.
2343         * 16 - 17      Integer                 numRes          Number of residues that compose the siteResidues.
2344         * 19 - 21      Residue name    resName1        Residue name for first residue that
2345         *                                                                              creates the siteResidues.
2346         * 23           Character               chainID1        Chain identifier for first residue of siteResidues.
2347         * 24 - 27      Integer                 seq1            Residue sequence number for first residue
2348         *                                                                              of the siteResidues.
2349         * 28           AChar                   iCode1          Insertion code for first residue of the siteResidues.
2350         *
2351         * example:
2352         *          1         2         3         4         5         6         7         8
2353         * 12345678901234567890123456789012345678901234567890123456789012345678901234567890
2354         * SITE     1 AC1  3 HIS A  94 HIS A   96  HIS A 119
2355         * SITE     1 AC2  5 ASN A  62 GLY A   63  HIS A  64  HOH A 328
2356         * SITE     2 AC2  5 HOH A 634
2357         * SITE     1 AC3  5 GLN A 136 GLN A  137  PRO A 138  GLU A 205
2358         * SITE     2 AC3  5 CYS A 206
2359         * SITE     1 AC4 11 HIS A  64 HIS A   94  HIS A  96  HIS A 119
2360         * SITE     2 AC4 11 LEU A 198 THR A  199  THR A 200  TRP A 209
2361         * SITE     3 AC4 11 HOH A 572 HOH A  582  HOH A 635
2362         * </pre>
2363         * @param line the SITE line record being currently read
2364         * @author Amr ALHOSSARY
2365         * @author Jules Jacobsen
2366         */
2367        private void pdb_SITE_Handler(String line){
2368
2369                if (params.isHeaderOnly()) return;
2370
2371                //  make a map of: SiteId to List<ResidueNumber>
2372
2373                logger.debug("Site Line:"+line);
2374
2375
2376                String siteID = line.substring(11, 14);
2377                //fetch the siteResidues from the map
2378                List<ResidueNumber> siteResidues = siteToResidueMap.get(siteID);
2379
2380                //if the siteResidues doesn't yet exist, make a new one.
2381                if (siteResidues == null || ! siteToResidueMap.containsKey(siteID.trim())){
2382                        siteResidues = new ArrayList<ResidueNumber>();
2383                        siteToResidueMap.put(siteID.trim(), siteResidues);
2384
2385                        logger.debug(String.format("New Site made: %s %s", siteID,  siteResidues));
2386                        logger.debug("Now made " + siteMap.size() + " sites");
2387
2388                }
2389
2390                logger.debug(String.format("SiteId: %s", siteID));
2391
2392
2393                //line = 'SITE     1 AC1  6 ARG H 221A LYS H 224  HOH H 403  HOH H 460'
2394                //line.substring(18) = 'ARG H 221A LYS H 224  HOH H 403  HOH H 460'
2395                line = line.substring(18);
2396                String groupString = null;
2397                //groupString = 'ARG H 221A'
2398                //keep iterating through chunks of 10 characters - these are the groups in the siteResidues
2399                while (!(groupString = line.substring(0, 10)).equals("          ")) {
2400                        //groupstring: 'ARG H 221A'
2401
2402                        logger.debug("groupString: '" + groupString + "'");
2403
2404                        //set the residue name
2405                        //residueName = 'ARG'
2406                        String residueName = groupString.substring(0, 3);
2407                        Character aminoCode1 = StructureTools.get1LetterCode(residueName);
2408                        if (aminoCode1 != null) {
2409                                if (aminoCode1.equals(StructureTools.UNKNOWN_GROUP_LABEL)) {
2410                                        aminoCode1 = null;
2411                                }
2412                        }
2413
2414                        //this is already in the right format, so no need to fiddle with it...
2415                        //pdbCode = 'H 221A'
2416                        //                    String pdbCode = groupString.substring(4, 10).trim();
2417                        String chainId = groupString.substring(4, 5);
2418                        Integer resNum = Integer.valueOf(groupString.substring(5, 9).trim());
2419                        Character insCode = groupString.substring(9, 10).charAt(0);
2420                        //set insCode to null as a measure to prevent storing thousands of empty Strings
2421                        //- the empty value is returned using Group.getInsCode()
2422                        //                    if (insCode.equals(" ")) {
2423                        //                        insCode = null;
2424                        //                    }
2425
2426                        logger.debug(String.format("Site: %s: 'resName:%s resNum:%s insCode:%s'", siteID, residueName, resNum, insCode));
2427
2428                        //make a new resNum with the data - this will be linked up with a site later
2429                        ResidueNumber residueNumber = new ResidueNumber();
2430
2431
2432                        logger.debug("pdbCode: '" + resNum + insCode + "'");
2433
2434                        residueNumber.setChainName(chainId);
2435                        residueNumber.setSeqNum(resNum);
2436                        residueNumber.setInsCode(insCode);
2437                        //add the resNum to the groups
2438                        siteResidues.add(residueNumber);
2439
2440                        logger.debug("Adding residueNumber " + residueNumber + " to site " + siteID);
2441
2442                        line = line.substring(11);
2443                }
2444
2445                logger.debug("Current SiteMap (contains "+ siteToResidueMap.keySet().size() + " sites):");
2446                for (String key : siteToResidueMap.keySet()) {
2447                        logger.debug(key + " : " + siteToResidueMap.get(key));
2448                }
2449
2450        }
2451
2452        //Site variable related to parsing the REMARK 800 records.
2453        Site site;
2454
2455        private String[] keywords;
2456        private void pdb_REMARK_800_Handler(String line){
2457
2458                if (params.isHeaderOnly()) return;
2459
2460                // 'REMARK 800 SITE_IDENTIFIER: CAT                                                 '
2461                line = line.substring(11);
2462                String[] fields = line.split(": ");
2463
2464                if (fields.length == 2) {
2465                        if (fields[0].equals("SITE_IDENTIFIER")) {
2466                                //                    remark800Counter++;
2467                                String siteID = fields[1].trim();
2468
2469                                logger.debug("siteID: '" + siteID +"'");
2470
2471                                //fetch the siteResidues from the map
2472                                site = siteMap.get(siteID);
2473
2474                                //if the siteResidues doesn't yet exist, make a new one.
2475                                if (site == null || !siteID.equals(site.getSiteID())) {
2476                                        site = new Site(siteID, new ArrayList<Group>());
2477                                        siteMap.put(site.getSiteID(), site);
2478
2479                                        logger.debug("New Site made: " + site);
2480                                        logger.debug("Now made " + siteMap.size() + " sites");
2481
2482                                }
2483                        }
2484                        if (fields[0].equals("EVIDENCE_CODE")) {
2485                                //                    remark800Counter++;
2486                                String evCode = fields[1].trim();
2487
2488                                logger.debug("evCode: '" + evCode +"'");
2489
2490                                //fetch the siteResidues from the map
2491                                site.setEvCode(evCode);
2492                        }
2493                        if (fields[0].equals("SITE_DESCRIPTION")) {
2494                                //                    remark800Counter++;
2495                                String desc = fields[1].trim();
2496
2497                                logger.debug("desc: '" + desc +"'");
2498
2499                                //fetch the siteResidues from the map
2500                                site.setDescription(desc);
2501
2502                                logger.debug("Finished making REMARK 800 for site " + site.getSiteID());
2503                                logger.debug(site.remark800toPDB());
2504
2505                        }
2506                }
2507        }
2508
2509        private int intFromString(String intString){
2510                int val = Integer.MIN_VALUE;
2511                try {
2512                        val = Integer.parseInt(intString.trim());
2513                } catch (NumberFormatException ex){
2514                        logger.info("Could not parse a number: " + ex.getMessage());
2515                }
2516                return val;
2517        }
2518
2519
2520
2521        /**
2522         * Finds in the given list of chains the first one that has as name the given chainID.
2523         * If no such Chain can be found it returns null.
2524         */
2525        private static Chain isKnownChain(String chainID, List<Chain> chains){
2526
2527                for (int i = 0; i< chains.size();i++){
2528                        Chain testchain =  chains.get(i);
2529                        if (chainID.equals(testchain.getName())) {
2530                                return testchain;
2531                        }
2532                }
2533
2534                return null;
2535        }
2536
2537
2538
2539        private BufferedReader getBufferedReader(InputStream inStream)
2540                        throws IOException {
2541
2542                BufferedReader buf ;
2543                if (inStream == null) {
2544                        throw new IOException ("input stream is null!");
2545                }
2546
2547                buf = new BufferedReader (new InputStreamReader (inStream));
2548                return buf ;
2549
2550        }
2551
2552
2553
2554        /**
2555         * Parse a PDB file and return a datastructure implementing
2556         * PDBStructure interface.
2557         *
2558         * @param inStream  an InputStream object
2559         * @return a Structure object
2560         * @throws IOException
2561         */
2562        public Structure parsePDBFile(InputStream inStream)
2563                        throws IOException
2564        {
2565
2566                BufferedReader buf = getBufferedReader(inStream);
2567
2568                return parsePDBFile(buf);
2569
2570        }
2571
2572        /**
2573         * Parse a PDB file and return a datastructure implementing
2574         * PDBStructure interface.
2575         *
2576         * @param buf  a BufferedReader object
2577         * @return the Structure object
2578         * @throws IOException ...
2579         */
2580        public  Structure parsePDBFile(BufferedReader buf)
2581                        throws IOException
2582                        {
2583                // set the correct max values for parsing...
2584                loadMaxAtoms = params.getMaxAtoms();
2585                atomCAThreshold = params.getAtomCaThreshold();
2586
2587
2588                // (re)set structure
2589
2590                allModels = new ArrayList<>();
2591                structure     = new StructureImpl() ;
2592                currentModel  = null;
2593                currentChain  = null;
2594                currentGroup  = null;
2595                // we initialise to true since at the beginning of the file we are always starting a new molecule
2596                startOfMolecule = true;
2597                startOfModel = true;
2598
2599                seqResChains  = new ArrayList<Chain>();
2600                siteMap = new LinkedHashMap<String, Site>();
2601                pdbHeader     = new PDBHeader();
2602                connects      = new ArrayList<Map<String,Integer>>();
2603                previousContinuationField = "";
2604                continuationField = "";
2605                continuationString = "";
2606                current_compound = null;
2607                sourceLines.clear();
2608                compndLines.clear();
2609                keywordsLines.clear();
2610                isLastCompndLine = false;
2611                isLastSourceLine = false;
2612                prevMolId = -1;
2613                entities.clear();
2614                helixList.clear();
2615                strandList.clear();
2616                turnList.clear();
2617                lengthCheck = -1;
2618                atomCount = 0;
2619                atomOverflow = false;
2620                linkRecords = new ArrayList<LinkRecord>();
2621                siteToResidueMap.clear();
2622
2623                blankChainIdsPresent = false;
2624
2625                parseCAonly = params.isParseCAOnly();
2626
2627                String line = null;
2628
2629                while ((line = buf.readLine()) != null) {
2630
2631                        // ignore empty lines
2632                        if ( line.equals("") ||
2633                                        (line.equals(NEWLINE))){
2634                                continue;
2635                        }
2636
2637
2638                        // ignore short TER and END lines
2639                        if ( line.startsWith("END")) {
2640                                continue;
2641                        }
2642
2643                        if ( line.length() < 6 && !line.startsWith("TER")) {
2644                                logger.info("Found line length below 6. Ignoring it, line: >" + line +"<" );
2645                                continue;
2646                        }
2647
2648                        String recordName = null;
2649                        if (line.length()<6)
2650                                recordName = line.trim();
2651                        else
2652                                recordName = line.substring (0, 6).trim ();
2653
2654                        try {
2655                                if (recordName.equals("ATOM"))
2656                                        pdb_ATOM_Handler(line);
2657                                else if (recordName.equals("SEQRES"))
2658                                        pdb_SEQRES_Handler(line);
2659                                else if (recordName.equals("HETATM"))
2660                                        pdb_ATOM_Handler(line);
2661                                else if (recordName.equals("MODEL"))
2662                                        pdb_MODEL_Handler(line);
2663                                else if (recordName.equals("TER"))
2664                                        pdb_TER_Handler();
2665                                else if (recordName.equals("HEADER"))
2666                                        pdb_HEADER_Handler(line);
2667                                else if (recordName.equals("AUTHOR"))
2668                                        pdb_AUTHOR_Handler(line);
2669                                else if (recordName.equals("TITLE"))
2670                                        pdb_TITLE_Handler(line);
2671                                else if (recordName.equals("SOURCE"))
2672                                        sourceLines.add(line); //pdb_SOURCE_Handler
2673                                else if (recordName.equals("COMPND"))
2674                                        compndLines.add(line); //pdb_COMPND_Handler
2675                                else if (recordName.equals("KEYWDS"))
2676                                        keywordsLines.add(line);
2677                                else if (recordName.equals("JRNL"))
2678                                        pdb_JRNL_Handler(line);
2679                                else if (recordName.equals("EXPDTA"))
2680                                        pdb_EXPDTA_Handler(line);
2681                                else if (recordName.equals("CRYST1"))
2682                                        pdb_CRYST1_Handler(line);
2683                                else if (recordName.startsWith("MTRIX"))
2684                                        pdb_MTRIXn_Handler(line);
2685                                else if (recordName.equals("REMARK"))
2686                                        pdb_REMARK_Handler(line);
2687                                else if (recordName.equals("CONECT"))
2688                                        pdb_CONECT_Handler(line);
2689                                else if (recordName.equals("REVDAT"))
2690                                        pdb_REVDAT_Handler(line);
2691                                else if (recordName.equals("DBREF"))
2692                                        pdb_DBREF_Handler(line);
2693                                else if (recordName.equals("SITE"))
2694                                        pdb_SITE_Handler(line);
2695                                else if (recordName.equals("SSBOND"))
2696                                        pdb_SSBOND_Handler(line);
2697                                else if (recordName.equals("LINK"))
2698                                        pdb_LINK_Handler(line);
2699                                else if ( params.isParseSecStruc()) {
2700                                        if ( recordName.equals("HELIX") ) pdb_HELIX_Handler (  line ) ;
2701                                        else if (recordName.equals("SHEET")) pdb_SHEET_Handler(line ) ;
2702                                        else if (recordName.equals("TURN")) pdb_TURN_Handler(   line ) ;
2703                                }
2704                        } catch (StringIndexOutOfBoundsException | NullPointerException ex) {
2705                                logger.info("Unable to parse [" + line + "]");
2706                        }
2707                }
2708
2709                makeCompounds(compndLines, sourceLines);
2710                
2711                handlePDBKeywords(keywordsLines);
2712
2713                triggerEndFileChecks();
2714
2715                if (params.shouldCreateAtomBonds()) {
2716                        formBonds();
2717                }
2718
2719                if ( params.shouldCreateAtomCharges()) {
2720                        addCharges();
2721                }
2722
2723                if ( params.isParseSecStruc() && !params.isHeaderOnly())
2724                        setSecStruc();
2725
2726                // Now correct the alternate location group
2727                StructureTools.cleanUpAltLocs(structure);
2728
2729                return structure;
2730
2731                        }
2732
2733
2734        /**
2735         * Add the charges to the Structure
2736         */
2737        private void addCharges() {
2738                ChargeAdder.addCharges(structure);
2739        }
2740
2741        /**
2742         * This is the new method for building the COMPND and SOURCE records. Now each method is self-contained.
2743         * @author Jules Jacobsen
2744         * @param  compoundList
2745         * @param  sourceList
2746         */
2747        private void makeCompounds(List<String> compoundList,
2748                        List<String> sourceList) {
2749                //              System.out.println("[makeCompounds] making compounds from compoundLines");
2750
2751                for (String line : compoundList) {
2752                        if (compoundList.indexOf(line) + 1 == compoundList.size()) {
2753                                //                              System.out.println("[makeCompounds] Final line in compoundLines.");
2754                                isLastCompndLine = true;
2755                        }
2756                        pdb_COMPND_Handler(line);
2757
2758                }
2759                //              System.out.println("[makeCompounds] adding sources to compounds from sourceLines");
2760                // since we're starting again from the first compound, reset it here
2761                if ( entities.size() == 0){
2762                        current_compound = new EntityInfo();
2763                } else {
2764                        current_compound = entities.get(0);
2765                }
2766                for (String line : sourceList) {
2767                        if (sourceList.indexOf(line) + 1 == sourceList.size()) {
2768                                //                              System.out.println("[makeCompounds] Final line in sourceLines.");
2769                                isLastSourceLine = true;
2770                        }
2771                        pdb_SOURCE_Handler(line);
2772                }
2773
2774        }
2775
2776        /**Parse KEYWODS record of the PDB file.<br>
2777         * A keyword may be split over two lines. whether a keyword ends by the end 
2778         * of a line or it is aplit over two lines, a <code>space</code> is added 
2779         * between the 2 lines's contents, unless the first line ends in 
2780         * a '-' character.
2781         * <pre>
2782         * Record Format
2783         * COLUMNS       DATA  TYPE     FIELD         DEFINITION 
2784         *      ---------------------------------------------------------------------------------
2785         *       1 -  6       Record name    "KEYWDS" 
2786         *       9 - 10       Continuation   continuation  Allows concatenation of records if necessary.
2787         *      11 - 79       List           keywds        Comma-separated list of keywords relevant
2788         *                                                 to the entry.      
2789         * Example
2790         *               1         2         3         4         5         6         7         8
2791         *      12345678901234567890123456789012345678901234567890123456789012345678901234567890
2792         *      KEYWDS    LYASE,  TRICARBOXYLIC ACID CYCLE, MITOCHONDRION, OXIDATIVE
2793         *      KEYWDS   2 METABOLISM
2794         * </pre>
2795         * @param lines The KEWODS record lines.
2796         * @author Amr ALHOSSARY
2797         */
2798        private void handlePDBKeywords(List<String> lines) {
2799                StringBuilder fullList = new StringBuilder();
2800                for (String line : lines) {
2801                        String kwList = line.substring(10).trim();
2802                        if(kwList.length() > 0) {
2803                                if(fullList.length() > 0 && fullList.indexOf("-", fullList.length()-1) < 0) {
2804                                        fullList.append(' ');
2805                                }
2806                                fullList.append(kwList);
2807                        }
2808                }
2809                String fulllengthList = fullList.toString();
2810                keywords = fulllengthList.split("( )*,( )*");
2811                ArrayList<String> lst = new ArrayList<String>(keywords.length);
2812                for (String keyword : keywords) {
2813                        if(keyword.length() == 0) {
2814                                logger.debug("Keyword empty in structure {}", structure.getIdentifier().toString());
2815                                continue;
2816                        }
2817                        lst.add(keyword);
2818                }
2819                pdbHeader.setKeywords(lst);
2820        }
2821        
2822        /**
2823         * Handles creation of all bonds. Looks at LINK records, SSBOND (Disulfide
2824         * bonds), peptide bonds, and intra-residue bonds.
2825         * <p>
2826         * Note: the current implementation only looks at the first model of each
2827         * structure. This may need to be fixed in the future.
2828         */
2829        private void formBonds() {
2830
2831                BondMaker maker = new BondMaker(structure, params);
2832
2833                // LINK records should be preserved, they are the way that
2834                // inter-residue bonds are created for ligands such as trisaccharides, unusual polymers.
2835                // The analogy in mmCIF is the _struct_conn record.
2836                for (LinkRecord linkRecord : linkRecords) {
2837                        maker.formLinkRecordBond(linkRecord);
2838                }
2839
2840                maker.formDisulfideBonds(ssbonds);
2841
2842                maker.makeBonds();
2843        }
2844
2845
2846
2847        private void triggerEndFileChecks(){
2848
2849                // we need to add the last chain and model, checking for nulls (e.g. the file could be completely empty of ATOM lines)
2850                if (currentChain!=null && currentGroup!=null) {
2851                        currentChain.addGroup(currentGroup);
2852                }
2853                if (currentModel!=null && currentChain!=null) {
2854                        currentModel.add(currentChain);
2855                }
2856                if (currentModel!=null) {
2857                        allModels.add(currentModel);
2858                }
2859
2860                if (blankChainIdsPresent) {
2861                        // from biojava 5.0 there's limited support for old pdb files with blank chain ids
2862                        logger.warn("Found some blank chain ids in PDB file. Please note that support for them has been discontinued and things might not work properly.");
2863                }
2864
2865                // reordering chains following the mmcif model and assigning entities
2866                assignChainsAndEntities();
2867                structure.setEntityInfos(entities);
2868
2869
2870
2871                // header data
2872
2873                Date modDate = pdbHeader.getModDate();
2874                if ( modDate.equals(new Date(0)) ) {
2875                        // modification date = deposition date
2876                        Date depositionDate = pdbHeader.getDepDate();
2877
2878                        if (! depositionDate.equals(modDate)){
2879                                // depDate is 0000-00-00
2880                                pdbHeader.setDepDate(depositionDate);
2881                        }
2882
2883                }
2884
2885                structure.setPDBHeader(pdbHeader);
2886                structure.setCrystallographicInfo(crystallographicInfo);
2887
2888                //set the JournalArticle, if there is one
2889                if (!journalLines.isEmpty()) {
2890                        buildjournalArticle();
2891                        pdbHeader.setJournalArticle(journalArticle);
2892                }
2893
2894                structure.setDBRefs(dbrefs);
2895
2896                // Only align if requested (default) and not when headerOnly mode with no Atoms.
2897                // Otherwise, we store the empty SeqRes Groups unchanged in the right chains.
2898                if ( params.isAlignSeqRes() && !params.isHeaderOnly() && !seqResChains.isEmpty()){
2899                        logger.debug("Parsing mode align_seqres, will parse SEQRES and align to ATOM sequence");
2900                        SeqRes2AtomAligner aligner = new SeqRes2AtomAligner();
2901                        aligner.align(structure,seqResChains);
2902
2903                } else {
2904                        logger.debug("Parsing mode unalign_seqres, will parse SEQRES but not align it to ATOM sequence");
2905                        SeqRes2AtomAligner.storeUnAlignedSeqRes(structure, seqResChains, params.isHeaderOnly());
2906                }
2907
2908
2909
2910                //associate the temporary Groups in the siteMap to the ones
2911                if (!params.isHeaderOnly()) {
2912                        // Only can link SITES if Atom Groups were parsed.
2913                        linkSitesToGroups(); // will work now that setSites is called
2914                }
2915
2916                if ( bioAssemblyParser != null){
2917                        bioAssemblyParser.setMacromolecularSizes();
2918                        pdbHeader.setBioAssemblies(bioAssemblyParser.getTransformationMap());
2919                }
2920
2921                if (ncsOperators !=null && ncsOperators.size()>0) {
2922                        crystallographicInfo.setNcsOperators(
2923                                ncsOperators.toArray(new Matrix4d[ncsOperators.size()]));
2924                }
2925
2926
2927                // rfree end file check
2928                // Rfree annotation is not very consistent in PDB format, it varies depending on the software
2929                // Here we follow this strategy:
2930                // a) take the '(NO CUTOFF)' value if the only one available (shelx software, e.g. 1x7q)
2931                // b) don't take it if also a line without '(NO CUTOFF)' is present (CNX software, e.g. 3lak)
2932
2933                if (rfreeNoCutoffLine>0 && rfreeStandardLine<0) {
2934                        pdbHeader.setRfree(rfreeNoCutoffLine);
2935                } else if (rfreeNoCutoffLine>0 && rfreeStandardLine>0) {
2936                        pdbHeader.setRfree(rfreeStandardLine);
2937                } else if (rfreeNoCutoffLine<0 && rfreeStandardLine>0) {
2938                        pdbHeader.setRfree(rfreeStandardLine);
2939                } // otherwise it remains default value: PDBHeader.DEFAULT_RFREE
2940
2941
2942
2943        }
2944
2945        private void setSecStruc(){
2946
2947                setSecElement(helixList, SecStrucInfo.PDB_AUTHOR_ASSIGNMENT,
2948                                SecStrucType.helix4);
2949                setSecElement(strandList, SecStrucInfo.PDB_AUTHOR_ASSIGNMENT,
2950                                SecStrucType.extended);
2951                setSecElement(turnList, SecStrucInfo.PDB_AUTHOR_ASSIGNMENT,
2952                                SecStrucType.turn);
2953
2954                //Now insert random coil to the Groups that did not have SS information
2955                GroupIterator gi = new GroupIterator(structure);
2956                while (gi.hasNext()){
2957                        Group g = gi.next();
2958                        if (g.hasAminoAtoms()){
2959                                if (g.getProperty(Group.SEC_STRUC) == null){
2960                                        SecStrucInfo ss = new SecStrucInfo(g,
2961                                                        SecStrucInfo.PDB_AUTHOR_ASSIGNMENT,
2962                                                        SecStrucType.coil);
2963                                        g.setProperty(Group.SEC_STRUC, ss);
2964                                }
2965                        }
2966                }
2967
2968        }
2969
2970        private void setSecElement(List<Map<String,String>> secList, String assignment, SecStrucType type){
2971
2972
2973                Iterator<Map<String,String>> iter = secList.iterator();
2974                nextElement:
2975                        while (iter.hasNext()){
2976                                Map<String,String> m = iter.next();
2977
2978                                // assign all residues in this range to this secondary structure type
2979                                // String initResName = (String)m.get("initResName");
2980                                String initChainId = m.get("initChainId");
2981                                String initSeqNum  = m.get("initSeqNum" );
2982                                String initICode   = m.get("initICode" );
2983                                // String endResName  = (String)m.get("endResName" );
2984                                String endChainId  = m.get("endChainId" );
2985                                String endSeqNum   = m.get("endSeqNum");
2986                                String endICode    = m.get("endICode");
2987
2988                                if (initICode.equals(" "))
2989                                        initICode = "";
2990                                if (endICode.equals(" "))
2991                                        endICode = "";
2992
2993                                GroupIterator gi = new GroupIterator(structure);
2994                                boolean inRange = false;
2995                                while (gi.hasNext()){
2996                                        Group g = gi.next();
2997                                        Chain c = g.getChain();
2998
2999                                        if (c.getName().equals(initChainId)){
3000
3001                                                String pdbCode = initSeqNum + initICode;
3002                                                if ( g.getResidueNumber().toString().equals(pdbCode)  ) {
3003                                                        inRange = true;
3004                                                }
3005                                        }
3006                                        if ( inRange){
3007                                                if (g.hasAminoAtoms()) {
3008                                                        SecStrucInfo ss = new SecStrucInfo(g, assignment, type);
3009                                                        g.setProperty(Group.SEC_STRUC, ss);
3010                                                }
3011
3012                                        }
3013                                        if ( c.getName().equals(endChainId)){
3014                                                String pdbCode = endSeqNum + endICode;
3015                                                if (pdbCode.equals(g.getResidueNumber().toString())){
3016                                                        inRange = false;
3017                                                        continue nextElement;
3018                                                }
3019                                        }
3020                                }
3021                        }
3022        }
3023
3024        /**
3025         * Gets all chains with given chainName from given models list
3026         * @param chainName
3027         * @param polyModels
3028         * @return
3029         */
3030        private static List<List<Chain>> findChains(String chainName, List<List<Chain>> polyModels) {
3031                List<List<Chain>> models = new ArrayList<>();
3032
3033                for (List<Chain> chains:polyModels) {
3034                        List<Chain> matchingChains = new ArrayList<>();
3035                        models.add(matchingChains);
3036                        for (Chain c:chains) {
3037                                if (c.getName().equals(chainName)) {
3038                                        matchingChains.add(c);
3039                                }
3040                        }
3041                }
3042                return models;
3043        }
3044
3045        /**
3046         * Split the given chain (containing non-polymer groups and water groups only)
3047         * into individual chains per non-polymer group and individual chains per contiguous sets of water groups.
3048         * @param chain
3049         * @return a list of lists of size 2: first list is the split non-poly chains, second list is the split water chains
3050         */
3051        private static List<List<Chain>> splitNonPolyChain(Chain chain) {
3052                List<Chain> splitNonPolys = new ArrayList<>();
3053                List<Chain> waterChains = new ArrayList<>();
3054
3055                Chain split = null;
3056                boolean previousGroupIsWater = false;
3057
3058                for (Group g:chain.getAtomGroups()){
3059
3060                        if (!previousGroupIsWater) {
3061                                // add last one if there's one
3062                                if (split!=null) {
3063                                        splitNonPolys.add(split);
3064                                }
3065                                split = new ChainImpl();
3066                                split.setName(chain.getName());
3067                        } else if (!g.isWater()) {
3068                                // previous group is water and this group is not water: we change from a water chain to a non-poly
3069                                // we'll need to add now the water chain to the list of water chains
3070                                waterChains.add(split);
3071                                split = new ChainImpl();
3072                                split.setName(chain.getName());
3073                        }
3074
3075                        if (g.isWater()) {
3076                                previousGroupIsWater = true;
3077                        } else {
3078                                previousGroupIsWater = false;
3079
3080                        }
3081
3082                        // this should include alt locs (referenced from the main group)
3083                        split.addGroup(g);
3084
3085                }
3086
3087                // adding the last split chain: either to water or non-poly depending on what was the last seen group
3088                if (split!=null) {
3089                        if (previousGroupIsWater)
3090                                waterChains.add(split);
3091                        else
3092                                splitNonPolys.add(split);
3093                }
3094
3095
3096                List<List<Chain>> all = new ArrayList<>(2);
3097                all.add(splitNonPolys);
3098                all.add(waterChains);
3099
3100                return all;
3101        }
3102
3103        /**
3104         * Assign asym ids following the rules used by the PDB to assign asym ids in mmCIF files
3105         * @param polys
3106         * @param nonPolys
3107         * @param waters
3108         */
3109        private void assignAsymIds(List<List<Chain>> polys, List<List<Chain>> nonPolys, List<List<Chain>> waters) {
3110
3111                for (int i=0; i<polys.size(); i++) {
3112                        String asymId = "A";
3113
3114                        for (Chain poly:polys.get(i)) {
3115                                poly.setId(asymId);
3116                                asymId = getNextAsymId(asymId);
3117                        }
3118                        for (Chain nonPoly:nonPolys.get(i)) {
3119                                nonPoly.setId(asymId);
3120                                asymId = getNextAsymId(asymId);
3121                        }
3122                        for (Chain water:waters.get(i)) {
3123                                water.setId(asymId);
3124                                asymId = getNextAsymId(asymId);
3125                        }
3126                }
3127        }
3128
3129        /**
3130         * Gets the next asym id given an asymId, according to the convention followed by
3131         * mmCIF files produced by the PDB
3132         * i.e.: A,B,...,Z,AA,BA,CA,...,ZA,AB,BB,CB,...,ZB,.......,ZZ,AAA,BAA,CAA,...
3133         * @param asymId
3134         * @return
3135         */
3136        private String getNextAsymId(String asymId) {
3137                if (asymId.length()==1) {
3138                        if (!asymId.equals("Z")) {
3139                                return Character.toString(getNextChar(asymId.charAt(0)));
3140                        } else {
3141                                return "AA";
3142                        }
3143                } else if (asymId.length()==2) {
3144                        if (asymId.equals("ZZ")) {
3145                                return "AAA";
3146                        }
3147                        char[] c = new char[2];
3148                        asymId.getChars(0, 2, c, 0);
3149                        c[0] = getNextChar(c[0]);
3150                        if (c[0]=='A') {
3151                                c[1] = getNextChar(c[1]);
3152                        }
3153                        return new String(c);
3154                } else if (asymId.length()==3) {
3155                        char[] c = new char[3];
3156                        asymId.getChars(0, 3, c, 0);
3157                        c[0] = getNextChar(c[0]);
3158                        if (c[0]=='A') {
3159                                c[1] = getNextChar(c[1]);
3160                                if (c[1]=='A') {
3161                                        c[2] = getNextChar(c[2]);
3162                                }
3163                        }
3164                        return new String(c);
3165                }
3166                return null;
3167        }
3168
3169        private char getNextChar(char c) {
3170                if (c!='Z') {
3171                        return ((char)(c+1));
3172                } else {
3173                        return 'A';
3174                }
3175        }
3176
3177        /**
3178         * Here we assign chains following the mmCIF data model:
3179         * one chain per polymer, one chain per non-polymer group and
3180         * several water chains.
3181         * <p>
3182         * Subsequently we assign entities for them: either from those read from
3183         * COMPOUND records or from those found heuristically through {@link EntityFinder}
3184         *
3185         */
3186        private void assignChainsAndEntities(){
3187
3188                List<List<Chain>> polyModels = new ArrayList<>();
3189                List<List<Chain>> nonPolyModels = new ArrayList<>();
3190                List<List<Chain>> waterModels = new ArrayList<>();
3191
3192                for (List<Chain> model:allModels) {
3193
3194                        List<Chain> polyChains = new ArrayList<>();
3195                        List<Chain> nonPolyChains = new ArrayList<>();
3196                        List<Chain> waterChains = new ArrayList<>();
3197
3198                        polyModels.add(polyChains);
3199                        nonPolyModels.add(nonPolyChains);
3200                        waterModels.add(waterChains);
3201
3202                        for (Chain c:model) {
3203
3204                                // we only have entities for polymeric chains, all others are ignored for assigning entities
3205                                if (c.isWaterOnly()) {
3206                                        waterChains.add(c);
3207
3208                                } else if (c.isPureNonPolymer()) {
3209                                        nonPolyChains.add(c);
3210
3211                                } else {
3212                                        polyChains.add(c);
3213                                }
3214                        }
3215                }
3216
3217                List<List<Chain>> splitNonPolyModels = new ArrayList<>();
3218                for (int i=0; i<nonPolyModels.size(); i++) {
3219                        List<Chain> nonPolyModel = nonPolyModels.get(i);
3220                        List<Chain> waterModel = waterModels.get(i);
3221
3222                        List<Chain> splitNonPolys = new ArrayList<>();
3223                        splitNonPolyModels.add(splitNonPolys);
3224
3225                        for (Chain nonPoly:nonPolyModel) {
3226                                List<List<Chain>> splits = splitNonPolyChain(nonPoly);
3227                                splitNonPolys.addAll(splits.get(0));
3228                                waterModel.addAll(splits.get(1));
3229                        }
3230                }
3231
3232
3233                // now we have all chains as in mmcif, let's assign ids following the mmcif rules
3234                assignAsymIds(polyModels, splitNonPolyModels, waterModels);
3235
3236
3237                if (!entities.isEmpty()) {
3238                        // if the file contained COMPOUND records then we can assign entities to the poly chains
3239                        for (EntityInfo comp : entities){
3240                        List<String> chainIds = compoundMolIds2chainIds.get(comp.getMolId());
3241                        if ( chainIds == null)
3242                                continue;
3243                        for ( String chainId : chainIds) {
3244
3245                                        List<List<Chain>> models = findChains(chainId, polyModels);
3246
3247                                        for (List<Chain> matchingChains:models) {
3248                                                for (Chain chain:matchingChains) {
3249                                                        comp.addChain(chain);
3250                                                        chain.setEntityInfo(comp);
3251                                                }
3252
3253                                                if (matchingChains.isEmpty()) {
3254                                        // usually if this happens something is wrong with the PDB header
3255                                        // e.g. 2brd - there is no Chain A, although it is specified in the header
3256                                        // Some bona-fide cases exist, e.g. 2ja5, chain N is described in SEQRES
3257                                        // but the authors didn't observe in the density so it's completely missing
3258                                        // from the ATOM lines
3259                                                        logger.warn("Could not find polymeric chain {} to link to entity {}. The chain will be missing in the entity.", chainId, comp.getMolId());
3260                                                }
3261                                        }
3262                                }
3263                        }
3264
3265                } else {
3266
3267                        logger.info("Entity information (COMPOUND record) not found in file. Will assign entities heuristically");
3268                        // if no entity information was present in file we then go and find the entities heuristically with EntityFinder
3269                        entities = EntityFinder.findPolyEntities(polyModels);
3270
3271                }
3272
3273                // now we assign entities to the nonpoly and water chains
3274                EntityFinder.createPurelyNonPolyEntities(splitNonPolyModels, waterModels, entities);
3275
3276
3277                // in some rare cases purely non-polymer or purely water chain are present in pdb files
3278                // see https://github.com/biojava/biojava/pull/394
3279                // these case should be covered by the above
3280
3281
3282                // now that we have entities in chains we add the chains to the structure
3283
3284                for (int i=0;i<allModels.size();i++) {
3285                        List<Chain> model = new ArrayList<>();
3286                        model.addAll(polyModels.get(i));
3287                        model.addAll(splitNonPolyModels.get(i));
3288                        model.addAll(waterModels.get(i));
3289                        structure.addModel(model);
3290                        }
3291
3292
3293        }
3294
3295        /**
3296         * Links the Sites in the siteMap to the Groups in the Structure via the
3297         * siteToResidueMap ResidueNumber.
3298         * @author Jules Jacobsen
3299         * @return
3300         */
3301        private void linkSitesToGroups() {
3302
3303                //System.out.println("LINK SITES TO GROUPS:" + siteToResidueMap.keySet().size());
3304
3305                //link the map of siteIds : <ResidueNumber> with the sites by using ResidueNumber to get the correct group back.
3306                //the return list
3307
3308                if ( siteMap == null || siteToResidueMap == null){
3309                        logger.info("Sites can not be linked to residues!");
3310
3311                        return;
3312                }
3313
3314                List<Site> sites = null;
3315                //check that there are chains with which to associate the groups
3316                if (structure.getChains().isEmpty()) {
3317                        sites = new ArrayList<Site>(siteMap.values());
3318                        logger.info("No chains to link Site Groups with - Sites will not be present in the Structure");
3319                        return;
3320                }
3321
3322                //check that the keys in the siteMap and SiteToResidueMap are equal
3323                if (! siteMap.keySet().equals(siteToResidueMap.keySet())) {
3324                        logger.info("Not all sites have been properly described in the PDB " + pdbId + " header - some Sites will not be present in the Structure");
3325                        logger.debug(siteMap.keySet() + " | " + siteToResidueMap.keySet());
3326                        //return;
3327                }
3328
3329                //so we have chains - associate the siteResidues-related groups with the ones
3330                //already in in the chains
3331                for (String key : siteMap.keySet()) {
3332                        Site currentSite = siteMap.get(key);
3333                        List<ResidueNumber> linkedGroups = siteToResidueMap.get(key);
3334                        if ( linkedGroups == null)
3335                                continue;
3336                        for (ResidueNumber residueNumber : linkedGroups) {
3337
3338                                String pdbCode = residueNumber.toString();
3339                                String chain = residueNumber.getChainName();
3340                                //                    System.out.println("chain: '" + chain + "'");
3341                                //                    String resNum = resNum.getSeqNum().toString();
3342                                //                    System.out.println("resNum: '" + resNum + "'");
3343
3344                                Group linkedGroup = null;
3345                                try {
3346                                        //TODO: implement findGroup(ResidueNumber resNum)
3347                                        linkedGroup = structure.findGroup(chain, pdbCode);
3348                                } catch (StructureException ex) {
3349                                        logger.info("Can't find group " + pdbCode + " in chain " + chain + " in order to link up SITE records (PDB ID " + pdbId +")");
3350                                        continue;
3351                                }
3352
3353                                //                    System.out.println("Adding group: " + linkedGroup.getSeqNum() + " to site " + site.getSiteID());
3354                                currentSite.getGroups().add(linkedGroup);
3355                        }
3356                }
3357
3358                //System.out.println("SITEMAP: " + siteMap);
3359
3360                sites = new ArrayList<Site>(siteMap.values());
3361                structure.setSites(sites);
3362                //System.out.println("STRUCTURE SITES: " + structure.getSites().size());
3363                //            for (Site site : structure.getSites()) {
3364                //                System.out.println(site);
3365                //            }
3366                //            System.out.println("Linked Site Groups with Chains");
3367
3368        }
3369
3370        private void buildjournalArticle() {
3371
3372                logger.debug("building new JournalArticle");
3373                //            for (String line : journalLines) {
3374                //                System.out.println(line);
3375                //            }
3376
3377                this.journalArticle = new JournalArticle();
3378                //        JRNL        AUTH   M.HAMMEL,G.SFYROERA,D.RICKLIN,P.MAGOTTI,
3379                //        JRNL        AUTH 2 J.D.LAMBRIS,B.V.GEISBRECHT
3380                //        JRNL        TITL   A STRUCTURAL BASIS FOR COMPLEMENT INHIBITION BY
3381                //        JRNL        TITL 2 STAPHYLOCOCCUS AUREUS.
3382                //        JRNL        REF    NAT.IMMUNOL.                  V.   8   430 2007
3383                //        JRNL        REFN                   ISSN 1529-2908
3384                //        JRNL        PMID   17351618
3385                //        JRNL        DOI    10.1038/NI1450
3386                StringBuffer auth = new StringBuffer();
3387                StringBuffer titl = new StringBuffer();
3388                StringBuffer edit = new StringBuffer();
3389                StringBuffer ref = new StringBuffer();
3390                StringBuffer publ = new StringBuffer();
3391                StringBuffer refn = new StringBuffer();
3392                StringBuffer pmid = new StringBuffer();
3393                StringBuffer doi = new StringBuffer();
3394
3395                for (String line : journalLines) {
3396                        if ( line.length() < 19 ) {
3397                                logger.info("can not process Journal line: " + line);
3398                                continue;
3399                        }
3400                        //            System.out.println("'" + line + "'");
3401                        String subField = line.substring(12, 16);
3402                        //            System.out.println("'" + subField + "'");
3403                        if (subField.equals("AUTH")) {
3404                                auth.append(line.substring(19, line.length()).trim());
3405
3406                                logger.debug("AUTH '" + auth.toString() + "'");
3407
3408                        }
3409                        if (subField.equals("TITL")) {
3410                                //add a space to the end of a line so that when wrapped the
3411                                //words on the join won't be concatenated
3412                                titl.append(line.substring(19, line.length()).trim()).append(" ");
3413
3414                                logger.debug("TITL '" + titl.toString() + "'");
3415
3416                        }
3417                        if (subField.equals("EDIT")) {
3418                                edit.append(line.substring(19, line.length()).trim());
3419
3420                                logger.debug("EDIT '" + edit.toString() + "'");
3421
3422                        }
3423                        //        JRNL        REF    NAT.IMMUNOL.                  V.   8   430 2007
3424                        if (subField.equals("REF ")) {
3425                                ref.append(line.substring(19, line.length()).trim()).append(" ");
3426
3427                                logger.debug("REF '" + ref.toString() + "'");
3428
3429                        }
3430                        if (subField.equals("PUBL")) {
3431                                publ.append(line.substring(19, line.length()).trim()).append(" ");
3432
3433                                logger.debug("PUBL '" + publ.toString() + "'");
3434
3435                        }
3436                        //        JRNL        REFN                   ISSN 1529-2908
3437                        if (subField.equals("REFN")) {
3438                                if ( line.length() < 35 ) {
3439                                        logger.info("can not process Journal REFN line: " + line);
3440                                        continue;
3441                                }
3442                                refn.append(line.substring(35, line.length()).trim());
3443
3444                                logger.debug("REFN '" + refn.toString() + "'");
3445
3446                        }
3447                        //        JRNL        PMID   17351618
3448                        if (subField.equals("PMID")) {
3449                                pmid.append(line.substring(19, line.length()).trim());
3450
3451                                logger.debug("PMID '" + pmid.toString() + "'");
3452
3453                        }
3454                        //        JRNL        DOI    10.1038/NI1450
3455                        if (subField.equals("DOI ")) {
3456                                doi.append(line.substring(19, line.length()).trim());
3457
3458                                logger.debug("DOI '" + doi.toString() + "'");
3459
3460                        }
3461                }
3462
3463                //now set the parts of the JournalArticle
3464                journalArticle.setAuthorList(authorBuilder(auth.toString()));
3465                journalArticle.setEditorList(authorBuilder(edit.toString()));
3466                journalArticle.setRef(ref.toString());
3467                JournalParser journalParser = new JournalParser(ref.toString());
3468                journalArticle.setJournalName(journalParser.getJournalName());
3469                if (!journalArticle.getJournalName().equals("TO BE PUBLISHED")) {
3470                        journalArticle.setIsPublished(true);
3471                }
3472                journalArticle.setVolume(journalParser.getVolume());
3473                journalArticle.setStartPage(journalParser.getStartPage());
3474                journalArticle.setPublicationDate(journalParser.getPublicationDate());
3475                journalArticle.setPublisher(publ.toString().trim());
3476                journalArticle.setTitle(titl.toString().trim());
3477                journalArticle.setRefn(refn.toString().trim());
3478                journalArticle.setPmid(pmid.toString().trim());
3479                journalArticle.setDoi(doi.toString().trim());
3480
3481
3482                logger.debug("Made JournalArticle:");
3483                logger.debug(journalArticle.toString());
3484
3485        }
3486
3487        //inner class to deal with all the journal info
3488        private class JournalParser {
3489
3490                private String journalName;
3491                private String volume;
3492                private String startPage;
3493                private int publicationDate;
3494
3495
3496                public JournalParser(String ref) {
3497
3498                        logger.debug("JournalParser init '" + ref + "'");
3499
3500
3501                        if (ref.equals("TO BE PUBLISHED ")) {
3502                                journalName = ref.trim();
3503
3504                                logger.debug(String.format("JournalParser found journalString '%s'", journalName));
3505
3506                                return;
3507                        }
3508
3509                        if (ref.length() < 48) {
3510                                logger.info("REF line too short - must be at least 48 characters to be valid for parsing.");
3511                                journalName = "";
3512                                volume = "";
3513                                startPage = "";
3514                                publicationDate = 0;
3515                                return;
3516                        }
3517                        //can be multi line:
3518                        //REF    PHILOS.TRANS.R.SOC.LONDON,    V. 293    53 1981
3519                        //REF  2 SER.B
3520
3521                        //or
3522
3523                        //REF    GLYCOGEN PHOSPHORYLASE B:                1 1991
3524                        //REF  2 DESCRIPTION OF THE PROTEIN
3525                        //REF  3 STRUCTURE
3526
3527                        //but usually single line
3528                        //REF    NUCLEIC ACIDS RES.                         2009
3529                        //REF    MOL.CELL                                   2009
3530                        //REF    NAT.STRUCT.MOL.BIOL.          V.  16   238 2009
3531                        //REF    ACTA CRYSTALLOGR.,SECT.F      V.  65   199 2009
3532                        //check if the date is present at the end of the line.
3533                        //                             09876543210987654321
3534                        //'J.BIOL.CHEM.                  V. 280 23000 2005 '
3535                        //'J.AM.CHEM.SOC.                V. 130 16011 2008 '
3536                        //'NAT.STRUCT.MOL.BIOL.          V.  16   238 2009'
3537                        String volumeInformation = ref.substring(30, 48);
3538
3539                        logger.debug(String.format("Parsing volumeInformation: '%s'", volumeInformation));
3540
3541                        //volumeInformation: 'V. 293    53 1981 '
3542                        //                      String dateString = ref.substring(ref.length() - 5 , ref.length() - 1).trim();
3543                        //                      String startPageString = ref.substring(ref.length() - 11 , ref.length() - 6).trim();
3544                        //                      String volumeString = ref.substring(ref.length() - 16 , ref.length() - 12).trim();
3545                        //                      String journalString = ref.substring(0 , ref.length() - 18).trim();
3546                        String dateString = volumeInformation.substring(volumeInformation.length() - 5 , volumeInformation.length() - 1).trim();
3547                        String startPageString = volumeInformation.substring(volumeInformation.length() - 11 , volumeInformation.length() - 6).trim();
3548                        String volumeString = volumeInformation.substring(volumeInformation.length() - 16 , volumeInformation.length() - 12).trim();
3549                        //for the journal string we need to remove the volume information which might be in the middle of the string (e.g. 1gpb, 3pfk)
3550                        String journalString = ref.substring(0 , 29).trim() + " " + ref.substring(30, ref.length() - 1).replace(volumeInformation.trim(), "").trim();
3551                        journalString = journalString.trim();
3552                        //                        System.out.println("journalString: " + journalString);
3553
3554                        logger.debug(String.format("JournalParser found volumeString '%s'", volumeString));
3555                        logger.debug(String.format("JournalParser found startPageString '%s'", startPageString));
3556                        logger.debug(String.format("JournalParser found dateString '%s'", dateString));
3557                        logger.debug(String.format("JournalParser found journalString '%s'", journalString));
3558
3559
3560                        if (!dateString.equals("    ")) {
3561                                try {
3562                                        publicationDate = Integer.valueOf(dateString);
3563                                } catch (NumberFormatException nfe) {
3564                                        logger.info(dateString + " is not a valid integer for a date in JRNL sub-section REF line 1");
3565                                }
3566                                //                              if (DEBUG) {
3567                                //                                      System.out.println("JournalParser set date " + publicationDate);
3568                                //                              }
3569                        }
3570
3571                        if (!startPageString.equals("    ")) {
3572                                startPage = startPageString;
3573                                //                              if (DEBUG) {
3574                                //                                      System.out.println("JournalParser set startPage " + startPage);
3575                                //                              }
3576                        }
3577
3578                        if (!volumeString.equals("    ")) {
3579                                volume = volumeString;
3580                                //                              if (DEBUG) {
3581                                //                                      System.out.println("JournalParser set volume " + volume);
3582                                //                              }
3583                        }
3584
3585                        if (!journalString.equals("    ")) {
3586                                journalName = journalString;
3587
3588                                logger.debug("JournalParser set journalName " + journalName);
3589
3590                        }
3591                }
3592
3593                private String getJournalName() {
3594                        return journalName;
3595                }
3596
3597                private int getPublicationDate() {
3598                        return publicationDate;
3599                }
3600
3601                private String getStartPage() {
3602                        return startPage;
3603                }
3604
3605                private String getVolume() {
3606                        return volume;
3607                }
3608        }
3609
3610        private List<Author> authorBuilder(String authorString) {
3611                ArrayList<Author> authorList = new ArrayList<Author>();
3612
3613                if (authorString.equals("")) {
3614                        return authorList;
3615                }
3616
3617                String[] authors = authorString.split(",");
3618                //        if (DEBUG) {
3619                //            for (int i = 0; i < authors.length; i++) {
3620                //                String string = authors[i];
3621                //                System.out.println("authorBuilder author: '" + string + "'");
3622                //            }
3623                //        }
3624                //        AUTH   SEATTLE STRUCTURAL GENOMICS CENTER FOR INFECTIOUS
3625                //        AUTH 2 DISEASE (SSGCID)
3626                //        or
3627                //        AUTH   E.DOBROVETSKY,A.DONG,A.SEITOVA,B.DUNCAN,L.CROMBET,
3628                //        AUTH 2 M.SUNDSTROM,C.H.ARROWSMITH,A.M.EDWARDS,C.BOUNTRA,
3629                //        AUTH 3 A.BOCHKAREV,D.COSSAR,
3630                //        AUTH 4 STRUCTURAL GENOMICS CONSORTIUM (SGC)
3631                //        or
3632                //        AUTH   T.-C.MOU,S.R.SPRANG,N.MASADA,D.M.F.COOPER
3633                if (authors.length == 1) {
3634                        //only one element means it's a consortium only
3635                        Author author = new Author();
3636                        author.setSurname(authors[0]);
3637
3638                        logger.debug("Set consortium author name " + author.getSurname());
3639
3640                        authorList.add(author);
3641                } else {
3642                        for (int i = 0; i < authors.length; i++) {
3643                                String authorFullName = authors[i];
3644
3645                                logger.debug("Building author " + authorFullName);
3646
3647                                Author author = new Author();
3648                                String regex = "\\.";
3649                                String[] authorNames = authorFullName.split(regex);
3650                                //                if (DEBUG) {
3651                                //                    System.out.println("authorNames size " + authorNames.length);
3652                                //                    for (int j = 0; j < authorNames.length; j++) {
3653                                //                        String name = authorNames[j];
3654                                //                        System.out.println("split authName '" + name + "'");
3655                                //
3656                                //                    }
3657                                //                }
3658                                if (authorNames.length == 0) {
3659                                        author.setSurname(authorFullName);
3660
3661                                        logger.debug("Unable to split using '" + regex + "' Setting whole name " + author.getSurname());
3662
3663                                }
3664                                //again there might be a consortium name so there may be no elements
3665                                else if (authorNames.length == 1) {
3666                                        author.setSurname(authorNames[0]);
3667
3668                                        logger.debug("Set consortium author name in multiple author block " + author.getSurname
3669                                                                ());
3670
3671                                } else {
3672                                        String initials = "";
3673                                        for (int j = 0; j < authorNames.length - 1; j++) {
3674                                                String initial = authorNames[j];
3675                                                //                        if (DEBUG) {
3676                                                //                            System.out.println("adding initial '" + initial + "'");
3677                                                //                        }
3678                                                //build the initials back up again
3679                                                initials += initial + ".";
3680                                        }
3681
3682                                        logger.debug("built initials '" + initials + "'");
3683
3684                                        author.setInitials(initials);
3685                                        //surname is always last
3686                                        int lastName = authorNames.length - 1;
3687                                        String surname = authorNames[lastName];
3688
3689                                        logger.debug("built author surname " + surname);
3690
3691                                        author.setSurname(surname);
3692
3693                                }
3694                                authorList.add(author);
3695                        }
3696                }
3697                return authorList;
3698        }
3699
3700        public void setFileParsingParameters(FileParsingParameters params)
3701        {
3702                this.params= params;
3703
3704                // set the correct max values for parsing...
3705                loadMaxAtoms = params.getMaxAtoms();
3706                atomCAThreshold = params.getAtomCaThreshold();
3707
3708
3709        }
3710
3711        public FileParsingParameters getFileParsingParameters(){
3712                return params;
3713        }
3714
3715
3716}