001/*
002 *
003 * This code may be freely distributed and modified under the
004 * terms of the GNU Lesser General Public Licence.  This should
005 * be distributed with the code.  If you do not have a copy,
006 * see:
007 *
008 *      http://www.gnu.org/copyleft/lesser.html
009 *
010 * Copyright for this code is held jointly by the individual
011 * authors.  These should be listed in @author doc comments.
012 *
013 * For more information on the BioJava project and its aims,
014 * or to join the biojava-l mailing list, visit the home page
015 * at:
016 *
017 *      http://www.biojava.org/
018 *
019 * Created on 16.03.2004
020 *
021 */
022package org.biojava.nbio.structure.io;
023
024import static java.lang.Math.min;
025
026import java.io.BufferedReader;
027import java.io.IOException;
028import java.io.InputStream;
029import java.io.InputStreamReader;
030import java.text.DateFormat;
031import java.text.ParseException;
032import java.text.SimpleDateFormat;
033import java.util.ArrayList;
034import java.util.Arrays;
035import java.util.Date;
036import java.util.HashMap;
037import java.util.Iterator;
038import java.util.LinkedHashMap;
039import java.util.List;
040import java.util.Locale;
041import java.util.Map;
042import java.util.StringTokenizer;
043import java.util.regex.Matcher;
044import java.util.regex.Pattern;
045
046import javax.vecmath.Matrix4d;
047
048import org.biojava.nbio.structure.AminoAcid;
049import org.biojava.nbio.structure.AminoAcidImpl;
050import org.biojava.nbio.structure.Atom;
051import org.biojava.nbio.structure.AtomImpl;
052import org.biojava.nbio.structure.Author;
053import org.biojava.nbio.structure.Chain;
054import org.biojava.nbio.structure.ChainImpl;
055import org.biojava.nbio.structure.DBRef;
056import org.biojava.nbio.structure.Element;
057import org.biojava.nbio.structure.EntityInfo;
058import org.biojava.nbio.structure.EntityType;
059import org.biojava.nbio.structure.Group;
060import org.biojava.nbio.structure.GroupIterator;
061import org.biojava.nbio.structure.HetatomImpl;
062import org.biojava.nbio.structure.JournalArticle;
063import org.biojava.nbio.structure.NucleotideImpl;
064import org.biojava.nbio.structure.PDBCrystallographicInfo;
065import org.biojava.nbio.structure.PDBHeader;
066import org.biojava.nbio.structure.PdbId;
067import org.biojava.nbio.structure.ResidueNumber;
068import org.biojava.nbio.structure.Site;
069import org.biojava.nbio.structure.Structure;
070import org.biojava.nbio.structure.StructureException;
071import org.biojava.nbio.structure.StructureImpl;
072import org.biojava.nbio.structure.StructureTools;
073import org.biojava.nbio.structure.chem.ChemCompAtom;
074import org.biojava.nbio.structure.chem.ChemCompGroupFactory;
075import org.biojava.nbio.structure.io.util.PDBTemporaryStorageUtils.LinkRecord;
076import org.biojava.nbio.structure.secstruc.SecStrucInfo;
077import org.biojava.nbio.structure.secstruc.SecStrucType;
078import org.biojava.nbio.structure.xtal.CrystalCell;
079import org.biojava.nbio.structure.xtal.SpaceGroup;
080import org.biojava.nbio.structure.xtal.SymoplibParser;
081import org.slf4j.Logger;
082import org.slf4j.LoggerFactory;
083
084
085/**
086 * This class implements the actual PDB file parsing. Do not access it directly, but
087 * via the PDBFileReader class.
088 *
089 * <h2>Parsing</h2>
090 *
091 * During the PDBfile parsing several Flags can be set. See the {@link #setFileParsingParameters(FileParsingParameters)} methods.
092 *
093 *
094 * <p>
095 * To prevent excessive memory usage for large PDB files, there is the ATOM_CA_THRESHOLD.
096 * If more Atoms than this threshold are being parsed in a PDB file, the parser will automatically
097 * switch to a C-alpha only representation.
098 *
099 * <p>
100 * The result of the parsing of the PDB file is a new {@link Structure} object.
101 *
102 * <p>
103 * For more documentation on how to work with the Structure API please
104 * see <a href="http://biojava.org/wiki/BioJava:CookBook#Protein_Structure" target="_top">
105 * http://biojava.org/wiki/BioJava:CookBook#Protein_Structure</a>
106 *
107 *
108 *
109 *
110 * <h2>Example</h2>
111 * <p>
112 * Q: How can I get a Structure object from a PDB file?
113 * <p>
114 * A:
115 * <pre>
116 * public {@link Structure} loadStructure(String pathToPDBFile){
117 *      // The PDBFileParser is wrapped by the PDBFileReader
118 *      {@link PDBFileReader} pdbreader = new {@link PDBFileReader}();
119 *
120 *      {@link Structure} structure = null;
121 *      try{
122 *              structure = pdbreader.getStructure(pathToPDBFile);
123 *              System.out.println(structure);
124 *      } catch (IOException e) {
125 *              e.printStackTrace();
126 *      }
127 *      return structure;
128 * }
129 * </pre>
130 *
131 *
132 * @author Andreas Prlic
133 * @author Jules Jacobsen
134 * @author Jose Duarte
135 * @since 1.4
136 */
137public class PDBFileParser  {
138
139
140
141        private static final Logger logger = LoggerFactory.getLogger(PDBFileParser.class);
142
143        // for printing
144        private static final String NEWLINE = System.getProperty("line.separator");
145
146
147        // required for parsing:
148        private String pdbId; //the actual id of the entry
149        private Structure     structure;
150        private List<List<Chain>> allModels; // a temp data structure to keep all models
151        private List<Chain>   currentModel; // contains the ATOM records for each model
152        private Chain         currentChain;
153        private Group         currentGroup;
154
155        private List<Chain>   seqResChains; // contains all the chains for the SEQRES records
156        //we're going to work on the assumption that the files are current -
157        //if the pdb_HEADER_Handler detects a legacy format, this will be changed to true.
158        //if true then lines will be truncated at 72 characters in certain cases
159        //(pdb_COMPOUND_handler for example)
160        private boolean isLegacyFormat = false;
161
162        private boolean blankChainIdsPresent = false;
163
164        // for re-creating the biological assembly
165        private PDBBioAssemblyParser bioAssemblyParser = null;
166
167        private PDBHeader pdbHeader;
168        private PDBCrystallographicInfo crystallographicInfo;
169        private JournalArticle journalArticle;
170        private List<Map<String, Integer>> connects ;
171        private List<Map<String,String>> helixList;
172        private List<Map<String,String>> strandList;
173        private List<Map<String,String>> turnList;
174
175        private int lengthCheck ;
176
177        private boolean isLastCompndLine = false;
178        private boolean isLastSourceLine = false;
179        private EntityInfo current_compound;
180        private List<EntityInfo> entities = new ArrayList<EntityInfo>();
181        private HashMap<Integer,List<String>> compoundMolIds2chainIds = new HashMap<Integer, List<String>>();
182        private List<String> compndLines = new ArrayList<String>();
183        private List<String> sourceLines = new ArrayList<String>();
184        private List<String> journalLines = new ArrayList<String>();
185        private List<String> keywordsLines = new ArrayList<String>();
186        private List<DBRef> dbrefs;
187        private Map<String, Site> siteMap = new LinkedHashMap<String, Site>();
188        private Map<String, List<ResidueNumber>> siteToResidueMap = new LinkedHashMap<String, List<ResidueNumber>>();
189
190        private List<SSBondImpl> ssbonds = new ArrayList<>();
191
192        // for storing LINK until we have all the atoms parsed
193        private List<LinkRecord> linkRecords;
194
195        private Matrix4d currentNcsOp;
196        private List<Matrix4d> ncsOperators;
197
198        // for parsing COMPOUND and SOURCE Header lines
199        private int prevMolId;
200        private String previousContinuationField;
201        private String continuationField;
202        private String continuationString;
203
204        private DateFormat dateFormat;
205
206        // for rfree parsing
207        private float rfreeStandardLine = -1;
208        private float rfreeNoCutoffLine = -1;
209
210        private static  final List<String> compndFieldValues = new ArrayList<String>(
211                        Arrays.asList(
212                                        "MOL_ID:", "MOLECULE:", "CHAIN:", "SYNONYM:",
213                                        "EC:", "FRAGMENT:", "ENGINEERED:", "MUTATION:",
214                                        "BIOLOGICAL_UNIT:", "OTHER_DETAILS:"
215                                        ));
216
217
218        private static final List<String> ignoreCompndFieldValues = new ArrayList<String>(
219                        Arrays.asList(
220                                        "HETEROGEN:","ENGINEEREED:","FRAGMENT,",
221                                        "MUTANT:","SYNTHETIC:"
222                                        ));
223        // ENGINEEREED in pdb219d
224
225        private static final List<String> sourceFieldValues = new ArrayList<String>(
226                        Arrays.asList("ENGINEERED:", "MOL_ID:", "SYNTHETIC:", "FRAGMENT:",
227                                        "ORGANISM_SCIENTIFIC:", "ORGANISM_COMMON:",
228                                        "ORGANISM_TAXID:","STRAIN:",
229                                        "VARIANT:", "CELL_LINE:", "ATCC:", "ORGAN:", "TISSUE:",
230                                        "CELL:", "ORGANELLE:", "SECRETION:", "GENE:",
231                                        "CELLULAR_LOCATION:", "EXPRESSION_SYSTEM:",
232                                        "EXPRESSION_SYSTEM_TAXID:",
233                                        "EXPRESSION_SYSTEM_STRAIN:", "EXPRESSION_SYSTEM_VARIANT:",
234                                        "EXPRESSION_SYSTEM_CELL_LINE:",
235                                        "EXPRESSION_SYSTEM_ATCC_NUMBER:",
236                                        "EXPRESSION_SYSTEM_ORGAN:", "EXPRESSION_SYSTEM_TISSUE:",
237                                        "EXPRESSION_SYSTEM_CELL:", "EXPRESSION_SYSTEM_ORGANELLE:",
238                                        "EXPRESSION_SYSTEM_CELLULAR_LOCATION:",
239                                        "EXPRESSION_SYSTEM_VECTOR_TYPE:",
240                                        "EXPRESSION_SYSTEM_VECTOR:", "EXPRESSION_SYSTEM_PLASMID:",
241                                        "EXPRESSION_SYSTEM_GENE:", "OTHER_DETAILS:"));
242
243        private int atomCount;
244
245        // parsing options:
246
247        private int atomCAThreshold ;
248
249        private int loadMaxAtoms;
250
251        private boolean atomOverflow;
252
253        /** flag to tell parser to only read Calpha coordinates **/
254        private boolean parseCAonly;
255
256
257        private FileParsingParameters params;
258
259        private boolean startOfMolecule;
260        private boolean startOfModel;
261
262        public PDBFileParser() {
263                params = new FileParsingParameters();
264
265                allModels = new ArrayList<>();
266                structure     = null           ;
267                currentModel  = null;
268                currentChain  = null;
269                currentGroup  = null;
270                // we initialise to true since at the beginning of the file we are always starting a new molecule
271                startOfMolecule = true;
272                startOfModel = true;
273
274
275                pdbHeader         = new PDBHeader();
276                crystallographicInfo = new PDBCrystallographicInfo();
277                connects      = new ArrayList<Map<String,Integer>>() ;
278
279
280                helixList     = new ArrayList<Map<String,String>>();
281                strandList    = new ArrayList<Map<String,String>>();
282                turnList      = new ArrayList<Map<String,String>>();
283                current_compound = null;
284                dbrefs        = new ArrayList<DBRef>();
285                siteMap = null;
286                dateFormat = new SimpleDateFormat("dd-MMM-yy", Locale.US);
287                atomCount = 0;
288                atomOverflow = false;
289                parseCAonly = false;
290
291                // this SHOULD not be done
292                // DONOT:setFileParsingParameters(params);
293                // set the correct max values for parsing...
294                loadMaxAtoms = params.getMaxAtoms();
295                atomCAThreshold = params.getAtomCaThreshold();
296
297                linkRecords = new ArrayList<LinkRecord>();
298
299                blankChainIdsPresent = false;
300
301        }
302
303        /** initiate new resNum, either Hetatom, Nucleotide, or AminoAcid */
304        private Group getNewGroup(String recordName,Character aminoCode1, String aminoCode3) {
305
306                Group g = ChemCompGroupFactory.getGroupFromChemCompDictionary(aminoCode3);
307                if ( g != null && !g.getChemComp().isEmpty())
308                        return g;
309
310
311                Group group;
312                if (aminoCode1 == null || StructureTools.UNKNOWN_GROUP_LABEL == aminoCode1 ){
313                        group = new HetatomImpl();
314
315                } else if(StructureTools.isNucleotide(aminoCode3))  {
316                        // it is a nucleotide
317                        NucleotideImpl nu = new NucleotideImpl();
318                        group = nu;
319
320                } else {
321                        AminoAcidImpl aa = new AminoAcidImpl() ;
322                        aa.setAminoType(aminoCode1);
323                        group = aa ;
324                }
325
326                //              System.out.println("new resNum type: "+ resNum.getType() );
327                return  group ;
328        }
329
330
331
332        // Handler methods to deal with PDB file records properly.
333        /**
334         Handler for
335         HEADER Record Format
336         <pre>
337         COLUMNS        DATA TYPE       FIELD           DEFINITION
338         ----------------------------------------------------------------------------------
339         1 -  6        Record name     "HEADER"
340         11 - 50        String(40)      classification  Classifies the molecule(s)
341         51 - 59        Date            depDate         Deposition date.  This is the date
342         the coordinates were received by
343         the PDB
344         63 - 66        IDcode          idCode          This identifier is unique within PDB
345        </pre>
346         */
347        private void pdb_HEADER_Handler(String line) {
348
349                String classification  = null;
350                String deposition_date = null;
351                String pdbCode         = null;
352
353                int len = line.trim().length();
354                if(len > 10) {
355                        classification  = line.substring (10, min(len,50)).trim() ;
356                        pdbHeader.setClassification(classification);
357                }
358                if(len > 50) {
359                        deposition_date = line.substring (50, min(len,59)).trim() ;
360                        try {
361                                Date dep = dateFormat.parse(deposition_date);
362                                pdbHeader.setDepDate(dep);
363
364                        } catch (ParseException e){
365                                logger.info("Could not parse deposition date string '"+deposition_date+"'. Will continue without deposition date");
366                        }
367                }
368                if(len > 62) {
369                        pdbCode         = line.substring (62, min(len,66)).trim() ;
370                        pdbId = pdbCode;
371
372                        logger.debug("Parsing entry " + pdbId);
373
374
375                        PdbId pdbIdToSet;
376                        try {
377                                pdbIdToSet = new PdbId(pdbCode);
378                        } catch (IllegalArgumentException e) {
379                                logger.info("Malformed (or null) PDB ID {}. setting PdbId to null", pdbCode);
380                                pdbIdToSet = null;
381                        }
382                        structure.setPdbId(pdbIdToSet);
383                        pdbHeader.setPdbId(pdbIdToSet);
384                }
385
386                //*really* old files (you'll need to hunt to find these as they
387                //should have been remediated) have headers like below. Plus the
388                //pdbId at positions 72-76 is present in every line
389
390                //HEADER    PROTEINASE INHIBITOR (TRYPSIN)          05-OCT-84   5PTI      5PTI   3
391                //HEADER    TRANSFERASE (ACYLTRANSFERASE)           02-SEP-92   1LAC      1LAC   2
392                if (len > 66) {
393                        if (pdbId.equals(line.substring (72, 76))){
394                                isLegacyFormat = true;
395                                logger.warn(pdbId + " is a LEGACY entry - this will most likely not parse correctly.");
396                        }
397                }
398
399        }
400
401
402        /**
403         * Parses the following record:
404         * <pre>
405         *  COLUMNS      DATA  TYPE      FIELD         DEFINITION
406         * ------------------------------------------------------------------------------------
407         *  1 -  6      Record name     "AUTHOR"
408         *  9 - 10      Continuation    continuation  Allows concatenation of multiple records.
409         * 11 - 79      List            authorList    List of the author names, separated
410         *                                            by commas.
411         *
412         * </pre>
413         * @param line
414         */
415        private void pdb_AUTHOR_Handler(String line) {
416
417                String authors = line.substring(10).trim();
418
419                String auth = pdbHeader.getAuthors();
420                if (auth == null){
421                        pdbHeader.setAuthors(authors);
422                } else {
423                        auth +=  authors;
424                        pdbHeader.setAuthors(auth);
425                }
426
427        }
428
429
430
431        /**
432         * Parses the following record:
433         *
434         * <pre>
435         * COLUMNS       DATA TYPE        FIELD        DEFINITION
436         * --------------------------------------------------------------------
437         *  1 -  6       Record name      "HELIX "
438         *  8 - 10       Integer          serNum       Serial number of the helix.
439         *                                             This starts at 1 and increases
440         *                                             incrementally.
441         * 12 - 14       LString(3)       helixID      Helix identifier. In addition
442         *                                             to a serial number, each helix is
443         *                                             given an alphanumeric character
444         *                                             helix identifier.
445         * 16 - 18       Residue name     initResName  Name of the initial residue.
446         * 20            Character        initChainID  Chain identifier for the chain
447         *                                             containing this helix.
448         * 22 - 25       Integer          initSeqNum   Sequence number of the initial
449         *                                             residue.
450         * 26            AChar            initICode    Insertion code of the initial
451         *                                             residue.
452         * 28 - 30       Residue name     endResName   Name of the terminal residue of
453         *                                             the helix.
454         * 32            Character        endChainID   Chain identifier for the chain
455         *                                             containing this helix.
456         * 34 - 37       Integer          endSeqNum    Sequence number of the terminal
457         *                                             residue.
458         * 38            AChar            endICode     Insertion code of the terminal
459         *                                             residue.
460         * 39 - 40       Integer          helixClass   Helix class (see below).
461         * 41 - 70       String           comment      Comment about this helix.
462         * 72 - 76       Integer          length       Length of this helix.
463         * </pre>
464         */
465        private void pdb_HELIX_Handler(String line){
466
467                if (params.isHeaderOnly()) return;
468
469                if (line.length()<38) {
470                        logger.info("HELIX line has length under 38. Ignoring it.");
471                        return;
472                }
473
474                String initResName = line.substring(15,18).trim();
475                String initChainId = line.substring(19,20);
476                String initSeqNum  = line.substring(21,25).trim();
477                String initICode   = line.substring(25,26);
478                String endResName  = line.substring(27,30).trim();
479                String endChainId  = line.substring(31,32);
480                String endSeqNum   = line.substring(33,37).trim();
481                String endICode    = line.substring(37,38);
482
483                //System.out.println(initResName + " " + initChainId + " " + initSeqNum + " " + initICode + " " +
484                //        endResName + " " + endChainId + " " + endSeqNum + " " + endICode);
485
486                Map<String,String> m = new HashMap<String,String>();
487
488                m.put("initResName",initResName);
489                m.put("initChainId", initChainId);
490                m.put("initSeqNum", initSeqNum);
491                m.put("initICode", initICode);
492                m.put("endResName", endResName);
493                m.put("endChainId", endChainId);
494                m.put("endSeqNum",endSeqNum);
495                m.put("endICode",endICode);
496
497                helixList.add(m);
498
499        }
500
501        /**
502         * Handler for
503         * <pre>
504         *       COLUMNS     DATA TYPE        FIELD           DEFINITION
505         * --------------------------------------------------------------
506         *  1 -  6     Record name      "SHEET "
507         *  8 - 10     Integer          strand       Strand number which starts at 1
508         *                                           for each strand within a sheet
509         *                                           and increases by one.
510         * 12 - 14     LString(3)       sheetID      Sheet identifier.
511         * 15 - 16     Integer          numStrands   Number of strands in sheet.
512         * 18 - 20     Residue name     initResName  Residue name of initial residue.
513         * 22          Character        initChainID  Chain identifier of initial
514         *                                           residue in strand.
515         * 23 - 26     Integer          initSeqNum   Sequence number of initial
516         *                                           residue in strand.
517         * 27          AChar            initICode    Insertion code of initial residue
518         *                                           in strand.
519         * 29 - 31     Residue name     endResName   Residue name of terminal residue.
520         * 33          Character        endChainID   Chain identifier of terminal
521         *                                           residue.
522         * 34 - 37     Integer          endSeqNum    Sequence number of terminal
523         *                                           residue.
524         * 38          AChar            endICode     Insertion code of terminal
525         *                                           residue.
526         * 39 - 40     Integer          sense        Sense of strand with respect to
527         *                                           previous strand in the sheet. 0
528         *                                           if first strand, 1 if parallel,
529         *                                           -1 if anti-parallel.
530         * 42 - 45     Atom             curAtom      Registration. Atom name in
531         *                                           current strand.
532         * 46 - 48     Residue name     curResName   Registration. Residue name in
533         *                                           current strand.
534         * 50          Character        curChainId   Registration. Chain identifier in
535         *                                           current strand.
536         * 51 - 54     Integer          curResSeq    Registration. Residue sequence
537         *                                           number in current strand.
538         * 55          AChar            curICode     Registration. Insertion code in
539         *                                           current strand.
540         * 57 - 60     Atom             prevAtom     Registration. Atom name in
541         *                                           previous strand.
542         * 61 - 63     Residue name     prevResName  Registration. Residue name in
543         *                                           previous strand.
544         * 65          Character        prevChainId  Registration. Chain identifier in
545         *                                           previous strand.
546         * 66 - 69     Integer          prevResSeq   Registration. Residue sequence
547         *                                           number in previous strand.
548         * 70          AChar            prevICode    Registration. Insertion code in
549         *                                               previous strand.
550         * </pre>
551         */
552        private void pdb_SHEET_Handler( String line){
553
554                if (params.isHeaderOnly()) return;
555
556                if (line.length()<38) {
557                        logger.info("SHEET line has length under 38. Ignoring it.");
558                        return;
559                }
560
561                String initResName = line.substring(17,20).trim();
562                String initChainId = line.substring(21,22);
563                String initSeqNum  = line.substring(22,26).trim();
564                String initICode   = line.substring(26,27);
565                String endResName  = line.substring(28,31).trim();
566                String endChainId  = line.substring(32,33);
567                String endSeqNum   = line.substring(33,37).trim();
568                String endICode    = line.substring(37,38);
569
570                //System.out.println(initResName + " " + initChainId + " " + initSeqNum + " " + initICode + " " +
571                //        endResName + " " + endChainId + " " + endSeqNum + " " + endICode);
572
573                Map<String,String> m = new HashMap<String,String>();
574
575                m.put("initResName",initResName);
576                m.put("initChainId", initChainId);
577                m.put("initSeqNum", initSeqNum);
578                m.put("initICode", initICode);
579                m.put("endResName", endResName);
580                m.put("endChainId", endChainId);
581                m.put("endSeqNum",endSeqNum);
582                m.put("endICode",endICode);
583
584                strandList.add(m);
585        }
586
587
588        /**
589         * Handler for TURN lines
590         * <pre>
591         * COLUMNS      DATA TYPE        FIELD         DEFINITION
592         * --------------------------------------------------------------------
593         *  1 -  6      Record name      "TURN "
594         *  8 - 10      Integer          seq           Turn number; starts with 1 and
595         *                                             increments by one.
596         * 12 - 14      LString(3)       turnId        Turn identifier
597         * 16 - 18      Residue name     initResName   Residue name of initial residue in
598         *                                             turn.
599         * 20           Character        initChainId   Chain identifier for the chain
600         *                                             containing this turn.
601         * 21 - 24      Integer          initSeqNum    Sequence number of initial residue
602         *                                             in turn.
603         * 25           AChar            initICode     Insertion code of initial residue
604         *                                             in turn.
605         * 27 - 29      Residue name     endResName    Residue name of terminal residue
606         *                                             of turn.
607         * 31           Character        endChainId    Chain identifier for the chain
608         *                                             containing this turn.
609         * 32 - 35      Integer          endSeqNum     Sequence number of terminal
610         *                                             residue of turn.
611         * 36           AChar            endICode      Insertion code of terminal residue
612         *                                             of turn.
613         * 41 - 70      String           comment       Associated comment.
614         * </pre>
615         * @param line
616         */
617        private void pdb_TURN_Handler( String line){
618
619                if (params.isHeaderOnly()) return;
620
621                if (line.length()<36) {
622                        logger.info("TURN line has length under 36. Ignoring it.");
623                        return;
624                }
625
626                String initResName = line.substring(15,18).trim();
627                String initChainId = line.substring(19,20);
628                String initSeqNum  = line.substring(20,24).trim();
629                String initICode   = line.substring(24,25);
630                String endResName  = line.substring(26,29).trim();
631                String endChainId  = line.substring(30,31);
632                String endSeqNum   = line.substring(31,35).trim();
633                String endICode    = line.substring(35,36);
634
635                //System.out.println(initResName + " " + initChainId + " " + initSeqNum + " " + initICode + " " +
636                //        endResName + " " + endChainId + " " + endSeqNum + " " + endICode);
637
638                Map<String,String> m = new HashMap<String,String>();
639
640                m.put("initResName",initResName);
641                m.put("initChainId", initChainId);
642                m.put("initSeqNum", initSeqNum);
643                m.put("initICode", initICode);
644                m.put("endResName", endResName);
645                m.put("endChainId", endChainId);
646                m.put("endSeqNum",endSeqNum);
647                m.put("endICode",endICode);
648
649                turnList.add(m);
650        }
651
652        /**
653         * Handler for
654         * REVDAT Record format:
655         * <pre>
656         *
657         * COLUMNS       DATA TYPE      FIELD         DEFINITION
658         * ----------------------------------------------------------------------------------
659         * 1 -  6       Record name    "REVDAT"
660         * 8 - 10       Integer        modNum        Modification number.
661         * 11 - 12       Continuation   continuation  Allows concatenation of multiple
662         * records.
663         * 14 - 22       Date           modDate       Date of modification (or release for
664         * new entries).  This is not repeated
665         * on continuation lines.
666         * 24 - 28       String(5)      modId         Identifies this particular
667         * modification.  It links to the
668         * archive used internally by PDB.
669         * This is not repeated on continuation
670         * lines.
671         * 32            Integer        modType       An integer identifying the type of
672         * modification.  In case of revisions
673         * with more than one possible modType,
674         * the highest value applicable will be
675         * assigned.
676         * 40 - 45       LString(6)     record        Name of the modified record.
677         * 47 - 52       LString(6)     record        Name of the modified record.
678         * 54 - 59       LString(6)     record        Name of the modified record.
679         * 61 - 66       LString(6)     record        Name of the modified record.
680         * </pre>
681         */
682        private void pdb_REVDAT_Handler(String line) {
683
684                // keep the first as latest modified date and the last as release date
685                Date modDate = pdbHeader.getModDate();
686
687                if ( modDate==null || modDate.equals(new Date(0)) ) {
688
689                        // modified date is still uninitialized
690                        String modificationDate = line.substring (13, 22).trim() ;
691
692                        try {
693                                Date dep = dateFormat.parse(modificationDate);
694                                pdbHeader.setModDate(dep);
695                                pdbHeader.setRelDate(dep);
696                        } catch (ParseException e){
697                                logger.info("Could not parse revision date string '"+modificationDate+"'. ");
698                        }
699
700                } else {
701
702                        // set as the release date
703                        String releaseDate = line.substring (13, 22).trim() ;
704
705                        try {
706                                Date dep = dateFormat.parse(releaseDate);
707                                pdbHeader.setRelDate(dep);
708                        } catch (ParseException e){
709                                logger.info("Could not parse revision date string '"+releaseDate+"'. ");
710                        }
711                }
712        }
713
714        /**
715         * Handler for
716         * SEQRES record format
717         * SEQRES records contain the amino acid or nucleic acid sequence of residues in each chain of the macromolecule that was studied.
718         * <p>
719         * Record Format:
720         * <p>
721         * <pre>
722         * COLUMNS        DATA TYPE       FIELD         DEFINITION
723         * ---------------------------------------------------------------------------------
724         * 1 -  6        Record name     "SEQRES"
725         * 9 - 10        Integer         serNum        Serial number of the SEQRES record
726         * for the current chain.  Starts at 1
727         * and increments by one each line.
728         * Reset to 1 for each chain.
729         * 12             Character       chainID       Chain identifier.  This may be any
730         * single legal character, including a
731         * blank which is used if there is
732         * only one chain.
733         * 14 - 17        Integer         numRes        Number of residues in the chain.
734         * This value is repeated on every
735         * record.
736         * 20 - 22        Residue name    resName       Residue name.
737         * 24 - 26        Residue name    resName       Residue name.
738         * 28 - 30        Residue name    resName       Residue name.
739         * 32 - 34        Residue name    resName       Residue name.
740         * 36 - 38        Residue name    resName       Residue name.
741         * 40 - 42        Residue name    resName       Residue name.
742         * 44 - 46        Residue name    resName       Residue name.
743         * 48 - 50        Residue name    resName       Residue name.
744         * 52 - 54        Residue name    resName       Residue name.
745         * 56 - 58        Residue name    resName       Residue name.
746         * 60 - 62        Residue name    resName       Residue name.
747         * 64 - 66        Residue name    resName       Residue name.
748         * 68 - 70        Residue name    resName       Residue name.
749         * </pre>
750         * @author Jules Jacobsen
751         */
752        private void pdb_SEQRES_Handler(String line) {
753
754                /*
755                 *          1         2         3         4         5         6         7
756                 * 1234567890123456789012345678901234567890123456789012345678901234567890
757                 * SEQRES   1 A  376  LYS PRO VAL THR VAL LYS LEU VAL ASP SER GLN ALA THR
758                 * SEQRES   1 A   21  GLY ILE VAL GLU GLN CYS CYS THR SER ILE CYS SER LEU
759                 * SEQRES   2 A   21  TYR GLN LEU GLU ASN TYR CYS ASN
760                 * SEQRES   1 B   30  PHE VAL ASN GLN HIS LEU CYS GLY SER HIS LEU VAL GLU
761                 * SEQRES   2 B   30  ALA LEU TYR LEU VAL CYS GLY GLU ARG GLY PHE PHE TYR
762                 * SEQRES   3 B   30  THR PRO LYS ALA
763                 * SEQRES   1 C   21  GLY ILE VAL GLU GLN CYS CYS THR SER ILE CYS SER LEU
764                 * SEQRES   2 C   21  TYR GLN LEU GLU ASN TYR CYS ASN
765                 * SEQRES   1 D   30  PHE VAL ASN GLN HIS LEU CYS GLY SER HIS LEU VAL GLU
766                 * SEQRES   2 D   30  ALA LEU TYR LEU VAL CYS GLY GLU ARG GLY PHE PHE TYR
767                 * SEQRES   3 D   30  THR PRO LYS ALA
768                 */
769
770                String recordName = line.substring(0, 6).trim();
771                String chainID    = line.substring(11, 12);
772                String newLength   = line.substring(13,17).trim();
773                String subSequence = line.substring(18);
774
775                if ( lengthCheck == -1 ){
776                        lengthCheck = Integer.parseInt(newLength);
777                }
778
779                StringTokenizer subSequenceResidues = new StringTokenizer(subSequence);
780
781                Character aminoCode1 = null;
782                if (! recordName.equals(AminoAcid.SEQRESRECORD)) {
783                        // should not have been called
784                        return;
785                }
786
787                currentChain = isKnownChain(chainID, seqResChains);
788                if ( currentChain == null) {
789
790                        currentChain = new ChainImpl();
791                        currentChain.setId(chainID);
792                        currentChain.setName(chainID);
793
794                }
795
796                while (subSequenceResidues.hasMoreTokens()) {
797
798                        String threeLetter = subSequenceResidues.nextToken();
799
800                        aminoCode1 = StructureTools.get1LetterCode(threeLetter);
801
802                        //if (aminoCode1 == null) {
803                        // could be a nucleotide...
804                        // but getNewGroup takes care of that and converts ATOM records with aminoCode1 == nnull to nucleotide...
805                        //}
806                        currentGroup = getNewGroup("ATOM", aminoCode1, threeLetter);
807
808                        currentGroup.setPDBName(threeLetter);
809
810                        if ( currentGroup instanceof AminoAcid){
811                                AminoAcid aa = (AminoAcid)currentGroup;
812                                aa.setRecordType(AminoAcid.SEQRESRECORD);
813                        }
814                        // add the current resNum to the new chain.
815                        currentChain.addGroup(currentGroup);
816
817                }
818                Chain test = isKnownChain(chainID, seqResChains);
819
820                if ( test == null)
821                        seqResChains.add(currentChain);
822
823                if (currentGroup != null)
824                        currentGroup.trimToSize();
825
826                currentGroup = null;
827                currentChain = null;
828
829                //               the current chain is finished!
830                //if ( current_chain.getLength() != lengthCheck ){
831                //      System.err.println("the length of chain " + current_chain.getName() + "(" +
832                //                      current_chain.getLength() + ") does not match the expected " + lengthCheck);
833                //}
834
835                lengthCheck = Integer.parseInt(newLength);
836
837        }
838
839
840
841        /**
842         * Handler for
843         * TITLE Record Format
844         * <pre>
845         COLUMNS        DATA TYPE       FIELD          DEFINITION
846         ----------------------------------------------------------------------------------
847         1 -  6        Record name     "TITLE "
848         9 - 10        Continuation    continuation   Allows concatenation of multiple
849         records.
850         11 - 70        String          title          Title of the experiment.
851         * </pre>
852         *
853         */
854        private void pdb_TITLE_Handler(String line) {
855                String title;
856                if ( line.length() > 79)
857                        title = line.substring(10,80).trim();
858                else
859                        title = line.substring(10,line.length()).trim();
860
861                String t = pdbHeader.getTitle();
862                if ( (t != null) && (! t.equals("")) ){
863                        if (t.endsWith("-"))
864                                t += ""; // if last line ends with a hyphen then we don't add space
865                        else
866                                t += " ";
867                }
868                else t = "";
869
870                t += title;
871
872                pdbHeader.setTitle(t);
873        }
874
875        /**
876         * JRNL handler.
877         * The JRNL record contains the primary literature citation that describes the experiment which resulted
878         * in the deposited coordinate set. There is at most one JRNL reference per entry. If there is no primary
879         * reference, then there is no JRNL reference. Other references are given in REMARK 1.
880         *
881         * Record Format
882         * <pre>
883         * COLUMNS       DATA TYPE     FIELD         DEFINITION
884         * -----------------------------------------------------------------------
885         * 1 -  6       Record name   "JRNL  "
886         *
887         * 13 - 70       LString        text         See Details below.
888         * </pre>
889         */
890        private void pdb_JRNL_Handler(String line) {
891                //add the strings to the journalLines
892                //the actual JournalArticle is then built when the whole entry is being
893                //finalized with triggerEndFileChecks()
894                //JRNL        TITL   NMR SOLUTION STRUCTURE OF RECOMBINANT TICK           1TAP  10
895                if (line.substring(line.length() - 8, line.length() - 4).equals(pdbId)) {
896                        //trim off the trailing PDB id from legacy files.
897                        //are we really trying to still cater for these museum pieces?
898
899                        logger.debug("trimming legacy PDB id from end of JRNL section line");
900
901                        line = line.substring(0, line.length() - 8);
902                        journalLines.add(line);
903                } else {
904                        journalLines.add(line);
905                }
906        }
907
908        /**
909         * This should not be accessed directly, other than by </code>makeCompounds</code>. It still deals with the same
910         * lines in a similar manner but if not accessed from </code>makeCompounds</code> the last element will be
911         * missing. Don't say I didn't warn you.
912         *
913         * @param line
914         */
915        private void pdb_COMPND_Handler(String line) {
916
917                logger.debug("previousContinuationField  is "
918                                        + previousContinuationField);
919                logger.debug("current continuationField  is "
920                                        + continuationField);
921                logger.debug("current continuationString is "
922                                        + continuationString);
923                logger.debug("current compound           is "
924                                        + current_compound);
925
926
927                // In legacy PDB files the line ends with the PDB code and a serial number, chop those off!
928                //format version 3.0 onwards will have 80 characters in a line
929                //              if (line.length() > 72) {
930                if (isLegacyFormat) {
931                        //                    if (DEBUG) {
932                        //                        System.out.println("We have a legacy file - truncating line length to 71 characters:");
933                        //                        System.out.println(line);
934                        //                    }
935                        line = line.substring(0, 72);
936                }
937
938                line = line.substring(10, line.length());
939
940
941                String[] fieldList = line.trim().split("\\s+");
942                int fl = fieldList.length;
943                if (fl > 0) {
944                        String field0 = fieldList[0];
945                        if (compndFieldValues.contains(field0)) {
946                                continuationField = field0;
947                                if (previousContinuationField.equals("")) {
948                                        previousContinuationField = continuationField;
949                                }
950                        } else if (field0.endsWith(";") && compndFieldValues.contains(field0.substring(0, field0.length()-1)) ) {
951                                // the ':' character indicates the end of a field name and should be invalid as part the first data token
952                                // e.g. obsolete file 1hhb has a malformed COMPND line that can only be caught with this kind of check
953                                // UPDATE: There is no harm of having a ':' in the first data token. e.g. 3fdj contains a ':'.
954                                //   The intended case occurs only if the token is a key followed by a colon and a semicolon without spaces, e.g. "COMPND   2 MOLECULE:;"
955                                logger.info("COMPND line does not follow the PDB 3.0 format. Note that COMPND parsing is not supported any longer in format 2.3 or earlier");
956                                return;
957                        }
958                } else {
959                        // the line will be added as data to the previous field
960                }
961
962
963                line = line.replace(continuationField, "").trim();
964
965                StringTokenizer compndTokens = new StringTokenizer(line);
966
967                //              System.out.println("PDBFileParser.pdb_COMPND_Handler: Tokenizing '" + line + "'");
968
969                while (compndTokens.hasMoreTokens()) {
970                        String token = compndTokens.nextToken();
971
972                        if (previousContinuationField.equals("")) {
973                                previousContinuationField = continuationField;
974                        }
975
976                        if (previousContinuationField.equals(continuationField)
977                                        && compndFieldValues.contains(continuationField)) {
978
979                                logger.debug("Still in field " + continuationField);
980                                logger.debug("token = " + token);
981
982                                continuationString = continuationString.concat(token + " ");
983
984                                logger.debug("continuationString = "
985                                                        + continuationString);
986
987                        }
988                        if (!continuationField.equals(previousContinuationField)) {
989
990                                if (continuationString.equals("")) {
991                                        continuationString = token;
992
993                                } else {
994
995                                        compndValueSetter(previousContinuationField,
996                                                        continuationString);
997                                        previousContinuationField = continuationField;
998                                        continuationString = token + " ";
999                                }
1000                        } else if (ignoreCompndFieldValues.contains(token)) {
1001                                // this field shall be ignored
1002                                //continuationField = token;
1003                        }
1004                }
1005                if (isLastCompndLine) {
1006                        // final line in the section - finish off the compound
1007                        //                      System.out.println("[pdb_COMPND_Handler] Final COMPND line - Finishing off final MolID header.");
1008                        compndValueSetter(continuationField, continuationString);
1009                        continuationString = "";
1010                        if (current_compound!=null) entities.add(current_compound);
1011                }
1012        }
1013
1014        /**
1015         * Set the value in the current molId object
1016         * @param field
1017         * @param value
1018         */
1019        private void compndValueSetter(String field, String value) {
1020
1021                value = value.trim().replace(";", "");
1022                if (field.equals("MOL_ID:")) {
1023
1024                        int i = -1;
1025                        try {
1026                                i = Integer.valueOf(value);
1027                        } catch (NumberFormatException e){
1028                                logger.warn("Value '{}' does not look like a number, while trying to parse COMPND MOL_ID line.",value);
1029                        }
1030                        if (i>0 && prevMolId!=i) {
1031
1032                                if (current_compound!=null) entities.add(current_compound);
1033
1034                                logger.debug("Initialising new Compound with mol_id {}", i);
1035
1036                                current_compound = new EntityInfo();
1037
1038                                current_compound.setMolId(i);
1039
1040                                // we will set polymer for all defined compounds in PDB file (non-polymer compounds are not defined in header) - JD 2016-03-25
1041                                current_compound.setType(EntityType.POLYMER);
1042
1043                                prevMolId = i;
1044                        }
1045
1046                }
1047
1048                // if for some reason (e.g. missing mol_id line) the current_compound is null we can't add anything to it, return
1049                if (current_compound==null) {
1050                        return;
1051                }
1052
1053                if (field.equals("MOLECULE:")) {
1054                        current_compound.setDescription(value);
1055
1056                }
1057                if (field.equals("CHAIN:")) {
1058                        //System.out.println(value);
1059                        StringTokenizer chainTokens = new StringTokenizer(value, ",");
1060                        List<String> chains = new ArrayList<String>();
1061
1062                        while (chainTokens.hasMoreTokens()) {
1063                                String chainID = chainTokens.nextToken().trim();
1064                                // NULL is used in old PDB files to represent empty chain DI
1065                                if (chainID.equals("NULL"))
1066                                        chainID = " ";
1067                                chains.add(chainID);
1068                        }
1069                        compoundMolIds2chainIds.put(current_compound.getMolId(),chains);
1070
1071                }
1072                if (field.equals("SYNONYM:")) {
1073
1074                        StringTokenizer synonyms = new StringTokenizer(value, ",");
1075                        List<String> names = new ArrayList<String>();
1076
1077                        while (synonyms.hasMoreTokens()) {
1078                                names.add(synonyms.nextToken());
1079
1080                                current_compound.setSynonyms(names);
1081                        }
1082
1083                }
1084
1085                if (field.equals("EC:")) {
1086
1087                        StringTokenizer ecNumTokens = new StringTokenizer(value, ",");
1088                        List<String> ecNums = new ArrayList<String>();
1089
1090                        while (ecNumTokens.hasMoreTokens()) {
1091                                ecNums.add(ecNumTokens.nextToken());
1092
1093                                current_compound.setEcNums(ecNums);
1094                        }
1095
1096                }
1097                if (field.equals("FRAGMENT:")) {
1098
1099                        current_compound.setFragment(value);
1100
1101                }
1102                if (field.equals("ENGINEERED:")) {
1103
1104                        current_compound.setEngineered(value);
1105
1106                }
1107                if (field.equals("MUTATION:")) {
1108
1109                        current_compound.setMutation(value);
1110
1111                }
1112                if (field.equals("BIOLOGICAL_UNIT:")) {
1113
1114                        current_compound.setBiologicalUnit(value);
1115
1116                }
1117                if (field.equals("OTHER_DETAILS:")) {
1118
1119                        current_compound.setDetails(value);
1120
1121                }
1122
1123        }
1124
1125
1126        /**
1127         * Handler for
1128         * SOURCE Record format
1129         *
1130         * The SOURCE record specifies the biological and/or chemical source of each biological molecule in the entry. Sources are described by both the common name and the scientific name, e.g., genus and species. Strain and/or cell-line for immortalized cells are given when they help to uniquely identify the biological entity studied.
1131         * Record Format
1132         * <pre>
1133         * COLUMNS   DATA TYPE         FIELD          DEFINITION
1134         * -------------------------------------------------------------------------------
1135         *  1 -  6   Record name       "SOURCE"
1136         *  9 - 10   Continuation      continuation   Allows concatenation of multiple records.
1137         * 11 - 70   Specification     srcName        Identifies the source of the macromolecule in
1138         *            list                            a token: value format.
1139         * </pre>
1140         * @param line the line to be parsed
1141         */
1142        private void pdb_SOURCE_Handler(String line) {
1143                // works in the same way as the pdb_COMPND_Handler.
1144                String continuationNr = line.substring(9, 10).trim();
1145
1146
1147
1148                logger.debug("current continuationNo     is "
1149                                + continuationNr);
1150                logger.debug("previousContinuationField  is "
1151                                + previousContinuationField);
1152                logger.debug("current continuationField  is "
1153                                + continuationField);
1154                logger.debug("current continuationString is "
1155                                + continuationString);
1156                logger.debug("current compound           is "
1157                                + current_compound);
1158
1159
1160                // following the docs, the last valid character should be 79, chop off the rest
1161                if (line.length() > 79) {
1162                        line = line.substring(0, 79);
1163                }
1164
1165                line = line.substring(10, line.length());
1166
1167                logger.debug("LINE: >" + line + "<");
1168
1169                String[] fieldList = line.split("\\s+");
1170
1171                if (!fieldList[0].equals("")
1172                                && sourceFieldValues.contains(fieldList[0])) {
1173                        //                      System.out.println("[PDBFileParser.pdb_COMPND_Handler] Setting continuationField to '" + fieldList[0] + "'");
1174                        continuationField = fieldList[0];
1175                        if (previousContinuationField.equals("")) {
1176                                previousContinuationField = continuationField;
1177                        }
1178
1179                } else if ((fieldList.length > 1) && ( sourceFieldValues.contains(fieldList[1]))) {
1180                        //                      System.out.println("[PDBFileParser.pdb_COMPND_Handler] Setting continuationField to '" + fieldList[1] + "'");
1181                        continuationField = fieldList[1];
1182                        if (previousContinuationField.equals("")) {
1183                                previousContinuationField = continuationField;
1184                        }
1185
1186                } else {
1187                        if (continuationNr.equals("")) {
1188
1189                                logger.debug("looks like an old PDB file");
1190
1191                                continuationField = "MOLECULE:";
1192                                if (previousContinuationField.equals("")) {
1193                                        previousContinuationField = continuationField;
1194                                }
1195                        }
1196
1197                }
1198
1199                line = line.replace(continuationField, "").trim();
1200
1201                StringTokenizer compndTokens = new StringTokenizer(line);
1202
1203                //              System.out.println("PDBFileParser.pdb_COMPND_Handler: Tokenizing '" + line + "'");
1204
1205                while (compndTokens.hasMoreTokens()) {
1206                        String token = compndTokens.nextToken();
1207
1208                        if (previousContinuationField.equals("")) {
1209                                //                              System.out.println("previousContinuationField is empty. Setting to : " + continuationField);
1210                                previousContinuationField = continuationField;
1211                        }
1212
1213                        if (previousContinuationField.equals(continuationField)
1214                                        && sourceFieldValues.contains(continuationField)) {
1215
1216                                logger.debug("Still in field " + continuationField);
1217
1218                                continuationString = continuationString.concat(token + " ");
1219
1220                                logger.debug("continuationString = "
1221                                                        + continuationString);
1222                        }
1223                        if (!continuationField.equals(previousContinuationField)) {
1224
1225                                if (continuationString.equals("")) {
1226                                        continuationString = token;
1227
1228                                } else {
1229
1230                                        sourceValueSetter(previousContinuationField,
1231                                                        continuationString);
1232                                        previousContinuationField = continuationField;
1233                                        continuationString = token + " ";
1234                                }
1235                        } else if (ignoreCompndFieldValues.contains(token)) {
1236                                // this field shall be ignored
1237                                //continuationField = token;
1238                        }
1239                }
1240                if (isLastSourceLine) {
1241                        // final line in the section - finish off the compound
1242                        //                      System.out.println("[pdb_SOURCE_Handler] Final SOURCE line - Finishing off final MolID header.");
1243                        sourceValueSetter(continuationField, continuationString);
1244                        continuationString = "";
1245                        //compounds.add(current_compound);
1246                }
1247
1248        }
1249
1250
1251        /**
1252         * Set the value in the current molId object
1253         *
1254         * @param field
1255         * @param value
1256         */
1257        private void sourceValueSetter(String field, String value) {
1258
1259                value = value.trim().replace(";", "");
1260                //              System.out.println("[sourceValueSetter] " + field);
1261                if (field.equals("MOL_ID:")) {
1262
1263                        try {
1264                                current_compound = entities.get(Integer.valueOf(value) - 1);
1265                        } catch (NumberFormatException e){
1266                                logger.info("could not process SOURCE MOL_ID record correctly:" + e.getMessage());
1267                                return;
1268                        }
1269
1270
1271                        //                      System.out.println("[sourceValueSetter] Fetching compound " + value + " " + current_compound.getMolId());
1272
1273                }
1274                if (field.equals("SYNTHETIC:")) {
1275                        current_compound.setSynthetic(value);
1276                } else if (field.equals("FRAGMENT:")) {
1277                        current_compound.setFragment(value);
1278                } else if (field.equals("ORGANISM_SCIENTIFIC:")) {
1279                        current_compound.setOrganismScientific(value);
1280                } else if (field.equals("ORGANISM_TAXID:")) {
1281                        current_compound.setOrganismTaxId(value);
1282                } else if (field.equals("ORGANISM_COMMON:")) {
1283                        current_compound.setOrganismCommon(value);
1284                } else if (field.equals("STRAIN:")) {
1285                        current_compound.setStrain(value);
1286                } else if (field.equals("VARIANT:")) {
1287                        current_compound.setVariant(value);
1288                } else if (field.equals("CELL_LINE:")) {
1289                        current_compound.setCellLine(value);
1290                } else if (field.equals("ATCC:")) {
1291                        current_compound.setAtcc(value);
1292                } else if (field.equals("ORGAN:")) {
1293                        current_compound.setOrgan(value);
1294                } else if (field.equals("TISSUE:")) {
1295                        current_compound.setTissue(value);
1296                } else if (field.equals("CELL:")) {
1297                        current_compound.setCell(value);
1298                } else if (field.equals("ORGANELLE:")) {
1299                        current_compound.setOrganelle(value);
1300                } else if (field.equals("SECRETION:")) {
1301                        current_compound.setSecretion(value);
1302                } else if (field.equals("GENE:")) {
1303                        current_compound.setGene(value);
1304                } else if (field.equals("CELLULAR_LOCATION:")) {
1305                        current_compound.setCellularLocation(value);
1306                } else if (field.equals("EXPRESSION_SYSTEM:")) {
1307                        current_compound.setExpressionSystem(value);
1308                } else if (field.equals("EXPRESSION_SYSTEM_TAXID:")) {
1309                        current_compound.setExpressionSystemTaxId(value);
1310                } else if (field.equals("EXPRESSION_SYSTEM_STRAIN:")) {
1311                        current_compound.setExpressionSystemStrain(value);
1312                } else if (field.equals("EXPRESSION_SYSTEM_VARIANT:")) {
1313                        current_compound.setExpressionSystemVariant(value);
1314                } else if (field.equals("EXPRESSION_SYSTEM_CELL_LINE:")) {
1315                        current_compound.setExpressionSystemCellLine(value);
1316                } else if (field.equals("EXPRESSION_SYSTEM_ATCC_NUMBER:")) {
1317                        current_compound.setExpressionSystemAtccNumber(value);
1318                } else if (field.equals("EXPRESSION_SYSTEM_ORGAN:")) {
1319                        current_compound.setExpressionSystemOrgan(value);
1320                } else if (field.equals("EXPRESSION_SYSTEM_TISSUE:")) {
1321                        current_compound.setExpressionSystemTissue(value);
1322                } else if (field.equals("EXPRESSION_SYSTEM_CELL:")) {
1323                        current_compound.setExpressionSystemCell(value);
1324                } else if (field.equals("EXPRESSION_SYSTEM_ORGANELLE:")) {
1325                        current_compound.setExpressionSystemOrganelle(value);
1326                } else if (field.equals("EXPRESSION_SYSTEM_CELLULAR_LOCATION:")) {
1327                        current_compound.setExpressionSystemCellularLocation(value);
1328                } else if (field.equals("EXPRESSION_SYSTEM_VECTOR_TYPE:")) {
1329                        current_compound.setExpressionSystemVectorType(value);
1330                } else if (field.equals("EXPRESSION_SYSTEM_VECTOR:")) {
1331                        current_compound.setExpressionSystemVector(value);
1332                } else if (field.equals("EXPRESSION_SYSTEM_PLASMID:")) {
1333                        current_compound.setExpressionSystemPlasmid(value);
1334                } else if (field.equals("EXPRESSION_SYSTEM_GENE:")) {
1335                        current_compound.setExpressionSystemGene(value);
1336                } else if (field.equals("OTHER_DETAILS:")) {
1337                        current_compound.setExpressionSystemOtherDetails(value);
1338                }
1339
1340        }
1341
1342        /**
1343         * Handler for REMARK lines
1344         */
1345        private void pdb_REMARK_Handler(String line) {
1346
1347                if ( line == null || line.length() < 11)
1348                        return;
1349
1350
1351                if (line.startsWith("REMARK 800")) {
1352                        pdb_REMARK_800_Handler(line);
1353
1354                } else if ( line.startsWith("REMARK 350")){
1355
1356                        if ( params.isParseBioAssembly()) {
1357
1358                                if (bioAssemblyParser == null){
1359                                        bioAssemblyParser = new PDBBioAssemblyParser();
1360                                }
1361
1362                                bioAssemblyParser.pdb_REMARK_350_Handler(line);
1363                        }
1364                } else if (line.startsWith("REMARK   2")) {
1365                        //REMARK   2 RESOLUTION.
1366                        Pattern pR = Pattern.compile("^REMARK   2 RESOLUTION.\\s+(\\d+\\.\\d+)\\s+ANGSTROMS\\..*");
1367                        handleResolutionLine(line, pR);
1368
1369                // REMARK 3 (for R free)
1370                // note: if more than 1 value present (occurring in hybrid experimental technique entries, e.g. 3ins, 4n9m)
1371                // then last one encountered will be taken
1372                } else if (line.startsWith("REMARK   3   FREE R VALUE")) {
1373
1374                        // Rfree annotation is not very consistent in PDB format, it varies depending on the software
1375                        // Here we follow this strategy:
1376                        // a) take the '(NO CUTOFF)' value if the only one available (shelx software, e.g. 1x7q)
1377                        // b) don't take it if also a line without '(NO CUTOFF)' is present (CNX software, e.g. 3lak)
1378
1379                        Pattern pR = Pattern.compile("^REMARK   3   FREE R VALUE\\s+(?:\\(NO CUTOFF\\))?\\s+:\\s+(\\d?\\.\\d+).*");
1380                        Matcher mR = pR.matcher(line);
1381                        if (mR.matches()) {
1382                                try {
1383                                        rfreeNoCutoffLine = Float.parseFloat(mR.group(1));
1384                                } catch (NumberFormatException e) {
1385                                        logger.info("Rfree value "+mR.group(1)+" does not look like a number, will ignore it");
1386                                }
1387                        }
1388                        pR = Pattern.compile("^REMARK   3   FREE R VALUE\\s+:\\s+(\\d?\\.\\d+).*");
1389                        mR = pR.matcher(line);
1390                        if (mR.matches()) {
1391                                try {
1392                                        rfreeStandardLine = Float.parseFloat(mR.group(1));
1393                                } catch (NumberFormatException e) {
1394                                        logger.info("Rfree value '{}' does not look like a number, will ignore it", mR.group(1));
1395                                }
1396                        }
1397
1398                // REMARK 3 RESOLUTION (contains more info than REMARK 2, for instance multiple resolutions in hybrid experimental technique entries)
1399                // note: if more than 1 value present (occurring in hybrid experimental technique entries, e.g. 3ins, 4n9m)
1400                // then last one encountered will be taken
1401                } else if (line.startsWith("REMARK   3   RESOLUTION RANGE HIGH")){
1402                        Pattern pR = Pattern.compile("^REMARK   3   RESOLUTION RANGE HIGH \\(ANGSTROMS\\) :\\s+(\\d+\\.\\d+).*");
1403                        handleResolutionLine(line, pR);
1404                } else if (line.startsWith("REMARK   3   EFFECTIVE RESOLUTION")){
1405                        Pattern pR = Pattern.compile("^REMARK   3   EFFECTIVE RESOLUTION \\(ANGSTROMS\\)\\s+:\\s+(\\d+\\.\\d+).*");
1406                        handleResolutionLine(line, pR);
1407                }
1408        }
1409
1410        public void handleResolutionLine(String line, Pattern pR) {
1411                Matcher mR = pR.matcher(line);
1412                if (mR.matches()) {
1413                        final String resString = mR.group(1);
1414                        try {
1415                                float res = Float.parseFloat(resString);
1416                                final float resInHeader = pdbHeader.getResolution();
1417                                if (resInHeader!=PDBHeader.DEFAULT_RESOLUTION && resInHeader != res) {
1418                                        logger.warn("More than 1 resolution value present, will use last one {} and discard previous {} "
1419                                                        ,resString, String.format("%4.2f",resInHeader));
1420                                }
1421                                pdbHeader.setResolution(res);
1422                        } catch (NumberFormatException e) {
1423                                logger.info("Could not parse resolution '{}', ignoring it",resString);
1424                        }
1425                }
1426        }
1427
1428
1429
1430
1431
1432
1433        /**
1434         * Handler for
1435         * EXPDTA Record Format
1436        <pre>
1437         COLUMNS       DATA TYPE      FIELD         DEFINITION
1438         -------------------------------------------------------------------------------
1439         1 -  6       Record name    "EXPDTA"
1440         9 - 10       Continuation   continuation  Allows concatenation of multiple
1441         records.
1442         11 - 70       SList          technique     The experimental technique(s) with
1443         optional comment describing the
1444         sample or experiment.
1445
1446         allowed techniques are:
1447         ELECTRON DIFFRACTION
1448         FIBER DIFFRACTION
1449         FLUORESCENCE TRANSFER
1450         NEUTRON DIFFRACTION
1451         NMR
1452         THEORETICAL MODEL
1453         X-RAY DIFFRACTION
1454        </pre>
1455         */
1456        private void pdb_EXPDTA_Handler(String line) {
1457
1458                String technique  ;
1459                if (line.length() > 69)
1460                        technique = line.substring (10, 70).trim() ;
1461                else
1462                        technique = line.substring(10).trim();
1463
1464                for (String singleTechnique: technique.split(";\\s+")) {
1465                        pdbHeader.setExperimentalTechnique(singleTechnique);
1466                }
1467
1468
1469        }
1470
1471        /**
1472         * Handler for
1473         * CRYST1 Record Format
1474         * The CRYST1 record presents the unit cell parameters, space group, and Z value.
1475         * If the entry describes a structure determined by a technique other than X-ray crystallography,
1476         * CRYST1 contains a = b = c = 1.0, alpha = beta = gamma = 90 degrees, space group = P 1, and Z =1.
1477         * <pre>
1478         * COLUMNS DATA TYPE    FIELD          DEFINITION
1479         * -------------------------------------------------------------
1480         *  1 - 6  Record name  "CRYST1"
1481         *  7 - 15 Real(9.3)    a              a (Angstroms).
1482         * 16 - 24 Real(9.3)    b              b (Angstroms).
1483         * 25 - 33 Real(9.3)    c              c (Angstroms).
1484         * 34 - 40 Real(7.2)    alpha          alpha (degrees).
1485         * 41 - 47 Real(7.2)    beta           beta (degrees).
1486         * 48 - 54 Real(7.2)    gamma          gamma (degrees).
1487         * 56 - 66 LString      sGroup         Space group.
1488         * 67 - 70 Integer      z              Z value.
1489         * </pre>
1490         */
1491        private void pdb_CRYST1_Handler(String line) {
1492                // for badly formatted files (e.g. phenix-produced ones), there's no z and the min length is 58 (e.g. for SG 'P 1')
1493                if (line.length() < 58) {
1494                        logger.warn("CRYST1 record has fewer than 58 columns: will ignore it");
1495                        return;
1496                }
1497
1498                float a;
1499                float b;
1500                float c;
1501                float alpha;
1502                float beta;
1503                float gamma;
1504                String spaceGroup = "";
1505
1506                try {
1507                        a = Float.parseFloat(line.substring(6,15).trim());
1508                        b = Float.parseFloat(line.substring(15,24).trim());
1509                        c = Float.parseFloat(line.substring(24,33).trim());
1510                        alpha = Float.parseFloat(line.substring(33,40).trim());
1511                        beta = Float.parseFloat(line.substring(40,47).trim());
1512                        gamma = Float.parseFloat(line.substring(47,54).trim());
1513                } catch (NumberFormatException e) {
1514                        logger.info("could not parse CRYST1 record ("+e.getMessage()+") from line and ignoring it " + line);
1515                        return ;
1516                }
1517                if (line.length()>=66) {
1518                        // for well formatted files
1519                        spaceGroup = line.substring(55,66).trim();
1520                } else {
1521                        // for not-so-well formatted files, e.g. phenix-produced ones: they lack a Z value
1522                        spaceGroup = line.substring(55,line.length()).trim();
1523                }
1524
1525                CrystalCell xtalCell = new CrystalCell();
1526                xtalCell.setA(a);
1527                xtalCell.setB(b);
1528                xtalCell.setC(c);
1529                xtalCell.setAlpha(alpha);
1530                xtalCell.setBeta(beta);
1531                xtalCell.setGamma(gamma);
1532
1533                if (!xtalCell.isCellReasonable()) {
1534                        // If the entry describes a structure determined by a technique other than X-ray crystallography,
1535                        // CRYST1 contains a = b = c = 1.0, alpha = beta = gamma = 90 degrees, space group = P 1, and Z =1.
1536                        // if so we don't add the crystal cell and it remains null
1537                        logger.debug("The crystal cell read from file does not have reasonable dimensions (at least one dimension is below {}), discarding it.",
1538                                        CrystalCell.MIN_VALID_CELL_SIZE);
1539                } else {
1540                        crystallographicInfo.setCrystalCell(xtalCell);
1541                }
1542
1543                SpaceGroup sg = SymoplibParser.getSpaceGroup(spaceGroup);
1544                if (sg==null) {
1545                        logger.warn("Space group '"+spaceGroup+"' not recognised as a standard space group");
1546                        crystallographicInfo.setNonStandardSg(true);
1547                } else {
1548                        crystallographicInfo.setSpaceGroup(sg);
1549                        crystallographicInfo.setNonStandardSg(false);
1550                }
1551        }
1552
1553        /**
1554         * Handler for MTRIXn records. They specify extra NCS operators (usually in virus entries)
1555         *
1556         * See http://www.wwpdb.org/documentation/format33/sect8.html#MTRIXn
1557         * <pre>
1558         * COLUMNS        DATA TYPE     FIELD         DEFINITION
1559         * -------------------------------------------------------------
1560         *
1561         *  1 -  6        Record name   "MTRIXn"      n=1, 2, or 3
1562         *  8 - 10        Integer       serial        Serial number.
1563         * 11 - 20        Real(10.6)    m[n][1]       Mn1
1564         * 21 - 30        Real(10.6)    m[n][2]       Mn2
1565         * 31 - 40        Real(10.6)    m[n][3]       Mn3
1566         * 46 - 55        Real(10.5)    v[n]          Vn
1567         * 60             Integer       iGiven        1
1568         *
1569         * </pre>
1570         * Note that we ignore operators with iGiven==1
1571         *
1572         * @param line
1573         */
1574        private void pdb_MTRIXn_Handler(String line) {
1575
1576                // don't process incomplete records
1577                if (line.length() < 55) {
1578                        logger.info("MTRIXn record has fewer than 55 columns: will ignore it");
1579                        return;
1580                }
1581
1582
1583                try {
1584
1585                        int rowIndex = Integer.parseInt(line.substring(5,6));
1586                        double col1Value = Double.parseDouble(line.substring(10,20));
1587                        double col2Value = Double.parseDouble(line.substring(20,30));
1588                        double col3Value = Double.parseDouble(line.substring(30,40));
1589                        double translValue = Double.parseDouble(line.substring(45,55));
1590                        int iGiven = 0;
1591                        if (line.length()>=60 && !line.substring(59,60).trim().isEmpty()) {
1592                                iGiven = Integer.parseInt(line.substring(59,60));
1593                        }
1594
1595                        if (iGiven == 1) return;
1596
1597                        if (ncsOperators==null) {
1598                                // we initialise on first pass
1599                                ncsOperators = new ArrayList<Matrix4d>();
1600                        }
1601
1602                        if (currentNcsOp==null) {
1603                                currentNcsOp = new Matrix4d(1,0,0,0,  0,1,0,0,  0,0,1,0,  0,0,0,1); // initialised to identity
1604                        }
1605
1606                        currentNcsOp.setElement(rowIndex-1, 0, col1Value);
1607                        currentNcsOp.setElement(rowIndex-1, 1, col2Value);
1608                        currentNcsOp.setElement(rowIndex-1, 2, col3Value);
1609                        currentNcsOp.setElement(rowIndex-1, 3, translValue);
1610
1611
1612                        if (rowIndex==3) {
1613                                ncsOperators.add(currentNcsOp);
1614                                // we initialise for next matrix to come
1615                                currentNcsOp = new Matrix4d(1,0,0,0,  0,1,0,0,  0,0,1,0,  0,0,0,1); // initialised to identity
1616                        }
1617
1618                } catch (NumberFormatException e) {
1619                        logger.info("Could not parse a number in MTRIXn record ("+e.getMessage()+") from line: >" + line+"<");
1620                }
1621        }
1622
1623        /**
1624         * Handler for ATOM.
1625         * Record Format:
1626         *
1627         * <pre>
1628         * ATOM      1  N   ASP A  15     110.964  24.941  59.191  1.00 83.44           N
1629         *
1630         * COLUMNS        DATA TYPE       FIELD         DEFINITION
1631         * ---------------------------------------------------------------------------------
1632         * 1 -  6        Record name     "ATOM  "
1633         * 7 - 11        Integer         serial        Atom serial number.
1634         * 13 - 16        Atom            name          Atom name.
1635         * 17             Character       altLoc        Alternate location indicator.
1636         * 18 - 20        Residue name    resName       Residue name.
1637         * 22             Character       chainID       Chain identifier.
1638         * 23 - 26        Integer         resSeq        Residue sequence number.
1639         * 27             AChar           iCode         Code for insertion of residues.
1640         * 31 - 38        Real(8.3)       x             Orthogonal coordinates for X in Angstroms.
1641         * 39 - 46        Real(8.3)       y             Orthogonal coordinates for Y in Angstroms.
1642         * 47 - 54        Real(8.3)       z             Orthogonal coordinates for Z in Angstroms.
1643         * 55 - 60        Real(6.2)       occupancy     Occupancy.
1644         * 61 - 66        Real(6.2)       tempFactor    Temperature factor.
1645         * 73 - 76        LString(4)      segID         Segment identifier, left-justified.
1646         * 77 - 78        LString(2)      element       Element symbol, right-justified.
1647         * 79 - 80        LString(2)      charge        Charge on the atom.
1648         * </pre>
1649         */
1650        private void  pdb_ATOM_Handler(String line)     {
1651
1652                if ( params.isHeaderOnly())
1653                        return;
1654
1655                // let's first get the chain name which will serve to identify if we are starting a new molecule
1656                String chainName      = line.substring(21,22);
1657
1658                if (chainName.equals(" ")) {
1659                        blankChainIdsPresent = true;
1660                }
1661
1662                if (currentChain!=null && !currentChain.getName().equals(chainName)) {
1663                        // new chain name: another molecule coming
1664                        startOfMolecule = true;
1665                }
1666
1667                if (startOfMolecule) {
1668                        // we add last chain if there was one
1669                        if (currentChain!=null) {
1670                                currentModel.add(currentChain);
1671                                // let's not forget adding the last group to the finishing chain
1672                                if (currentGroup!=null) {
1673                                        currentChain.addGroup(currentGroup);
1674                                }
1675                        }
1676                        // we initialise the new molecule to come
1677                        currentChain = new ChainImpl();
1678                        // note that the chainId (asym id) is set properly later in assignAsymIds
1679                        currentChain.setId(chainName);
1680                        currentChain.setName(chainName);
1681
1682                }
1683
1684                if (startOfModel) {
1685                        // we add last model if there was one
1686                        if (currentModel!=null) {
1687                                allModels.add(currentModel);
1688                        }
1689                        // we initialise the model to come
1690                        currentModel = new ArrayList<>();
1691                }
1692
1693
1694                // let's get the residue number and see if we need to start a new group
1695
1696                String groupCode3     = line.substring(17,20).trim();
1697                String resNum  = line.substring(22,26).trim();
1698                Character iCode = line.substring(26,27).charAt(0);
1699                if ( iCode == ' ')
1700                        iCode = null;
1701                ResidueNumber residueNumber = new ResidueNumber(chainName, Integer.valueOf(resNum), iCode);
1702
1703                //recordName      groupCode3
1704                //|                |    resNum
1705                //|                |    |   iCode
1706                //|     |          | |  |   ||
1707                //ATOM      1  N   ASP A  15     110.964  24.941  59.191  1.00 83.44           N
1708                //ATOM   1964  N   ARG H 221A      5.963 -16.715  27.669  1.00 28.59           N
1709
1710                Character aminoCode1 = StructureTools.get1LetterCode(groupCode3);
1711
1712                String recordName     = line.substring (0, 6).trim ();
1713
1714                boolean isHetAtomInFile = false;
1715
1716                if (recordName.equals("HETATM") ){
1717                        // HETATOM RECORDS are treated slightly differently
1718                        // some modified amino acids that we want to treat as amino acids
1719                        // can be found as HETATOM records
1720                        if ( aminoCode1 != null && aminoCode1.equals(StructureTools.UNKNOWN_GROUP_LABEL))
1721                                        aminoCode1 = null;
1722
1723                        isHetAtomInFile = true;
1724                }
1725
1726                if ( startOfMolecule) {
1727
1728                        currentGroup = getNewGroup(recordName, aminoCode1, groupCode3);
1729
1730                        currentGroup.setPDBName(groupCode3);
1731                        currentGroup.setResidueNumber(residueNumber);
1732                        currentGroup.setHetAtomInFile(isHetAtomInFile);
1733
1734                }
1735
1736                // resetting states
1737                startOfModel = false;
1738                startOfMolecule = false;
1739
1740
1741                Character altLoc   = line.substring (16, 17).charAt(0);
1742                Group altGroup = null;
1743
1744
1745                // check if residue number is the same ...
1746                if ( ! residueNumber.equals(currentGroup.getResidueNumber())) {
1747
1748                        currentChain.addGroup(currentGroup);
1749                        currentGroup.trimToSize();
1750
1751                        currentGroup = getNewGroup(recordName, aminoCode1, groupCode3);
1752
1753                        currentGroup.setPDBName(groupCode3);
1754                        currentGroup.setResidueNumber(residueNumber);
1755                        currentGroup.setHetAtomInFile(isHetAtomInFile);
1756
1757                } else {
1758                        // same residueNumber, but altLocs...
1759
1760                        // test altLoc
1761                        if ( ! altLoc.equals(' ')) {
1762                                logger.debug("found altLoc! " + currentGroup + " " + altGroup);
1763                                altGroup = getCorrectAltLocGroup( altLoc,recordName,aminoCode1,groupCode3);
1764                                if ( altGroup.getChain() == null) {
1765                                        // need to set current chain
1766                                        altGroup.setChain(currentChain);
1767                                }
1768
1769                        }
1770                }
1771
1772                atomCount++;
1773
1774                if ( atomCount == atomCAThreshold ) {
1775                        // throw away the SEQRES lines - too much to deal with...
1776                        logger.warn("more than " + atomCAThreshold + " atoms in this structure, ignoring the SEQRES lines");
1777                        seqResChains.clear();
1778
1779                        switchCAOnly();
1780
1781                }
1782
1783
1784
1785                if ( atomCount == loadMaxAtoms){
1786                        logger.warn("File has more atoms than max specified in parsing parameters ({}). Ignoring atoms after line: {}", loadMaxAtoms, line);
1787                        return;
1788                }
1789                if ( atomCount > loadMaxAtoms){
1790                        return;
1791                }
1792
1793
1794                //          1         2         3         4         5         6
1795                //012345678901234567890123456789012345678901234567890123456789
1796                //ATOM      1  N   MET     1      20.154  29.699   5.276   1.0
1797                //ATOM    112  CA  ASP   112      41.017  33.527  28.371  1.00  0.00
1798                //ATOM     53  CA  MET     7      23.772  33.989 -21.600  1.00  0.00           C
1799                //ATOM    112  CA  ASP   112      37.613  26.621  33.571     0     0
1800
1801
1802                String fullname = line.substring (12, 16);
1803
1804                // check for CA only if requested
1805                if ( parseCAonly ){
1806                        // yes , user wants to get CA only
1807                        // only parse CA atoms...
1808                        if (! fullname.equals(" CA ")){
1809                                //System.out.println("ignoring " + line);
1810                                atomCount--;
1811                                return;
1812                        }
1813                }
1814
1815                if ( params.getAcceptedAtomNames() != null) {
1816
1817                        boolean found = false;
1818                        for (String ok : params.getAcceptedAtomNames()){
1819                                //System.out.println(ok + "< >" + fullname +"<");
1820
1821                                if ( ok.equals(fullname.trim())) {
1822                                        found = true;
1823                                        break;
1824                                }
1825                        }
1826                        if ( ! found) {
1827                                atomCount--;
1828                                return;
1829                        }
1830                }
1831                // create new atom
1832
1833                int pdbnumber = Integer.parseInt (line.substring (6, 11).trim ());
1834                AtomImpl atom = new AtomImpl() ;
1835                atom.setPDBserial(pdbnumber) ;
1836
1837                atom.setAltLoc(altLoc);
1838                atom.setName(fullname.trim());
1839
1840                double x = Double.parseDouble (line.substring (30, 38).trim());
1841                double y = Double.parseDouble (line.substring (38, 46).trim());
1842                double z = Double.parseDouble (line.substring (46, 54).trim());
1843
1844                double[] coords = new double[3];
1845                coords[0] = x ;
1846                coords[1] = y ;
1847                coords[2] = z ;
1848                atom.setCoords(coords);
1849
1850                float occu  = 1.0f;
1851                if ( line.length() > 59 ) {
1852                        try {
1853                                // occu and tempf are sometimes not used :-/
1854                                occu = Float.parseFloat (line.substring (54, 60).trim());
1855                        }  catch (NumberFormatException e){}
1856                }
1857
1858                float tempf = 0.0f;
1859                if ( line.length() > 65) {
1860                        try {
1861                                tempf = Float.parseFloat (line.substring (60, 66).trim());
1862                        }  catch (NumberFormatException e){}
1863                }
1864
1865                atom.setOccupancy(  occu  );
1866                atom.setTempFactor( tempf );
1867
1868
1869
1870
1871                // Parse element from the element field. If this field is
1872                // missing (i.e. misformatted PDB file), then parse the
1873                // element from the chemical component.
1874                Element element = Element.R;
1875                boolean guessElement = true;
1876                if ( line.length() > 77 ) {
1877                        // parse element from element field
1878                        String elementSymbol = line.substring(76, 78).trim();
1879                        if (elementSymbol.isEmpty()) {
1880                                logger.info("Element column was empty for atom {} {}. Assigning atom element "
1881                                                + "from Chemical Component Dictionary information", fullname.trim(), pdbnumber);
1882                        } else {
1883
1884                        try {
1885                                        element = Element.valueOfIgnoreCase(elementSymbol);
1886                                        guessElement = false;
1887                                }  catch (IllegalArgumentException e){
1888                                        logger.info("Element {} of atom {} {} was not recognised. Assigning atom element "
1889                                                        + "from Chemical Component Dictionary information", elementSymbol,
1890                                                        fullname.trim(), pdbnumber);
1891                                }
1892                        }
1893                } else {
1894                        logger.info("Missformatted PDB file: element column of atom {} {} is not present. "
1895                                        + "Assigning atom element from Chemical Component Dictionary information",
1896                                        fullname.trim(), pdbnumber);
1897                }
1898                if (guessElement) {
1899                        String elementSymbol = null;
1900                        if (currentGroup.getChemComp() != null) {
1901                                for (ChemCompAtom a : currentGroup.getChemComp().getAtoms()) {
1902                                        if (a.getAtomId().equals(fullname.trim())) {
1903                                                elementSymbol = a.getTypeSymbol();
1904                                                break;
1905                                        }
1906                                }
1907                                if (elementSymbol == null) {
1908                                        logger.info("Atom name {} was not found in the Chemical Component Dictionary information of {}. "
1909                                                        + "Assigning generic element R to it", fullname.trim(), currentGroup.getPDBName());
1910                        } else {
1911                        try {
1912                                element = Element.valueOfIgnoreCase(elementSymbol);
1913                                        } catch (IllegalArgumentException e) {
1914                                                // this can still happen for cases like UNK
1915                                                logger.info("Element symbol {} found in chemical component dictionary for Atom {} {} could not be recognised as a known element. "
1916                                                                + "Assigning generic element R to it", elementSymbol, fullname.trim(), pdbnumber);
1917                                        }
1918                                }
1919                        } else {
1920                                logger.warn("Chemical Component Dictionary information was not found for Atom name {}. "
1921                                                + "Assigning generic element R to it", fullname.trim());
1922                        }
1923
1924                }
1925                atom.setElement(element);
1926
1927
1928                //see if chain_id is one of the previous chains ...
1929                if ( altGroup != null) {
1930                        altGroup.addAtom(atom);
1931                        altGroup = null;
1932                }
1933                else {
1934                        currentGroup.addAtom(atom);
1935                }
1936
1937
1938                // make sure that main group has all atoms
1939                // GitHub issue: #76
1940                if ( ! currentGroup.hasAtom(atom.getName())) {
1941                        currentGroup.addAtom(atom);
1942                }
1943
1944
1945
1946        }
1947
1948
1949        private Group getCorrectAltLocGroup( Character altLoc,
1950                        String recordName, Character aminoCode1, String groupCode3) {
1951
1952                // see if we know this altLoc already;
1953                List<Atom> atoms = currentGroup.getAtoms();
1954                if ( atoms.size() > 0) {
1955                        Atom a1 = atoms.get(0);
1956                        // we are just adding atoms to the current group
1957                        // probably there is a second group following later...
1958                        if (a1.getAltLoc().equals(altLoc)) {
1959
1960                                return currentGroup;
1961                        }
1962                }
1963
1964                List<Group> altLocs = currentGroup.getAltLocs();
1965                for ( Group altLocG : altLocs ){
1966                        atoms = altLocG.getAtoms();
1967                        if ( atoms.size() > 0) {
1968                                for ( Atom a1 : atoms) {
1969                                        if (a1.getAltLoc().equals( altLoc)) {
1970
1971                                                return altLocG;
1972                                        }
1973                                }
1974                        }
1975                }
1976
1977                // no matching altLoc group found.
1978                // build it up.
1979
1980                if ( groupCode3.equals(currentGroup.getPDBName())) {
1981                        if ( currentGroup.getAtoms().size() == 0) {
1982                                //System.out.println("current group is empty " + current_group + " " + altLoc);
1983                                return currentGroup;
1984                        }
1985                        //System.out.println("cloning current group " + current_group + " " + current_group.getAtoms().get(0).getAltLoc() + " altLoc " + altLoc);
1986                        Group altLocG = (Group) currentGroup.clone();
1987                        // drop atoms from cloned group...
1988                        // https://redmine.open-bio.org/issues/3307
1989                        altLocG.setAtoms(new ArrayList<Atom>());
1990                        altLocG.getAltLocs().clear();
1991                        currentGroup.addAltLoc(altLocG);
1992                        return altLocG;
1993                }
1994
1995                //      System.out.println("new  group " + recordName + " " + aminoCode1 + " " +groupCode3);
1996                Group altLocG = getNewGroup(recordName,aminoCode1,groupCode3);
1997
1998
1999                altLocG.setPDBName(groupCode3);
2000
2001                altLocG.setResidueNumber(currentGroup.getResidueNumber());
2002                currentGroup.addAltLoc(altLocG);
2003                return altLocG;
2004        }
2005
2006        private void switchCAOnly(){
2007                parseCAonly = true;
2008
2009
2010                currentModel = CAConverter.getRepresentativeAtomsOnly(currentModel);
2011
2012                for ( int i =0; i< structure.nrModels() ; i++){
2013                        //  iterate over all known models ...
2014                        List<Chain> model = structure.getModel(i);
2015                        model = CAConverter.getRepresentativeAtomsOnly(model);
2016                        structure.setModel(i,model);
2017                }
2018
2019                currentChain = CAConverter.getRepresentativeAtomsOnly(currentChain);
2020
2021        }
2022
2023
2024        /** safes repeating a few lines ... */
2025        private Integer conect_helper (String line,int start,int end) {
2026                if (line.length() < end) return null;
2027
2028                String sbond = line.substring(start,end).trim();
2029                int bond  = -1 ;
2030                Integer b = null ;
2031
2032                if ( ! sbond.equals("")) {
2033                        bond = Integer.parseInt(sbond);
2034                        b = new Integer(bond);
2035                }
2036
2037                return b ;
2038        }
2039
2040        /**
2041         * Handler for CONECT Record Format
2042        <pre>
2043         COLUMNS         DATA TYPE        FIELD           DEFINITION
2044         ---------------------------------------------------------------------------------
2045         1 -  6         Record name      "CONECT"
2046         7 - 11         Integer          serial          Atom serial number
2047         12 - 16         Integer          serial          Serial number of bonded atom
2048         17 - 21         Integer          serial          Serial number of bonded atom
2049         22 - 26         Integer          serial          Serial number of bonded atom
2050         27 - 31         Integer          serial          Serial number of bonded atom
2051         32 - 36         Integer          serial          Serial number of hydrogen bonded
2052         atom
2053         37 - 41         Integer          serial          Serial number of hydrogen bonded
2054         atom
2055         42 - 46         Integer          serial          Serial number of salt bridged
2056         atom
2057         47 - 51         Integer          serial          Serial number of hydrogen bonded
2058         atom
2059         52 - 56         Integer          serial          Serial number of hydrogen bonded
2060         atom
2061         57 - 61         Integer          serial          Serial number of salt bridged
2062         atom
2063         </pre>
2064         */
2065        private void pdb_CONECT_Handler(String line) {
2066
2067                if ( atomOverflow) {
2068                        return ;
2069                }
2070                if (params.isHeaderOnly()) {
2071                        return;
2072                }
2073
2074                // this try .. catch is e.g. to catch 1gte which has wrongly formatted lines...
2075                try {
2076                        int atomserial = Integer.parseInt (line.substring(6 ,11).trim());
2077                        Integer bond1      = conect_helper(line,11,16);
2078                        Integer bond2      = conect_helper(line,16,21);
2079                        Integer bond3      = conect_helper(line,21,26);
2080                        Integer bond4      = conect_helper(line,26,31);
2081                        Integer hyd1       = conect_helper(line,31,36);
2082                        Integer hyd2       = conect_helper(line,36,41);
2083                        Integer salt1      = conect_helper(line,41,46);
2084                        Integer hyd3       = conect_helper(line,46,51);
2085                        Integer hyd4       = conect_helper(line,51,56);
2086                        Integer salt2      = conect_helper(line,56,61);
2087
2088                        //System.out.println(atomserial+ " "+ bond1 +" "+bond2+ " " +bond3+" "+bond4+" "+
2089                        //                 hyd1+" "+hyd2 +" "+salt1+" "+hyd3+" "+hyd4+" "+salt2);
2090                        HashMap<String, Integer> cons = new HashMap<String, Integer>();
2091                        cons.put("atomserial",new Integer(atomserial));
2092
2093                        if ( bond1 != null) cons.put("bond1",bond1);
2094                        if ( bond2 != null) cons.put("bond2",bond2);
2095                        if ( bond3 != null) cons.put("bond3",bond3);
2096                        if ( bond4 != null) cons.put("bond4",bond4);
2097                        if ( hyd1  != null) cons.put("hydrogen1",hyd1);
2098                        if ( hyd2  != null) cons.put("hydrogen2",hyd2);
2099                        if ( salt1 != null) cons.put("salt1",salt1);
2100                        if ( hyd3  != null) cons.put("hydrogen3",hyd3);
2101                        if ( hyd4  != null) cons.put("hydrogen4",hyd4);
2102                        if ( salt2 != null) cons.put("salt2",salt2);
2103
2104                        connects.add(cons);
2105                } catch (NumberFormatException e){
2106                        logger.info("could not parse CONECT line correctly ("+e.getMessage()+"), at line : " + line);
2107                        return;
2108                }
2109        }
2110
2111        /**
2112         * Handler for MODEL Record Format
2113         * <pre>
2114         * COLUMNS       DATA TYPE      FIELD         DEFINITION
2115         * ----------------------------------------------------------------------
2116         * 1 -  6       Record name    "MODEL "
2117         * 11 - 14       Integer        serial        Model serial number.
2118         * </pre>
2119         */
2120        private void pdb_MODEL_Handler(String line) {
2121
2122                if (params.isHeaderOnly()) return;
2123
2124                // new model: we start a new molecule
2125                startOfMolecule = true;
2126                startOfModel = true;
2127
2128        }
2129
2130        /**
2131         * Handler for TER record. The record is used in deposited PDB files and many others,
2132         * but it's often forgotten by some softwares. In any case it helps identifying the
2133         * start of ligand molecules so we use it for that.
2134         */
2135        private void pdb_TER_Handler() {
2136                startOfMolecule = true;
2137        }
2138
2139
2140        /**
2141         * DBREF handler
2142         * <pre>
2143         * COLUMNS       DATA TYPE          FIELD          DEFINITION
2144         * ----------------------------------------------------------------
2145         *  1 - 6        Record name        "DBREF "
2146         *  8 - 11       IDcode             idCode         ID code of this entry.
2147         * 13            Character          chainID        Chain identifier.
2148         * 15 - 18       Integer            seqBegin       Initial sequence number
2149         *                                                 of the PDB sequence segment.
2150         * 19            AChar              insertBegin    Initial insertion code
2151         *                                                 of the PDB sequence segment.
2152         * 21 - 24       Integer            seqEnd         Ending sequence number
2153         *                                                 of the PDB sequence segment.
2154         * 25            AChar              insertEnd      Ending insertion code
2155         *                                                 of the PDB sequence segment.
2156         * 27 - 32       LString            database       Sequence database name.
2157         * 34 - 41       LString            dbAccession    Sequence database accession code.
2158         * 43 - 54      LString            dbIdCode        Sequence database
2159         *                                                 identification code.
2160         * 56 - 60      Integer            dbseqBegin      Initial sequence number of the
2161         *                                                 database seqment.
2162         * 61           AChar              idbnsBeg        Insertion code of initial residue
2163         *                                                 of the segment, if PDB is the
2164         *                                                 reference.
2165         * 63 - 67      Integer            dbseqEnd        Ending sequence number of the
2166         *                                                 database segment.
2167         * 68           AChar              dbinsEnd        Insertion code of the ending
2168         *                                                 residue of the segment, if PDB is
2169         *                                                 the reference.
2170         * </pre>
2171         */
2172        private void pdb_DBREF_Handler(String line){
2173
2174                logger.debug("Parsing DBREF " + line);
2175
2176                DBRef dbref = new DBRef();
2177                String idCode      = line.substring(7,11);
2178                String chainName     = line.substring(12,13);
2179                String seqBegin    = line.substring(14,18);
2180                String insertBegin = line.substring(18,19);
2181                String seqEnd      = line.substring(20,24);
2182                String insertEnd   = line.substring(24,25);
2183                String database    = line.substring(26,32);
2184                String dbAccession = line.substring(33,41);
2185                String dbIdCode    = line.substring(42,54);
2186                String dbseqBegin  = line.substring(55,60);
2187                String idbnsBeg    = line.substring(60,61);
2188                String dbseqEnd    = line.substring(62,67);
2189                // Support implicit space character at end
2190                String dbinsEnd;
2191                if(line.length() >= 68)
2192                        dbinsEnd       = line.substring(67,68);
2193                else
2194                        dbinsEnd       = " ";
2195
2196                dbref.setIdCode(idCode);
2197                dbref.setChainName(chainName);
2198                dbref.setSeqBegin(intFromString(seqBegin));
2199                dbref.setInsertBegin(insertBegin.charAt(0));
2200                dbref.setSeqEnd(intFromString(seqEnd));
2201                dbref.setInsertEnd(insertEnd.charAt(0));
2202                dbref.setDatabase(database.trim());
2203                dbref.setDbAccession(dbAccession.trim());
2204                dbref.setDbIdCode(dbIdCode.trim());
2205                dbref.setDbSeqBegin(intFromString(dbseqBegin));
2206                dbref.setIdbnsBegin(idbnsBeg.charAt(0));
2207                dbref.setDbSeqEnd(intFromString(dbseqEnd));
2208                dbref.setIdbnsEnd(dbinsEnd.charAt(0));
2209
2210                //System.out.println(dbref.toPDB());
2211                dbrefs.add(dbref);
2212        }
2213
2214
2215        /**
2216         * Process the disulfide bond info provided by an SSBOND record
2217         *
2218         * <pre>
2219        COLUMNS        DATA TYPE       FIELD         DEFINITION
2220        -------------------------------------------------------------------
2221         1 -  6        Record name     "SSBOND"
2222         8 - 10        Integer         serNum       Serial number.
2223        12 - 14        LString(3)      "CYS"        Residue name.
2224        16             Character       chainID1     Chain identifier.
2225        18 - 21        Integer         seqNum1      Residue sequence number.
2226        22             AChar           icode1       Insertion code.
2227        26 - 28        LString(3)      "CYS"        Residue name.
2228        30             Character       chainID2     Chain identifier.
2229        32 - 35        Integer         seqNum2      Residue sequence number.
2230        36             AChar           icode2       Insertion code.
2231        60 - 65        SymOP           sym1         Symmetry oper for 1st resid
2232        67 - 72        SymOP           sym2         Symmetry oper for 2nd resid
2233         * </pre>
2234         */
2235        private void pdb_SSBOND_Handler(String line){
2236
2237                if (params.isHeaderOnly()) return;
2238
2239                if (line.length()<36) {
2240                        logger.info("SSBOND line has length under 36. Ignoring it.");
2241                        return;
2242                }
2243
2244                String chain1      = line.substring(15,16);
2245                String seqNum1     = line.substring(17,21).trim();
2246                String icode1      = line.substring(21,22);
2247                String chain2      = line.substring(29,30);
2248                String seqNum2     = line.substring(31,35).trim();
2249                String icode2      = line.substring(35,36);
2250
2251                if (line.length()>=72) {
2252                        String symop1 = line.substring(59, 65).trim();
2253                        String symop2 = line.substring(66, 72).trim();
2254
2255                        // until we implement proper treatment of symmetry in biojava #220, we can't deal with sym-related parteners properly, skipping them
2256                        if (!symop1.equals("") && !symop2.equals("") && // in case the field is missing
2257                                        (!symop1.equals("1555") || !symop2.equals("1555")) ) {
2258                                logger.info("Skipping ss bond between groups {} and {} belonging to different symmetry partners, because it is not supported yet", seqNum1+icode1, seqNum2+icode2);
2259                                return;
2260                        }
2261                }
2262
2263                if (icode1.equals(" "))
2264                        icode1 = "";
2265                if (icode2.equals(" "))
2266                        icode2 = "";
2267
2268                SSBondImpl ssbond = new SSBondImpl();
2269
2270                ssbond.setChainID1(chain1);
2271                ssbond.setResnum1(seqNum1);
2272                ssbond.setChainID2(chain2);
2273                ssbond.setResnum2(seqNum2);
2274                ssbond.setInsCode1(icode1);
2275                ssbond.setInsCode2(icode2);
2276                ssbonds.add(ssbond);
2277        }
2278
2279
2280        /**
2281         * Takes care of LINK records. These take the format of:
2282         *
2283         * <pre>
2284         * COLUMNS        DATA TYPE       FIELD       DEFINITION
2285         * --------------------------------------------------------------------------------
2286         *  1 -  6        Record name     "LINK  "
2287         * 13 - 16        Atom            name1       Atom name.
2288         * 17             Character       altLoc1     Alternate location indicator.
2289         * 18 - 20        Residue name    resName1    Residue name.
2290         * 22             Character       chainID1    Chain identifier.
2291         * 23 - 26        Integer         resSeq1     Residue sequence number.
2292         * 27             AChar           iCode1      Insertion code.
2293         * 43 - 46        Atom            name2       Atom name.
2294         * 47             Character       altLoc2     Alternate location indicator.
2295         * 48 - 50        Residue name    resName2    Residue name.
2296         * 52             Character       chainID2    Chain identifier.
2297         * 53 - 56        Integer         resSeq2     Residue sequence number.
2298         * 57             AChar           iCode2      Insertion code.
2299         * 60 - 65        SymOP           sym1        Symmetry operator for 1st atom.
2300         * 67 - 72        SymOP           sym2        Symmetry operator for 2nd atom.
2301         * </pre>
2302         *
2303         * (From http://www.wwpdb.org/documentation/format32/sect6.html#LINK)
2304         *
2305         * @param line the LINK record line to parse.
2306         */
2307        private void pdb_LINK_Handler(String line) {
2308
2309                if (params.isHeaderOnly()) return;
2310
2311                // Check for the minimal set of fields.
2312                if (line.length()<56) {
2313                        logger.info("LINK line has length under 56. Ignoring it.");
2314                        return;
2315                }
2316
2317                int len = line.length();
2318
2319                String name1 = line.substring(12, 16).trim();
2320                String altLoc1 = line.substring(16, 17).trim();
2321                String resName1 = line.substring(17, 20).trim();
2322                String chainID1 = line.substring(21, 22).trim();
2323                String resSeq1 = line.substring(22, 26).trim();
2324                String iCode1 = line.substring(26, 27).trim();
2325
2326                String name2 = line.substring(42, 46).trim();
2327                String altLoc2 = line.substring(46, 47).trim();
2328                String resName2 = line.substring(47, 50).trim();
2329                String chainID2 = line.substring(51, 52).trim();
2330                String resSeq2 = line.substring(52, 56).trim();
2331                String iCode2 = null;  // Might get trimmed if blank.
2332                if (len > 56) iCode2 = line.substring(56, 57).trim();
2333
2334                String sym1 = null;
2335                if (len > 64) sym1 = line.substring(59, 65).trim();
2336                String sym2 = null;
2337                if (len > 71) sym2 = line.substring(66, 72).trim();
2338
2339                linkRecords.add(new LinkRecord(
2340                                name1, altLoc1, resName1, chainID1, resSeq1, iCode1,
2341                                name2, altLoc2, resName2, chainID2, resSeq2, iCode2,
2342                                sym1, sym2));
2343        }
2344
2345        /**
2346         * Handler for the SITE records. <br>
2347         *
2348         * <pre>
2349         *
2350         * COLUMNS      DATA TYPE               FIELD           DEFINITION
2351         * ---------------------------------------------------------------------------------
2352         * 1 - 6        Record name     "SITE "
2353         * 8 - 10       Integer                 seqNum          Sequence number.
2354         * 12 - 14      LString(3)              siteID          Site name.
2355         * 16 - 17      Integer                 numRes          Number of residues that compose the siteResidues.
2356         * 19 - 21      Residue name    resName1        Residue name for first residue that
2357         *                                                                              creates the siteResidues.
2358         * 23           Character               chainID1        Chain identifier for first residue of siteResidues.
2359         * 24 - 27      Integer                 seq1            Residue sequence number for first residue
2360         *                                                                              of the siteResidues.
2361         * 28           AChar                   iCode1          Insertion code for first residue of the siteResidues.
2362         *
2363         * example:
2364         *          1         2         3         4         5         6         7         8
2365         * 12345678901234567890123456789012345678901234567890123456789012345678901234567890
2366         * SITE     1 AC1  3 HIS A  94 HIS A   96  HIS A 119
2367         * SITE     1 AC2  5 ASN A  62 GLY A   63  HIS A  64  HOH A 328
2368         * SITE     2 AC2  5 HOH A 634
2369         * SITE     1 AC3  5 GLN A 136 GLN A  137  PRO A 138  GLU A 205
2370         * SITE     2 AC3  5 CYS A 206
2371         * SITE     1 AC4 11 HIS A  64 HIS A   94  HIS A  96  HIS A 119
2372         * SITE     2 AC4 11 LEU A 198 THR A  199  THR A 200  TRP A 209
2373         * SITE     3 AC4 11 HOH A 572 HOH A  582  HOH A 635
2374         * </pre>
2375         * @param line the SITE line record being currently read
2376         * @author Amr ALHOSSARY
2377         * @author Jules Jacobsen
2378         */
2379        private void pdb_SITE_Handler(String line){
2380
2381                if (params.isHeaderOnly()) return;
2382
2383                //  make a map of: SiteId to List<ResidueNumber>
2384
2385                logger.debug("Site Line:"+line);
2386
2387
2388                String siteID = line.substring(11, 14);
2389                //fetch the siteResidues from the map
2390                List<ResidueNumber> siteResidues = siteToResidueMap.get(siteID);
2391
2392                //if the siteResidues doesn't yet exist, make a new one.
2393                if (siteResidues == null || ! siteToResidueMap.containsKey(siteID.trim())){
2394                        siteResidues = new ArrayList<ResidueNumber>();
2395                        siteToResidueMap.put(siteID.trim(), siteResidues);
2396
2397                        logger.debug(String.format("New Site made: %s %s", siteID,  siteResidues));
2398                        logger.debug("Now made " + siteMap.size() + " sites");
2399
2400                }
2401
2402                logger.debug(String.format("SiteId: %s", siteID));
2403
2404
2405                //line = 'SITE     1 AC1  6 ARG H 221A LYS H 224  HOH H 403  HOH H 460'
2406                //line.substring(18) = 'ARG H 221A LYS H 224  HOH H 403  HOH H 460'
2407                line = line.substring(18);
2408                String groupString = null;
2409                //groupString = 'ARG H 221A'
2410                //keep iterating through chunks of 10 characters - these are the groups in the siteResidues
2411                while (!(groupString = line.substring(0, 10)).equals("          ")) {
2412                        //groupstring: 'ARG H 221A'
2413
2414                        logger.debug("groupString: '" + groupString + "'");
2415
2416                        //set the residue name
2417                        //residueName = 'ARG'
2418                        String residueName = groupString.substring(0, 3);
2419                        Character aminoCode1 = StructureTools.get1LetterCode(residueName);
2420                        if (aminoCode1 != null) {
2421                                if (aminoCode1.equals(StructureTools.UNKNOWN_GROUP_LABEL)) {
2422                                        aminoCode1 = null;
2423                                }
2424                        }
2425
2426                        //this is already in the right format, so no need to fiddle with it...
2427                        //pdbCode = 'H 221A'
2428                        //                    String pdbCode = groupString.substring(4, 10).trim();
2429                        String chainId = groupString.substring(4, 5);
2430                        Integer resNum = Integer.valueOf(groupString.substring(5, 9).trim());
2431                        Character insCode = groupString.substring(9, 10).charAt(0);
2432                        //set insCode to null as a measure to prevent storing thousands of empty Strings
2433                        //- the empty value is returned using Group.getInsCode()
2434                        //                    if (insCode.equals(" ")) {
2435                        //                        insCode = null;
2436                        //                    }
2437
2438                        logger.debug(String.format("Site: %s: 'resName:%s resNum:%s insCode:%s'", siteID, residueName, resNum, insCode));
2439
2440                        //make a new resNum with the data - this will be linked up with a site later
2441                        ResidueNumber residueNumber = new ResidueNumber();
2442
2443
2444                        logger.debug("pdbCode: '" + resNum + insCode + "'");
2445
2446                        residueNumber.setChainName(chainId);
2447                        residueNumber.setSeqNum(resNum);
2448                        residueNumber.setInsCode(insCode);
2449                        //add the resNum to the groups
2450                        siteResidues.add(residueNumber);
2451
2452                        logger.debug("Adding residueNumber " + residueNumber + " to site " + siteID);
2453
2454                        line = line.substring(11);
2455                }
2456
2457                logger.debug("Current SiteMap (contains "+ siteToResidueMap.keySet().size() + " sites):");
2458                for (String key : siteToResidueMap.keySet()) {
2459                        logger.debug(key + " : " + siteToResidueMap.get(key));
2460                }
2461
2462        }
2463
2464        //Site variable related to parsing the REMARK 800 records.
2465        Site site;
2466
2467        private String[] keywords;
2468        private void pdb_REMARK_800_Handler(String line){
2469
2470                if (params.isHeaderOnly()) return;
2471
2472                // 'REMARK 800 SITE_IDENTIFIER: CAT                                                 '
2473                line = line.substring(11);
2474                String[] fields = line.split(": ");
2475
2476                if (fields.length == 2) {
2477                        if (fields[0].equals("SITE_IDENTIFIER")) {
2478                                //                    remark800Counter++;
2479                                String siteID = fields[1].trim();
2480
2481                                logger.debug("siteID: '" + siteID +"'");
2482
2483                                //fetch the siteResidues from the map
2484                                site = siteMap.get(siteID);
2485
2486                                //if the siteResidues doesn't yet exist, make a new one.
2487                                if (site == null || !siteID.equals(site.getSiteID())) {
2488                                        site = new Site(siteID, new ArrayList<Group>());
2489                                        siteMap.put(site.getSiteID(), site);
2490
2491                                        logger.debug("New Site made: " + site);
2492                                        logger.debug("Now made " + siteMap.size() + " sites");
2493
2494                                }
2495                        }
2496                        if (fields[0].equals("EVIDENCE_CODE")) {
2497                                //                    remark800Counter++;
2498                                String evCode = fields[1].trim();
2499
2500                                logger.debug("evCode: '" + evCode +"'");
2501
2502                                //fetch the siteResidues from the map
2503                                site.setEvCode(evCode);
2504                        }
2505                        if (fields[0].equals("SITE_DESCRIPTION")) {
2506                                //                    remark800Counter++;
2507                                String desc = fields[1].trim();
2508
2509                                logger.debug("desc: '" + desc +"'");
2510
2511                                //fetch the siteResidues from the map
2512                                site.setDescription(desc);
2513
2514                                logger.debug("Finished making REMARK 800 for site " + site.getSiteID());
2515                                logger.debug(site.remark800toPDB());
2516
2517                        }
2518                }
2519        }
2520
2521        private int intFromString(String intString){
2522                int val = Integer.MIN_VALUE;
2523                try {
2524                        val = Integer.parseInt(intString.trim());
2525                } catch (NumberFormatException ex){
2526                        logger.info("Could not parse a number: " + ex.getMessage());
2527                }
2528                return val;
2529        }
2530
2531
2532
2533        /**
2534         * Finds in the given list of chains the first one that has as name the given chainID.
2535         * If no such Chain can be found it returns null.
2536         */
2537        private static Chain isKnownChain(String chainID, List<Chain> chains){
2538
2539                for (int i = 0; i< chains.size();i++){
2540                        Chain testchain =  chains.get(i);
2541                        if (chainID.equals(testchain.getName())) {
2542                                return testchain;
2543                        }
2544                }
2545
2546                return null;
2547        }
2548
2549
2550
2551        private BufferedReader getBufferedReader(InputStream inStream)
2552                        throws IOException {
2553
2554                BufferedReader buf ;
2555                if (inStream == null) {
2556                        throw new IOException ("input stream is null!");
2557                }
2558
2559                buf = new BufferedReader (new InputStreamReader (inStream));
2560                return buf ;
2561
2562        }
2563
2564
2565
2566        /**
2567         * Parse a PDB file and return a datastructure implementing
2568         * PDBStructure interface.
2569         *
2570         * @param inStream  an InputStream object
2571         * @return a Structure object
2572         * @throws IOException
2573         */
2574        public Structure parsePDBFile(InputStream inStream)
2575                        throws IOException
2576        {
2577
2578                BufferedReader buf = getBufferedReader(inStream);
2579
2580                return parsePDBFile(buf);
2581
2582        }
2583
2584        /**
2585         * Parse a PDB file and return a datastructure implementing
2586         * PDBStructure interface.
2587         *
2588         * @param buf  a BufferedReader object
2589         * @return the Structure object
2590         * @throws IOException ...
2591         */
2592        public  Structure parsePDBFile(BufferedReader buf)
2593                        throws IOException
2594                        {
2595                // set the correct max values for parsing...
2596                loadMaxAtoms = params.getMaxAtoms();
2597                atomCAThreshold = params.getAtomCaThreshold();
2598
2599
2600                // (re)set structure
2601
2602                allModels = new ArrayList<>();
2603                structure     = new StructureImpl() ;
2604                currentModel  = null;
2605                currentChain  = null;
2606                currentGroup  = null;
2607                // we initialise to true since at the beginning of the file we are always starting a new molecule
2608                startOfMolecule = true;
2609                startOfModel = true;
2610
2611                seqResChains  = new ArrayList<Chain>();
2612                siteMap = new LinkedHashMap<String, Site>();
2613                pdbHeader     = new PDBHeader();
2614                connects      = new ArrayList<Map<String,Integer>>();
2615                previousContinuationField = "";
2616                continuationField = "";
2617                continuationString = "";
2618                current_compound = null;
2619                sourceLines.clear();
2620                compndLines.clear();
2621                keywordsLines.clear();
2622                isLastCompndLine = false;
2623                isLastSourceLine = false;
2624                prevMolId = -1;
2625                entities.clear();
2626                helixList.clear();
2627                strandList.clear();
2628                turnList.clear();
2629                lengthCheck = -1;
2630                atomCount = 0;
2631                atomOverflow = false;
2632                linkRecords = new ArrayList<LinkRecord>();
2633                siteToResidueMap.clear();
2634
2635                blankChainIdsPresent = false;
2636
2637                parseCAonly = params.isParseCAOnly();
2638
2639                String line = null;
2640
2641                while ((line = buf.readLine()) != null) {
2642
2643                        // ignore empty lines
2644                        if ( line.equals("") ||
2645                                        (line.equals(NEWLINE))){
2646                                continue;
2647                        }
2648
2649
2650                        // ignore short TER and END lines
2651                        if ( line.startsWith("END")) {
2652                                continue;
2653                        }
2654
2655                        if ( line.length() < 6 && !line.startsWith("TER")) {
2656                                logger.info("Found line length below 6. Ignoring it, line: >" + line +"<" );
2657                                continue;
2658                        }
2659
2660                        String recordName = null;
2661                        if (line.length()<6)
2662                                recordName = line.trim();
2663                        else
2664                                recordName = line.substring (0, 6).trim ();
2665
2666                        try {
2667                                if (recordName.equals("ATOM"))
2668                                        pdb_ATOM_Handler(line);
2669                                else if (recordName.equals("SEQRES"))
2670                                        pdb_SEQRES_Handler(line);
2671                                else if (recordName.equals("HETATM"))
2672                                        pdb_ATOM_Handler(line);
2673                                else if (recordName.equals("MODEL"))
2674                                        pdb_MODEL_Handler(line);
2675                                else if (recordName.equals("TER"))
2676                                        pdb_TER_Handler();
2677                                else if (recordName.equals("HEADER"))
2678                                        pdb_HEADER_Handler(line);
2679                                else if (recordName.equals("AUTHOR"))
2680                                        pdb_AUTHOR_Handler(line);
2681                                else if (recordName.equals("TITLE"))
2682                                        pdb_TITLE_Handler(line);
2683                                else if (recordName.equals("SOURCE"))
2684                                        sourceLines.add(line); //pdb_SOURCE_Handler
2685                                else if (recordName.equals("COMPND"))
2686                                        compndLines.add(line); //pdb_COMPND_Handler
2687                                else if (recordName.equals("KEYWDS"))
2688                                        keywordsLines.add(line);
2689                                else if (recordName.equals("JRNL"))
2690                                        pdb_JRNL_Handler(line);
2691                                else if (recordName.equals("EXPDTA"))
2692                                        pdb_EXPDTA_Handler(line);
2693                                else if (recordName.equals("CRYST1"))
2694                                        pdb_CRYST1_Handler(line);
2695                                else if (recordName.startsWith("MTRIX"))
2696                                        pdb_MTRIXn_Handler(line);
2697                                else if (recordName.equals("REMARK"))
2698                                        pdb_REMARK_Handler(line);
2699                                else if (recordName.equals("CONECT"))
2700                                        pdb_CONECT_Handler(line);
2701                                else if (recordName.equals("REVDAT"))
2702                                        pdb_REVDAT_Handler(line);
2703                                else if (recordName.equals("DBREF"))
2704                                        pdb_DBREF_Handler(line);
2705                                else if (recordName.equals("SITE"))
2706                                        pdb_SITE_Handler(line);
2707                                else if (recordName.equals("SSBOND"))
2708                                        pdb_SSBOND_Handler(line);
2709                                else if (recordName.equals("LINK"))
2710                                        pdb_LINK_Handler(line);
2711                                else if ( params.isParseSecStruc()) {
2712                                        if ( recordName.equals("HELIX") ) pdb_HELIX_Handler (  line ) ;
2713                                        else if (recordName.equals("SHEET")) pdb_SHEET_Handler(line ) ;
2714                                        else if (recordName.equals("TURN")) pdb_TURN_Handler(   line ) ;
2715                                }
2716                        } catch (StringIndexOutOfBoundsException | NullPointerException ex) {
2717                                logger.info("Unable to parse [" + line + "]");
2718                        }
2719                }
2720
2721                makeCompounds(compndLines, sourceLines);
2722                
2723                handlePDBKeywords(keywordsLines);
2724
2725                triggerEndFileChecks();
2726
2727                if (params.shouldCreateAtomBonds()) {
2728                        formBonds();
2729                }
2730
2731                if ( params.shouldCreateAtomCharges()) {
2732                        addCharges();
2733                }
2734
2735                if ( params.isParseSecStruc() && !params.isHeaderOnly())
2736                        setSecStruc();
2737
2738                // Now correct the alternate location group
2739                StructureTools.cleanUpAltLocs(structure);
2740
2741                return structure;
2742
2743                        }
2744
2745
        /**
         * Adds the charges to the Structure.
         * <p>
         * Delegates to {@code ChargeAdder.addCharges}, passing the {@code structure} field.
         */
        private void addCharges() {
                ChargeAdder.addCharges(structure);
        }
2752
2753        /**
2754         * This is the new method for building the COMPND and SOURCE records. Now each method is self-contained.
2755         * @author Jules Jacobsen
2756         * @param  compoundList
2757         * @param  sourceList
2758         */
2759        private void makeCompounds(List<String> compoundList,
2760                        List<String> sourceList) {
2761                //              System.out.println("[makeCompounds] making compounds from compoundLines");
2762
2763                for (String line : compoundList) {
2764                        if (compoundList.indexOf(line) + 1 == compoundList.size()) {
2765                                //                              System.out.println("[makeCompounds] Final line in compoundLines.");
2766                                isLastCompndLine = true;
2767                        }
2768                        pdb_COMPND_Handler(line);
2769
2770                }
2771                //              System.out.println("[makeCompounds] adding sources to compounds from sourceLines");
2772                // since we're starting again from the first compound, reset it here
2773                if ( entities.size() == 0){
2774                        current_compound = new EntityInfo();
2775                } else {
2776                        current_compound = entities.get(0);
2777                }
2778                for (String line : sourceList) {
2779                        if (sourceList.indexOf(line) + 1 == sourceList.size()) {
2780                                //                              System.out.println("[makeCompounds] Final line in sourceLines.");
2781                                isLastSourceLine = true;
2782                        }
2783                        pdb_SOURCE_Handler(line);
2784                }
2785
2786        }
2787
2788        /**Parse KEYWODS record of the PDB file.<br>
2789         * A keyword may be split over two lines. whether a keyword ends by the end 
2790         * of a line or it is aplit over two lines, a <code>space</code> is added 
2791         * between the 2 lines's contents, unless the first line ends in 
2792         * a '-' character.
2793         * <pre>
2794         * Record Format
2795         * COLUMNS       DATA  TYPE     FIELD         DEFINITION 
2796         *      ---------------------------------------------------------------------------------
2797         *       1 -  6       Record name    "KEYWDS" 
2798         *       9 - 10       Continuation   continuation  Allows concatenation of records if necessary.
2799         *      11 - 79       List           keywds        Comma-separated list of keywords relevant
2800         *                                                 to the entry.      
2801         * Example
2802         *               1         2         3         4         5         6         7         8
2803         *      12345678901234567890123456789012345678901234567890123456789012345678901234567890
2804         *      KEYWDS    LYASE,  TRICARBOXYLIC ACID CYCLE, MITOCHONDRION, OXIDATIVE
2805         *      KEYWDS   2 METABOLISM
2806         * </pre>
2807         * @param lines The KEWODS record lines.
2808         * @author Amr ALHOSSARY
2809         */
2810        private void handlePDBKeywords(List<String> lines) {
2811                StringBuilder fullList = new StringBuilder();
2812                for (String line : lines) {
2813                        String kwList = line.substring(10).trim();
2814                        if(kwList.length() > 0) {
2815                                if(fullList.length() > 0 && fullList.indexOf("-", fullList.length()-1) < 0) {
2816                                        fullList.append(' ');
2817                                }
2818                                fullList.append(kwList);
2819                        }
2820                }
2821                String fulllengthList = fullList.toString();
2822                keywords = fulllengthList.split("( )*,( )*");
2823                ArrayList<String> lst = new ArrayList<String>(keywords.length);
2824                for (String keyword : keywords) {
2825                        if(keyword.length() == 0) {
2826                                logger.debug("Keyword empty in structure {}", structure.getIdentifier().toString());
2827                                continue;
2828                        }
2829                        lst.add(keyword);
2830                }
2831                pdbHeader.setKeywords(lst);
2832        }
2833        
2834        /**
2835         * Handles creation of all bonds. Looks at LINK records, SSBOND (Disulfide
2836         * bonds), peptide bonds, and intra-residue bonds.
2837         * <p>
2838         * Note: the current implementation only looks at the first model of each
2839         * structure. This may need to be fixed in the future.
2840         */
2841        private void formBonds() {
2842
2843                BondMaker maker = new BondMaker(structure, params);
2844
2845                // LINK records should be preserved, they are the way that
2846                // inter-residue bonds are created for ligands such as trisaccharides, unusual polymers.
2847                // The analogy in mmCIF is the _struct_conn record.
2848                for (LinkRecord linkRecord : linkRecords) {
2849                        maker.formLinkRecordBond(linkRecord);
2850                }
2851
2852                maker.formDisulfideBonds(ssbonds);
2853
2854                maker.makeBonds();
2855        }
2856
2857
2858
2859        private void triggerEndFileChecks(){
2860
2861                // we need to add the last chain and model, checking for nulls (e.g. the file could be completely empty of ATOM lines)
2862                if (currentChain!=null && currentGroup!=null) {
2863                        currentChain.addGroup(currentGroup);
2864                }
2865                if (currentModel!=null && currentChain!=null) {
2866                        currentModel.add(currentChain);
2867                }
2868                if (currentModel!=null) {
2869                        allModels.add(currentModel);
2870                }
2871
2872                if (blankChainIdsPresent) {
2873                        // from biojava 5.0 there's limited support for old pdb files with blank chain ids
2874                        logger.warn("Found some blank chain ids in PDB file. Please note that support for them has been discontinued and things might not work properly.");
2875                }
2876
2877                // reordering chains following the mmcif model and assigning entities
2878                assignChainsAndEntities();
2879                structure.setEntityInfos(entities);
2880
2881
2882
2883                // header data
2884
2885                Date modDate = pdbHeader.getModDate();
2886                if ( modDate.equals(new Date(0)) ) {
2887                        // modification date = deposition date
2888                        Date depositionDate = pdbHeader.getDepDate();
2889
2890                        if (! depositionDate.equals(modDate)){
2891                                // depDate is 0000-00-00
2892                                pdbHeader.setModDate(depositionDate);
2893                        }
2894                }
2895
2896                structure.setPDBHeader(pdbHeader);
2897                structure.setCrystallographicInfo(crystallographicInfo);
2898
2899                //set the JournalArticle, if there is one
2900                if (!journalLines.isEmpty()) {
2901                        buildjournalArticle();
2902                        pdbHeader.setJournalArticle(journalArticle);
2903                }
2904
2905                structure.setDBRefs(dbrefs);
2906
2907                // Only align if requested (default) and not when headerOnly mode with no Atoms.
2908                // Otherwise, we store the empty SeqRes Groups unchanged in the right chains.
2909                if ( params.isAlignSeqRes() && !params.isHeaderOnly() && !seqResChains.isEmpty()){
2910                        logger.debug("Parsing mode align_seqres, will parse SEQRES and align to ATOM sequence");
2911                        SeqRes2AtomAligner aligner = new SeqRes2AtomAligner();
2912                        aligner.align(structure,seqResChains);
2913
2914                } else {
2915                        logger.debug("Parsing mode unalign_seqres, will parse SEQRES but not align it to ATOM sequence");
2916                        SeqRes2AtomAligner.storeUnAlignedSeqRes(structure, seqResChains, params.isHeaderOnly());
2917                }
2918
2919
2920
2921                //associate the temporary Groups in the siteMap to the ones
2922                if (!params.isHeaderOnly()) {
2923                        // Only can link SITES if Atom Groups were parsed.
2924                        linkSitesToGroups(); // will work now that setSites is called
2925                }
2926
2927                if ( bioAssemblyParser != null){
2928                        bioAssemblyParser.setMacromolecularSizes();
2929                        pdbHeader.setBioAssemblies(bioAssemblyParser.getTransformationMap());
2930                }
2931
2932                if (ncsOperators !=null && ncsOperators.size()>0) {
2933                        crystallographicInfo.setNcsOperators(
2934                                ncsOperators.toArray(new Matrix4d[ncsOperators.size()]));
2935                }
2936
2937
2938                // rfree end file check
2939                // Rfree annotation is not very consistent in PDB format, it varies depending on the software
2940                // Here we follow this strategy:
2941                // a) take the '(NO CUTOFF)' value if the only one available (shelx software, e.g. 1x7q)
2942                // b) don't take it if also a line without '(NO CUTOFF)' is present (CNX software, e.g. 3lak)
2943
2944                if (rfreeNoCutoffLine>0 && rfreeStandardLine<0) {
2945                        pdbHeader.setRfree(rfreeNoCutoffLine);
2946                } else if (rfreeNoCutoffLine>0 && rfreeStandardLine>0) {
2947                        pdbHeader.setRfree(rfreeStandardLine);
2948                } else if (rfreeNoCutoffLine<0 && rfreeStandardLine>0) {
2949                        pdbHeader.setRfree(rfreeStandardLine);
2950                } // otherwise it remains default value: PDBHeader.DEFAULT_RFREE
2951
2952
2953
2954        }
2955
2956        private void setSecStruc(){
2957
2958                setSecElement(helixList, SecStrucInfo.PDB_AUTHOR_ASSIGNMENT,
2959                                SecStrucType.helix4);
2960                setSecElement(strandList, SecStrucInfo.PDB_AUTHOR_ASSIGNMENT,
2961                                SecStrucType.extended);
2962                setSecElement(turnList, SecStrucInfo.PDB_AUTHOR_ASSIGNMENT,
2963                                SecStrucType.turn);
2964
2965                //Now insert random coil to the Groups that did not have SS information
2966                GroupIterator gi = new GroupIterator(structure);
2967                while (gi.hasNext()){
2968                        Group g = gi.next();
2969                        if (g.hasAminoAtoms()){
2970                                if (g.getProperty(Group.SEC_STRUC) == null){
2971                                        SecStrucInfo ss = new SecStrucInfo(g,
2972                                                        SecStrucInfo.PDB_AUTHOR_ASSIGNMENT,
2973                                                        SecStrucType.coil);
2974                                        g.setProperty(Group.SEC_STRUC, ss);
2975                                }
2976                        }
2977                }
2978
2979        }
2980
2981        private void setSecElement(List<Map<String,String>> secList, String assignment, SecStrucType type){
2982
2983
2984                Iterator<Map<String,String>> iter = secList.iterator();
2985                nextElement:
2986                        while (iter.hasNext()){
2987                                Map<String,String> m = iter.next();
2988
2989                                // assign all residues in this range to this secondary structure type
2990                                // String initResName = (String)m.get("initResName");
2991                                String initChainId = m.get("initChainId");
2992                                String initSeqNum  = m.get("initSeqNum" );
2993                                String initICode   = m.get("initICode" );
2994                                // String endResName  = (String)m.get("endResName" );
2995                                String endChainId  = m.get("endChainId" );
2996                                String endSeqNum   = m.get("endSeqNum");
2997                                String endICode    = m.get("endICode");
2998
2999                                if (initICode.equals(" "))
3000                                        initICode = "";
3001                                if (endICode.equals(" "))
3002                                        endICode = "";
3003
3004                                GroupIterator gi = new GroupIterator(structure);
3005                                boolean inRange = false;
3006                                while (gi.hasNext()){
3007                                        Group g = gi.next();
3008                                        Chain c = g.getChain();
3009
3010                                        if (c.getName().equals(initChainId)){
3011
3012                                                String pdbCode = initSeqNum + initICode;
3013                                                if ( g.getResidueNumber().toString().equals(pdbCode)  ) {
3014                                                        inRange = true;
3015                                                }
3016                                        }
3017                                        if ( inRange){
3018                                                if (g.hasAminoAtoms()) {
3019                                                        SecStrucInfo ss = new SecStrucInfo(g, assignment, type);
3020                                                        g.setProperty(Group.SEC_STRUC, ss);
3021                                                }
3022
3023                                        }
3024                                        if ( c.getName().equals(endChainId)){
3025                                                String pdbCode = endSeqNum + endICode;
3026                                                if (pdbCode.equals(g.getResidueNumber().toString())){
3027                                                        inRange = false;
3028                                                        continue nextElement;
3029                                                }
3030                                        }
3031                                }
3032                        }
3033        }
3034
3035        /**
3036         * Gets all chains with given chainName from given models list
3037         * @param chainName
3038         * @param polyModels
3039         * @return
3040         */
3041        private static List<List<Chain>> findChains(String chainName, List<List<Chain>> polyModels) {
3042                List<List<Chain>> models = new ArrayList<>();
3043
3044                for (List<Chain> chains:polyModels) {
3045                        List<Chain> matchingChains = new ArrayList<>();
3046                        models.add(matchingChains);
3047                        for (Chain c:chains) {
3048                                if (c.getName().equals(chainName)) {
3049                                        matchingChains.add(c);
3050                                }
3051                        }
3052                }
3053                return models;
3054        }
3055
3056        /**
3057         * Split the given chain (containing non-polymer groups and water groups only)
3058         * into individual chains per non-polymer group and individual chains per contiguous sets of water groups.
3059         * @param chain
3060         * @return a list of lists of size 2: first list is the split non-poly chains, second list is the split water chains
3061         */
3062        private static List<List<Chain>> splitNonPolyChain(Chain chain) {
3063                List<Chain> splitNonPolys = new ArrayList<>();
3064                List<Chain> waterChains = new ArrayList<>();
3065
3066                Chain split = null;
3067                boolean previousGroupIsWater = false;
3068
3069                for (Group g:chain.getAtomGroups()){
3070
3071                        if (!previousGroupIsWater) {
3072                                // add last one if there's one
3073                                if (split!=null) {
3074                                        splitNonPolys.add(split);
3075                                }
3076                                split = new ChainImpl();
3077                                split.setName(chain.getName());
3078                        } else if (!g.isWater()) {
3079                                // previous group is water and this group is not water: we change from a water chain to a non-poly
3080                                // we'll need to add now the water chain to the list of water chains
3081                                waterChains.add(split);
3082                                split = new ChainImpl();
3083                                split.setName(chain.getName());
3084                        }
3085
3086                        if (g.isWater()) {
3087                                previousGroupIsWater = true;
3088                        } else {
3089                                previousGroupIsWater = false;
3090
3091                        }
3092
3093                        // this should include alt locs (referenced from the main group)
3094                        split.addGroup(g);
3095
3096                }
3097
3098                // adding the last split chain: either to water or non-poly depending on what was the last seen group
3099                if (split!=null) {
3100                        if (previousGroupIsWater)
3101                                waterChains.add(split);
3102                        else
3103                                splitNonPolys.add(split);
3104                }
3105
3106
3107                List<List<Chain>> all = new ArrayList<>(2);
3108                all.add(splitNonPolys);
3109                all.add(waterChains);
3110
3111                return all;
3112        }
3113
3114        /**
3115         * Assign asym ids following the rules used by the PDB to assign asym ids in mmCIF files
3116         * @param polys
3117         * @param nonPolys
3118         * @param waters
3119         */
3120        private void assignAsymIds(List<List<Chain>> polys, List<List<Chain>> nonPolys, List<List<Chain>> waters) {
3121
3122                for (int i=0; i<polys.size(); i++) {
3123                        String asymId = "A";
3124
3125                        for (Chain poly:polys.get(i)) {
3126                                poly.setId(asymId);
3127                                asymId = getNextAsymId(asymId);
3128                        }
3129                        for (Chain nonPoly:nonPolys.get(i)) {
3130                                nonPoly.setId(asymId);
3131                                asymId = getNextAsymId(asymId);
3132                        }
3133                        for (Chain water:waters.get(i)) {
3134                                water.setId(asymId);
3135                                asymId = getNextAsymId(asymId);
3136                        }
3137                }
3138        }
3139
3140        /**
3141         * Gets the next asym id given an asymId, according to the convention followed by
3142         * mmCIF files produced by the PDB
3143         * i.e.: A,B,...,Z,AA,BA,CA,...,ZA,AB,BB,CB,...,ZB,.......,ZZ,AAA,BAA,CAA,...
3144         * @param asymId
3145         * @return
3146         */
3147        private String getNextAsymId(String asymId) {
3148                if (asymId.length()==1) {
3149                        if (!asymId.equals("Z")) {
3150                                return Character.toString(getNextChar(asymId.charAt(0)));
3151                        } else {
3152                                return "AA";
3153                        }
3154                } else if (asymId.length()==2) {
3155                        if (asymId.equals("ZZ")) {
3156                                return "AAA";
3157                        }
3158                        char[] c = new char[2];
3159                        asymId.getChars(0, 2, c, 0);
3160                        c[0] = getNextChar(c[0]);
3161                        if (c[0]=='A') {
3162                                c[1] = getNextChar(c[1]);
3163                        }
3164                        return new String(c);
3165                } else if (asymId.length()==3) {
3166                        char[] c = new char[3];
3167                        asymId.getChars(0, 3, c, 0);
3168                        c[0] = getNextChar(c[0]);
3169                        if (c[0]=='A') {
3170                                c[1] = getNextChar(c[1]);
3171                                if (c[1]=='A') {
3172                                        c[2] = getNextChar(c[2]);
3173                                }
3174                        }
3175                        return new String(c);
3176                }
3177                return null;
3178        }
3179
3180        private char getNextChar(char c) {
3181                if (c!='Z') {
3182                        return ((char)(c+1));
3183                } else {
3184                        return 'A';
3185                }
3186        }
3187
3188        /**
3189         * Here we assign chains following the mmCIF data model:
3190         * one chain per polymer, one chain per non-polymer group and
3191         * several water chains.
3192         * <p>
3193         * Subsequently we assign entities for them: either from those read from
3194         * COMPOUND records or from those found heuristically through {@link EntityFinder}
3195         *
3196         */
3197        private void assignChainsAndEntities(){
3198
3199                List<List<Chain>> polyModels = new ArrayList<>();
3200                List<List<Chain>> nonPolyModels = new ArrayList<>();
3201                List<List<Chain>> waterModels = new ArrayList<>();
3202
3203                for (List<Chain> model:allModels) {
3204
3205                        List<Chain> polyChains = new ArrayList<>();
3206                        List<Chain> nonPolyChains = new ArrayList<>();
3207                        List<Chain> waterChains = new ArrayList<>();
3208
3209                        polyModels.add(polyChains);
3210                        nonPolyModels.add(nonPolyChains);
3211                        waterModels.add(waterChains);
3212
3213                        for (Chain c:model) {
3214
3215                                // we only have entities for polymeric chains, all others are ignored for assigning entities
3216                                if (c.isWaterOnly()) {
3217                                        waterChains.add(c);
3218
3219                                } else if (c.isPureNonPolymer()) {
3220                                        nonPolyChains.add(c);
3221
3222                                } else {
3223                                        polyChains.add(c);
3224                                }
3225                        }
3226                }
3227
3228                List<List<Chain>> splitNonPolyModels = new ArrayList<>();
3229                for (int i=0; i<nonPolyModels.size(); i++) {
3230                        List<Chain> nonPolyModel = nonPolyModels.get(i);
3231                        List<Chain> waterModel = waterModels.get(i);
3232
3233                        List<Chain> splitNonPolys = new ArrayList<>();
3234                        splitNonPolyModels.add(splitNonPolys);
3235
3236                        for (Chain nonPoly:nonPolyModel) {
3237                                List<List<Chain>> splits = splitNonPolyChain(nonPoly);
3238                                splitNonPolys.addAll(splits.get(0));
3239                                waterModel.addAll(splits.get(1));
3240                        }
3241                }
3242
3243
3244                // now we have all chains as in mmcif, let's assign ids following the mmcif rules
3245                assignAsymIds(polyModels, splitNonPolyModels, waterModels);
3246
3247
3248                if (!entities.isEmpty()) {
3249                        // if the file contained COMPOUND records then we can assign entities to the poly chains
3250                        for (EntityInfo comp : entities){
3251                        List<String> chainIds = compoundMolIds2chainIds.get(comp.getMolId());
3252                        if ( chainIds == null)
3253                                continue;
3254                        for ( String chainId : chainIds) {
3255
3256                                        List<List<Chain>> models = findChains(chainId, polyModels);
3257
3258                                        for (List<Chain> matchingChains:models) {
3259                                                for (Chain chain:matchingChains) {
3260                                                        comp.addChain(chain);
3261                                                        chain.setEntityInfo(comp);
3262                                                }
3263
3264                                                if (matchingChains.isEmpty()) {
3265                                        // usually if this happens something is wrong with the PDB header
3266                                        // e.g. 2brd - there is no Chain A, although it is specified in the header
3267                                        // Some bona-fide cases exist, e.g. 2ja5, chain N is described in SEQRES
3268                                        // but the authors didn't observe in the density so it's completely missing
3269                                        // from the ATOM lines
3270                                                        logger.warn("Could not find polymeric chain {} to link to entity {}. The chain will be missing in the entity.", chainId, comp.getMolId());
3271                                                }
3272                                        }
3273                                }
3274                        }
3275
3276                } else {
3277
3278                        logger.info("Entity information (COMPOUND record) not found in file. Will assign entities heuristically");
3279                        // if no entity information was present in file we then go and find the entities heuristically with EntityFinder
3280                        entities = EntityFinder.findPolyEntities(polyModels);
3281
3282                }
3283
3284                // now we assign entities to the nonpoly and water chains
3285                EntityFinder.createPurelyNonPolyEntities(splitNonPolyModels, waterModels, entities);
3286
3287
3288                // in some rare cases purely non-polymer or purely water chain are present in pdb files
3289                // see https://github.com/biojava/biojava/pull/394
3290                // these case should be covered by the above
3291
3292
3293                // now that we have entities in chains we add the chains to the structure
3294
3295                for (int i=0;i<allModels.size();i++) {
3296                        List<Chain> model = new ArrayList<>();
3297                        model.addAll(polyModels.get(i));
3298                        model.addAll(splitNonPolyModels.get(i));
3299                        model.addAll(waterModels.get(i));
3300                        structure.addModel(model);
3301                        }
3302
3303
3304        }
3305
        /**
         * Links the Sites in the siteMap to the Groups in the Structure via the
         * ResidueNumbers stored in the siteToResidueMap, and sets the resulting
         * Sites on the Structure.
         * @author Jules Jacobsen
         */
3312        private void linkSitesToGroups() {
3313
3314                //System.out.println("LINK SITES TO GROUPS:" + siteToResidueMap.keySet().size());
3315
3316                //link the map of siteIds : <ResidueNumber> with the sites by using ResidueNumber to get the correct group back.
3317                //the return list
3318
3319                if ( siteMap == null || siteToResidueMap == null){
3320                        logger.info("Sites can not be linked to residues!");
3321
3322                        return;
3323                }
3324
3325                List<Site> sites = null;
3326                //check that there are chains with which to associate the groups
3327                if (structure.getChains().isEmpty()) {
3328                        sites = new ArrayList<Site>(siteMap.values());
3329                        logger.info("No chains to link Site Groups with - Sites will not be present in the Structure");
3330                        return;
3331                }
3332
3333                //check that the keys in the siteMap and SiteToResidueMap are equal
3334                if (! siteMap.keySet().equals(siteToResidueMap.keySet())) {
3335                        logger.info("Not all sites have been properly described in the PDB " + pdbId + " header - some Sites will not be present in the Structure");
3336                        logger.debug(siteMap.keySet() + " | " + siteToResidueMap.keySet());
3337                        //return;
3338                }
3339
3340                //so we have chains - associate the siteResidues-related groups with the ones
3341                //already in in the chains
3342                for (String key : siteMap.keySet()) {
3343                        Site currentSite = siteMap.get(key);
3344                        List<ResidueNumber> linkedGroups = siteToResidueMap.get(key);
3345                        if ( linkedGroups == null)
3346                                continue;
3347                        for (ResidueNumber residueNumber : linkedGroups) {
3348
3349                                String pdbCode = residueNumber.toString();
3350                                String chain = residueNumber.getChainName();
3351                                //                    System.out.println("chain: '" + chain + "'");
3352                                //                    String resNum = resNum.getSeqNum().toString();
3353                                //                    System.out.println("resNum: '" + resNum + "'");
3354
3355                                Group linkedGroup = null;
3356                                try {
3357                                        //TODO: implement findGroup(ResidueNumber resNum)
3358                                        linkedGroup = structure.findGroup(chain, pdbCode);
3359                                } catch (StructureException ex) {
3360                                        logger.info("Can't find group " + pdbCode + " in chain " + chain + " in order to link up SITE records (PDB ID " + pdbId +")");
3361                                        continue;
3362                                }
3363
3364                                //                    System.out.println("Adding group: " + linkedGroup.getSeqNum() + " to site " + site.getSiteID());
3365                                currentSite.getGroups().add(linkedGroup);
3366                        }
3367                }
3368
3369                //System.out.println("SITEMAP: " + siteMap);
3370
3371                sites = new ArrayList<Site>(siteMap.values());
3372                structure.setSites(sites);
3373                //System.out.println("STRUCTURE SITES: " + structure.getSites().size());
3374                //            for (Site site : structure.getSites()) {
3375                //                System.out.println(site);
3376                //            }
3377                //            System.out.println("Linked Site Groups with Chains");
3378
3379        }
3380
3381        private void buildjournalArticle() {
3382
3383                logger.debug("building new JournalArticle");
3384                //            for (String line : journalLines) {
3385                //                System.out.println(line);
3386                //            }
3387
3388                this.journalArticle = new JournalArticle();
3389                //        JRNL        AUTH   M.HAMMEL,G.SFYROERA,D.RICKLIN,P.MAGOTTI,
3390                //        JRNL        AUTH 2 J.D.LAMBRIS,B.V.GEISBRECHT
3391                //        JRNL        TITL   A STRUCTURAL BASIS FOR COMPLEMENT INHIBITION BY
3392                //        JRNL        TITL 2 STAPHYLOCOCCUS AUREUS.
3393                //        JRNL        REF    NAT.IMMUNOL.                  V.   8   430 2007
3394                //        JRNL        REFN                   ISSN 1529-2908
3395                //        JRNL        PMID   17351618
3396                //        JRNL        DOI    10.1038/NI1450
3397                StringBuffer auth = new StringBuffer();
3398                StringBuffer titl = new StringBuffer();
3399                StringBuffer edit = new StringBuffer();
3400                StringBuffer ref = new StringBuffer();
3401                StringBuffer publ = new StringBuffer();
3402                StringBuffer refn = new StringBuffer();
3403                StringBuffer pmid = new StringBuffer();
3404                StringBuffer doi = new StringBuffer();
3405
3406                for (String line : journalLines) {
3407                        if ( line.length() < 19 ) {
3408                                logger.info("can not process Journal line: " + line);
3409                                continue;
3410                        }
3411                        //            System.out.println("'" + line + "'");
3412                        String subField = line.substring(12, 16);
3413                        //            System.out.println("'" + subField + "'");
3414                        if (subField.equals("AUTH")) {
3415                                auth.append(line.substring(19, line.length()).trim());
3416
3417                                logger.debug("AUTH '" + auth.toString() + "'");
3418
3419                        }
3420                        if (subField.equals("TITL")) {
3421                                //add a space to the end of a line so that when wrapped the
3422                                //words on the join won't be concatenated
3423                                titl.append(line.substring(19, line.length()).trim()).append(" ");
3424
3425                                logger.debug("TITL '" + titl.toString() + "'");
3426
3427                        }
3428                        if (subField.equals("EDIT")) {
3429                                edit.append(line.substring(19, line.length()).trim());
3430
3431                                logger.debug("EDIT '" + edit.toString() + "'");
3432
3433                        }
3434                        //        JRNL        REF    NAT.IMMUNOL.                  V.   8   430 2007
3435                        if (subField.equals("REF ")) {
3436                                ref.append(line.substring(19, line.length()).trim()).append(" ");
3437
3438                                logger.debug("REF '" + ref.toString() + "'");
3439
3440                        }
3441                        if (subField.equals("PUBL")) {
3442                                publ.append(line.substring(19, line.length()).trim()).append(" ");
3443
3444                                logger.debug("PUBL '" + publ.toString() + "'");
3445
3446                        }
3447                        //        JRNL        REFN                   ISSN 1529-2908
3448                        if (subField.equals("REFN")) {
3449                                if ( line.length() < 35 ) {
3450                                        logger.info("can not process Journal REFN line: " + line);
3451                                        continue;
3452                                }
3453                                refn.append(line.substring(35, line.length()).trim());
3454
3455                                logger.debug("REFN '" + refn.toString() + "'");
3456
3457                        }
3458                        //        JRNL        PMID   17351618
3459                        if (subField.equals("PMID")) {
3460                                pmid.append(line.substring(19, line.length()).trim());
3461
3462                                logger.debug("PMID '" + pmid.toString() + "'");
3463
3464                        }
3465                        //        JRNL        DOI    10.1038/NI1450
3466                        if (subField.equals("DOI ")) {
3467                                doi.append(line.substring(19, line.length()).trim());
3468
3469                                logger.debug("DOI '" + doi.toString() + "'");
3470
3471                        }
3472                }
3473
3474                //now set the parts of the JournalArticle
3475                journalArticle.setAuthorList(authorBuilder(auth.toString()));
3476                journalArticle.setEditorList(authorBuilder(edit.toString()));
3477                journalArticle.setRef(ref.toString());
3478                JournalParser journalParser = new JournalParser(ref.toString());
3479                journalArticle.setJournalName(journalParser.getJournalName());
3480                if (!journalArticle.getJournalName().equals("TO BE PUBLISHED")) {
3481                        journalArticle.setIsPublished(true);
3482                }
3483                journalArticle.setVolume(journalParser.getVolume());
3484                journalArticle.setStartPage(journalParser.getStartPage());
3485                journalArticle.setPublicationDate(journalParser.getPublicationDate());
3486                journalArticle.setPublisher(publ.toString().trim());
3487                journalArticle.setTitle(titl.toString().trim());
3488                journalArticle.setRefn(refn.toString().trim());
3489                journalArticle.setPmid(pmid.toString().trim());
3490                journalArticle.setDoi(doi.toString().trim());
3491
3492
3493                logger.debug("Made JournalArticle:");
3494                logger.debug(journalArticle.toString());
3495
3496        }
3497
3498        //inner class to deal with all the journal info
3499        private class JournalParser {
3500
3501                private String journalName;
3502                private String volume;
3503                private String startPage;
3504                private int publicationDate;
3505
3506
3507                public JournalParser(String ref) {
3508
3509                        logger.debug("JournalParser init '" + ref + "'");
3510
3511
3512                        if (ref.equals("TO BE PUBLISHED ")) {
3513                                journalName = ref.trim();
3514
3515                                logger.debug(String.format("JournalParser found journalString '%s'", journalName));
3516
3517                                return;
3518                        }
3519
3520                        if (ref.length() < 48) {
3521                                logger.info("REF line too short - must be at least 48 characters to be valid for parsing.");
3522                                journalName = "";
3523                                volume = "";
3524                                startPage = "";
3525                                publicationDate = 0;
3526                                return;
3527                        }
3528                        //can be multi line:
3529                        //REF    PHILOS.TRANS.R.SOC.LONDON,    V. 293    53 1981
3530                        //REF  2 SER.B
3531
3532                        //or
3533
3534                        //REF    GLYCOGEN PHOSPHORYLASE B:                1 1991
3535                        //REF  2 DESCRIPTION OF THE PROTEIN
3536                        //REF  3 STRUCTURE
3537
3538                        //but usually single line
3539                        //REF    NUCLEIC ACIDS RES.                         2009
3540                        //REF    MOL.CELL                                   2009
3541                        //REF    NAT.STRUCT.MOL.BIOL.          V.  16   238 2009
3542                        //REF    ACTA CRYSTALLOGR.,SECT.F      V.  65   199 2009
3543                        //check if the date is present at the end of the line.
3544                        //                             09876543210987654321
3545                        //'J.BIOL.CHEM.                  V. 280 23000 2005 '
3546                        //'J.AM.CHEM.SOC.                V. 130 16011 2008 '
3547                        //'NAT.STRUCT.MOL.BIOL.          V.  16   238 2009'
3548                        String volumeInformation = ref.substring(30, 48);
3549
3550                        logger.debug(String.format("Parsing volumeInformation: '%s'", volumeInformation));
3551
3552                        //volumeInformation: 'V. 293    53 1981 '
3553                        //                      String dateString = ref.substring(ref.length() - 5 , ref.length() - 1).trim();
3554                        //                      String startPageString = ref.substring(ref.length() - 11 , ref.length() - 6).trim();
3555                        //                      String volumeString = ref.substring(ref.length() - 16 , ref.length() - 12).trim();
3556                        //                      String journalString = ref.substring(0 , ref.length() - 18).trim();
3557                        String dateString = volumeInformation.substring(volumeInformation.length() - 5 , volumeInformation.length() - 1).trim();
3558                        String startPageString = volumeInformation.substring(volumeInformation.length() - 11 , volumeInformation.length() - 6).trim();
3559                        String volumeString = volumeInformation.substring(volumeInformation.length() - 16 , volumeInformation.length() - 12).trim();
3560                        //for the journal string we need to remove the volume information which might be in the middle of the string (e.g. 1gpb, 3pfk)
3561                        String journalString = ref.substring(0 , 29).trim() + " " + ref.substring(30, ref.length() - 1).replace(volumeInformation.trim(), "").trim();
3562                        journalString = journalString.trim();
3563                        //                        System.out.println("journalString: " + journalString);
3564
3565                        logger.debug(String.format("JournalParser found volumeString '%s'", volumeString));
3566                        logger.debug(String.format("JournalParser found startPageString '%s'", startPageString));
3567                        logger.debug(String.format("JournalParser found dateString '%s'", dateString));
3568                        logger.debug(String.format("JournalParser found journalString '%s'", journalString));
3569
3570
3571                        if (!dateString.equals("    ")) {
3572                                try {
3573                                        publicationDate = Integer.valueOf(dateString);
3574                                } catch (NumberFormatException nfe) {
3575                                        logger.info(dateString + " is not a valid integer for a date in JRNL sub-section REF line 1");
3576                                }
3577                                //                              if (DEBUG) {
3578                                //                                      System.out.println("JournalParser set date " + publicationDate);
3579                                //                              }
3580                        }
3581
3582                        if (!startPageString.equals("    ")) {
3583                                startPage = startPageString;
3584                                //                              if (DEBUG) {
3585                                //                                      System.out.println("JournalParser set startPage " + startPage);
3586                                //                              }
3587                        }
3588
3589                        if (!volumeString.equals("    ")) {
3590                                volume = volumeString;
3591                                //                              if (DEBUG) {
3592                                //                                      System.out.println("JournalParser set volume " + volume);
3593                                //                              }
3594                        }
3595
3596                        if (!journalString.equals("    ")) {
3597                                journalName = journalString;
3598
3599                                logger.debug("JournalParser set journalName " + journalName);
3600
3601                        }
3602                }
3603
3604                private String getJournalName() {
3605                        return journalName;
3606                }
3607
3608                private int getPublicationDate() {
3609                        return publicationDate;
3610                }
3611
3612                private String getStartPage() {
3613                        return startPage;
3614                }
3615
3616                private String getVolume() {
3617                        return volume;
3618                }
3619        }
3620
3621        private List<Author> authorBuilder(String authorString) {
3622                ArrayList<Author> authorList = new ArrayList<Author>();
3623
3624                if (authorString.equals("")) {
3625                        return authorList;
3626                }
3627
3628                String[] authors = authorString.split(",");
3629                //        if (DEBUG) {
3630                //            for (int i = 0; i < authors.length; i++) {
3631                //                String string = authors[i];
3632                //                System.out.println("authorBuilder author: '" + string + "'");
3633                //            }
3634                //        }
3635                //        AUTH   SEATTLE STRUCTURAL GENOMICS CENTER FOR INFECTIOUS
3636                //        AUTH 2 DISEASE (SSGCID)
3637                //        or
3638                //        AUTH   E.DOBROVETSKY,A.DONG,A.SEITOVA,B.DUNCAN,L.CROMBET,
3639                //        AUTH 2 M.SUNDSTROM,C.H.ARROWSMITH,A.M.EDWARDS,C.BOUNTRA,
3640                //        AUTH 3 A.BOCHKAREV,D.COSSAR,
3641                //        AUTH 4 STRUCTURAL GENOMICS CONSORTIUM (SGC)
3642                //        or
3643                //        AUTH   T.-C.MOU,S.R.SPRANG,N.MASADA,D.M.F.COOPER
3644                if (authors.length == 1) {
3645                        //only one element means it's a consortium only
3646                        Author author = new Author();
3647                        author.setSurname(authors[0]);
3648
3649                        logger.debug("Set consortium author name " + author.getSurname());
3650
3651                        authorList.add(author);
3652                } else {
3653                        for (int i = 0; i < authors.length; i++) {
3654                                String authorFullName = authors[i];
3655
3656                                logger.debug("Building author " + authorFullName);
3657
3658                                Author author = new Author();
3659                                String regex = "\\.";
3660                                String[] authorNames = authorFullName.split(regex);
3661                                //                if (DEBUG) {
3662                                //                    System.out.println("authorNames size " + authorNames.length);
3663                                //                    for (int j = 0; j < authorNames.length; j++) {
3664                                //                        String name = authorNames[j];
3665                                //                        System.out.println("split authName '" + name + "'");
3666                                //
3667                                //                    }
3668                                //                }
3669                                if (authorNames.length == 0) {
3670                                        author.setSurname(authorFullName);
3671
3672                                        logger.debug("Unable to split using '" + regex + "' Setting whole name " + author.getSurname());
3673
3674                                }
3675                                //again there might be a consortium name so there may be no elements
3676                                else if (authorNames.length == 1) {
3677                                        author.setSurname(authorNames[0]);
3678
3679                                        logger.debug("Set consortium author name in multiple author block " + author.getSurname
3680                                                                ());
3681
3682                                } else {
3683                                        String initials = "";
3684                                        for (int j = 0; j < authorNames.length - 1; j++) {
3685                                                String initial = authorNames[j];
3686                                                //                        if (DEBUG) {
3687                                                //                            System.out.println("adding initial '" + initial + "'");
3688                                                //                        }
3689                                                //build the initials back up again
3690                                                initials += initial + ".";
3691                                        }
3692
3693                                        logger.debug("built initials '" + initials + "'");
3694
3695                                        author.setInitials(initials);
3696                                        //surname is always last
3697                                        int lastName = authorNames.length - 1;
3698                                        String surname = authorNames[lastName];
3699
3700                                        logger.debug("built author surname " + surname);
3701
3702                                        author.setSurname(surname);
3703
3704                                }
3705                                authorList.add(author);
3706                        }
3707                }
3708                return authorList;
3709        }
3710
3711        public void setFileParsingParameters(FileParsingParameters params)
3712        {
3713                this.params= params;
3714
3715                // set the correct max values for parsing...
3716                loadMaxAtoms = params.getMaxAtoms();
3717                atomCAThreshold = params.getAtomCaThreshold();
3718
3719
3720        }
3721
3722        public FileParsingParameters getFileParsingParameters(){
3723                return params;
3724        }
3725
3726
3727}