001/*
002 *
003 * This code may be freely distributed and modified under the
004 * terms of the GNU Lesser General Public Licence.  This should
005 * be distributed with the code.  If you do not have a copy,
006 * see:
007 *
008 *      http://www.gnu.org/copyleft/lesser.html
009 *
010 * Copyright for this code is held jointly by the individual
011 * authors.  These should be listed in @author doc comments.
012 *
013 * For more information on the BioJava project and its aims,
014 * or to join the biojava-l mailing list, visit the home page
015 * at:
016 *
017 *      http://www.biojava.org/
018 *
019 * Created on 16.03.2004
020 *
021 */
022package org.biojava.nbio.structure.io;
023
024import static java.lang.Math.min;
025
026import java.io.BufferedReader;
027import java.io.IOException;
028import java.io.InputStream;
029import java.io.InputStreamReader;
030import java.text.DateFormat;
031import java.text.ParseException;
032import java.text.SimpleDateFormat;
033import java.util.ArrayList;
034import java.util.Arrays;
035import java.util.Date;
036import java.util.HashMap;
037import java.util.Iterator;
038import java.util.LinkedHashMap;
039import java.util.List;
040import java.util.Locale;
041import java.util.Map;
042import java.util.StringTokenizer;
043import java.util.regex.Matcher;
044import java.util.regex.Pattern;
045
046import javax.vecmath.Matrix4d;
047
048import org.biojava.nbio.structure.AminoAcid;
049import org.biojava.nbio.structure.AminoAcidImpl;
050import org.biojava.nbio.structure.Atom;
051import org.biojava.nbio.structure.AtomImpl;
052import org.biojava.nbio.structure.Author;
053import org.biojava.nbio.structure.Chain;
054import org.biojava.nbio.structure.ChainImpl;
055import org.biojava.nbio.structure.DBRef;
056import org.biojava.nbio.structure.Element;
057import org.biojava.nbio.structure.EntityInfo;
058import org.biojava.nbio.structure.EntityType;
059import org.biojava.nbio.structure.Group;
060import org.biojava.nbio.structure.GroupIterator;
061import org.biojava.nbio.structure.HetatomImpl;
062import org.biojava.nbio.structure.JournalArticle;
063import org.biojava.nbio.structure.NucleotideImpl;
064import org.biojava.nbio.structure.PDBCrystallographicInfo;
065import org.biojava.nbio.structure.PDBHeader;
066import org.biojava.nbio.structure.ResidueNumber;
067import org.biojava.nbio.structure.Site;
068import org.biojava.nbio.structure.Structure;
069import org.biojava.nbio.structure.StructureException;
070import org.biojava.nbio.structure.StructureImpl;
071import org.biojava.nbio.structure.StructureTools;
072import org.biojava.nbio.structure.io.mmcif.ChemCompGroupFactory;
073import org.biojava.nbio.structure.io.mmcif.model.ChemCompAtom;
074import org.biojava.nbio.structure.io.util.PDBTemporaryStorageUtils.LinkRecord;
075import org.biojava.nbio.structure.secstruc.SecStrucInfo;
076import org.biojava.nbio.structure.secstruc.SecStrucType;
077import org.biojava.nbio.structure.xtal.CrystalCell;
078import org.biojava.nbio.structure.xtal.SpaceGroup;
079import org.biojava.nbio.structure.xtal.SymoplibParser;
080import org.slf4j.Logger;
081import org.slf4j.LoggerFactory;
082
083
084/**
085 * This class implements the actual PDB file parsing. Do not access it directly, but
086 * via the PDBFileReader class.
087 *
088 * <h2>Parsing</h2>
089 *
090 * During the PDBfile parsing several Flags can be set. See the {@link #setFileParsingParameters(FileParsingParameters)} methods.
091 *
092 *
093 * <p>
 * To prevent excessive memory usage for large PDB files, there is the ATOM_CA_THRESHOLD.
095 * If more Atoms than this threshold are being parsed in a PDB file, the parser will automatically
096 * switch to a C-alpha only representation.
097 *
098 * <p>
099 * The result of the parsing of the PDB file is a new {@link Structure} object.
100 *
101 * <p>
102 * For more documentation on how to work with the Structure API please
103 * see <a href="http://biojava.org/wiki/BioJava:CookBook#Protein_Structure" target="_top">
104 * http://biojava.org/wiki/BioJava:CookBook#Protein_Structure</a>
105 *
106 *
107 *
108 *
109 * <h2>Example</h2>
110 * <p>
111 * Q: How can I get a Structure object from a PDB file?
112 * <p>
113 * A:
114 * <pre>
115 * public {@link Structure} loadStructure(String pathToPDBFile){
116 *      // The PDBFileParser is wrapped by the PDBFileReader
117 *      {@link PDBFileReader} pdbreader = new {@link PDBFileReader}();
118 *
119 *      {@link Structure} structure = null;
120 *      try{
121 *              structure = pdbreader.getStructure(pathToPDBFile);
122 *              System.out.println(structure);
123 *      } catch (IOException e) {
124 *              e.printStackTrace();
125 *      }
126 *      return structure;
127 * }
128 * </pre>
129 *
130 *
131 * @author Andreas Prlic
132 * @author Jules Jacobsen
133 * @author Jose Duarte
134 * @since 1.4
135 */
136public class PDBFileParser  {
137
138
139
140        private static final Logger logger = LoggerFactory.getLogger(PDBFileParser.class);
141
142        // for printing
143        private static final String NEWLINE = System.getProperty("line.separator");
144
145
146        // required for parsing:
147        private String pdbId; //the actual id of the entry
148        private Structure     structure;
149        private List<List<Chain>> allModels; // a temp data structure to keep all models
150        private List<Chain>   currentModel; // contains the ATOM records for each model
151        private Chain         currentChain;
152        private Group         currentGroup;
153
154        private List<Chain>   seqResChains; // contains all the chains for the SEQRES records
155        //we're going to work on the assumption that the files are current -
156        //if the pdb_HEADER_Handler detects a legacy format, this will be changed to true.
157        //if true then lines will be truncated at 72 characters in certain cases
158        //(pdb_COMPOUND_handler for example)
159        private boolean isLegacyFormat = false;
160
161        private boolean blankChainIdsPresent = false;
162
163        // for re-creating the biological assembly
164        private PDBBioAssemblyParser bioAssemblyParser = null;
165
166        private PDBHeader pdbHeader;
167        private PDBCrystallographicInfo crystallographicInfo;
168        private JournalArticle journalArticle;
169        private List<Map<String, Integer>> connects ;
170        private List<Map<String,String>> helixList;
171        private List<Map<String,String>> strandList;
172        private List<Map<String,String>> turnList;
173
174        private int lengthCheck ;
175
176        private boolean isLastCompndLine = false;
177        private boolean isLastSourceLine = false;
178        private EntityInfo current_compound;
179        private List<EntityInfo> entities = new ArrayList<EntityInfo>();
180        private HashMap<Integer,List<String>> compoundMolIds2chainIds = new HashMap<Integer, List<String>>();
181        private List<String> compndLines = new ArrayList<String>();
182        private List<String> sourceLines = new ArrayList<String>();
183        private List<String> journalLines = new ArrayList<String>();
184        private List<DBRef> dbrefs;
185        private Map<String, Site> siteMap = new LinkedHashMap<String, Site>();
186        private Map<String, List<ResidueNumber>> siteToResidueMap = new LinkedHashMap<String, List<ResidueNumber>>();
187
188        private List<SSBondImpl> ssbonds = new ArrayList<>();
189
190        // for storing LINK until we have all the atoms parsed
191        private List<LinkRecord> linkRecords;
192
193        private Matrix4d currentNcsOp;
194        private List<Matrix4d> ncsOperators;
195
196        // for parsing COMPOUND and SOURCE Header lines
197        private int prevMolId;
198        private String previousContinuationField;
199        private String continuationField;
200        private String continuationString;
201
202        private DateFormat dateFormat;
203
204        // for rfree parsing
205        private float rfreeStandardLine = -1;
206        private float rfreeNoCutoffLine = -1;
207
208        private static  final List<String> compndFieldValues = new ArrayList<String>(
209                        Arrays.asList(
210                                        "MOL_ID:", "MOLECULE:", "CHAIN:", "SYNONYM:",
211                                        "EC:", "FRAGMENT:", "ENGINEERED:", "MUTATION:",
212                                        "BIOLOGICAL_UNIT:", "OTHER_DETAILS:"
213                                        ));
214
215
216        private static final List<String> ignoreCompndFieldValues = new ArrayList<String>(
217                        Arrays.asList(
218                                        "HETEROGEN:","ENGINEEREED:","FRAGMENT,",
219                                        "MUTANT:","SYNTHETIC:"
220                                        ));
221        // ENGINEEREED in pdb219d
222
223        private static final List<String> sourceFieldValues = new ArrayList<String>(
224                        Arrays.asList("ENGINEERED:", "MOL_ID:", "SYNTHETIC:", "FRAGMENT:",
225                                        "ORGANISM_SCIENTIFIC:", "ORGANISM_COMMON:",
226                                        "ORGANISM_TAXID:","STRAIN:",
227                                        "VARIANT:", "CELL_LINE:", "ATCC:", "ORGAN:", "TISSUE:",
228                                        "CELL:", "ORGANELLE:", "SECRETION:", "GENE:",
229                                        "CELLULAR_LOCATION:", "EXPRESSION_SYSTEM:",
230                                        "EXPRESSION_SYSTEM_TAXID:",
231                                        "EXPRESSION_SYSTEM_STRAIN:", "EXPRESSION_SYSTEM_VARIANT:",
232                                        "EXPRESSION_SYSTEM_CELL_LINE:",
233                                        "EXPRESSION_SYSTEM_ATCC_NUMBER:",
234                                        "EXPRESSION_SYSTEM_ORGAN:", "EXPRESSION_SYSTEM_TISSUE:",
235                                        "EXPRESSION_SYSTEM_CELL:", "EXPRESSION_SYSTEM_ORGANELLE:",
236                                        "EXPRESSION_SYSTEM_CELLULAR_LOCATION:",
237                                        "EXPRESSION_SYSTEM_VECTOR_TYPE:",
238                                        "EXPRESSION_SYSTEM_VECTOR:", "EXPRESSION_SYSTEM_PLASMID:",
239                                        "EXPRESSION_SYSTEM_GENE:", "OTHER_DETAILS:"));
240
241        private int atomCount;
242
243        // parsing options:
244
245        private int atomCAThreshold ;
246
247        private int loadMaxAtoms;
248
249        private boolean atomOverflow;
250
251        /** flag to tell parser to only read Calpha coordinates **/
252        private boolean parseCAonly;
253
254
255        private FileParsingParameters params;
256
257        private boolean startOfMolecule;
258        private boolean startOfModel;
259
260        public PDBFileParser() {
261                params = new FileParsingParameters();
262
263                allModels = new ArrayList<>();
264                structure     = null           ;
265                currentModel  = null;
266                currentChain  = null;
267                currentGroup  = null;
268                // we initialise to true since at the beginning of the file we are always starting a new molecule
269                startOfMolecule = true;
270                startOfModel = true;
271
272
273                pdbHeader         = new PDBHeader();
274                crystallographicInfo = new PDBCrystallographicInfo();
275                connects      = new ArrayList<Map<String,Integer>>() ;
276
277
278                helixList     = new ArrayList<Map<String,String>>();
279                strandList    = new ArrayList<Map<String,String>>();
280                turnList      = new ArrayList<Map<String,String>>();
281                current_compound = null;
282                dbrefs        = new ArrayList<DBRef>();
283                siteMap = null;
284                dateFormat = new SimpleDateFormat("dd-MMM-yy", Locale.US);
285                atomCount = 0;
286                atomOverflow = false;
287                parseCAonly = false;
288
289                // this SHOULD not be done
290                // DONOT:setFileParsingParameters(params);
291                // set the correct max values for parsing...
292                loadMaxAtoms = params.getMaxAtoms();
293                atomCAThreshold = params.getAtomCaThreshold();
294
295                linkRecords = new ArrayList<LinkRecord>();
296
297                blankChainIdsPresent = false;
298
299        }
300
301        /** initiate new resNum, either Hetatom, Nucleotide, or AminoAcid */
302        private Group getNewGroup(String recordName,Character aminoCode1, String aminoCode3) {
303
304                Group g =  ChemCompGroupFactory.getGroupFromChemCompDictionary(aminoCode3);
305                if ( g != null && !g.getChemComp().isEmpty())
306                        return g;
307
308
309                Group group;
310                if (aminoCode1 == null || StructureTools.UNKNOWN_GROUP_LABEL == aminoCode1 ){
311                        group = new HetatomImpl();
312
313                } else if(StructureTools.isNucleotide(aminoCode3))  {
314                        // it is a nucleotide
315                        NucleotideImpl nu = new NucleotideImpl();
316                        group = nu;
317
318                } else {
319                        AminoAcidImpl aa = new AminoAcidImpl() ;
320                        aa.setAminoType(aminoCode1);
321                        group = aa ;
322                }
323
324                //              System.out.println("new resNum type: "+ resNum.getType() );
325                return  group ;
326        }
327
328
329
330        // Handler methods to deal with PDB file records properly.
331        /**
332         Handler for
333         HEADER Record Format
334         <pre>
335         COLUMNS        DATA TYPE       FIELD           DEFINITION
336         ----------------------------------------------------------------------------------
337         1 -  6        Record name     "HEADER"
338         11 - 50        String(40)      classification  Classifies the molecule(s)
339         51 - 59        Date            depDate         Deposition date.  This is the date
340         the coordinates were received by
341         the PDB
342         63 - 66        IDcode          idCode          This identifier is unique within PDB
343        </pre>
344         */
345        private void pdb_HEADER_Handler(String line) {
346
347                String classification  = null;
348                String deposition_date = null;
349                String pdbCode         = null;
350
351                int len = line.trim().length();
352                if(len > 10) {
353                        classification  = line.substring (10, min(len,50)).trim() ;
354                        pdbHeader.setClassification(classification);
355                }
356                if(len > 50) {
357                        deposition_date = line.substring (50, min(len,59)).trim() ;
358                        try {
359                                Date dep = dateFormat.parse(deposition_date);
360                                pdbHeader.setDepDate(dep);
361
362                        } catch (ParseException e){
363                                logger.info("Could not parse deposition date string '"+deposition_date+"'. Will continue without deposition date");
364                        }
365                }
366                if(len > 62) {
367                        pdbCode         = line.substring (62, min(len,66)).trim() ;
368                        pdbId = pdbCode;
369
370                        logger.debug("Parsing entry " + pdbId);
371
372
373                        structure.setPDBCode(pdbCode);
374                        pdbHeader.setIdCode(pdbCode);
375                }
376
377                //*really* old files (you'll need to hunt to find these as they
378                //should have been remediated) have headers like below. Plus the
379                //pdbId at positions 72-76 is present in every line
380
381                //HEADER    PROTEINASE INHIBITOR (TRYPSIN)          05-OCT-84   5PTI      5PTI   3
382                //HEADER    TRANSFERASE (ACYLTRANSFERASE)           02-SEP-92   1LAC      1LAC   2
383                if (len > 66) {
384                        if (pdbId.equals(line.substring (72, 76))){
385                                isLegacyFormat = true;
386                                logger.warn(pdbId + " is a LEGACY entry - this will most likely not parse correctly.");
387                        }
388                }
389
390        }
391
392
393        /**
394         * Parses the following record:
395         * <pre>
396         *  COLUMNS      DATA  TYPE      FIELD         DEFINITION
397         * ------------------------------------------------------------------------------------
398         *  1 -  6      Record name     "AUTHOR"
399         *  9 - 10      Continuation    continuation  Allows concatenation of multiple records.
400         * 11 - 79      List            authorList    List of the author names, separated
401         *                                            by commas.
402         *
403         * </pre>
404         * @param line
405         */
406        private void pdb_AUTHOR_Handler(String line) {
407
408                String authors = line.substring(10).trim();
409
410                String auth = pdbHeader.getAuthors();
411                if (auth == null){
412                        pdbHeader.setAuthors(authors);
413                } else {
414                        auth +=  authors;
415                        pdbHeader.setAuthors(auth);
416                }
417
418        }
419
420
421
422        /**
423         * Parses the following record:
424         *
425         * <pre>
426         * COLUMNS       DATA TYPE        FIELD        DEFINITION
427         * --------------------------------------------------------------------
428         *  1 -  6       Record name      "HELIX "
429         *  8 - 10       Integer          serNum       Serial number of the helix.
430         *                                             This starts at 1 and increases
431         *                                             incrementally.
432         * 12 - 14       LString(3)       helixID      Helix identifier. In addition
433         *                                             to a serial number, each helix is
434         *                                             given an alphanumeric character
435         *                                             helix identifier.
436         * 16 - 18       Residue name     initResName  Name of the initial residue.
437         * 20            Character        initChainID  Chain identifier for the chain
438         *                                             containing this helix.
439         * 22 - 25       Integer          initSeqNum   Sequence number of the initial
440         *                                             residue.
441         * 26            AChar            initICode    Insertion code of the initial
442         *                                             residue.
443         * 28 - 30       Residue name     endResName   Name of the terminal residue of
444         *                                             the helix.
445         * 32            Character        endChainID   Chain identifier for the chain
446         *                                             containing this helix.
447         * 34 - 37       Integer          endSeqNum    Sequence number of the terminal
448         *                                             residue.
449         * 38            AChar            endICode     Insertion code of the terminal
450         *                                             residue.
451         * 39 - 40       Integer          helixClass   Helix class (see below).
452         * 41 - 70       String           comment      Comment about this helix.
453         * 72 - 76       Integer          length       Length of this helix.
454         * </pre>
455         */
456        private void pdb_HELIX_Handler(String line){
457
458                if (params.isHeaderOnly()) return;
459
460                if (line.length()<38) {
461                        logger.info("HELIX line has length under 38. Ignoring it.");
462                        return;
463                }
464
465                String initResName = line.substring(15,18).trim();
466                String initChainId = line.substring(19,20);
467                String initSeqNum  = line.substring(21,25).trim();
468                String initICode   = line.substring(25,26);
469                String endResName  = line.substring(27,30).trim();
470                String endChainId  = line.substring(31,32);
471                String endSeqNum   = line.substring(33,37).trim();
472                String endICode    = line.substring(37,38);
473
474                //System.out.println(initResName + " " + initChainId + " " + initSeqNum + " " + initICode + " " +
475                //        endResName + " " + endChainId + " " + endSeqNum + " " + endICode);
476
477                Map<String,String> m = new HashMap<String,String>();
478
479                m.put("initResName",initResName);
480                m.put("initChainId", initChainId);
481                m.put("initSeqNum", initSeqNum);
482                m.put("initICode", initICode);
483                m.put("endResName", endResName);
484                m.put("endChainId", endChainId);
485                m.put("endSeqNum",endSeqNum);
486                m.put("endICode",endICode);
487
488                helixList.add(m);
489
490        }
491
492        /**
493         * Handler for
494         * <pre>
495         *       COLUMNS     DATA TYPE        FIELD           DEFINITION
496         * --------------------------------------------------------------
497         *  1 -  6     Record name      "SHEET "
498         *  8 - 10     Integer          strand       Strand number which starts at 1
499         *                                           for each strand within a sheet
500         *                                           and increases by one.
501         * 12 - 14     LString(3)       sheetID      Sheet identifier.
502         * 15 - 16     Integer          numStrands   Number of strands in sheet.
503         * 18 - 20     Residue name     initResName  Residue name of initial residue.
504         * 22          Character        initChainID  Chain identifier of initial
505         *                                           residue in strand.
506         * 23 - 26     Integer          initSeqNum   Sequence number of initial
507         *                                           residue in strand.
508         * 27          AChar            initICode    Insertion code of initial residue
509         *                                           in strand.
510         * 29 - 31     Residue name     endResName   Residue name of terminal residue.
511         * 33          Character        endChainID   Chain identifier of terminal
512         *                                           residue.
513         * 34 - 37     Integer          endSeqNum    Sequence number of terminal
514         *                                           residue.
515         * 38          AChar            endICode     Insertion code of terminal
516         *                                           residue.
517         * 39 - 40     Integer          sense        Sense of strand with respect to
518         *                                           previous strand in the sheet. 0
519         *                                           if first strand, 1 if parallel,
520         *                                           -1 if anti-parallel.
521         * 42 - 45     Atom             curAtom      Registration. Atom name in
522         *                                           current strand.
523         * 46 - 48     Residue name     curResName   Registration. Residue name in
524         *                                           current strand.
525         * 50          Character        curChainId   Registration. Chain identifier in
526         *                                           current strand.
527         * 51 - 54     Integer          curResSeq    Registration. Residue sequence
528         *                                           number in current strand.
529         * 55          AChar            curICode     Registration. Insertion code in
530         *                                           current strand.
531         * 57 - 60     Atom             prevAtom     Registration. Atom name in
532         *                                           previous strand.
533         * 61 - 63     Residue name     prevResName  Registration. Residue name in
534         *                                           previous strand.
535         * 65          Character        prevChainId  Registration. Chain identifier in
536         *                                           previous strand.
537         * 66 - 69     Integer          prevResSeq   Registration. Residue sequence
538         *                                           number in previous strand.
539         * 70          AChar            prevICode    Registration. Insertion code in
540         *                                               previous strand.
541         * </pre>
542         */
543        private void pdb_SHEET_Handler( String line){
544
545                if (params.isHeaderOnly()) return;
546
547                if (line.length()<38) {
548                        logger.info("SHEET line has length under 38. Ignoring it.");
549                        return;
550                }
551
552                String initResName = line.substring(17,20).trim();
553                String initChainId = line.substring(21,22);
554                String initSeqNum  = line.substring(22,26).trim();
555                String initICode   = line.substring(26,27);
556                String endResName  = line.substring(28,31).trim();
557                String endChainId  = line.substring(32,33);
558                String endSeqNum   = line.substring(33,37).trim();
559                String endICode    = line.substring(37,38);
560
561                //System.out.println(initResName + " " + initChainId + " " + initSeqNum + " " + initICode + " " +
562                //        endResName + " " + endChainId + " " + endSeqNum + " " + endICode);
563
564                Map<String,String> m = new HashMap<String,String>();
565
566                m.put("initResName",initResName);
567                m.put("initChainId", initChainId);
568                m.put("initSeqNum", initSeqNum);
569                m.put("initICode", initICode);
570                m.put("endResName", endResName);
571                m.put("endChainId", endChainId);
572                m.put("endSeqNum",endSeqNum);
573                m.put("endICode",endICode);
574
575                strandList.add(m);
576        }
577
578
579        /**
580         * Handler for TURN lines
581         * <pre>
582         * COLUMNS      DATA TYPE        FIELD         DEFINITION
583         * --------------------------------------------------------------------
584         *  1 -  6      Record name      "TURN "
585         *  8 - 10      Integer          seq           Turn number; starts with 1 and
586         *                                             increments by one.
587         * 12 - 14      LString(3)       turnId        Turn identifier
588         * 16 - 18      Residue name     initResName   Residue name of initial residue in
589         *                                             turn.
590         * 20           Character        initChainId   Chain identifier for the chain
591         *                                             containing this turn.
592         * 21 - 24      Integer          initSeqNum    Sequence number of initial residue
593         *                                             in turn.
594         * 25           AChar            initICode     Insertion code of initial residue
595         *                                             in turn.
596         * 27 - 29      Residue name     endResName    Residue name of terminal residue
597         *                                             of turn.
598         * 31           Character        endChainId    Chain identifier for the chain
599         *                                             containing this turn.
600         * 32 - 35      Integer          endSeqNum     Sequence number of terminal
601         *                                             residue of turn.
602         * 36           AChar            endICode      Insertion code of terminal residue
603         *                                             of turn.
604         * 41 - 70      String           comment       Associated comment.
605         * </pre>
606         * @param line
607         */
608        private void pdb_TURN_Handler( String line){
609
610                if (params.isHeaderOnly()) return;
611
612                if (line.length()<36) {
613                        logger.info("TURN line has length under 36. Ignoring it.");
614                        return;
615                }
616
617                String initResName = line.substring(15,18).trim();
618                String initChainId = line.substring(19,20);
619                String initSeqNum  = line.substring(20,24).trim();
620                String initICode   = line.substring(24,25);
621                String endResName  = line.substring(26,29).trim();
622                String endChainId  = line.substring(30,31);
623                String endSeqNum   = line.substring(31,35).trim();
624                String endICode    = line.substring(35,36);
625
626                //System.out.println(initResName + " " + initChainId + " " + initSeqNum + " " + initICode + " " +
627                //        endResName + " " + endChainId + " " + endSeqNum + " " + endICode);
628
629                Map<String,String> m = new HashMap<String,String>();
630
631                m.put("initResName",initResName);
632                m.put("initChainId", initChainId);
633                m.put("initSeqNum", initSeqNum);
634                m.put("initICode", initICode);
635                m.put("endResName", endResName);
636                m.put("endChainId", endChainId);
637                m.put("endSeqNum",endSeqNum);
638                m.put("endICode",endICode);
639
640                turnList.add(m);
641        }
642
643        /**
644         * Handler for
645         * REVDAT Record format:
646         * <pre>
647         *
648         * COLUMNS       DATA TYPE      FIELD         DEFINITION
649         * ----------------------------------------------------------------------------------
650         * 1 -  6       Record name    "REVDAT"
651         * 8 - 10       Integer        modNum        Modification number.
652         * 11 - 12       Continuation   continuation  Allows concatenation of multiple
653         * records.
654         * 14 - 22       Date           modDate       Date of modification (or release for
655         * new entries).  This is not repeated
656         * on continuation lines.
657         * 24 - 28       String(5)      modId         Identifies this particular
658         * modification.  It links to the
659         * archive used internally by PDB.
660         * This is not repeated on continuation
661         * lines.
662         * 32            Integer        modType       An integer identifying the type of
663         * modification.  In case of revisions
664         * with more than one possible modType,
665         * the highest value applicable will be
666         * assigned.
667         * 40 - 45       LString(6)     record        Name of the modified record.
668         * 47 - 52       LString(6)     record        Name of the modified record.
669         * 54 - 59       LString(6)     record        Name of the modified record.
670         * 61 - 66       LString(6)     record        Name of the modified record.
671         * </pre>
672         */
673        private void pdb_REVDAT_Handler(String line) {
674
675                // keep the first as latest modified date and the last as release date
676                Date modDate = pdbHeader.getModDate();
677
678                if ( modDate==null || modDate.equals(new Date(0)) ) {
679
680                        // modified date is still uninitialized
681                        String modificationDate = line.substring (13, 22).trim() ;
682
683                        try {
684                                Date dep = dateFormat.parse(modificationDate);
685                                pdbHeader.setModDate(dep);
686                                pdbHeader.setRelDate(dep);
687                        } catch (ParseException e){
688                                logger.info("Could not parse revision date string '"+modificationDate+"'. ");
689                        }
690
691                } else {
692
693                        // set as the release date
694                        String releaseDate = line.substring (13, 22).trim() ;
695
696                        try {
697                                Date dep = dateFormat.parse(releaseDate);
698                                pdbHeader.setRelDate(dep);
699                        } catch (ParseException e){
700                                logger.info("Could not parse revision date string '"+releaseDate+"'. ");
701                        }
702                }
703        }
704
705        /**
706         * Handler for
707         * SEQRES record format
708         * SEQRES records contain the amino acid or nucleic acid sequence of residues in each chain of the macromolecule that was studied.
709         * <p>
710         * Record Format:
711         * <p>
712         * <pre>
713         * COLUMNS        DATA TYPE       FIELD         DEFINITION
714         * ---------------------------------------------------------------------------------
715         * 1 -  6        Record name     "SEQRES"
716         * 9 - 10        Integer         serNum        Serial number of the SEQRES record
717         * for the current chain.  Starts at 1
718         * and increments by one each line.
719         * Reset to 1 for each chain.
720         * 12             Character       chainID       Chain identifier.  This may be any
721         * single legal character, including a
722         * blank which is used if there is
723         * only one chain.
724         * 14 - 17        Integer         numRes        Number of residues in the chain.
725         * This value is repeated on every
726         * record.
727         * 20 - 22        Residue name    resName       Residue name.
728         * 24 - 26        Residue name    resName       Residue name.
729         * 28 - 30        Residue name    resName       Residue name.
730         * 32 - 34        Residue name    resName       Residue name.
731         * 36 - 38        Residue name    resName       Residue name.
732         * 40 - 42        Residue name    resName       Residue name.
733         * 44 - 46        Residue name    resName       Residue name.
734         * 48 - 50        Residue name    resName       Residue name.
735         * 52 - 54        Residue name    resName       Residue name.
736         * 56 - 58        Residue name    resName       Residue name.
737         * 60 - 62        Residue name    resName       Residue name.
738         * 64 - 66        Residue name    resName       Residue name.
739         * 68 - 70        Residue name    resName       Residue name.
740         * </pre>
741         * @author Jules Jacobsen
742         */
743        private void pdb_SEQRES_Handler(String line) {
744
745                /*
746                 *          1         2         3         4         5         6         7
747                 * 1234567890123456789012345678901234567890123456789012345678901234567890
748                 * SEQRES   1 A  376  LYS PRO VAL THR VAL LYS LEU VAL ASP SER GLN ALA THR
749                 * SEQRES   1 A   21  GLY ILE VAL GLU GLN CYS CYS THR SER ILE CYS SER LEU
750                 * SEQRES   2 A   21  TYR GLN LEU GLU ASN TYR CYS ASN
751                 * SEQRES   1 B   30  PHE VAL ASN GLN HIS LEU CYS GLY SER HIS LEU VAL GLU
752                 * SEQRES   2 B   30  ALA LEU TYR LEU VAL CYS GLY GLU ARG GLY PHE PHE TYR
753                 * SEQRES   3 B   30  THR PRO LYS ALA
754                 * SEQRES   1 C   21  GLY ILE VAL GLU GLN CYS CYS THR SER ILE CYS SER LEU
755                 * SEQRES   2 C   21  TYR GLN LEU GLU ASN TYR CYS ASN
756                 * SEQRES   1 D   30  PHE VAL ASN GLN HIS LEU CYS GLY SER HIS LEU VAL GLU
757                 * SEQRES   2 D   30  ALA LEU TYR LEU VAL CYS GLY GLU ARG GLY PHE PHE TYR
758                 * SEQRES   3 D   30  THR PRO LYS ALA
759                 */
760
761                String recordName = line.substring(0, 6).trim();
762                String chainID    = line.substring(11, 12);
763                String newLength   = line.substring(13,17).trim();
764                String subSequence = line.substring(18);
765
766                if ( lengthCheck == -1 ){
767                        lengthCheck = Integer.parseInt(newLength);
768                }
769
770                StringTokenizer subSequenceResidues = new StringTokenizer(subSequence);
771
772                Character aminoCode1 = null;
773                if (! recordName.equals(AminoAcid.SEQRESRECORD)) {
774                        // should not have been called
775                        return;
776                }
777
778                currentChain = isKnownChain(chainID, seqResChains);
779                if ( currentChain == null) {
780
781                        currentChain = new ChainImpl();
782                        currentChain.setId(chainID);
783                        currentChain.setName(chainID);
784
785                }
786
787                while (subSequenceResidues.hasMoreTokens()) {
788
789                        String threeLetter = subSequenceResidues.nextToken();
790
791                        aminoCode1 = StructureTools.get1LetterCode(threeLetter);
792
793                        //if (aminoCode1 == null) {
794                        // could be a nucleotide...
795                        // but getNewGroup takes care of that and converts ATOM records with aminoCode1 == nnull to nucleotide...
796                        //}
797                        currentGroup = getNewGroup("ATOM", aminoCode1, threeLetter);
798
799                        currentGroup.setPDBName(threeLetter);
800
801                        if ( currentGroup instanceof AminoAcid){
802                                AminoAcid aa = (AminoAcid)currentGroup;
803                                aa.setRecordType(AminoAcid.SEQRESRECORD);
804                        }
805                        // add the current resNum to the new chain.
806                        currentChain.addGroup(currentGroup);
807
808                }
809                Chain test = isKnownChain(chainID, seqResChains);
810
811                if ( test == null)
812                        seqResChains.add(currentChain);
813
814                if (currentGroup != null)
815                        currentGroup.trimToSize();
816
817                currentGroup = null;
818                currentChain = null;
819
820                //               the current chain is finished!
821                //if ( current_chain.getLength() != lengthCheck ){
822                //      System.err.println("the length of chain " + current_chain.getName() + "(" +
823                //                      current_chain.getLength() + ") does not match the expected " + lengthCheck);
824                //}
825
826                lengthCheck = Integer.parseInt(newLength);
827
828        }
829
830
831
832        /**
833         * Handler for
834         * TITLE Record Format
835         * <pre>
836         COLUMNS        DATA TYPE       FIELD          DEFINITION
837         ----------------------------------------------------------------------------------
838         1 -  6        Record name     "TITLE "
839         9 - 10        Continuation    continuation   Allows concatenation of multiple
840         records.
841         11 - 70        String          title          Title of the experiment.
842         * </pre>
843         *
844         */
845        private void pdb_TITLE_Handler(String line) {
846                String title;
847                if ( line.length() > 79)
848                        title = line.substring(10,80).trim();
849                else
850                        title = line.substring(10,line.length()).trim();
851
852                String t = pdbHeader.getTitle();
853                if ( (t != null) && (! t.equals("")) ){
854                        if (t.endsWith("-"))
855                                t += ""; // if last line ends with a hyphen then we don't add space
856                        else
857                                t += " ";
858                }
859                else t = "";
860
861                t += title;
862
863                pdbHeader.setTitle(t);
864        }
865
866        /**
867         * JRNL handler.
868         * The JRNL record contains the primary literature citation that describes the experiment which resulted
869         * in the deposited coordinate set. There is at most one JRNL reference per entry. If there is no primary
870         * reference, then there is no JRNL reference. Other references are given in REMARK 1.
871         *
872         * Record Format
873         * <pre>
874         * COLUMNS       DATA TYPE     FIELD         DEFINITION
875         * -----------------------------------------------------------------------
876         * 1 -  6       Record name   "JRNL  "
877         *
878         * 13 - 70       LString        text         See Details below.
879         * </pre>
880         */
881        private void pdb_JRNL_Handler(String line) {
882                //add the strings to the journalLines
883                //the actual JournalArticle is then built when the whole entry is being
884                //finalized with triggerEndFileChecks()
885                //JRNL        TITL   NMR SOLUTION STRUCTURE OF RECOMBINANT TICK           1TAP  10
886                if (line.substring(line.length() - 8, line.length() - 4).equals(pdbId)) {
887                        //trim off the trailing PDB id from legacy files.
888                        //are we really trying to still cater for these museum pieces?
889
890                        logger.debug("trimming legacy PDB id from end of JRNL section line");
891
892                        line = line.substring(0, line.length() - 8);
893                        journalLines.add(line);
894                } else {
895                        journalLines.add(line);
896                }
897        }
898
899        /**
         * This should not be accessed directly, other than by <code>makeCompounds</code>. It still deals with the same
         * lines in a similar manner but if not accessed from <code>makeCompounds</code> the last element will be
902         * missing. Don't say I didn't warn you.
903         *
904         * @param line
905         */
906        private void pdb_COMPND_Handler(String line) {
907
908                logger.debug("previousContinuationField  is "
909                                        + previousContinuationField);
910                logger.debug("current continuationField  is "
911                                        + continuationField);
912                logger.debug("current continuationString is "
913                                        + continuationString);
914                logger.debug("current compound           is "
915                                        + current_compound);
916
917
918                // In legacy PDB files the line ends with the PDB code and a serial number, chop those off!
919                //format version 3.0 onwards will have 80 characters in a line
920                //              if (line.length() > 72) {
921                if (isLegacyFormat) {
922                        //                    if (DEBUG) {
923                        //                        System.out.println("We have a legacy file - truncating line length to 71 characters:");
924                        //                        System.out.println(line);
925                        //                    }
926                        line = line.substring(0, 72);
927                }
928
929                line = line.substring(10, line.length());
930
931
932                String[] fieldList = line.trim().split("\\s+");
933                int fl = fieldList.length;
934                if ((fl >0 ) && compndFieldValues.contains(fieldList[0])) {
935
936                        continuationField = fieldList[0];
937                        if (previousContinuationField.equals("")) {
938                                previousContinuationField = continuationField;
939                        }
940
941                } else if (fl>0) {
942                        // the ':' character indicates the end of a field name and should be invalid as part the first data token
943                        // e.g. obsolete file 1hhb has a malformed COMPND line that can only be caught with this kind of check
944                        if (fieldList[0].contains(":") ) {
945                                logger.info("COMPND line does not follow the PDB 3.0 format. Note that COMPND parsing is not supported any longer in format 2.3 or earlier");
946                                return;
947                        }
948
949                } else {
950
951                        // the line will be added as data to the previous field
952                }
953
954                line = line.replace(continuationField, "").trim();
955
956                StringTokenizer compndTokens = new StringTokenizer(line);
957
958                //              System.out.println("PDBFileParser.pdb_COMPND_Handler: Tokenizing '" + line + "'");
959
960                while (compndTokens.hasMoreTokens()) {
961                        String token = compndTokens.nextToken();
962
963                        if (previousContinuationField.equals("")) {
964                                previousContinuationField = continuationField;
965                        }
966
967                        if (previousContinuationField.equals(continuationField)
968                                        && compndFieldValues.contains(continuationField)) {
969
970                                logger.debug("Still in field " + continuationField);
971                                logger.debug("token = " + token);
972
973                                continuationString = continuationString.concat(token + " ");
974
975                                logger.debug("continuationString = "
976                                                        + continuationString);
977
978                        }
979                        if (!continuationField.equals(previousContinuationField)) {
980
981                                if (continuationString.equals("")) {
982                                        continuationString = token;
983
984                                } else {
985
986                                        compndValueSetter(previousContinuationField,
987                                                        continuationString);
988                                        previousContinuationField = continuationField;
989                                        continuationString = token + " ";
990                                }
991                        } else if (ignoreCompndFieldValues.contains(token)) {
992                                // this field shall be ignored
993                                //continuationField = token;
994                        }
995                }
996                if (isLastCompndLine) {
997                        // final line in the section - finish off the compound
998                        //                      System.out.println("[pdb_COMPND_Handler] Final COMPND line - Finishing off final MolID header.");
999                        compndValueSetter(continuationField, continuationString);
1000                        continuationString = "";
1001                        if (current_compound!=null) entities.add(current_compound);
1002                }
1003        }
1004
1005        /**
1006         * Set the value in the current molId object
1007         * @param field
1008         * @param value
1009         */
1010        private void compndValueSetter(String field, String value) {
1011
1012                value = value.trim().replace(";", "");
1013                if (field.equals("MOL_ID:")) {
1014
1015                        int i = -1;
1016                        try {
1017                                i = Integer.valueOf(value);
1018                        } catch (NumberFormatException e){
1019                                logger.warn("Value '{}' does not look like a number, while trying to parse COMPND MOL_ID line.",value);
1020                        }
1021                        if (i>0 && prevMolId!=i) {
1022
1023                                if (current_compound!=null) entities.add(current_compound);
1024
1025                                logger.debug("Initialising new Compound with mol_id {}", i);
1026
1027                                current_compound = new EntityInfo();
1028
1029                                current_compound.setMolId(i);
1030
1031                                // we will set polymer for all defined compounds in PDB file (non-polymer compounds are not defined in header) - JD 2016-03-25
1032                                current_compound.setType(EntityType.POLYMER);
1033
1034                                prevMolId = i;
1035                        }
1036
1037                }
1038
1039                // if for some reason (e.g. missing mol_id line) the current_compound is null we can't add anything to it, return
1040                if (current_compound==null) {
1041                        return;
1042                }
1043
1044                if (field.equals("MOLECULE:")) {
1045                        current_compound.setDescription(value);
1046
1047                }
1048                if (field.equals("CHAIN:")) {
1049                        //System.out.println(value);
1050                        StringTokenizer chainTokens = new StringTokenizer(value, ",");
1051                        List<String> chains = new ArrayList<String>();
1052
1053                        while (chainTokens.hasMoreTokens()) {
1054                                String chainID = chainTokens.nextToken().trim();
1055                                // NULL is used in old PDB files to represent empty chain DI
1056                                if (chainID.equals("NULL"))
1057                                        chainID = " ";
1058                                chains.add(chainID);
1059                        }
1060                        compoundMolIds2chainIds.put(current_compound.getMolId(),chains);
1061
1062                }
1063                if (field.equals("SYNONYM:")) {
1064
1065                        StringTokenizer synonyms = new StringTokenizer(value, ",");
1066                        List<String> names = new ArrayList<String>();
1067
1068                        while (synonyms.hasMoreTokens()) {
1069                                names.add(synonyms.nextToken());
1070
1071                                current_compound.setSynonyms(names);
1072                        }
1073
1074                }
1075
1076                if (field.equals("EC:")) {
1077
1078                        StringTokenizer ecNumTokens = new StringTokenizer(value, ",");
1079                        List<String> ecNums = new ArrayList<String>();
1080
1081                        while (ecNumTokens.hasMoreTokens()) {
1082                                ecNums.add(ecNumTokens.nextToken());
1083
1084                                current_compound.setEcNums(ecNums);
1085                        }
1086
1087                }
1088                if (field.equals("FRAGMENT:")) {
1089
1090                        current_compound.setFragment(value);
1091
1092                }
1093                if (field.equals("ENGINEERED:")) {
1094
1095                        current_compound.setEngineered(value);
1096
1097                }
1098                if (field.equals("MUTATION:")) {
1099
1100                        current_compound.setMutation(value);
1101
1102                }
1103                if (field.equals("BIOLOGICAL_UNIT:")) {
1104
1105                        current_compound.setBiologicalUnit(value);
1106
1107                }
1108                if (field.equals("OTHER_DETAILS:")) {
1109
1110                        current_compound.setDetails(value);
1111
1112                }
1113
1114        }
1115
1116
1117        /**
1118         * Handler for
1119         * SOURCE Record format
1120         *
1121         * The SOURCE record specifies the biological and/or chemical source of each biological molecule in the entry. Sources are described by both the common name and the scientific name, e.g., genus and species. Strain and/or cell-line for immortalized cells are given when they help to uniquely identify the biological entity studied.
1122         * Record Format
1123         * <pre>
1124         * COLUMNS   DATA TYPE         FIELD          DEFINITION
1125         * -------------------------------------------------------------------------------
1126         *  1 -  6   Record name       "SOURCE"
1127         *  9 - 10   Continuation      continuation   Allows concatenation of multiple records.
1128         * 11 - 70   Specification     srcName        Identifies the source of the macromolecule in
1129         *            list                            a token: value format.
1130         * </pre>
1131         * @param line the line to be parsed
1132         */
1133        private void pdb_SOURCE_Handler(String line) {
1134                // works in the same way as the pdb_COMPND_Handler.
1135                String continuationNr = line.substring(9, 10).trim();
1136
1137
1138
1139                logger.debug("current continuationNo     is "
1140                                + continuationNr);
1141                logger.debug("previousContinuationField  is "
1142                                + previousContinuationField);
1143                logger.debug("current continuationField  is "
1144                                + continuationField);
1145                logger.debug("current continuationString is "
1146                                + continuationString);
1147                logger.debug("current compound           is "
1148                                + current_compound);
1149
1150
1151                // following the docs, the last valid character should be 79, chop off the rest
1152                if (line.length() > 79) {
1153                        line = line.substring(0, 79);
1154                }
1155
1156                line = line.substring(10, line.length());
1157
1158                logger.debug("LINE: >" + line + "<");
1159
1160                String[] fieldList = line.split("\\s+");
1161
1162                if (!fieldList[0].equals("")
1163                                && sourceFieldValues.contains(fieldList[0])) {
1164                        //                      System.out.println("[PDBFileParser.pdb_COMPND_Handler] Setting continuationField to '" + fieldList[0] + "'");
1165                        continuationField = fieldList[0];
1166                        if (previousContinuationField.equals("")) {
1167                                previousContinuationField = continuationField;
1168                        }
1169
1170                } else if ((fieldList.length > 1) && ( sourceFieldValues.contains(fieldList[1]))) {
1171                        //                      System.out.println("[PDBFileParser.pdb_COMPND_Handler] Setting continuationField to '" + fieldList[1] + "'");
1172                        continuationField = fieldList[1];
1173                        if (previousContinuationField.equals("")) {
1174                                previousContinuationField = continuationField;
1175                        }
1176
1177                } else {
1178                        if (continuationNr.equals("")) {
1179
1180                                logger.debug("looks like an old PDB file");
1181
1182                                continuationField = "MOLECULE:";
1183                                if (previousContinuationField.equals("")) {
1184                                        previousContinuationField = continuationField;
1185                                }
1186                        }
1187
1188                }
1189
1190                line = line.replace(continuationField, "").trim();
1191
1192                StringTokenizer compndTokens = new StringTokenizer(line);
1193
1194                //              System.out.println("PDBFileParser.pdb_COMPND_Handler: Tokenizing '" + line + "'");
1195
1196                while (compndTokens.hasMoreTokens()) {
1197                        String token = compndTokens.nextToken();
1198
1199                        if (previousContinuationField.equals("")) {
1200                                //                              System.out.println("previousContinuationField is empty. Setting to : " + continuationField);
1201                                previousContinuationField = continuationField;
1202                        }
1203
1204                        if (previousContinuationField.equals(continuationField)
1205                                        && sourceFieldValues.contains(continuationField)) {
1206
1207                                logger.debug("Still in field " + continuationField);
1208
1209                                continuationString = continuationString.concat(token + " ");
1210
1211                                logger.debug("continuationString = "
1212                                                        + continuationString);
1213                        }
1214                        if (!continuationField.equals(previousContinuationField)) {
1215
1216                                if (continuationString.equals("")) {
1217                                        continuationString = token;
1218
1219                                } else {
1220
1221                                        sourceValueSetter(previousContinuationField,
1222                                                        continuationString);
1223                                        previousContinuationField = continuationField;
1224                                        continuationString = token + " ";
1225                                }
1226                        } else if (ignoreCompndFieldValues.contains(token)) {
1227                                // this field shall be ignored
1228                                //continuationField = token;
1229                        }
1230                }
1231                if (isLastSourceLine) {
1232                        // final line in the section - finish off the compound
1233                        //                      System.out.println("[pdb_SOURCE_Handler] Final SOURCE line - Finishing off final MolID header.");
1234                        sourceValueSetter(continuationField, continuationString);
1235                        continuationString = "";
1236                        //compounds.add(current_compound);
1237                }
1238
1239        }
1240
1241
1242        /**
1243         * Set the value in the current molId object
1244         *
1245         * @param field
1246         * @param value
1247         */
1248        private void sourceValueSetter(String field, String value) {
1249
1250                value = value.trim().replace(";", "");
1251                //              System.out.println("[sourceValueSetter] " + field);
1252                if (field.equals("MOL_ID:")) {
1253
1254                        try {
1255                                current_compound = entities.get(Integer.valueOf(value) - 1);
1256                        } catch (NumberFormatException e){
1257                                logger.info("could not process SOURCE MOL_ID record correctly:" + e.getMessage());
1258                                return;
1259                        }
1260
1261
1262                        //                      System.out.println("[sourceValueSetter] Fetching compound " + value + " " + current_compound.getMolId());
1263
1264                }
1265                if (field.equals("SYNTHETIC:")) {
1266                        current_compound.setSynthetic(value);
1267                } else if (field.equals("FRAGMENT:")) {
1268                        current_compound.setFragment(value);
1269                } else if (field.equals("ORGANISM_SCIENTIFIC:")) {
1270                        current_compound.setOrganismScientific(value);
1271                } else if (field.equals("ORGANISM_TAXID:")) {
1272                        current_compound.setOrganismTaxId(value);
1273                } else if (field.equals("ORGANISM_COMMON:")) {
1274                        current_compound.setOrganismCommon(value);
1275                } else if (field.equals("STRAIN:")) {
1276                        current_compound.setStrain(value);
1277                } else if (field.equals("VARIANT:")) {
1278                        current_compound.setVariant(value);
1279                } else if (field.equals("CELL_LINE:")) {
1280                        current_compound.setCellLine(value);
1281                } else if (field.equals("ATCC:")) {
1282                        current_compound.setAtcc(value);
1283                } else if (field.equals("ORGAN:")) {
1284                        current_compound.setOrgan(value);
1285                } else if (field.equals("TISSUE:")) {
1286                        current_compound.setTissue(value);
1287                } else if (field.equals("CELL:")) {
1288                        current_compound.setCell(value);
1289                } else if (field.equals("ORGANELLE:")) {
1290                        current_compound.setOrganelle(value);
1291                } else if (field.equals("SECRETION:")) {
1292                        current_compound.setSecretion(value);
1293                } else if (field.equals("GENE:")) {
1294                        current_compound.setGene(value);
1295                } else if (field.equals("CELLULAR_LOCATION:")) {
1296                        current_compound.setCellularLocation(value);
1297                } else if (field.equals("EXPRESSION_SYSTEM:")) {
1298                        current_compound.setExpressionSystem(value);
1299                } else if (field.equals("EXPRESSION_SYSTEM_TAXID:")) {
1300                        current_compound.setExpressionSystemTaxId(value);
1301                } else if (field.equals("EXPRESSION_SYSTEM_STRAIN:")) {
1302                        current_compound.setExpressionSystemStrain(value);
1303                } else if (field.equals("EXPRESSION_SYSTEM_VARIANT:")) {
1304                        current_compound.setExpressionSystemVariant(value);
1305                } else if (field.equals("EXPRESSION_SYSTEM_CELL_LINE:")) {
1306                        current_compound.setExpressionSystemCellLine(value);
1307                } else if (field.equals("EXPRESSION_SYSTEM_ATCC_NUMBER:")) {
1308                        current_compound.setExpressionSystemAtccNumber(value);
1309                } else if (field.equals("EXPRESSION_SYSTEM_ORGAN:")) {
1310                        current_compound.setExpressionSystemOrgan(value);
1311                } else if (field.equals("EXPRESSION_SYSTEM_TISSUE:")) {
1312                        current_compound.setExpressionSystemTissue(value);
1313                } else if (field.equals("EXPRESSION_SYSTEM_CELL:")) {
1314                        current_compound.setExpressionSystemCell(value);
1315                } else if (field.equals("EXPRESSION_SYSTEM_ORGANELLE:")) {
1316                        current_compound.setExpressionSystemOrganelle(value);
1317                } else if (field.equals("EXPRESSION_SYSTEM_CELLULAR_LOCATION:")) {
1318                        current_compound.setExpressionSystemCellularLocation(value);
1319                } else if (field.equals("EXPRESSION_SYSTEM_VECTOR_TYPE:")) {
1320                        current_compound.setExpressionSystemVectorType(value);
1321                } else if (field.equals("EXPRESSION_SYSTEM_VECTOR:")) {
1322                        current_compound.setExpressionSystemVector(value);
1323                } else if (field.equals("EXPRESSION_SYSTEM_PLASMID:")) {
1324                        current_compound.setExpressionSystemPlasmid(value);
1325                } else if (field.equals("EXPRESSION_SYSTEM_GENE:")) {
1326                        current_compound.setExpressionSystemGene(value);
1327                } else if (field.equals("OTHER_DETAILS:")) {
1328                        current_compound.setExpressionSystemOtherDetails(value);
1329                }
1330
1331        }
1332
1333        /**
1334         * Handler for REMARK lines
1335         */
1336        private void pdb_REMARK_Handler(String line) {
1337
1338                if ( line == null || line.length() < 11)
1339                        return;
1340
1341
1342                if (line.startsWith("REMARK 800")) {
1343                        pdb_REMARK_800_Handler(line);
1344
1345                }  else if ( line.startsWith("REMARK 350")){
1346
1347                        if ( params.isParseBioAssembly()) {
1348
1349                                if (bioAssemblyParser == null){
1350                                        bioAssemblyParser = new PDBBioAssemblyParser();
1351                                }
1352
1353                                bioAssemblyParser.pdb_REMARK_350_Handler(line);
1354                        }
1355
1356                // REMARK 3 (for R free)
1357                // note: if more than 1 value present (occurring in hybrid experimental technique entries, e.g. 3ins, 4n9m)
1358                // then last one encountered will be taken
1359                } else if (line.startsWith("REMARK   3   FREE R VALUE")) {
1360
1361                        // Rfree annotation is not very consistent in PDB format, it varies depending on the software
1362                        // Here we follow this strategy:
1363                        // a) take the '(NO CUTOFF)' value if the only one available (shelx software, e.g. 1x7q)
1364                        // b) don't take it if also a line without '(NO CUTOFF)' is present (CNX software, e.g. 3lak)
1365
1366                        Pattern pR = Pattern.compile("^REMARK   3   FREE R VALUE\\s+(?:\\(NO CUTOFF\\))?\\s+:\\s+(\\d?\\.\\d+).*");
1367                        Matcher mR = pR.matcher(line);
1368                        if (mR.matches()) {
1369                                try {
1370                                        rfreeNoCutoffLine = Float.parseFloat(mR.group(1));
1371                                } catch (NumberFormatException e) {
1372                                        logger.info("Rfree value "+mR.group(1)+" does not look like a number, will ignore it");
1373                                }
1374                        }
1375                        pR = Pattern.compile("^REMARK   3   FREE R VALUE\\s+:\\s+(\\d?\\.\\d+).*");
1376                        mR = pR.matcher(line);
1377                        if (mR.matches()) {
1378                                try {
1379                                        rfreeStandardLine = Float.parseFloat(mR.group(1));
1380                                } catch (NumberFormatException e) {
1381                                        logger.info("Rfree value '{}' does not look like a number, will ignore it", mR.group(1));
1382                                }
1383                        }
1384
1385                // REMARK 3 RESOLUTION (contains more info than REMARK 2, for instance multiple resolutions in hybrid experimental technique entries)
1386                // note: if more than 1 value present (occurring in hybrid experimental technique entries, e.g. 3ins, 4n9m)
1387                // then last one encountered will be taken
1388                } else if (line.startsWith("REMARK   3   RESOLUTION RANGE HIGH")){
1389                        Pattern pR = Pattern.compile("^REMARK   3   RESOLUTION RANGE HIGH \\(ANGSTROMS\\) :\\s+(\\d+\\.\\d+).*");
1390                        Matcher mR = pR.matcher(line);
1391                        if (mR.matches()) {
1392                                try {
1393                                        float res = Float.parseFloat(mR.group(1));
1394                                        if (pdbHeader.getResolution()!=PDBHeader.DEFAULT_RESOLUTION) {
1395                                                logger.warn("More than 1 resolution value present, will use last one {} and discard previous {} "
1396                                                                ,mR.group(1), String.format("%4.2f",pdbHeader.getResolution()));
1397                                        }
1398                                        pdbHeader.setResolution(res);
1399                                } catch (NumberFormatException e) {
1400                                        logger.info("Could not parse resolution '{}', ignoring it",mR.group(1));
1401                                }
1402                        }
1403                }
1404
1405        }
1406
1407
1408
1409
1410
1411
1412        /**
1413         * Handler for
1414         * EXPDTA Record Format
1415        <pre>
1416         COLUMNS       DATA TYPE      FIELD         DEFINITION
1417         -------------------------------------------------------------------------------
1418         1 -  6       Record name    "EXPDTA"
1419         9 - 10       Continuation   continuation  Allows concatenation of multiple
1420         records.
1421         11 - 70       SList          technique     The experimental technique(s) with
1422         optional comment describing the
1423         sample or experiment.
1424
1425         allowed techniques are:
1426         ELECTRON DIFFRACTION
1427         FIBER DIFFRACTION
1428         FLUORESCENCE TRANSFER
1429         NEUTRON DIFFRACTION
1430         NMR
1431         THEORETICAL MODEL
1432         X-RAY DIFFRACTION
1433        </pre>
1434         */
1435        private void pdb_EXPDTA_Handler(String line) {
1436
1437                String technique  ;
1438                if (line.length() > 69)
1439                        technique = line.substring (10, 70).trim() ;
1440                else
1441                        technique = line.substring(10).trim();
1442
1443                for (String singleTechnique: technique.split(";\\s+")) {
1444                        pdbHeader.setExperimentalTechnique(singleTechnique);
1445                }
1446
1447
1448        }
1449
1450        /**
1451         * Handler for
1452         * CRYST1 Record Format
1453         * The CRYST1 record presents the unit cell parameters, space group, and Z value.
1454         * If the entry describes a structure determined by a technique other than X-ray crystallography,
1455         * CRYST1 contains a = b = c = 1.0, alpha = beta = gamma = 90 degrees, space group = P 1, and Z =1.
1456         * <pre>
1457         * COLUMNS DATA TYPE    FIELD          DEFINITION
1458         * -------------------------------------------------------------
1459         *  1 - 6  Record name  "CRYST1"
1460         *  7 - 15 Real(9.3)    a              a (Angstroms).
1461         * 16 - 24 Real(9.3)    b              b (Angstroms).
1462         * 25 - 33 Real(9.3)    c              c (Angstroms).
1463         * 34 - 40 Real(7.2)    alpha          alpha (degrees).
1464         * 41 - 47 Real(7.2)    beta           beta (degrees).
1465         * 48 - 54 Real(7.2)    gamma          gamma (degrees).
1466         * 56 - 66 LString      sGroup         Space group.
1467         * 67 - 70 Integer      z              Z value.
1468         * </pre>
1469         */
1470        private void pdb_CRYST1_Handler(String line) {
1471                // for badly formatted files (e.g. phenix-produced ones), there's no z and the min length is 58 (e.g. for SG 'P 1')
1472                if (line.length() < 58) {
1473                        logger.warn("CRYST1 record has fewer than 58 columns: will ignore it");
1474                        return;
1475                }
1476
1477                float a;
1478                float b;
1479                float c;
1480                float alpha;
1481                float beta;
1482                float gamma;
1483                String spaceGroup = "";
1484
1485                try {
1486                        a = Float.parseFloat(line.substring(6,15).trim());
1487                        b = Float.parseFloat(line.substring(15,24).trim());
1488                        c = Float.parseFloat(line.substring(24,33).trim());
1489                        alpha = Float.parseFloat(line.substring(33,40).trim());
1490                        beta = Float.parseFloat(line.substring(40,47).trim());
1491                        gamma = Float.parseFloat(line.substring(47,54).trim());
1492                } catch (NumberFormatException e) {
1493                        logger.info("could not parse CRYST1 record ("+e.getMessage()+") from line and ignoring it " + line);
1494                        return ;
1495                }
1496                if (line.length()>=66) {
1497                        // for well formatted files
1498                        spaceGroup = line.substring(55,66).trim();
1499                } else {
1500                        // for not-so-well formatted files, e.g. phenix-produced ones: they lack a Z value
1501                        spaceGroup = line.substring(55,line.length()).trim();
1502                }
1503
1504                CrystalCell xtalCell = new CrystalCell();
1505                xtalCell.setA(a);
1506                xtalCell.setB(b);
1507                xtalCell.setC(c);
1508                xtalCell.setAlpha(alpha);
1509                xtalCell.setBeta(beta);
1510                xtalCell.setGamma(gamma);
1511
1512                if (!xtalCell.isCellReasonable()) {
1513                        // If the entry describes a structure determined by a technique other than X-ray crystallography,
1514                        // CRYST1 contains a = b = c = 1.0, alpha = beta = gamma = 90 degrees, space group = P 1, and Z =1.
1515                        // if so we don't add the crystal cell and it remains null
1516                        logger.debug("The crystal cell read from file does not have reasonable dimensions (at least one dimension is below {}), discarding it.",
1517                                        CrystalCell.MIN_VALID_CELL_SIZE);
1518                } else {
1519                        crystallographicInfo.setCrystalCell(xtalCell);
1520                }
1521
1522                SpaceGroup sg = SymoplibParser.getSpaceGroup(spaceGroup);
1523                if (sg==null) {
1524                        logger.warn("Space group '"+spaceGroup+"' not recognised as a standard space group");
1525                        crystallographicInfo.setNonStandardSg(true);
1526                } else {
1527                        crystallographicInfo.setSpaceGroup(sg);
1528                        crystallographicInfo.setNonStandardSg(false);
1529                }
1530        }
1531
1532        /**
1533         * Handler for MTRIXn records. They specify extra NCS operators (usually in virus entries)
1534         *
1535         * See http://www.wwpdb.org/documentation/format33/sect8.html#MTRIXn
1536         * <pre>
1537         * COLUMNS        DATA TYPE     FIELD         DEFINITION
1538         * -------------------------------------------------------------
1539         *
1540         *  1 -  6        Record name   "MTRIXn"      n=1, 2, or 3
1541         *  8 - 10        Integer       serial        Serial number.
1542         * 11 - 20        Real(10.6)    m[n][1]       Mn1
1543         * 21 - 30        Real(10.6)    m[n][2]       Mn2
1544         * 31 - 40        Real(10.6)    m[n][3]       Mn3
1545         * 46 - 55        Real(10.5)    v[n]          Vn
1546         * 60             Integer       iGiven        1
1547         *
1548         * </pre>
1549         * Note that we ignore operators with iGiven==1
1550         *
1551         * @param line
1552         */
1553        private void pdb_MTRIXn_Handler(String line) {
1554
1555                // don't process incomplete records
1556                if (line.length() < 55) {
1557                        logger.info("MTRIXn record has fewer than 55 columns: will ignore it");
1558                        return;
1559                }
1560
1561
1562                try {
1563
1564                        int rowIndex = Integer.parseInt(line.substring(5,6));
1565                        double col1Value = Double.parseDouble(line.substring(10,20));
1566                        double col2Value = Double.parseDouble(line.substring(20,30));
1567                        double col3Value = Double.parseDouble(line.substring(30,40));
1568                        double translValue = Double.parseDouble(line.substring(45,55));
1569                        int iGiven = 0;
1570                        if (line.length()>=60 && !line.substring(59,60).trim().isEmpty()) {
1571                                iGiven = Integer.parseInt(line.substring(59,60));
1572                        }
1573
1574                        if (iGiven == 1) return;
1575
1576                        if (ncsOperators==null) {
1577                                // we initialise on first pass
1578                                ncsOperators = new ArrayList<Matrix4d>();
1579                        }
1580
1581                        if (currentNcsOp==null) {
1582                                currentNcsOp = new Matrix4d(1,0,0,0,  0,1,0,0,  0,0,1,0,  0,0,0,1); // initialised to identity
1583                        }
1584
1585                        currentNcsOp.setElement(rowIndex-1, 0, col1Value);
1586                        currentNcsOp.setElement(rowIndex-1, 1, col2Value);
1587                        currentNcsOp.setElement(rowIndex-1, 2, col3Value);
1588                        currentNcsOp.setElement(rowIndex-1, 3, translValue);
1589
1590
1591                        if (rowIndex==3) {
1592                                ncsOperators.add(currentNcsOp);
1593                                // we initialise for next matrix to come
1594                                currentNcsOp = new Matrix4d(1,0,0,0,  0,1,0,0,  0,0,1,0,  0,0,0,1); // initialised to identity
1595                        }
1596
1597                } catch (NumberFormatException e) {
1598                        logger.info("Could not parse a number in MTRIXn record ("+e.getMessage()+") from line: >" + line+"<");
1599                }
1600        }
1601
1602        /**
1603         * Handler for ATOM.
1604         * Record Format:
1605         *
1606         * <pre>
1607         * ATOM      1  N   ASP A  15     110.964  24.941  59.191  1.00 83.44           N
1608         *
1609         * COLUMNS        DATA TYPE       FIELD         DEFINITION
1610         * ---------------------------------------------------------------------------------
1611         * 1 -  6        Record name     "ATOM  "
1612         * 7 - 11        Integer         serial        Atom serial number.
1613         * 13 - 16        Atom            name          Atom name.
1614         * 17             Character       altLoc        Alternate location indicator.
1615         * 18 - 20        Residue name    resName       Residue name.
1616         * 22             Character       chainID       Chain identifier.
1617         * 23 - 26        Integer         resSeq        Residue sequence number.
1618         * 27             AChar           iCode         Code for insertion of residues.
1619         * 31 - 38        Real(8.3)       x             Orthogonal coordinates for X in Angstroms.
1620         * 39 - 46        Real(8.3)       y             Orthogonal coordinates for Y in Angstroms.
1621         * 47 - 54        Real(8.3)       z             Orthogonal coordinates for Z in Angstroms.
1622         * 55 - 60        Real(6.2)       occupancy     Occupancy.
1623         * 61 - 66        Real(6.2)       tempFactor    Temperature factor.
1624         * 73 - 76        LString(4)      segID         Segment identifier, left-justified.
1625         * 77 - 78        LString(2)      element       Element symbol, right-justified.
1626         * 79 - 80        LString(2)      charge        Charge on the atom.
1627         * </pre>
1628         */
1629        private void  pdb_ATOM_Handler(String line)     {
1630
1631                if ( params.isHeaderOnly())
1632                        return;
1633
1634                // let's first get the chain name which will serve to identify if we are starting a new molecule
1635                String chainName      = line.substring(21,22);
1636
1637                if (chainName.equals(" ")) {
1638                        blankChainIdsPresent = true;
1639                }
1640
1641                if (currentChain!=null && !currentChain.getName().equals(chainName)) {
1642                        // new chain name: another molecule coming
1643                        startOfMolecule = true;
1644                }
1645
1646                if (startOfMolecule) {
1647                        // we add last chain if there was one
1648                        if (currentChain!=null) {
1649                                currentModel.add(currentChain);
1650                                // let's not forget adding the last group to the finishing chain
1651                                if (currentGroup!=null) {
1652                                        currentChain.addGroup(currentGroup);
1653                                }
1654                        }
1655                        // we initialise the new molecule to come
1656                        currentChain = new ChainImpl();
1657                        // note that the chainId (asym id) is set properly later in assignAsymIds
1658                        currentChain.setId(chainName);
1659                        currentChain.setName(chainName);
1660
1661                }
1662
1663                if (startOfModel) {
1664                        // we add last model if there was one
1665                        if (currentModel!=null) {
1666                                allModels.add(currentModel);
1667                        }
1668                        // we initialise the model to come
1669                        currentModel = new ArrayList<>();
1670                }
1671
1672
1673                // let's get the residue number and see if we need to start a new group
1674
1675                String groupCode3     = line.substring(17,20).trim();
1676                String resNum  = line.substring(22,26).trim();
1677                Character iCode = line.substring(26,27).charAt(0);
1678                if ( iCode == ' ')
1679                        iCode = null;
1680                ResidueNumber residueNumber = new ResidueNumber(chainName, Integer.valueOf(resNum), iCode);
1681
1682                //recordName      groupCode3
1683                //|                |    resNum
1684                //|                |    |   iCode
1685                //|     |          | |  |   ||
1686                //ATOM      1  N   ASP A  15     110.964  24.941  59.191  1.00 83.44           N
1687                //ATOM   1964  N   ARG H 221A      5.963 -16.715  27.669  1.00 28.59           N
1688
1689                Character aminoCode1 = StructureTools.get1LetterCode(groupCode3);
1690
1691                String recordName     = line.substring (0, 6).trim ();
1692
1693                boolean isHetAtomInFile = false;
1694
1695                if (recordName.equals("HETATM") ){
1696                        // HETATOM RECORDS are treated slightly differently
1697                        // some modified amino acids that we want to treat as amino acids
1698                        // can be found as HETATOM records
1699                        if ( aminoCode1 != null && aminoCode1.equals(StructureTools.UNKNOWN_GROUP_LABEL))
1700                                        aminoCode1 = null;
1701
1702                        isHetAtomInFile = true;
1703                }
1704
1705                if ( startOfMolecule) {
1706
1707                        currentGroup = getNewGroup(recordName, aminoCode1, groupCode3);
1708
1709                        currentGroup.setPDBName(groupCode3);
1710                        currentGroup.setResidueNumber(residueNumber);
1711                        currentGroup.setHetAtomInFile(isHetAtomInFile);
1712
1713                }
1714
1715                // resetting states
1716                startOfModel = false;
1717                startOfMolecule = false;
1718
1719
1720                Character altLoc   = new Character(line.substring (16, 17).charAt(0));
1721                Group altGroup = null;
1722
1723
1724                // check if residue number is the same ...
1725                if ( ! residueNumber.equals(currentGroup.getResidueNumber())) {
1726
1727                        currentChain.addGroup(currentGroup);
1728                        currentGroup.trimToSize();
1729
1730                        currentGroup = getNewGroup(recordName, aminoCode1, groupCode3);
1731
1732                        currentGroup.setPDBName(groupCode3);
1733                        currentGroup.setResidueNumber(residueNumber);
1734                        currentGroup.setHetAtomInFile(isHetAtomInFile);
1735
1736                } else {
1737                        // same residueNumber, but altLocs...
1738
1739                        // test altLoc
1740                        if ( ! altLoc.equals(' ')) {
1741                                logger.debug("found altLoc! " + currentGroup + " " + altGroup);
1742                                altGroup = getCorrectAltLocGroup( altLoc,recordName,aminoCode1,groupCode3);
1743                                if ( altGroup.getChain() == null) {
1744                                        // need to set current chain
1745                                        altGroup.setChain(currentChain);
1746                                }
1747
1748                        }
1749                }
1750
1751                atomCount++;
1752
1753                if ( atomCount == atomCAThreshold ) {
1754                        // throw away the SEQRES lines - too much to deal with...
1755                        logger.warn("more than " + atomCAThreshold + " atoms in this structure, ignoring the SEQRES lines");
1756                        seqResChains.clear();
1757
1758                        switchCAOnly();
1759
1760                }
1761
1762
1763
1764                if ( atomCount == loadMaxAtoms){
1765                        logger.warn("File has more atoms than max specified in parsing parameters ({}). Ignoring atoms after line: {}", loadMaxAtoms, line);
1766                        return;
1767                }
1768                if ( atomCount > loadMaxAtoms){
1769                        return;
1770                }
1771
1772
1773                //          1         2         3         4         5         6
1774                //012345678901234567890123456789012345678901234567890123456789
1775                //ATOM      1  N   MET     1      20.154  29.699   5.276   1.0
1776                //ATOM    112  CA  ASP   112      41.017  33.527  28.371  1.00  0.00
1777                //ATOM     53  CA  MET     7      23.772  33.989 -21.600  1.00  0.00           C
1778                //ATOM    112  CA  ASP   112      37.613  26.621  33.571     0     0
1779
1780
1781                String fullname = line.substring (12, 16);
1782
1783                // check for CA only if requested
1784                if ( parseCAonly ){
1785                        // yes , user wants to get CA only
1786                        // only parse CA atoms...
1787                        if (! fullname.equals(" CA ")){
1788                                //System.out.println("ignoring " + line);
1789                                atomCount--;
1790                                return;
1791                        }
1792                }
1793
1794                if ( params.getAcceptedAtomNames() != null) {
1795
1796                        boolean found = false;
1797                        for (String ok : params.getAcceptedAtomNames()){
1798                                //System.out.println(ok + "< >" + fullname +"<");
1799
1800                                if ( ok.equals(fullname.trim())) {
1801                                        found = true;
1802                                        break;
1803                                }
1804                        }
1805                        if ( ! found) {
1806                                atomCount--;
1807                                return;
1808                        }
1809                }
1810                // create new atom
1811
1812                int pdbnumber = Integer.parseInt (line.substring (6, 11).trim ());
1813                AtomImpl atom = new AtomImpl() ;
1814                atom.setPDBserial(pdbnumber) ;
1815
1816                atom.setAltLoc(altLoc);
1817                atom.setName(fullname.trim());
1818
1819                double x = Double.parseDouble (line.substring (30, 38).trim());
1820                double y = Double.parseDouble (line.substring (38, 46).trim());
1821                double z = Double.parseDouble (line.substring (46, 54).trim());
1822
1823                double[] coords = new double[3];
1824                coords[0] = x ;
1825                coords[1] = y ;
1826                coords[2] = z ;
1827                atom.setCoords(coords);
1828
1829                float occu  = 1.0f;
1830                if ( line.length() > 59 ) {
1831                        try {
1832                                // occu and tempf are sometimes not used :-/
1833                                occu = Float.parseFloat (line.substring (54, 60).trim());
1834                        }  catch (NumberFormatException e){}
1835                }
1836
1837                float tempf = 0.0f;
1838                if ( line.length() > 65) {
1839                        try {
1840                                tempf = Float.parseFloat (line.substring (60, 66).trim());
1841                        }  catch (NumberFormatException e){}
1842                }
1843
1844                atom.setOccupancy(  occu  );
1845                atom.setTempFactor( tempf );
1846
1847
1848
1849
1850                // Parse element from the element field. If this field is
1851                // missing (i.e. misformatted PDB file), then parse the
1852                // element from the chemical component.
1853                Element element = Element.R;
1854                boolean guessElement = true;
1855                if ( line.length() > 77 ) {
1856                        // parse element from element field
1857                        String elementSymbol = line.substring(76, 78).trim();
1858                        if (elementSymbol.isEmpty()) {
1859                                logger.info("Element column was empty for atom {} {}. Assigning atom element "
1860                                                + "from Chemical Component Dictionary information", fullname.trim(), pdbnumber);
1861                        } else {
1862
1863                        try {
1864                                        element = Element.valueOfIgnoreCase(elementSymbol);
1865                                        guessElement = false;
1866                                }  catch (IllegalArgumentException e){
1867                                        logger.info("Element {} of atom {} {} was not recognised. Assigning atom element "
1868                                                        + "from Chemical Component Dictionary information", elementSymbol,
1869                                                        fullname.trim(), pdbnumber);
1870                                }
1871                        }
1872                } else {
1873                        logger.info("Missformatted PDB file: element column of atom {} {} is not present. "
1874                                        + "Assigning atom element from Chemical Component Dictionary information",
1875                                        fullname.trim(), pdbnumber);
1876                }
1877                if (guessElement) {
1878                        String elementSymbol = null;
1879                        if (currentGroup.getChemComp() != null) {
1880                                for (ChemCompAtom a : currentGroup.getChemComp().getAtoms()) {
1881                                        if (a.getAtom_id().equals(fullname.trim())) {
1882                                                elementSymbol = a.getType_symbol();
1883                                                break;
1884                                        }
1885                                }
1886                                if (elementSymbol == null) {
1887                                        logger.info("Atom name {} was not found in the Chemical Component Dictionary information of {}. "
1888                                                        + "Assigning generic element R to it", fullname.trim(), currentGroup.getPDBName());
1889                        } else {
1890                        try {
1891                                element = Element.valueOfIgnoreCase(elementSymbol);
1892                                        } catch (IllegalArgumentException e) {
1893                                                // this can still happen for cases like UNK
1894                                                logger.info("Element symbol {} found in chemical component dictionary for Atom {} {} could not be recognised as a known element. "
1895                                                                + "Assigning generic element R to it", elementSymbol, fullname.trim(), pdbnumber);
1896                                        }
1897                                }
1898                        } else {
1899                                logger.warn("Chemical Component Dictionary information was not found for Atom name {}. "
1900                                                + "Assigning generic element R to it", fullname.trim());
1901                        }
1902
1903                }
1904                atom.setElement(element);
1905
1906
1907                //see if chain_id is one of the previous chains ...
1908                if ( altGroup != null) {
1909                        altGroup.addAtom(atom);
1910                        altGroup = null;
1911                }
1912                else {
1913                        currentGroup.addAtom(atom);
1914                }
1915
1916
1917                // make sure that main group has all atoms
1918                // GitHub issue: #76
1919                if ( ! currentGroup.hasAtom(atom.getName())) {
1920                        currentGroup.addAtom(atom);
1921                }
1922
1923
1924
1925                        }
1926
1927
1928        private Group getCorrectAltLocGroup( Character altLoc,
1929                        String recordName, Character aminoCode1, String groupCode3) {
1930
1931                // see if we know this altLoc already;
1932                List<Atom> atoms = currentGroup.getAtoms();
1933                if ( atoms.size() > 0) {
1934                        Atom a1 = atoms.get(0);
1935                        // we are just adding atoms to the current group
1936                        // probably there is a second group following later...
1937                        if (a1.getAltLoc().equals(altLoc)) {
1938
1939                                return currentGroup;
1940                        }
1941                }
1942
1943                List<Group> altLocs = currentGroup.getAltLocs();
1944                for ( Group altLocG : altLocs ){
1945                        atoms = altLocG.getAtoms();
1946                        if ( atoms.size() > 0) {
1947                                for ( Atom a1 : atoms) {
1948                                        if (a1.getAltLoc().equals( altLoc)) {
1949
1950                                                return altLocG;
1951                                        }
1952                                }
1953                        }
1954                }
1955
1956                // no matching altLoc group found.
1957                // build it up.
1958
1959                if ( groupCode3.equals(currentGroup.getPDBName())) {
1960                        if ( currentGroup.getAtoms().size() == 0) {
1961                                //System.out.println("current group is empty " + current_group + " " + altLoc);
1962                                return currentGroup;
1963                        }
1964                        //System.out.println("cloning current group " + current_group + " " + current_group.getAtoms().get(0).getAltLoc() + " altLoc " + altLoc);
1965                        Group altLocG = (Group) currentGroup.clone();
1966                        // drop atoms from cloned group...
1967                        // https://redmine.open-bio.org/issues/3307
1968                        altLocG.setAtoms(new ArrayList<Atom>());
1969                        altLocG.getAltLocs().clear();
1970                        currentGroup.addAltLoc(altLocG);
1971                        return altLocG;
1972                }
1973
1974                //      System.out.println("new  group " + recordName + " " + aminoCode1 + " " +groupCode3);
1975                Group altLocG = getNewGroup(recordName,aminoCode1,groupCode3);
1976
1977
1978                altLocG.setPDBName(groupCode3);
1979
1980                altLocG.setResidueNumber(currentGroup.getResidueNumber());
1981                currentGroup.addAltLoc(altLocG);
1982                return altLocG;
1983        }
1984
1985        private void switchCAOnly(){
1986                parseCAonly = true;
1987
1988
1989                currentModel = CAConverter.getRepresentativeAtomsOnly(currentModel);
1990
1991                for ( int i =0; i< structure.nrModels() ; i++){
1992                        //  iterate over all known models ...
1993                        List<Chain> model = structure.getModel(i);
1994                        model = CAConverter.getRepresentativeAtomsOnly(model);
1995                        structure.setModel(i,model);
1996                }
1997
1998                currentChain = CAConverter.getRepresentativeAtomsOnly(currentChain);
1999
2000        }
2001
2002
2003        /** safes repeating a few lines ... */
2004        private Integer conect_helper (String line,int start,int end) {
2005                if (line.length() < end) return null;
2006
2007                String sbond = line.substring(start,end).trim();
2008                int bond  = -1 ;
2009                Integer b = null ;
2010
2011                if ( ! sbond.equals("")) {
2012                        bond = Integer.parseInt(sbond);
2013                        b = new Integer(bond);
2014                }
2015
2016                return b ;
2017        }
2018
2019        /**
2020         * Handler for CONECT Record Format
2021        <pre>
2022         COLUMNS         DATA TYPE        FIELD           DEFINITION
2023         ---------------------------------------------------------------------------------
2024         1 -  6         Record name      "CONECT"
2025         7 - 11         Integer          serial          Atom serial number
2026         12 - 16         Integer          serial          Serial number of bonded atom
2027         17 - 21         Integer          serial          Serial number of bonded atom
2028         22 - 26         Integer          serial          Serial number of bonded atom
2029         27 - 31         Integer          serial          Serial number of bonded atom
2030         32 - 36         Integer          serial          Serial number of hydrogen bonded
2031         atom
2032         37 - 41         Integer          serial          Serial number of hydrogen bonded
2033         atom
2034         42 - 46         Integer          serial          Serial number of salt bridged
2035         atom
2036         47 - 51         Integer          serial          Serial number of hydrogen bonded
2037         atom
2038         52 - 56         Integer          serial          Serial number of hydrogen bonded
2039         atom
2040         57 - 61         Integer          serial          Serial number of salt bridged
2041         atom
2042         </pre>
2043         */
2044        private void pdb_CONECT_Handler(String line) {
2045
2046                if ( atomOverflow) {
2047                        return ;
2048                }
2049                if (params.isHeaderOnly()) {
2050                        return;
2051                }
2052
2053                // this try .. catch is e.g. to catch 1gte which has wrongly formatted lines...
2054                try {
2055                        int atomserial = Integer.parseInt (line.substring(6 ,11).trim());
2056                        Integer bond1      = conect_helper(line,11,16);
2057                        Integer bond2      = conect_helper(line,16,21);
2058                        Integer bond3      = conect_helper(line,21,26);
2059                        Integer bond4      = conect_helper(line,26,31);
2060                        Integer hyd1       = conect_helper(line,31,36);
2061                        Integer hyd2       = conect_helper(line,36,41);
2062                        Integer salt1      = conect_helper(line,41,46);
2063                        Integer hyd3       = conect_helper(line,46,51);
2064                        Integer hyd4       = conect_helper(line,51,56);
2065                        Integer salt2      = conect_helper(line,56,61);
2066
2067                        //System.out.println(atomserial+ " "+ bond1 +" "+bond2+ " " +bond3+" "+bond4+" "+
2068                        //                 hyd1+" "+hyd2 +" "+salt1+" "+hyd3+" "+hyd4+" "+salt2);
2069                        HashMap<String, Integer> cons = new HashMap<String, Integer>();
2070                        cons.put("atomserial",new Integer(atomserial));
2071
2072                        if ( bond1 != null) cons.put("bond1",bond1);
2073                        if ( bond2 != null) cons.put("bond2",bond2);
2074                        if ( bond3 != null) cons.put("bond3",bond3);
2075                        if ( bond4 != null) cons.put("bond4",bond4);
2076                        if ( hyd1  != null) cons.put("hydrogen1",hyd1);
2077                        if ( hyd2  != null) cons.put("hydrogen2",hyd2);
2078                        if ( salt1 != null) cons.put("salt1",salt1);
2079                        if ( hyd3  != null) cons.put("hydrogen3",hyd3);
2080                        if ( hyd4  != null) cons.put("hydrogen4",hyd4);
2081                        if ( salt2 != null) cons.put("salt2",salt2);
2082
2083                        connects.add(cons);
2084                } catch (NumberFormatException e){
2085                        logger.info("could not parse CONECT line correctly ("+e.getMessage()+"), at line : " + line);
2086                        return;
2087                }
2088        }
2089
2090        /**
2091         * Handler for MODEL Record Format
2092         * <pre>
2093         * COLUMNS       DATA TYPE      FIELD         DEFINITION
2094         * ----------------------------------------------------------------------
2095         * 1 -  6       Record name    "MODEL "
2096         * 11 - 14       Integer        serial        Model serial number.
2097         * </pre>
2098         */
2099        private void pdb_MODEL_Handler(String line) {
2100
2101                if (params.isHeaderOnly()) return;
2102
2103                // new model: we start a new molecule
2104                startOfMolecule = true;
2105                startOfModel = true;
2106
2107        }
2108
2109        /**
2110         * Handler for TER record. The record is used in deposited PDB files and many others,
2111         * but it's often forgotten by some softwares. In any case it helps identifying the
2112         * start of ligand molecules so we use it for that.
2113         */
2114        private void pdb_TER_Handler() {
2115                startOfMolecule = true;
2116        }
2117
2118
2119        /**
2120         * DBREF handler
2121         * <pre>
2122         * COLUMNS       DATA TYPE          FIELD          DEFINITION
2123         * ----------------------------------------------------------------
2124         *  1 - 6        Record name        "DBREF "
2125         *  8 - 11       IDcode             idCode         ID code of this entry.
2126         * 13            Character          chainID        Chain identifier.
2127         * 15 - 18       Integer            seqBegin       Initial sequence number
2128         *                                                 of the PDB sequence segment.
2129         * 19            AChar              insertBegin    Initial insertion code
2130         *                                                 of the PDB sequence segment.
2131         * 21 - 24       Integer            seqEnd         Ending sequence number
2132         *                                                 of the PDB sequence segment.
2133         * 25            AChar              insertEnd      Ending insertion code
2134         *                                                 of the PDB sequence segment.
2135         * 27 - 32       LString            database       Sequence database name.
2136         * 34 - 41       LString            dbAccession    Sequence database accession code.
2137         * 43 - 54      LString            dbIdCode        Sequence database
2138         *                                                 identification code.
2139         * 56 - 60      Integer            dbseqBegin      Initial sequence number of the
2140         *                                                 database seqment.
2141         * 61           AChar              idbnsBeg        Insertion code of initial residue
2142         *                                                 of the segment, if PDB is the
2143         *                                                 reference.
2144         * 63 - 67      Integer            dbseqEnd        Ending sequence number of the
2145         *                                                 database segment.
2146         * 68           AChar              dbinsEnd        Insertion code of the ending
2147         *                                                 residue of the segment, if PDB is
2148         *                                                 the reference.
2149         * </pre>
2150         */
2151        private void pdb_DBREF_Handler(String line){
2152
2153                logger.debug("Parsing DBREF " + line);
2154
2155                DBRef dbref = new DBRef();
2156                String idCode      = line.substring(7,11);
2157                String chainName     = line.substring(12,13);
2158                String seqBegin    = line.substring(14,18);
2159                String insertBegin = line.substring(18,19);
2160                String seqEnd      = line.substring(20,24);
2161                String insertEnd   = line.substring(24,25);
2162                String database    = line.substring(26,32);
2163                String dbAccession = line.substring(33,41);
2164                String dbIdCode    = line.substring(42,54);
2165                String dbseqBegin  = line.substring(55,60);
2166                String idbnsBeg    = line.substring(60,61);
2167                String dbseqEnd    = line.substring(62,67);
2168                // Support implicit space character at end
2169                String dbinsEnd;
2170                if(line.length() >= 68)
2171                        dbinsEnd       = line.substring(67,68);
2172                else
2173                        dbinsEnd       = " ";
2174
2175                dbref.setIdCode(idCode);
2176                dbref.setChainName(chainName);
2177                dbref.setSeqBegin(intFromString(seqBegin));
2178                dbref.setInsertBegin(insertBegin.charAt(0));
2179                dbref.setSeqEnd(intFromString(seqEnd));
2180                dbref.setInsertEnd(insertEnd.charAt(0));
2181                dbref.setDatabase(database.trim());
2182                dbref.setDbAccession(dbAccession.trim());
2183                dbref.setDbIdCode(dbIdCode.trim());
2184                dbref.setDbSeqBegin(intFromString(dbseqBegin));
2185                dbref.setIdbnsBegin(idbnsBeg.charAt(0));
2186                dbref.setDbSeqEnd(intFromString(dbseqEnd));
2187                dbref.setIdbnsEnd(dbinsEnd.charAt(0));
2188
2189                //System.out.println(dbref.toPDB());
2190                dbrefs.add(dbref);
2191        }
2192
2193
2194        /**
2195         * Process the disulfide bond info provided by an SSBOND record
2196         *
2197         * <pre>
2198        COLUMNS        DATA TYPE       FIELD         DEFINITION
2199        -------------------------------------------------------------------
2200         1 -  6        Record name     "SSBOND"
2201         8 - 10        Integer         serNum       Serial number.
2202        12 - 14        LString(3)      "CYS"        Residue name.
2203        16             Character       chainID1     Chain identifier.
2204        18 - 21        Integer         seqNum1      Residue sequence number.
2205        22             AChar           icode1       Insertion code.
2206        26 - 28        LString(3)      "CYS"        Residue name.
2207        30             Character       chainID2     Chain identifier.
2208        32 - 35        Integer         seqNum2      Residue sequence number.
2209        36             AChar           icode2       Insertion code.
2210        60 - 65        SymOP           sym1         Symmetry oper for 1st resid
2211        67 - 72        SymOP           sym2         Symmetry oper for 2nd resid
2212         * </pre>
2213         */
2214        private void pdb_SSBOND_Handler(String line){
2215
2216                if (params.isHeaderOnly()) return;
2217
2218                if (line.length()<36) {
2219                        logger.info("SSBOND line has length under 36. Ignoring it.");
2220                        return;
2221                }
2222
2223                String chain1      = line.substring(15,16);
2224                String seqNum1     = line.substring(17,21).trim();
2225                String icode1      = line.substring(21,22);
2226                String chain2      = line.substring(29,30);
2227                String seqNum2     = line.substring(31,35).trim();
2228                String icode2      = line.substring(35,36);
2229
2230                if (line.length()>=72) {
2231                        String symop1 = line.substring(59, 65).trim();
2232                        String symop2 = line.substring(66, 72).trim();
2233
2234                        // until we implement proper treatment of symmetry in biojava #220, we can't deal with sym-related parteners properly, skipping them
2235                        if (!symop1.equals("") && !symop2.equals("") && // in case the field is missing
2236                                        (!symop1.equals("1555") || !symop2.equals("1555")) ) {
2237                                logger.info("Skipping ss bond between groups {} and {} belonging to different symmetry partners, because it is not supported yet", seqNum1+icode1, seqNum2+icode2);
2238                                return;
2239                        }
2240                }
2241
2242                if (icode1.equals(" "))
2243                        icode1 = "";
2244                if (icode2.equals(" "))
2245                        icode2 = "";
2246
2247                SSBondImpl ssbond = new SSBondImpl();
2248
2249                ssbond.setChainID1(chain1);
2250                ssbond.setResnum1(seqNum1);
2251                ssbond.setChainID2(chain2);
2252                ssbond.setResnum2(seqNum2);
2253                ssbond.setInsCode1(icode1);
2254                ssbond.setInsCode2(icode2);
2255                ssbonds.add(ssbond);
2256        }
2257
2258
2259        /**
2260         * Takes care of LINK records. These take the format of:
2261         *
2262         * <pre>
2263         * COLUMNS        DATA TYPE       FIELD       DEFINITION
2264         * --------------------------------------------------------------------------------
2265         *  1 -  6        Record name     "LINK  "
2266         * 13 - 16        Atom            name1       Atom name.
2267         * 17             Character       altLoc1     Alternate location indicator.
2268         * 18 - 20        Residue name    resName1    Residue name.
2269         * 22             Character       chainID1    Chain identifier.
2270         * 23 - 26        Integer         resSeq1     Residue sequence number.
2271         * 27             AChar           iCode1      Insertion code.
2272         * 43 - 46        Atom            name2       Atom name.
2273         * 47             Character       altLoc2     Alternate location indicator.
2274         * 48 - 50        Residue name    resName2    Residue name.
2275         * 52             Character       chainID2    Chain identifier.
2276         * 53 - 56        Integer         resSeq2     Residue sequence number.
2277         * 57             AChar           iCode2      Insertion code.
2278         * 60 - 65        SymOP           sym1        Symmetry operator for 1st atom.
2279         * 67 - 72        SymOP           sym2        Symmetry operator for 2nd atom.
2280         * </pre>
2281         *
2282         * (From http://www.wwpdb.org/documentation/format32/sect6.html#LINK)
2283         *
2284         * @param line the LINK record line to parse.
2285         */
2286        private void pdb_LINK_Handler(String line) {
2287
2288                if (params.isHeaderOnly()) return;
2289
2290                // Check for the minimal set of fields.
2291                if (line.length()<56) {
2292                        logger.info("LINK line has length under 56. Ignoring it.");
2293                        return;
2294                }
2295
2296                int len = line.length();
2297
2298                String name1 = line.substring(12, 16).trim();
2299                String altLoc1 = line.substring(16, 17).trim();
2300                String resName1 = line.substring(17, 20).trim();
2301                String chainID1 = line.substring(21, 22).trim();
2302                String resSeq1 = line.substring(22, 26).trim();
2303                String iCode1 = line.substring(26, 27).trim();
2304
2305                String name2 = line.substring(42, 46).trim();
2306                String altLoc2 = line.substring(46, 47).trim();
2307                String resName2 = line.substring(47, 50).trim();
2308                String chainID2 = line.substring(51, 52).trim();
2309                String resSeq2 = line.substring(52, 56).trim();
2310                String iCode2 = null;  // Might get trimmed if blank.
2311                if (len > 56) iCode2 = line.substring(56, 57).trim();
2312
2313                String sym1 = null;
2314                if (len > 64) sym1 = line.substring(59, 65).trim();
2315                String sym2 = null;
2316                if (len > 71) sym2 = line.substring(66, 72).trim();
2317
2318                linkRecords.add(new LinkRecord(
2319                                name1, altLoc1, resName1, chainID1, resSeq1, iCode1,
2320                                name2, altLoc2, resName2, chainID2, resSeq2, iCode2,
2321                                sym1, sym2));
2322        }
2323
2324        /**
2325         * Handler for the SITE records. <br>
2326         *
2327         * <pre>
2328         *
2329         * COLUMNS      DATA TYPE               FIELD           DEFINITION
2330         * ---------------------------------------------------------------------------------
2331         * 1 - 6        Record name     "SITE "
2332         * 8 - 10       Integer                 seqNum          Sequence number.
2333         * 12 - 14      LString(3)              siteID          Site name.
2334         * 16 - 17      Integer                 numRes          Number of residues that compose the siteResidues.
2335         * 19 - 21      Residue name    resName1        Residue name for first residue that
2336         *                                                                              creates the siteResidues.
2337         * 23           Character               chainID1        Chain identifier for first residue of siteResidues.
2338         * 24 - 27      Integer                 seq1            Residue sequence number for first residue
2339         *                                                                              of the siteResidues.
2340         * 28           AChar                   iCode1          Insertion code for first residue of the siteResidues.
2341         *
2342         * example:
2343         *          1         2         3         4         5         6         7         8
2344         * 12345678901234567890123456789012345678901234567890123456789012345678901234567890
2345         * SITE     1 AC1  3 HIS A  94 HIS A   96  HIS A 119
2346         * SITE     1 AC2  5 ASN A  62 GLY A   63  HIS A  64  HOH A 328
2347         * SITE     2 AC2  5 HOH A 634
2348         * SITE     1 AC3  5 GLN A 136 GLN A  137  PRO A 138  GLU A 205
2349         * SITE     2 AC3  5 CYS A 206
2350         * SITE     1 AC4 11 HIS A  64 HIS A   94  HIS A  96  HIS A 119
2351         * SITE     2 AC4 11 LEU A 198 THR A  199  THR A 200  TRP A 209
2352         * SITE     3 AC4 11 HOH A 572 HOH A  582  HOH A 635
2353         * </pre>
2354         * @param line the SITE line record being currently read
2355         * @author Amr AL-Hossary
2356         * @author Jules Jacobsen
2357         */
2358        private void pdb_SITE_Handler(String line){
2359
2360                if (params.isHeaderOnly()) return;
2361
2362                //  make a map of: SiteId to List<ResidueNumber>
2363
2364                logger.debug("Site Line:"+line);
2365
2366
2367                String siteID = line.substring(11, 14);
2368                //fetch the siteResidues from the map
2369                List<ResidueNumber> siteResidues = siteToResidueMap.get(siteID);
2370
2371                //if the siteResidues doesn't yet exist, make a new one.
2372                if (siteResidues == null || ! siteToResidueMap.containsKey(siteID.trim())){
2373                        siteResidues = new ArrayList<ResidueNumber>();
2374                        siteToResidueMap.put(siteID.trim(), siteResidues);
2375
2376                        logger.debug(String.format("New Site made: %s %s", siteID,  siteResidues));
2377                        logger.debug("Now made " + siteMap.size() + " sites");
2378
2379                }
2380
2381                logger.debug(String.format("SiteId: %s", siteID));
2382
2383
2384                //line = 'SITE     1 AC1  6 ARG H 221A LYS H 224  HOH H 403  HOH H 460'
2385                //line.substring(18) = 'ARG H 221A LYS H 224  HOH H 403  HOH H 460'
2386                line = line.substring(18);
2387                String groupString = null;
2388                //groupString = 'ARG H 221A'
2389                //keep iterating through chunks of 10 characters - these are the groups in the siteResidues
2390                while (!(groupString = line.substring(0, 10)).equals("          ")) {
2391                        //groupstring: 'ARG H 221A'
2392
2393                        logger.debug("groupString: '" + groupString + "'");
2394
2395                        //set the residue name
2396                        //residueName = 'ARG'
2397                        String residueName = groupString.substring(0, 3);
2398                        Character aminoCode1 = StructureTools.get1LetterCode(residueName);
2399                        if (aminoCode1 != null) {
2400                                if (aminoCode1.equals(StructureTools.UNKNOWN_GROUP_LABEL)) {
2401                                        aminoCode1 = null;
2402                                }
2403                        }
2404
2405                        //this is already in the right format, so no need to fiddle with it...
2406                        //pdbCode = 'H 221A'
2407                        //                    String pdbCode = groupString.substring(4, 10).trim();
2408                        String chainId = groupString.substring(4, 5);
2409                        Integer resNum = Integer.valueOf(groupString.substring(5, 9).trim());
2410                        Character insCode = groupString.substring(9, 10).charAt(0);
2411                        //set insCode to null as a measure to prevent storing thousands of empty Strings
2412                        //- the empty value is returned using Group.getInsCode()
2413                        //                    if (insCode.equals(" ")) {
2414                        //                        insCode = null;
2415                        //                    }
2416
2417                        logger.debug(String.format("Site: %s: 'resName:%s resNum:%s insCode:%s'", siteID, residueName, resNum, insCode));
2418
2419                        //make a new resNum with the data - this will be linked up with a site later
2420                        ResidueNumber residueNumber = new ResidueNumber();
2421
2422
2423                        logger.debug("pdbCode: '" + resNum + insCode + "'");
2424
2425                        residueNumber.setChainName(chainId);
2426                        residueNumber.setSeqNum(resNum);
2427                        residueNumber.setInsCode(insCode);
2428                        //add the resNum to the groups
2429                        siteResidues.add(residueNumber);
2430
2431                        logger.debug("Adding residueNumber " + residueNumber + " to site " + siteID);
2432
2433                        line = line.substring(11);
2434                }
2435
2436                logger.debug("Current SiteMap (contains "+ siteToResidueMap.keySet().size() + " sites):");
2437                for (String key : siteToResidueMap.keySet()) {
2438                        logger.debug(key + " : " + siteToResidueMap.get(key));
2439                }
2440
2441        }
2442
2443        //Site variable related to parsing the REMARK 800 records.
2444        Site site;
2445        private void pdb_REMARK_800_Handler(String line){
2446
2447                if (params.isHeaderOnly()) return;
2448
2449                // 'REMARK 800 SITE_IDENTIFIER: CAT                                                 '
2450                line = line.substring(11);
2451                String[] fields = line.split(": ");
2452
2453                if (fields.length == 2) {
2454                        if (fields[0].equals("SITE_IDENTIFIER")) {
2455                                //                    remark800Counter++;
2456                                String siteID = fields[1].trim();
2457
2458                                logger.debug("siteID: '" + siteID +"'");
2459
2460                                //fetch the siteResidues from the map
2461                                site = siteMap.get(siteID);
2462
2463                                //if the siteResidues doesn't yet exist, make a new one.
2464                                if (site == null || !siteID.equals(site.getSiteID())) {
2465                                        site = new Site(siteID, new ArrayList<Group>());
2466                                        siteMap.put(site.getSiteID(), site);
2467
2468                                        logger.debug("New Site made: " + site);
2469                                        logger.debug("Now made " + siteMap.size() + " sites");
2470
2471                                }
2472                        }
2473                        if (fields[0].equals("EVIDENCE_CODE")) {
2474                                //                    remark800Counter++;
2475                                String evCode = fields[1].trim();
2476
2477                                logger.debug("evCode: '" + evCode +"'");
2478
2479                                //fetch the siteResidues from the map
2480                                site.setEvCode(evCode);
2481                        }
2482                        if (fields[0].equals("SITE_DESCRIPTION")) {
2483                                //                    remark800Counter++;
2484                                String desc = fields[1].trim();
2485
2486                                logger.debug("desc: '" + desc +"'");
2487
2488                                //fetch the siteResidues from the map
2489                                site.setDescription(desc);
2490
2491                                logger.debug("Finished making REMARK 800 for site " + site.getSiteID());
2492                                logger.debug(site.remark800toPDB());
2493
2494                        }
2495                }
2496        }
2497
2498        private int intFromString(String intString){
2499                int val = Integer.MIN_VALUE;
2500                try {
2501                        val = Integer.parseInt(intString.trim());
2502                } catch (NumberFormatException ex){
2503                        logger.info("Could not parse a number: " + ex.getMessage());
2504                }
2505                return val;
2506        }
2507
2508
2509
2510        /**
2511         * Finds in the given list of chains the first one that has as name the given chainID.
2512         * If no such Chain can be found it returns null.
2513         */
2514        private static Chain isKnownChain(String chainID, List<Chain> chains){
2515
2516                for (int i = 0; i< chains.size();i++){
2517                        Chain testchain =  chains.get(i);
2518                        if (chainID.equals(testchain.getName())) {
2519                                return testchain;
2520                        }
2521                }
2522
2523                return null;
2524        }
2525
2526
2527
2528        private BufferedReader getBufferedReader(InputStream inStream)
2529                        throws IOException {
2530
2531                BufferedReader buf ;
2532                if (inStream == null) {
2533                        throw new IOException ("input stream is null!");
2534                }
2535
2536                buf = new BufferedReader (new InputStreamReader (inStream));
2537                return buf ;
2538
2539        }
2540
2541
2542
2543        /**
2544         * Parse a PDB file and return a datastructure implementing
2545         * PDBStructure interface.
2546         *
2547         * @param inStream  an InputStream object
2548         * @return a Structure object
2549         * @throws IOException
2550         */
2551        public Structure parsePDBFile(InputStream inStream)
2552                        throws IOException
2553        {
2554
2555                BufferedReader buf = getBufferedReader(inStream);
2556
2557                return parsePDBFile(buf);
2558
2559        }
2560
2561        /**
2562         * Parse a PDB file and return a datastructure implementing
2563         * PDBStructure interface.
2564         *
2565         * @param buf  a BufferedReader object
2566         * @return the Structure object
2567         * @throws IOException ...
2568         */
2569        public  Structure parsePDBFile(BufferedReader buf)
2570                        throws IOException
2571                        {
2572                // set the correct max values for parsing...
2573                loadMaxAtoms = params.getMaxAtoms();
2574                atomCAThreshold = params.getAtomCaThreshold();
2575
2576
2577                // (re)set structure
2578
2579                allModels = new ArrayList<>();
2580                structure     = new StructureImpl() ;
2581                currentModel  = null;
2582                currentChain  = null;
2583                currentGroup  = null;
2584                // we initialise to true since at the beginning of the file we are always starting a new molecule
2585                startOfMolecule = true;
2586                startOfModel = true;
2587
2588                seqResChains  = new ArrayList<Chain>();
2589                siteMap = new LinkedHashMap<String, Site>();
2590                pdbHeader     = new PDBHeader();
2591                connects      = new ArrayList<Map<String,Integer>>();
2592                previousContinuationField = "";
2593                continuationField = "";
2594                continuationString = "";
2595                current_compound = null;
2596                sourceLines.clear();
2597                compndLines.clear();
2598                isLastCompndLine = false;
2599                isLastSourceLine = false;
2600                prevMolId = -1;
2601                entities.clear();
2602                helixList.clear();
2603                strandList.clear();
2604                turnList.clear();
2605                lengthCheck = -1;
2606                atomCount = 0;
2607                atomOverflow = false;
2608                linkRecords = new ArrayList<LinkRecord>();
2609                siteToResidueMap.clear();
2610
2611                blankChainIdsPresent = false;
2612
2613                parseCAonly = params.isParseCAOnly();
2614
2615                String line = null;
2616
2617                while ((line = buf.readLine()) != null) {
2618
2619                        // ignore empty lines
2620                        if ( line.equals("") ||
2621                                        (line.equals(NEWLINE))){
2622                                continue;
2623                        }
2624
2625
2626                        // ignore short TER and END lines
2627                        if ( line.startsWith("END")) {
2628                                continue;
2629                        }
2630
2631                        if ( line.length() < 6 && !line.startsWith("TER")) {
2632                                logger.info("Found line length below 6. Ignoring it, line: >" + line +"<" );
2633                                continue;
2634                        }
2635
2636                        String recordName = null;
2637                        if (line.length()<6)
2638                                recordName = line.trim();
2639                        else
2640                                recordName = line.substring (0, 6).trim ();
2641
2642                        try {
2643                                if (recordName.equals("ATOM"))
2644                                        pdb_ATOM_Handler(line);
2645                                else if (recordName.equals("SEQRES"))
2646                                        pdb_SEQRES_Handler(line);
2647                                else if (recordName.equals("HETATM"))
2648                                        pdb_ATOM_Handler(line);
2649                                else if (recordName.equals("MODEL"))
2650                                        pdb_MODEL_Handler(line);
2651                                else if (recordName.equals("TER"))
2652                                        pdb_TER_Handler();
2653                                else if (recordName.equals("HEADER"))
2654                                        pdb_HEADER_Handler(line);
2655                                else if (recordName.equals("AUTHOR"))
2656                                        pdb_AUTHOR_Handler(line);
2657                                else if (recordName.equals("TITLE"))
2658                                        pdb_TITLE_Handler(line);
2659                                else if (recordName.equals("SOURCE"))
2660                                        sourceLines.add(line); //pdb_SOURCE_Handler
2661                                else if (recordName.equals("COMPND"))
2662                                        compndLines.add(line); //pdb_COMPND_Handler
2663                                else if (recordName.equals("JRNL"))
2664                                        pdb_JRNL_Handler(line);
2665                                else if (recordName.equals("EXPDTA"))
2666                                        pdb_EXPDTA_Handler(line);
2667                                else if (recordName.equals("CRYST1"))
2668                                        pdb_CRYST1_Handler(line);
2669                                else if (recordName.startsWith("MTRIX"))
2670                                        pdb_MTRIXn_Handler(line);
2671                                else if (recordName.equals("REMARK"))
2672                                        pdb_REMARK_Handler(line);
2673                                else if (recordName.equals("CONECT"))
2674                                        pdb_CONECT_Handler(line);
2675                                else if (recordName.equals("REVDAT"))
2676                                        pdb_REVDAT_Handler(line);
2677                                else if (recordName.equals("DBREF"))
2678                                        pdb_DBREF_Handler(line);
2679                                else if (recordName.equals("SITE"))
2680                                        pdb_SITE_Handler(line);
2681                                else if (recordName.equals("SSBOND"))
2682                                        pdb_SSBOND_Handler(line);
2683                                else if (recordName.equals("LINK"))
2684                                        pdb_LINK_Handler(line);
2685                                else if ( params.isParseSecStruc()) {
2686                                        if ( recordName.equals("HELIX") ) pdb_HELIX_Handler (  line ) ;
2687                                        else if (recordName.equals("SHEET")) pdb_SHEET_Handler(line ) ;
2688                                        else if (recordName.equals("TURN")) pdb_TURN_Handler(   line ) ;
2689                                }
2690                        } catch (StringIndexOutOfBoundsException | NullPointerException ex) {
2691                                logger.info("Unable to parse [" + line + "]");
2692                        }
2693                }
2694
2695                makeCompounds(compndLines, sourceLines);
2696
2697                triggerEndFileChecks();
2698
2699                if (params.shouldCreateAtomBonds()) {
2700                        formBonds();
2701                }
2702
2703                if ( params.shouldCreateAtomCharges()) {
2704                        addCharges();
2705                }
2706
2707                if ( params.isParseSecStruc() && !params.isHeaderOnly())
2708                        setSecStruc();
2709
2710                // Now correct the alternate location group
2711                StructureTools.cleanUpAltLocs(structure);
2712
2713                return structure;
2714
2715                        }
2716
2717
2718        /**
2719         * Add the charges to the Structure
2720         */
2721        private void addCharges() {
2722                ChargeAdder.addCharges(structure);
2723        }
2724
2725        /**
2726         * This is the new method for building the COMPND and SOURCE records. Now each method is self-contained.
2727         * @author Jules Jacobsen
2728         * @param  compoundList
2729         * @param  sourceList
2730         */
2731        private void makeCompounds(List<String> compoundList,
2732                        List<String> sourceList) {
2733                //              System.out.println("[makeCompounds] making compounds from compoundLines");
2734
2735                for (String line : compoundList) {
2736                        if (compoundList.indexOf(line) + 1 == compoundList.size()) {
2737                                //                              System.out.println("[makeCompounds] Final line in compoundLines.");
2738                                isLastCompndLine = true;
2739                        }
2740                        pdb_COMPND_Handler(line);
2741
2742                }
2743                //              System.out.println("[makeCompounds] adding sources to compounds from sourceLines");
2744                // since we're starting again from the first compound, reset it here
2745                if ( entities.size() == 0){
2746                        current_compound = new EntityInfo();
2747                } else {
2748                        current_compound = entities.get(0);
2749                }
2750                for (String line : sourceList) {
2751                        if (sourceList.indexOf(line) + 1 == sourceList.size()) {
2752                                //                              System.out.println("[makeCompounds] Final line in sourceLines.");
2753                                isLastSourceLine = true;
2754                        }
2755                        pdb_SOURCE_Handler(line);
2756                }
2757
2758        }
2759
2760        /**
2761         * Handles creation of all bonds. Looks at LINK records, SSBOND (Disulfide
2762         * bonds), peptide bonds, and intra-residue bonds.
2763         * <p>
2764         * Note: the current implementation only looks at the first model of each
2765         * structure. This may need to be fixed in the future.
2766         */
2767        private void formBonds() {
2768
2769                BondMaker maker = new BondMaker(structure, params);
2770
2771                // LINK records should be preserved, they are the way that
2772                // inter-residue bonds are created for ligands such as trisaccharides, unusual polymers.
2773                // The analogy in mmCIF is the _struct_conn record.
2774                for (LinkRecord linkRecord : linkRecords) {
2775                        maker.formLinkRecordBond(linkRecord);
2776                }
2777
2778                maker.formDisulfideBonds(ssbonds);
2779
2780                maker.makeBonds();
2781        }
2782
2783
2784
2785        private void triggerEndFileChecks(){
2786
2787                // we need to add the last chain and model, checking for nulls (e.g. the file could be completely empty of ATOM lines)
2788                if (currentChain!=null && currentGroup!=null) {
2789                        currentChain.addGroup(currentGroup);
2790                }
2791                if (currentModel!=null && currentChain!=null) {
2792                        currentModel.add(currentChain);
2793                }
2794                if (currentModel!=null) {
2795                        allModels.add(currentModel);
2796                }
2797
2798                if (blankChainIdsPresent) {
2799                        // from biojava 5.0 there's limited support for old pdb files with blank chain ids
2800                        logger.warn("Found some blank chain ids in PDB file. Please note that support for them has been discontinued and things might not work properly.");
2801                }
2802
2803                // reordering chains following the mmcif model and assigning entities
2804                assignChainsAndEntities();
2805                structure.setEntityInfos(entities);
2806
2807
2808
2809                // header data
2810
2811                Date modDate = pdbHeader.getModDate();
2812                if ( modDate.equals(new Date(0)) ) {
2813                        // modification date = deposition date
2814                        Date depositionDate = pdbHeader.getDepDate();
2815
2816                        if (! depositionDate.equals(modDate)){
2817                                // depDate is 0000-00-00
2818                                pdbHeader.setDepDate(depositionDate);
2819                        }
2820
2821                }
2822
2823                structure.setPDBHeader(pdbHeader);
2824                structure.setCrystallographicInfo(crystallographicInfo);
2825
2826                //set the JournalArticle, if there is one
2827                if (!journalLines.isEmpty()) {
2828                        buildjournalArticle();
2829                        pdbHeader.setJournalArticle(journalArticle);
2830                }
2831
2832                structure.setDBRefs(dbrefs);
2833
2834                // Only align if requested (default) and not when headerOnly mode with no Atoms.
2835                // Otherwise, we store the empty SeqRes Groups unchanged in the right chains.
2836                if ( params.isAlignSeqRes() && !params.isHeaderOnly() && !seqResChains.isEmpty()){
2837                        logger.debug("Parsing mode align_seqres, will parse SEQRES and align to ATOM sequence");
2838                        SeqRes2AtomAligner aligner = new SeqRes2AtomAligner();
2839                        aligner.align(structure,seqResChains);
2840
2841                } else {
2842                        logger.debug("Parsing mode unalign_seqres, will parse SEQRES but not align it to ATOM sequence");
2843                        SeqRes2AtomAligner.storeUnAlignedSeqRes(structure, seqResChains, params.isHeaderOnly());
2844                }
2845
2846
2847
2848                //associate the temporary Groups in the siteMap to the ones
2849                if (!params.isHeaderOnly()) {
2850                        // Only can link SITES if Atom Groups were parsed.
2851                        linkSitesToGroups(); // will work now that setSites is called
2852                }
2853
2854                if ( bioAssemblyParser != null){
2855                        bioAssemblyParser.setMacromolecularSizes();
2856                        pdbHeader.setBioAssemblies(bioAssemblyParser.getTransformationMap());
2857                }
2858
2859                if (ncsOperators !=null && ncsOperators.size()>0) {
2860                        crystallographicInfo.setNcsOperators(
2861                                ncsOperators.toArray(new Matrix4d[ncsOperators.size()]));
2862                }
2863
2864
2865                // rfree end file check
2866                // Rfree annotation is not very consistent in PDB format, it varies depending on the software
2867                // Here we follow this strategy:
2868                // a) take the '(NO CUTOFF)' value if the only one available (shelx software, e.g. 1x7q)
2869                // b) don't take it if also a line without '(NO CUTOFF)' is present (CNX software, e.g. 3lak)
2870
2871                if (rfreeNoCutoffLine>0 && rfreeStandardLine<0) {
2872                        pdbHeader.setRfree(rfreeNoCutoffLine);
2873                } else if (rfreeNoCutoffLine>0 && rfreeStandardLine>0) {
2874                        pdbHeader.setRfree(rfreeStandardLine);
2875                } else if (rfreeNoCutoffLine<0 && rfreeStandardLine>0) {
2876                        pdbHeader.setRfree(rfreeStandardLine);
2877                } // otherwise it remains default value: PDBHeader.DEFAULT_RFREE
2878
2879
2880
2881        }
2882
2883        private void setSecStruc(){
2884
2885                setSecElement(helixList, SecStrucInfo.PDB_AUTHOR_ASSIGNMENT,
2886                                SecStrucType.helix4);
2887                setSecElement(strandList, SecStrucInfo.PDB_AUTHOR_ASSIGNMENT,
2888                                SecStrucType.extended);
2889                setSecElement(turnList, SecStrucInfo.PDB_AUTHOR_ASSIGNMENT,
2890                                SecStrucType.turn);
2891
2892                //Now insert random coil to the Groups that did not have SS information
2893                GroupIterator gi = new GroupIterator(structure);
2894                while (gi.hasNext()){
2895                        Group g = gi.next();
2896                        if (g.hasAminoAtoms()){
2897                                if (g.getProperty(Group.SEC_STRUC) == null){
2898                                        SecStrucInfo ss = new SecStrucInfo(g,
2899                                                        SecStrucInfo.PDB_AUTHOR_ASSIGNMENT,
2900                                                        SecStrucType.coil);
2901                                        g.setProperty(Group.SEC_STRUC, ss);
2902                                }
2903                        }
2904                }
2905
2906        }
2907
2908        private void setSecElement(List<Map<String,String>> secList, String assignment, SecStrucType type){
2909
2910
2911                Iterator<Map<String,String>> iter = secList.iterator();
2912                nextElement:
2913                        while (iter.hasNext()){
2914                                Map<String,String> m = iter.next();
2915
2916                                // assign all residues in this range to this secondary structure type
2917                                // String initResName = (String)m.get("initResName");
2918                                String initChainId = m.get("initChainId");
2919                                String initSeqNum  = m.get("initSeqNum" );
2920                                String initICode   = m.get("initICode" );
2921                                // String endResName  = (String)m.get("endResName" );
2922                                String endChainId  = m.get("endChainId" );
2923                                String endSeqNum   = m.get("endSeqNum");
2924                                String endICode    = m.get("endICode");
2925
2926                                if (initICode.equals(" "))
2927                                        initICode = "";
2928                                if (endICode.equals(" "))
2929                                        endICode = "";
2930
2931                                GroupIterator gi = new GroupIterator(structure);
2932                                boolean inRange = false;
2933                                while (gi.hasNext()){
2934                                        Group g = gi.next();
2935                                        Chain c = g.getChain();
2936
2937                                        if (c.getName().equals(initChainId)){
2938
2939                                                String pdbCode = initSeqNum + initICode;
2940                                                if ( g.getResidueNumber().toString().equals(pdbCode)  ) {
2941                                                        inRange = true;
2942                                                }
2943                                        }
2944                                        if ( inRange){
2945                                                if (g.hasAminoAtoms()) {
2946                                                        SecStrucInfo ss = new SecStrucInfo(g, assignment, type);
2947                                                        g.setProperty(Group.SEC_STRUC, ss);
2948                                                }
2949
2950                                        }
2951                                        if ( c.getName().equals(endChainId)){
2952                                                String pdbCode = endSeqNum + endICode;
2953                                                if (pdbCode.equals(g.getResidueNumber().toString())){
2954                                                        inRange = false;
2955                                                        continue nextElement;
2956                                                }
2957                                        }
2958                                }
2959                        }
2960        }
2961
2962        /**
2963         * Gets all chains with given chainName from given models list
2964         * @param chainName
2965         * @param polyModels
2966         * @return
2967         */
2968        private static List<List<Chain>> findChains(String chainName, List<List<Chain>> polyModels) {
2969                List<List<Chain>> models = new ArrayList<>();
2970
2971                for (List<Chain> chains:polyModels) {
2972                        List<Chain> matchingChains = new ArrayList<>();
2973                        models.add(matchingChains);
2974                        for (Chain c:chains) {
2975                                if (c.getName().equals(chainName)) {
2976                                        matchingChains.add(c);
2977                                }
2978                        }
2979                }
2980                return models;
2981        }
2982
2983        /**
2984         * Split the given chain (containing non-polymer groups and water groups only)
2985         * into individual chains per non-polymer group and individual chains per contiguous sets of water groups.
2986         * @param chain
2987         * @return a list of lists of size 2: first list is the split non-poly chains, second list is the split water chains
2988         */
2989        private static List<List<Chain>> splitNonPolyChain(Chain chain) {
2990                List<Chain> splitNonPolys = new ArrayList<>();
2991                List<Chain> waterChains = new ArrayList<>();
2992
2993                Chain split = null;
2994                boolean previousGroupIsWater = false;
2995
2996                for (Group g:chain.getAtomGroups()){
2997
2998                        if (!previousGroupIsWater) {
2999                                // add last one if there's one
3000                                if (split!=null) {
3001                                        splitNonPolys.add(split);
3002                                }
3003                                split = new ChainImpl();
3004                                split.setName(chain.getName());
3005                        } else if (!g.isWater()) {
3006                                // previous group is water and this group is not water: we change from a water chain to a non-poly
3007                                // we'll need to add now the water chain to the list of water chains
3008                                waterChains.add(split);
3009                                split = new ChainImpl();
3010                                split.setName(chain.getName());
3011                        }
3012
3013                        if (g.isWater()) {
3014                                previousGroupIsWater = true;
3015                        } else {
3016                                previousGroupIsWater = false;
3017
3018                        }
3019
3020                        // this should include alt locs (referenced from the main group)
3021                        split.addGroup(g);
3022
3023                }
3024
3025                // adding the last split chain: either to water or non-poly depending on what was the last seen group
3026                if (split!=null) {
3027                        if (previousGroupIsWater)
3028                                waterChains.add(split);
3029                        else
3030                                splitNonPolys.add(split);
3031                }
3032
3033
3034                List<List<Chain>> all = new ArrayList<>(2);
3035                all.add(splitNonPolys);
3036                all.add(waterChains);
3037
3038                return all;
3039        }
3040
3041        /**
3042         * Assign asym ids following the rules used by the PDB to assign asym ids in mmCIF files
3043         * @param polys
3044         * @param nonPolys
3045         * @param waters
3046         */
3047        private void assignAsymIds(List<List<Chain>> polys, List<List<Chain>> nonPolys, List<List<Chain>> waters) {
3048
3049                for (int i=0; i<polys.size(); i++) {
3050                        String asymId = "A";
3051
3052                        for (Chain poly:polys.get(i)) {
3053                                poly.setId(asymId);
3054                                asymId = getNextAsymId(asymId);
3055                        }
3056                        for (Chain nonPoly:nonPolys.get(i)) {
3057                                nonPoly.setId(asymId);
3058                                asymId = getNextAsymId(asymId);
3059                        }
3060                        for (Chain water:waters.get(i)) {
3061                                water.setId(asymId);
3062                                asymId = getNextAsymId(asymId);
3063                        }
3064                }
3065        }
3066
3067        /**
3068         * Gets the next asym id given an asymId, according to the convention followed by
3069         * mmCIF files produced by the PDB
3070         * i.e.: A,B,...,Z,AA,BA,CA,...,ZA,AB,BB,CB,...,ZB,.......,ZZ,AAA,BAA,CAA,...
3071         * @param asymId
3072         * @return
3073         */
3074        private String getNextAsymId(String asymId) {
3075                if (asymId.length()==1) {
3076                        if (!asymId.equals("Z")) {
3077                                return Character.toString(getNextChar(asymId.charAt(0)));
3078                        } else {
3079                                return "AA";
3080                        }
3081                } else if (asymId.length()==2) {
3082                        if (asymId.equals("ZZ")) {
3083                                return "AAA";
3084                        }
3085                        char[] c = new char[2];
3086                        asymId.getChars(0, 2, c, 0);
3087                        c[0] = getNextChar(c[0]);
3088                        if (c[0]=='A') {
3089                                c[1] = getNextChar(c[1]);
3090                        }
3091                        return new String(c);
3092                } else if (asymId.length()==3) {
3093                        char[] c = new char[3];
3094                        asymId.getChars(0, 3, c, 0);
3095                        c[0] = getNextChar(c[0]);
3096                        if (c[0]=='A') {
3097                                c[1] = getNextChar(c[1]);
3098                                if (c[1]=='A') {
3099                                        c[2] = getNextChar(c[2]);
3100                                }
3101                        }
3102                        return new String(c);
3103                }
3104                return null;
3105        }
3106
3107        private char getNextChar(char c) {
3108                if (c!='Z') {
3109                        return ((char)(c+1));
3110                } else {
3111                        return 'A';
3112                }
3113        }
3114
3115        /**
3116         * Here we assign chains following the mmCIF data model:
3117         * one chain per polymer, one chain per non-polymer group and
3118         * several water chains.
3119         * <p>
3120         * Subsequently we assign entities for them: either from those read from
3121         * COMPOUND records or from those found heuristically through {@link EntityFinder}
3122         *
3123         */
3124        private void assignChainsAndEntities(){
3125
3126                List<List<Chain>> polyModels = new ArrayList<>();
3127                List<List<Chain>> nonPolyModels = new ArrayList<>();
3128                List<List<Chain>> waterModels = new ArrayList<>();
3129
3130                for (List<Chain> model:allModels) {
3131
3132                        List<Chain> polyChains = new ArrayList<>();
3133                        List<Chain> nonPolyChains = new ArrayList<>();
3134                        List<Chain> waterChains = new ArrayList<>();
3135
3136                        polyModels.add(polyChains);
3137                        nonPolyModels.add(nonPolyChains);
3138                        waterModels.add(waterChains);
3139
3140                        for (Chain c:model) {
3141
3142                                // we only have entities for polymeric chains, all others are ignored for assigning entities
3143                                if (c.isWaterOnly()) {
3144                                        waterChains.add(c);
3145
3146                                } else if (c.isPureNonPolymer()) {
3147                                        nonPolyChains.add(c);
3148
3149                                } else {
3150                                        polyChains.add(c);
3151                                }
3152                        }
3153                }
3154
3155                List<List<Chain>> splitNonPolyModels = new ArrayList<>();
3156                for (int i=0; i<nonPolyModels.size(); i++) {
3157                        List<Chain> nonPolyModel = nonPolyModels.get(i);
3158                        List<Chain> waterModel = waterModels.get(i);
3159
3160                        List<Chain> splitNonPolys = new ArrayList<>();
3161                        splitNonPolyModels.add(splitNonPolys);
3162
3163                        for (Chain nonPoly:nonPolyModel) {
3164                                List<List<Chain>> splits = splitNonPolyChain(nonPoly);
3165                                splitNonPolys.addAll(splits.get(0));
3166                                waterModel.addAll(splits.get(1));
3167                        }
3168                }
3169
3170
3171                // now we have all chains as in mmcif, let's assign ids following the mmcif rules
3172                assignAsymIds(polyModels, splitNonPolyModels, waterModels);
3173
3174
3175                if (!entities.isEmpty()) {
3176                        // if the file contained COMPOUND records then we can assign entities to the poly chains
3177                        for (EntityInfo comp : entities){
3178                        List<String> chainIds = compoundMolIds2chainIds.get(comp.getMolId());
3179                        if ( chainIds == null)
3180                                continue;
3181                        for ( String chainId : chainIds) {
3182
3183                                        List<List<Chain>> models = findChains(chainId, polyModels);
3184
3185                                        for (List<Chain> matchingChains:models) {
3186                                                for (Chain chain:matchingChains) {
3187                                                        comp.addChain(chain);
3188                                                        chain.setEntityInfo(comp);
3189                                                }
3190
3191                                                if (matchingChains.isEmpty()) {
3192                                        // usually if this happens something is wrong with the PDB header
3193                                        // e.g. 2brd - there is no Chain A, although it is specified in the header
3194                                        // Some bona-fide cases exist, e.g. 2ja5, chain N is described in SEQRES
3195                                        // but the authors didn't observe in the density so it's completely missing
3196                                        // from the ATOM lines
3197                                                        logger.warn("Could not find polymeric chain {} to link to entity {}. The chain will be missing in the entity.", chainId, comp.getMolId());
3198                                                }
3199                                        }
3200                                }
3201                        }
3202
3203                } else {
3204
3205                        logger.info("Entity information (COMPOUND record) not found in file. Will assign entities heuristically");
3206                        // if no entity information was present in file we then go and find the entities heuristically with EntityFinder
3207                        entities = EntityFinder.findPolyEntities(polyModels);
3208
3209                }
3210
3211                // now we assign entities to the nonpoly and water chains
3212                EntityFinder.createPurelyNonPolyEntities(splitNonPolyModels, waterModels, entities);
3213
3214
                // In some rare cases purely non-polymer or purely water chains are present in PDB files,
                // see https://github.com/biojava/biojava/pull/394
                // These cases should be covered by the above.
3218
3219
3220                // now that we have entities in chains we add the chains to the structure
3221
3222                for (int i=0;i<allModels.size();i++) {
3223                        List<Chain> model = new ArrayList<>();
3224                        model.addAll(polyModels.get(i));
3225                        model.addAll(splitNonPolyModels.get(i));
3226                        model.addAll(waterModels.get(i));
3227                        structure.addModel(model);
3228                        }
3229
3230
3231        }
3232
3233        /**
3234         * Links the Sites in the siteMap to the Groups in the Structure via the
3235         * siteToResidueMap ResidueNumber.
3236         * @author Jules Jacobsen
3237         * @return
3238         */
3239        private void linkSitesToGroups() {
3240
3241                //System.out.println("LINK SITES TO GROUPS:" + siteToResidueMap.keySet().size());
3242
3243                //link the map of siteIds : <ResidueNumber> with the sites by using ResidueNumber to get the correct group back.
3244                //the return list
3245
3246                if ( siteMap == null || siteToResidueMap == null){
3247                        logger.info("Sites can not be linked to residues!");
3248
3249                        return;
3250                }
3251
3252                List<Site> sites = null;
3253                //check that there are chains with which to associate the groups
3254                if (structure.getChains().isEmpty()) {
3255                        sites = new ArrayList<Site>(siteMap.values());
3256                        logger.info("No chains to link Site Groups with - Sites will not be present in the Structure");
3257                        return;
3258                }
3259
3260                //check that the keys in the siteMap and SiteToResidueMap are equal
3261                if (! siteMap.keySet().equals(siteToResidueMap.keySet())) {
3262                        logger.info("Not all sites have been properly described in the PDB " + pdbId + " header - some Sites will not be present in the Structure");
3263                        logger.debug(siteMap.keySet() + " | " + siteToResidueMap.keySet());
3264                        //return;
3265                }
3266
3267                //so we have chains - associate the siteResidues-related groups with the ones
3268                //already in in the chains
3269                for (String key : siteMap.keySet()) {
3270                        Site currentSite = siteMap.get(key);
3271                        List<ResidueNumber> linkedGroups = siteToResidueMap.get(key);
3272                        if ( linkedGroups == null)
3273                                continue;
3274                        for (ResidueNumber residueNumber : linkedGroups) {
3275
3276                                String pdbCode = residueNumber.toString();
3277                                String chain = residueNumber.getChainName();
3278                                //                    System.out.println("chain: '" + chain + "'");
3279                                //                    String resNum = resNum.getSeqNum().toString();
3280                                //                    System.out.println("resNum: '" + resNum + "'");
3281
3282                                Group linkedGroup = null;
3283                                try {
3284                                        //TODO: implement findGroup(ResidueNumber resNum)
3285                                        linkedGroup = structure.findGroup(chain, pdbCode);
3286                                } catch (StructureException ex) {
3287                                        logger.info("Can't find group " + pdbCode + " in chain " + chain + " in order to link up SITE records (PDB ID " + pdbId +")");
3288                                        continue;
3289                                }
3290
3291                                //                    System.out.println("Adding group: " + linkedGroup.getSeqNum() + " to site " + site.getSiteID());
3292                                currentSite.getGroups().add(linkedGroup);
3293                        }
3294                }
3295
3296                //System.out.println("SITEMAP: " + siteMap);
3297
3298                sites = new ArrayList<Site>(siteMap.values());
3299                structure.setSites(sites);
3300                //System.out.println("STRUCTURE SITES: " + structure.getSites().size());
3301                //            for (Site site : structure.getSites()) {
3302                //                System.out.println(site);
3303                //            }
3304                //            System.out.println("Linked Site Groups with Chains");
3305
3306        }
3307
3308        private void buildjournalArticle() {
3309
3310                logger.debug("building new JournalArticle");
3311                //            for (String line : journalLines) {
3312                //                System.out.println(line);
3313                //            }
3314
3315                this.journalArticle = new JournalArticle();
3316                //        JRNL        AUTH   M.HAMMEL,G.SFYROERA,D.RICKLIN,P.MAGOTTI,
3317                //        JRNL        AUTH 2 J.D.LAMBRIS,B.V.GEISBRECHT
3318                //        JRNL        TITL   A STRUCTURAL BASIS FOR COMPLEMENT INHIBITION BY
3319                //        JRNL        TITL 2 STAPHYLOCOCCUS AUREUS.
3320                //        JRNL        REF    NAT.IMMUNOL.                  V.   8   430 2007
3321                //        JRNL        REFN                   ISSN 1529-2908
3322                //        JRNL        PMID   17351618
3323                //        JRNL        DOI    10.1038/NI1450
3324                StringBuffer auth = new StringBuffer();
3325                StringBuffer titl = new StringBuffer();
3326                StringBuffer edit = new StringBuffer();
3327                StringBuffer ref = new StringBuffer();
3328                StringBuffer publ = new StringBuffer();
3329                StringBuffer refn = new StringBuffer();
3330                StringBuffer pmid = new StringBuffer();
3331                StringBuffer doi = new StringBuffer();
3332
3333                for (String line : journalLines) {
3334                        if ( line.length() < 19 ) {
3335                                logger.info("can not process Journal line: " + line);
3336                                continue;
3337                        }
3338                        //            System.out.println("'" + line + "'");
3339                        String subField = line.substring(12, 16);
3340                        //            System.out.println("'" + subField + "'");
3341                        if (subField.equals("AUTH")) {
3342                                auth.append(line.substring(19, line.length()).trim());
3343
3344                                logger.debug("AUTH '" + auth.toString() + "'");
3345
3346                        }
3347                        if (subField.equals("TITL")) {
3348                                //add a space to the end of a line so that when wrapped the
3349                                //words on the join won't be concatenated
3350                                titl.append(line.substring(19, line.length()).trim()).append(" ");
3351
3352                                logger.debug("TITL '" + titl.toString() + "'");
3353
3354                        }
3355                        if (subField.equals("EDIT")) {
3356                                edit.append(line.substring(19, line.length()).trim());
3357
3358                                logger.debug("EDIT '" + edit.toString() + "'");
3359
3360                        }
3361                        //        JRNL        REF    NAT.IMMUNOL.                  V.   8   430 2007
3362                        if (subField.equals("REF ")) {
3363                                ref.append(line.substring(19, line.length()).trim()).append(" ");
3364
3365                                logger.debug("REF '" + ref.toString() + "'");
3366
3367                        }
3368                        if (subField.equals("PUBL")) {
3369                                publ.append(line.substring(19, line.length()).trim()).append(" ");
3370
3371                                logger.debug("PUBL '" + publ.toString() + "'");
3372
3373                        }
3374                        //        JRNL        REFN                   ISSN 1529-2908
3375                        if (subField.equals("REFN")) {
3376                                if ( line.length() < 35 ) {
3377                                        logger.info("can not process Journal REFN line: " + line);
3378                                        continue;
3379                                }
3380                                refn.append(line.substring(35, line.length()).trim());
3381
3382                                logger.debug("REFN '" + refn.toString() + "'");
3383
3384                        }
3385                        //        JRNL        PMID   17351618
3386                        if (subField.equals("PMID")) {
3387                                pmid.append(line.substring(19, line.length()).trim());
3388
3389                                logger.debug("PMID '" + pmid.toString() + "'");
3390
3391                        }
3392                        //        JRNL        DOI    10.1038/NI1450
3393                        if (subField.equals("DOI ")) {
3394                                doi.append(line.substring(19, line.length()).trim());
3395
3396                                logger.debug("DOI '" + doi.toString() + "'");
3397
3398                        }
3399                }
3400
3401                //now set the parts of the JournalArticle
3402                journalArticle.setAuthorList(authorBuilder(auth.toString()));
3403                journalArticle.setEditorList(authorBuilder(edit.toString()));
3404                journalArticle.setRef(ref.toString());
3405                JournalParser journalParser = new JournalParser(ref.toString());
3406                journalArticle.setJournalName(journalParser.getJournalName());
3407                if (!journalArticle.getJournalName().equals("TO BE PUBLISHED")) {
3408                        journalArticle.setIsPublished(true);
3409                }
3410                journalArticle.setVolume(journalParser.getVolume());
3411                journalArticle.setStartPage(journalParser.getStartPage());
3412                journalArticle.setPublicationDate(journalParser.getPublicationDate());
3413                journalArticle.setPublisher(publ.toString().trim());
3414                journalArticle.setTitle(titl.toString().trim());
3415                journalArticle.setRefn(refn.toString().trim());
3416                journalArticle.setPmid(pmid.toString().trim());
3417                journalArticle.setDoi(doi.toString().trim());
3418
3419
3420                logger.debug("Made JournalArticle:");
3421                logger.debug(journalArticle.toString());
3422
3423        }
3424
3425        //inner class to deal with all the journal info
3426        private class JournalParser {
3427
3428                private String journalName;
3429                private String volume;
3430                private String startPage;
3431                private int publicationDate;
3432
3433
3434                public JournalParser(String ref) {
3435
3436                        logger.debug("JournalParser init '" + ref + "'");
3437
3438
3439                        if (ref.equals("TO BE PUBLISHED ")) {
3440                                journalName = ref.trim();
3441
3442                                logger.debug(String.format("JournalParser found journalString '%s'", journalName));
3443
3444                                return;
3445                        }
3446
3447                        if (ref.length() < 48) {
3448                                logger.info("REF line too short - must be at least 48 characters to be valid for parsing.");
3449                                journalName = "";
3450                                volume = "";
3451                                startPage = "";
3452                                publicationDate = 0;
3453                                return;
3454                        }
3455                        //can be multi line:
3456                        //REF    PHILOS.TRANS.R.SOC.LONDON,    V. 293    53 1981
3457                        //REF  2 SER.B
3458
3459                        //or
3460
3461                        //REF    GLYCOGEN PHOSPHORYLASE B:                1 1991
3462                        //REF  2 DESCRIPTION OF THE PROTEIN
3463                        //REF  3 STRUCTURE
3464
3465                        //but usually single line
3466                        //REF    NUCLEIC ACIDS RES.                         2009
3467                        //REF    MOL.CELL                                   2009
3468                        //REF    NAT.STRUCT.MOL.BIOL.          V.  16   238 2009
3469                        //REF    ACTA CRYSTALLOGR.,SECT.F      V.  65   199 2009
3470                        //check if the date is present at the end of the line.
3471                        //                             09876543210987654321
3472                        //'J.BIOL.CHEM.                  V. 280 23000 2005 '
3473                        //'J.AM.CHEM.SOC.                V. 130 16011 2008 '
3474                        //'NAT.STRUCT.MOL.BIOL.          V.  16   238 2009'
3475                        String volumeInformation = ref.substring(30, 48);
3476
3477                        logger.debug(String.format("Parsing volumeInformation: '%s'", volumeInformation));
3478
3479                        //volumeInformation: 'V. 293    53 1981 '
3480                        //                      String dateString = ref.substring(ref.length() - 5 , ref.length() - 1).trim();
3481                        //                      String startPageString = ref.substring(ref.length() - 11 , ref.length() - 6).trim();
3482                        //                      String volumeString = ref.substring(ref.length() - 16 , ref.length() - 12).trim();
3483                        //                      String journalString = ref.substring(0 , ref.length() - 18).trim();
3484                        String dateString = volumeInformation.substring(volumeInformation.length() - 5 , volumeInformation.length() - 1).trim();
3485                        String startPageString = volumeInformation.substring(volumeInformation.length() - 11 , volumeInformation.length() - 6).trim();
3486                        String volumeString = volumeInformation.substring(volumeInformation.length() - 16 , volumeInformation.length() - 12).trim();
3487                        //for the journal string we need to remove the volume information which might be in the middle of the string (e.g. 1gpb, 3pfk)
3488                        String journalString = ref.substring(0 , 29).trim() + " " + ref.substring(30, ref.length() - 1).replace(volumeInformation.trim(), "").trim();
3489                        journalString = journalString.trim();
3490                        //                        System.out.println("journalString: " + journalString);
3491
3492                        logger.debug(String.format("JournalParser found volumeString '%s'", volumeString));
3493                        logger.debug(String.format("JournalParser found startPageString '%s'", startPageString));
3494                        logger.debug(String.format("JournalParser found dateString '%s'", dateString));
3495                        logger.debug(String.format("JournalParser found journalString '%s'", journalString));
3496
3497
3498                        if (!dateString.equals("    ")) {
3499                                try {
3500                                        publicationDate = Integer.valueOf(dateString);
3501                                } catch (NumberFormatException nfe) {
3502                                        logger.info(dateString + " is not a valid integer for a date in JRNL sub-section REF line 1");
3503                                }
3504                                //                              if (DEBUG) {
3505                                //                                      System.out.println("JournalParser set date " + publicationDate);
3506                                //                              }
3507                        }
3508
3509                        if (!startPageString.equals("    ")) {
3510                                startPage = startPageString;
3511                                //                              if (DEBUG) {
3512                                //                                      System.out.println("JournalParser set startPage " + startPage);
3513                                //                              }
3514                        }
3515
3516                        if (!volumeString.equals("    ")) {
3517                                volume = volumeString;
3518                                //                              if (DEBUG) {
3519                                //                                      System.out.println("JournalParser set volume " + volume);
3520                                //                              }
3521                        }
3522
3523                        if (!journalString.equals("    ")) {
3524                                journalName = journalString;
3525
3526                                logger.debug("JournalParser set journalName " + journalName);
3527
3528                        }
3529                }
3530
3531                private String getJournalName() {
3532                        return journalName;
3533                }
3534
3535                private int getPublicationDate() {
3536                        return publicationDate;
3537                }
3538
3539                private String getStartPage() {
3540                        return startPage;
3541                }
3542
3543                private String getVolume() {
3544                        return volume;
3545                }
3546        }
3547
3548        private List<Author> authorBuilder(String authorString) {
3549                ArrayList<Author> authorList = new ArrayList<Author>();
3550
3551                if (authorString.equals("")) {
3552                        return authorList;
3553                }
3554
3555                String[] authors = authorString.split(",");
3556                //        if (DEBUG) {
3557                //            for (int i = 0; i < authors.length; i++) {
3558                //                String string = authors[i];
3559                //                System.out.println("authorBuilder author: '" + string + "'");
3560                //            }
3561                //        }
3562                //        AUTH   SEATTLE STRUCTURAL GENOMICS CENTER FOR INFECTIOUS
3563                //        AUTH 2 DISEASE (SSGCID)
3564                //        or
3565                //        AUTH   E.DOBROVETSKY,A.DONG,A.SEITOVA,B.DUNCAN,L.CROMBET,
3566                //        AUTH 2 M.SUNDSTROM,C.H.ARROWSMITH,A.M.EDWARDS,C.BOUNTRA,
3567                //        AUTH 3 A.BOCHKAREV,D.COSSAR,
3568                //        AUTH 4 STRUCTURAL GENOMICS CONSORTIUM (SGC)
3569                //        or
3570                //        AUTH   T.-C.MOU,S.R.SPRANG,N.MASADA,D.M.F.COOPER
3571                if (authors.length == 1) {
3572                        //only one element means it's a consortium only
3573                        Author author = new Author();
3574                        author.setSurname(authors[0]);
3575
3576                        logger.debug("Set consortium author name " + author.getSurname());
3577
3578                        authorList.add(author);
3579                } else {
3580                        for (int i = 0; i < authors.length; i++) {
3581                                String authorFullName = authors[i];
3582
3583                                logger.debug("Building author " + authorFullName);
3584
3585                                Author author = new Author();
3586                                String regex = "\\.";
3587                                String[] authorNames = authorFullName.split(regex);
3588                                //                if (DEBUG) {
3589                                //                    System.out.println("authorNames size " + authorNames.length);
3590                                //                    for (int j = 0; j < authorNames.length; j++) {
3591                                //                        String name = authorNames[j];
3592                                //                        System.out.println("split authName '" + name + "'");
3593                                //
3594                                //                    }
3595                                //                }
3596                                if (authorNames.length == 0) {
3597                                        author.setSurname(authorFullName);
3598
3599                                        logger.debug("Unable to split using '" + regex + "' Setting whole name " + author.getSurname());
3600
3601                                }
3602                                //again there might be a consortium name so there may be no elements
3603                                else if (authorNames.length == 1) {
3604                                        author.setSurname(authorNames[0]);
3605
3606                                        logger.debug("Set consortium author name in multiple author block " + author.getSurname
3607                                                                ());
3608
3609                                } else {
3610                                        String initials = "";
3611                                        for (int j = 0; j < authorNames.length - 1; j++) {
3612                                                String initial = authorNames[j];
3613                                                //                        if (DEBUG) {
3614                                                //                            System.out.println("adding initial '" + initial + "'");
3615                                                //                        }
3616                                                //build the initials back up again
3617                                                initials += initial + ".";
3618                                        }
3619
3620                                        logger.debug("built initials '" + initials + "'");
3621
3622                                        author.setInitials(initials);
3623                                        //surname is always last
3624                                        int lastName = authorNames.length - 1;
3625                                        String surname = authorNames[lastName];
3626
3627                                        logger.debug("built author surname " + surname);
3628
3629                                        author.setSurname(surname);
3630
3631                                }
3632                                authorList.add(author);
3633                        }
3634                }
3635                return authorList;
3636        }
3637
3638        public void setFileParsingParameters(FileParsingParameters params)
3639        {
3640                this.params= params;
3641
3642                // set the correct max values for parsing...
3643                loadMaxAtoms = params.getMaxAtoms();
3644                atomCAThreshold = params.getAtomCaThreshold();
3645
3646
3647        }
3648
3649        public FileParsingParameters getFileParsingParameters(){
3650                return params;
3651        }
3652
3653
3654}