Source code

001/*
002 *
003 * This code may be freely distributed and modified under the
004 * terms of the GNU Lesser General Public Licence.  This should
005 * be distributed with the code.  If you do not have a copy,
006 * see:
007 *
008 *      http://www.gnu.org/copyleft/lesser.html
009 *
010 * Copyright for this code is held jointly by the individual
011 * authors.  These should be listed in @author doc comments.
012 *
013 * For more information on the BioJava project and its aims,
014 * or to join the biojava-l mailing list, visit the home page
015 * at:
016 *
017 *      http://www.biojava.org/
018 *
019 * Created on 16.03.2004
020 *
021 */
022package org.biojava.nbio.structure.io;
023
024import static java.lang.Math.min;
025
026import java.io.BufferedReader;
027import java.io.IOException;
028import java.io.InputStream;
029import java.io.InputStreamReader;
030import java.text.DateFormat;
031import java.text.ParseException;
032import java.text.SimpleDateFormat;
033import java.util.ArrayList;
034import java.util.Arrays;
035import java.util.Date;
036import java.util.HashMap;
037import java.util.Iterator;
038import java.util.LinkedHashMap;
039import java.util.List;
040import java.util.Locale;
041import java.util.Map;
042import java.util.StringTokenizer;
043import java.util.regex.Matcher;
044import java.util.regex.Pattern;
045
046import javax.vecmath.Matrix4d;
047
048import org.biojava.nbio.structure.AminoAcid;
049import org.biojava.nbio.structure.AminoAcidImpl;
050import org.biojava.nbio.structure.Atom;
051import org.biojava.nbio.structure.AtomImpl;
052import org.biojava.nbio.structure.Author;
053import org.biojava.nbio.structure.Chain;
054import org.biojava.nbio.structure.ChainImpl;
055import org.biojava.nbio.structure.DBRef;
056import org.biojava.nbio.structure.Element;
057import org.biojava.nbio.structure.EntityInfo;
058import org.biojava.nbio.structure.EntityType;
059import org.biojava.nbio.structure.Group;
060import org.biojava.nbio.structure.GroupIterator;
061import org.biojava.nbio.structure.HetatomImpl;
062import org.biojava.nbio.structure.JournalArticle;
063import org.biojava.nbio.structure.NucleotideImpl;
064import org.biojava.nbio.structure.PDBCrystallographicInfo;
065import org.biojava.nbio.structure.PDBHeader;
066import org.biojava.nbio.structure.PdbId;
067import org.biojava.nbio.structure.ResidueNumber;
068import org.biojava.nbio.structure.Site;
069import org.biojava.nbio.structure.Structure;
070import org.biojava.nbio.structure.StructureException;
071import org.biojava.nbio.structure.StructureImpl;
072import org.biojava.nbio.structure.StructureTools;
073import org.biojava.nbio.structure.chem.ChemCompAtom;
074import org.biojava.nbio.structure.chem.ChemCompGroupFactory;
075import org.biojava.nbio.structure.io.util.PDBTemporaryStorageUtils.LinkRecord;
076import org.biojava.nbio.structure.secstruc.SecStrucInfo;
077import org.biojava.nbio.structure.secstruc.SecStrucType;
078import org.biojava.nbio.structure.xtal.CrystalCell;
079import org.biojava.nbio.structure.xtal.SpaceGroup;
080import org.biojava.nbio.structure.xtal.SymoplibParser;
081import org.slf4j.Logger;
082import org.slf4j.LoggerFactory;
083
084
085/**
086 * This class implements the actual PDB file parsing. Do not access it directly, but
087 * via the PDBFileReader class.
088 *
089 * <h2>Parsing</h2>
090 *
091 * During the PDBfile parsing several Flags can be set. See the {@link #setFileParsingParameters(FileParsingParameters)} methods.
092 *
093 *
094 * <p>
095 * To prevent excessive memory usage for large PDB files, there is the ATOM_CA_THRESHOLD.
096 * If more Atoms than this threshold are being parsed in a PDB file, the parser will automatically
097 * switch to a C-alpha only representation.
098 *
099 * <p>
100 * The result of the parsing of the PDB file is a new {@link Structure} object.
101 *
102 * <p>
103 * For more documentation on how to work with the Structure API please
104 * see <a href="http://biojava.org/wiki/BioJava:CookBook#Protein_Structure" target="_top">
105 * http://biojava.org/wiki/BioJava:CookBook#Protein_Structure</a>
106 *
107 *
108 *
109 *
110 * <h2>Example</h2>
111 * <p>
112 * Q: How can I get a Structure object from a PDB file?
113 * <p>
114 * A:
115 * <pre>
116 * public {@link Structure} loadStructure(String pathToPDBFile){
117 *      // The PDBFileParser is wrapped by the PDBFileReader
118 *      {@link PDBFileReader} pdbreader = new {@link PDBFileReader}();
119 *
120 *      {@link Structure} structure = null;
121 *      try{
122 *              structure = pdbreader.getStructure(pathToPDBFile);
123 *              System.out.println(structure);
124 *      } catch (IOException e) {
125 *              e.printStackTrace();
126 *      }
127 *      return structure;
128 * }
129 * </pre>
130 *
131 *
132 * @author Andreas Prlic
133 * @author Jules Jacobsen
134 * @author Jose Duarte
135 * @since 1.4
136 */
137public class PDBFileParser  {
138
139
140
        private static final Logger logger = LoggerFactory.getLogger(PDBFileParser.class);

        // for printing
        private static final String NEWLINE = System.getProperty("line.separator");


        // required for parsing:
        // pdbId is assigned by pdb_HEADER_Handler from columns 63-66 of the HEADER record
        private String pdbId; //the actual id of the entry
        // The constructor leaves structure null; pdb_HEADER_Handler calls
        // structure.setPdbId(..), so it has to be assigned before HEADER is handled.
        // NOTE(review): the assignment is not visible in this part of the file - confirm.
        private Structure     structure;
        private List<List<Chain>> allModels; // a temp data structure to keep all models
        private List<Chain>   currentModel; // contains the ATOM records for each model
        private Chain         currentChain;
        private Group         currentGroup;

        private List<Chain>   seqResChains; // contains all the chains for the SEQRES records
        //we're going to work on the assumption that the files are current -
        //if the pdb_HEADER_Handler detects a legacy format, this will be changed to true.
        //(the HEADER handler does so when the entry id is repeated at columns 73-76)
        //if true then lines will be truncated at 72 characters in certain cases
        //(pdb_COMPOUND_handler for example)
        private boolean isLegacyFormat = false;

        private boolean blankChainIdsPresent = false;

        // for re-creating the biological assembly
        private PDBBioAssemblyParser bioAssemblyParser = null;

        // pdbHeader collects classification, deposition date, id and authors
        // (see pdb_HEADER_Handler and pdb_AUTHOR_Handler)
        private PDBHeader pdbHeader;
        private PDBCrystallographicInfo crystallographicInfo;
        private JournalArticle journalArticle;
        private List<Map<String, Integer>> connects ;
        // One map per HELIX / SHEET / TURN record. Each map holds the raw column
        // text under the keys initResName, initChainId, initSeqNum, initICode,
        // endResName, endChainId, endSeqNum and endICode.
        private List<Map<String,String>> helixList;
        private List<Map<String,String>> strandList;
        private List<Map<String,String>> turnList;

        private int lengthCheck ;

        private boolean isLastCompndLine = false;
        private boolean isLastSourceLine = false;
        private EntityInfo current_compound;
        private List<EntityInfo> entities = new ArrayList<>();
        private HashMap<Integer,List<String>> compoundMolIds2chainIds = new HashMap<>();
        private List<String> compndLines = new ArrayList<>();
        private List<String> sourceLines = new ArrayList<>();
        private List<String> journalLines = new ArrayList<>();
        private List<String> keywordsLines = new ArrayList<>();
        private List<DBRef> dbrefs;
        // NOTE(review): the constructor overwrites this initial map with null,
        // so the initializer below has no lasting effect for a new parser.
        private Map<String, Site> siteMap = new LinkedHashMap<>();
        private Map<String, List<ResidueNumber>> siteToResidueMap = new LinkedHashMap<>();

        private List<SSBondImpl> ssbonds = new ArrayList<>();

        // for storing LINK until we have all the atoms parsed
        private List<LinkRecord> linkRecords;

        private Matrix4d currentNcsOp;
        private List<Matrix4d> ncsOperators;

        // for parsing COMPOUND and SOURCE Header lines
        private int prevMolId;
        private String previousContinuationField;
        private String continuationField;
        private String continuationString;

        // Created in the constructor with pattern "dd-MMM-yy" and Locale.US;
        // used to parse the dates of HEADER and REVDAT records.
        private DateFormat dateFormat;

        // for rfree parsing
        // NOTE(review): -1 presumably marks "no value parsed yet" - confirm against the REMARK handling.
        private float rfreeStandardLine = -1;
        private float rfreeNoCutoffLine = -1;
209
        // Field labels (each ending in ':') listed for COMPND records.
        // NOTE(review): the code consuming this list is not visible here - presumably the COMPND parsing; confirm.
        private static  final List<String> compndFieldValues = new ArrayList<>(
                        Arrays.asList(
                                        "MOL_ID:", "MOLECULE:", "CHAIN:", "SYNONYM:",
                                        "EC:", "FRAGMENT:", "ENGINEERED:", "MUTATION:",
                                        "BIOLOGICAL_UNIT:", "OTHER_DETAILS:"
                                        ));


        // Labels to be ignored in COMPND records, including known misspellings.
        // NOTE(review): "FRAGMENT," ends in a comma rather than a colon - presumably it
        // matches a malformed entry in the same way as ENGINEEREED below; confirm.
        private static final List<String> ignoreCompndFieldValues = new ArrayList<>(
                        Arrays.asList(
                                        "HETEROGEN:","ENGINEEREED:","FRAGMENT,",
                                        "MUTANT:","SYNTHETIC:"
                                        ));
        // ENGINEEREED in pdb219d

        // Field labels (each ending in ':') listed for SOURCE records.
        // NOTE(review): the consumer is not visible here - presumably the SOURCE parsing; confirm.
        private static final List<String> sourceFieldValues = new ArrayList<>(
                        Arrays.asList("ENGINEERED:", "MOL_ID:", "SYNTHETIC:", "FRAGMENT:",
                                        "ORGANISM_SCIENTIFIC:", "ORGANISM_COMMON:",
                                        "ORGANISM_TAXID:","STRAIN:",
                                        "VARIANT:", "CELL_LINE:", "ATCC:", "ORGAN:", "TISSUE:",
                                        "CELL:", "ORGANELLE:", "SECRETION:", "GENE:",
                                        "CELLULAR_LOCATION:", "EXPRESSION_SYSTEM:",
                                        "EXPRESSION_SYSTEM_TAXID:",
                                        "EXPRESSION_SYSTEM_STRAIN:", "EXPRESSION_SYSTEM_VARIANT:",
                                        "EXPRESSION_SYSTEM_CELL_LINE:",
                                        "EXPRESSION_SYSTEM_ATCC_NUMBER:",
                                        "EXPRESSION_SYSTEM_ORGAN:", "EXPRESSION_SYSTEM_TISSUE:",
                                        "EXPRESSION_SYSTEM_CELL:", "EXPRESSION_SYSTEM_ORGANELLE:",
                                        "EXPRESSION_SYSTEM_CELLULAR_LOCATION:",
                                        "EXPRESSION_SYSTEM_VECTOR_TYPE:",
                                        "EXPRESSION_SYSTEM_VECTOR:", "EXPRESSION_SYSTEM_PLASMID:",
                                        "EXPRESSION_SYSTEM_GENE:", "OTHER_DETAILS:"));
242
        // reset to 0 by the constructor
        private int atomCount;

        // parsing options:

        // copied from params.getAtomCaThreshold() in the constructor
        private int atomCAThreshold ;

        // copied from params.getMaxAtoms() in the constructor
        private int loadMaxAtoms;

        // starts out false (set in the constructor)
        private boolean atomOverflow;

        /** flag to tell parser to only read Calpha coordinates **/
        private boolean parseCAonly;


        // the constructor installs a default FileParsingParameters instance
        private FileParsingParameters params;

        // both flags start out true: at the beginning of a file the parser is
        // always at the start of a new molecule (and model)
        private boolean startOfMolecule;
        private boolean startOfModel;
260        private boolean startOfModel;
261
262        public PDBFileParser() {
263                params = new FileParsingParameters();
264
265                allModels = new ArrayList<>();
266                structure     = null           ;
267                currentModel  = null;
268                currentChain  = null;
269                currentGroup  = null;
270                // we initialise to true since at the beginning of the file we are always starting a new molecule
271                startOfMolecule = true;
272                startOfModel = true;
273
274
275                pdbHeader         = new PDBHeader();
276                crystallographicInfo = new PDBCrystallographicInfo();
277                connects      = new ArrayList<>() ;
278
279
280                helixList     = new ArrayList<>();
281                strandList    = new ArrayList<>();
282                turnList      = new ArrayList<>();
283                current_compound = null;
284                dbrefs        = new ArrayList<>();
285                siteMap = null;
286                dateFormat = new SimpleDateFormat("dd-MMM-yy", Locale.US);
287                atomCount = 0;
288                atomOverflow = false;
289                parseCAonly = false;
290
291                // this SHOULD not be done
292                // DONOT:setFileParsingParameters(params);
293                // set the correct max values for parsing...
294                loadMaxAtoms = params.getMaxAtoms();
295                atomCAThreshold = params.getAtomCaThreshold();
296
297                linkRecords = new ArrayList<>();
298
299                blankChainIdsPresent = false;
300
301        }
302
303        /** initiate new resNum, either Hetatom, Nucleotide, or AminoAcid */
304        private Group getNewGroup(String recordName,Character aminoCode1, String aminoCode3) {
305
306                Group g = ChemCompGroupFactory.getGroupFromChemCompDictionary(aminoCode3);
307                if ( g != null && !g.getChemComp().isEmpty())
308                        return g;
309
310
311                Group group;
312                if (aminoCode1 == null || StructureTools.UNKNOWN_GROUP_LABEL == aminoCode1 ){
313                        group = new HetatomImpl();
314
315                } else if(StructureTools.isNucleotide(aminoCode3))  {
316                        // it is a nucleotide
317                        NucleotideImpl nu = new NucleotideImpl();
318                        group = nu;
319
320                } else {
321                        AminoAcidImpl aa = new AminoAcidImpl() ;
322                        aa.setAminoType(aminoCode1);
323                        group = aa ;
324                }
325
326                //              System.out.println("new resNum type: "+ resNum.getType() );
327                return  group ;
328        }
329
330
331
332        // Handler methods to deal with PDB file records properly.
333        /**
334         Handler for
335         HEADER Record Format
336         <pre>
337         COLUMNS        DATA TYPE       FIELD           DEFINITION
338         ----------------------------------------------------------------------------------
339         1 -  6        Record name     "HEADER"
340         11 - 50        String(40)      classification  Classifies the molecule(s)
341         51 - 59        Date            depDate         Deposition date.  This is the date
342         the coordinates were received by
343         the PDB
344         63 - 66        IDcode          idCode          This identifier is unique within PDB
345        </pre>
346         */
347        private void pdb_HEADER_Handler(String line) {
348
349                String classification  = null;
350                String deposition_date = null;
351                String pdbCode         = null;
352
353                int len = line.trim().length();
354                if(len > 10) {
355                        classification  = line.substring (10, min(len,50)).trim() ;
356                        pdbHeader.setClassification(classification);
357                }
358                if(len > 50) {
359                        deposition_date = line.substring (50, min(len,59)).trim() ;
360                        try {
361                                Date dep = dateFormat.parse(deposition_date);
362                                pdbHeader.setDepDate(dep);
363
364                        } catch (ParseException e){
365                                logger.info("Could not parse deposition date string '"+deposition_date+"'. Will continue without deposition date");
366                        }
367                }
368                if(len > 62) {
369                        pdbCode         = line.substring (62, min(len,66)).trim() ;
370                        pdbId = pdbCode;
371
372                        logger.debug("Parsing entry {}", pdbId);
373
374                        PdbId pdbIdToSet;
375                        if(pdbCode.isBlank()) {
376                                pdbIdToSet = null;
377                        } else {
378                                try {
379                                        pdbIdToSet = new PdbId(pdbCode);
380                                } catch (IllegalArgumentException e) {
381                                        logger.warn("Malformed PDB ID {}. setting PdbId to null", pdbCode);
382                                        pdbIdToSet = null;
383                                }
384                        }
385                        structure.setPdbId(pdbIdToSet);
386                        pdbHeader.setPdbId(pdbIdToSet);
387                }
388
389                //*really* old files (you'll need to hunt to find these as they
390                //should have been remediated) have headers like below. Plus the
391                //pdbId at positions 72-76 is present in every line
392
393                //HEADER    PROTEINASE INHIBITOR (TRYPSIN)          05-OCT-84   5PTI      5PTI   3
394                //HEADER    TRANSFERASE (ACYLTRANSFERASE)           02-SEP-92   1LAC      1LAC   2
395                if (len > 66) {
396                        if (pdbId.equals(line.substring (72, 76))){
397                                isLegacyFormat = true;
398                                logger.warn(pdbId + " is a LEGACY entry - this will most likely not parse correctly.");
399                        }
400                }
401
402        }
403
404
405        /**
406         * Parses the following record:
407         * <pre>
408         *  COLUMNS      DATA  TYPE      FIELD         DEFINITION
409         * ------------------------------------------------------------------------------------
410         *  1 -  6      Record name     "AUTHOR"
411         *  9 - 10      Continuation    continuation  Allows concatenation of multiple records.
412         * 11 - 79      List            authorList    List of the author names, separated
413         *                                            by commas.
414         *
415         * </pre>
416         * @param line
417         */
418        private void pdb_AUTHOR_Handler(String line) {
419
420                String authors = line.substring(10).trim();
421
422                String auth = pdbHeader.getAuthors();
423                if (auth == null){
424                        pdbHeader.setAuthors(authors);
425                } else {
426                        auth +=  authors;
427                        pdbHeader.setAuthors(auth);
428                }
429
430        }
431
432
433
434        /**
435         * Parses the following record:
436         *
437         * <pre>
438         * COLUMNS       DATA TYPE        FIELD        DEFINITION
439         * --------------------------------------------------------------------
440         *  1 -  6       Record name      "HELIX "
441         *  8 - 10       Integer          serNum       Serial number of the helix.
442         *                                             This starts at 1 and increases
443         *                                             incrementally.
444         * 12 - 14       LString(3)       helixID      Helix identifier. In addition
445         *                                             to a serial number, each helix is
446         *                                             given an alphanumeric character
447         *                                             helix identifier.
448         * 16 - 18       Residue name     initResName  Name of the initial residue.
449         * 20            Character        initChainID  Chain identifier for the chain
450         *                                             containing this helix.
451         * 22 - 25       Integer          initSeqNum   Sequence number of the initial
452         *                                             residue.
453         * 26            AChar            initICode    Insertion code of the initial
454         *                                             residue.
455         * 28 - 30       Residue name     endResName   Name of the terminal residue of
456         *                                             the helix.
457         * 32            Character        endChainID   Chain identifier for the chain
458         *                                             containing this helix.
459         * 34 - 37       Integer          endSeqNum    Sequence number of the terminal
460         *                                             residue.
461         * 38            AChar            endICode     Insertion code of the terminal
462         *                                             residue.
463         * 39 - 40       Integer          helixClass   Helix class (see below).
464         * 41 - 70       String           comment      Comment about this helix.
465         * 72 - 76       Integer          length       Length of this helix.
466         * </pre>
467         */
468        private void pdb_HELIX_Handler(String line){
469
470                if (params.isHeaderOnly()) return;
471
472                if (line.length()<38) {
473                        logger.info("HELIX line has length under 38. Ignoring it.");
474                        return;
475                }
476
477                String initResName = line.substring(15,18).trim();
478                String initChainId = line.substring(19,20);
479                String initSeqNum  = line.substring(21,25).trim();
480                String initICode   = line.substring(25,26);
481                String endResName  = line.substring(27,30).trim();
482                String endChainId  = line.substring(31,32);
483                String endSeqNum   = line.substring(33,37).trim();
484                String endICode    = line.substring(37,38);
485
486                //System.out.println(initResName + " " + initChainId + " " + initSeqNum + " " + initICode + " " +
487                //        endResName + " " + endChainId + " " + endSeqNum + " " + endICode);
488
489                Map<String,String> m = new HashMap<>();
490
491                m.put("initResName",initResName);
492                m.put("initChainId", initChainId);
493                m.put("initSeqNum", initSeqNum);
494                m.put("initICode", initICode);
495                m.put("endResName", endResName);
496                m.put("endChainId", endChainId);
497                m.put("endSeqNum",endSeqNum);
498                m.put("endICode",endICode);
499
500                helixList.add(m);
501
502        }
503
504        /**
505         * Handler for
506         * <pre>
507         *       COLUMNS     DATA TYPE        FIELD           DEFINITION
508         * --------------------------------------------------------------
509         *  1 -  6     Record name      "SHEET "
510         *  8 - 10     Integer          strand       Strand number which starts at 1
511         *                                           for each strand within a sheet
512         *                                           and increases by one.
513         * 12 - 14     LString(3)       sheetID      Sheet identifier.
514         * 15 - 16     Integer          numStrands   Number of strands in sheet.
515         * 18 - 20     Residue name     initResName  Residue name of initial residue.
516         * 22          Character        initChainID  Chain identifier of initial
517         *                                           residue in strand.
518         * 23 - 26     Integer          initSeqNum   Sequence number of initial
519         *                                           residue in strand.
520         * 27          AChar            initICode    Insertion code of initial residue
521         *                                           in strand.
522         * 29 - 31     Residue name     endResName   Residue name of terminal residue.
523         * 33          Character        endChainID   Chain identifier of terminal
524         *                                           residue.
525         * 34 - 37     Integer          endSeqNum    Sequence number of terminal
526         *                                           residue.
527         * 38          AChar            endICode     Insertion code of terminal
528         *                                           residue.
529         * 39 - 40     Integer          sense        Sense of strand with respect to
530         *                                           previous strand in the sheet. 0
531         *                                           if first strand, 1 if parallel,
532         *                                           -1 if anti-parallel.
533         * 42 - 45     Atom             curAtom      Registration. Atom name in
534         *                                           current strand.
535         * 46 - 48     Residue name     curResName   Registration. Residue name in
536         *                                           current strand.
537         * 50          Character        curChainId   Registration. Chain identifier in
538         *                                           current strand.
539         * 51 - 54     Integer          curResSeq    Registration. Residue sequence
540         *                                           number in current strand.
541         * 55          AChar            curICode     Registration. Insertion code in
542         *                                           current strand.
543         * 57 - 60     Atom             prevAtom     Registration. Atom name in
544         *                                           previous strand.
545         * 61 - 63     Residue name     prevResName  Registration. Residue name in
546         *                                           previous strand.
547         * 65          Character        prevChainId  Registration. Chain identifier in
548         *                                           previous strand.
549         * 66 - 69     Integer          prevResSeq   Registration. Residue sequence
550         *                                           number in previous strand.
551         * 70          AChar            prevICode    Registration. Insertion code in
552         *                                               previous strand.
553         * </pre>
554         */
555        private void pdb_SHEET_Handler( String line){
556
557                if (params.isHeaderOnly()) return;
558
559                if (line.length()<38) {
560                        logger.info("SHEET line has length under 38. Ignoring it.");
561                        return;
562                }
563
564                String initResName = line.substring(17,20).trim();
565                String initChainId = line.substring(21,22);
566                String initSeqNum  = line.substring(22,26).trim();
567                String initICode   = line.substring(26,27);
568                String endResName  = line.substring(28,31).trim();
569                String endChainId  = line.substring(32,33);
570                String endSeqNum   = line.substring(33,37).trim();
571                String endICode    = line.substring(37,38);
572
573                //System.out.println(initResName + " " + initChainId + " " + initSeqNum + " " + initICode + " " +
574                //        endResName + " " + endChainId + " " + endSeqNum + " " + endICode);
575
576                Map<String,String> m = new HashMap<>();
577
578                m.put("initResName",initResName);
579                m.put("initChainId", initChainId);
580                m.put("initSeqNum", initSeqNum);
581                m.put("initICode", initICode);
582                m.put("endResName", endResName);
583                m.put("endChainId", endChainId);
584                m.put("endSeqNum",endSeqNum);
585                m.put("endICode",endICode);
586
587                strandList.add(m);
588        }
589
590
591        /**
592         * Handler for TURN lines
593         * <pre>
594         * COLUMNS      DATA TYPE        FIELD         DEFINITION
595         * --------------------------------------------------------------------
596         *  1 -  6      Record name      "TURN "
597         *  8 - 10      Integer          seq           Turn number; starts with 1 and
598         *                                             increments by one.
599         * 12 - 14      LString(3)       turnId        Turn identifier
600         * 16 - 18      Residue name     initResName   Residue name of initial residue in
601         *                                             turn.
602         * 20           Character        initChainId   Chain identifier for the chain
603         *                                             containing this turn.
604         * 21 - 24      Integer          initSeqNum    Sequence number of initial residue
605         *                                             in turn.
606         * 25           AChar            initICode     Insertion code of initial residue
607         *                                             in turn.
608         * 27 - 29      Residue name     endResName    Residue name of terminal residue
609         *                                             of turn.
610         * 31           Character        endChainId    Chain identifier for the chain
611         *                                             containing this turn.
612         * 32 - 35      Integer          endSeqNum     Sequence number of terminal
613         *                                             residue of turn.
614         * 36           AChar            endICode      Insertion code of terminal residue
615         *                                             of turn.
616         * 41 - 70      String           comment       Associated comment.
617         * </pre>
618         * @param line
619         */
620        private void pdb_TURN_Handler( String line){
621
622                if (params.isHeaderOnly()) return;
623
624                if (line.length()<36) {
625                        logger.info("TURN line has length under 36. Ignoring it.");
626                        return;
627                }
628
629                String initResName = line.substring(15,18).trim();
630                String initChainId = line.substring(19,20);
631                String initSeqNum  = line.substring(20,24).trim();
632                String initICode   = line.substring(24,25);
633                String endResName  = line.substring(26,29).trim();
634                String endChainId  = line.substring(30,31);
635                String endSeqNum   = line.substring(31,35).trim();
636                String endICode    = line.substring(35,36);
637
638                //System.out.println(initResName + " " + initChainId + " " + initSeqNum + " " + initICode + " " +
639                //        endResName + " " + endChainId + " " + endSeqNum + " " + endICode);
640
641                Map<String,String> m = new HashMap<>();
642
643                m.put("initResName",initResName);
644                m.put("initChainId", initChainId);
645                m.put("initSeqNum", initSeqNum);
646                m.put("initICode", initICode);
647                m.put("endResName", endResName);
648                m.put("endChainId", endChainId);
649                m.put("endSeqNum",endSeqNum);
650                m.put("endICode",endICode);
651
652                turnList.add(m);
653        }
654
655        /**
656         * Handler for
657         * REVDAT Record format:
658         * <pre>
659         *
660         * COLUMNS       DATA TYPE      FIELD         DEFINITION
661         * ----------------------------------------------------------------------------------
662         * 1 -  6       Record name    "REVDAT"
663         * 8 - 10       Integer        modNum        Modification number.
664         * 11 - 12       Continuation   continuation  Allows concatenation of multiple
665         * records.
666         * 14 - 22       Date           modDate       Date of modification (or release for
667         * new entries).  This is not repeated
668         * on continuation lines.
669         * 24 - 28       String(5)      modId         Identifies this particular
670         * modification.  It links to the
671         * archive used internally by PDB.
672         * This is not repeated on continuation
673         * lines.
674         * 32            Integer        modType       An integer identifying the type of
675         * modification.  In case of revisions
676         * with more than one possible modType,
677         * the highest value applicable will be
678         * assigned.
679         * 40 - 45       LString(6)     record        Name of the modified record.
680         * 47 - 52       LString(6)     record        Name of the modified record.
681         * 54 - 59       LString(6)     record        Name of the modified record.
682         * 61 - 66       LString(6)     record        Name of the modified record.
683         * </pre>
684         */
685        private void pdb_REVDAT_Handler(String line) {
686
687                // keep the first as latest modified date and the last as release date
688                Date modDate = pdbHeader.getModDate();
689
690                if ( modDate==null || modDate.equals(new Date(0)) ) {
691
692                        // modified date is still uninitialized
693                        String modificationDate = line.substring (13, 22).trim() ;
694
695                        try {
696                                Date dep = dateFormat.parse(modificationDate);
697                                pdbHeader.setModDate(dep);
698                                pdbHeader.setRelDate(dep);
699                        } catch (ParseException e){
700                                logger.info("Could not parse revision date string '"+modificationDate+"'. ");
701                        }
702
703                } else {
704
705                        // set as the release date
706                        String releaseDate = line.substring (13, 22).trim() ;
707
708                        try {
709                                Date dep = dateFormat.parse(releaseDate);
710                                pdbHeader.setRelDate(dep);
711                        } catch (ParseException e){
712                                logger.info("Could not parse revision date string '"+releaseDate+"'. ");
713                        }
714                }
715        }
716
717        /**
718         * Handler for
719         * SEQRES record format
720         * SEQRES records contain the amino acid or nucleic acid sequence of residues in each chain of the macromolecule that was studied.
721         * <p>
722         * Record Format:
723         * <p>
724         * <pre>
725         * COLUMNS        DATA TYPE       FIELD         DEFINITION
726         * ---------------------------------------------------------------------------------
727         * 1 -  6        Record name     "SEQRES"
728         * 9 - 10        Integer         serNum        Serial number of the SEQRES record
729         * for the current chain.  Starts at 1
730         * and increments by one each line.
731         * Reset to 1 for each chain.
732         * 12             Character       chainID       Chain identifier.  This may be any
733         * single legal character, including a
734         * blank which is used if there is
735         * only one chain.
736         * 14 - 17        Integer         numRes        Number of residues in the chain.
737         * This value is repeated on every
738         * record.
739         * 20 - 22        Residue name    resName       Residue name.
740         * 24 - 26        Residue name    resName       Residue name.
741         * 28 - 30        Residue name    resName       Residue name.
742         * 32 - 34        Residue name    resName       Residue name.
743         * 36 - 38        Residue name    resName       Residue name.
744         * 40 - 42        Residue name    resName       Residue name.
745         * 44 - 46        Residue name    resName       Residue name.
746         * 48 - 50        Residue name    resName       Residue name.
747         * 52 - 54        Residue name    resName       Residue name.
748         * 56 - 58        Residue name    resName       Residue name.
749         * 60 - 62        Residue name    resName       Residue name.
750         * 64 - 66        Residue name    resName       Residue name.
751         * 68 - 70        Residue name    resName       Residue name.
752         * </pre>
753         * @author Jules Jacobsen
754         */
755        private void pdb_SEQRES_Handler(String line) {
756
757                /*
758                 *          1         2         3         4         5         6         7
759                 * 1234567890123456789012345678901234567890123456789012345678901234567890
760                 * SEQRES   1 A  376  LYS PRO VAL THR VAL LYS LEU VAL ASP SER GLN ALA THR
761                 * SEQRES   1 A   21  GLY ILE VAL GLU GLN CYS CYS THR SER ILE CYS SER LEU
762                 * SEQRES   2 A   21  TYR GLN LEU GLU ASN TYR CYS ASN
763                 * SEQRES   1 B   30  PHE VAL ASN GLN HIS LEU CYS GLY SER HIS LEU VAL GLU
764                 * SEQRES   2 B   30  ALA LEU TYR LEU VAL CYS GLY GLU ARG GLY PHE PHE TYR
765                 * SEQRES   3 B   30  THR PRO LYS ALA
766                 * SEQRES   1 C   21  GLY ILE VAL GLU GLN CYS CYS THR SER ILE CYS SER LEU
767                 * SEQRES   2 C   21  TYR GLN LEU GLU ASN TYR CYS ASN
768                 * SEQRES   1 D   30  PHE VAL ASN GLN HIS LEU CYS GLY SER HIS LEU VAL GLU
769                 * SEQRES   2 D   30  ALA LEU TYR LEU VAL CYS GLY GLU ARG GLY PHE PHE TYR
770                 * SEQRES   3 D   30  THR PRO LYS ALA
771                 */
772
773                String recordName = line.substring(0, 6).trim();
774                String chainID    = line.substring(11, 12);
775                String newLength   = line.substring(13,17).trim();
776                String subSequence = line.substring(18);
777
778                if ( lengthCheck == -1 ){
779                        lengthCheck = Integer.parseInt(newLength);
780                }
781
782                StringTokenizer subSequenceResidues = new StringTokenizer(subSequence);
783
784                Character aminoCode1 = null;
785                if (! recordName.equals(AminoAcid.SEQRESRECORD)) {
786                        // should not have been called
787                        return;
788                }
789
790                currentChain = isKnownChain(chainID, seqResChains);
791                if ( currentChain == null) {
792
793                        currentChain = new ChainImpl();
794                        currentChain.setId(chainID);
795                        currentChain.setName(chainID);
796
797                }
798
799                while (subSequenceResidues.hasMoreTokens()) {
800
801                        String threeLetter = subSequenceResidues.nextToken();
802
803                        aminoCode1 = StructureTools.get1LetterCode(threeLetter);
804
805                        //if (aminoCode1 == null) {
806                        // could be a nucleotide...
807                        // but getNewGroup takes care of that and converts ATOM records with aminoCode1 == nnull to nucleotide...
808                        //}
809                        currentGroup = getNewGroup("ATOM", aminoCode1, threeLetter);
810
811                        currentGroup.setPDBName(threeLetter);
812
813                        if ( currentGroup instanceof AminoAcid){
814                                AminoAcid aa = (AminoAcid)currentGroup;
815                                aa.setRecordType(AminoAcid.SEQRESRECORD);
816                        }
817                        // add the current resNum to the new chain.
818                        currentChain.addGroup(currentGroup);
819
820                }
821                Chain test = isKnownChain(chainID, seqResChains);
822
823                if ( test == null)
824                        seqResChains.add(currentChain);
825
826                if (currentGroup != null)
827                        currentGroup.trimToSize();
828
829                currentGroup = null;
830                currentChain = null;
831
832                //               the current chain is finished!
833                //if ( current_chain.getLength() != lengthCheck ){
834                //      System.err.println("the length of chain " + current_chain.getName() + "(" +
835                //                      current_chain.getLength() + ") does not match the expected " + lengthCheck);
836                //}
837
838                lengthCheck = Integer.parseInt(newLength);
839
840        }
841
842
843
844        /**
845         * Handler for
846         * TITLE Record Format
847         * <pre>
848         COLUMNS        DATA TYPE       FIELD          DEFINITION
849         ----------------------------------------------------------------------------------
850         1 -  6        Record name     "TITLE "
851         9 - 10        Continuation    continuation   Allows concatenation of multiple
852         records.
853         11 - 70        String          title          Title of the experiment.
854         * </pre>
855         *
856         */
857        private void pdb_TITLE_Handler(String line) {
858                String title;
859                if ( line.length() > 79)
860                        title = line.substring(10,80).trim();
861                else
862                        title = line.substring(10,line.length()).trim();
863
864                String t = pdbHeader.getTitle();
865                if ( (t != null) && (! "".equals(t)) ){
866                        if (t.endsWith("-"))
867                                t += ""; // if last line ends with a hyphen then we don't add space
868                        else
869                                t += " ";
870                }
871                else t = "";
872
873                t += title;
874
875                pdbHeader.setTitle(t);
876        }
877
878        /**
879         * JRNL handler.
880         * The JRNL record contains the primary literature citation that describes the experiment which resulted
881         * in the deposited coordinate set. There is at most one JRNL reference per entry. If there is no primary
882         * reference, then there is no JRNL reference. Other references are given in REMARK 1.
883         *
884         * Record Format
885         * <pre>
886         * COLUMNS       DATA TYPE     FIELD         DEFINITION
887         * -----------------------------------------------------------------------
888         * 1 -  6       Record name   "JRNL  "
889         *
890         * 13 - 70       LString        text         See Details below.
891         * </pre>
892         */
893        private void pdb_JRNL_Handler(String line) {
894                //add the strings to the journalLines
895                //the actual JournalArticle is then built when the whole entry is being
896                //finalized with triggerEndFileChecks()
897                //JRNL        TITL   NMR SOLUTION STRUCTURE OF RECOMBINANT TICK           1TAP  10
898                if (line.substring(line.length() - 8, line.length() - 4).equals(pdbId)) {
899                        //trim off the trailing PDB id from legacy files.
900                        //are we really trying to still cater for these museum pieces?
901
902                        logger.debug("trimming legacy PDB id from end of JRNL section line");
903
904                        line = line.substring(0, line.length() - 8);
905                        journalLines.add(line);
906                } else {
907                        journalLines.add(line);
908                }
909        }
910
911        /**
912         * This should not be accessed directly, other than by </code>makeCompounds</code>. It still deals with the same
913         * lines in a similar manner but if not accessed from </code>makeCompounds</code> the last element will be
914         * missing. Don't say I didn't warn you.
915         *
916         * @param line
917         */
918        private void pdb_COMPND_Handler(String line) {
919
920                logger.debug("previousContinuationField  is {}", previousContinuationField);
921                logger.debug("current continuationField  is {}", continuationField);
922                logger.debug("current continuationString is {}", continuationString);
923                logger.debug("current compound           is {}", current_compound);
924
925
926                // In legacy PDB files the line ends with the PDB code and a serial number, chop those off!
927                //format version 3.0 onwards will have 80 characters in a line
928                //              if (line.length() > 72) {
929                if (isLegacyFormat) {
930                        //                    if (DEBUG) {
931                        //                        System.out.println("We have a legacy file - truncating line length to 71 characters:");
932                        //                        System.out.println(line);
933                        //                    }
934                        line = line.substring(0, 72);
935                }
936
937                line = line.substring(10, line.length());
938
939
940                String[] fieldList = line.trim().split("\\s+");
941                int fl = fieldList.length;
942                if (fl > 0) {
943                        String field0 = fieldList[0];
944                        if (compndFieldValues.contains(field0)) {
945                                continuationField = field0;
946                                if ("".equals(previousContinuationField)) {
947                                        previousContinuationField = continuationField;
948                                }
949                        } else if (field0.endsWith(";") && compndFieldValues.contains(field0.substring(0, field0.length()-1)) ) {
950                                // the ':' character indicates the end of a field name and should be invalid as part the first data token
951                                // e.g. obsolete file 1hhb has a malformed COMPND line that can only be caught with this kind of check
952                                // UPDATE: There is no harm of having a ':' in the first data token. e.g. 3fdj contains a ':'.
953                                //   The intended case occurs only if the token is a key followed by a colon and a semicolon without spaces, e.g. "COMPND   2 MOLECULE:;"
954                                logger.info("COMPND line does not follow the PDB 3.0 format. Note that COMPND parsing is not supported any longer in format 2.3 or earlier");
955                                return;
956                        }
957                } else {
958                        // the line will be added as data to the previous field
959                }
960
961
962                line = line.replace(continuationField, "").trim();
963
964                StringTokenizer compndTokens = new StringTokenizer(line);
965
966                //              System.out.println("PDBFileParser.pdb_COMPND_Handler: Tokenizing '" + line + "'");
967
968                while (compndTokens.hasMoreTokens()) {
969                        String token = compndTokens.nextToken();
970
971                        if ("".equals(previousContinuationField)) {
972                                previousContinuationField = continuationField;
973                        }
974
975                        if (previousContinuationField.equals(continuationField)
976                                        && compndFieldValues.contains(continuationField)) {
977
978                                logger.debug("Still in field {}", continuationField);
979                                logger.debug("token = {}", token);
980
981                                continuationString = continuationString.concat(token + " ");
982
983                                logger.debug("continuationString = {}", continuationString);
984
985                        }
986                        if (!continuationField.equals(previousContinuationField)) {
987
988                                if ("".equals(continuationString)) {
989                                        continuationString = token;
990
991                                } else {
992
993                                        compndValueSetter(previousContinuationField,
994                                                        continuationString);
995                                        previousContinuationField = continuationField;
996                                        continuationString = token + " ";
997                                }
998                        } else if (ignoreCompndFieldValues.contains(token)) {
999                                // this field shall be ignored
1000                                //continuationField = token;
1001                        }
1002                }
1003                if (isLastCompndLine) {
1004                        // final line in the section - finish off the compound
1005                        //                      System.out.println("[pdb_COMPND_Handler] Final COMPND line - Finishing off final MolID header.");
1006                        compndValueSetter(continuationField, continuationString);
1007                        continuationString = "";
1008                        if (current_compound!=null) entities.add(current_compound);
1009                }
1010        }
1011
1012        /**
1013         * Set the value in the current molId object
1014         * @param field
1015         * @param value
1016         */
1017        private void compndValueSetter(String field, String value) {
1018
1019                value = value.trim().replace(";", "");
1020                if ("MOL_ID:".equals(field)) {
1021
1022                        int i = -1;
1023                        try {
1024                                i = Integer.valueOf(value);
1025                        } catch (NumberFormatException e){
1026                                logger.warn("Value '{}' does not look like a number, while trying to parse COMPND MOL_ID line.",value);
1027                        }
1028                        if (i>0 && prevMolId!=i) {
1029
1030                                if (current_compound!=null) entities.add(current_compound);
1031
1032                                logger.debug("Initialising new Compound with mol_id {}", i);
1033
1034                                current_compound = new EntityInfo();
1035
1036                                current_compound.setMolId(i);
1037
1038                                // we will set polymer for all defined compounds in PDB file (non-polymer compounds are not defined in header) - JD 2016-03-25
1039                                current_compound.setType(EntityType.POLYMER);
1040
1041                                prevMolId = i;
1042                        }
1043
1044                }
1045
1046                // if for some reason (e.g. missing mol_id line) the current_compound is null we can't add anything to it, return
1047                if (current_compound==null) {
1048                        return;
1049                }
1050
1051                if ("MOLECULE:".equals(field)) {
1052                        current_compound.setDescription(value);
1053
1054                }
1055                if ("CHAIN:".equals(field)) {
1056                        //System.out.println(value);
1057                        StringTokenizer chainTokens = new StringTokenizer(value, ",");
1058                        List<String> chains = new ArrayList<>();
1059
1060                        while (chainTokens.hasMoreTokens()) {
1061                                String chainID = chainTokens.nextToken().trim();
1062                                // NULL is used in old PDB files to represent empty chain DI
1063                                if ("NULL".equals(chainID))
1064                                        chainID = " ";
1065                                chains.add(chainID);
1066                        }
1067                        compoundMolIds2chainIds.put(current_compound.getMolId(),chains);
1068
1069                }
1070                if ("SYNONYM:".equals(field)) {
1071
1072                        StringTokenizer synonyms = new StringTokenizer(value, ",");
1073                        List<String> names = new ArrayList<>();
1074
1075                        while (synonyms.hasMoreTokens()) {
1076                                names.add(synonyms.nextToken());
1077
1078                                current_compound.setSynonyms(names);
1079                        }
1080
1081                }
1082
1083                if ("EC:".equals(field)) {
1084
1085                        StringTokenizer ecNumTokens = new StringTokenizer(value, ",");
1086                        List<String> ecNums = new ArrayList<>();
1087
1088                        while (ecNumTokens.hasMoreTokens()) {
1089                                ecNums.add(ecNumTokens.nextToken());
1090
1091                                current_compound.setEcNums(ecNums);
1092                        }
1093
1094                }
1095                if ("FRAGMENT:".equals(field)) {
1096
1097                        current_compound.setFragment(value);
1098
1099                }
1100                if ("ENGINEERED:".equals(field)) {
1101
1102                        current_compound.setEngineered(value);
1103
1104                }
1105                if ("MUTATION:".equals(field)) {
1106
1107                        current_compound.setMutation(value);
1108
1109                }
1110                if ("BIOLOGICAL_UNIT:".equals(field)) {
1111
1112                        current_compound.setBiologicalUnit(value);
1113
1114                }
1115                if ("OTHER_DETAILS:".equals(field)) {
1116
1117                        current_compound.setDetails(value);
1118
1119                }
1120
1121        }
1122
1123
1124        /**
1125         * Handler for
1126         * SOURCE Record format
1127         *
1128         * The SOURCE record specifies the biological and/or chemical source of each biological molecule in the entry. Sources are described by both the common name and the scientific name, e.g., genus and species. Strain and/or cell-line for immortalized cells are given when they help to uniquely identify the biological entity studied.
1129         * Record Format
1130         * <pre>
1131         * COLUMNS   DATA TYPE         FIELD          DEFINITION
1132         * -------------------------------------------------------------------------------
1133         *  1 -  6   Record name       "SOURCE"
1134         *  9 - 10   Continuation      continuation   Allows concatenation of multiple records.
1135         * 11 - 70   Specification     srcName        Identifies the source of the macromolecule in
1136         *            list                            a token: value format.
1137         * </pre>
1138         * @param line the line to be parsed
1139         */
1140        private void pdb_SOURCE_Handler(String line) {
1141                // works in the same way as the pdb_COMPND_Handler.
1142                String continuationNr = line.substring(9, 10).trim();
1143
1144
1145
1146                logger.debug("current continuationNo     is {}", continuationNr);
1147                logger.debug("previousContinuationField  is {}", previousContinuationField);
1148                logger.debug("current continuationField  is {}", continuationField);
1149                logger.debug("current continuationString is {}", continuationString);
1150                logger.debug("current compound           is {}", current_compound);
1151
1152
1153                // following the docs, the last valid character should be 79, chop off the rest
1154                if (line.length() > 79) {
1155                        line = line.substring(0, 79);
1156                }
1157
1158                line = line.substring(10, line.length());
1159
1160                logger.debug("LINE: >{}<", line);
1161
1162                String[] fieldList = line.split("\\s+");
1163
1164                if (!"".equals(fieldList[0])
1165                                && sourceFieldValues.contains(fieldList[0])) {
1166                        //                      System.out.println("[PDBFileParser.pdb_COMPND_Handler] Setting continuationField to '" + fieldList[0] + "'");
1167                        continuationField = fieldList[0];
1168                        if ("".equals(previousContinuationField)) {
1169                                previousContinuationField = continuationField;
1170                        }
1171
1172                } else if ((fieldList.length > 1) && ( sourceFieldValues.contains(fieldList[1]))) {
1173                        //                      System.out.println("[PDBFileParser.pdb_COMPND_Handler] Setting continuationField to '" + fieldList[1] + "'");
1174                        continuationField = fieldList[1];
1175                        if ("".equals(previousContinuationField)) {
1176                                previousContinuationField = continuationField;
1177                        }
1178
1179                } else {
1180                        if ("".equals(continuationNr)) {
1181
1182                                logger.debug("looks like an old PDB file");
1183
1184                                continuationField = "MOLECULE:";
1185                                if ("".equals(previousContinuationField)) {
1186                                        previousContinuationField = continuationField;
1187                                }
1188                        }
1189
1190                }
1191
1192                line = line.replace(continuationField, "").trim();
1193
1194                StringTokenizer compndTokens = new StringTokenizer(line);
1195
1196                //              System.out.println("PDBFileParser.pdb_COMPND_Handler: Tokenizing '" + line + "'");
1197
1198                while (compndTokens.hasMoreTokens()) {
1199                        String token = compndTokens.nextToken();
1200
1201                        if ("".equals(previousContinuationField)) {
1202                                //                              System.out.println("previousContinuationField is empty. Setting to : " + continuationField);
1203                                previousContinuationField = continuationField;
1204                        }
1205
1206                        if (previousContinuationField.equals(continuationField)
1207                                        && sourceFieldValues.contains(continuationField)) {
1208
1209                                logger.debug("Still in field {}", continuationField);
1210
1211                                continuationString = continuationString.concat(token + " ");
1212
1213                                logger.debug("continuationString = {}", continuationString);
1214                        }
1215                        if (!continuationField.equals(previousContinuationField)) {
1216
1217                                if ("".equals(continuationString)) {
1218                                        continuationString = token;
1219
1220                                } else {
1221
1222                                        sourceValueSetter(previousContinuationField,
1223                                                        continuationString);
1224                                        previousContinuationField = continuationField;
1225                                        continuationString = token + " ";
1226                                }
1227                        } else if (ignoreCompndFieldValues.contains(token)) {
1228                                // this field shall be ignored
1229                                //continuationField = token;
1230                        }
1231                }
1232                if (isLastSourceLine) {
1233                        // final line in the section - finish off the compound
1234                        //                      System.out.println("[pdb_SOURCE_Handler] Final SOURCE line - Finishing off final MolID header.");
1235                        sourceValueSetter(continuationField, continuationString);
1236                        continuationString = "";
1237                        //compounds.add(current_compound);
1238                }
1239
1240        }
1241
1242
1243        /**
1244         * Set the value in the current molId object
1245         *
1246         * @param field
1247         * @param value
1248         */
1249        private void sourceValueSetter(String field, String value) {
1250
1251                value = value.trim().replace(";", "");
1252                //              System.out.println("[sourceValueSetter] " + field);
1253                if ("MOL_ID:".equals(field)) {
1254
1255                        try {
1256                                current_compound = entities.get(Integer.valueOf(value) - 1);
1257                        } catch (NumberFormatException e){
1258                                logger.info("could not process SOURCE MOL_ID record correctly:" + e.getMessage());
1259                                return;
1260                        }
1261
1262
1263                        //                      System.out.println("[sourceValueSetter] Fetching compound " + value + " " + current_compound.getMolId());
1264
1265                }
1266                if ("SYNTHETIC:".equals(field)) {
1267                        current_compound.setSynthetic(value);
1268                } else if ("FRAGMENT:".equals(field)) {
1269                        current_compound.setFragment(value);
1270                } else if ("ORGANISM_SCIENTIFIC:".equals(field)) {
1271                        current_compound.setOrganismScientific(value);
1272                } else if ("ORGANISM_TAXID:".equals(field)) {
1273                        current_compound.setOrganismTaxId(value);
1274                } else if ("ORGANISM_COMMON:".equals(field)) {
1275                        current_compound.setOrganismCommon(value);
1276                } else if ("STRAIN:".equals(field)) {
1277                        current_compound.setStrain(value);
1278                } else if ("VARIANT:".equals(field)) {
1279                        current_compound.setVariant(value);
1280                } else if ("CELL_LINE:".equals(field)) {
1281                        current_compound.setCellLine(value);
1282                } else if ("ATCC:".equals(field)) {
1283                        current_compound.setAtcc(value);
1284                } else if ("ORGAN:".equals(field)) {
1285                        current_compound.setOrgan(value);
1286                } else if ("TISSUE:".equals(field)) {
1287                        current_compound.setTissue(value);
1288                } else if ("CELL:".equals(field)) {
1289                        current_compound.setCell(value);
1290                } else if ("ORGANELLE:".equals(field)) {
1291                        current_compound.setOrganelle(value);
1292                } else if ("SECRETION:".equals(field)) {
1293                        current_compound.setSecretion(value);
1294                } else if ("GENE:".equals(field)) {
1295                        current_compound.setGene(value);
1296                } else if ("CELLULAR_LOCATION:".equals(field)) {
1297                        current_compound.setCellularLocation(value);
1298                } else if ("EXPRESSION_SYSTEM:".equals(field)) {
1299                        current_compound.setExpressionSystem(value);
1300                } else if ("EXPRESSION_SYSTEM_TAXID:".equals(field)) {
1301                        current_compound.setExpressionSystemTaxId(value);
1302                } else if ("EXPRESSION_SYSTEM_STRAIN:".equals(field)) {
1303                        current_compound.setExpressionSystemStrain(value);
1304                } else if ("EXPRESSION_SYSTEM_VARIANT:".equals(field)) {
1305                        current_compound.setExpressionSystemVariant(value);
1306                } else if ("EXPRESSION_SYSTEM_CELL_LINE:".equals(field)) {
1307                        current_compound.setExpressionSystemCellLine(value);
1308                } else if ("EXPRESSION_SYSTEM_ATCC_NUMBER:".equals(field)) {
1309                        current_compound.setExpressionSystemAtccNumber(value);
1310                } else if ("EXPRESSION_SYSTEM_ORGAN:".equals(field)) {
1311                        current_compound.setExpressionSystemOrgan(value);
1312                } else if ("EXPRESSION_SYSTEM_TISSUE:".equals(field)) {
1313                        current_compound.setExpressionSystemTissue(value);
1314                } else if ("EXPRESSION_SYSTEM_CELL:".equals(field)) {
1315                        current_compound.setExpressionSystemCell(value);
1316                } else if ("EXPRESSION_SYSTEM_ORGANELLE:".equals(field)) {
1317                        current_compound.setExpressionSystemOrganelle(value);
1318                } else if ("EXPRESSION_SYSTEM_CELLULAR_LOCATION:".equals(field)) {
1319                        current_compound.setExpressionSystemCellularLocation(value);
1320                } else if ("EXPRESSION_SYSTEM_VECTOR_TYPE:".equals(field)) {
1321                        current_compound.setExpressionSystemVectorType(value);
1322                } else if ("EXPRESSION_SYSTEM_VECTOR:".equals(field)) {
1323                        current_compound.setExpressionSystemVector(value);
1324                } else if ("EXPRESSION_SYSTEM_PLASMID:".equals(field)) {
1325                        current_compound.setExpressionSystemPlasmid(value);
1326                } else if ("EXPRESSION_SYSTEM_GENE:".equals(field)) {
1327                        current_compound.setExpressionSystemGene(value);
1328                } else if ("OTHER_DETAILS:".equals(field)) {
1329                        current_compound.setExpressionSystemOtherDetails(value);
1330                }
1331
1332        }
1333
1334        /**
1335         * Handler for REMARK lines
1336         */
1337        private void pdb_REMARK_Handler(String line) {
1338
1339                if ( line == null || line.length() < 11)
1340                        return;
1341
1342
1343                if (line.startsWith("REMARK 800")) {
1344                        pdb_REMARK_800_Handler(line);
1345
1346                } else if ( line.startsWith("REMARK 350")){
1347
1348                        if ( params.isParseBioAssembly()) {
1349
1350                                if (bioAssemblyParser == null){
1351                                        bioAssemblyParser = new PDBBioAssemblyParser();
1352                                }
1353
1354                                bioAssemblyParser.pdb_REMARK_350_Handler(line);
1355                        }
1356                } else if (line.startsWith("REMARK   2")) {
1357                        //REMARK   2 RESOLUTION.
1358                        Pattern pR = Pattern.compile("^REMARK   2 RESOLUTION.\\s+(\\d+\\.\\d+)\\s+ANGSTROMS\\..*");
1359                        handleResolutionLine(line, pR);
1360
1361                // REMARK 3 (for R free)
1362                // note: if more than 1 value present (occurring in hybrid experimental technique entries, e.g. 3ins, 4n9m)
1363                // then last one encountered will be taken
1364                } else if (line.startsWith("REMARK   3   FREE R VALUE")) {
1365
1366                        // Rfree annotation is not very consistent in PDB format, it varies depending on the software
1367                        // Here we follow this strategy:
1368                        // a) take the '(NO CUTOFF)' value if the only one available (shelx software, e.g. 1x7q)
1369                        // b) don't take it if also a line without '(NO CUTOFF)' is present (CNX software, e.g. 3lak)
1370
1371                        Pattern pR = Pattern.compile("^REMARK   3   FREE R VALUE\\s+(?:\\(NO CUTOFF\\))?\\s+:\\s+(\\d?\\.\\d+).*");
1372                        Matcher mR = pR.matcher(line);
1373                        if (mR.matches()) {
1374                                try {
1375                                        rfreeNoCutoffLine = Float.parseFloat(mR.group(1));
1376                                } catch (NumberFormatException e) {
1377                                        logger.info("Rfree value "+mR.group(1)+" does not look like a number, will ignore it");
1378                                }
1379                        }
1380                        pR = Pattern.compile("^REMARK   3   FREE R VALUE\\s+:\\s+(\\d?\\.\\d+).*");
1381                        mR = pR.matcher(line);
1382                        if (mR.matches()) {
1383                                try {
1384                                        rfreeStandardLine = Float.parseFloat(mR.group(1));
1385                                } catch (NumberFormatException e) {
1386                                        logger.info("Rfree value '{}' does not look like a number, will ignore it", mR.group(1));
1387                                }
1388                        }
1389
1390                // REMARK 3 RESOLUTION (contains more info than REMARK 2, for instance multiple resolutions in hybrid experimental technique entries)
1391                // note: if more than 1 value present (occurring in hybrid experimental technique entries, e.g. 3ins, 4n9m)
1392                // then last one encountered will be taken
1393                } else if (line.startsWith("REMARK   3   RESOLUTION RANGE HIGH")){
1394                        Pattern pR = Pattern.compile("^REMARK   3   RESOLUTION RANGE HIGH \\(ANGSTROMS\\) :\\s+(\\d+\\.\\d+).*");
1395                        handleResolutionLine(line, pR);
1396                } else if (line.startsWith("REMARK   3   EFFECTIVE RESOLUTION")){
1397                        Pattern pR = Pattern.compile("^REMARK   3   EFFECTIVE RESOLUTION \\(ANGSTROMS\\)\\s+:\\s+(\\d+\\.\\d+).*");
1398                        handleResolutionLine(line, pR);
1399                }
1400        }
1401
1402        public void handleResolutionLine(String line, Pattern pR) {
1403                Matcher mR = pR.matcher(line);
1404                if (mR.matches()) {
1405                        final String resString = mR.group(1);
1406                        try {
1407                                float res = Float.parseFloat(resString);
1408                                final float resInHeader = pdbHeader.getResolution();
1409                                if (resInHeader!=PDBHeader.DEFAULT_RESOLUTION && resInHeader != res) {
1410                                        logger.warn("More than 1 resolution value present, will use last one {} and discard previous {} "
1411                                                        ,resString, String.format("%4.2f",resInHeader));
1412                                }
1413                                pdbHeader.setResolution(res);
1414                        } catch (NumberFormatException e) {
1415                                logger.info("Could not parse resolution '{}', ignoring it",resString);
1416                        }
1417                }
1418        }
1419
1420
1421
1422
1423
1424
1425        /**
1426         * Handler for
1427         * EXPDTA Record Format
1428        <pre>
1429         COLUMNS       DATA TYPE      FIELD         DEFINITION
1430         -------------------------------------------------------------------------------
1431         1 -  6       Record name    "EXPDTA"
1432         9 - 10       Continuation   continuation  Allows concatenation of multiple
1433         records.
1434         11 - 70       SList          technique     The experimental technique(s) with
1435         optional comment describing the
1436         sample or experiment.
1437
1438         allowed techniques are:
1439         ELECTRON DIFFRACTION
1440         FIBER DIFFRACTION
1441         FLUORESCENCE TRANSFER
1442         NEUTRON DIFFRACTION
1443         NMR
1444         THEORETICAL MODEL
1445         X-RAY DIFFRACTION
1446        </pre>
1447         */
1448        private void pdb_EXPDTA_Handler(String line) {
1449
1450                String technique  ;
1451                if (line.length() > 69)
1452                        technique = line.substring (10, 70).trim() ;
1453                else
1454                        technique = line.substring(10).trim();
1455
1456                for (String singleTechnique: technique.split(";\\s+")) {
1457                        pdbHeader.setExperimentalTechnique(singleTechnique);
1458                }
1459
1460
1461        }
1462
1463        /**
1464         * Handler for
1465         * CRYST1 Record Format
1466         * The CRYST1 record presents the unit cell parameters, space group, and Z value.
1467         * If the entry describes a structure determined by a technique other than X-ray crystallography,
1468         * CRYST1 contains a = b = c = 1.0, alpha = beta = gamma = 90 degrees, space group = P 1, and Z =1.
1469         * <pre>
1470         * COLUMNS DATA TYPE    FIELD          DEFINITION
1471         * -------------------------------------------------------------
1472         *  1 - 6  Record name  "CRYST1"
1473         *  7 - 15 Real(9.3)    a              a (Angstroms).
1474         * 16 - 24 Real(9.3)    b              b (Angstroms).
1475         * 25 - 33 Real(9.3)    c              c (Angstroms).
1476         * 34 - 40 Real(7.2)    alpha          alpha (degrees).
1477         * 41 - 47 Real(7.2)    beta           beta (degrees).
1478         * 48 - 54 Real(7.2)    gamma          gamma (degrees).
1479         * 56 - 66 LString      sGroup         Space group.
1480         * 67 - 70 Integer      z              Z value.
1481         * </pre>
1482         */
1483        private void pdb_CRYST1_Handler(String line) {
1484                // for badly formatted files (e.g. phenix-produced ones), there's no z and the min length is 58 (e.g. for SG 'P 1')
1485                if (line.length() < 58) {
1486                        logger.warn("CRYST1 record has fewer than 58 columns: will ignore it");
1487                        return;
1488                }
1489
1490                float a;
1491                float b;
1492                float c;
1493                float alpha;
1494                float beta;
1495                float gamma;
1496                String spaceGroup = "";
1497
1498                try {
1499                        a = Float.parseFloat(line.substring(6,15).trim());
1500                        b = Float.parseFloat(line.substring(15,24).trim());
1501                        c = Float.parseFloat(line.substring(24,33).trim());
1502                        alpha = Float.parseFloat(line.substring(33,40).trim());
1503                        beta = Float.parseFloat(line.substring(40,47).trim());
1504                        gamma = Float.parseFloat(line.substring(47,54).trim());
1505                } catch (NumberFormatException e) {
1506                        logger.info("could not parse CRYST1 record ("+e.getMessage()+") from line and ignoring it " + line);
1507                        return ;
1508                }
1509                if (line.length()>=66) {
1510                        // for well formatted files
1511                        spaceGroup = line.substring(55,66).trim();
1512                } else {
1513                        // for not-so-well formatted files, e.g. phenix-produced ones: they lack a Z value
1514                        spaceGroup = line.substring(55,line.length()).trim();
1515                }
1516
1517                CrystalCell xtalCell = new CrystalCell();
1518                xtalCell.setA(a);
1519                xtalCell.setB(b);
1520                xtalCell.setC(c);
1521                xtalCell.setAlpha(alpha);
1522                xtalCell.setBeta(beta);
1523                xtalCell.setGamma(gamma);
1524
1525                if (!xtalCell.isCellReasonable()) {
1526                        // If the entry describes a structure determined by a technique other than X-ray crystallography,
1527                        // CRYST1 contains a = b = c = 1.0, alpha = beta = gamma = 90 degrees, space group = P 1, and Z =1.
1528                        // if so we don't add the crystal cell and it remains null
1529                        logger.debug("The crystal cell read from file does not have reasonable dimensions (at least one dimension is below {}), discarding it.",
1530                                        CrystalCell.MIN_VALID_CELL_SIZE);
1531                } else {
1532                        crystallographicInfo.setCrystalCell(xtalCell);
1533                }
1534
1535                SpaceGroup sg = SymoplibParser.getSpaceGroup(spaceGroup);
1536                if (sg==null) {
1537                        logger.warn("Space group '"+spaceGroup+"' not recognised as a standard space group");
1538                        crystallographicInfo.setNonStandardSg(true);
1539                } else {
1540                        crystallographicInfo.setSpaceGroup(sg);
1541                        crystallographicInfo.setNonStandardSg(false);
1542                }
1543        }
1544
1545        /**
1546         * Handler for MTRIXn records. They specify extra NCS operators (usually in virus entries)
1547         *
1548         * See http://www.wwpdb.org/documentation/format33/sect8.html#MTRIXn
1549         * <pre>
1550         * COLUMNS        DATA TYPE     FIELD         DEFINITION
1551         * -------------------------------------------------------------
1552         *
1553         *  1 -  6        Record name   "MTRIXn"      n=1, 2, or 3
1554         *  8 - 10        Integer       serial        Serial number.
1555         * 11 - 20        Real(10.6)    m[n][1]       Mn1
1556         * 21 - 30        Real(10.6)    m[n][2]       Mn2
1557         * 31 - 40        Real(10.6)    m[n][3]       Mn3
1558         * 46 - 55        Real(10.5)    v[n]          Vn
1559         * 60             Integer       iGiven        1
1560         *
1561         * </pre>
1562         * Note that we ignore operators with iGiven==1
1563         *
1564         * @param line
1565         */
1566        private void pdb_MTRIXn_Handler(String line) {
1567
1568                // don't process incomplete records
1569                if (line.length() < 55) {
1570                        logger.info("MTRIXn record has fewer than 55 columns: will ignore it");
1571                        return;
1572                }
1573
1574
1575                try {
1576
1577                        int rowIndex = Integer.parseInt(line.substring(5,6));
1578                        double col1Value = Double.parseDouble(line.substring(10,20));
1579                        double col2Value = Double.parseDouble(line.substring(20,30));
1580                        double col3Value = Double.parseDouble(line.substring(30,40));
1581                        double translValue = Double.parseDouble(line.substring(45,55));
1582                        int iGiven = 0;
1583                        if (line.length()>=60 && !line.substring(59,60).trim().isEmpty()) {
1584                                iGiven = Integer.parseInt(line.substring(59,60));
1585                        }
1586
1587                        if (iGiven == 1) return;
1588
1589                        if (ncsOperators==null) {
1590                                // we initialise on first pass
1591                                ncsOperators = new ArrayList<Matrix4d>();
1592                        }
1593
1594                        if (currentNcsOp==null) {
1595                                currentNcsOp = new Matrix4d(1,0,0,0,  0,1,0,0,  0,0,1,0,  0,0,0,1); // initialised to identity
1596                        }
1597
1598                        currentNcsOp.setElement(rowIndex-1, 0, col1Value);
1599                        currentNcsOp.setElement(rowIndex-1, 1, col2Value);
1600                        currentNcsOp.setElement(rowIndex-1, 2, col3Value);
1601                        currentNcsOp.setElement(rowIndex-1, 3, translValue);
1602
1603
1604                        if (rowIndex==3) {
1605                                ncsOperators.add(currentNcsOp);
1606                                // we initialise for next matrix to come
1607                                currentNcsOp = new Matrix4d(1,0,0,0,  0,1,0,0,  0,0,1,0,  0,0,0,1); // initialised to identity
1608                        }
1609
1610                } catch (NumberFormatException e) {
1611                        logger.info("Could not parse a number in MTRIXn record ("+e.getMessage()+") from line: >" + line+"<");
1612                }
1613        }
1614
1615        /**
1616         * Handler for ATOM.
1617         * Record Format:
1618         *
1619         * <pre>
1620         * ATOM      1  N   ASP A  15     110.964  24.941  59.191  1.00 83.44           N
1621         *
1622         * COLUMNS        DATA TYPE       FIELD         DEFINITION
1623         * ---------------------------------------------------------------------------------
1624         * 1 -  6        Record name     "ATOM  "
1625         * 7 - 11        Integer         serial        Atom serial number.
1626         * 13 - 16        Atom            name          Atom name.
1627         * 17             Character       altLoc        Alternate location indicator.
1628         * 18 - 20        Residue name    resName       Residue name.
1629         * 22             Character       chainID       Chain identifier.
1630         * 23 - 26        Integer         resSeq        Residue sequence number.
1631         * 27             AChar           iCode         Code for insertion of residues.
1632         * 31 - 38        Real(8.3)       x             Orthogonal coordinates for X in Angstroms.
1633         * 39 - 46        Real(8.3)       y             Orthogonal coordinates for Y in Angstroms.
1634         * 47 - 54        Real(8.3)       z             Orthogonal coordinates for Z in Angstroms.
1635         * 55 - 60        Real(6.2)       occupancy     Occupancy.
1636         * 61 - 66        Real(6.2)       tempFactor    Temperature factor.
1637         * 73 - 76        LString(4)      segID         Segment identifier, left-justified.
1638         * 77 - 78        LString(2)      element       Element symbol, right-justified.
1639         * 79 - 80        LString(2)      charge        Charge on the atom.
1640         * </pre>
1641         */
1642        private void  pdb_ATOM_Handler(String line)     {
1643
1644                if ( params.isHeaderOnly())
1645                        return;
1646
1647                // let's first get the chain name which will serve to identify if we are starting a new molecule
1648                String chainName      = line.substring(21,22);
1649
1650                if (" ".equals(chainName)) {
1651                        blankChainIdsPresent = true;
1652                }
1653
1654                if (currentChain!=null && !currentChain.getName().equals(chainName)) {
1655                        // new chain name: another molecule coming
1656                        startOfMolecule = true;
1657                }
1658
1659                if (startOfMolecule) {
1660                        // we add last chain if there was one
1661                        if (currentChain!=null) {
1662                                currentModel.add(currentChain);
1663                                // let's not forget adding the last group to the finishing chain
1664                                if (currentGroup!=null) {
1665                                        currentChain.addGroup(currentGroup);
1666                                }
1667                        }
1668                        // we initialise the new molecule to come
1669                        currentChain = new ChainImpl();
1670                        // note that the chainId (asym id) is set properly later in assignAsymIds
1671                        currentChain.setId(chainName);
1672                        currentChain.setName(chainName);
1673
1674                }
1675
1676                if (startOfModel) {
1677                        // we add last model if there was one
1678                        if (currentModel!=null) {
1679                                allModels.add(currentModel);
1680                        }
1681                        // we initialise the model to come
1682                        currentModel = new ArrayList<>();
1683                }
1684
1685
1686                // let's get the residue number and see if we need to start a new group
1687
1688                String groupCode3     = line.substring(17,20).trim();
1689                String resNum  = line.substring(22,26).trim();
1690                Character iCode = line.substring(26,27).charAt(0);
1691                if ( iCode == ' ')
1692                        iCode = null;
1693                ResidueNumber residueNumber = new ResidueNumber(chainName, Integer.valueOf(resNum), iCode);
1694
1695                //recordName      groupCode3
1696                //|                |    resNum
1697                //|                |    |   iCode
1698                //|     |          | |  |   ||
1699                //ATOM      1  N   ASP A  15     110.964  24.941  59.191  1.00 83.44           N
1700                //ATOM   1964  N   ARG H 221A      5.963 -16.715  27.669  1.00 28.59           N
1701
1702                Character aminoCode1 = StructureTools.get1LetterCode(groupCode3);
1703
1704                String recordName     = line.substring (0, 6).trim ();
1705
1706                boolean isHetAtomInFile = false;
1707
1708                if ("HETATM".equals(recordName) ){
1709                        // HETATOM RECORDS are treated slightly differently
1710                        // some modified amino acids that we want to treat as amino acids
1711                        // can be found as HETATOM records
1712                        if ( aminoCode1 != null && aminoCode1.equals(StructureTools.UNKNOWN_GROUP_LABEL))
1713                                        aminoCode1 = null;
1714
1715                        isHetAtomInFile = true;
1716                }
1717
1718                if ( startOfMolecule) {
1719
1720                        currentGroup = getNewGroup(recordName, aminoCode1, groupCode3);
1721
1722                        currentGroup.setPDBName(groupCode3);
1723                        currentGroup.setResidueNumber(residueNumber);
1724                        currentGroup.setHetAtomInFile(isHetAtomInFile);
1725
1726                }
1727
1728                // resetting states
1729                startOfModel = false;
1730                startOfMolecule = false;
1731
1732
1733                Character altLoc   = line.substring (16, 17).charAt(0);
1734                Group altGroup = null;
1735
1736
1737                // check if residue number is the same ...
1738                if ( ! residueNumber.equals(currentGroup.getResidueNumber())) {
1739
1740                        currentChain.addGroup(currentGroup);
1741                        currentGroup.trimToSize();
1742
1743                        currentGroup = getNewGroup(recordName, aminoCode1, groupCode3);
1744
1745                        currentGroup.setPDBName(groupCode3);
1746                        currentGroup.setResidueNumber(residueNumber);
1747                        currentGroup.setHetAtomInFile(isHetAtomInFile);
1748
1749                } else {
1750                        // same residueNumber, but altLocs...
1751
1752                        // test altLoc
1753                        if ( ! altLoc.equals(' ')) {
1754                                logger.debug("found altLoc! " + currentGroup + " " + altGroup);
1755                                altGroup = getCorrectAltLocGroup( altLoc,recordName,aminoCode1,groupCode3);
1756                                if ( altGroup.getChain() == null) {
1757                                        // need to set current chain
1758                                        altGroup.setChain(currentChain);
1759                                }
1760
1761                        }
1762                }
1763
1764                atomCount++;
1765
1766                if ( atomCount == atomCAThreshold ) {
1767                        // throw away the SEQRES lines - too much to deal with...
1768                        logger.warn("more than " + atomCAThreshold + " atoms in this structure, ignoring the SEQRES lines");
1769                        seqResChains.clear();
1770
1771                        switchCAOnly();
1772
1773                }
1774
1775
1776
1777                if ( atomCount == loadMaxAtoms){
1778                        logger.warn("File has more atoms than max specified in parsing parameters ({}). Ignoring atoms after line: {}", loadMaxAtoms, line);
1779                        return;
1780                }
1781                if ( atomCount > loadMaxAtoms){
1782                        return;
1783                }
1784
1785
1786                //          1         2         3         4         5         6
1787                //012345678901234567890123456789012345678901234567890123456789
1788                //ATOM      1  N   MET     1      20.154  29.699   5.276   1.0
1789                //ATOM    112  CA  ASP   112      41.017  33.527  28.371  1.00  0.00
1790                //ATOM     53  CA  MET     7      23.772  33.989 -21.600  1.00  0.00           C
1791                //ATOM    112  CA  ASP   112      37.613  26.621  33.571     0     0
1792
1793
1794                String fullname = line.substring (12, 16);
1795
1796                // check for CA only if requested
1797                if ( parseCAonly ){
1798                        // yes , user wants to get CA only
1799                        // only parse CA atoms...
1800                        if (! " CA ".equals(fullname)){
1801                                //System.out.println("ignoring " + line);
1802                                atomCount--;
1803                                return;
1804                        }
1805                }
1806
1807                if ( params.getAcceptedAtomNames() != null) {
1808
1809                        boolean found = false;
1810                        for (String ok : params.getAcceptedAtomNames()){
1811                                //System.out.println(ok + "< >" + fullname +"<");
1812
1813                                if ( ok.equals(fullname.trim())) {
1814                                        found = true;
1815                                        break;
1816                                }
1817                        }
1818                        if ( ! found) {
1819                                atomCount--;
1820                                return;
1821                        }
1822                }
1823                // create new atom
1824
1825                int pdbnumber = Integer.parseInt (line.substring (6, 11).trim ());
1826                AtomImpl atom = new AtomImpl() ;
1827                atom.setPDBserial(pdbnumber) ;
1828
1829                atom.setAltLoc(altLoc);
1830                atom.setName(fullname.trim());
1831
1832                double x = Double.parseDouble (line.substring (30, 38).trim());
1833                double y = Double.parseDouble (line.substring (38, 46).trim());
1834                double z = Double.parseDouble (line.substring (46, 54).trim());
1835
1836                double[] coords = new double[3];
1837                coords[0] = x ;
1838                coords[1] = y ;
1839                coords[2] = z ;
1840                atom.setCoords(coords);
1841
1842                float occu  = 1.0f;
1843                if ( line.length() > 59 ) {
1844                        try {
1845                                // occu and tempf are sometimes not used :-/
1846                                occu = Float.parseFloat (line.substring (54, 60).trim());
1847                        }  catch (NumberFormatException e){}
1848                }
1849
1850                float tempf = 0.0f;
1851                if ( line.length() > 65) {
1852                        try {
1853                                tempf = Float.parseFloat (line.substring (60, 66).trim());
1854                        }  catch (NumberFormatException e){}
1855                }
1856
1857                atom.setOccupancy(  occu  );
1858                atom.setTempFactor( tempf );
1859
1860
1861
1862
1863                // Parse element from the element field. If this field is
1864                // missing (i.e. misformatted PDB file), then parse the
1865                // element from the chemical component.
1866                Element element = Element.R;
1867                boolean guessElement = true;
1868                if ( line.length() > 77 ) {
1869                        // parse element from element field
1870                        String elementSymbol = line.substring(76, 78).trim();
1871                        if (elementSymbol.isEmpty()) {
1872                                logger.info("Element column was empty for atom {} {}. Assigning atom element "
1873                                                + "from Chemical Component Dictionary information", fullname.trim(), pdbnumber);
1874                        } else {
1875
1876                        try {
1877                                        element = Element.valueOfIgnoreCase(elementSymbol);
1878                                        guessElement = false;
1879                                }  catch (IllegalArgumentException e){
1880                                        logger.info("Element {} of atom {} {} was not recognised. Assigning atom element "
1881                                                        + "from Chemical Component Dictionary information", elementSymbol,
1882                                                        fullname.trim(), pdbnumber);
1883                                }
1884                        }
1885                } else {
1886                        logger.info("Missformatted PDB file: element column of atom {} {} is not present. "
1887                                        + "Assigning atom element from Chemical Component Dictionary information",
1888                                        fullname.trim(), pdbnumber);
1889                }
1890                if (guessElement) {
1891                        String elementSymbol = null;
1892                        if (currentGroup.getChemComp() != null) {
1893                                for (ChemCompAtom a : currentGroup.getChemComp().getAtoms()) {
1894                                        if (a.getAtomId().equals(fullname.trim())) {
1895                                                elementSymbol = a.getTypeSymbol();
1896                                                break;
1897                                        }
1898                                }
1899                                if (elementSymbol == null) {
1900                                        logger.info("Atom name {} was not found in the Chemical Component Dictionary information of {}. "
1901                                                        + "Assigning generic element R to it", fullname.trim(), currentGroup.getPDBName());
1902                        } else {
1903                        try {
1904                                element = Element.valueOfIgnoreCase(elementSymbol);
1905                                        } catch (IllegalArgumentException e) {
1906                                                // this can still happen for cases like UNK
1907                                                logger.info("Element symbol {} found in chemical component dictionary for Atom {} {} could not be recognised as a known element. "
1908                                                                + "Assigning generic element R to it", elementSymbol, fullname.trim(), pdbnumber);
1909                                        }
1910                                }
1911                        } else {
1912                                logger.warn("Chemical Component Dictionary information was not found for Atom name {}. "
1913                                                + "Assigning generic element R to it", fullname.trim());
1914                        }
1915
1916                }
1917                atom.setElement(element);
1918
1919
1920                //see if chain_id is one of the previous chains ...
1921                if ( altGroup != null) {
1922                        altGroup.addAtom(atom);
1923                        altGroup = null;
1924                }
1925                else {
1926                        currentGroup.addAtom(atom);
1927                }
1928
1929
1930                // make sure that main group has all atoms
1931                // GitHub issue: #76
1932                if ( ! currentGroup.hasAtom(atom.getName())) {
1933                        currentGroup.addAtom(atom);
1934                }
1935
1936
1937
1938        }
1939
1940
1941        private Group getCorrectAltLocGroup( Character altLoc,
1942                        String recordName, Character aminoCode1, String groupCode3) {
1943
1944                // see if we know this altLoc already;
1945                List<Atom> atoms = currentGroup.getAtoms();
1946                if ( atoms.size() > 0) {
1947                        Atom a1 = atoms.get(0);
1948                        // we are just adding atoms to the current group
1949                        // probably there is a second group following later...
1950                        if (a1.getAltLoc().equals(altLoc)) {
1951
1952                                return currentGroup;
1953                        }
1954                }
1955
1956                List<Group> altLocs = currentGroup.getAltLocs();
1957                for ( Group altLocG : altLocs ){
1958                        atoms = altLocG.getAtoms();
1959                        if ( atoms.size() > 0) {
1960                                for ( Atom a1 : atoms) {
1961                                        if (a1.getAltLoc().equals( altLoc)) {
1962
1963                                                return altLocG;
1964                                        }
1965                                }
1966                        }
1967                }
1968
1969                // no matching altLoc group found.
1970                // build it up.
1971
1972                if ( groupCode3.equals(currentGroup.getPDBName())) {
1973                        if ( currentGroup.getAtoms().size() == 0) {
1974                                //System.out.println("current group is empty " + current_group + " " + altLoc);
1975                                return currentGroup;
1976                        }
1977                        //System.out.println("cloning current group " + current_group + " " + current_group.getAtoms().get(0).getAltLoc() + " altLoc " + altLoc);
1978                        Group altLocG = (Group) currentGroup.clone();
1979                        // drop atoms from cloned group...
1980                        // https://redmine.open-bio.org/issues/3307
1981                        altLocG.setAtoms(new ArrayList<Atom>());
1982                        altLocG.getAltLocs().clear();
1983                        currentGroup.addAltLoc(altLocG);
1984                        return altLocG;
1985                }
1986
1987                //      System.out.println("new  group " + recordName + " " + aminoCode1 + " " +groupCode3);
1988                Group altLocG = getNewGroup(recordName,aminoCode1,groupCode3);
1989
1990
1991                altLocG.setPDBName(groupCode3);
1992
1993                altLocG.setResidueNumber(currentGroup.getResidueNumber());
1994                currentGroup.addAltLoc(altLocG);
1995                return altLocG;
1996        }
1997
1998        private void switchCAOnly(){
1999                parseCAonly = true;
2000
2001
2002                currentModel = CAConverter.getRepresentativeAtomsOnly(currentModel);
2003
2004                for ( int i =0; i< structure.nrModels() ; i++){
2005                        //  iterate over all known models ...
2006                        List<Chain> model = structure.getModel(i);
2007                        model = CAConverter.getRepresentativeAtomsOnly(model);
2008                        structure.setModel(i,model);
2009                }
2010
2011                currentChain = CAConverter.getRepresentativeAtomsOnly(currentChain);
2012
2013        }
2014
2015
2016        /** safes repeating a few lines ... */
2017        private Integer conect_helper (String line,int start,int end) {
2018                if (line.length() < end) return null;
2019
2020                String sbond = line.substring(start,end).trim();
2021                int bond  = -1 ;
2022                Integer b = null ;
2023
2024                if ( ! "".equals(sbond)) {
2025                        bond = Integer.parseInt(sbond);
2026                        b = bond;
2027                }
2028
2029                return b ;
2030        }
2031
2032        /**
2033         * Handler for CONECT Record Format
2034        <pre>
2035         COLUMNS         DATA TYPE        FIELD           DEFINITION
2036         ---------------------------------------------------------------------------------
2037         1 -  6         Record name      "CONECT"
2038         7 - 11         Integer          serial          Atom serial number
2039         12 - 16         Integer          serial          Serial number of bonded atom
2040         17 - 21         Integer          serial          Serial number of bonded atom
2041         22 - 26         Integer          serial          Serial number of bonded atom
2042         27 - 31         Integer          serial          Serial number of bonded atom
2043         32 - 36         Integer          serial          Serial number of hydrogen bonded
2044         atom
2045         37 - 41         Integer          serial          Serial number of hydrogen bonded
2046         atom
2047         42 - 46         Integer          serial          Serial number of salt bridged
2048         atom
2049         47 - 51         Integer          serial          Serial number of hydrogen bonded
2050         atom
2051         52 - 56         Integer          serial          Serial number of hydrogen bonded
2052         atom
2053         57 - 61         Integer          serial          Serial number of salt bridged
2054         atom
2055         </pre>
2056         */
2057        private void pdb_CONECT_Handler(String line) {
2058
2059                if ( atomOverflow) {
2060                        return ;
2061                }
2062                if (params.isHeaderOnly()) {
2063                        return;
2064                }
2065
2066                // this try .. catch is e.g. to catch 1gte which has wrongly formatted lines...
2067                try {
2068                        int atomserial = Integer.parseInt (line.substring(6 ,11).trim());
2069                        Integer bond1      = conect_helper(line,11,16);
2070                        Integer bond2      = conect_helper(line,16,21);
2071                        Integer bond3      = conect_helper(line,21,26);
2072                        Integer bond4      = conect_helper(line,26,31);
2073                        Integer hyd1       = conect_helper(line,31,36);
2074                        Integer hyd2       = conect_helper(line,36,41);
2075                        Integer salt1      = conect_helper(line,41,46);
2076                        Integer hyd3       = conect_helper(line,46,51);
2077                        Integer hyd4       = conect_helper(line,51,56);
2078                        Integer salt2      = conect_helper(line,56,61);
2079
2080                        //System.out.println(atomserial+ " "+ bond1 +" "+bond2+ " " +bond3+" "+bond4+" "+
2081                        //                 hyd1+" "+hyd2 +" "+salt1+" "+hyd3+" "+hyd4+" "+salt2);
2082                        HashMap<String, Integer> cons = new HashMap<>();
2083                        cons.put("atomserial",atomserial);
2084
2085                        if ( bond1 != null) cons.put("bond1",bond1);
2086                        if ( bond2 != null) cons.put("bond2",bond2);
2087                        if ( bond3 != null) cons.put("bond3",bond3);
2088                        if ( bond4 != null) cons.put("bond4",bond4);
2089                        if ( hyd1  != null) cons.put("hydrogen1",hyd1);
2090                        if ( hyd2  != null) cons.put("hydrogen2",hyd2);
2091                        if ( salt1 != null) cons.put("salt1",salt1);
2092                        if ( hyd3  != null) cons.put("hydrogen3",hyd3);
2093                        if ( hyd4  != null) cons.put("hydrogen4",hyd4);
2094                        if ( salt2 != null) cons.put("salt2",salt2);
2095
2096                        connects.add(cons);
2097                } catch (NumberFormatException e){
2098                        logger.info("could not parse CONECT line correctly ("+e.getMessage()+"), at line : " + line);
2099                        return;
2100                }
2101        }
2102
2103        /**
2104         * Handler for MODEL Record Format
2105         * <pre>
2106         * COLUMNS       DATA TYPE      FIELD         DEFINITION
2107         * ----------------------------------------------------------------------
2108         * 1 -  6       Record name    "MODEL "
2109         * 11 - 14       Integer        serial        Model serial number.
2110         * </pre>
2111         */
2112        private void pdb_MODEL_Handler(String line) {
2113
2114                if (params.isHeaderOnly()) return;
2115
2116                // new model: we start a new molecule
2117                startOfMolecule = true;
2118                startOfModel = true;
2119
2120        }
2121
2122        /**
2123         * Handler for TER record. The record is used in deposited PDB files and many others,
2124         * but some software omits it. In any case it helps to identify the
2125         * start of ligand molecules, so we use it for that.
2126         */
2127        private void pdb_TER_Handler() {
2128                startOfMolecule = true;
2129        }
2130
2131
2132        /**
2133         * DBREF handler
2134         * <pre>
2135         * COLUMNS       DATA TYPE          FIELD          DEFINITION
2136         * ----------------------------------------------------------------
2137         *  1 - 6        Record name        "DBREF "
2138         *  8 - 11       IDcode             idCode         ID code of this entry.
2139         * 13            Character          chainID        Chain identifier.
2140         * 15 - 18       Integer            seqBegin       Initial sequence number
2141         *                                                 of the PDB sequence segment.
2142         * 19            AChar              insertBegin    Initial insertion code
2143         *                                                 of the PDB sequence segment.
2144         * 21 - 24       Integer            seqEnd         Ending sequence number
2145         *                                                 of the PDB sequence segment.
2146         * 25            AChar              insertEnd      Ending insertion code
2147         *                                                 of the PDB sequence segment.
2148         * 27 - 32       LString            database       Sequence database name.
2149         * 34 - 41       LString            dbAccession    Sequence database accession code.
2150         * 43 - 54      LString            dbIdCode        Sequence database
2151         *                                                 identification code.
2152         * 56 - 60      Integer            dbseqBegin      Initial sequence number of the
2153         *                                                 database segment.
2154         * 61           AChar              idbnsBeg        Insertion code of initial residue
2155         *                                                 of the segment, if PDB is the
2156         *                                                 reference.
2157         * 63 - 67      Integer            dbseqEnd        Ending sequence number of the
2158         *                                                 database segment.
2159         * 68           AChar              dbinsEnd        Insertion code of the ending
2160         *                                                 residue of the segment, if PDB is
2161         *                                                 the reference.
2162         * </pre>
2163         */
2164        private void pdb_DBREF_Handler(String line){
2165
2166                logger.debug("Parsing DBREF {}", line);
2167
2168                DBRef dbref = new DBRef();
2169                String idCode      = line.substring(7,11);
2170                String chainName     = line.substring(12,13);
2171                String seqBegin    = line.substring(14,18);
2172                String insertBegin = line.substring(18,19);
2173                String seqEnd      = line.substring(20,24);
2174                String insertEnd   = line.substring(24,25);
2175                String database    = line.substring(26,32);
2176                String dbAccession = line.substring(33,41);
2177                String dbIdCode    = line.substring(42,54);
2178                String dbseqBegin  = line.substring(55,60);
2179                String idbnsBeg    = line.substring(60,61);
2180                String dbseqEnd    = line.substring(62,67);
2181                // Support implicit space character at end
2182                String dbinsEnd;
2183                if(line.length() >= 68)
2184                        dbinsEnd       = line.substring(67,68);
2185                else
2186                        dbinsEnd       = " ";
2187
2188                dbref.setIdCode(idCode);
2189                dbref.setChainName(chainName);
2190                dbref.setSeqBegin(intFromString(seqBegin));
2191                dbref.setInsertBegin(insertBegin.charAt(0));
2192                dbref.setSeqEnd(intFromString(seqEnd));
2193                dbref.setInsertEnd(insertEnd.charAt(0));
2194                dbref.setDatabase(database.trim());
2195                dbref.setDbAccession(dbAccession.trim());
2196                dbref.setDbIdCode(dbIdCode.trim());
2197                dbref.setDbSeqBegin(intFromString(dbseqBegin));
2198                dbref.setIdbnsBegin(idbnsBeg.charAt(0));
2199                dbref.setDbSeqEnd(intFromString(dbseqEnd));
2200                dbref.setIdbnsEnd(dbinsEnd.charAt(0));
2201
2202                //System.out.println(dbref.toPDB());
2203                dbrefs.add(dbref);
2204        }
2205
2206
2207        /**
2208         * Process the disulfide bond info provided by an SSBOND record
2209         *
2210         * <pre>
2211        COLUMNS        DATA TYPE       FIELD         DEFINITION
2212        -------------------------------------------------------------------
2213         1 -  6        Record name     "SSBOND"
2214         8 - 10        Integer         serNum       Serial number.
2215        12 - 14        LString(3)      "CYS"        Residue name.
2216        16             Character       chainID1     Chain identifier.
2217        18 - 21        Integer         seqNum1      Residue sequence number.
2218        22             AChar           icode1       Insertion code.
2219        26 - 28        LString(3)      "CYS"        Residue name.
2220        30             Character       chainID2     Chain identifier.
2221        32 - 35        Integer         seqNum2      Residue sequence number.
2222        36             AChar           icode2       Insertion code.
2223        60 - 65        SymOP           sym1         Symmetry oper for 1st resid
2224        67 - 72        SymOP           sym2         Symmetry oper for 2nd resid
2225         * </pre>
2226         */
2227        private void pdb_SSBOND_Handler(String line){
2228
2229                if (params.isHeaderOnly()) return;
2230
2231                if (line.length()<36) {
2232                        logger.info("SSBOND line has length under 36. Ignoring it.");
2233                        return;
2234                }
2235
2236                String chain1      = line.substring(15,16);
2237                String seqNum1     = line.substring(17,21).trim();
2238                String icode1      = line.substring(21,22);
2239                String chain2      = line.substring(29,30);
2240                String seqNum2     = line.substring(31,35).trim();
2241                String icode2      = line.substring(35,36);
2242
2243                if (line.length()>=72) {
2244                        String symop1 = line.substring(59, 65).trim();
2245                        String symop2 = line.substring(66, 72).trim();
2246
2247                        // until we implement proper treatment of symmetry in biojava #220, we can't deal with sym-related parteners properly, skipping them
2248                        if (!"".equals(symop1) && !"".equals(symop2) && // in case the field is missing
2249                                        (!"1555".equals(symop1) || !"1555".equals(symop2)) ) {
2250                                logger.info("Skipping ss bond between groups {} and {} belonging to different symmetry partners, because it is not supported yet", seqNum1+icode1, seqNum2+icode2);
2251                                return;
2252                        }
2253                }
2254
2255                if (" ".equals(icode1))
2256                        icode1 = "";
2257                if (" ".equals(icode2))
2258                        icode2 = "";
2259
2260                SSBondImpl ssbond = new SSBondImpl();
2261
2262                ssbond.setChainID1(chain1);
2263                ssbond.setResnum1(seqNum1);
2264                ssbond.setChainID2(chain2);
2265                ssbond.setResnum2(seqNum2);
2266                ssbond.setInsCode1(icode1);
2267                ssbond.setInsCode2(icode2);
2268                ssbonds.add(ssbond);
2269        }
2270
2271
2272        /**
2273         * Takes care of LINK records. These take the format of:
2274         *
2275         * <pre>
2276         * COLUMNS        DATA TYPE       FIELD       DEFINITION
2277         * --------------------------------------------------------------------------------
2278         *  1 -  6        Record name     "LINK  "
2279         * 13 - 16        Atom            name1       Atom name.
2280         * 17             Character       altLoc1     Alternate location indicator.
2281         * 18 - 20        Residue name    resName1    Residue name.
2282         * 22             Character       chainID1    Chain identifier.
2283         * 23 - 26        Integer         resSeq1     Residue sequence number.
2284         * 27             AChar           iCode1      Insertion code.
2285         * 43 - 46        Atom            name2       Atom name.
2286         * 47             Character       altLoc2     Alternate location indicator.
2287         * 48 - 50        Residue name    resName2    Residue name.
2288         * 52             Character       chainID2    Chain identifier.
2289         * 53 - 56        Integer         resSeq2     Residue sequence number.
2290         * 57             AChar           iCode2      Insertion code.
2291         * 60 - 65        SymOP           sym1        Symmetry operator for 1st atom.
2292         * 67 - 72        SymOP           sym2        Symmetry operator for 2nd atom.
2293         * </pre>
2294         *
2295         * (From http://www.wwpdb.org/documentation/format32/sect6.html#LINK)
2296         *
2297         * @param line the LINK record line to parse.
2298         */
2299        private void pdb_LINK_Handler(String line) {
2300
2301                if (params.isHeaderOnly()) return;
2302
2303                // Check for the minimal set of fields.
2304                if (line.length()<56) {
2305                        logger.info("LINK line has length under 56. Ignoring it.");
2306                        return;
2307                }
2308
2309                int len = line.length();
2310
2311                String name1 = line.substring(12, 16).trim();
2312                String altLoc1 = line.substring(16, 17).trim();
2313                String resName1 = line.substring(17, 20).trim();
2314                String chainID1 = line.substring(21, 22).trim();
2315                String resSeq1 = line.substring(22, 26).trim();
2316                String iCode1 = line.substring(26, 27).trim();
2317
2318                String name2 = line.substring(42, 46).trim();
2319                String altLoc2 = line.substring(46, 47).trim();
2320                String resName2 = line.substring(47, 50).trim();
2321                String chainID2 = line.substring(51, 52).trim();
2322                String resSeq2 = line.substring(52, 56).trim();
2323                String iCode2 = null;  // Might get trimmed if blank.
2324                if (len > 56) iCode2 = line.substring(56, 57).trim();
2325
2326                String sym1 = null;
2327                if (len > 64) sym1 = line.substring(59, 65).trim();
2328                String sym2 = null;
2329                if (len > 71) sym2 = line.substring(66, 72).trim();
2330
2331                linkRecords.add(new LinkRecord(
2332                                name1, altLoc1, resName1, chainID1, resSeq1, iCode1,
2333                                name2, altLoc2, resName2, chainID2, resSeq2, iCode2,
2334                                sym1, sym2));
2335        }
2336
2337        /**
2338         * Handler for the SITE records. <br>
2339         *
2340         * <pre>
2341         *
2342         * COLUMNS      DATA TYPE               FIELD           DEFINITION
2343         * ---------------------------------------------------------------------------------
2344         * 1 - 6        Record name     "SITE "
2345         * 8 - 10       Integer                 seqNum          Sequence number.
2346         * 12 - 14      LString(3)              siteID          Site name.
2347         * 16 - 17      Integer                 numRes          Number of residues that compose the siteResidues.
2348         * 19 - 21      Residue name    resName1        Residue name for first residue that
2349         *                                                                              creates the siteResidues.
2350         * 23           Character               chainID1        Chain identifier for first residue of siteResidues.
2351         * 24 - 27      Integer                 seq1            Residue sequence number for first residue
2352         *                                                                              of the siteResidues.
2353         * 28           AChar                   iCode1          Insertion code for first residue of the siteResidues.
2354         *
2355         * example:
2356         *          1         2         3         4         5         6         7         8
2357         * 12345678901234567890123456789012345678901234567890123456789012345678901234567890
2358         * SITE     1 AC1  3 HIS A  94 HIS A   96  HIS A 119
2359         * SITE     1 AC2  5 ASN A  62 GLY A   63  HIS A  64  HOH A 328
2360         * SITE     2 AC2  5 HOH A 634
2361         * SITE     1 AC3  5 GLN A 136 GLN A  137  PRO A 138  GLU A 205
2362         * SITE     2 AC3  5 CYS A 206
2363         * SITE     1 AC4 11 HIS A  64 HIS A   94  HIS A  96  HIS A 119
2364         * SITE     2 AC4 11 LEU A 198 THR A  199  THR A 200  TRP A 209
2365         * SITE     3 AC4 11 HOH A 572 HOH A  582  HOH A 635
2366         * </pre>
2367         * @param line the SITE line record being currently read
2368         * @author Amr ALHOSSARY
2369         * @author Jules Jacobsen
2370         */
2371        private void pdb_SITE_Handler(String line){
2372
2373                if (params.isHeaderOnly()) return;
2374
2375                //  make a map of: SiteId to List<ResidueNumber>
2376
2377                logger.debug("Site Line:{}", line);
2378
2379
2380                String siteID = line.substring(11, 14);
2381                //fetch the siteResidues from the map
2382                List<ResidueNumber> siteResidues = siteToResidueMap.get(siteID);
2383
2384                //if the siteResidues doesn't yet exist, make a new one.
2385                if (siteResidues == null || ! siteToResidueMap.containsKey(siteID.trim())){
2386                        siteResidues = new ArrayList<>();
2387                        siteToResidueMap.put(siteID.trim(), siteResidues);
2388
2389                        logger.debug(String.format("New Site made: %s %s", siteID,  siteResidues));
2390                        logger.debug("Now made {} sites", siteMap.size());
2391
2392                }
2393
2394                logger.debug(String.format("SiteId: %s", siteID));
2395
2396
2397                //line = 'SITE     1 AC1  6 ARG H 221A LYS H 224  HOH H 403  HOH H 460'
2398                //line.substring(18) = 'ARG H 221A LYS H 224  HOH H 403  HOH H 460'
2399                line = line.substring(18);
2400                String groupString = null;
2401                //groupString = 'ARG H 221A'
2402                //keep iterating through chunks of 10 characters - these are the groups in the siteResidues
2403                while (!"          ".equals((groupString = line.substring(0, 10)))) {
2404                        //groupstring: 'ARG H 221A'
2405
2406                        logger.debug("groupString: '{}'", groupString);
2407
2408                        //set the residue name
2409                        //residueName = 'ARG'
2410                        String residueName = groupString.substring(0, 3);
2411                        Character aminoCode1 = StructureTools.get1LetterCode(residueName);
2412                        if (aminoCode1 != null) {
2413                                if (aminoCode1.equals(StructureTools.UNKNOWN_GROUP_LABEL)) {
2414                                        aminoCode1 = null;
2415                                }
2416                        }
2417
2418                        //this is already in the right format, so no need to fiddle with it...
2419                        //pdbCode = 'H 221A'
2420                        //                    String pdbCode = groupString.substring(4, 10).trim();
2421                        String chainId = groupString.substring(4, 5);
2422                        Integer resNum = Integer.valueOf(groupString.substring(5, 9).trim());
2423                        Character insCode = groupString.substring(9, 10).charAt(0);
2424                        //set insCode to null as a measure to prevent storing thousands of empty Strings
2425                        //- the empty value is returned using Group.getInsCode()
2426                        //                    if (insCode.equals(" ")) {
2427                        //                        insCode = null;
2428                        //                    }
2429
2430                        logger.debug(String.format("Site: %s: 'resName:%s resNum:%s insCode:%s'", siteID, residueName, resNum, insCode));
2431
2432                        //make a new resNum with the data - this will be linked up with a site later
2433                        ResidueNumber residueNumber = new ResidueNumber();
2434
2435
2436                        logger.debug("pdbCode: '{}{}'", resNum, insCode);
2437
2438                        residueNumber.setChainName(chainId);
2439                        residueNumber.setSeqNum(resNum);
2440                        residueNumber.setInsCode(insCode);
2441                        //add the resNum to the groups
2442                        siteResidues.add(residueNumber);
2443
2444                        logger.debug("Adding residueNumber " + residueNumber + " to site " + siteID);
2445
2446                        line = line.substring(11);
2447                }
2448
2449                logger.debug("Current SiteMap (contains {} sites):", siteToResidueMap.keySet().size());
2450                for (String key : siteToResidueMap.keySet()) {
2451                        logger.debug(key + " : " + siteToResidueMap.get(key));
2452                }
2453
2454        }
2455
2456        //Site variable related to parsing the REMARK 800 records.
2457        Site site;
2458
2459        private String[] keywords;
2460        private void pdb_REMARK_800_Handler(String line){
2461
2462                if (params.isHeaderOnly()) return;
2463
2464                // 'REMARK 800 SITE_IDENTIFIER: CAT                                                 '
2465                line = line.substring(11);
2466                String[] fields = line.split(": ");
2467
2468                if (fields.length == 2) {
2469                        if ("SITE_IDENTIFIER".equals(fields[0])) {
2470                                //                    remark800Counter++;
2471                                String siteID = fields[1].trim();
2472
2473                                logger.debug("siteID: '{}'", siteID);
2474
2475                                //fetch the siteResidues from the map
2476                                site = siteMap.get(siteID);
2477
2478                                //if the siteResidues doesn't yet exist, make a new one.
2479                                if (site == null || !siteID.equals(site.getSiteID())) {
2480                                        site = new Site(siteID, new ArrayList<Group>());
2481                                        siteMap.put(site.getSiteID(), site);
2482
2483                                        logger.debug("New Site made: {}", site);
2484                                        logger.debug("Now made {} sites", siteMap.size());
2485
2486                                }
2487                        }
2488                        if ("EVIDENCE_CODE".equals(fields[0])) {
2489                                //                    remark800Counter++;
2490                                String evCode = fields[1].trim();
2491
2492                                logger.debug("evCode: '{}'", evCode);
2493
2494                                //fetch the siteResidues from the map
2495                                site.setEvCode(evCode);
2496                        }
2497                        if ("SITE_DESCRIPTION".equals(fields[0])) {
2498                                //                    remark800Counter++;
2499                                String desc = fields[1].trim();
2500
2501                                logger.debug("desc: '{}'", desc);
2502
2503                                //fetch the siteResidues from the map
2504                                site.setDescription(desc);
2505
2506                                logger.debug("Finished making REMARK 800 for site {}", site.getSiteID());
2507                                logger.debug(site.remark800toPDB());
2508
2509                        }
2510                }
2511        }
2512
2513        private int intFromString(String intString){
2514                int val = Integer.MIN_VALUE;
2515                try {
2516                        val = Integer.parseInt(intString.trim());
2517                } catch (NumberFormatException ex){
2518                        logger.info("Could not parse a number: " + ex.getMessage());
2519                }
2520                return val;
2521        }
2522
2523
2524
2525        /**
2526         * Finds in the given list of chains the first one that has as name the given chainID.
2527         * If no such Chain can be found it returns null.
2528         */
2529        private static Chain isKnownChain(String chainID, List<Chain> chains){
2530
2531                for (int i = 0; i< chains.size();i++){
2532                        Chain testchain =  chains.get(i);
2533                        if (chainID.equals(testchain.getName())) {
2534                                return testchain;
2535                        }
2536                }
2537
2538                return null;
2539        }
2540
2541
2542
2543        private BufferedReader getBufferedReader(InputStream inStream)
2544                        throws IOException {
2545
2546                BufferedReader buf ;
2547                if (inStream == null) {
2548                        throw new IOException ("input stream is null!");
2549                }
2550
2551                buf = new BufferedReader (new InputStreamReader (inStream));
2552                return buf ;
2553
2554        }
2555
2556
2557
2558        /**
2559         * Parse a PDB file and return a datastructure implementing
2560         * PDBStructure interface.
2561         *
2562         * @param inStream  an InputStream object
2563         * @return a Structure object
2564         * @throws IOException
2565         */
2566        public Structure parsePDBFile(InputStream inStream)
2567                        throws IOException
2568        {
2569
2570                BufferedReader buf = getBufferedReader(inStream);
2571
2572                return parsePDBFile(buf);
2573
2574        }
2575
2576        /**
2577         * Parse a PDB file and return a datastructure implementing
2578         * PDBStructure interface.
2579         *
2580         * @param buf  a BufferedReader object
2581         * @return the Structure object
2582         * @throws IOException ...
2583         */
2584        public  Structure parsePDBFile(BufferedReader buf)
2585                        throws IOException
2586                        {
2587                // set the correct max values for parsing...
2588                loadMaxAtoms = params.getMaxAtoms();
2589                atomCAThreshold = params.getAtomCaThreshold();
2590
2591
2592                // (re)set structure
2593
2594                allModels = new ArrayList<>();
2595                structure     = new StructureImpl() ;
2596                currentModel  = null;
2597                currentChain  = null;
2598                currentGroup  = null;
2599                // we initialise to true since at the beginning of the file we are always starting a new molecule
2600                startOfMolecule = true;
2601                startOfModel = true;
2602
2603                seqResChains  = new ArrayList<>();
2604                siteMap = new LinkedHashMap<>();
2605                pdbHeader     = new PDBHeader();
2606                connects      = new ArrayList<>();
2607                previousContinuationField = "";
2608                continuationField = "";
2609                continuationString = "";
2610                current_compound = null;
2611                sourceLines.clear();
2612                compndLines.clear();
2613                keywordsLines.clear();
2614                isLastCompndLine = false;
2615                isLastSourceLine = false;
2616                prevMolId = -1;
2617                entities.clear();
2618                helixList.clear();
2619                strandList.clear();
2620                turnList.clear();
2621                lengthCheck = -1;
2622                atomCount = 0;
2623                atomOverflow = false;
2624                linkRecords = new ArrayList<>();
2625                siteToResidueMap.clear();
2626
2627                blankChainIdsPresent = false;
2628
2629                parseCAonly = params.isParseCAOnly();
2630
2631                String line = null;
2632
2633                while ((line = buf.readLine()) != null) {
2634
2635                        // ignore empty lines
2636                        if ( "".equals(line) ||
2637                                        (line.equals(NEWLINE))){
2638                                continue;
2639                        }
2640
2641
2642                        // ignore short TER and END lines
2643                        if ( line.startsWith("END")) {
2644                                continue;
2645                        }
2646
2647                        if ( line.length() < 6 && !line.startsWith("TER")) {
2648                                logger.info("Found line length below 6. Ignoring it, line: >" + line +"<" );
2649                                continue;
2650                        }
2651
2652                        String recordName = null;
2653                        if (line.length()<6)
2654                                recordName = line.trim();
2655                        else
2656                                recordName = line.substring (0, 6).trim ();
2657
2658                        try {
2659                                if ("ATOM".equals(recordName))
2660                                        pdb_ATOM_Handler(line);
2661                                else if ("SEQRES".equals(recordName))
2662                                        pdb_SEQRES_Handler(line);
2663                                else if ("HETATM".equals(recordName))
2664                                        pdb_ATOM_Handler(line);
2665                                else if ("MODEL".equals(recordName))
2666                                        pdb_MODEL_Handler(line);
2667                                else if ("TER".equals(recordName))
2668                                        pdb_TER_Handler();
2669                                else if ("HEADER".equals(recordName))
2670                                        pdb_HEADER_Handler(line);
2671                                else if ("AUTHOR".equals(recordName))
2672                                        pdb_AUTHOR_Handler(line);
2673                                else if ("TITLE".equals(recordName))
2674                                        pdb_TITLE_Handler(line);
2675                                else if ("SOURCE".equals(recordName))
2676                                        sourceLines.add(line); //pdb_SOURCE_Handler
2677                                else if ("COMPND".equals(recordName))
2678                                        compndLines.add(line); //pdb_COMPND_Handler
2679                                else if ("KEYWDS".equals(recordName))
2680                                        keywordsLines.add(line);
2681                                else if ("JRNL".equals(recordName))
2682                                        pdb_JRNL_Handler(line);
2683                                else if ("EXPDTA".equals(recordName))
2684                                        pdb_EXPDTA_Handler(line);
2685                                else if ("CRYST1".equals(recordName))
2686                                        pdb_CRYST1_Handler(line);
2687                                else if (recordName.startsWith("MTRIX"))
2688                                        pdb_MTRIXn_Handler(line);
2689                                else if ("REMARK".equals(recordName))
2690                                        pdb_REMARK_Handler(line);
2691                                else if ("CONECT".equals(recordName))
2692                                        pdb_CONECT_Handler(line);
2693                                else if ("REVDAT".equals(recordName))
2694                                        pdb_REVDAT_Handler(line);
2695                                else if ("DBREF".equals(recordName))
2696                                        pdb_DBREF_Handler(line);
2697                                else if ("SITE".equals(recordName))
2698                                        pdb_SITE_Handler(line);
2699                                else if ("SSBOND".equals(recordName))
2700                                        pdb_SSBOND_Handler(line);
2701                                else if ("LINK".equals(recordName))
2702                                        pdb_LINK_Handler(line);
2703                                else if ( params.isParseSecStruc()) {
2704                                        if ( "HELIX".equals(recordName) ) pdb_HELIX_Handler (  line ) ;
2705                                        else if ("SHEET".equals(recordName)) pdb_SHEET_Handler(line ) ;
2706                                        else if ("TURN".equals(recordName)) pdb_TURN_Handler(   line ) ;
2707                                }
2708                        } catch (StringIndexOutOfBoundsException | NullPointerException ex) {
2709                                logger.info("Unable to parse [" + line + "]");
2710                        }
2711                }
2712
2713                makeCompounds(compndLines, sourceLines);
2714
2715                handlePDBKeywords(keywordsLines);
2716
2717                triggerEndFileChecks();
2718
2719                if (params.shouldCreateAtomBonds()) {
2720                        formBonds();
2721                }
2722
2723                if ( params.shouldCreateAtomCharges()) {
2724                        addCharges();
2725                }
2726
2727                if ( params.isParseSecStruc() && !params.isHeaderOnly())
2728                        setSecStruc();
2729
2730                // Now correct the alternate location group
2731                StructureTools.cleanUpAltLocs(structure);
2732
2733                return structure;
2734
2735                        }
2736
2737
2738        /**
2739         * Add the charges to the Structure
2740         */
2741        private void addCharges() {
2742                ChargeAdder.addCharges(structure);
2743        }
2744
2745        /**
2746         * This is the new method for building the COMPND and SOURCE records. Now each method is self-contained.
2747         * @author Jules Jacobsen
2748         * @param  compoundList
2749         * @param  sourceList
2750         */
2751        private void makeCompounds(List<String> compoundList,
2752                        List<String> sourceList) {
2753                //              System.out.println("[makeCompounds] making compounds from compoundLines");
2754
2755                for (String line : compoundList) {
2756                        if (compoundList.indexOf(line) + 1 == compoundList.size()) {
2757                                //                              System.out.println("[makeCompounds] Final line in compoundLines.");
2758                                isLastCompndLine = true;
2759                        }
2760                        pdb_COMPND_Handler(line);
2761
2762                }
2763                //              System.out.println("[makeCompounds] adding sources to compounds from sourceLines");
2764                // since we're starting again from the first compound, reset it here
2765                if ( entities.size() == 0){
2766                        current_compound = new EntityInfo();
2767                } else {
2768                        current_compound = entities.get(0);
2769                }
2770                for (String line : sourceList) {
2771                        if (sourceList.indexOf(line) + 1 == sourceList.size()) {
2772                                //                              System.out.println("[makeCompounds] Final line in sourceLines.");
2773                                isLastSourceLine = true;
2774                        }
2775                        pdb_SOURCE_Handler(line);
2776                }
2777
2778        }
2779
2780        /**Parse KEYWODS record of the PDB file.<br>
2781         * A keyword may be split over two lines. whether a keyword ends by the end
2782         * of a line or it is aplit over two lines, a <code>space</code> is added
2783         * between the 2 lines's contents, unless the first line ends in
2784         * a '-' character.
2785         * <pre>
2786         * Record Format
2787         * COLUMNS       DATA  TYPE     FIELD         DEFINITION
2788         *      ---------------------------------------------------------------------------------
2789         *       1 -  6       Record name    "KEYWDS"
2790         *       9 - 10       Continuation   continuation  Allows concatenation of records if necessary.
2791         *      11 - 79       List           keywds        Comma-separated list of keywords relevant
2792         *                                                 to the entry.
2793         * Example
2794         *               1         2         3         4         5         6         7         8
2795         *      12345678901234567890123456789012345678901234567890123456789012345678901234567890
2796         *      KEYWDS    LYASE,  TRICARBOXYLIC ACID CYCLE, MITOCHONDRION, OXIDATIVE
2797         *      KEYWDS   2 METABOLISM
2798         * </pre>
2799         * @param lines The KEWODS record lines.
2800         * @author Amr ALHOSSARY
2801         */
2802        private void handlePDBKeywords(List<String> lines) {
2803                StringBuilder fullList = new StringBuilder();
2804                for (String line : lines) {
2805                        String kwList = line.substring(10).trim();
2806                        if(kwList.length() > 0) {
2807                                if(fullList.length() > 0 && fullList.indexOf("-", fullList.length()-1) < 0) {
2808                                        fullList.append(' ');
2809                                }
2810                                fullList.append(kwList);
2811                        }
2812                }
2813                String fulllengthList = fullList.toString();
2814                keywords = fulllengthList.split("( )*,( )*");
2815                ArrayList<String> lst = new ArrayList<>(keywords.length);
2816                for (String keyword : keywords) {
2817                        if(keyword.length() == 0) {
2818                                logger.debug("Keyword empty in structure {}", structure.getIdentifier().toString());
2819                                continue;
2820                        }
2821                        lst.add(keyword);
2822                }
2823                pdbHeader.setKeywords(lst);
2824        }
2825
2826        /**
2827         * Handles creation of all bonds. Looks at LINK records, SSBOND (Disulfide
2828         * bonds), peptide bonds, and intra-residue bonds.
2829         * <p>
2830         * Note: the current implementation only looks at the first model of each
2831         * structure. This may need to be fixed in the future.
2832         */
2833        private void formBonds() {
2834
2835                BondMaker maker = new BondMaker(structure, params);
2836
2837                // LINK records should be preserved, they are the way that
2838                // inter-residue bonds are created for ligands such as trisaccharides, unusual polymers.
2839                // The analogy in mmCIF is the _struct_conn record.
2840                for (LinkRecord linkRecord : linkRecords) {
2841                        maker.formLinkRecordBond(linkRecord);
2842                }
2843
2844                maker.formDisulfideBonds(ssbonds);
2845
2846                maker.makeBonds();
2847        }
2848
2849
2850
2851        private void triggerEndFileChecks(){
2852
2853                // we need to add the last chain and model, checking for nulls (e.g. the file could be completely empty of ATOM lines)
2854                if (currentChain!=null && currentGroup!=null) {
2855                        currentChain.addGroup(currentGroup);
2856                }
2857                if (currentModel!=null && currentChain!=null) {
2858                        currentModel.add(currentChain);
2859                }
2860                if (currentModel!=null) {
2861                        allModels.add(currentModel);
2862                }
2863
2864                if (blankChainIdsPresent) {
2865                        // from biojava 5.0 there's limited support for old pdb files with blank chain ids
2866                        logger.warn("Found some blank chain ids in PDB file. Please note that support for them has been discontinued and things might not work properly.");
2867                }
2868
2869                // reordering chains following the mmcif model and assigning entities
2870                assignChainsAndEntities();
2871                structure.setEntityInfos(entities);
2872
2873
2874
2875                // header data
2876
2877                Date modDate = pdbHeader.getModDate();
2878                if ( modDate.equals(new Date(0)) ) {
2879                        // modification date = deposition date
2880                        Date depositionDate = pdbHeader.getDepDate();
2881
2882                        if (! depositionDate.equals(modDate)){
2883                                // depDate is 0000-00-00
2884                                pdbHeader.setModDate(depositionDate);
2885                        }
2886                }
2887
2888                structure.setPDBHeader(pdbHeader);
2889                structure.setCrystallographicInfo(crystallographicInfo);
2890
2891                //set the JournalArticle, if there is one
2892                if (!journalLines.isEmpty()) {
2893                        buildjournalArticle();
2894                        pdbHeader.setJournalArticle(journalArticle);
2895                }
2896
2897                structure.setDBRefs(dbrefs);
2898
2899                // Only align if requested (default) and not when headerOnly mode with no Atoms.
2900                // Otherwise, we store the empty SeqRes Groups unchanged in the right chains.
2901                if ( params.isAlignSeqRes() && !params.isHeaderOnly() && !seqResChains.isEmpty()){
2902                        logger.debug("Parsing mode align_seqres, will parse SEQRES and align to ATOM sequence");
2903                        SeqRes2AtomAligner aligner = new SeqRes2AtomAligner();
2904                        aligner.align(structure,seqResChains);
2905
2906                } else {
2907                        logger.debug("Parsing mode unalign_seqres, will parse SEQRES but not align it to ATOM sequence");
2908                        SeqRes2AtomAligner.storeUnAlignedSeqRes(structure, seqResChains, params.isHeaderOnly());
2909                }
2910
2911
2912
2913                //associate the temporary Groups in the siteMap to the ones
2914                if (!params.isHeaderOnly()) {
2915                        // Only can link SITES if Atom Groups were parsed.
2916                        linkSitesToGroups(); // will work now that setSites is called
2917                }
2918
2919                if ( bioAssemblyParser != null){
2920                        bioAssemblyParser.setMacromolecularSizes();
2921                        pdbHeader.setBioAssemblies(bioAssemblyParser.getTransformationMap());
2922                }
2923
2924                if (ncsOperators !=null && ncsOperators.size()>0) {
2925                        crystallographicInfo.setNcsOperators(
2926                                ncsOperators.toArray(new Matrix4d[ncsOperators.size()]));
2927                }
2928
2929
2930                // rfree end file check
2931                // Rfree annotation is not very consistent in PDB format, it varies depending on the software
2932                // Here we follow this strategy:
2933                // a) take the '(NO CUTOFF)' value if the only one available (shelx software, e.g. 1x7q)
2934                // b) don't take it if also a line without '(NO CUTOFF)' is present (CNX software, e.g. 3lak)
2935
2936                if (rfreeNoCutoffLine>0 && rfreeStandardLine<0) {
2937                        pdbHeader.setRfree(rfreeNoCutoffLine);
2938                } else if (rfreeNoCutoffLine>0 && rfreeStandardLine>0) {
2939                        pdbHeader.setRfree(rfreeStandardLine);
2940                } else if (rfreeNoCutoffLine<0 && rfreeStandardLine>0) {
2941                        pdbHeader.setRfree(rfreeStandardLine);
2942                } // otherwise it remains default value: PDBHeader.DEFAULT_RFREE
2943
2944
2945
2946        }
2947
2948        private void setSecStruc(){
2949
2950                setSecElement(helixList, SecStrucInfo.PDB_AUTHOR_ASSIGNMENT,
2951                                SecStrucType.helix4);
2952                setSecElement(strandList, SecStrucInfo.PDB_AUTHOR_ASSIGNMENT,
2953                                SecStrucType.extended);
2954                setSecElement(turnList, SecStrucInfo.PDB_AUTHOR_ASSIGNMENT,
2955                                SecStrucType.turn);
2956
2957                //Now insert random coil to the Groups that did not have SS information
2958                GroupIterator gi = new GroupIterator(structure);
2959                while (gi.hasNext()){
2960                        Group g = gi.next();
2961                        if (g.hasAminoAtoms()){
2962                                if (g.getProperty(Group.SEC_STRUC) == null){
2963                                        SecStrucInfo ss = new SecStrucInfo(g,
2964                                                        SecStrucInfo.PDB_AUTHOR_ASSIGNMENT,
2965                                                        SecStrucType.coil);
2966                                        g.setProperty(Group.SEC_STRUC, ss);
2967                                }
2968                        }
2969                }
2970
2971        }
2972
2973        private void setSecElement(List<Map<String,String>> secList, String assignment, SecStrucType type){
2974
2975
2976                Iterator<Map<String,String>> iter = secList.iterator();
2977                nextElement:
2978                        while (iter.hasNext()){
2979                                Map<String,String> m = iter.next();
2980
2981                                // assign all residues in this range to this secondary structure type
2982                                // String initResName = (String)m.get("initResName");
2983                                String initChainId = m.get("initChainId");
2984                                String initSeqNum  = m.get("initSeqNum" );
2985                                String initICode   = m.get("initICode" );
2986                                // String endResName  = (String)m.get("endResName" );
2987                                String endChainId  = m.get("endChainId" );
2988                                String endSeqNum   = m.get("endSeqNum");
2989                                String endICode    = m.get("endICode");
2990
2991                                if (" ".equals(initICode))
2992                                        initICode = "";
2993                                if (" ".equals(endICode))
2994                                        endICode = "";
2995
2996                                GroupIterator gi = new GroupIterator(structure);
2997                                boolean inRange = false;
2998                                while (gi.hasNext()){
2999                                        Group g = gi.next();
3000                                        Chain c = g.getChain();
3001
3002                                        if (c.getName().equals(initChainId)){
3003
3004                                                String pdbCode = initSeqNum + initICode;
3005                                                if ( g.getResidueNumber().toString().equals(pdbCode)  ) {
3006                                                        inRange = true;
3007                                                }
3008                                        }
3009                                        if ( inRange){
3010                                                if (g.hasAminoAtoms()) {
3011                                                        SecStrucInfo ss = new SecStrucInfo(g, assignment, type);
3012                                                        g.setProperty(Group.SEC_STRUC, ss);
3013                                                }
3014
3015                                        }
3016                                        if ( c.getName().equals(endChainId)){
3017                                                String pdbCode = endSeqNum + endICode;
3018                                                if (pdbCode.equals(g.getResidueNumber().toString())){
3019                                                        inRange = false;
3020                                                        continue nextElement;
3021                                                }
3022                                        }
3023                                }
3024                        }
3025        }
3026
3027        /**
3028         * Gets all chains with given chainName from given models list
3029         * @param chainName
3030         * @param polyModels
3031         * @return
3032         */
3033        private static List<List<Chain>> findChains(String chainName, List<List<Chain>> polyModels) {
3034                List<List<Chain>> models = new ArrayList<>();
3035
3036                for (List<Chain> chains:polyModels) {
3037                        List<Chain> matchingChains = new ArrayList<>();
3038                        models.add(matchingChains);
3039                        for (Chain c:chains) {
3040                                if (c.getName().equals(chainName)) {
3041                                        matchingChains.add(c);
3042                                }
3043                        }
3044                }
3045                return models;
3046        }
3047
3048        /**
3049         * Split the given chain (containing non-polymer groups and water groups only)
3050         * into individual chains per non-polymer group and individual chains per contiguous sets of water groups.
3051         * @param chain
3052         * @return a list of lists of size 2: first list is the split non-poly chains, second list is the split water chains
3053         */
3054        private static List<List<Chain>> splitNonPolyChain(Chain chain) {
3055                List<Chain> splitNonPolys = new ArrayList<>();
3056                List<Chain> waterChains = new ArrayList<>();
3057
3058                Chain split = null;
3059                boolean previousGroupIsWater = false;
3060
3061                for (Group g:chain.getAtomGroups()){
3062
3063                        if (!previousGroupIsWater) {
3064                                // add last one if there's one
3065                                if (split!=null) {
3066                                        splitNonPolys.add(split);
3067                                }
3068                                split = new ChainImpl();
3069                                split.setName(chain.getName());
3070                        } else if (!g.isWater()) {
3071                                // previous group is water and this group is not water: we change from a water chain to a non-poly
3072                                // we'll need to add now the water chain to the list of water chains
3073                                waterChains.add(split);
3074                                split = new ChainImpl();
3075                                split.setName(chain.getName());
3076                        }
3077
3078                        if (g.isWater()) {
3079                                previousGroupIsWater = true;
3080                        } else {
3081                                previousGroupIsWater = false;
3082
3083                        }
3084
3085                        // this should include alt locs (referenced from the main group)
3086                        split.addGroup(g);
3087
3088                }
3089
3090                // adding the last split chain: either to water or non-poly depending on what was the last seen group
3091                if (split!=null) {
3092                        if (previousGroupIsWater)
3093                                waterChains.add(split);
3094                        else
3095                                splitNonPolys.add(split);
3096                }
3097
3098
3099                List<List<Chain>> all = new ArrayList<>(2);
3100                all.add(splitNonPolys);
3101                all.add(waterChains);
3102
3103                return all;
3104        }
3105
3106        /**
3107         * Assign asym ids following the rules used by the PDB to assign asym ids in mmCIF files
3108         * @param polys
3109         * @param nonPolys
3110         * @param waters
3111         */
3112        private void assignAsymIds(List<List<Chain>> polys, List<List<Chain>> nonPolys, List<List<Chain>> waters) {
3113
3114                for (int i=0; i<polys.size(); i++) {
3115                        String asymId = "A";
3116
3117                        for (Chain poly:polys.get(i)) {
3118                                poly.setId(asymId);
3119                                asymId = getNextAsymId(asymId);
3120                        }
3121                        for (Chain nonPoly:nonPolys.get(i)) {
3122                                nonPoly.setId(asymId);
3123                                asymId = getNextAsymId(asymId);
3124                        }
3125                        for (Chain water:waters.get(i)) {
3126                                water.setId(asymId);
3127                                asymId = getNextAsymId(asymId);
3128                        }
3129                }
3130        }
3131
3132        /**
3133         * Gets the next asym id given an asymId, according to the convention followed by
3134         * mmCIF files produced by the PDB
3135         * i.e.: A,B,...,Z,AA,BA,CA,...,ZA,AB,BB,CB,...,ZB,.......,ZZ,AAA,BAA,CAA,...
3136         * @param asymId
3137         * @return
3138         */
3139        private String getNextAsymId(String asymId) {
3140                if (asymId.length()==1) {
3141                        if (!"Z".equals(asymId)) {
3142                                return Character.toString(getNextChar(asymId.charAt(0)));
3143                        } else {
3144                                return "AA";
3145                        }
3146                } else if (asymId.length()==2) {
3147                        if ("ZZ".equals(asymId)) {
3148                                return "AAA";
3149                        }
3150                        char[] c = new char[2];
3151                        asymId.getChars(0, 2, c, 0);
3152                        c[0] = getNextChar(c[0]);
3153                        if (c[0]=='A') {
3154                                c[1] = getNextChar(c[1]);
3155                        }
3156                        return String.valueOf(c);
3157                } else if (asymId.length()==3) {
3158                        char[] c = new char[3];
3159                        asymId.getChars(0, 3, c, 0);
3160                        c[0] = getNextChar(c[0]);
3161                        if (c[0]=='A') {
3162                                c[1] = getNextChar(c[1]);
3163                                if (c[1]=='A') {
3164                                        c[2] = getNextChar(c[2]);
3165                                }
3166                        }
3167                        return String.valueOf(c);
3168                }
3169                return null;
3170        }
3171
3172        private char getNextChar(char c) {
3173                if (c!='Z') {
3174                        return ((char)(c+1));
3175                } else {
3176                        return 'A';
3177                }
3178        }
3179
3180        /**
3181         * Here we assign chains following the mmCIF data model:
3182         * one chain per polymer, one chain per non-polymer group and
3183         * several water chains.
3184         * <p>
3185         * Subsequently we assign entities for them: either from those read from
3186         * COMPOUND records or from those found heuristically through {@link EntityFinder}
3187         *
3188         */
3189        private void assignChainsAndEntities(){
3190
3191                List<List<Chain>> polyModels = new ArrayList<>();
3192                List<List<Chain>> nonPolyModels = new ArrayList<>();
3193                List<List<Chain>> waterModels = new ArrayList<>();
3194
3195                for (List<Chain> model:allModels) {
3196
3197                        List<Chain> polyChains = new ArrayList<>();
3198                        List<Chain> nonPolyChains = new ArrayList<>();
3199                        List<Chain> waterChains = new ArrayList<>();
3200
3201                        polyModels.add(polyChains);
3202                        nonPolyModels.add(nonPolyChains);
3203                        waterModels.add(waterChains);
3204
3205                        for (Chain c:model) {
3206
3207                                // we only have entities for polymeric chains, all others are ignored for assigning entities
3208                                if (c.isWaterOnly()) {
3209                                        waterChains.add(c);
3210
3211                                } else if (c.isPureNonPolymer()) {
3212                                        nonPolyChains.add(c);
3213
3214                                } else {
3215                                        polyChains.add(c);
3216                                }
3217                        }
3218                }
3219
3220                List<List<Chain>> splitNonPolyModels = new ArrayList<>();
3221                for (int i=0; i<nonPolyModels.size(); i++) {
3222                        List<Chain> nonPolyModel = nonPolyModels.get(i);
3223                        List<Chain> waterModel = waterModels.get(i);
3224
3225                        List<Chain> splitNonPolys = new ArrayList<>();
3226                        splitNonPolyModels.add(splitNonPolys);
3227
3228                        for (Chain nonPoly:nonPolyModel) {
3229                                List<List<Chain>> splits = splitNonPolyChain(nonPoly);
3230                                splitNonPolys.addAll(splits.get(0));
3231                                waterModel.addAll(splits.get(1));
3232                        }
3233                }
3234
3235
3236                // now we have all chains as in mmcif, let's assign ids following the mmcif rules
3237                assignAsymIds(polyModels, splitNonPolyModels, waterModels);
3238
3239
3240                if (!entities.isEmpty()) {
3241                        // if the file contained COMPOUND records then we can assign entities to the poly chains
3242                        for (EntityInfo comp : entities){
3243                        List<String> chainIds = compoundMolIds2chainIds.get(comp.getMolId());
3244                        if ( chainIds == null)
3245                                continue;
3246                        for ( String chainId : chainIds) {
3247
3248                                        List<List<Chain>> models = findChains(chainId, polyModels);
3249
3250                                        for (List<Chain> matchingChains:models) {
3251                                                for (Chain chain:matchingChains) {
3252                                                        comp.addChain(chain);
3253                                                        chain.setEntityInfo(comp);
3254                                                }
3255
3256                                                if (matchingChains.isEmpty()) {
3257                                        // usually if this happens something is wrong with the PDB header
3258                                        // e.g. 2brd - there is no Chain A, although it is specified in the header
3259                                        // Some bona-fide cases exist, e.g. 2ja5, chain N is described in SEQRES
3260                                        // but the authors didn't observe in the density so it's completely missing
3261                                        // from the ATOM lines
3262                                                        logger.warn("Could not find polymeric chain {} to link to entity {}. The chain will be missing in the entity.", chainId, comp.getMolId());
3263                                                }
3264                                        }
3265                                }
3266                        }
3267
3268                } else {
3269
3270                        logger.info("Entity information (COMPOUND record) not found in file. Will assign entities heuristically");
3271                        // if no entity information was present in file we then go and find the entities heuristically with EntityFinder
3272                        entities = EntityFinder.findPolyEntities(polyModels);
3273
3274                }
3275
3276                // now we assign entities to the nonpoly and water chains
3277                EntityFinder.createPurelyNonPolyEntities(splitNonPolyModels, waterModels, entities);
3278
3279
3280                // in some rare cases purely non-polymer or purely water chain are present in pdb files
3281                // see https://github.com/biojava/biojava/pull/394
3282                // these case should be covered by the above
3283
3284
3285                // now that we have entities in chains we add the chains to the structure
3286
3287                for (int i=0;i<allModels.size();i++) {
3288                        List<Chain> model = new ArrayList<>();
3289                        model.addAll(polyModels.get(i));
3290                        model.addAll(splitNonPolyModels.get(i));
3291                        model.addAll(waterModels.get(i));
3292                        structure.addModel(model);
3293                        }
3294
3295
3296        }
3297
3298        /**
3299         * Links the Sites in the siteMap to the Groups in the Structure via the
3300         * siteToResidueMap ResidueNumber.
3301         * @author Jules Jacobsen
3302         * @return
3303         */
3304        private void linkSitesToGroups() {
3305
3306                //System.out.println("LINK SITES TO GROUPS:" + siteToResidueMap.keySet().size());
3307
3308                //link the map of siteIds : <ResidueNumber> with the sites by using ResidueNumber to get the correct group back.
3309                //the return list
3310
3311                if ( siteMap == null || siteToResidueMap == null){
3312                        logger.info("Sites can not be linked to residues!");
3313
3314                        return;
3315                }
3316
3317                List<Site> sites = null;
3318                //check that there are chains with which to associate the groups
3319                if (structure.getChains().isEmpty()) {
3320                        sites = new ArrayList<>(siteMap.values());
3321                        logger.info("No chains to link Site Groups with - Sites will not be present in the Structure");
3322                        return;
3323                }
3324
3325                //check that the keys in the siteMap and SiteToResidueMap are equal
3326                if (! siteMap.keySet().equals(siteToResidueMap.keySet())) {
3327                        logger.info("Not all sites have been properly described in the PDB " + pdbId + " header - some Sites will not be present in the Structure");
3328                        logger.debug(siteMap.keySet() + " | " + siteToResidueMap.keySet());
3329                        //return;
3330                }
3331
3332                //so we have chains - associate the siteResidues-related groups with the ones
3333                //already in in the chains
3334                for (String key : siteMap.keySet()) {
3335                        Site currentSite = siteMap.get(key);
3336                        List<ResidueNumber> linkedGroups = siteToResidueMap.get(key);
3337                        if ( linkedGroups == null)
3338                                continue;
3339                        for (ResidueNumber residueNumber : linkedGroups) {
3340
3341                                String pdbCode = residueNumber.toString();
3342                                String chain = residueNumber.getChainName();
3343                                //                    System.out.println("chain: '" + chain + "'");
3344                                //                    String resNum = resNum.getSeqNum().toString();
3345                                //                    System.out.println("resNum: '" + resNum + "'");
3346
3347                                Group linkedGroup = null;
3348                                try {
3349                                        //TODO: implement findGroup(ResidueNumber resNum)
3350                                        linkedGroup = structure.findGroup(chain, pdbCode);
3351                                } catch (StructureException ex) {
3352                                        logger.info("Can't find group " + pdbCode + " in chain " + chain + " in order to link up SITE records (PDB ID " + pdbId +")");
3353                                        continue;
3354                                }
3355
3356                                //                    System.out.println("Adding group: " + linkedGroup.getSeqNum() + " to site " + site.getSiteID());
3357                                currentSite.getGroups().add(linkedGroup);
3358                        }
3359                }
3360
3361                //System.out.println("SITEMAP: " + siteMap);
3362
3363                sites = new ArrayList<>(siteMap.values());
3364                structure.setSites(sites);
3365                //System.out.println("STRUCTURE SITES: " + structure.getSites().size());
3366                //            for (Site site : structure.getSites()) {
3367                //                System.out.println(site);
3368                //            }
3369                //            System.out.println("Linked Site Groups with Chains");
3370
3371        }
3372
3373        private void buildjournalArticle() {
3374
3375                logger.debug("building new JournalArticle");
3376                //            for (String line : journalLines) {
3377                //                System.out.println(line);
3378                //            }
3379
3380                this.journalArticle = new JournalArticle();
3381                //        JRNL        AUTH   M.HAMMEL,G.SFYROERA,D.RICKLIN,P.MAGOTTI,
3382                //        JRNL        AUTH 2 J.D.LAMBRIS,B.V.GEISBRECHT
3383                //        JRNL        TITL   A STRUCTURAL BASIS FOR COMPLEMENT INHIBITION BY
3384                //        JRNL        TITL 2 STAPHYLOCOCCUS AUREUS.
3385                //        JRNL        REF    NAT.IMMUNOL.                  V.   8   430 2007
3386                //        JRNL        REFN                   ISSN 1529-2908
3387                //        JRNL        PMID   17351618
3388                //        JRNL        DOI    10.1038/NI1450
3389                StringBuffer auth = new StringBuffer();
3390                StringBuffer titl = new StringBuffer();
3391                StringBuffer edit = new StringBuffer();
3392                StringBuffer ref = new StringBuffer();
3393                StringBuffer publ = new StringBuffer();
3394                StringBuffer refn = new StringBuffer();
3395                StringBuffer pmid = new StringBuffer();
3396                StringBuffer doi = new StringBuffer();
3397
3398                for (String line : journalLines) {
3399                        if ( line.length() < 19 ) {
3400                                logger.info("can not process Journal line: " + line);
3401                                continue;
3402                        }
3403                        //            System.out.println("'" + line + "'");
3404                        String subField = line.substring(12, 16);
3405                        //            System.out.println("'" + subField + "'");
3406                        if ("AUTH".equals(subField)) {
3407                                auth.append(line.substring(19, line.length()).trim());
3408
3409                                logger.debug("AUTH '{}'", auth.toString());
3410
3411                        }
3412                        if ("TITL".equals(subField)) {
3413                                //add a space to the end of a line so that when wrapped the
3414                                //words on the join won't be concatenated
3415                                titl.append(line.substring(19, line.length()).trim()).append(" ");
3416
3417                                logger.debug("TITL '{}'", titl.toString());
3418
3419                        }
3420                        if ("EDIT".equals(subField)) {
3421                                edit.append(line.substring(19, line.length()).trim());
3422
3423                                logger.debug("EDIT '{}'", edit.toString());
3424
3425                        }
3426                        //        JRNL        REF    NAT.IMMUNOL.                  V.   8   430 2007
3427                        if ("REF ".equals(subField)) {
3428                                ref.append(line.substring(19, line.length()).trim()).append(" ");
3429
3430                                logger.debug("REF '{}'", ref.toString());
3431
3432                        }
3433                        if ("PUBL".equals(subField)) {
3434                                publ.append(line.substring(19, line.length()).trim()).append(" ");
3435
3436                                logger.debug("PUBL '{}'", publ.toString());
3437
3438                        }
3439                        //        JRNL        REFN                   ISSN 1529-2908
3440                        if ("REFN".equals(subField)) {
3441                                if ( line.length() < 35 ) {
3442                                        logger.info("can not process Journal REFN line: " + line);
3443                                        continue;
3444                                }
3445                                refn.append(line.substring(35, line.length()).trim());
3446
3447                                logger.debug("REFN '{}'", refn.toString());
3448
3449                        }
3450                        //        JRNL        PMID   17351618
3451                        if ("PMID".equals(subField)) {
3452                                pmid.append(line.substring(19, line.length()).trim());
3453
3454                                logger.debug("PMID '{}'", pmid.toString());
3455
3456                        }
3457                        //        JRNL        DOI    10.1038/NI1450
3458                        if ("DOI ".equals(subField)) {
3459                                doi.append(line.substring(19, line.length()).trim());
3460
3461                                logger.debug("DOI '{}'", doi.toString());
3462
3463                        }
3464                }
3465
3466                //now set the parts of the JournalArticle
3467                journalArticle.setAuthorList(authorBuilder(auth.toString()));
3468                journalArticle.setEditorList(authorBuilder(edit.toString()));
3469                journalArticle.setRef(ref.toString());
3470                JournalParser journalParser = new JournalParser(ref.toString());
3471                journalArticle.setJournalName(journalParser.getJournalName());
3472                if (!"TO BE PUBLISHED".equals(journalArticle.getJournalName())) {
3473                        journalArticle.setIsPublished(true);
3474                }
3475                journalArticle.setVolume(journalParser.getVolume());
3476                journalArticle.setStartPage(journalParser.getStartPage());
3477                journalArticle.setPublicationDate(journalParser.getPublicationDate());
3478                journalArticle.setPublisher(publ.toString().trim());
3479                journalArticle.setTitle(titl.toString().trim());
3480                journalArticle.setRefn(refn.toString().trim());
3481                journalArticle.setPmid(pmid.toString().trim());
3482                journalArticle.setDoi(doi.toString().trim());
3483
3484
3485                logger.debug("Made JournalArticle:");
3486                logger.debug(journalArticle.toString());
3487
3488        }
3489
3490        //inner class to deal with all the journal info
3491        private class JournalParser {
3492
3493                private String journalName;
3494                private String volume;
3495                private String startPage;
3496                private int publicationDate;
3497
3498
3499                public JournalParser(String ref) {
3500
3501                        logger.debug("JournalParser init '{}'", ref);
3502
3503
3504                        if ("TO BE PUBLISHED ".equals(ref)) {
3505                                journalName = ref.trim();
3506
3507                                logger.debug(String.format("JournalParser found journalString '%s'", journalName));
3508
3509                                return;
3510                        }
3511
3512                        if (ref.length() < 48) {
3513                                logger.info("REF line too short - must be at least 48 characters to be valid for parsing.");
3514                                journalName = "";
3515                                volume = "";
3516                                startPage = "";
3517                                publicationDate = 0;
3518                                return;
3519                        }
3520                        //can be multi line:
3521                        //REF    PHILOS.TRANS.R.SOC.LONDON,    V. 293    53 1981
3522                        //REF  2 SER.B
3523
3524                        //or
3525
3526                        //REF    GLYCOGEN PHOSPHORYLASE B:                1 1991
3527                        //REF  2 DESCRIPTION OF THE PROTEIN
3528                        //REF  3 STRUCTURE
3529
3530                        //but usually single line
3531                        //REF    NUCLEIC ACIDS RES.                         2009
3532                        //REF    MOL.CELL                                   2009
3533                        //REF    NAT.STRUCT.MOL.BIOL.          V.  16   238 2009
3534                        //REF    ACTA CRYSTALLOGR.,SECT.F      V.  65   199 2009
3535                        //check if the date is present at the end of the line.
3536                        //                             09876543210987654321
3537                        //'J.BIOL.CHEM.                  V. 280 23000 2005 '
3538                        //'J.AM.CHEM.SOC.                V. 130 16011 2008 '
3539                        //'NAT.STRUCT.MOL.BIOL.          V.  16   238 2009'
3540                        String volumeInformation = ref.substring(30, 48);
3541
3542                        logger.debug(String.format("Parsing volumeInformation: '%s'", volumeInformation));
3543
3544                        //volumeInformation: 'V. 293    53 1981 '
3545                        //                      String dateString = ref.substring(ref.length() - 5 , ref.length() - 1).trim();
3546                        //                      String startPageString = ref.substring(ref.length() - 11 , ref.length() - 6).trim();
3547                        //                      String volumeString = ref.substring(ref.length() - 16 , ref.length() - 12).trim();
3548                        //                      String journalString = ref.substring(0 , ref.length() - 18).trim();
3549                        String dateString = volumeInformation.substring(volumeInformation.length() - 5 , volumeInformation.length() - 1).trim();
3550                        String startPageString = volumeInformation.substring(volumeInformation.length() - 11 , volumeInformation.length() - 6).trim();
3551                        String volumeString = volumeInformation.substring(volumeInformation.length() - 16 , volumeInformation.length() - 12).trim();
3552                        //for the journal string we need to remove the volume information which might be in the middle of the string (e.g. 1gpb, 3pfk)
3553                        String journalString = ref.substring(0 , 29).trim() + " " + ref.substring(30, ref.length() - 1).replace(volumeInformation.trim(), "").trim();
3554                        journalString = journalString.trim();
3555                        //                        System.out.println("journalString: " + journalString);
3556
3557                        logger.debug(String.format("JournalParser found volumeString '%s'", volumeString));
3558                        logger.debug(String.format("JournalParser found startPageString '%s'", startPageString));
3559                        logger.debug(String.format("JournalParser found dateString '%s'", dateString));
3560                        logger.debug(String.format("JournalParser found journalString '%s'", journalString));
3561
3562
3563                        if (!"    ".equals(dateString)) {
3564                                try {
3565                                        publicationDate = Integer.valueOf(dateString);
3566                                } catch (NumberFormatException nfe) {
3567                                        logger.info(dateString + " is not a valid integer for a date in JRNL sub-section REF line 1");
3568                                }
3569                                //                              if (DEBUG) {
3570                                //                                      System.out.println("JournalParser set date " + publicationDate);
3571                                //                              }
3572                        }
3573
3574                        if (!"    ".equals(startPageString)) {
3575                                startPage = startPageString;
3576                                //                              if (DEBUG) {
3577                                //                                      System.out.println("JournalParser set startPage " + startPage);
3578                                //                              }
3579                        }
3580
3581                        if (!"    ".equals(volumeString)) {
3582                                volume = volumeString;
3583                                //                              if (DEBUG) {
3584                                //                                      System.out.println("JournalParser set volume " + volume);
3585                                //                              }
3586                        }
3587
3588                        if (!"    ".equals(journalString)) {
3589                                journalName = journalString;
3590
3591                                logger.debug("JournalParser set journalName {}", journalName);
3592
3593                        }
3594                }
3595
3596                private String getJournalName() {
3597                        return journalName;
3598                }
3599
3600                private int getPublicationDate() {
3601                        return publicationDate;
3602                }
3603
3604                private String getStartPage() {
3605                        return startPage;
3606                }
3607
3608                private String getVolume() {
3609                        return volume;
3610                }
3611        }
3612
3613        private List<Author> authorBuilder(String authorString) {
3614                ArrayList<Author> authorList = new ArrayList<>();
3615
3616                if ("".equals(authorString)) {
3617                        return authorList;
3618                }
3619
3620                String[] authors = authorString.split(",");
3621                //        if (DEBUG) {
3622                //            for (int i = 0; i < authors.length; i++) {
3623                //                String string = authors[i];
3624                //                System.out.println("authorBuilder author: '" + string + "'");
3625                //            }
3626                //        }
3627                //        AUTH   SEATTLE STRUCTURAL GENOMICS CENTER FOR INFECTIOUS
3628                //        AUTH 2 DISEASE (SSGCID)
3629                //        or
3630                //        AUTH   E.DOBROVETSKY,A.DONG,A.SEITOVA,B.DUNCAN,L.CROMBET,
3631                //        AUTH 2 M.SUNDSTROM,C.H.ARROWSMITH,A.M.EDWARDS,C.BOUNTRA,
3632                //        AUTH 3 A.BOCHKAREV,D.COSSAR,
3633                //        AUTH 4 STRUCTURAL GENOMICS CONSORTIUM (SGC)
3634                //        or
3635                //        AUTH   T.-C.MOU,S.R.SPRANG,N.MASADA,D.M.F.COOPER
3636                if (authors.length == 1) {
3637                        //only one element means it's a consortium only
3638                        Author author = new Author();
3639                        author.setSurname(authors[0]);
3640
3641                        logger.debug("Set consortium author name {}", author.getSurname());
3642
3643                        authorList.add(author);
3644                } else {
3645                        for (int i = 0; i < authors.length; i++) {
3646                                String authorFullName = authors[i];
3647
3648                                logger.debug("Building author {}", authorFullName);
3649
3650                                Author author = new Author();
3651                                String regex = "\\.";
3652                                String[] authorNames = authorFullName.split(regex);
3653                                //                if (DEBUG) {
3654                                //                    System.out.println("authorNames size " + authorNames.length);
3655                                //                    for (int j = 0; j < authorNames.length; j++) {
3656                                //                        String name = authorNames[j];
3657                                //                        System.out.println("split authName '" + name + "'");
3658                                //
3659                                //                    }
3660                                //                }
3661                                if (authorNames.length == 0) {
3662                                        author.setSurname(authorFullName);
3663
3664                                        logger.debug("Unable to split using '" + regex + "' Setting whole name " + author.getSurname());
3665
3666                                }
3667                                //again there might be a consortium name so there may be no elements
3668                                else if (authorNames.length == 1) {
3669                                        author.setSurname(authorNames[0]);
3670
3671                                        logger.debug("Set consortium author name in multiple author block {}", author.getSurname
3672                                                                ());
3673
3674                                } else {
3675                                        String initials = "";
3676                                        for (int j = 0; j < authorNames.length - 1; j++) {
3677                                                String initial = authorNames[j];
3678                                                //                        if (DEBUG) {
3679                                                //                            System.out.println("adding initial '" + initial + "'");
3680                                                //                        }
3681                                                //build the initials back up again
3682                                                initials += initial + ".";
3683                                        }
3684
3685                                        logger.debug("built initials '{}'", initials);
3686
3687                                        author.setInitials(initials);
3688                                        //surname is always last
3689                                        int lastName = authorNames.length - 1;
3690                                        String surname = authorNames[lastName];
3691
3692                                        logger.debug("built author surname {}", surname);
3693
3694                                        author.setSurname(surname);
3695
3696                                }
3697                                authorList.add(author);
3698                        }
3699                }
3700                return authorList;
3701        }
3702
3703        public void setFileParsingParameters(FileParsingParameters params)
3704        {
3705                this.params= params;
3706
3707                // set the correct max values for parsing...
3708                loadMaxAtoms = params.getMaxAtoms();
3709                atomCAThreshold = params.getAtomCaThreshold();
3710
3711
3712        }
3713
3714        public FileParsingParameters getFileParsingParameters(){
3715                return params;
3716        }
3717
3718
3719}