001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 * created at Apr 26, 2008
021 */
022package org.biojava.nbio.structure.io.mmcif;
023
024import java.text.ParseException;
025import java.text.SimpleDateFormat;
026import java.util.ArrayList;
027import java.util.Date;
028import java.util.HashMap;
029import java.util.LinkedHashMap;
030import java.util.List;
031import java.util.Locale;
032import java.util.Map;
033
034import javax.vecmath.Matrix4d;
035
036import org.biojava.nbio.structure.AminoAcid;
037import org.biojava.nbio.structure.AminoAcidImpl;
038import org.biojava.nbio.structure.Atom;
039import org.biojava.nbio.structure.AtomImpl;
040import org.biojava.nbio.structure.Chain;
041import org.biojava.nbio.structure.ChainImpl;
042import org.biojava.nbio.structure.EntityInfo;
043import org.biojava.nbio.structure.EntityType;
044import org.biojava.nbio.structure.DBRef;
045import org.biojava.nbio.structure.Element;
046import org.biojava.nbio.structure.Group;
047import org.biojava.nbio.structure.GroupType;
048import org.biojava.nbio.structure.HetatomImpl;
049import org.biojava.nbio.structure.NucleotideImpl;
050import org.biojava.nbio.structure.PDBCrystallographicInfo;
051import org.biojava.nbio.structure.PDBHeader;
052import org.biojava.nbio.structure.ResidueNumber;
053import org.biojava.nbio.structure.SeqMisMatch;
054import org.biojava.nbio.structure.SeqMisMatchImpl;
055import org.biojava.nbio.structure.Site;
056import org.biojava.nbio.structure.Structure;
057import org.biojava.nbio.structure.StructureException;
058import org.biojava.nbio.structure.StructureImpl;
059import org.biojava.nbio.structure.StructureTools;
060import org.biojava.nbio.structure.io.BondMaker;
061import org.biojava.nbio.structure.io.ChargeAdder;
062import org.biojava.nbio.structure.io.EntityFinder;
063import org.biojava.nbio.structure.io.FileParsingParameters;
064import org.biojava.nbio.structure.io.SeqRes2AtomAligner;
065import org.biojava.nbio.structure.io.mmcif.model.AtomSite;
066import org.biojava.nbio.structure.io.mmcif.model.AtomSites;
067import org.biojava.nbio.structure.io.mmcif.model.AuditAuthor;
068import org.biojava.nbio.structure.io.mmcif.model.Cell;
069import org.biojava.nbio.structure.io.mmcif.model.ChemComp;
070import org.biojava.nbio.structure.io.mmcif.model.ChemCompAtom;
071import org.biojava.nbio.structure.io.mmcif.model.ChemCompBond;
072import org.biojava.nbio.structure.io.mmcif.model.ChemCompDescriptor;
073import org.biojava.nbio.structure.io.mmcif.model.DatabasePDBremark;
074import org.biojava.nbio.structure.io.mmcif.model.DatabasePDBrev;
075import org.biojava.nbio.structure.io.mmcif.model.DatabasePdbrevRecord;
076import org.biojava.nbio.structure.io.mmcif.model.Entity;
077import org.biojava.nbio.structure.io.mmcif.model.EntityPoly;
078import org.biojava.nbio.structure.io.mmcif.model.EntityPolySeq;
079import org.biojava.nbio.structure.io.mmcif.model.EntitySrcGen;
080import org.biojava.nbio.structure.io.mmcif.model.EntitySrcNat;
081import org.biojava.nbio.structure.io.mmcif.model.EntitySrcSyn;
082import org.biojava.nbio.structure.io.mmcif.model.Exptl;
083import org.biojava.nbio.structure.io.mmcif.model.PdbxAuditRevisionHistory;
084import org.biojava.nbio.structure.io.mmcif.model.PdbxChemCompDescriptor;
085import org.biojava.nbio.structure.io.mmcif.model.PdbxChemCompIdentifier;
086import org.biojava.nbio.structure.io.mmcif.model.PdbxDatabaseStatus;
087import org.biojava.nbio.structure.io.mmcif.model.PdbxEntityNonPoly;
088import org.biojava.nbio.structure.io.mmcif.model.PdbxNonPolyScheme;
089import org.biojava.nbio.structure.io.mmcif.model.PdbxPolySeqScheme;
090import org.biojava.nbio.structure.io.mmcif.model.PdbxStructAssembly;
091import org.biojava.nbio.structure.io.mmcif.model.PdbxStructAssemblyGen;
092import org.biojava.nbio.structure.io.mmcif.model.PdbxStructOperList;
093import org.biojava.nbio.structure.io.mmcif.model.Refine;
094import org.biojava.nbio.structure.io.mmcif.model.Struct;
095import org.biojava.nbio.structure.io.mmcif.model.StructAsym;
096import org.biojava.nbio.structure.io.mmcif.model.StructConn;
097import org.biojava.nbio.structure.io.mmcif.model.StructKeywords;
098import org.biojava.nbio.structure.io.mmcif.model.StructNcsOper;
099import org.biojava.nbio.structure.io.mmcif.model.StructRef;
100import org.biojava.nbio.structure.io.mmcif.model.StructRefSeq;
101import org.biojava.nbio.structure.io.mmcif.model.StructRefSeqDif;
102import org.biojava.nbio.structure.io.mmcif.model.StructSite;
103import org.biojava.nbio.structure.io.mmcif.model.StructSiteGen;
104import org.biojava.nbio.structure.io.mmcif.model.Symmetry;
105import org.biojava.nbio.structure.quaternary.BioAssemblyInfo;
106import org.biojava.nbio.structure.quaternary.BiologicalAssemblyBuilder;
107import org.biojava.nbio.structure.quaternary.BiologicalAssemblyTransformation;
108import org.biojava.nbio.structure.xtal.CrystalCell;
109import org.biojava.nbio.structure.xtal.SpaceGroup;
110import org.biojava.nbio.structure.xtal.SymoplibParser;
111import org.slf4j.Logger;
112import org.slf4j.LoggerFactory;
113
114/**
115 * A MMcifConsumer implementation that builds an in-memory representation of the
116 * content of a mmcif file as a BioJava Structure object.
117 *
118 * @author Andreas Prlic
119 * @since 1.7
120 */
121
122public class SimpleMMcifConsumer implements MMcifConsumer {
123
124        private static final Logger logger = LoggerFactory.getLogger(SimpleMMcifConsumer.class);
125
126        private Structure structure;
127        private Chain currentChain;
128        private Group currentGroup;
129
130        /**
131         * A temporary data structure to hold all parsed chains
132         */
133        private ArrayList<List<Chain>> allModels;
134        /**
135         * The current set of chains per model
136         */
137        private List<Chain>      currentModel;
138        private List<Entity>     entities;
139        /**
140         * Needed in header only mode to get mapping between asym ids and author ids
141         */
142        private List<EntityPoly> entityPolys;
143        private List<StructRef>  strucRefs;
144        private List<Chain>      seqResChains;
145        private List<Chain>      entityChains; // needed to link entities, chains and compounds...
146        private List<StructAsym> structAsyms;  // needed to link entities, chains and compounds...
147        private List<PdbxStructOperList> structOpers ; //
148        private List<PdbxStructAssembly> strucAssemblies;
149        private List<PdbxStructAssemblyGen> strucAssemblyGens;
150        private List<EntitySrcGen> entitySrcGens;
151        private List<EntitySrcNat> entitySrcNats;
152        private List<EntitySrcSyn> entitySrcSyns;
153        private List<StructConn> structConn;
154        private List<StructNcsOper> structNcsOper;
155        private List<StructRefSeqDif> sequenceDifs;
156        private List<StructSiteGen> structSiteGens;
157
158        private Matrix4d parsedScaleMatrix;
159
160
161
162        /**
163         * A map of asym ids (internal chain ids) to entity ids extracted from
164         * the _struct_asym category
165         */
166        private Map<String,String> asymId2entityId;
167
168        /**
169         * A map of asym ids (internal chain ids) to author ids extracted from
170         * the _entity_poly category. Used in header only parsing.
171         */
172        private Map<String,String> asymId2authorId;
173
174        private String currentNmrModelNumber ;
175
176        private FileParsingParameters params;
177
178        public  SimpleMMcifConsumer(){
179                params = new FileParsingParameters();
180                documentStart();
181
182        }
183
184        @Override
185        public void newEntity(Entity entity) {
186                logger.debug("New entity: {}",entity.toString());
187                entities.add(entity);
188        }
189
190        @Override
191        public void newEntityPoly(EntityPoly entityPoly) {
192                entityPolys.add(entityPoly);
193        }
194
195        @Override
196        public void newPdbxStructOperList(PdbxStructOperList structOper){
197
198                structOpers.add(structOper);
199        }
200
201        @Override
202        public void newStructAsym(StructAsym sasym){
203
204                structAsyms.add(sasym);
205        }
206
207        private Entity getEntity(int entity_id){
208                try {
209                        for (Entity e: entities){
210                                int eId = Integer.parseInt(e.getId());
211                                if  (eId== entity_id){
212                                        return e;
213                                }
214                        }
215                } catch (NumberFormatException e) {
216                        logger.warn("Entity id does not look like a number:", e.getMessage());
217                }
218                return null;
219        }
220
221        @Override
222        public void newStructKeywords(StructKeywords kw){
223                PDBHeader header = structure.getPDBHeader();
224                if ( header == null)
225                        header = new PDBHeader();
226                header.setDescription(kw.getPdbx_keywords());
227                header.setClassification(kw.getPdbx_keywords());
228        }
229
230        @Override
231        public void setStruct(Struct struct) {
232
233                PDBHeader header = structure.getPDBHeader();
234                if ( header == null)
235                        header = new PDBHeader();
236
237                header.setTitle(struct.getTitle());
238                header.setIdCode(struct.getEntry_id());
239                //header.setDescription(struct.getPdbx_descriptor());
240                //header.setClassification(struct.getPdbx_descriptor());
241                //header.setDescription(struct.getPdbx_descriptor());
242
243
244
245                structure.setPDBHeader(header);
246                structure.setPDBCode(struct.getEntry_id());
247        }
248
249        /** initiate new group, either Hetatom, Nucleotide, or AminoAcid */
250        private Group getNewGroup(String recordName,Character aminoCode1, long seq_id,String groupCode3) {
251
252                Group g = ChemCompGroupFactory.getGroupFromChemCompDictionary(groupCode3);
253                if ( g != null && !g.getChemComp().isEmpty()) {
254                        if ( g instanceof AminoAcidImpl) {
255                                AminoAcidImpl aa = (AminoAcidImpl) g;
256                                aa.setId(seq_id);
257                        } else if ( g instanceof NucleotideImpl) {
258                                NucleotideImpl nuc =  (NucleotideImpl) g;
259                                nuc.setId(seq_id);
260                        } else if ( g instanceof HetatomImpl) {
261                                HetatomImpl het = (HetatomImpl)g;
262                                het.setId(seq_id);
263                        }
264                        return g;
265                }
266
267
268
269                Group group;
270                if ( recordName.equals("ATOM") ) {
271                        if (StructureTools.isNucleotide(groupCode3))  {
272                                // it is a nucleotide
273                                NucleotideImpl nu = new NucleotideImpl();
274                                group = nu;
275                                nu.setId(seq_id);
276
277                        } else if (aminoCode1==null || aminoCode1 == StructureTools.UNKNOWN_GROUP_LABEL){
278                                HetatomImpl h = new HetatomImpl();
279                                h.setId(seq_id);
280                                group = h;
281
282                        } else {
283                                AminoAcidImpl aa = new AminoAcidImpl() ;
284                                aa.setAminoType(aminoCode1);
285                                aa.setId(seq_id);
286                                group = aa ;
287                        }
288                }
289                else {
290                        if (StructureTools.isNucleotide(groupCode3))  {
291                                // it is a nucleotide
292                                NucleotideImpl nu = new NucleotideImpl();
293                                group = nu;
294                                nu.setId(seq_id);
295                        }
296                        else if (aminoCode1 != null ) {
297                                AminoAcidImpl aa = new AminoAcidImpl() ;
298                                aa.setAminoType(aminoCode1);
299                                aa.setId(seq_id);
300                                group = aa ;
301                        } else {
302                                HetatomImpl h = new HetatomImpl();
303                                h.setId(seq_id);
304                                group = h;
305                        }
306                }
307                return  group ;
308        }
309
310        /**
311         * Test if the given asymId is already present in the list of chains given. If yes, returns the chain
312         * otherwise returns null.
313         */
314        private static Chain isKnownChain(String asymId, List<Chain> chains){
315
316                for (int i = 0; i< chains.size();i++){
317                        Chain testchain =  chains.get(i);
318                        //System.out.println("comparing chainID >"+chainID+"< against testchain " + i+" >" +testchain.getName()+"<");
319                        if (asymId.equals(testchain.getId())) {
320                                //System.out.println("chain "+ chainID+" already known ...");
321                                return testchain;
322                        }
323                }
324
325                return null;
326        }
327
328        @Override
329        public void newAtomSite(AtomSite atom) {
330
331                if (params.isHeaderOnly()) return;
332
333                // Warning: getLabel_asym_id is not the "chain id" in the PDB file
334                // it is the internally used chain id.
335                // later on we will fix this...
336
337                // later one needs to map the asym id to the pdb_strand_id
338
339                //TODO: add support for FileParsingParams.getMaxAtoms()
340
341                boolean startOfNewChain = false;
342
343                String asymId = atom.getLabel_asym_id();
344                String authId = atom.getAuth_asym_id();
345
346                String recordName    = atom.getGroup_PDB();
347                String residueNumberS = atom.getAuth_seq_id();
348                Integer residueNrInt = Integer.parseInt(residueNumberS);
349
350                // the 3-letter name of the group:
351                String groupCode3    = atom.getLabel_comp_id();
352
353                boolean isHetAtomInFile = false;
354
355                Character aminoCode1 = null;
356                if ( recordName.equals("ATOM") )
357                        aminoCode1 = StructureTools.get1LetterCodeAmino(groupCode3);
358                else {
359                        aminoCode1 = StructureTools.get1LetterCodeAmino(groupCode3);
360
361                        // for nucleotides this will be null..
362                        if (aminoCode1 != null &&  aminoCode1.equals(StructureTools.UNKNOWN_GROUP_LABEL))
363                                aminoCode1 = null;
364
365                        isHetAtomInFile = true;
366                }
367                String insCodeS = atom.getPdbx_PDB_ins_code();
368                Character insCode = null;
369                if (!  insCodeS.equals("?")) {
370                        insCode = insCodeS.charAt(0);
371                }
372                // we store the internal seq id in the Atom._id field
373                // this is not a PDB file field but we need this to internally assign the insertion codes later
374                // from the pdbx_poly_seq entries..
375
376                long seq_id = -1;
377                try {
378                        seq_id = Long.parseLong(atom.getLabel_seq_id());
379                } catch (NumberFormatException e){
380                        // non polymer chains (ligands and small molecules) will have a label_seq_id set to '.', thus it is ok to
381                        // silently ignore this
382                        //logger.debug("Could not parse number for _atom_site.label_seq_id: "+e.getMessage());
383                }
384
385                String nmrModelNumber = atom.getPdbx_PDB_model_num();
386
387                if ( currentNmrModelNumber == null) {
388                        currentNmrModelNumber = nmrModelNumber;
389                }
390
391                if (! currentNmrModelNumber.equals(nmrModelNumber)){
392                        currentNmrModelNumber = nmrModelNumber;
393
394                        // add previous data
395                        if ( currentChain != null ) {
396                                currentChain.addGroup(currentGroup);
397                                currentGroup.trimToSize();
398                        }
399
400                        // we came to the beginning of a new NMR model
401                        allModels.add(currentModel);
402                        currentModel = new ArrayList<Chain>();
403                        currentChain = null;
404                        currentGroup = null;
405                }
406
407
408                if (currentChain == null) {
409
410                        currentChain = new ChainImpl();
411                        currentChain.setName(authId);
412                        currentChain.setId(asymId);
413                        currentModel.add(currentChain);
414                        startOfNewChain = true;
415                }
416
417                //System.out.println("BEFORE: " + chain_id + " " + current_chain.getName());
418                if ( ! asymId.equals(currentChain.getId()) ) {
419                        //logger.info("unknown chain. creating new chain. authId:" + authId + " asymId: " + asymId);
420                        startOfNewChain = true;
421
422                        // end up old chain...
423                        currentChain.addGroup(currentGroup);
424
425                        // see if old chain is known ...
426                        Chain testchain = isKnownChain(asymId,currentModel);
427
428                        if ( testchain == null) {
429                                //logger.info("unknown chain. creating new chain. authId:" + authId + " asymId: " + asymId);
430
431                                currentChain = new ChainImpl();
432                                currentChain.setName(authId);
433                                currentChain.setId(asymId);
434
435                        }   else {
436                                currentChain = testchain;
437                        }
438
439                        if ( ! currentModel.contains(currentChain))
440                                currentModel.add(currentChain);
441
442                }
443
444
445                ResidueNumber residueNumber = new ResidueNumber(authId,residueNrInt, insCode);
446
447                if (currentGroup == null) {
448
449
450                        currentGroup = getNewGroup(recordName,aminoCode1,seq_id, groupCode3);
451
452                        currentGroup.setResidueNumber(residueNumber);
453                        currentGroup.setPDBName(groupCode3);
454                        currentGroup.setHetAtomInFile(isHetAtomInFile);
455                }
456
457                // SET UP THE ALT LOC GROUP
458                Group altGroup = null;
459                String altLocS = atom.getLabel_alt_id();
460                Character altLoc = ' ';
461                if ( altLocS.length()>0) {
462                        altLoc = altLocS.charAt(0);
463                        if ( altLoc.equals('.') )
464                                altLoc = ' ';
465
466                }
467                // If it's the start of the new chain
468                if ( startOfNewChain){
469                        currentGroup = getNewGroup(recordName,aminoCode1,seq_id, groupCode3);
470                        currentGroup.setResidueNumber(residueNumber);
471                        currentGroup.setPDBName(groupCode3);
472                        currentGroup.setHetAtomInFile(isHetAtomInFile);
473                }
474                // ANTHONY BRADLEY ADDED THIS -> WE ONLY WAN'T TO CHECK FOR ALT LOCS WHEN IT's NOT THE FIRST GROUP IN CHAIN
475                else{
476                // check if residue number is the same ...
477                // insertion code is part of residue number
478                        if ( ! residueNumber.equals(currentGroup.getResidueNumber())) {
479                        //System.out.println("end of residue: "+current_group.getPDBCode()+" "+residueNrInt);
480                                currentChain.addGroup(currentGroup);
481                                currentGroup.trimToSize();
482                                currentGroup = getNewGroup(recordName,aminoCode1,seq_id,groupCode3);
483                                currentGroup.setPDBName(groupCode3);
484                                currentGroup.setResidueNumber(residueNumber);
485                                currentGroup.setHetAtomInFile(isHetAtomInFile);
486
487
488                } else {
489                        // same residueNumber, but altLocs...
490                        // test altLoc
491
492                        if ( ! altLoc.equals(' ') && ( ! altLoc.equals('.'))) {
493                                        logger.debug("found altLoc! " + altLoc + " " + currentGroup + " " + altGroup);
494                                altGroup = getCorrectAltLocGroup( altLoc,recordName,aminoCode1,groupCode3, seq_id);
495                                if (altGroup.getChain()==null) {
496                                                altGroup.setChain(currentChain);
497                                        }
498                                }
499                        }
500                }
501                //atomCount++;
502                //System.out.println("fixing atom name for  >" + atom.getLabel_atom_id() + "< >" + fullname + "<");
503
504
505                if ( params.isParseCAOnly() ){
506                        // yes , user wants to get CA only
507                        // only parse CA atoms...
508                        if (! (atom.getLabel_atom_id().equals(StructureTools.CA_ATOM_NAME) && atom.getType_symbol().equals("C"))) {
509                                //System.out.println("ignoring " + line);
510                                //atomCount--;
511                                return;
512                        }
513                }
514
515                //see if chain_id is one of the previous chains ...
516
517                Atom a = convertAtom(atom);
518
519                //see if chain_id is one of the previous chains ...
520                if ( altGroup != null) {
521                        altGroup.addAtom(a);
522                        altGroup = null;
523                }
524                else {
525                        currentGroup.addAtom(a);
526                }
527
528
529                String atomName = a.getName();
530                // make sure that main group has all atoms
531                // GitHub issue: #76
532                if ( ! currentGroup.hasAtom(atomName)) {
533                        // Unless it's microheterogenity https://github.com/rcsb/codec-devel/issues/81
534                        if (currentGroup.getPDBName().equals(a.getGroup().getPDBName())) {
535                                if(!StructureTools.hasNonDeuteratedEquiv(a,currentGroup)){
536                                        currentGroup.addAtom(a);
537                                }
538                        }
539
540                }
541        }
542
543        /**
544         * Convert a mmCIF AtomSite object to a BioJava Atom object
545         *
546         * @param atom the mmmcif AtomSite record
547         * @return an Atom
548         */
549        private Atom convertAtom(AtomSite atom){
550
551
552                Atom a = new AtomImpl();
553
554                a.setPDBserial(Integer.parseInt(atom.getId()));
555                a.setName(atom.getLabel_atom_id());
556
557                double x = Double.parseDouble (atom.getCartn_x());
558                double y = Double.parseDouble (atom.getCartn_y());
559                double z = Double.parseDouble (atom.getCartn_z());
560                a.setX(x);
561                a.setY(y);
562                a.setZ(z);
563
564                float occupancy = Float.parseFloat (atom.getOccupancy());
565                a.setOccupancy(occupancy);
566
567                float temp = Float.parseFloat (atom.getB_iso_or_equiv());
568                a.setTempFactor(temp);
569
570                String alt = atom.getLabel_alt_id();
571                if (( alt != null ) && ( alt.length() > 0) && (! alt.equals("."))){
572                        a.setAltLoc(new Character(alt.charAt(0)));
573                } else {
574                        a.setAltLoc(new Character(' '));
575                }
576
577                Element element = Element.R;
578                try {
579                        element = Element.valueOfIgnoreCase(atom.getType_symbol());
580                }  catch (IllegalArgumentException e) {
581                        logger.info("Element {} was not recognised as a BioJava-known element, the element will be represented as the generic element {}", atom.getType_symbol(), Element.R.name());
582                }
583                a.setElement(element);
584
585                return a;
586
587        }
588
589
590        private Group getCorrectAltLocGroup( Character altLoc,
591                        String recordName,
592                        Character aminoCode1,
593                        String groupCode3,
594                        long seq_id) {
595
596                // see if we know this altLoc already;
597                List<Atom> atoms = currentGroup.getAtoms();
598                if ( atoms.size() > 0) {
599                        Atom a1 = atoms.get(0);
600                        // we are just adding atoms to the current group
601                        // probably there is a second group following later...
602                        if (a1.getAltLoc().equals(altLoc)) {
603
604                                return currentGroup;
605                        }
606                }
607
608                List<Group> altLocs = currentGroup.getAltLocs();
609                for ( Group altLocG : altLocs ){
610                        atoms = altLocG.getAtoms();
611                        if ( atoms.size() > 0) {
612                                for ( Atom a1 : atoms) {
613                                        if (a1.getAltLoc().equals( altLoc)) {
614
615                                                return altLocG;
616                                        }
617                                }
618                        }
619                }
620
621                // no matching altLoc group found.
622                // build it up.
623
624                if ( groupCode3.equals(currentGroup.getPDBName())) {
625                        if ( currentGroup.getAtoms().size() == 0) {
626                                //System.out.println("current group is empty " + current_group + " " + altLoc);
627                                return currentGroup;
628                        }
629                        //System.out.println("cloning current group " + current_group + " " + current_group.getAtoms().get(0).getAltLoc() + " altLoc " + altLoc);
630                        Group altLocG = (Group) currentGroup.clone();
631                        // drop atoms from cloned group...
632                        // https://redmine.open-bio.org/issues/3307
633                        altLocG.setAtoms(new ArrayList<Atom>());
634                        altLocG.getAltLocs().clear();
635                        currentGroup.addAltLoc(altLocG);
636                        return altLocG;
637                }
638
639                //      System.out.println("new  group " + recordName + " " + aminoCode1 + " " +groupCode3);
640                //String recordName,Character aminoCode1, long seq_id,String groupCode3) {
641                Group altLocG = getNewGroup(recordName,aminoCode1,seq_id,groupCode3);
642
643                altLocG.setPDBName(groupCode3);
644                altLocG.setResidueNumber(currentGroup.getResidueNumber());
645                currentGroup.addAltLoc(altLocG);
646                return altLocG;
647        }
648
649        /**
650         * Start the parsing
651         */
652        @Override
653        public void documentStart() {
654                structure = new StructureImpl();
655
656                currentChain        = null;
657                currentGroup            = null;
658                currentNmrModelNumber   = null;
659                //atomCount                     = 0;
660
661                allModels     = new ArrayList<List<Chain>>();
662                currentModel  = new ArrayList<Chain>();
663                entities      = new ArrayList<Entity>();
664                entityPolys   = new ArrayList<>();
665                strucRefs     = new ArrayList<StructRef>();
666                seqResChains  = new ArrayList<Chain>();
667                entityChains  = new ArrayList<Chain>();
668                structAsyms   = new ArrayList<StructAsym>();
669
670                asymId2entityId = new HashMap<String,String>();
671                asymId2authorId = new HashMap<>();
672                structOpers   = new ArrayList<PdbxStructOperList>();
673                strucAssemblies = new ArrayList<PdbxStructAssembly>();
674                strucAssemblyGens = new ArrayList<PdbxStructAssemblyGen>();
675                entitySrcGens = new ArrayList<EntitySrcGen>();
676                entitySrcNats = new ArrayList<EntitySrcNat>();
677                entitySrcSyns = new ArrayList<EntitySrcSyn>();
678                structConn = new ArrayList<StructConn>();
679                structNcsOper = new ArrayList<StructNcsOper>();
680                sequenceDifs = new ArrayList<StructRefSeqDif>();
681                structSiteGens = new ArrayList<StructSiteGen>();
682        }
683
684
685        @Override
686        public void documentEnd() {
687
688                // Expected that there is one current_chain that needs to be added to the model
689                // When in headerOnly mode, no Atoms are read, and there will not be an active
690                // current_chain.
691                if ( currentChain != null ) {
692
693                        currentChain.addGroup(currentGroup);
694                        if (isKnownChain(currentChain.getId(),currentModel) == null) {
695                                currentModel.add(currentChain);
696                        }
697                } else if (!params.isHeaderOnly()){
698                        logger.warn("current chain is null at end of document.");
699                }
700
701                allModels.add(currentModel);
702
703                // this populates the asymId2authorId and asymId2entityId maps, needed in header only mode to get the mapping
704                // between the 2 chain identifiers.
705                initMaps();
706
707                for (StructAsym asym : structAsyms) {
708
709                        logger.debug("Entity {} matches asym_id: {}", asym.getEntity_id(), asym.getId() );
710
711                        Chain s = getEntityChain(asym.getEntity_id());
712                        Chain seqres = (Chain)s.clone();
713                        // to solve issue #160 (e.g. 3u7t)
714                        seqres = removeSeqResHeterogeneity(seqres);
715                        seqres.setId(asym.getId());
716                        if (asymId2authorId.get(asym.getId()) !=null ){
717                                seqres.setName(asymId2authorId.get(asym.getId()));
718                        } else {
719                                seqres.setName(asym.getId());
720                        }
721
722                        EntityType type = null;
723                        try {
724                                Entity ent = getEntity(Integer.parseInt(asym.getEntity_id()));
725                                type = EntityType.entityTypeFromString(ent.getType());
726                        } catch (NumberFormatException e) {
727                                logger.debug("Could not parse integer from entity id field {}", asym.getEntity_id());
728                        }
729
730                        // we'll only add seqres chains that are polymeric or unknown
731                        if (type==null || type==EntityType.POLYMER ) {
732                        seqResChains.add(seqres);
733                        }
734
735                        logger.debug(" seqres: " + asym.getId() + " " + seqres + "<") ;
736                        // adding the entities to structure
737                        addEntities(asym);
738
739                }
740
741                if (structAsyms.isEmpty()) {
742                        logger.warn("No _struct_asym category in file, no SEQRES groups will be added.");
743                }
744
745                // entities
746                // In addEntities above we created the entities if they were present in the file
747                // Now we need to make sure that they are linked to chains and also that if they are not present in the file we need to add them now
748                linkEntities();
749
750                // now that we know the entities, we can add all chains to structure so that they are stored
751                // properly as polymer/nonpolymer/water chains inside structure
752                for (List<Chain> model:allModels) {
753                        structure.addModel(model);
754                }
755
756                // Only align if requested (default) and not when headerOnly mode with no Atoms.
757                // Otherwise, we store the empty SeqRes Groups unchanged in the right chains.
758                if ( params.isAlignSeqRes() && !params.isHeaderOnly() ){
759                        logger.debug("Parsing mode align_seqres, will parse SEQRES and align to ATOM sequence");
760                        alignSeqRes();
761                } else {
762                        logger.debug("Parsing mode unalign_seqres, will parse SEQRES but not align it to ATOM sequence");
763                        SeqRes2AtomAligner.storeUnAlignedSeqRes(structure, seqResChains, params.isHeaderOnly());
764                }
765
766
767                // Now make sure all altlocgroups have all the atoms in all the groups
768                StructureTools.cleanUpAltLocs(structure);
769
770                // NOTE bonds and charges can only be done at this point that the chain id mapping is properly sorted out
771                if (!params.isHeaderOnly()) {
772                        if ( params.shouldCreateAtomBonds()) {
773                                addBonds();
774                        }
775
776                        if ( params.shouldCreateAtomCharges()) {
777                                addCharges();
778                        }
779                }
780
781                if (!params.isHeaderOnly()) {
782
783                        // Do structure.setSites(sites) after any chain renaming to be like PDB.
784                        addSites();
785                }
786
787
788
789                // set the oligomeric state info in the header...
790                if (params.isParseBioAssembly()) {
791
792                        // the more detailed mapping of chains to rotation operations happens in StructureIO...
793
794                        Map<Integer,BioAssemblyInfo> bioAssemblies = new LinkedHashMap<Integer, BioAssemblyInfo>();
795
796                        for ( PdbxStructAssembly psa : strucAssemblies){
797
798                                List<PdbxStructAssemblyGen> psags = new ArrayList<PdbxStructAssemblyGen>(1);
799
800                                for ( PdbxStructAssemblyGen psag: strucAssemblyGens ) {
801                                        if ( psag.getAssembly_id().equals(psa.getId())) {
802                                                psags.add(psag);
803                                        }
804                                }
805
806                                BiologicalAssemblyBuilder builder = new BiologicalAssemblyBuilder();
807
808                                // these are the transformations that need to be applied to our model
809                                List<BiologicalAssemblyTransformation> transformations = builder.getBioUnitTransformationList(psa, psags, structOpers);
810
811                                int bioAssemblyId = -1;
812                                try {
813                                        bioAssemblyId = Integer.parseInt(psa.getId());
814                                } catch (NumberFormatException e) {
815                                        logger.info("Could not parse a numerical bio assembly id from '{}'",psa.getId());
816                                }
817
818                                // if bioassembly id is not numerical we throw it away
819                                // this happens usually for viral capsid entries, like 1ei7
820                                // see issue #230 in github
821                                if (bioAssemblyId!=-1) {
822                                        int mmSize = 0;
823                                        // note that the transforms contain asym ids of both polymers and non-polymers
824                                        // For the mmsize, we are only interested in the polymers
825                                        for (BiologicalAssemblyTransformation transf:transformations) {
826                                                Chain c = structure.getChain(transf.getChainId());
827                                                if (c==null) {
828                                                        logger.info("Could not find asym id {} specified in struct_assembly_gen", transf.getChainId());
829                                                        continue;
830                                                }
831                                                if (c.getEntityType() == EntityType.POLYMER &&
832                                                        // for entries like 4kro, sugars are annotated as polymers but we
833                                                        // don't want them in the macromolecularSize count
834                                                        !c.getEntityInfo().getDescription().contains("SUGAR") ) {
835
836                                                                mmSize++;
837                                                        }
838                                        }
839
840                                        BioAssemblyInfo bioAssembly = new BioAssemblyInfo();
841                                        bioAssembly.setId(bioAssemblyId);
842                                        bioAssembly.setMacromolecularSize(mmSize);
843                                        bioAssembly.setTransforms(transformations);
844                                        bioAssemblies.put(bioAssemblyId,bioAssembly);
845                                }
846
847                        }
848                        structure.getPDBHeader().setBioAssemblies(bioAssemblies);
849                }
850
851                setStructNcsOps();
852
853                setCrystallographicInfoMetadata();
854
855
856                Map<String,List<SeqMisMatch>> misMatchMap = new HashMap<String, List<SeqMisMatch>>();
857                for (StructRefSeqDif sdif : sequenceDifs) {
858                        SeqMisMatch misMatch = new SeqMisMatchImpl();
859                        misMatch.setDetails(sdif.getDetails());
860
861                        String insCode = sdif.getPdbx_pdb_ins_code();
862                        if ( insCode != null && insCode.equals("?"))
863                                insCode = null;
864                        misMatch.setInsCode(insCode);
865                        misMatch.setOrigGroup(sdif.getDb_mon_id());
866                        misMatch.setPdbGroup(sdif.getMon_id());
867                        misMatch.setPdbResNum(sdif.getPdbx_auth_seq_num());
868                        misMatch.setUniProtId(sdif.getPdbx_seq_db_accession_code());
869                        misMatch.setSeqNum(sdif.getSeq_num());
870
871
872                        List<SeqMisMatch> mms = misMatchMap.get(sdif.getPdbx_pdb_strand_id());
873                        if ( mms == null) {
874                                mms = new ArrayList<SeqMisMatch>();
875                                misMatchMap.put(sdif.getPdbx_pdb_strand_id(),mms);
876                        }
877                        mms.add(misMatch);
878
879                }
880
881                for (String chainId : misMatchMap.keySet()){
882
883                        Chain chain = structure.getPolyChainByPDB(chainId);
884
885                        if ( chain == null) {
886                                logger.warn("Could not set mismatches for chain with author id" + chainId);
887                                continue;
888                        }
889
890                        chain.setSeqMisMatches(misMatchMap.get(chainId));
891
892
893                }
894
895        }
896
897        /**
898         * Here we link entities to chains.
899         * Also if entities are not present in file, this initialises the entities with some heuristics, see {@link org.biojava.nbio.structure.io.EntityFinder}
900         */
901        private void linkEntities() {
902
903                for (int i =0; i< allModels.size() ; i++){
904                        for (Chain chain : allModels.get(i)) {
905                                //logger.info("linking entities for " + chain.getId() + " "  + chain.getName());
906                                String entityId = asymId2entityId.get(chain.getId());
907
908                                if (entityId==null) {
909                                        // this can happen for instance if the cif file didn't have _struct_asym category at all
910                                        // and thus we have no asymId2entityId mapping at all
911                                        logger.info("No entity id could be found for chain {}", chain.getId());
912                                        continue;
913                                }
914                                int eId = Integer.parseInt(entityId);
915
916                                // Entities are not added for non-polymeric entities, if a chain is non-polymeric its entity won't be found.
917                                // TODO: add all entities and unique compounds and add methods to directly get polymer or non-polymer
918                                // asyms (chains).  Either create a unique StructureImpl or modify existing for a better representation of the
919                                // mmCIF internal data structures but is compatible with Structure interface.
920                                // Some examples of PDB entries with this kind of problem:
921                                //   - 2uub: asym_id X, chainName Z, entity_id 24: fully non-polymeric but still with its own chainName
922                                //   - 3o6j: asym_id K, chainName Z, entity_id 6 : a single water molecule
923                                //   - 1dz9: asym_id K, chainName K, entity_id 6 : a potassium ion alone
924
925                                EntityInfo entityInfo = structure.getEntityById(eId);
926                                if (entityInfo==null) {
927                                        // Supports the case where the only chain members were from non-polymeric entity that is missing.
928                                        // Solved by creating a new Compound(entity) to which this chain will belong.
929                                        logger.info("Could not find an Entity for entity_id {}, for chain id {}, creating a new Entity.",
930                                                        eId, chain.getId());
931                                        entityInfo = new EntityInfo();
932                                        entityInfo.setMolId(eId);
933                                        entityInfo.addChain(chain);
934                                        if (chain.isWaterOnly()) {
935                                                entityInfo.setType(EntityType.WATER);
936                                        } else {
937                                                entityInfo.setType(EntityType.NONPOLYMER);
938                                        }
939                                        chain.setEntityInfo(entityInfo);
940                                        structure.addEntityInfo(entityInfo);
941                                } else {
942                                        logger.debug("Adding chain with chain id {} (auth id {}) to Entity with entity_id {}",
943                                                        chain.getId(), chain.getName(), eId);
944                                        entityInfo.addChain(chain);
945                                        chain.setEntityInfo(entityInfo);
946                                }
947
948                        }
949
950                }
951
952                // if no entity information was present in file we then go and find the entities heuristically with EntityFinder
953                List<EntityInfo> entityInfos = structure.getEntityInfos();
954                if (entityInfos==null || entityInfos.isEmpty()) {
955
956                        List<List<Chain>> polyModels = new ArrayList<>();
957                        List<List<Chain>> nonPolyModels = new ArrayList<>();
958                        List<List<Chain>> waterModels = new ArrayList<>();
959
960                        for (List<Chain> model:allModels) {
961
962                                List<Chain> polyChains = new ArrayList<>();
963                                List<Chain> nonPolyChains = new ArrayList<>();
964                                List<Chain> waterChains = new ArrayList<>();
965
966                                polyModels.add(polyChains);
967                                nonPolyModels.add(nonPolyChains);
968                                waterModels.add(waterChains);
969
970                                for (Chain c:model) {
971
972                                        // we only have entities for polymeric chains, all others are ignored for assigning entities
973                                        if (c.isWaterOnly()) {
974                                                waterChains.add(c);
975
976                                        } else if (c.isPureNonPolymer()) {
977                                                nonPolyChains.add(c);
978
979                                        } else {
980                                                polyChains.add(c);
981                                        }
982                                }
983                        }
984
985                        entityInfos = EntityFinder.findPolyEntities(polyModels);
986                        EntityFinder.createPurelyNonPolyEntities(nonPolyModels, waterModels, entityInfos);
987
988
989                        structure.setEntityInfos(entityInfos);
990                }
991
992                // final sanity check: it can happen that from the annotated entities some are not linked to any chains
993                // e.g. 3s26: a sugar entity does not have any chains associated to it (it seems to be happening with many sugar compounds)
994                // we simply log it, this can sign some other problems if the entities are used down the line
995                for (EntityInfo e:entityInfos) {
996                        if (e.getChains().isEmpty()) {
997                                logger.info("Entity {} '{}' has no chains associated to it",
998                                                e.getMolId()<0?"with no entity id":e.getMolId(), e.getDescription());
999                        }
1000                }
1001
1002        }
1003
1004        private void addCharges() {
1005                ChargeAdder.addCharges(structure);
1006        }
1007
1008        /**
1009         * The method will return a new reference to a Chain with any consecutive groups
1010         * having same residue numbers removed.
1011         * This is necessary to solve the microheterogeneity issue in entries like 3u7t (see github issue #160)
1012         * @param c
1013         * @return
1014         */
1015        private static Chain removeSeqResHeterogeneity(Chain c) {
1016
1017                Chain trimmedChain = new ChainImpl();
1018
1019                ResidueNumber lastResNum = null;
1020
1021                for (Group g:c.getAtomGroups()) {
1022
1023                        // note we have to deep copy this, otherwise they stay linked and would get altered in addGroup(g)
1024                        ResidueNumber currentResNum = new ResidueNumber(
1025                                        g.getResidueNumber().getChainName(),
1026                                        g.getResidueNumber().getSeqNum(),
1027                                        g.getResidueNumber().getInsCode());
1028
1029                        if (lastResNum == null || !lastResNum.equals(currentResNum) ) {
1030                                trimmedChain.addGroup(g);
1031                        } else {
1032                                logger.debug("Removing seqres group because it seems to be repeated in entity_poly_seq, most likely has hetero='y': "+g);
1033                        }
1034
1035                        lastResNum = currentResNum;
1036
1037                }
1038                return trimmedChain;
1039        }
1040
1041        private void addBonds() {
1042                BondMaker maker = new BondMaker(structure, params);
1043                maker.makeBonds();
1044                maker.formBondsFromStructConn(structConn);
1045        }
1046
1047        private void alignSeqRes() {
1048
1049                logger.debug("Parsing mode align_seqres, will align to ATOM to SEQRES sequence");
1050
1051                // fix SEQRES residue numbering for all models
1052
1053                for (int model=0;model<structure.nrModels();model++) {
1054
1055                        List<Chain> atomList   = structure.getModel(model);
1056
1057                        for (Chain seqResChain: seqResChains){
1058
1059                                // this extracts the matching atom chain from atomList
1060                                Chain atomChain = SeqRes2AtomAligner.getMatchingAtomRes(seqResChain, atomList, true);
1061
1062                                if (atomChain == null) {
1063                                        // most likely there's no observed residues at all for the seqres chain: can't map
1064                                        // e.g. 3zyb: chains with asym_id L,M,N,O,P have no observed residues
1065                                        logger.info("Could not map SEQRES chain with asym_id={} to any ATOM chain. Most likely there's no observed residues in the chain.",
1066                                                        seqResChain.getId());
1067                                        continue;
1068                                }
1069
1070                                //map the atoms to the seqres...
1071
1072                                // we need to first clone the seqres so that they stay independent for different models
1073                                List<Group> seqResGroups = new ArrayList<Group>();
1074                                for (int i=0;i<seqResChain.getAtomGroups().size();i++) {
1075                                        seqResGroups.add((Group)seqResChain.getAtomGroups().get(i).clone());
1076                                }
1077
1078                                for ( int seqResPos = 0 ; seqResPos < seqResGroups.size(); seqResPos++) {
1079                                        Group seqresG = seqResGroups.get(seqResPos);
1080                                        boolean found = false;
1081                                        for ( Group atomG: atomChain.getAtomGroups()) {
1082
1083                                                int internalNr = getInternalNr (atomG);
1084
1085                                                if (seqresG.getResidueNumber().getSeqNum() == internalNr ) {
1086                                                        seqResGroups.set(seqResPos, atomG);
1087                                                        found = true;
1088                                                        break;
1089                                                }
1090
1091
1092                                        }
1093                                        if ( ! found)
1094                                                // so far the residue number has tracked internal numbering.
1095                                                // however there are no atom records, as such this can't be a PDB residue number...
1096                                                seqresG.setResidueNumber(null);
1097                                }
1098                                atomChain.setSeqResGroups(seqResGroups);
1099
1100                        }
1101                }
1102        }
1103
1104        private int getInternalNr(Group atomG) {
1105                if ( atomG.getType().equals(GroupType.AMINOACID)) {
1106                        AminoAcidImpl aa = (AminoAcidImpl) atomG;
1107                        return new Long(aa.getId()).intValue();
1108                } else if ( atomG.getType().equals(GroupType.NUCLEOTIDE)) {
1109                        NucleotideImpl nu = (NucleotideImpl) atomG;
1110                        return new Long(nu.getId()).intValue();
1111                } else {
1112                        HetatomImpl he = (HetatomImpl) atomG;
1113                        return new Long(he.getId()).intValue();
1114                }
1115        }
1116
1117        private void addEntities(StructAsym asym) {
1118                int eId = 0;
1119                try {
1120                        eId = Integer.parseInt(asym.getEntity_id());
1121                } catch (NumberFormatException e) {
1122                        logger.warn("Could not parse mol_id from string {}. Will use 0 for creating Entity",asym.getEntity_id());
1123                }
1124                Entity e = getEntity(eId);
1125
1126                // for some mmCIF files like 1yrm all 3 of _entity_src_gen, _entity_src_nat and _pdbx_entity_src_syn are missing
1127                // we need to fill the Compounds in some other way:
1128
1129                EntityInfo entityInfo = structure.getEntityById(eId);
1130
1131                if (entityInfo==null) {
1132                        //logger.info("Creating new EntityInfo " + eId + " " + e.getId() + " " + e.getPdbx_description());
1133                        entityInfo = new EntityInfo();
1134                        entityInfo.setMolId(eId);
1135                        // we only add the compound if a polymeric one (to match what the PDB parser does)
1136                        if (e!=null) {
1137                                entityInfo.setDescription(e.getPdbx_description());
1138
1139                                EntityType eType = EntityType.entityTypeFromString(e.getType());
1140                                if (eType!=null) {
1141                                        entityInfo.setType(eType);
1142                                } else {
1143                                        logger.warn("Type '{}' is not recognised as a valid entity type for entity {}", e.getType(), eId);
1144                                }
1145                                addAncilliaryEntityData(asym, eId, e, entityInfo);
1146                                structure.addEntityInfo(entityInfo);
1147                                logger.debug("Adding Entity with entity id {} from _entity, with name: {}",eId, entityInfo.getDescription());
1148                        }
1149                }
1150        }
1151
1152
1153        /**
1154         * Add any extra information to the entity information.
1155         * @param asym
1156         * @param entityId
1157         * @param entity
1158         * @param entityInfo
1159         */
1160        private void addAncilliaryEntityData(StructAsym asym, int entityId, Entity entity, EntityInfo entityInfo) {
1161                // Loop through each of the entity types and add the corresponding data
1162                // We're assuming if data is duplicated between sources it is consistent
1163                // This is a potentially huge assumption...
1164
1165
1166                for (EntitySrcGen esg : entitySrcGens) {
1167
1168                        if (! esg.getEntity_id().equals(asym.getEntity_id()))
1169                                continue;
1170
1171                        addInformationFromESG(esg, entityId, entityInfo);
1172
1173                }
1174
1175                for (EntitySrcNat esn : entitySrcNats) {
1176                        if (! esn.getEntity_id().equals(asym.getEntity_id()))
1177                                continue;
1178                        addInformationFromESN(esn, entityId, entityInfo);
1179
1180                }
1181
1182                for (EntitySrcSyn ess : entitySrcSyns) {
1183                        if (! ess.getEntity_id().equals(asym.getEntity_id()))
1184                                continue;
1185                        addInfoFromESS(ess, entityId, entityInfo);
1186
1187                }
1188        }
1189
1190        /**
1191         * Add the information from an ESG to a compound.
1192         * @param entitySrcInfo
1193         * @param entityId
1194         * @param c
1195         */
1196        private void addInformationFromESG(EntitySrcGen entitySrcInfo, int entityId, EntityInfo c) {
1197                c.setAtcc(entitySrcInfo.getPdbx_gene_src_atcc());
1198                c.setCell(entitySrcInfo.getPdbx_gene_src_cell());
1199                c.setOrganismCommon(entitySrcInfo.getGene_src_common_name());
1200                c.setOrganismScientific(entitySrcInfo.getPdbx_gene_src_scientific_name());
1201                c.setOrganismTaxId(entitySrcInfo.getPdbx_gene_src_ncbi_taxonomy_id());
1202                c.setExpressionSystemTaxId(entitySrcInfo.getPdbx_host_org_ncbi_taxonomy_id());
1203                c.setExpressionSystem(entitySrcInfo.getPdbx_host_org_scientific_name());
1204        }
1205
1206        /**
1207         * Add the information to entity info from ESN.
1208         * @param esn
1209         * @param eId
1210         * @param c
1211         */
1212        private void addInformationFromESN(EntitySrcNat esn, int eId, EntityInfo c) {
1213
1214                c.setAtcc(esn.getPdbx_atcc());
1215                c.setCell(esn.getPdbx_cell());
1216                c.setOrganismCommon(esn.getCommon_name());
1217                c.setOrganismScientific(esn.getPdbx_organism_scientific());
1218                c.setOrganismTaxId(esn.getPdbx_ncbi_taxonomy_id());
1219
1220        }
1221        /**
1222         * Add the information from ESS to Entity info.
1223         * @param ess
1224         * @param eId
1225         * @param c
1226         */
1227        private void addInfoFromESS(EntitySrcSyn ess, int eId, EntityInfo c) {
1228                c.setOrganismCommon(ess.getOrganism_common_name());
1229                c.setOrganismScientific(ess.getOrganism_scientific());
1230                c.setOrganismTaxId(ess.getNcbi_taxonomy_id());
1231
1232        }
1233
1234        private void initMaps() {
1235
1236
1237                if (structAsyms == null || structAsyms.isEmpty()) {
1238                        logger.info("No _struct_asym category found in file. No asym id to entity_id mapping will be available");
1239                        return;
1240                }
1241
1242                Map<String, List<String>> entityId2asymId = new HashMap<>();
1243
1244                for (StructAsym asym : structAsyms) {
1245
1246                        logger.debug("Entity {} matches asym_id: {}", asym.getEntity_id(), asym.getId() );
1247
1248                        asymId2entityId.put(asym.getId(), asym.getEntity_id());
1249
1250                        if (entityId2asymId.containsKey(asym.getEntity_id())) {
1251                                List<String> asymIds = entityId2asymId.get(asym.getEntity_id());
1252                                asymIds.add(asym.getId());
1253                        } else {
1254                                List<String> asymIds = new ArrayList<>();
1255                                asymIds.add(asym.getId());
1256                                entityId2asymId.put(asym.getEntity_id(), asymIds);
1257                        }
1258                }
1259
1260                if (entityPolys==null || entityPolys.isEmpty()) {
1261                        logger.info("No _entity_poly category found in file. No asym id to author id mapping will be available for header only parsing");
1262                        return;
1263                }
1264
1265                for (EntityPoly ep:entityPolys) {
1266                        if (ep.getPdbx_strand_id()==null) {
1267                                logger.info("_entity_poly.pdbx_strand_id is null for entity {}. Won't be able to map asym ids to author ids for this entity.", ep.getEntity_id());
1268                                continue;
1269                        }
1270                        String[] chainNames = ep.getPdbx_strand_id().split(",");
1271                        List<String> asymIds = entityId2asymId.get(ep.getEntity_id());
1272                        if (chainNames.length!=asymIds.size()) {
1273                                logger.warn("The list of asym ids (from _struct_asym) and the list of author ids (from _entity_poly) for entity {} have different lengths! Can't provide a mapping from asym ids to author chain ids", ep.getEntity_id());
1274                                continue;
1275                        }
1276                        for (int i=0; i<chainNames.length; i++) {
1277                                asymId2authorId.put(asymIds.get(i), chainNames[i]);
1278                        }
1279                }
1280        }
1281
1282        private void setStructNcsOps() {
1283
1284                ArrayList<Matrix4d> ncsOperators = new ArrayList<Matrix4d>();
1285
1286                for (StructNcsOper sNcsOper:structNcsOper) {
1287
1288                        if (!sNcsOper.getCode().equals("generate")) continue;
1289
1290                        try {
1291                                Matrix4d op = new Matrix4d();
1292                                op.setElement(3, 0, 0.0);
1293                                op.setElement(3, 1, 0.0);
1294                                op.setElement(3, 2, 0.0);
1295                                op.setElement(3, 3, 1.0);
1296
1297
1298                                op.setElement(0, 0, Double.parseDouble(sNcsOper.getMatrix11()));
1299                                op.setElement(0, 1, Double.parseDouble(sNcsOper.getMatrix12()));
1300                                op.setElement(0, 2, Double.parseDouble(sNcsOper.getMatrix13()));
1301
1302                                op.setElement(1, 0, Double.parseDouble(sNcsOper.getMatrix21()));
1303                                op.setElement(1, 1, Double.parseDouble(sNcsOper.getMatrix22()));
1304                                op.setElement(1, 2, Double.parseDouble(sNcsOper.getMatrix23()));
1305
1306                                op.setElement(2, 0, Double.parseDouble(sNcsOper.getMatrix31()));
1307                                op.setElement(2, 1, Double.parseDouble(sNcsOper.getMatrix32()));
1308                                op.setElement(2, 2, Double.parseDouble(sNcsOper.getMatrix33()));
1309
1310                                op.setElement(0, 3, Double.parseDouble(sNcsOper.getVector1()));
1311                                op.setElement(1, 3, Double.parseDouble(sNcsOper.getVector2()));
1312                                op.setElement(2, 3, Double.parseDouble(sNcsOper.getVector3()));
1313
1314                                ncsOperators.add(op);
1315
1316                        } catch (NumberFormatException e) {
1317                                logger.warn("Error parsing doubles in NCS operator list, skipping operator {}", structNcsOper.indexOf(sNcsOper)+1);
1318                        }
1319
1320                }
1321
1322                // we only set it if not empty, otherwise remains null
1323                if (ncsOperators.size()>0) {
1324                        structure.getCrystallographicInfo().setNcsOperators(
1325                                        ncsOperators.toArray(new Matrix4d[ncsOperators.size()]));
1326                }
1327        }
1328
1329        private void setCrystallographicInfoMetadata() {
1330                if (parsedScaleMatrix!=null) {
1331
1332                        PDBCrystallographicInfo crystalInfo = structure.getCrystallographicInfo();
1333
1334                        boolean nonStd = false;
1335                        if (crystalInfo.getCrystalCell()!=null && !crystalInfo.getCrystalCell().checkScaleMatrix(parsedScaleMatrix)) {
1336                                nonStd = true;
1337                        }
1338
1339                        crystalInfo.setNonStandardCoordFrameConvention(nonStd);
1340                }
1341        }
1342
1343
1344        /** This method will return the parsed protein structure, once the parsing has been finished
1345         *
1346         * @return a BioJava protein structure object
1347         */
1348        public Structure getStructure() {
1349
1350                return structure;
1351        }
1352
1353        @Override
1354        public void newDatabasePDBrevRecord(DatabasePdbrevRecord record) {
1355
1356                PDBHeader header = structure.getPDBHeader();
1357
1358                if ( header == null) {
1359                        header = new PDBHeader();
1360                        structure.setPDBHeader(header);
1361                }
1362
1363                List<DatabasePdbrevRecord> revRecords = header.getRevisionRecords();
1364                if ( revRecords == null) {
1365                        revRecords = new ArrayList<DatabasePdbrevRecord>();
1366                        header.setRevisionRecords(revRecords);
1367                }
1368                revRecords.add(record);
1369
1370
1371        }
1372
1373
1374        @Override
1375        public void newDatabasePDBrev(DatabasePDBrev dbrev) {
1376
1377                logger.debug("got a database revision:" + dbrev);
1378
1379                SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd",Locale.US);
1380                PDBHeader header = structure.getPDBHeader();
1381
1382                if ( header == null) {
1383                        header = new PDBHeader();
1384                }
1385
1386                if (dbrev.getNum().equals("1")){
1387
1388                        try {
1389                                Date dep = dateFormat.parse(dbrev.getDate_original());
1390                                header.setDepDate(dep);
1391
1392                        } catch (ParseException e){
1393                                logger.warn("Could not parse date string '{}', deposition date will be unavailable", dbrev.getDate_original());
1394                        }
1395
1396                        try {
1397                                Date rel = dateFormat.parse(dbrev.getDate());
1398                                header.setRelDate(rel);
1399
1400                        } catch (ParseException e){
1401                                logger.warn("Could not parse date string '{}', modification date will be unavailable", dbrev.getDate());
1402                        }
1403
1404
1405                } else {
1406                        try {
1407
1408                                Date mod = dateFormat.parse(dbrev.getDate());
1409                                header.setModDate(mod);
1410
1411                        } catch (ParseException e){
1412                                logger.warn("Could not parse date string '{}', modification date will be unavailable", dbrev.getDate());
1413                        }
1414                }
1415
1416                structure.setPDBHeader(header);
1417        }
1418
1419        @Override
1420        public void newPdbxAuditRevisionHistory(PdbxAuditRevisionHistory history) {
1421
1422                SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd",Locale.US);
1423                PDBHeader header = structure.getPDBHeader();
1424
1425                if ( header == null) {
1426                        header = new PDBHeader();
1427                }
1428
1429        // first entry in revision history is the release date
1430                if (history.getOrdinal().equals("1")){
1431                        try {
1432                                Date releaseDate = dateFormat.parse(history.getRevision_date());
1433                                header.setRelDate(releaseDate);
1434
1435                        } catch (ParseException e){
1436                                logger.warn("Could not parse date string '{}', release date will be unavailable", history.getRevision_date());
1437                        }
1438                } else {
1439                        // all other dates are revision dates;
1440                        // since this method may be called multiple times,
1441                        // the last revision date will "stick"
1442                        try {
1443                                Date revisionDate = dateFormat.parse(history.getRevision_date());
1444                                header.setModDate(revisionDate);
1445                        } catch (ParseException e){
1446                                logger.warn("Could not parse date string '{}', revision date will be unavailable", history.getRevision_date());
1447                        }
1448                }
1449
1450                structure.setPDBHeader(header);
1451        }
1452
1453        @Override
1454        public void newPdbxDatabaseStatus(PdbxDatabaseStatus status) {
1455
1456                // the deposition date field is only available in mmCIF 5.0
1457
1458                if (status.getRecvd_initial_deposition_date() == null) {
1459                        // skip this method for older mmCIF versions
1460                        return;
1461                }
1462
1463                SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd",Locale.US);
1464                PDBHeader header = structure.getPDBHeader();
1465
1466                if (header == null) {
1467                        header = new PDBHeader();
1468                }
1469
1470                try {
1471                        Date depositionDate = dateFormat.parse(status.getRecvd_initial_deposition_date());
1472                        header.setDepDate(depositionDate);
1473                } catch (ParseException e){
1474                        logger.warn("Could not parse date string '{}', deposition date will be unavailable", status.getRecvd_initial_deposition_date());
1475                }
1476
1477                structure.setPDBHeader(header);
1478        }
1479
1480        @Override
1481        public void newDatabasePDBremark(DatabasePDBremark remark) {
1482                //System.out.println(remark);
1483                String id = remark.getId();
1484                if (id.equals("2")){
1485
1486                        //this remark field contains the resolution information:
1487                        String line = remark.getText();
1488
1489                        int i = line.indexOf("ANGSTROM");
1490                        if ( i > 5) {
1491                                // line contains ANGSTROM info...
1492                                String resolution = line.substring(i-5,i).trim();
1493                                // convert string to float
1494                                float res = 99 ;
1495                                try {
1496                                        res = Float.parseFloat(resolution);
1497
1498                                } catch (NumberFormatException e) {
1499                                        logger.info("could not parse resolution from line and ignoring it " + line);
1500                                        return ;
1501
1502
1503                                }
1504                                // support for old style header
1505
1506                                PDBHeader pdbHeader = structure.getPDBHeader();
1507                                pdbHeader.setResolution(res);
1508
1509                        }
1510
1511                }
1512        }
1513
1514        @Override
1515        public void newRefine(Refine r){
1516
1517                PDBHeader pdbHeader = structure.getPDBHeader();
1518                // RESOLUTION
1519                // in very rare cases (for instance hybrid methods x-ray + neutron diffraction, e.g. 3ins, 4n9m)
1520                // there are 2 resolution values, one for each method
1521                // we take the last one found so that behaviour is like in PDB file parsing
1522                if (pdbHeader.getResolution()!=PDBHeader.DEFAULT_RESOLUTION) {
1523                        logger.warn("More than 1 resolution value present, will use last one {} and discard previous {} "
1524                                        ,r.getLs_d_res_high(), String.format("%4.2f",pdbHeader.getResolution()));
1525                }
1526                try {
1527                        pdbHeader.setResolution(Float.parseFloat(r.getLs_d_res_high()));
1528                } catch (NumberFormatException e){
1529                        logger.info("Could not parse resolution from " + r.getLs_d_res_high() + " " + e.getMessage());
1530                }
1531
1532
1533                // RFREE
1534                if (pdbHeader.getRfree()!=PDBHeader.DEFAULT_RFREE) {
1535                        logger.warn("More than 1 Rfree value present, will use last one {} and discard previous {} ",
1536                                        r.getLs_R_factor_R_free(), String.format("%4.2f",pdbHeader.getRfree()));
1537                }
1538                if (r.getLs_R_factor_R_free()==null) {
1539                        // some entries like 2ifo haven't got this field at all
1540                        logger.info("_refine.ls_R_factor_R_free not present, not parsing Rfree value");
1541                } else {
1542                        try {
1543                                pdbHeader.setRfree(Float.parseFloat(r.getLs_R_factor_R_free()));
1544                        } catch (NumberFormatException e){
1545                                // no rfree present ('?') is very usual, that's why we set it to debug
1546                                logger.debug("Could not parse Rfree from string '{}'", r.getLs_R_factor_R_free());
1547                        }
1548                }
1549
1550                // RWORK
1551                if(pdbHeader.getRwork()!=PDBHeader.DEFAULT_RFREE) {
1552                        logger.warn("More than 1 R work value present, will use last one {} and discard previous {} ",
1553                                        r.getLs_R_factor_R_work(), String.format("%4.2f",pdbHeader.getRwork()));
1554                }
1555                if(r.getLs_R_factor_R_work()==null){
1556                        logger.info("_refine.ls_R_factor_R_work not present, not parsing R-work value");
1557                }
1558                else{
1559                        try{
1560                                pdbHeader.setRwork(Float.parseFloat(r.getLs_R_factor_R_work()));
1561                        }
1562                        catch (NumberFormatException e){
1563                                logger.debug("Could not parse R-work from string '{}'", r.getLs_R_factor_R_work());
1564                        }
1565
1566                }
1567
1568        }
1569
1570
1571        @Override
1572        public void newAuditAuthor(AuditAuthor aa){
1573
1574                String name =  aa.getName();
1575
1576                StringBuffer famName = new StringBuffer();
1577                StringBuffer initials = new StringBuffer();
1578                boolean afterComma = false;
1579                for ( char c: name.toCharArray()) {
1580                        if ( c == ' ')
1581                                continue;
1582                        if ( c == ','){
1583                                afterComma = true;
1584                                continue;
1585                        }
1586
1587                        if ( afterComma)
1588                                initials.append(c);
1589                        else
1590                                famName.append(c);
1591                }
1592
1593                StringBuffer newaa = new StringBuffer();
1594                newaa.append(initials);
1595                newaa.append(famName);
1596
1597                PDBHeader header = structure.getPDBHeader();
1598                String auth = header.getAuthors();
1599                if (auth == null) {
1600                        header.setAuthors(newaa.toString());
1601                }else {
1602                        auth += "," + newaa.toString();
1603                        header.setAuthors(auth);
1604
1605                }
1606        }
1607
1608        @Override
1609        public void newExptl(Exptl exptl) {
1610
1611                PDBHeader pdbHeader = structure.getPDBHeader();
1612                String method = exptl.getMethod();
1613                pdbHeader.setExperimentalTechnique(method);
1614
1615        }
1616
1617        @Override
1618        public void newCell(Cell cell) {
1619
1620                try {
1621                        float a = Float.parseFloat(cell.getLength_a());
1622                        float b = Float.parseFloat(cell.getLength_b());
1623                        float c = Float.parseFloat(cell.getLength_c());
1624                        float alpha = Float.parseFloat(cell.getAngle_alpha());
1625                        float beta = Float.parseFloat(cell.getAngle_beta());
1626                        float gamma = Float.parseFloat(cell.getAngle_gamma());
1627
1628                        CrystalCell xtalCell = new CrystalCell();
1629                        xtalCell.setA(a);
1630                        xtalCell.setB(b);
1631                        xtalCell.setC(c);
1632                        xtalCell.setAlpha(alpha);
1633                        xtalCell.setBeta(beta);
1634                        xtalCell.setGamma(gamma);
1635
1636                        if (!xtalCell.isCellReasonable()) {
1637                                // If the entry describes a structure determined by a technique other than X-ray crystallography,
1638                            // cell is (sometimes!) a = b = c = 1.0, alpha = beta = gamma = 90 degrees
1639                                // if so we don't add and CrystalCell will be null
1640                                logger.debug("The crystal cell read from file does not have reasonable dimensions (at least one dimension is below {}), discarding it.",
1641                                                CrystalCell.MIN_VALID_CELL_SIZE);
1642                                return;
1643                        }
1644
1645                        structure.getPDBHeader().getCrystallographicInfo().setCrystalCell(xtalCell);
1646
1647                } catch (NumberFormatException e){
1648                        structure.getPDBHeader().getCrystallographicInfo().setCrystalCell(null);
1649                        logger.info("could not parse some cell parameters ("+e.getMessage()+"), ignoring _cell ");
1650                }
1651        }
1652
1653        @Override
1654        public void newSymmetry(Symmetry symmetry) {
1655                String spaceGroup = symmetry.getSpace_group_name_H_M();
1656                SpaceGroup sg = SymoplibParser.getSpaceGroup(spaceGroup);
1657                if (sg==null) {
1658                        logger.warn("Space group '"+spaceGroup+"' not recognised as a standard space group");
1659                        structure.getPDBHeader().getCrystallographicInfo().setNonStandardSg(true);
1660                } else {
1661                        structure.getPDBHeader().getCrystallographicInfo().setSpaceGroup(sg);
1662                        structure.getPDBHeader().getCrystallographicInfo().setNonStandardSg(false);
1663                }
1664        }
1665
1666        @Override
1667        public void newStructNcsOper(StructNcsOper sNcsOper) {
1668                structNcsOper.add(sNcsOper);
1669        }
1670
1671        public void newAtomSites(AtomSites atomSites) {
1672
1673                try {
1674                        Matrix4d m = new Matrix4d(
1675                                Double.parseDouble(atomSites.getFract_transf_matrix11()), Double.parseDouble(atomSites.getFract_transf_matrix12()), Double.parseDouble(atomSites.getFract_transf_matrix13()), Double.parseDouble(atomSites.getFract_transf_vector1()),
1676                                Double.parseDouble(atomSites.getFract_transf_matrix21()), Double.parseDouble(atomSites.getFract_transf_matrix22()), Double.parseDouble(atomSites.getFract_transf_matrix23()), Double.parseDouble(atomSites.getFract_transf_vector2()),
1677                                Double.parseDouble(atomSites.getFract_transf_matrix31()), Double.parseDouble(atomSites.getFract_transf_matrix32()), Double.parseDouble(atomSites.getFract_transf_matrix33()), Double.parseDouble(atomSites.getFract_transf_vector3()),
1678                                0,0,0,1);
1679
1680                        parsedScaleMatrix = m;
1681
1682                } catch (NumberFormatException e) {
1683                        logger.warn("Some values in _atom_sites.fract_transf_matrix or _atom_sites.fract_transf_vector could not be parsed as numbers. Can't check whether coordinate frame convention is correct! Error: {}", e.getMessage());
1684                        structure.getPDBHeader().getCrystallographicInfo().setNonStandardCoordFrameConvention(false);
1685
1686                        // in this case parsedScaleMatrix stays null and can't be used in documentEnd()
1687                }
1688        }
1689
1690        @Override
1691        public void newStructRef(StructRef sref) {
1692                logger.debug(sref.toString());
1693                strucRefs.add(sref);
1694        }
1695
1696        private StructRef getStructRef(String ref_id){
1697                for (StructRef structRef : strucRefs) {
1698
1699                        if (structRef.getId().equals(ref_id)){
1700                                return structRef;
1701                        }
1702
1703                }
1704                return null;
1705
1706        }
1707
1708        /**
1709         * create a DBRef record from the StrucRefSeq record:
1710         * <pre>
1711         * PDB record                    DBREF
1712         * Field Name                    mmCIF Data Item
1713         * Section                       n.a.
1714         * PDB_ID_Code                   _struct_ref_seq.pdbx_PDB_id_code
1715         * Strand_ID                     _struct_ref_seq.pdbx_strand_id
1716         * Begin_Residue_Number          _struct_ref_seq.pdbx_auth_seq_align_beg
1717         * Begin_Ins_Code                _struct_ref_seq.pdbx_seq_align_beg_ins_code
1718         * End_Residue_Number            _struct_ref_seq.pdbx_auth_seq_align_end
1719         * End_Ins_Code                  _struct_ref_seq.pdbx_seq_align_end_ins_code
1720         * Database                      _struct_ref.db_name
1721         * Database_Accession_No         _struct_ref_seq.pdbx_db_accession
1722         * Database_ID_Code              _struct_ref.db_code
1723         * Database_Begin_Residue_Number _struct_ref_seq.db_align_beg
1724         * Databaes_Begin_Ins_Code       _struct_ref_seq.pdbx_db_align_beg_ins_code
1725         * Database_End_Residue_Number   _struct_ref_seq.db_align_end
1726         * Databaes_End_Ins_Code         _struct_ref_seq.pdbx_db_align_end_ins_code
1727         * </pre>
1728         *
1729         *
1730         */
1731        @Override
1732        public void newStructRefSeq(StructRefSeq sref) {
1733                DBRef r = new DBRef();
1734
1735                r.setIdCode(sref.getPdbx_PDB_id_code());
1736                r.setDbAccession(sref.getPdbx_db_accession());
1737                r.setDbIdCode(sref.getPdbx_db_accession());
1738
1739                r.setChainName(sref.getPdbx_strand_id());
1740                StructRef structRef = getStructRef(sref.getRef_id());
1741                if (structRef == null){
1742                        logger.info("could not find StructRef " + sref.getRef_id() + " for StructRefSeq " + sref);
1743                } else {
1744                        r.setDatabase(structRef.getDb_name());
1745                        r.setDbIdCode(structRef.getDb_code());
1746                }
1747
1748                int seqbegin;
1749                int seqend;
1750                try{
1751                        seqbegin = Integer.parseInt(sref.getPdbx_auth_seq_align_beg());
1752                        seqend   = Integer.parseInt(sref.getPdbx_auth_seq_align_end());
1753                }
1754                catch(NumberFormatException e){
1755                        // this happens in a few entries, annotation error? e.g. 6eoj
1756                        logger.warn("Couldn't parse pdbx_auth_seq_align_beg/end in _struct_ref_seq. Will not store dbref alignment info for accession {}. Error: {}", r.getDbAccession(), e.getMessage());
1757                        return;
1758                }
1759
1760                Character begin_ins_code = ' ';
1761                if (sref.getPdbx_seq_align_beg_ins_code() != null ) {
1762                    begin_ins_code = new Character(sref.getPdbx_seq_align_beg_ins_code().charAt(0));
1763                }
1764
1765                Character end_ins_code = ' ';
1766                if (sref.getPdbx_seq_align_end_ins_code() != null) {
1767                    end_ins_code = new Character(sref.getPdbx_seq_align_end_ins_code().charAt(0));
1768                }
1769
1770                if (begin_ins_code == '?')
1771                        begin_ins_code = ' ';
1772
1773                if (end_ins_code == '?')
1774                        end_ins_code = ' ';
1775
1776                r.setSeqBegin(seqbegin);
1777                r.setInsertBegin(begin_ins_code);
1778
1779                r.setSeqEnd(seqend);
1780                r.setInsertEnd(end_ins_code);
1781
1782                int dbseqbegin = Integer.parseInt(sref.getDb_align_beg());
1783                int dbseqend   = Integer.parseInt(sref.getDb_align_end());
1784
1785                Character db_begin_in_code = ' ';
1786                if (sref.getPdbx_db_align_beg_ins_code() != null) {
1787                    db_begin_in_code = new Character(sref.getPdbx_db_align_beg_ins_code().charAt(0));
1788                }
1789
1790                Character db_end_in_code = ' ';
1791                if (sref.getPdbx_db_align_end_ins_code() != null) {
1792                    db_end_in_code = new Character(sref.getPdbx_db_align_end_ins_code().charAt(0));
1793                }
1794
1795                if (db_begin_in_code == '?')
1796                        db_begin_in_code = ' ';
1797
1798                if (db_end_in_code == '?')
1799                        db_end_in_code = ' ';
1800
1801
1802                r.setDbSeqBegin(dbseqbegin);
1803                r.setIdbnsBegin(db_begin_in_code);
1804
1805                r.setDbSeqEnd(dbseqend);
1806                r.setIdbnsEnd(db_end_in_code);
1807
1808                List<DBRef> dbrefs = structure.getDBRefs();
1809                if ( dbrefs == null)
1810                        dbrefs = new ArrayList<DBRef>();
1811                dbrefs.add(r);
1812
1813                logger.debug(r.toPDB());
1814
1815                structure.setDBRefs(dbrefs);
1816
1817        }
1818
1819        @Override
1820        public void newStructRefSeqDif(StructRefSeqDif sref) {
1821                sequenceDifs.add(sref);
1822        }
1823
1824        private Chain getEntityChain(String entity_id){
1825
1826                for (Chain chain : entityChains) {
1827                        if ( chain.getId().equals(entity_id)){
1828
1829                                return chain;
1830                        }
1831                }
1832                // does not exist yet, so create...
1833
1834                Chain   chain = new ChainImpl();
1835                chain.setId(entity_id);
1836                entityChains.add(chain);
1837
1838                return chain;
1839
1840        }
1841
1842        //private Chain getSeqResChain(String chainID){
1843        //      return getChainFromList(seqResChains, chainID);
1844        //}
1845
1846
1847        /**
1848         * Data items in the ENTITY_SRC_GEN category record details of
1849         * the source from which the entity was obtained in cases
1850         * where the source was genetically manipulated.  The
1851         * following are treated separately:  items pertaining to the tissue
1852         * from which the gene was obtained, items pertaining to the host
1853         * organism for gene expression and items pertaining to the actual
1854         * producing organism (plasmid).
1855         */
1856        @Override
1857        public void newEntitySrcGen(EntitySrcGen entitySrcGen){
1858
1859                // add to internal list. Map to Compound object later on...
1860                entitySrcGens.add(entitySrcGen);
1861        }
1862
1863        @Override
1864        public void newEntitySrcNat(EntitySrcNat entitySrcNat){
1865
1866                // add to internal list. Map to Compound object later on...
1867                entitySrcNats.add(entitySrcNat);
1868        }
1869
1870        @Override
1871        public void newEntitySrcSyn(EntitySrcSyn entitySrcSyn){
1872
1873                // add to internal list. Map to Compound object later on...
1874                entitySrcSyns.add(entitySrcSyn);
1875        }
1876
1877        /**
1878         * The EntityPolySeq object provide the amino acid sequence objects for the Entities.
1879         * Later on the entities are mapped to the BioJava {@link Chain} and {@link EntityInfo} objects.
1880         * @param epolseq the EntityPolySeq record for one amino acid
1881         */
1882        @Override
1883        public void newEntityPolySeq(EntityPolySeq epolseq) {
1884
1885                logger.debug("NEW entity poly seq " + epolseq);
1886
1887                int eId = -1;
1888                try {
1889                        eId = Integer.parseInt(epolseq.getEntity_id());
1890                } catch (NumberFormatException e) {
1891                        logger.warn("Could not parse entity id from EntityPolySeq: "+e.getMessage());
1892                }
1893                Entity e = getEntity(eId);
1894
1895                if (e == null){
1896                        logger.info("Could not find entity "+ epolseq.getEntity_id()+". Can not match sequence to it.");
1897                        return;
1898                }
1899
1900                Chain entityChain = getEntityChain(epolseq.getEntity_id());
1901
1902                // first we check through the chemcomp provider, if it fails we do some heuristics to guess the type of group
1903                // TODO some of this code is analogous to getNewGroup() and we should try to unify them - JD 2016-03-08
1904
1905                Group g = ChemCompGroupFactory.getGroupFromChemCompDictionary(epolseq.getMon_id());
1906                //int seqId = Integer.parseInt(epolseq.getNum());
1907                if ( g != null && !g.getChemComp().isEmpty()) {
1908                        if ( g instanceof AminoAcidImpl) {
1909                                AminoAcidImpl aa = (AminoAcidImpl) g;
1910                                aa.setRecordType(AminoAcid.SEQRESRECORD);
1911                                //aa.setId(seqId);
1912                        }
1913                } else {
1914
1915                        if (epolseq.getMon_id().length()==3 && StructureTools.get1LetterCodeAmino(epolseq.getMon_id())!=null){
1916                                AminoAcidImpl a = new AminoAcidImpl();
1917                                a.setRecordType(AminoAcid.SEQRESRECORD);
1918                                Character code1 = StructureTools.get1LetterCodeAmino(epolseq.getMon_id());
1919                                a.setAminoType(code1);
1920                                g = a;
1921
1922                        } else if ( StructureTools.isNucleotide(epolseq.getMon_id())) {
1923                                // the group is actually a nucleotide group...
1924                                NucleotideImpl n = new NucleotideImpl();
1925                                g = n;
1926
1927                        } else {
1928                                logger.debug("Residue {} {} is not a standard aminoacid or nucleotide, will create a het group for it", epolseq.getNum(),epolseq.getMon_id());
1929                                HetatomImpl h = new HetatomImpl();
1930                                g = h;
1931
1932                        }
1933
1934
1935                }
1936                // at this stage we don't know about author residue numbers (insertion codes)
1937                // we abuse now the ResidueNumber field setting the internal residue numbers (label_seq_id, strictly sequential and follow the seqres sequence 1 to n)
1938                // later the actual ResidueNumbers (author residue numbers) have to be corrected in alignSeqRes()
1939                g.setResidueNumber(ResidueNumber.fromString(epolseq.getNum()));
1940
1941                g.setPDBName(epolseq.getMon_id());
1942
1943                entityChain.addGroup(g);
1944
1945        }
1946
1947        @Override
1948        public void newPdbxPolySeqScheme(PdbxPolySeqScheme ppss) {
1949
1950                //if ( headerOnly)
1951                //      return;
1952
1953                // replace the group asym ids with the real PDB ids!
1954                // replaceGroupSeqPos(ppss);  // This might be incorrect in some pdb, to use auth_seq_id of the pdbx_poly_seq_scheme.
1955
1956
1957        }
1958
1959
1960        @Override
1961        public void newPdbxNonPolyScheme(PdbxNonPolyScheme ppss) {
1962
1963                //if (headerOnly)
1964                //      return;
1965
1966                // merge the EntityPolySeq info and the AtomSite chains into one...
1967                //already known ignore:
1968
1969        }
1970
1971        @Override
1972        public void newPdbxEntityNonPoly(PdbxEntityNonPoly pen){
1973                // TODO: do something with them...
1974                // not implemented yet...
1975                logger.debug(pen.getEntity_id() + " " + pen.getName() + " " + pen.getComp_id());
1976
1977        }
1978
1979        @Override
1980        public void newChemComp(ChemComp c) {
1981                // TODO: do something with them...
1982
1983        }
1984
1985        @Override
1986        public void newGenericData(String category, List<String> loopFields,
1987                        List<String> lineData) {
1988
1989                //logger.debug("unhandled category so far: " + category);
1990        }
1991
1992        @Override
1993        public FileParsingParameters getFileParsingParameters()
1994        {
1995                return params;
1996        }
1997
1998        @Override
1999        public void setFileParsingParameters(FileParsingParameters params)
2000        {
2001                this.params = params;
2002
2003        }
2004
2005        @Override
2006        public void newChemCompDescriptor(ChemCompDescriptor ccd) {
2007
2008                // TODO nothing happening here yet.
2009
2010        }
2011
2012
2013
2014        public List<PdbxStructOperList> getStructOpers() {
2015                return structOpers;
2016        }
2017
2018        @Override
2019        public void newPdbxStrucAssembly(PdbxStructAssembly strucAssembly) {
2020                strucAssemblies.add(strucAssembly);
2021
2022        }
2023
2024        public List<PdbxStructAssembly> getStructAssemblies(){
2025                return strucAssemblies;
2026        }
2027
2028        @Override
2029        public void newPdbxStrucAssemblyGen(PdbxStructAssemblyGen strucAssembly) {
2030                strucAssemblyGens.add(strucAssembly);
2031
2032        }
2033
2034        public List<PdbxStructAssemblyGen> getStructAssemblyGens(){
2035                return strucAssemblyGens;
2036        }
2037
2038        @Override
2039        public void newChemCompAtom(ChemCompAtom atom) {
2040
2041        }
2042
2043        @Override
2044        public void newPdbxChemCompIndentifier(PdbxChemCompIdentifier id) {
2045
2046        }
2047
2048        @Override
2049        public void newChemCompBond(ChemCompBond bond) {
2050
2051        }
2052
2053        @Override
2054        public void newPdbxChemCompDescriptor(PdbxChemCompDescriptor desc) {
2055
2056        }
2057
2058        @Override
2059        public void newStructConn(StructConn structConn) {
2060                this.structConn.add(structConn);
2061        }
2062
2063        @Override
2064        public void newStructSiteGen(StructSiteGen siteGen) { this.structSiteGens.add(siteGen); }
2065
2066        @Override
2067        public void newStructSite(StructSite structSite) {
2068
2069                if (params.isHeaderOnly()) {
2070                        return;
2071                }
2072
2073                // Simply implement the method.
2074                List<Site> sites = structure.getSites();
2075                if (sites == null) sites = new ArrayList<Site>();
2076
2077                Site site = null;
2078                for (Site asite : sites) {
2079                        if (asite.getSiteID().equals(structSite.getId())) {
2080                                site = asite;           // Prevent duplicate siteIds
2081                        }
2082                }
2083                boolean addSite = false;
2084                if (site == null) { site = new Site(); addSite = true; }
2085                site.setSiteID(structSite.getId());
2086                site.setDescription(structSite.getDetails());
2087                // site.setPdbxEvidenceCode(structSite.getPdbxEvidenceCode()); // TODO - add addition fields in Sites
2088                if (addSite) sites.add(site);
2089
2090                structure.setSites(sites);
2091        }
2092
2093        /**
2094         * Build sites in a BioJava Structure using the original author chain id & residue numbers.
2095         * Sites are built from struct_site_gen records that have been parsed.
2096         */
2097        private void addSites() {
2098                List<Site> sites = structure.getSites();
2099                if (sites == null) sites = new ArrayList<Site>();
2100
2101                for (StructSiteGen siteGen : structSiteGens) {
2102                                // For each StructSiteGen, find the residues involved, if they exist then
2103                                String site_id = siteGen.getSite_id(); // multiple could be in same site.
2104                                if (site_id == null) site_id = "";
2105                                String comp_id = siteGen.getLabel_comp_id();  // PDBName
2106
2107                                // Assumption: the author chain ID and residue number for the site is consistent with the original
2108                                // author chain id and residue numbers.
2109
2110                        String asymId = siteGen.getLabel_asym_id(); // chain name
2111                        String authId = siteGen.getAuth_asym_id(); // chain Id
2112                                String auth_seq_id = siteGen.getAuth_seq_id(); // Res num
2113
2114                                String insCode = siteGen.getPdbx_auth_ins_code();
2115                                if ( insCode != null && insCode.equals("?"))
2116                                        insCode = null;
2117
2118                                // Look for asymID = chainID and seqID = seq_ID.  Check that comp_id matches the resname.
2119                                Group g = null;
2120                                try {
2121                                Chain chain = structure.getChain(asymId);
2122
2123                                        if (null != chain) {
2124                                                try {
2125                                                        Character insChar = null;
2126                                                        if (null != insCode && insCode.length() > 0) insChar = insCode.charAt(0);
2127                                                g = chain.getGroupByPDB(new ResidueNumber(null, Integer.parseInt(auth_seq_id), insChar));
2128                                                } catch (NumberFormatException e) {
2129                                                logger.warn("Could not lookup residue : " + authId + auth_seq_id);
2130                                                }
2131                                        }
2132                                } catch (StructureException e) {
2133                                        logger.warn("Problem finding residue in site entry " + siteGen.getSite_id() + " - " + e.getMessage(), e.getMessage());
2134                                }
2135
2136                                if (g != null) {
2137                                        // 2. find the site_id, if not existing, create anew.
2138                                        Site site = null;
2139                                        for (Site asite: sites) {
2140                                                if (site_id.equals(asite.getSiteID())) site = asite;
2141                                        }
2142
2143                                        boolean addSite = false;
2144
2145                                        // 3. add this residue to the site.
2146                                        if (site == null) {
2147                                                addSite = true;
2148                                                site = new Site();
2149                                                site.setSiteID(site_id);
2150                                        }
2151
2152                                        List<Group> groups = site.getGroups();
2153                                        if (groups == null) groups = new ArrayList<Group>();
2154
2155                                        // Check the self-consistency of the residue reference from auth_seq_id and chain_id
2156                                        if (!comp_id.equals(g.getPDBName())) {
2157                                        logger.warn("comp_id doesn't match the residue at " + authId + " " + auth_seq_id + " - skipping");
2158                                        } else {
2159                                                groups.add(g);
2160                                                site.setGroups(groups);
2161                                        }
2162                                        if (addSite) sites.add(site);
2163                                }
2164                }
2165                structure.setSites(sites);
2166        }
2167}