001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 * created at Apr 26, 2008
021 */
022package org.biojava.nbio.structure.io.mmcif;
023
024import java.text.ParseException;
025import java.text.SimpleDateFormat;
026import java.util.ArrayList;
027import java.util.Date;
028import java.util.HashMap;
029import java.util.List;
030import java.util.Locale;
031import java.util.Map;
032
033import javax.vecmath.Matrix4d;
034
035import org.biojava.nbio.structure.AminoAcid;
036import org.biojava.nbio.structure.AminoAcidImpl;
037import org.biojava.nbio.structure.Atom;
038import org.biojava.nbio.structure.AtomImpl;
039import org.biojava.nbio.structure.Chain;
040import org.biojava.nbio.structure.ChainImpl;
041import org.biojava.nbio.structure.EntityInfo;
042import org.biojava.nbio.structure.EntityType;
043import org.biojava.nbio.structure.DBRef;
044import org.biojava.nbio.structure.Element;
045import org.biojava.nbio.structure.Group;
046import org.biojava.nbio.structure.GroupType;
047import org.biojava.nbio.structure.HetatomImpl;
048import org.biojava.nbio.structure.NucleotideImpl;
049import org.biojava.nbio.structure.PDBCrystallographicInfo;
050import org.biojava.nbio.structure.PDBHeader;
051import org.biojava.nbio.structure.ResidueNumber;
052import org.biojava.nbio.structure.SeqMisMatch;
053import org.biojava.nbio.structure.SeqMisMatchImpl;
054import org.biojava.nbio.structure.Site;
055import org.biojava.nbio.structure.Structure;
056import org.biojava.nbio.structure.StructureException;
057import org.biojava.nbio.structure.StructureImpl;
058import org.biojava.nbio.structure.StructureTools;
059import org.biojava.nbio.structure.io.BondMaker;
060import org.biojava.nbio.structure.io.ChargeAdder;
061import org.biojava.nbio.structure.io.EntityFinder;
062import org.biojava.nbio.structure.io.FileParsingParameters;
063import org.biojava.nbio.structure.io.SeqRes2AtomAligner;
064import org.biojava.nbio.structure.io.mmcif.model.AtomSite;
065import org.biojava.nbio.structure.io.mmcif.model.AtomSites;
066import org.biojava.nbio.structure.io.mmcif.model.AuditAuthor;
067import org.biojava.nbio.structure.io.mmcif.model.Cell;
068import org.biojava.nbio.structure.io.mmcif.model.ChemComp;
069import org.biojava.nbio.structure.io.mmcif.model.ChemCompAtom;
070import org.biojava.nbio.structure.io.mmcif.model.ChemCompBond;
071import org.biojava.nbio.structure.io.mmcif.model.ChemCompDescriptor;
072import org.biojava.nbio.structure.io.mmcif.model.DatabasePDBremark;
073import org.biojava.nbio.structure.io.mmcif.model.DatabasePDBrev;
074import org.biojava.nbio.structure.io.mmcif.model.DatabasePdbrevRecord;
075import org.biojava.nbio.structure.io.mmcif.model.Entity;
076import org.biojava.nbio.structure.io.mmcif.model.EntityPoly;
077import org.biojava.nbio.structure.io.mmcif.model.EntityPolySeq;
078import org.biojava.nbio.structure.io.mmcif.model.EntitySrcGen;
079import org.biojava.nbio.structure.io.mmcif.model.EntitySrcNat;
080import org.biojava.nbio.structure.io.mmcif.model.EntitySrcSyn;
081import org.biojava.nbio.structure.io.mmcif.model.Exptl;
082import org.biojava.nbio.structure.io.mmcif.model.PdbxAuditRevisionHistory;
083import org.biojava.nbio.structure.io.mmcif.model.PdbxChemCompDescriptor;
084import org.biojava.nbio.structure.io.mmcif.model.PdbxChemCompIdentifier;
085import org.biojava.nbio.structure.io.mmcif.model.PdbxDatabaseStatus;
086import org.biojava.nbio.structure.io.mmcif.model.PdbxEntityNonPoly;
087import org.biojava.nbio.structure.io.mmcif.model.PdbxNonPolyScheme;
088import org.biojava.nbio.structure.io.mmcif.model.PdbxPolySeqScheme;
089import org.biojava.nbio.structure.io.mmcif.model.PdbxStructAssembly;
090import org.biojava.nbio.structure.io.mmcif.model.PdbxStructAssemblyGen;
091import org.biojava.nbio.structure.io.mmcif.model.PdbxStructOperList;
092import org.biojava.nbio.structure.io.mmcif.model.Refine;
093import org.biojava.nbio.structure.io.mmcif.model.Struct;
094import org.biojava.nbio.structure.io.mmcif.model.StructAsym;
095import org.biojava.nbio.structure.io.mmcif.model.StructConn;
096import org.biojava.nbio.structure.io.mmcif.model.StructKeywords;
097import org.biojava.nbio.structure.io.mmcif.model.StructNcsOper;
098import org.biojava.nbio.structure.io.mmcif.model.StructRef;
099import org.biojava.nbio.structure.io.mmcif.model.StructRefSeq;
100import org.biojava.nbio.structure.io.mmcif.model.StructRefSeqDif;
101import org.biojava.nbio.structure.io.mmcif.model.StructSite;
102import org.biojava.nbio.structure.io.mmcif.model.StructSiteGen;
103import org.biojava.nbio.structure.io.mmcif.model.Symmetry;
104import org.biojava.nbio.structure.quaternary.BioAssemblyInfo;
105import org.biojava.nbio.structure.quaternary.BiologicalAssemblyBuilder;
106import org.biojava.nbio.structure.quaternary.BiologicalAssemblyTransformation;
107import org.biojava.nbio.structure.xtal.CrystalCell;
108import org.biojava.nbio.structure.xtal.SpaceGroup;
109import org.biojava.nbio.structure.xtal.SymoplibParser;
110import org.slf4j.Logger;
111import org.slf4j.LoggerFactory;
112
113/**
114 * A MMcifConsumer implementation that builds an in-memory representation of the
115 * content of a mmcif file as a BioJava Structure object.
116 *
117 * @author Andreas Prlic
118 * @since 1.7
119 */
120
121public class SimpleMMcifConsumer implements MMcifConsumer {
122
123        private static final Logger logger = LoggerFactory.getLogger(SimpleMMcifConsumer.class);
124
125        private Structure structure;
126        private Chain currentChain;
127        private Group currentGroup;
128
129        /**
130         * A temporary data structure to hold all parsed chains
131         */
132        private ArrayList<List<Chain>> allModels; 
133        /**
134         * The current set of chains per model
135         */
136        private List<Chain>      currentModel;
137        private List<Entity>     entities;
138        /**
139         * Needed in header only mode to get mapping between asym ids and author ids
140         */
141        private List<EntityPoly> entityPolys;
142        private List<StructRef>  strucRefs;
143        private List<Chain>      seqResChains;
144        private List<Chain>      entityChains; // needed to link entities, chains and compounds...
145        private List<StructAsym> structAsyms;  // needed to link entities, chains and compounds...
146        private List<PdbxStructOperList> structOpers ; //
147        private List<PdbxStructAssembly> strucAssemblies;
148        private List<PdbxStructAssemblyGen> strucAssemblyGens;
149        private List<EntitySrcGen> entitySrcGens;
150        private List<EntitySrcNat> entitySrcNats;
151        private List<EntitySrcSyn> entitySrcSyns;
152        private List<StructConn> structConn;
153        private List<StructNcsOper> structNcsOper;
154        private List<StructRefSeqDif> sequenceDifs;
155        private List<StructSiteGen> structSiteGens;
156        
157        private Matrix4d parsedScaleMatrix;
158
159
160
161        /**
162         * A map of asym ids (internal chain ids) to entity ids extracted from
163         * the _struct_asym category
164         */
165        private Map<String,String> asymId2entityId;
166
167        /**
168         * A map of asym ids (internal chain ids) to author ids extracted from 
169         * the _entity_poly category. Used in header only parsing.
170         */
171        private Map<String,String> asymId2authorId;
172
173        private String currentNmrModelNumber ;
174
175        private FileParsingParameters params;
176
177        public  SimpleMMcifConsumer(){
178                params = new FileParsingParameters();
179                documentStart();
180
181        }
182
183        @Override
184        public void newEntity(Entity entity) {
185                logger.debug("New entity: {}",entity.toString());
186                entities.add(entity);
187        }
188
189        @Override
190        public void newEntityPoly(EntityPoly entityPoly) {
191                entityPolys.add(entityPoly);
192        }
193
194        @Override
195        public void newPdbxStructOperList(PdbxStructOperList structOper){
196
197                structOpers.add(structOper);
198        }
199
200        @Override
201        public void newStructAsym(StructAsym sasym){
202
203                structAsyms.add(sasym);
204        }
205
206        private Entity getEntity(int entity_id){
207                try {
208                        for (Entity e: entities){
209                                int eId = Integer.parseInt(e.getId());
210                                if  (eId== entity_id){
211                                        return e;
212                                }
213                        }
214                } catch (NumberFormatException e) {
215                        logger.warn("Entity id does not look like a number:", e.getMessage());
216                }
217                return null;
218        }
219
220        @Override
221        public void newStructKeywords(StructKeywords kw){
222                PDBHeader header = structure.getPDBHeader();
223                if ( header == null)
224                        header = new PDBHeader();
225                header.setDescription(kw.getPdbx_keywords());
226                header.setClassification(kw.getPdbx_keywords());
227        }
228
229        @Override
230        public void setStruct(Struct struct) {
231
232                PDBHeader header = structure.getPDBHeader();
233                if ( header == null)
234                        header = new PDBHeader();
235
236                header.setTitle(struct.getTitle());
237                header.setIdCode(struct.getEntry_id());
238                //header.setDescription(struct.getPdbx_descriptor());
239                //header.setClassification(struct.getPdbx_descriptor());
240                //header.setDescription(struct.getPdbx_descriptor());
241
242
243
244                structure.setPDBHeader(header);
245                structure.setPDBCode(struct.getEntry_id());
246        }
247
248        /** initiate new group, either Hetatom, Nucleotide, or AminoAcid */
249        private Group getNewGroup(String recordName,Character aminoCode1, long seq_id,String groupCode3) {
250
251                Group g = ChemCompGroupFactory.getGroupFromChemCompDictionary(groupCode3);
252                if ( g != null && !g.getChemComp().isEmpty()) {
253                        if ( g instanceof AminoAcidImpl) {
254                                AminoAcidImpl aa = (AminoAcidImpl) g;
255                                aa.setId(seq_id);
256                        } else if ( g instanceof NucleotideImpl) {
257                                NucleotideImpl nuc =  (NucleotideImpl) g;
258                                nuc.setId(seq_id);
259                        } else if ( g instanceof HetatomImpl) {
260                                HetatomImpl het = (HetatomImpl)g;
261                                het.setId(seq_id);
262                        }
263                        return g;
264                }
265
266
267
268                Group group;
269                if ( recordName.equals("ATOM") ) {
270                        if (StructureTools.isNucleotide(groupCode3))  {
271                                // it is a nucleotide
272                                NucleotideImpl nu = new NucleotideImpl();
273                                group = nu;
274                                nu.setId(seq_id);
275
276                        } else if (aminoCode1==null || aminoCode1 == StructureTools.UNKNOWN_GROUP_LABEL){
277                                HetatomImpl h = new HetatomImpl();
278                                h.setId(seq_id);
279                                group = h;
280
281                        } else {
282                                AminoAcidImpl aa = new AminoAcidImpl() ;
283                                aa.setAminoType(aminoCode1);
284                                aa.setId(seq_id);
285                                group = aa ;
286                        }
287                }
288                else {
289                        if (StructureTools.isNucleotide(groupCode3))  {
290                                // it is a nucleotide
291                                NucleotideImpl nu = new NucleotideImpl();
292                                group = nu;
293                                nu.setId(seq_id);
294                        }
295                        else if (aminoCode1 != null ) {
296                                AminoAcidImpl aa = new AminoAcidImpl() ;
297                                aa.setAminoType(aminoCode1);
298                                aa.setId(seq_id);
299                                group = aa ;
300                        } else {
301                                HetatomImpl h = new HetatomImpl();
302                                h.setId(seq_id);
303                                group = h;
304                        }
305                }
306                return  group ;
307        }
308
309        /**
310         * Test if the given asymId is already present in the list of chains given. If yes, returns the chain
311         * otherwise returns null.
312         */
313        private static Chain isKnownChain(String asymId, List<Chain> chains){
314
315                for (int i = 0; i< chains.size();i++){
316                        Chain testchain =  chains.get(i);
317                        //System.out.println("comparing chainID >"+chainID+"< against testchain " + i+" >" +testchain.getName()+"<");
318                        if (asymId.equals(testchain.getId())) {
319                                //System.out.println("chain "+ chainID+" already known ...");
320                                return testchain;
321                        }
322                }
323
324                return null;
325        }
326
327        @Override
328        public void newAtomSite(AtomSite atom) {
329
330                if (params.isHeaderOnly()) return;
331
332                // Warning: getLabel_asym_id is not the "chain id" in the PDB file
333                // it is the internally used chain id.
334                // later on we will fix this...
335
336                // later one needs to map the asym id to the pdb_strand_id
337
338                //TODO: add support for FileParsingParams.getMaxAtoms()
339
340                boolean startOfNewChain = false;
341
342                String asymId = atom.getLabel_asym_id();
343                String authId = atom.getAuth_asym_id();
344
345                String recordName    = atom.getGroup_PDB();
346                String residueNumberS = atom.getAuth_seq_id();
347                Integer residueNrInt = Integer.parseInt(residueNumberS);
348
349                // the 3-letter name of the group:
350                String groupCode3    = atom.getLabel_comp_id();
351
352                boolean isHetAtomInFile = false;
353
354                Character aminoCode1 = null;
355                if ( recordName.equals("ATOM") )
356                        aminoCode1 = StructureTools.get1LetterCodeAmino(groupCode3);
357                else {
358                        aminoCode1 = StructureTools.get1LetterCodeAmino(groupCode3);
359
360                        // for nucleotides this will be null..
361                        if (aminoCode1 != null &&  aminoCode1.equals(StructureTools.UNKNOWN_GROUP_LABEL))
362                                aminoCode1 = null;
363
364                        isHetAtomInFile = true;
365                }
366                String insCodeS = atom.getPdbx_PDB_ins_code();
367                Character insCode = null;
368                if (!  insCodeS.equals("?")) {
369                        insCode = insCodeS.charAt(0);
370                }
371                // we store the internal seq id in the Atom._id field
372                // this is not a PDB file field but we need this to internally assign the insertion codes later
373                // from the pdbx_poly_seq entries..
374
375                long seq_id = -1;
376                try {
377                        seq_id = Long.parseLong(atom.getLabel_seq_id());
378                } catch (NumberFormatException e){
379                        // non polymer chains (ligands and small molecules) will have a label_seq_id set to '.', thus it is ok to
380                        // silently ignore this
381                        //logger.debug("Could not parse number for _atom_site.label_seq_id: "+e.getMessage());
382                }
383
384                String nmrModelNumber = atom.getPdbx_PDB_model_num();
385
386                if ( currentNmrModelNumber == null) {
387                        currentNmrModelNumber = nmrModelNumber;
388                }
389
390                if (! currentNmrModelNumber.equals(nmrModelNumber)){
391                        currentNmrModelNumber = nmrModelNumber;
392
393                        // add previous data
394                        if ( currentChain != null ) {
395                                currentChain.addGroup(currentGroup);
396                                currentGroup.trimToSize();
397                        }
398
399                        // we came to the beginning of a new NMR model
400                        allModels.add(currentModel);
401                        currentModel = new ArrayList<Chain>();
402                        currentChain = null;
403                        currentGroup = null;
404                }
405
406
407                if (currentChain == null) {
408
409                        currentChain = new ChainImpl();
410                        currentChain.setName(authId);
411                        currentChain.setId(asymId);
412                        currentModel.add(currentChain);
413                        startOfNewChain = true;
414                }
415
416                //System.out.println("BEFORE: " + chain_id + " " + current_chain.getName());
417                if ( ! asymId.equals(currentChain.getId()) ) {
418                        //logger.info("unknown chain. creating new chain. authId:" + authId + " asymId: " + asymId);
419                        startOfNewChain = true;
420
421                        // end up old chain...
422                        currentChain.addGroup(currentGroup);
423
424                        // see if old chain is known ...
425                        Chain testchain = isKnownChain(asymId,currentModel);
426
427                        if ( testchain == null) {
428                                //logger.info("unknown chain. creating new chain. authId:" + authId + " asymId: " + asymId);
429
430                                currentChain = new ChainImpl();
431                                currentChain.setName(authId);
432                                currentChain.setId(asymId);
433
434                        }   else {
435                                currentChain = testchain;
436                        }
437
438                        if ( ! currentModel.contains(currentChain))
439                                currentModel.add(currentChain);
440
441                }
442
443
444                ResidueNumber residueNumber = new ResidueNumber(authId,residueNrInt, insCode);
445
446                if (currentGroup == null) {
447
448
449                        currentGroup = getNewGroup(recordName,aminoCode1,seq_id, groupCode3);
450
451                        currentGroup.setResidueNumber(residueNumber);
452                        currentGroup.setPDBName(groupCode3);
453                        currentGroup.setHetAtomInFile(isHetAtomInFile);
454                }
455
456                // SET UP THE ALT LOC GROUP
457                Group altGroup = null;
458                String altLocS = atom.getLabel_alt_id();
459                Character altLoc = ' ';
460                if ( altLocS.length()>0) {
461                        altLoc = altLocS.charAt(0);
462                        if ( altLoc.equals('.') )
463                                altLoc = ' ';
464
465                }
466                // If it's the start of the new chain 
467                if ( startOfNewChain){
468                        currentGroup = getNewGroup(recordName,aminoCode1,seq_id, groupCode3);
469                        currentGroup.setResidueNumber(residueNumber);
470                        currentGroup.setPDBName(groupCode3);
471                        currentGroup.setHetAtomInFile(isHetAtomInFile);
472                }
473                // ANTHONY BRADLEY ADDED THIS -> WE ONLY WAN'T TO CHECK FOR ALT LOCS WHEN IT's NOT THE FIRST GROUP IN CHAIN
474                else{
475                        // check if residue number is the same ...
476                        // insertion code is part of residue number
477                        if ( ! residueNumber.equals(currentGroup.getResidueNumber())) {
478                                //System.out.println("end of residue: "+current_group.getPDBCode()+" "+residueNrInt);
479                                currentChain.addGroup(currentGroup);
480                                currentGroup.trimToSize();
481                                currentGroup = getNewGroup(recordName,aminoCode1,seq_id,groupCode3);
482                                currentGroup.setPDBName(groupCode3);
483                                currentGroup.setResidueNumber(residueNumber);
484                                currentGroup.setHetAtomInFile(isHetAtomInFile);
485
486
487                        } else {
488                                // same residueNumber, but altLocs...
489                                // test altLoc
490
491                                if ( ! altLoc.equals(' ') && ( ! altLoc.equals('.'))) {
492                                        logger.debug("found altLoc! " + altLoc + " " + currentGroup + " " + altGroup);
493                                        altGroup = getCorrectAltLocGroup( altLoc,recordName,aminoCode1,groupCode3, seq_id);
494                                        if (altGroup.getChain()==null) {
495                                                altGroup.setChain(currentChain);
496                                        }
497                                }
498                        }
499                }
500                //atomCount++;
501                //System.out.println("fixing atom name for  >" + atom.getLabel_atom_id() + "< >" + fullname + "<");
502
503
504                if ( params.isParseCAOnly() ){
505                        // yes , user wants to get CA only
506                        // only parse CA atoms...
507                        if (! (atom.getLabel_atom_id().equals(StructureTools.CA_ATOM_NAME) && atom.getType_symbol().equals("C"))) {
508                                //System.out.println("ignoring " + line);
509                                //atomCount--;
510                                return;
511                        }
512                }
513
514                //see if chain_id is one of the previous chains ...
515
516                Atom a = convertAtom(atom);
517
518                //see if chain_id is one of the previous chains ...
519                if ( altGroup != null) {
520                        altGroup.addAtom(a);
521                        altGroup = null;
522                }
523                else {
524                        currentGroup.addAtom(a);
525                }
526
527
528                String atomName = a.getName();
529                // make sure that main group has all atoms 
530                // GitHub issue: #76
531                if ( ! currentGroup.hasAtom(atomName)) {
532                        // Unless it's microheterogenity https://github.com/rcsb/codec-devel/issues/81
533                        if (currentGroup.getPDBName().equals(a.getGroup().getPDBName())) {
534                                if(!StructureTools.hasNonDeuteratedEquiv(a,currentGroup)){
535                                        currentGroup.addAtom(a);
536                                }
537                        }
538
539                }
540        }
541
542        /** 
543         * Convert a mmCIF AtomSite object to a BioJava Atom object
544         *
545         * @param atom the mmmcif AtomSite record
546         * @return an Atom
547         */
548        private Atom convertAtom(AtomSite atom){
549
550
551                Atom a = new AtomImpl();
552
553                a.setPDBserial(Integer.parseInt(atom.getId()));
554                a.setName(atom.getLabel_atom_id());
555
556                double x = Double.parseDouble (atom.getCartn_x());
557                double y = Double.parseDouble (atom.getCartn_y());
558                double z = Double.parseDouble (atom.getCartn_z());
559                a.setX(x);
560                a.setY(y);
561                a.setZ(z);
562
563                float occupancy = Float.parseFloat (atom.getOccupancy());
564                a.setOccupancy(occupancy);
565
566                float temp = Float.parseFloat (atom.getB_iso_or_equiv());
567                a.setTempFactor(temp);
568
569                String alt = atom.getLabel_alt_id();
570                if (( alt != null ) && ( alt.length() > 0) && (! alt.equals("."))){
571                        a.setAltLoc(new Character(alt.charAt(0)));
572                } else {
573                        a.setAltLoc(new Character(' '));
574                }
575
576                Element element = Element.R;
577                try {
578                        element = Element.valueOfIgnoreCase(atom.getType_symbol());
579                }  catch (IllegalArgumentException e) {
580                        logger.info("Element {} was not recognised as a BioJava-known element, the element will be represented as the generic element {}", atom.getType_symbol(), Element.R.name());
581                }
582                a.setElement(element);
583
584                return a;
585
586        }
587
588
589        private Group getCorrectAltLocGroup( Character altLoc,
590                        String recordName,
591                        Character aminoCode1,
592                        String groupCode3,
593                        long seq_id) {
594
595                // see if we know this altLoc already;
596                List<Atom> atoms = currentGroup.getAtoms();
597                if ( atoms.size() > 0) {
598                        Atom a1 = atoms.get(0);
599                        // we are just adding atoms to the current group
600                        // probably there is a second group following later...
601                        if (a1.getAltLoc().equals(altLoc)) {
602
603                                return currentGroup;
604                        }
605                }
606
607                List<Group> altLocs = currentGroup.getAltLocs();
608                for ( Group altLocG : altLocs ){
609                        atoms = altLocG.getAtoms();
610                        if ( atoms.size() > 0) {
611                                for ( Atom a1 : atoms) {
612                                        if (a1.getAltLoc().equals( altLoc)) {
613
614                                                return altLocG;
615                                        }
616                                }
617                        }
618                }
619
620                // no matching altLoc group found.
621                // build it up.
622
623                if ( groupCode3.equals(currentGroup.getPDBName())) {
624                        if ( currentGroup.getAtoms().size() == 0) {
625                                //System.out.println("current group is empty " + current_group + " " + altLoc);
626                                return currentGroup;
627                        }
628                        //System.out.println("cloning current group " + current_group + " " + current_group.getAtoms().get(0).getAltLoc() + " altLoc " + altLoc);
629                        Group altLocG = (Group) currentGroup.clone();
630                        // drop atoms from cloned group...
631                        // https://redmine.open-bio.org/issues/3307
632                        altLocG.setAtoms(new ArrayList<Atom>());
633                        altLocG.getAltLocs().clear();
634                        currentGroup.addAltLoc(altLocG);
635                        return altLocG;
636                }
637
638                //      System.out.println("new  group " + recordName + " " + aminoCode1 + " " +groupCode3);
639                //String recordName,Character aminoCode1, long seq_id,String groupCode3) {
640                Group altLocG = getNewGroup(recordName,aminoCode1,seq_id,groupCode3);
641
642                altLocG.setPDBName(groupCode3);
643                altLocG.setResidueNumber(currentGroup.getResidueNumber());
644                currentGroup.addAltLoc(altLocG);
645                return altLocG;
646        }
647
648        /** 
649         * Start the parsing
650         */
651        @Override
652        public void documentStart() {
653                structure = new StructureImpl();
654
655                currentChain        = null;
656                currentGroup            = null;
657                currentNmrModelNumber   = null;
658                //atomCount                     = 0;
659
660                allModels     = new ArrayList<List<Chain>>();
661                currentModel  = new ArrayList<Chain>();
662                entities      = new ArrayList<Entity>();
663                entityPolys   = new ArrayList<>();
664                strucRefs     = new ArrayList<StructRef>();
665                seqResChains  = new ArrayList<Chain>();
666                entityChains  = new ArrayList<Chain>();
667                structAsyms   = new ArrayList<StructAsym>();
668
669                asymId2entityId = new HashMap<String,String>();
670                asymId2authorId = new HashMap<>();
671                structOpers   = new ArrayList<PdbxStructOperList>();
672                strucAssemblies = new ArrayList<PdbxStructAssembly>();
673                strucAssemblyGens = new ArrayList<PdbxStructAssemblyGen>();
674                entitySrcGens = new ArrayList<EntitySrcGen>();
675                entitySrcNats = new ArrayList<EntitySrcNat>();
676                entitySrcSyns = new ArrayList<EntitySrcSyn>();
677                structConn = new ArrayList<StructConn>();
678                structNcsOper = new ArrayList<StructNcsOper>();
679                sequenceDifs = new ArrayList<StructRefSeqDif>();
680                structSiteGens = new ArrayList<StructSiteGen>();
681        }
682
683
684        @Override
685        public void documentEnd() {
686
687                // Expected that there is one current_chain that needs to be added to the model
688                // When in headerOnly mode, no Atoms are read, and there will not be an active
689                // current_chain.
690                if ( currentChain != null ) {
691
692                        currentChain.addGroup(currentGroup);
693                        if (isKnownChain(currentChain.getId(),currentModel) == null) {
694                                currentModel.add(currentChain);
695                        }
696                } else if (!params.isHeaderOnly()){
697                        logger.warn("current chain is null at end of document.");
698                }
699
700                allModels.add(currentModel);
701
702                // this populates the asymId2authorId and asymId2entityId maps, needed in header only mode to get the mapping 
703                // between the 2 chain identifiers.
704                initMaps();
705
706                for (StructAsym asym : structAsyms) {
707
708                        logger.debug("Entity {} matches asym_id: {}", asym.getEntity_id(), asym.getId() );
709
710                        Chain s = getEntityChain(asym.getEntity_id());
711                        Chain seqres = (Chain)s.clone();
712                        // to solve issue #160 (e.g. 3u7t)
713                        seqres = removeSeqResHeterogeneity(seqres);
714                        seqres.setId(asym.getId());
715                        if (asymId2authorId.get(asym.getId()) !=null ){ 
716                                seqres.setName(asymId2authorId.get(asym.getId()));
717                        } else {
718                                seqres.setName(asym.getId());
719                        }
720
721                        EntityType type = null;
722                        try {
723                                Entity ent = getEntity(Integer.parseInt(asym.getEntity_id()));
724                                type = EntityType.entityTypeFromString(ent.getType());
725                        } catch (NumberFormatException e) {
726                                logger.debug("Could not parse integer from entity id field {}", asym.getEntity_id());
727                        }
728
729                        // we'll only add seqres chains that are polymeric or unknown
730                        if (type==null || type==EntityType.POLYMER ) {
731                                seqResChains.add(seqres);       
732                        }
733
734                        logger.debug(" seqres: " + asym.getId() + " " + seqres + "<") ;
735                        // adding the entities to structure
736                        addEntities(asym);
737
738                }
739
740                if (structAsyms.isEmpty()) {
741                        logger.warn("No _struct_asym category in file, no SEQRES groups will be added.");
742                }
743
744                // entities
745                // In addEntities above we created the entities if they were present in the file
746                // Now we need to make sure that they are linked to chains and also that if they are not present in the file we need to add them now
747                linkEntities();
748
749                // now that we know the entities, we can add all chains to structure so that they are stored
750                // properly as polymer/nonpolymer/water chains inside structure
751                for (List<Chain> model:allModels) {
752                        structure.addModel(model);
753                }
754
755                // Only align if requested (default) and not when headerOnly mode with no Atoms.
756                // Otherwise, we store the empty SeqRes Groups unchanged in the right chains.
757                if ( params.isAlignSeqRes() && !params.isHeaderOnly() ){
758                        logger.debug("Parsing mode align_seqres, will parse SEQRES and align to ATOM sequence");
759                        alignSeqRes();
760                } else {
761                        logger.debug("Parsing mode unalign_seqres, will parse SEQRES but not align it to ATOM sequence");
762                        SeqRes2AtomAligner.storeUnAlignedSeqRes(structure, seqResChains, params.isHeaderOnly());
763                }
764
765
766                // Now make sure all altlocgroups have all the atoms in all the groups
767                StructureTools.cleanUpAltLocs(structure);
768
769
770                // NOTE bonds and charges can only be done at this point that the chain id mapping is properly sorted out
771                if (!params.isHeaderOnly()) {
772                        if ( params.shouldCreateAtomBonds()) {
773                                addBonds();
774                        }
775
776                        if ( params.shouldCreateAtomCharges()) {
777                                addCharges();
778                        }
779                }
780
781                if (!params.isHeaderOnly()) {
782
783                        // Do structure.setSites(sites) after any chain renaming to be like PDB.
784                        addSites();
785                }
786
787
788
789                // set the oligomeric state info in the header...
790                if (params.isParseBioAssembly()) {
791
792                        // the more detailed mapping of chains to rotation operations happens in StructureIO...
793
794                        Map<Integer,BioAssemblyInfo> bioAssemblies = new HashMap<Integer, BioAssemblyInfo>();
795
796                        for ( PdbxStructAssembly psa : strucAssemblies){
797
798                                List<PdbxStructAssemblyGen> psags = new ArrayList<PdbxStructAssemblyGen>(1);
799
800                                for ( PdbxStructAssemblyGen psag: strucAssemblyGens ) {
801                                        if ( psag.getAssembly_id().equals(psa.getId())) {
802                                                psags.add(psag);
803                                        }
804                                }
805
806                                BiologicalAssemblyBuilder builder = new BiologicalAssemblyBuilder();
807
808                                // these are the transformations that need to be applied to our model
809                                List<BiologicalAssemblyTransformation> transformations = builder.getBioUnitTransformationList(psa, psags, structOpers);
810
811                                int bioAssemblyId = -1;
812                                try {
813                                        bioAssemblyId = Integer.parseInt(psa.getId());
814                                } catch (NumberFormatException e) {
815                                        logger.info("Could not parse a numerical bio assembly id from '{}'",psa.getId());
816                                }
817
818                                // if bioassembly id is not numerical we throw it away
819                                // this happens usually for viral capsid entries, like 1ei7
820                                // see issue #230 in github
821                                if (bioAssemblyId!=-1) {
822                                        int mmSize = 0;
823                                        // note that the transforms contain asym ids of both polymers and non-polymers
824                                        // For the mmsize, we are only interested in the polymers
825                                        for (BiologicalAssemblyTransformation transf:transformations) {
826                                                Chain c = structure.getChain(transf.getChainId());
827                                                if (c==null) {
828                                                        logger.info("Could not find asym id {} specified in struct_assembly_gen", transf.getChainId());
829                                                        continue;
830                                                }
831                                                if (c.getEntityType() == EntityType.POLYMER &&
832                                                        // for entries like 4kro, sugars are annotated as polymers but we
833                                                        // don't want them in the macromolecularSize count
834                                                        !c.getEntityInfo().getDescription().contains("SUGAR") ) {
835                                                                
836                                                                mmSize++;
837                                                        }
838                                        }
839                                        
840                                        BioAssemblyInfo bioAssembly = new BioAssemblyInfo();
841                                        bioAssembly.setId(bioAssemblyId);
842                                        bioAssembly.setMacromolecularSize(mmSize);
843                                        bioAssembly.setTransforms(transformations);
844                                        bioAssemblies.put(bioAssemblyId,bioAssembly);
845                                }
846
847                        }
848                        structure.getPDBHeader().setBioAssemblies(bioAssemblies);
849                }
850
851                setStructNcsOps();
852                
853                setCrystallographicInfoMetadata();
854
855
856                Map<String,List<SeqMisMatch>> misMatchMap = new HashMap<String, List<SeqMisMatch>>();
857                for (StructRefSeqDif sdif : sequenceDifs) {
858                        SeqMisMatch misMatch = new SeqMisMatchImpl();
859                        misMatch.setDetails(sdif.getDetails());
860
861                        String insCode = sdif.getPdbx_pdb_ins_code();
862                        if ( insCode != null && insCode.equals("?"))
863                                insCode = null;
864                        misMatch.setInsCode(insCode);
865                        misMatch.setOrigGroup(sdif.getDb_mon_id());
866                        misMatch.setPdbGroup(sdif.getMon_id());
867                        misMatch.setPdbResNum(sdif.getPdbx_auth_seq_num());
868                        misMatch.setUniProtId(sdif.getPdbx_seq_db_accession_code());
869                        misMatch.setSeqNum(sdif.getSeq_num());
870
871
872                        List<SeqMisMatch> mms = misMatchMap.get(sdif.getPdbx_pdb_strand_id());
873                        if ( mms == null) {
874                                mms = new ArrayList<SeqMisMatch>();
875                                misMatchMap.put(sdif.getPdbx_pdb_strand_id(),mms);
876                        }
877                        mms.add(misMatch);
878
879                }
880
881                for (String chainId : misMatchMap.keySet()){
882
883                        Chain chain = structure.getPolyChainByPDB(chainId);
884
885                        if ( chain == null) {
886                                logger.warn("Could not set mismatches for chain with author id" + chainId);
887                                continue;
888                        }
889
890                        chain.setSeqMisMatches(misMatchMap.get(chainId));
891
892
893                }
894
895        }
896
897        /**
898         * Here we link entities to chains.
899         * Also if entities are not present in file, this initialises the entities with some heuristics, see {@link org.biojava.nbio.structure.io.EntityFinder}
900         */
901        private void linkEntities() {
902
903                for (int i =0; i< allModels.size() ; i++){
904                        for (Chain chain : allModels.get(i)) {
905                                //logger.info("linking entities for " + chain.getId() + " "  + chain.getName());
906                                String entityId = asymId2entityId.get(chain.getId());
907
908                                if (entityId==null) {
909                                        // this can happen for instance if the cif file didn't have _struct_asym category at all
910                                        // and thus we have no asymId2entityId mapping at all
911                                        logger.info("No entity id could be found for chain {}", chain.getId());
912                                        continue;
913                                }
914                                int eId = Integer.parseInt(entityId);
915
916                                // Entities are not added for non-polymeric entities, if a chain is non-polymeric its entity won't be found.
917                                // TODO: add all entities and unique compounds and add methods to directly get polymer or non-polymer
918                                // asyms (chains).  Either create a unique StructureImpl or modify existing for a better representation of the
919                                // mmCIF internal data structures but is compatible with Structure interface.
920                                // Some examples of PDB entries with this kind of problem:
921                                //   - 2uub: asym_id X, chainName Z, entity_id 24: fully non-polymeric but still with its own chainName
922                                //   - 3o6j: asym_id K, chainName Z, entity_id 6 : a single water molecule
923                                //   - 1dz9: asym_id K, chainName K, entity_id 6 : a potassium ion alone
924
925                                EntityInfo entityInfo = structure.getEntityById(eId);
926                                if (entityInfo==null) {
927                                        // Supports the case where the only chain members were from non-polymeric entity that is missing.
928                                        // Solved by creating a new Compound(entity) to which this chain will belong.
929                                        logger.info("Could not find an Entity for entity_id {}, for chain id {}, creating a new Entity.",
930                                                        eId, chain.getId());
931                                        entityInfo = new EntityInfo();
932                                        entityInfo.setMolId(eId);
933                                        entityInfo.addChain(chain);
934                                        if (chain.isWaterOnly()) {
935                                                entityInfo.setType(EntityType.WATER);
936                                        } else {
937                                                entityInfo.setType(EntityType.NONPOLYMER);
938                                        }
939                                        chain.setEntityInfo(entityInfo);
940                                        structure.addEntityInfo(entityInfo);
941                                } else {
942                                        logger.debug("Adding chain with chain id {} (auth id {}) to Entity with entity_id {}",
943                                                        chain.getId(), chain.getName(), eId);
944                                        entityInfo.addChain(chain);
945                                        chain.setEntityInfo(entityInfo);
946                                }
947
948                        }
949
950                }
951
952                // if no entity information was present in file we then go and find the entities heuristically with EntityFinder
953                List<EntityInfo> entityInfos = structure.getEntityInfos();
954                if (entityInfos==null || entityInfos.isEmpty()) {
955
956                        List<List<Chain>> polyModels = new ArrayList<>();
957                        List<List<Chain>> nonPolyModels = new ArrayList<>();
958                        List<List<Chain>> waterModels = new ArrayList<>();
959
960                        for (List<Chain> model:allModels) {
961
962                                List<Chain> polyChains = new ArrayList<>();
963                                List<Chain> nonPolyChains = new ArrayList<>();
964                                List<Chain> waterChains = new ArrayList<>();
965
966                                polyModels.add(polyChains);
967                                nonPolyModels.add(nonPolyChains);
968                                waterModels.add(waterChains);
969
970                                for (Chain c:model) {
971
972                                        // we only have entities for polymeric chains, all others are ignored for assigning entities
973                                        if (c.isWaterOnly()) {
974                                                waterChains.add(c);
975
976                                        } else if (c.isPureNonPolymer()) {
977                                                nonPolyChains.add(c);
978
979                                        } else {
980                                                polyChains.add(c);
981                                        }
982                                }
983                        }
984
985                        entityInfos = EntityFinder.findPolyEntities(polyModels);
986                        EntityFinder.createPurelyNonPolyEntities(nonPolyModels, waterModels, entityInfos);
987
988
989                        structure.setEntityInfos(entityInfos);
990                }
991
992                // final sanity check: it can happen that from the annotated entities some are not linked to any chains
993                // e.g. 3s26: a sugar entity does not have any chains associated to it (it seems to be happening with many sugar compounds)
994                // we simply log it, this can sign some other problems if the entities are used down the line
995                for (EntityInfo e:entityInfos) {
996                        if (e.getChains().isEmpty()) {
997                                logger.info("Entity {} '{}' has no chains associated to it",
998                                                e.getMolId()<0?"with no entity id":e.getMolId(), e.getDescription());
999                        }
1000                }
1001
1002        }
1003
1004        private void addCharges() {
1005                ChargeAdder.addCharges(structure);
1006        }
1007
1008        /**
1009         * The method will return a new reference to a Chain with any consecutive groups
1010         * having same residue numbers removed.
1011         * This is necessary to solve the microheterogeneity issue in entries like 3u7t (see github issue #160)
1012         * @param c
1013         * @return
1014         */
1015        private static Chain removeSeqResHeterogeneity(Chain c) {
1016
1017                Chain trimmedChain = new ChainImpl();
1018
1019                ResidueNumber lastResNum = null;
1020
1021                for (Group g:c.getAtomGroups()) {
1022
1023                        // note we have to deep copy this, otherwise they stay linked and would get altered in addGroup(g)
1024                        ResidueNumber currentResNum = new ResidueNumber(
1025                                        g.getResidueNumber().getChainName(),
1026                                        g.getResidueNumber().getSeqNum(),
1027                                        g.getResidueNumber().getInsCode());
1028
1029                        if (lastResNum == null || !lastResNum.equals(currentResNum) ) {
1030                                trimmedChain.addGroup(g);
1031                        } else {
1032                                logger.debug("Removing seqres group because it seems to be repeated in entity_poly_seq, most likely has hetero='y': "+g);
1033                        }
1034
1035                        lastResNum = currentResNum;
1036
1037                }
1038                return trimmedChain;
1039        }
1040
1041        private void addBonds() {
1042                BondMaker maker = new BondMaker(structure, params);
1043                maker.makeBonds();
1044                maker.formBondsFromStructConn(structConn);
1045        }
1046
1047        private void alignSeqRes() {
1048
1049                logger.debug("Parsing mode align_seqres, will align to ATOM to SEQRES sequence");
1050
1051                // fix SEQRES residue numbering for all models
1052
1053                for (int model=0;model<structure.nrModels();model++) {
1054
1055                        List<Chain> atomList   = structure.getModel(model);
1056
1057                        for (Chain seqResChain: seqResChains){
1058
1059                                // this extracts the matching atom chain from atomList
1060                                Chain atomChain = SeqRes2AtomAligner.getMatchingAtomRes(seqResChain, atomList, true);
1061
1062                                if (atomChain == null) {
1063                                        // most likely there's no observed residues at all for the seqres chain: can't map
1064                                        // e.g. 3zyb: chains with asym_id L,M,N,O,P have no observed residues
1065                                        logger.info("Could not map SEQRES chain with asym_id={} to any ATOM chain. Most likely there's no observed residues in the chain.",
1066                                                        seqResChain.getId());
1067                                        continue;
1068                                }
1069
1070                                //map the atoms to the seqres...
1071
1072                                // we need to first clone the seqres so that they stay independent for different models
1073                                List<Group> seqResGroups = new ArrayList<Group>();
1074                                for (int i=0;i<seqResChain.getAtomGroups().size();i++) {
1075                                        seqResGroups.add((Group)seqResChain.getAtomGroups().get(i).clone());
1076                                }
1077
1078                                for ( int seqResPos = 0 ; seqResPos < seqResGroups.size(); seqResPos++) {
1079                                        Group seqresG = seqResGroups.get(seqResPos);
1080                                        boolean found = false;
1081                                        for ( Group atomG: atomChain.getAtomGroups()) {
1082
1083                                                int internalNr = getInternalNr (atomG);
1084
1085                                                if (seqresG.getResidueNumber().getSeqNum() == internalNr ) {
1086                                                        seqResGroups.set(seqResPos, atomG);
1087                                                        found = true;
1088                                                        break;
1089                                                }
1090
1091
1092                                        }
1093                                        if ( ! found)
1094                                                // so far the residue number has tracked internal numbering.
1095                                                // however there are no atom records, as such this can't be a PDB residue number...
1096                                                seqresG.setResidueNumber(null);
1097                                }
1098                                atomChain.setSeqResGroups(seqResGroups);
1099
1100                        }
1101                }
1102        }
1103
1104        private int getInternalNr(Group atomG) {
1105                if ( atomG.getType().equals(GroupType.AMINOACID)) {
1106                        AminoAcidImpl aa = (AminoAcidImpl) atomG;
1107                        return new Long(aa.getId()).intValue();
1108                } else if ( atomG.getType().equals(GroupType.NUCLEOTIDE)) {
1109                        NucleotideImpl nu = (NucleotideImpl) atomG;
1110                        return new Long(nu.getId()).intValue();
1111                } else {
1112                        HetatomImpl he = (HetatomImpl) atomG;
1113                        return new Long(he.getId()).intValue();
1114                }
1115        }
1116
1117        private void addEntities(StructAsym asym) {
1118                int eId = 0;
1119                try {
1120                        eId = Integer.parseInt(asym.getEntity_id());
1121                } catch (NumberFormatException e) {
1122                        logger.warn("Could not parse mol_id from string {}. Will use 0 for creating Entity",asym.getEntity_id());
1123                }
1124                Entity e = getEntity(eId);
1125
1126                // for some mmCIF files like 1yrm all 3 of _entity_src_gen, _entity_src_nat and _pdbx_entity_src_syn are missing
1127                // we need to fill the Compounds in some other way:
1128
1129                EntityInfo entityInfo = structure.getEntityById(eId);
1130
1131                if (entityInfo==null) {
1132                        //logger.info("Creating new EntityInfo " + eId + " " + e.getId() + " " + e.getPdbx_description());
1133                        entityInfo = new EntityInfo();
1134                        entityInfo.setMolId(eId);
1135                        // we only add the compound if a polymeric one (to match what the PDB parser does)
1136                        if (e!=null) {
1137                                entityInfo.setDescription(e.getPdbx_description());
1138
1139                                EntityType eType = EntityType.entityTypeFromString(e.getType());
1140                                if (eType!=null) {
1141                                        entityInfo.setType(eType);
1142                                } else {
1143                                        logger.warn("Type '{}' is not recognised as a valid entity type for entity {}", e.getType(), eId);
1144                                }
1145                                addAncilliaryEntityData(asym, eId, e, entityInfo);
1146                                structure.addEntityInfo(entityInfo);
1147                                logger.debug("Adding Entity with entity id {} from _entity, with name: {}",eId, entityInfo.getDescription());
1148                        }
1149                }
1150        }
1151
1152
1153        /**
1154         * Add any extra information to the entity information.
1155         * @param asym 
1156         * @param entityId 
1157         * @param entity 
1158         * @param entityInfo 
1159         */
1160        private void addAncilliaryEntityData(StructAsym asym, int entityId, Entity entity, EntityInfo entityInfo) {
1161                // Loop through each of the entity types and add the corresponding data
1162                // We're assuming if data is duplicated between sources it is consistent
1163                // This is a potentially huge assumption...
1164
1165
1166                for (EntitySrcGen esg : entitySrcGens) {
1167
1168                        if (! esg.getEntity_id().equals(asym.getEntity_id()))
1169                                continue;
1170
1171                        addInformationFromESG(esg, entityId, entityInfo);
1172
1173                }
1174
1175                for (EntitySrcNat esn : entitySrcNats) {
1176                        if (! esn.getEntity_id().equals(asym.getEntity_id()))
1177                                continue;
1178                        addInformationFromESN(esn, entityId, entityInfo);
1179
1180                }
1181
1182                for (EntitySrcSyn ess : entitySrcSyns) {
1183                        if (! ess.getEntity_id().equals(asym.getEntity_id()))
1184                                continue;
1185                        addInfoFromESS(ess, entityId, entityInfo);
1186
1187                }               
1188        }
1189
1190        /**
1191         * Add the information from an ESG to a compound.
1192         * @param entitySrcInfo
1193         * @param entityId
1194         * @param c
1195         */
1196        private void addInformationFromESG(EntitySrcGen entitySrcInfo, int entityId, EntityInfo c) {
1197                c.setAtcc(entitySrcInfo.getPdbx_gene_src_atcc());
1198                c.setCell(entitySrcInfo.getPdbx_gene_src_cell());
1199                c.setOrganismCommon(entitySrcInfo.getGene_src_common_name());
1200                c.setOrganismScientific(entitySrcInfo.getPdbx_gene_src_scientific_name());
1201                c.setOrganismTaxId(entitySrcInfo.getPdbx_gene_src_ncbi_taxonomy_id());
1202                c.setExpressionSystemTaxId(entitySrcInfo.getPdbx_host_org_ncbi_taxonomy_id());
1203                c.setExpressionSystem(entitySrcInfo.getPdbx_host_org_scientific_name());
1204        }
1205
1206        /**
1207         * Add the information to entity info from ESN.
1208         * @param esn
1209         * @param eId
1210         * @param c
1211         */
1212        private void addInformationFromESN(EntitySrcNat esn, int eId, EntityInfo c) {
1213
1214                c.setAtcc(esn.getPdbx_atcc());
1215                c.setCell(esn.getPdbx_cell());
1216                c.setOrganismCommon(esn.getCommon_name());
1217                c.setOrganismScientific(esn.getPdbx_organism_scientific());
1218                c.setOrganismTaxId(esn.getPdbx_ncbi_taxonomy_id());
1219
1220        }
1221        /**
1222         * Add the information from ESS to Entity info.
1223         * @param ess
1224         * @param eId
1225         * @param c
1226         */
1227        private void addInfoFromESS(EntitySrcSyn ess, int eId, EntityInfo c) {
1228                c.setOrganismCommon(ess.getOrganism_common_name());
1229                c.setOrganismScientific(ess.getOrganism_scientific());
1230                c.setOrganismTaxId(ess.getNcbi_taxonomy_id());
1231
1232        }
1233
1234        private void initMaps() {
1235
1236
1237                if (structAsyms == null || structAsyms.isEmpty()) {
1238                        logger.info("No _struct_asym category found in file. No asym id to entity_id mapping will be available");
1239                        return;
1240                }
1241
1242                Map<String, List<String>> entityId2asymId = new HashMap<>();
1243
1244                for (StructAsym asym : structAsyms) {
1245
1246                        logger.debug("Entity {} matches asym_id: {}", asym.getEntity_id(), asym.getId() );
1247
1248                        asymId2entityId.put(asym.getId(), asym.getEntity_id());
1249
1250                        if (entityId2asymId.containsKey(asym.getEntity_id())) {
1251                                List<String> asymIds = entityId2asymId.get(asym.getEntity_id());
1252                                asymIds.add(asym.getId());
1253                        } else {
1254                                List<String> asymIds = new ArrayList<>();
1255                                asymIds.add(asym.getId());
1256                                entityId2asymId.put(asym.getEntity_id(), asymIds);
1257                        }
1258                }
1259
1260                if (entityPolys==null || entityPolys.isEmpty()) {
1261                        logger.info("No _entity_poly category found in file. No asym id to author id mapping will be available for header only parsing");
1262                        return;
1263                }
1264
1265                for (EntityPoly ep:entityPolys) {
1266                        if (ep.getPdbx_strand_id()==null) {
1267                                logger.info("_entity_poly.pdbx_strand_id is null for entity {}. Won't be able to map asym ids to author ids for this entity.", ep.getEntity_id());
1268                                continue;
1269                        }
1270                        String[] chainNames = ep.getPdbx_strand_id().split(",");
1271                        List<String> asymIds = entityId2asymId.get(ep.getEntity_id());
1272                        if (chainNames.length!=asymIds.size()) {
1273                                logger.warn("The list of asym ids (from _struct_asym) and the list of author ids (from _entity_poly) for entity {} have different lengths! Can't provide a mapping from asym ids to author chain ids", ep.getEntity_id());
1274                                continue;
1275                        }
1276                        for (int i=0; i<chainNames.length; i++) {
1277                                asymId2authorId.put(asymIds.get(i), chainNames[i]);
1278                        }
1279                }
1280        }
1281        
1282        private void setStructNcsOps() {
1283                
1284                ArrayList<Matrix4d> ncsOperators = new ArrayList<Matrix4d>();
1285                
1286                for (StructNcsOper sNcsOper:structNcsOper) {
1287                        
1288                        if (!sNcsOper.getCode().equals("generate")) continue;
1289                        
1290                        try {
1291                                Matrix4d op = new Matrix4d();
1292                                op.setElement(3, 0, 0.0);
1293                                op.setElement(3, 1, 0.0);
1294                                op.setElement(3, 2, 0.0);
1295                                op.setElement(3, 3, 1.0);
1296
1297
1298                                op.setElement(0, 0, Double.parseDouble(sNcsOper.getMatrix11()));
1299                                op.setElement(0, 1, Double.parseDouble(sNcsOper.getMatrix12()));
1300                                op.setElement(0, 2, Double.parseDouble(sNcsOper.getMatrix13()));
1301
1302                                op.setElement(1, 0, Double.parseDouble(sNcsOper.getMatrix21()));
1303                                op.setElement(1, 1, Double.parseDouble(sNcsOper.getMatrix22()));
1304                                op.setElement(1, 2, Double.parseDouble(sNcsOper.getMatrix23()));
1305
1306                                op.setElement(2, 0, Double.parseDouble(sNcsOper.getMatrix31()));
1307                                op.setElement(2, 1, Double.parseDouble(sNcsOper.getMatrix32()));
1308                                op.setElement(2, 2, Double.parseDouble(sNcsOper.getMatrix33()));
1309
1310                                op.setElement(0, 3, Double.parseDouble(sNcsOper.getVector1()));
1311                                op.setElement(1, 3, Double.parseDouble(sNcsOper.getVector2()));
1312                                op.setElement(2, 3, Double.parseDouble(sNcsOper.getVector3()));
1313
1314                                ncsOperators.add(op);
1315                                
1316                        } catch (NumberFormatException e) {
1317                                logger.warn("Error parsing doubles in NCS operator list, skipping operator {}", structNcsOper.indexOf(sNcsOper)+1); 
1318                        }
1319
1320                }
1321                
1322                // we only set it if not empty, otherwise remains null
1323                if (ncsOperators.size()>0) {
1324                        structure.getCrystallographicInfo().setNcsOperators(
1325                                        ncsOperators.toArray(new Matrix4d[ncsOperators.size()]));
1326                }
1327        }
1328        
1329        private void setCrystallographicInfoMetadata() {
1330                if (parsedScaleMatrix!=null) {
1331                        
1332                        PDBCrystallographicInfo crystalInfo = structure.getCrystallographicInfo();
1333                        
1334                        boolean nonStd = false;
1335                        if (crystalInfo.getCrystalCell()!=null && !crystalInfo.getCrystalCell().checkScaleMatrix(parsedScaleMatrix)) {
1336                                nonStd = true;
1337                        }
1338                        
1339                        crystalInfo.setNonStandardCoordFrameConvention(nonStd); 
1340                }
1341        }
1342
1343
1344        /** This method will return the parsed protein structure, once the parsing has been finished
1345         *
1346         * @return a BioJava protein structure object
1347         */
1348        public Structure getStructure() {
1349
1350                return structure;
1351        }
1352
1353        @Override
1354        public void newDatabasePDBrevRecord(DatabasePdbrevRecord record) {
1355
1356                PDBHeader header = structure.getPDBHeader();
1357
1358                if ( header == null) {
1359                        header = new PDBHeader();
1360                        structure.setPDBHeader(header);
1361                }
1362
1363                List<DatabasePdbrevRecord> revRecords = header.getRevisionRecords();
1364                if ( revRecords == null) {
1365                        revRecords = new ArrayList<DatabasePdbrevRecord>();
1366                        header.setRevisionRecords(revRecords);
1367                }
1368                revRecords.add(record);
1369
1370
1371        }
1372
1373
1374        @Override
1375        public void newDatabasePDBrev(DatabasePDBrev dbrev) {
1376                
1377                logger.debug("got a database revision:" + dbrev);
1378                
1379                SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd",Locale.US);
1380                PDBHeader header = structure.getPDBHeader();
1381
1382                if ( header == null) {
1383                        header = new PDBHeader();
1384                }
1385
1386                if (dbrev.getNum().equals("1")){
1387
1388                        try {
1389                                Date dep = dateFormat.parse(dbrev.getDate_original());
1390                                header.setDepDate(dep);
1391
1392                        } catch (ParseException e){
1393                                logger.warn("Could not parse date string '{}', deposition date will be unavailable", dbrev.getDate_original());
1394                        }
1395
1396                        try {
1397                                Date rel = dateFormat.parse(dbrev.getDate());
1398                                header.setRelDate(rel);
1399
1400                        } catch (ParseException e){
1401                                logger.warn("Could not parse date string '{}', modification date will be unavailable", dbrev.getDate());
1402                        }
1403
1404
1405                } else {
1406                        try {
1407
1408                                Date mod = dateFormat.parse(dbrev.getDate());
1409                                header.setModDate(mod);
1410
1411                        } catch (ParseException e){
1412                                logger.warn("Could not parse date string '{}', modification date will be unavailable", dbrev.getDate());
1413                        }
1414                }
1415
1416                structure.setPDBHeader(header);
1417        }
1418        
1419        @Override
1420        public void newPdbxAuditRevisionHistory(PdbxAuditRevisionHistory history) {
1421                
1422                SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd",Locale.US);
1423                PDBHeader header = structure.getPDBHeader();
1424
1425                if ( header == null) {
1426                        header = new PDBHeader();
1427                }
1428
1429        // first entry in revision history is the release date
1430                if (history.getOrdinal().equals("1")){
1431                        try {
1432                                Date releaseDate = dateFormat.parse(history.getRevision_date());
1433                                header.setRelDate(releaseDate);
1434                                
1435                        } catch (ParseException e){
1436                                logger.warn("Could not parse date string '{}', release date will be unavailable", history.getRevision_date());
1437                        }
1438                } else {
1439                        // all other dates are revision dates;
1440                        // since this method may be called multiple times,
1441                        // the last revision date will "stick"
1442                        try {
1443                                Date revisionDate = dateFormat.parse(history.getRevision_date());
1444                                header.setModDate(revisionDate);
1445                        } catch (ParseException e){
1446                                logger.warn("Could not parse date string '{}', revision date will be unavailable", history.getRevision_date());
1447                        }
1448                }
1449
1450                structure.setPDBHeader(header);
1451        }
1452        
1453        @Override
1454        public void newPdbxDatabaseStatus(PdbxDatabaseStatus status) {
1455
1456                // the deposition date field is only available in mmCIF 5.0
1457
1458                if (status.getRecvd_initial_deposition_date() == null) {
1459                        // skip this method for older mmCIF versions
1460                        return;
1461                }
1462                
1463                SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd",Locale.US);
1464                PDBHeader header = structure.getPDBHeader();
1465
1466                if (header == null) {
1467                        header = new PDBHeader();
1468                }
1469
1470                try {
1471                        Date depositionDate = dateFormat.parse(status.getRecvd_initial_deposition_date());
1472                        header.setDepDate(depositionDate);
1473                } catch (ParseException e){
1474                        logger.warn("Could not parse date string '{}', deposition date will be unavailable", status.getRecvd_initial_deposition_date());
1475                }
1476
1477                structure.setPDBHeader(header);
1478        }
1479
1480        @Override
1481        public void newDatabasePDBremark(DatabasePDBremark remark) {
1482                //System.out.println(remark);
1483                String id = remark.getId();
1484                if (id.equals("2")){
1485
1486                        //this remark field contains the resolution information:
1487                        String line = remark.getText();
1488
1489                        int i = line.indexOf("ANGSTROM");
1490                        if ( i > 5) {
1491                                // line contains ANGSTROM info...
1492                                String resolution = line.substring(i-5,i).trim();
1493                                // convert string to float
1494                                float res = 99 ;
1495                                try {
1496                                        res = Float.parseFloat(resolution);
1497
1498                                } catch (NumberFormatException e) {
1499                                        logger.info("could not parse resolution from line and ignoring it " + line);
1500                                        return ;
1501
1502
1503                                }
1504                                // support for old style header
1505
1506                                PDBHeader pdbHeader = structure.getPDBHeader();
1507                                pdbHeader.setResolution(res);
1508
1509                        }
1510
1511                }
1512        }
1513
1514        @Override
1515        public void newRefine(Refine r){
1516
1517                PDBHeader pdbHeader = structure.getPDBHeader();
1518                // RESOLUTION
1519                // in very rare cases (for instance hybrid methods x-ray + neutron diffraction, e.g. 3ins, 4n9m)
1520                // there are 2 resolution values, one for each method
1521                // we take the last one found so that behaviour is like in PDB file parsing
1522                if (pdbHeader.getResolution()!=PDBHeader.DEFAULT_RESOLUTION) {
1523                        logger.warn("More than 1 resolution value present, will use last one {} and discard previous {} "
1524                                        ,r.getLs_d_res_high(), String.format("%4.2f",pdbHeader.getResolution()));
1525                }
1526                try {
1527                        pdbHeader.setResolution(Float.parseFloat(r.getLs_d_res_high()));
1528                } catch (NumberFormatException e){
1529                        logger.info("Could not parse resolution from " + r.getLs_d_res_high() + " " + e.getMessage());
1530                }
1531
1532
1533                // RFREE
1534                if (pdbHeader.getRfree()!=PDBHeader.DEFAULT_RFREE) {
1535                        logger.warn("More than 1 Rfree value present, will use last one {} and discard previous {} ",
1536                                        r.getLs_R_factor_R_free(), String.format("%4.2f",pdbHeader.getRfree()));
1537                }
1538                if (r.getLs_R_factor_R_free()==null) {
1539                        // some entries like 2ifo haven't got this field at all
1540                        logger.info("_refine.ls_R_factor_R_free not present, not parsing Rfree value");
1541                } else {
1542                        try {
1543                                pdbHeader.setRfree(Float.parseFloat(r.getLs_R_factor_R_free()));
1544                        } catch (NumberFormatException e){
1545                                // no rfree present ('?') is very usual, that's why we set it to debug
1546                                logger.debug("Could not parse Rfree from string '{}'", r.getLs_R_factor_R_free());
1547                        }
1548                }
1549
1550                // RWORK
1551                if(pdbHeader.getRwork()!=PDBHeader.DEFAULT_RFREE) {
1552                        logger.warn("More than 1 R work value present, will use last one {} and discard previous {} ",
1553                                        r.getLs_R_factor_R_work(), String.format("%4.2f",pdbHeader.getRwork()));
1554                }
1555                if(r.getLs_R_factor_R_work()==null){
1556                        logger.info("_refine.ls_R_factor_R_work not present, not parsing R-work value");
1557                }
1558                else{
1559                        try{
1560                                pdbHeader.setRwork(Float.parseFloat(r.getLs_R_factor_R_work()));
1561                        }
1562                        catch (NumberFormatException e){
1563                                logger.debug("Could not parse R-work from string '{}'", r.getLs_R_factor_R_work());
1564                        }
1565
1566                }
1567
1568        }
1569
1570
1571        @Override
1572        public void newAuditAuthor(AuditAuthor aa){
1573
1574                String name =  aa.getName();
1575
1576                StringBuffer famName = new StringBuffer();
1577                StringBuffer initials = new StringBuffer();
1578                boolean afterComma = false;
1579                for ( char c: name.toCharArray()) {
1580                        if ( c == ' ')
1581                                continue;
1582                        if ( c == ','){
1583                                afterComma = true;
1584                                continue;
1585                        }
1586
1587                        if ( afterComma)
1588                                initials.append(c);
1589                        else
1590                                famName.append(c);
1591                }
1592
1593                StringBuffer newaa = new StringBuffer();
1594                newaa.append(initials);
1595                newaa.append(famName);
1596
1597                PDBHeader header = structure.getPDBHeader();
1598                String auth = header.getAuthors();
1599                if (auth == null) {
1600                        header.setAuthors(newaa.toString());
1601                }else {
1602                        auth += "," + newaa.toString();
1603                        header.setAuthors(auth);
1604
1605                }
1606        }
1607
1608        @Override
1609        public void newExptl(Exptl exptl) {
1610
1611                PDBHeader pdbHeader = structure.getPDBHeader();
1612                String method = exptl.getMethod();
1613                pdbHeader.setExperimentalTechnique(method);
1614
1615        }
1616
1617        @Override
1618        public void newCell(Cell cell) {
1619
1620                try {
1621                        float a = Float.parseFloat(cell.getLength_a());
1622                        float b = Float.parseFloat(cell.getLength_b());
1623                        float c = Float.parseFloat(cell.getLength_c());
1624                        float alpha = Float.parseFloat(cell.getAngle_alpha());
1625                        float beta = Float.parseFloat(cell.getAngle_beta());
1626                        float gamma = Float.parseFloat(cell.getAngle_gamma());
1627
1628                        CrystalCell xtalCell = new CrystalCell();
1629                        xtalCell.setA(a);
1630                        xtalCell.setB(b);
1631                        xtalCell.setC(c);
1632                        xtalCell.setAlpha(alpha);
1633                        xtalCell.setBeta(beta);
1634                        xtalCell.setGamma(gamma);
1635
1636                        if (!xtalCell.isCellReasonable()) {
1637                                // If the entry describes a structure determined by a technique other than X-ray crystallography,
1638                                // cell is (sometimes!) a = b = c = 1.0, alpha = beta = gamma = 90 degrees
1639                                // if so we don't add and CrystalCell will be null
1640                                logger.debug("The crystal cell read from file does not have reasonable dimensions (at least one dimension is below {}), discarding it.",
1641                                                CrystalCell.MIN_VALID_CELL_SIZE);
1642                                return;
1643                        }
1644
1645                        structure.getPDBHeader().getCrystallographicInfo().setCrystalCell(xtalCell);
1646
1647                } catch (NumberFormatException e){
1648                        structure.getPDBHeader().getCrystallographicInfo().setCrystalCell(null);
1649                        logger.info("could not parse some cell parameters ("+e.getMessage()+"), ignoring _cell ");
1650                }
1651        }
1652
1653        @Override
1654        public void newSymmetry(Symmetry symmetry) {
1655                String spaceGroup = symmetry.getSpace_group_name_H_M();
1656                SpaceGroup sg = SymoplibParser.getSpaceGroup(spaceGroup);
1657                if (sg==null) {
1658                        logger.warn("Space group '"+spaceGroup+"' not recognised as a standard space group");
1659                        structure.getPDBHeader().getCrystallographicInfo().setNonStandardSg(true);
1660                } else {
1661                        structure.getPDBHeader().getCrystallographicInfo().setSpaceGroup(sg);
1662                        structure.getPDBHeader().getCrystallographicInfo().setNonStandardSg(false);
1663                }
1664        }
1665
1666        @Override
1667        public void newStructNcsOper(StructNcsOper sNcsOper) {
1668                structNcsOper.add(sNcsOper);
1669        }
1670        
1671        public void newAtomSites(AtomSites atomSites) {
1672                
1673                try {
1674                        Matrix4d m = new Matrix4d(
1675                                Double.parseDouble(atomSites.getFract_transf_matrix11()), Double.parseDouble(atomSites.getFract_transf_matrix12()), Double.parseDouble(atomSites.getFract_transf_matrix13()), Double.parseDouble(atomSites.getFract_transf_vector1()),
1676                                Double.parseDouble(atomSites.getFract_transf_matrix21()), Double.parseDouble(atomSites.getFract_transf_matrix22()), Double.parseDouble(atomSites.getFract_transf_matrix23()), Double.parseDouble(atomSites.getFract_transf_vector2()),
1677                                Double.parseDouble(atomSites.getFract_transf_matrix31()), Double.parseDouble(atomSites.getFract_transf_matrix32()), Double.parseDouble(atomSites.getFract_transf_matrix33()), Double.parseDouble(atomSites.getFract_transf_vector3()),
1678                                0,0,0,1);
1679
1680                        parsedScaleMatrix = m;
1681                
1682                } catch (NumberFormatException e) {
1683                        logger.warn("Some values in _atom_sites.fract_transf_matrix or _atom_sites.fract_transf_vector could not be parsed as numbers. Can't check whether coordinate frame convention is correct! Error: {}", e.getMessage());
1684                        structure.getPDBHeader().getCrystallographicInfo().setNonStandardCoordFrameConvention(false);
1685                        
1686                        // in this case parsedScaleMatrix stays null and can't be used in documentEnd()
1687                }
1688        }
1689
1690        @Override
1691        public void newStructRef(StructRef sref) {
1692                logger.debug(sref.toString());
1693                strucRefs.add(sref);
1694        }
1695
1696        private StructRef getStructRef(String ref_id){
1697                for (StructRef structRef : strucRefs) {
1698
1699                        if (structRef.getId().equals(ref_id)){
1700                                return structRef;
1701                        }
1702
1703                }
1704                return null;
1705
1706        }
1707
1708        /**
1709         * create a DBRef record from the StrucRefSeq record:
1710         * <pre>
1711         * PDB record                    DBREF
1712         * Field Name                    mmCIF Data Item
1713         * Section                       n.a.
1714         * PDB_ID_Code                   _struct_ref_seq.pdbx_PDB_id_code
1715         * Strand_ID                     _struct_ref_seq.pdbx_strand_id
1716         * Begin_Residue_Number          _struct_ref_seq.pdbx_auth_seq_align_beg
1717         * Begin_Ins_Code                _struct_ref_seq.pdbx_seq_align_beg_ins_code
1718         * End_Residue_Number            _struct_ref_seq.pdbx_auth_seq_align_end
1719         * End_Ins_Code                  _struct_ref_seq.pdbx_seq_align_end_ins_code
1720         * Database                      _struct_ref.db_name
1721         * Database_Accession_No         _struct_ref_seq.pdbx_db_accession
1722         * Database_ID_Code              _struct_ref.db_code
1723         * Database_Begin_Residue_Number _struct_ref_seq.db_align_beg
1724         * Databaes_Begin_Ins_Code       _struct_ref_seq.pdbx_db_align_beg_ins_code
1725         * Database_End_Residue_Number   _struct_ref_seq.db_align_end
1726         * Databaes_End_Ins_Code         _struct_ref_seq.pdbx_db_align_end_ins_code
1727         * </pre>
1728         *
1729         *
1730         */
1731        @Override
1732        public void newStructRefSeq(StructRefSeq sref) {
1733                DBRef r = new DBRef();
1734
1735                r.setIdCode(sref.getPdbx_PDB_id_code());
1736                r.setDbAccession(sref.getPdbx_db_accession());
1737                r.setDbIdCode(sref.getPdbx_db_accession());
1738
1739                r.setChainName(sref.getPdbx_strand_id());
1740                StructRef structRef = getStructRef(sref.getRef_id());
1741                if (structRef == null){
1742                        logger.info("could not find StructRef " + sref.getRef_id() + " for StructRefSeq " + sref);
1743                } else {
1744                        r.setDatabase(structRef.getDb_name());
1745                        r.setDbIdCode(structRef.getDb_code());
1746                }
1747
1748                int seqbegin;
1749                int seqend;
1750                try{
1751                        seqbegin = Integer.parseInt(sref.getPdbx_auth_seq_align_beg());
1752                        seqend   = Integer.parseInt(sref.getPdbx_auth_seq_align_end());
1753                }
1754                catch(NumberFormatException e){
1755                        // this happens in a few entries, annotation error? e.g. 6eoj
1756                        logger.warn("Couldn't parse pdbx_auth_seq_align_beg/end in _struct_ref_seq. Will not store dbref alignment info for accession {}. Error: {}", r.getDbAccession(), e.getMessage());
1757                        return;
1758                }
1759                
1760                Character begin_ins_code = ' ';
1761                if (sref.getPdbx_seq_align_beg_ins_code() != null ) {
1762                    begin_ins_code = new Character(sref.getPdbx_seq_align_beg_ins_code().charAt(0));
1763                }
1764                
1765                Character end_ins_code = ' ';
1766                if (sref.getPdbx_seq_align_end_ins_code() != null) {
1767                    end_ins_code = new Character(sref.getPdbx_seq_align_end_ins_code().charAt(0));
1768                }
1769
1770                if (begin_ins_code == '?')
1771                        begin_ins_code = ' ';
1772
1773                if (end_ins_code == '?')
1774                        end_ins_code = ' ';
1775
1776                r.setSeqBegin(seqbegin);
1777                r.setInsertBegin(begin_ins_code);
1778
1779                r.setSeqEnd(seqend);
1780                r.setInsertEnd(end_ins_code);
1781
1782                int dbseqbegin = Integer.parseInt(sref.getDb_align_beg());
1783                int dbseqend   = Integer.parseInt(sref.getDb_align_end());
1784                
1785                Character db_begin_in_code = ' ';
1786                if (sref.getPdbx_db_align_beg_ins_code() != null) {
1787                    db_begin_in_code = new Character(sref.getPdbx_db_align_beg_ins_code().charAt(0));
1788                }
1789                
1790                Character db_end_in_code = ' ';
1791                if (sref.getPdbx_db_align_end_ins_code() != null) {
1792                    db_end_in_code = new Character(sref.getPdbx_db_align_end_ins_code().charAt(0));
1793                }
1794
1795                if (db_begin_in_code == '?')
1796                        db_begin_in_code = ' ';
1797
1798                if (db_end_in_code == '?')
1799                        db_end_in_code = ' ';
1800
1801
1802                r.setDbSeqBegin(dbseqbegin);
1803                r.setIdbnsBegin(db_begin_in_code);
1804
1805                r.setDbSeqEnd(dbseqend);
1806                r.setIdbnsEnd(db_end_in_code);
1807
1808                List<DBRef> dbrefs = structure.getDBRefs();
1809                if ( dbrefs == null)
1810                        dbrefs = new ArrayList<DBRef>();
1811                dbrefs.add(r);
1812
1813                logger.debug(r.toPDB());
1814
1815                structure.setDBRefs(dbrefs);
1816
1817        }
1818
1819        @Override
1820        public void newStructRefSeqDif(StructRefSeqDif sref) {
1821                sequenceDifs.add(sref);
1822        }
1823
1824        private Chain getEntityChain(String entity_id){
1825
1826                for (Chain chain : entityChains) {
1827                        if ( chain.getId().equals(entity_id)){
1828
1829                                return chain;
1830                        }
1831                }
1832                // does not exist yet, so create...
1833
1834                Chain   chain = new ChainImpl();
1835                chain.setId(entity_id);
1836                entityChains.add(chain);
1837
1838                return chain;
1839
1840        }
1841
1842        //private Chain getSeqResChain(String chainID){
1843        //      return getChainFromList(seqResChains, chainID);
1844        //}
1845
1846
1847        /**
1848         * Data items in the ENTITY_SRC_GEN category record details of
1849         * the source from which the entity was obtained in cases
1850         * where the source was genetically manipulated.  The
1851         * following are treated separately:  items pertaining to the tissue
1852         * from which the gene was obtained, items pertaining to the host
1853         * organism for gene expression and items pertaining to the actual
1854         * producing organism (plasmid).
1855         */
1856        @Override
1857        public void newEntitySrcGen(EntitySrcGen entitySrcGen){
1858
1859                // add to internal list. Map to Compound object later on...
1860                entitySrcGens.add(entitySrcGen);
1861        }
1862
1863        @Override
1864        public void newEntitySrcNat(EntitySrcNat entitySrcNat){
1865
1866                // add to internal list. Map to Compound object later on...
1867                entitySrcNats.add(entitySrcNat);
1868        }
1869
1870        @Override
1871        public void newEntitySrcSyn(EntitySrcSyn entitySrcSyn){
1872
1873                // add to internal list. Map to Compound object later on...
1874                entitySrcSyns.add(entitySrcSyn);
1875        }
1876
1877        /**
1878         * The EntityPolySeq object provide the amino acid sequence objects for the Entities.
1879         * Later on the entities are mapped to the BioJava {@link Chain} and {@link EntityInfo} objects.
1880         * @param epolseq the EntityPolySeq record for one amino acid
1881         */
1882        @Override
1883        public void newEntityPolySeq(EntityPolySeq epolseq) {
1884
1885                logger.debug("NEW entity poly seq " + epolseq);
1886
1887                int eId = -1;
1888                try {
1889                        eId = Integer.parseInt(epolseq.getEntity_id());
1890                } catch (NumberFormatException e) {
1891                        logger.warn("Could not parse entity id from EntityPolySeq: "+e.getMessage());
1892                }
1893                Entity e = getEntity(eId);
1894
1895                if (e == null){
1896                        logger.info("Could not find entity "+ epolseq.getEntity_id()+". Can not match sequence to it.");
1897                        return;
1898                }
1899
1900                Chain entityChain = getEntityChain(epolseq.getEntity_id());
1901
1902                // first we check through the chemcomp provider, if it fails we do some heuristics to guess the type of group
1903                // TODO some of this code is analogous to getNewGroup() and we should try to unify them - JD 2016-03-08
1904
1905                Group g = ChemCompGroupFactory.getGroupFromChemCompDictionary(epolseq.getMon_id());
1906                //int seqId = Integer.parseInt(epolseq.getNum());
1907                if ( g != null && !g.getChemComp().isEmpty()) {
1908                        if ( g instanceof AminoAcidImpl) {
1909                                AminoAcidImpl aa = (AminoAcidImpl) g;
1910                                aa.setRecordType(AminoAcid.SEQRESRECORD);
1911                                //aa.setId(seqId);
1912                        }
1913                } else {
1914
1915                        if (epolseq.getMon_id().length()==3 && StructureTools.get1LetterCodeAmino(epolseq.getMon_id())!=null){
1916                                AminoAcidImpl a = new AminoAcidImpl();
1917                                a.setRecordType(AminoAcid.SEQRESRECORD);
1918                                Character code1 = StructureTools.get1LetterCodeAmino(epolseq.getMon_id());
1919                                a.setAminoType(code1);
1920                                g = a;
1921
1922                        } else if ( StructureTools.isNucleotide(epolseq.getMon_id())) {
1923                                // the group is actually a nucleotide group...
1924                                NucleotideImpl n = new NucleotideImpl();
1925                                g = n;
1926
1927                        } else {
1928                                logger.debug("Residue {} {} is not a standard aminoacid or nucleotide, will create a het group for it", epolseq.getNum(),epolseq.getMon_id());
1929                                HetatomImpl h = new HetatomImpl();
1930                                g = h;
1931
1932                        }
1933
1934
1935                }
1936                // at this stage we don't know about author residue numbers (insertion codes)
1937                // we abuse now the ResidueNumber field setting the internal residue numbers (label_seq_id, strictly sequential and follow the seqres sequence 1 to n)
1938                // later the actual ResidueNumbers (author residue numbers) have to be corrected in alignSeqRes()
1939                g.setResidueNumber(ResidueNumber.fromString(epolseq.getNum()));
1940
1941                g.setPDBName(epolseq.getMon_id());
1942
1943                entityChain.addGroup(g);
1944
1945        }
1946
1947        @Override
1948        public void newPdbxPolySeqScheme(PdbxPolySeqScheme ppss) {
1949
1950                //if ( headerOnly)
1951                //      return;
1952
1953                // replace the group asym ids with the real PDB ids!
1954                // replaceGroupSeqPos(ppss);  // This might be incorrect in some pdb, to use auth_seq_id of the pdbx_poly_seq_scheme.
1955
1956
1957        }
1958
1959
1960        @Override
1961        public void newPdbxNonPolyScheme(PdbxNonPolyScheme ppss) {
1962
1963                //if (headerOnly)
1964                //      return;
1965
1966                // merge the EntityPolySeq info and the AtomSite chains into one...
1967                //already known ignore:
1968
1969        }
1970
1971        @Override
1972        public void newPdbxEntityNonPoly(PdbxEntityNonPoly pen){
1973                // TODO: do something with them...
1974                // not implemented yet...
1975                logger.debug(pen.getEntity_id() + " " + pen.getName() + " " + pen.getComp_id());
1976
1977        }
1978
1979        @Override
1980        public void newChemComp(ChemComp c) {
1981                // TODO: do something with them...
1982
1983        }
1984
1985        @Override
1986        public void newGenericData(String category, List<String> loopFields,
1987                        List<String> lineData) {
1988
1989                //logger.debug("unhandled category so far: " + category);
1990        }
1991
1992        @Override
1993        public FileParsingParameters getFileParsingParameters()
1994        {
1995                return params;
1996        }
1997
1998        @Override
1999        public void setFileParsingParameters(FileParsingParameters params)
2000        {
2001                this.params = params;
2002
2003        }
2004
2005        @Override
2006        public void newChemCompDescriptor(ChemCompDescriptor ccd) {
2007
2008                // TODO nothing happening here yet.
2009
2010        }
2011
2012
2013
2014        public List<PdbxStructOperList> getStructOpers() {
2015                return structOpers;
2016        }
2017
2018        @Override
2019        public void newPdbxStrucAssembly(PdbxStructAssembly strucAssembly) {
2020                strucAssemblies.add(strucAssembly);
2021
2022        }
2023
2024        public List<PdbxStructAssembly> getStructAssemblies(){
2025                return strucAssemblies;
2026        }
2027
2028        @Override
2029        public void newPdbxStrucAssemblyGen(PdbxStructAssemblyGen strucAssembly) {
2030                strucAssemblyGens.add(strucAssembly);
2031
2032        }
2033
2034        public List<PdbxStructAssemblyGen> getStructAssemblyGens(){
2035                return strucAssemblyGens;
2036        }
2037
2038        @Override
2039        public void newChemCompAtom(ChemCompAtom atom) {
2040
2041        }
2042
2043        @Override
2044        public void newPdbxChemCompIndentifier(PdbxChemCompIdentifier id) {
2045
2046        }
2047
2048        @Override
2049        public void newChemCompBond(ChemCompBond bond) {
2050
2051        }
2052
2053        @Override
2054        public void newPdbxChemCompDescriptor(PdbxChemCompDescriptor desc) {
2055
2056        }
2057
2058        @Override
2059        public void newStructConn(StructConn structConn) {
2060                this.structConn.add(structConn);
2061        }
2062
2063        @Override
2064        public void newStructSiteGen(StructSiteGen siteGen) { this.structSiteGens.add(siteGen); }
2065
2066        @Override
2067        public void newStructSite(StructSite structSite) {
2068
2069                if (params.isHeaderOnly()) {
2070                        return;
2071                }
2072
2073                // Simply implement the method.
2074                List<Site> sites = structure.getSites();
2075                if (sites == null) sites = new ArrayList<Site>();
2076
2077                Site site = null;
2078                for (Site asite : sites) {
2079                        if (asite.getSiteID().equals(structSite.getId())) {
2080                                site = asite;           // Prevent duplicate siteIds
2081                        }
2082                }
2083                boolean addSite = false;
2084                if (site == null) { site = new Site(); addSite = true; }
2085                site.setSiteID(structSite.getId());
2086                site.setDescription(structSite.getDetails());
2087                // site.setPdbxEvidenceCode(structSite.getPdbxEvidenceCode()); // TODO - add addition fields in Sites
2088                if (addSite) sites.add(site);
2089
2090                structure.setSites(sites);
2091        }
2092
2093        /**
2094         * Build sites in a BioJava Structure using the original author chain id & residue numbers.
2095         * Sites are built from struct_site_gen records that have been parsed.
2096         */
2097        private void addSites() {
2098                List<Site> sites = structure.getSites();
2099                if (sites == null) sites = new ArrayList<Site>();
2100
2101                for (StructSiteGen siteGen : structSiteGens) {
2102                        // For each StructSiteGen, find the residues involved, if they exist then
2103                        String site_id = siteGen.getSite_id(); // multiple could be in same site.
2104                        if (site_id == null) site_id = "";
2105                        String comp_id = siteGen.getLabel_comp_id();  // PDBName
2106
2107                        // Assumption: the author chain ID and residue number for the site is consistent with the original
2108                        // author chain id and residue numbers.
2109
2110                        String asymId = siteGen.getLabel_asym_id(); // chain name
2111                        String authId = siteGen.getAuth_asym_id(); // chain Id
2112                        String auth_seq_id = siteGen.getAuth_seq_id(); // Res num
2113
2114                        String insCode = siteGen.getPdbx_auth_ins_code();
2115                        if ( insCode != null && insCode.equals("?"))
2116                                insCode = null;
2117
2118                        // Look for asymID = chainID and seqID = seq_ID.  Check that comp_id matches the resname.
2119                        Group g = null;
2120                        try {
2121                                Chain chain = structure.getChain(asymId);
2122
2123                                if (null != chain) {
2124                                        try {
2125                                                Character insChar = null;
2126                                                if (null != insCode && insCode.length() > 0) insChar = insCode.charAt(0);
2127                                                g = chain.getGroupByPDB(new ResidueNumber(null, Integer.parseInt(auth_seq_id), insChar));
2128                                        } catch (NumberFormatException e) {
2129                                                logger.warn("Could not lookup residue : " + authId + auth_seq_id);
2130                                        }
2131                                }
2132                        } catch (StructureException e) {
2133                                logger.warn("Problem finding residue in site entry " + siteGen.getSite_id() + " - " + e.getMessage(), e.getMessage());
2134                        }
2135
2136                        if (g != null) {
2137                                // 2. find the site_id, if not existing, create anew.
2138                                Site site = null;
2139                                for (Site asite: sites) {
2140                                        if (site_id.equals(asite.getSiteID())) site = asite;
2141                                }
2142
2143                                boolean addSite = false;
2144
2145                                // 3. add this residue to the site.
2146                                if (site == null) {
2147                                        addSite = true;
2148                                        site = new Site();
2149                                        site.setSiteID(site_id);
2150                                }
2151
2152                                List<Group> groups = site.getGroups();
2153                                if (groups == null) groups = new ArrayList<Group>();
2154
2155                                // Check the self-consistency of the residue reference from auth_seq_id and chain_id
2156                                if (!comp_id.equals(g.getPDBName())) {
2157                                        logger.warn("comp_id doesn't match the residue at " + authId + " " + auth_seq_id + " - skipping");
2158                                } else {
2159                                        groups.add(g);
2160                                        site.setGroups(groups);
2161                                }
2162                                if (addSite) sites.add(site);
2163                        }
2164                }
2165                structure.setSites(sites);
2166        }
2167}