001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 * created at Apr 26, 2008
021 */
022package org.biojava.nbio.structure.io.mmcif;
023
024import java.text.ParseException;
025import java.text.SimpleDateFormat;
026import java.util.ArrayList;
027import java.util.Date;
028import java.util.HashMap;
029import java.util.List;
030import java.util.Locale;
031import java.util.Map;
032
033import javax.vecmath.Matrix4d;
034
035import org.biojava.nbio.structure.AminoAcid;
036import org.biojava.nbio.structure.AminoAcidImpl;
037import org.biojava.nbio.structure.Atom;
038import org.biojava.nbio.structure.AtomImpl;
039import org.biojava.nbio.structure.Chain;
040import org.biojava.nbio.structure.ChainImpl;
041import org.biojava.nbio.structure.Compound;
042import org.biojava.nbio.structure.DBRef;
043import org.biojava.nbio.structure.Element;
044import org.biojava.nbio.structure.Group;
045import org.biojava.nbio.structure.GroupType;
046import org.biojava.nbio.structure.HetatomImpl;
047import org.biojava.nbio.structure.NucleotideImpl;
048import org.biojava.nbio.structure.PDBCrystallographicInfo;
049import org.biojava.nbio.structure.PDBHeader;
050import org.biojava.nbio.structure.ResidueNumber;
051import org.biojava.nbio.structure.SeqMisMatch;
052import org.biojava.nbio.structure.SeqMisMatchImpl;
053import org.biojava.nbio.structure.Site;
054import org.biojava.nbio.structure.Structure;
055import org.biojava.nbio.structure.StructureException;
056import org.biojava.nbio.structure.StructureImpl;
057import org.biojava.nbio.structure.StructureTools;
058import org.biojava.nbio.structure.io.BondMaker;
059import org.biojava.nbio.structure.io.ChargeAdder;
060import org.biojava.nbio.structure.io.FileParsingParameters;
061import org.biojava.nbio.structure.io.SeqRes2AtomAligner;
062import org.biojava.nbio.structure.io.mmcif.model.AtomSite;
063import org.biojava.nbio.structure.io.mmcif.model.AtomSites;
064import org.biojava.nbio.structure.io.mmcif.model.AuditAuthor;
065import org.biojava.nbio.structure.io.mmcif.model.Cell;
066import org.biojava.nbio.structure.io.mmcif.model.ChemComp;
067import org.biojava.nbio.structure.io.mmcif.model.ChemCompAtom;
068import org.biojava.nbio.structure.io.mmcif.model.ChemCompBond;
069import org.biojava.nbio.structure.io.mmcif.model.ChemCompDescriptor;
070import org.biojava.nbio.structure.io.mmcif.model.DatabasePDBremark;
071import org.biojava.nbio.structure.io.mmcif.model.DatabasePDBrev;
072import org.biojava.nbio.structure.io.mmcif.model.DatabasePdbrevRecord;
073import org.biojava.nbio.structure.io.mmcif.model.Entity;
074import org.biojava.nbio.structure.io.mmcif.model.EntityPolySeq;
075import org.biojava.nbio.structure.io.mmcif.model.EntitySrcGen;
076import org.biojava.nbio.structure.io.mmcif.model.EntitySrcNat;
077import org.biojava.nbio.structure.io.mmcif.model.EntitySrcSyn;
078import org.biojava.nbio.structure.io.mmcif.model.Exptl;
079import org.biojava.nbio.structure.io.mmcif.model.PdbxChemCompDescriptor;
080import org.biojava.nbio.structure.io.mmcif.model.PdbxChemCompIdentifier;
081import org.biojava.nbio.structure.io.mmcif.model.PdbxEntityNonPoly;
082import org.biojava.nbio.structure.io.mmcif.model.PdbxNonPolyScheme;
083import org.biojava.nbio.structure.io.mmcif.model.PdbxPolySeqScheme;
084import org.biojava.nbio.structure.io.mmcif.model.PdbxStructAssembly;
085import org.biojava.nbio.structure.io.mmcif.model.PdbxStructAssemblyGen;
086import org.biojava.nbio.structure.io.mmcif.model.PdbxStructOperList;
087import org.biojava.nbio.structure.io.mmcif.model.Refine;
088import org.biojava.nbio.structure.io.mmcif.model.Struct;
089import org.biojava.nbio.structure.io.mmcif.model.StructAsym;
090import org.biojava.nbio.structure.io.mmcif.model.StructConn;
091import org.biojava.nbio.structure.io.mmcif.model.StructKeywords;
092import org.biojava.nbio.structure.io.mmcif.model.StructNcsOper;
093import org.biojava.nbio.structure.io.mmcif.model.StructRef;
094import org.biojava.nbio.structure.io.mmcif.model.StructRefSeq;
095import org.biojava.nbio.structure.io.mmcif.model.StructRefSeqDif;
096import org.biojava.nbio.structure.io.mmcif.model.StructSite;
097import org.biojava.nbio.structure.io.mmcif.model.StructSiteGen;
098import org.biojava.nbio.structure.io.mmcif.model.Symmetry;
099import org.biojava.nbio.structure.quaternary.BioAssemblyInfo;
100import org.biojava.nbio.structure.quaternary.BiologicalAssemblyBuilder;
101import org.biojava.nbio.structure.quaternary.BiologicalAssemblyTransformation;
102import org.biojava.nbio.structure.xtal.CrystalCell;
103import org.biojava.nbio.structure.xtal.SpaceGroup;
104import org.biojava.nbio.structure.xtal.SymoplibParser;
105import org.slf4j.Logger;
106import org.slf4j.LoggerFactory;
107
108/**
109 * A MMcifConsumer implementation that builds an in-memory representation of the
110 * content of a mmcif file as a BioJava Structure object.
111 *
112 * @author Andreas Prlic
113 * @since 1.7
114 */
115
116public class SimpleMMcifConsumer implements MMcifConsumer {
117
118        private static final Logger logger = LoggerFactory.getLogger(SimpleMMcifConsumer.class);
119
120        private Structure structure;
121        private Chain current_chain;
122        private Group current_group;
123
124
125        private List<Chain>      current_model;
126        private List<Entity>     entities;
127        private List<StructRef>  strucRefs;
128        private List<Chain>      seqResChains;
129        private List<Chain>      entityChains; // needed to link entities, chains and compounds...
130        private List<StructAsym> structAsyms;  // needed to link entities, chains and compounds...
131        private List<PdbxStructOperList> structOpers ; //
132        private List<PdbxStructAssembly> strucAssemblies;
133        private List<PdbxStructAssemblyGen> strucAssemblyGens;
134        private List<EntitySrcGen> entitySrcGens;
135        private List<EntitySrcNat> entitySrcNats;
136        private List<EntitySrcSyn> entitySrcSyns;
137        private List<StructConn> structConn;
138        private List<StructNcsOper> structNcsOper;
139        private List<StructRefSeqDif> sequenceDifs;
140        private List<StructSiteGen> structSiteGens;
141        
142        private Matrix4d parsedScaleMatrix;
143
144        /**
145         * A map of asym ids (internal chain ids) to strand ids (author chain ids)
146         * extracted from pdbx_poly_seq_scheme/pdbx_non_poly_seq_scheme categories
147         */
148        private Map<String,String> asymStrandId;
149
150        /**
151         * A map of asym ids (internal chain ids) to strand ids (author chain ids)
152         * extracted from the per-atom records of the _atom_site category. Will be used
153         * if no mapping is found in pdbx_poly_seq_scheme/pdbx_non_poly_seq_scheme
154         */
155        private Map<String,String> asymId2StrandIdFromAtomSites;
156
157        /**
158         * A map of asym ids (internal chain ids) to entity ids extracted from
159         * the _struct_asym category
160         */
161        private Map<String,String> asymId2entityId;
162
163        private String current_nmr_model ;
164
165        private FileParsingParameters params;
166
167        public  SimpleMMcifConsumer(){
168                params = new FileParsingParameters();
169                documentStart();
170
171        }
172
173        @Override
174        public void newEntity(Entity entity) {
175                logger.debug("New entity: {}",entity.toString());
176                entities.add(entity);
177        }
178
179        @Override
180        public void newPdbxStructOperList(PdbxStructOperList structOper){
181
182                structOpers.add(structOper);
183        }
184
185        @Override
186        public void newStructAsym(StructAsym sasym){
187
188                structAsyms.add(sasym);
189        }
190
191        private Entity getEntity(int entity_id){
192                try {
193                        for (Entity e: entities){
194                                int eId = Integer.parseInt(e.getId());
195                                if  (eId== entity_id){
196                                        return e;
197                                }
198                        }
199                } catch (NumberFormatException e) {
200                        logger.warn("Entity id does not look like a number:", e.getMessage());
201                }
202                return null;
203        }
204
205        @Override
206        public void newStructKeywords(StructKeywords kw){
207                PDBHeader header = structure.getPDBHeader();
208                if ( header == null)
209                        header = new PDBHeader();
210                header.setDescription(kw.getPdbx_keywords());
211                header.setClassification(kw.getPdbx_keywords());
212        }
213
214        @Override
215        public void setStruct(Struct struct) {
216
217                PDBHeader header = structure.getPDBHeader();
218                if ( header == null)
219                        header = new PDBHeader();
220
221                header.setTitle(struct.getTitle());
222                header.setIdCode(struct.getEntry_id());
223                //header.setDescription(struct.getPdbx_descriptor());
224                //header.setClassification(struct.getPdbx_descriptor());
225                //header.setDescription(struct.getPdbx_descriptor());
226
227
228
229                structure.setPDBHeader(header);
230                structure.setPDBCode(struct.getEntry_id());
231        }
232
233        /** initiate new group, either Hetatom, Nucleotide, or AminoAcid */
234        private Group getNewGroup(String recordName,Character aminoCode1, long seq_id,String groupCode3) {
235
236                Group g = ChemCompGroupFactory.getGroupFromChemCompDictionary(groupCode3);
237                if ( g != null && !g.getChemComp().isEmpty()) {
238                        if ( g instanceof AminoAcidImpl) {
239                                AminoAcidImpl aa = (AminoAcidImpl) g;
240                                aa.setId(seq_id);
241                        } else if ( g instanceof NucleotideImpl) {
242                                NucleotideImpl nuc =  (NucleotideImpl) g;
243                                nuc.setId(seq_id);
244                        } else if ( g instanceof HetatomImpl) {
245                                HetatomImpl het = (HetatomImpl)g;
246                                het.setId(seq_id);
247                        }
248                        return g;
249                }
250
251
252
253                Group group;
254                if ( recordName.equals("ATOM") ) {
255                        if (StructureTools.isNucleotide(groupCode3))  {
256                                // it is a nucleotide
257                                NucleotideImpl nu = new NucleotideImpl();
258                                group = nu;
259                                nu.setId(seq_id);
260
261                        } else if (aminoCode1==null || aminoCode1 == StructureTools.UNKNOWN_GROUP_LABEL){
262                                HetatomImpl h = new HetatomImpl();
263                                h.setId(seq_id);
264                                group = h;
265
266                        } else {
267                                AminoAcidImpl aa = new AminoAcidImpl() ;
268                                aa.setAminoType(aminoCode1);
269                                aa.setId(seq_id);
270                                group = aa ;
271                        }
272                }
273                else {
274                        if (StructureTools.isNucleotide(groupCode3))  {
275                                // it is a nucleotide
276                                NucleotideImpl nu = new NucleotideImpl();
277                                group = nu;
278                                nu.setId(seq_id);
279                        }
280                        else if (aminoCode1 != null ) {
281                                AminoAcidImpl aa = new AminoAcidImpl() ;
282                                aa.setAminoType(aminoCode1);
283                                aa.setId(seq_id);
284                                group = aa ;
285                        } else {
286                                HetatomImpl h = new HetatomImpl();
287                                h.setId(seq_id);
288                                group = h;
289                        }
290                }
291                return  group ;
292        }
293
294        /**
295         * Test if the given chainID is already present in the list of chains given. If yes, returns the chain
296         * otherwise returns null.
297         */
298        private static Chain isKnownChain(String chainID, List<Chain> chains){
299
300                for (int i = 0; i< chains.size();i++){
301                        Chain testchain =  chains.get(i);
302                        //System.out.println("comparing chainID >"+chainID+"< against testchain " + i+" >" +testchain.getName()+"<");
303                        if (chainID.equals(testchain.getChainID())) {
304                                //System.out.println("chain "+ chainID+" already known ...");
305                                return testchain;
306                        }
307                }
308
309                return null;
310        }
311
312        @Override
313        public void newAtomSite(AtomSite atom) {
314
315                if (params.isHeaderOnly()) return;
316
317                // Warning: getLabel_asym_id is not the "chain id" in the PDB file
318                // it is the internally used chain id.
319                // later on we will fix this...
320
321                // later one needs to map the asym id to the pdb_strand_id
322
323                //TODO: add support for FileParsingParams.getMaxAtoms()
324
325                boolean startOfNewChain = false;
326
327                String chain_id = atom.getLabel_asym_id();
328
329                String recordName    = atom.getGroup_PDB();
330                String residueNumberS = atom.getAuth_seq_id();
331                Integer residueNrInt = Integer.parseInt(residueNumberS);
332
333                // the 3-letter name of the group:
334                String groupCode3    = atom.getLabel_comp_id();
335
336                Character aminoCode1 = null;
337                if ( recordName.equals("ATOM") )
338                        aminoCode1 = StructureTools.get1LetterCodeAmino(groupCode3);
339                else {
340                        aminoCode1 = StructureTools.get1LetterCodeAmino(groupCode3);
341
342                        // for nucleotides this will be null..
343                        if (aminoCode1 != null &&  aminoCode1.equals(StructureTools.UNKNOWN_GROUP_LABEL))
344                                aminoCode1 = null;
345                }
346                String insCodeS = atom.getPdbx_PDB_ins_code();
347                Character insCode = null;
348                if (!  insCodeS.equals("?")) {
349                        insCode = insCodeS.charAt(0);
350                }
351                // we store the internal seq id in the Atom._id field
352                // this is not a PDB file field but we need this to internally assign the insertion codes later
353                // from the pdbx_poly_seq entries..
354
355                long seq_id = -1;
356                try {
357                        seq_id = Long.parseLong(atom.getLabel_seq_id());
358                } catch (NumberFormatException e){
359                        // non polymer chains (ligands and small molecules) will have a label_seq_id set to '.', thus it is ok to
360                        // silently ignore this
361                        //logger.debug("Could not parse number for _atom_site.label_seq_id: "+e.getMessage());
362                }
363
364                String nmrModel = atom.getPdbx_PDB_model_num();
365
366                if ( current_nmr_model == null) {
367                        current_nmr_model = nmrModel;
368                }
369
370                if (! current_nmr_model.equals(nmrModel)){
371                        current_nmr_model = nmrModel;
372
373                        // add previous data
374                        if ( current_chain != null ) {
375                                current_chain.addGroup(current_group);
376                                current_group.trimToSize();
377                        }
378
379                        // we came to the beginning of a new NMR model
380                        structure.addModel(current_model);
381                        current_model = new ArrayList<Chain>();
382                        current_chain = null;
383                        current_group = null;
384                }
385
386
387                if (current_chain == null) {
388                        current_chain = new ChainImpl();
389                        current_chain.setChainID(chain_id);
390                        current_model.add(current_chain);
391                        startOfNewChain = true;
392                }
393
394                //System.out.println("BEFORE: " + chain_id + " " + current_chain.getName());
395                if ( ! chain_id.equals(current_chain.getChainID()) ) {
396
397                        startOfNewChain = true;
398
399                        // end up old chain...
400                        current_chain.addGroup(current_group);
401
402                        // see if old chain is known ...
403                        Chain testchain ;
404                        testchain = isKnownChain(current_chain.getChainID(),current_model);
405
406                        //System.out.println("trying to re-using known chain " + current_chain.getName() + " " + chain_id);
407                        if ( testchain != null && testchain.getChainID().equals(chain_id)){
408                                //System.out.println("re-using known chain " + current_chain.getName() + " " + chain_id);
409
410                        } else {
411
412                                testchain = isKnownChain(chain_id,current_model);
413                        }
414
415                        if ( testchain == null) {
416                                //System.out.println("unknown chain. creating new chain.");
417
418                                current_chain = new ChainImpl();
419                                current_chain.setChainID(chain_id);
420
421                        }   else {
422                                current_chain = testchain;
423                        }
424
425                        if ( ! current_model.contains(current_chain))
426                                current_model.add(current_chain);
427
428                }
429
430
431                ResidueNumber residueNumber = new ResidueNumber(chain_id,residueNrInt, insCode);
432
433                if (current_group == null) {
434
435                        current_group = getNewGroup(recordName,aminoCode1,seq_id, groupCode3);
436
437                        current_group.setResidueNumber(residueNumber);
438                        current_group.setPDBName(groupCode3);
439                }
440
441                if ( startOfNewChain){
442                        current_group = getNewGroup(recordName,aminoCode1,seq_id, groupCode3);
443
444                        current_group.setResidueNumber(residueNumber);
445                        current_group.setPDBName(groupCode3);
446                }
447
448                Group altGroup = null;
449                String altLocS = atom.getLabel_alt_id();
450                Character altLoc = ' ';
451                if ( altLocS.length()>0) {
452                        altLoc = altLocS.charAt(0);
453                        if ( altLoc.equals('.') )
454                                altLoc = ' ';
455
456                }
457
458                // check if residue number is the same ...
459                // insertion code is part of residue number
460                if ( ! residueNumber.equals(current_group.getResidueNumber())) {
461                        //System.out.println("end of residue: "+current_group.getPDBCode()+" "+residueNrInt);
462                        current_chain.addGroup(current_group);
463                        current_group.trimToSize();
464                        current_group = getNewGroup(recordName,aminoCode1,seq_id,groupCode3);
465                        current_group.setPDBName(groupCode3);
466                        current_group.setResidueNumber(residueNumber);
467
468
469                        //                        System.out.println("Made new group:  " + groupCode3 + " " + resNum + " " + iCode);
470
471                } else {
472                        // same residueNumber, but altLocs...
473
474                        // test altLoc
475                        if ( ! altLoc.equals(' ') && ( ! altLoc.equals('.'))) {
476                                logger.debug("found altLoc! " + altLoc + " " + current_group + " " + altGroup);
477                                altGroup = getCorrectAltLocGroup( altLoc,recordName,aminoCode1,groupCode3, seq_id);
478                                if (altGroup.getChain()==null) {
479                                        altGroup.setChain(current_chain);
480                                }
481                        }
482                }
483
484                //atomCount++;
485                //System.out.println("fixing atom name for  >" + atom.getLabel_atom_id() + "< >" + fullname + "<");
486
487
488                if ( params.isParseCAOnly() ){
489                        // yes , user wants to get CA only
490                        // only parse CA atoms...
491                        if (! (atom.getLabel_atom_id().equals(StructureTools.CA_ATOM_NAME) && atom.getType_symbol().equals("C"))) {
492                                //System.out.println("ignoring " + line);
493                                //atomCount--;
494                                return;
495                        }
496                }
497
498                // filling the map in case there's no pdbx_poly_seq_scheme/pdbx_non_poly_seq_scheme in the file
499                asymId2StrandIdFromAtomSites.put(atom.getLabel_asym_id(), atom.getAuth_asym_id());
500
501                //see if chain_id is one of the previous chains ...
502
503                Atom a = convertAtom(atom);
504
505                //see if chain_id is one of the previous chains ...
506                if ( altGroup != null) {
507                        altGroup.addAtom(a);
508                        altGroup = null;
509                }
510                else {
511                        current_group.addAtom(a);
512                }
513
514
515                // make sure that main group has all atoms
516                // GitHub issue: #76
517                if ( ! current_group.hasAtom(a.getName())) {
518                        current_group.addAtom(a);
519                }
520
521
522                //System.out.println(">" + atom.getLabel_atom_id()+"< " + a.getGroup().getPDBName() + " " + a.getGroup().getChemComp()  );
523
524                //System.out.println(current_group);
525
526        }
527
528        /** convert a MMCif AtomSite object to a BioJava Atom object
529         *
530         * @param atom the mmmcif AtomSite record
531         * @return an Atom
532         */
533        private Atom convertAtom(AtomSite atom){
534
535
536                Atom a = new AtomImpl();
537
538                a.setPDBserial(Integer.parseInt(atom.getId()));
539                a.setName(atom.getLabel_atom_id());
540
541                double x = Double.parseDouble (atom.getCartn_x());
542                double y = Double.parseDouble (atom.getCartn_y());
543                double z = Double.parseDouble (atom.getCartn_z());
544                a.setX(x);
545                a.setY(y);
546                a.setZ(z);
547
548                float occupancy = Float.parseFloat (atom.getOccupancy());
549                a.setOccupancy(occupancy);
550
551                float temp = Float.parseFloat (atom.getB_iso_or_equiv());
552                a.setTempFactor(temp);
553
554                String alt = atom.getLabel_alt_id();
555                if (( alt != null ) && ( alt.length() > 0) && (! alt.equals("."))){
556                        a.setAltLoc(new Character(alt.charAt(0)));
557                } else {
558                        a.setAltLoc(new Character(' '));
559                }
560
561                Element element = Element.R;
562                try {
563                        element = Element.valueOfIgnoreCase(atom.getType_symbol());
564                }  catch (IllegalArgumentException e) {
565                        logger.info("Element {} was not recognised as a BioJava-known element, the element will be represented as the generic element {}", atom.getType_symbol(), Element.R.name());
566                }
567                a.setElement(element);
568
569                return a;
570
571        }
572
573
574        private Group getCorrectAltLocGroup( Character altLoc,
575                        String recordName, Character aminoCode1, String groupCode3, long seq_id) {
576
577                // see if we know this altLoc already;
578                List<Atom> atoms = current_group.getAtoms();
579                if ( atoms.size() > 0) {
580                        Atom a1 = atoms.get(0);
581                        // we are just adding atoms to the current group
582                        // probably there is a second group following later...
583                        if (a1.getAltLoc().equals(altLoc)) {
584
585                                return current_group;
586                        }
587                }
588
589                List<Group> altLocs = current_group.getAltLocs();
590                for ( Group altLocG : altLocs ){
591                        atoms = altLocG.getAtoms();
592                        if ( atoms.size() > 0) {
593                                for ( Atom a1 : atoms) {
594                                        if (a1.getAltLoc().equals( altLoc)) {
595
596                                                return altLocG;
597                                        }
598                                }
599                        }
600                }
601
602                // no matching altLoc group found.
603                // build it up.
604
605                if ( groupCode3.equals(current_group.getPDBName())) {
606                        if ( current_group.getAtoms().size() == 0) {
607                                //System.out.println("current group is empty " + current_group + " " + altLoc);
608                                return current_group;
609                        }
610                        //System.out.println("cloning current group " + current_group + " " + current_group.getAtoms().get(0).getAltLoc() + " altLoc " + altLoc);
611                        Group altLocG = (Group) current_group.clone();
612                        // drop atoms from cloned group...
613                        // https://redmine.open-bio.org/issues/3307
614                        altLocG.setAtoms(new ArrayList<Atom>());
615                        altLocG.getAltLocs().clear();
616                        current_group.addAltLoc(altLocG);
617                        return altLocG;
618                }
619
620                //      System.out.println("new  group " + recordName + " " + aminoCode1 + " " +groupCode3);
621                //String recordName,Character aminoCode1, long seq_id,String groupCode3) {
622                Group altLocG = getNewGroup(recordName,aminoCode1,seq_id,groupCode3);
623
624                altLocG.setPDBName(groupCode3);
625                altLocG.setResidueNumber(current_group.getResidueNumber());
626                current_group.addAltLoc(altLocG);
627                return altLocG;
628        }
629
630        /** Start the parsing
631         *
632         */
633        @Override
634        public void documentStart() {
635                structure = new StructureImpl();
636
637                current_chain           = null;
638                current_group           = null;
639                current_nmr_model       = null;
640                //atomCount                     = 0;
641
642                current_model = new ArrayList<Chain>();
643                entities      = new ArrayList<Entity>();
644                strucRefs     = new ArrayList<StructRef>();
645                seqResChains  = new ArrayList<Chain>();
646                entityChains  = new ArrayList<Chain>();
647                structAsyms   = new ArrayList<StructAsym>();
648                asymStrandId  = new HashMap<String, String>();
649                asymId2StrandIdFromAtomSites = new HashMap<String, String>();
650                asymId2entityId = new HashMap<String,String>();
651                structOpers   = new ArrayList<PdbxStructOperList>();
652                strucAssemblies = new ArrayList<PdbxStructAssembly>();
653                strucAssemblyGens = new ArrayList<PdbxStructAssemblyGen>();
654                entitySrcGens = new ArrayList<EntitySrcGen>();
655                entitySrcNats = new ArrayList<EntitySrcNat>();
656                entitySrcSyns = new ArrayList<EntitySrcSyn>();
657                structConn = new ArrayList<StructConn>();
658                structNcsOper = new ArrayList<StructNcsOper>();
659                sequenceDifs = new ArrayList<StructRefSeqDif>();
660                structSiteGens = new ArrayList<StructSiteGen>();
661        }
662
663
        /**
         * Finishes building the structure once the whole document has been read.
         * <p>
         * In order, this method:
         * <ol>
         * <li>adds the pending current_group/current_chain to the current model and the model to the structure;</li>
         * <li>for every _struct_asym record: stores the asym_id to entity_id mapping, builds the SEQRES chain
         *     and creates the compounds (see {@code addCompounds});</li>
         * <li>aligns SEQRES to ATOM groups, or stores the SEQRES groups unaligned, depending on the parsing parameters;</li>
         * <li>renames chains from asym ids to author chain ids, or, when internal chain ids are requested,
         *     only records the author id as the internal chain id;</li>
         * <li>adds bonds, charges and sites (not in header-only mode), links compounds to chains, and sets
         *     bio assemblies, NCS operators, crystallographic metadata and sequence mismatches.</li>
         * </ol>
         * The order matters: bonds, charges and sites are only added after the chain id mapping is done.
         */
        @Override
        public void documentEnd() {

                // Expected that there is one current_chain that needs to be added to the model
                // When in headerOnly mode, no Atoms are read, and there will not be an active
                // current_chain.
                if ( current_chain != null ) {

                        // flush the last group that was being filled, then the chain itself (unless already in the model)
                        current_chain.addGroup(current_group);
                        if (isKnownChain(current_chain.getChainID(),current_model) == null) {
                                current_model.add(current_chain);
                        }
                } else if (!params.isHeaderOnly()){
                        logger.warn("current chain is null at end of document.");
                }

                structure.addModel(current_model);

                // Goal is to reproduce the PDB files exactly:
                // What has to be done is to use the auth_mon_id for the assignment.
                // NOTE(review): the original comment was cut off here ("For this ..."); intent to be confirmed.

                // map entities to Chains and Compound objects...


                for (StructAsym asym : structAsyms) {
                        logger.debug("Entity {} matches asym_id: {}", asym.getEntity_id(), asym.getId() );

                        asymId2entityId.put(asym.getId(), asym.getEntity_id());

                        // the entity chain is cloned so that every asym gets its own SEQRES chain
                        Chain s = getEntityChain(asym.getEntity_id());
                        Chain seqres = (Chain)s.clone();
                        // to solve issue #160 (e.g. 3u7t)
                        seqres = removeSeqResHeterogeneity(seqres);
                        // at this point SEQRES chains are identified by asym id
                        seqres.setChainID(asym.getId());

                        seqResChains.add(seqres);
                        logger.debug(" seqres: " + asym.getId() + " " + seqres + "<") ;

                        // adding the compounds (entities)
                        addCompounds(asym);

                }

                if (structAsyms.isEmpty()) {
                        logger.warn("No _struct_asym category in file, no SEQRES groups will be added.");
                }

                // Only align if requested (default) and not when headerOnly mode with no Atoms.
                // Otherwise, we store the empty SeqRes Groups unchanged in the right chains.
                if ( params.isAlignSeqRes() && !params.isHeaderOnly() ){
                        logger.debug("Parsing mode align_seqres, will parse SEQRES and align to ATOM sequence");
                        alignSeqRes();
                } else {
                        logger.debug("Parsing mode unalign_seqres, will parse SEQRES but not align it to ATOM sequence");
                        SeqRes2AtomAligner.storeUnAlignedSeqRes(structure, seqResChains, params.isHeaderOnly());
                }

                // The two asym id -> author chain id mappings back each other up: whichever one is empty
                // is replaced by the other one.
                if (asymStrandId.isEmpty()) {
                        logger.warn("No pdbx_poly_seq_scheme/pdbx_non_poly_seq_scheme categories present. Will use chain id mapping from _atom_sites category");

                        asymStrandId = asymId2StrandIdFromAtomSites;
                }
                // If we only parse the header - we have no option but to use the other mapping (which can be broken)
                if (asymId2StrandIdFromAtomSites.isEmpty()){

                        logger.warn("No  _atom_sites category auth to asymid mappings. Will use chain id mapping from pdbx_poly_seq_scheme/pdbx_non_poly_seq_scheme categories");
                        asymId2StrandIdFromAtomSites = asymStrandId;
                }

                // mismatching Author assigned chain IDS and PDB internal chain ids:
                // fix the chain IDS in the current model:

                if(params.isUseInternalChainId()==false){
                        // author chain ids requested: rename every chain from its asym id to the mapped author id
                        for (int i =0; i< structure.nrModels() ; i++){
                                List<Chain> model = structure.getModel(i);

                                // the renamed chains of this model; chains without a mapping are not carried over
                                List<Chain> pdbChains = new ArrayList<Chain>();
                                for (Chain chain : model) {
                                        for (String asym : asymId2StrandIdFromAtomSites.keySet()) {
                                                if ( chain.getChainID().equals(asym)){
                                                        String newChainId = asymId2StrandIdFromAtomSites.get(asym);

                                                        logger.debug("Renaming chain with asym_id {} ({} atom groups) to author_asym_id/strand_id  {}",
                                                                        asym, chain.getAtomGroups().size(), newChainId);

                                                        // the asym id is kept as the internal chain id
                                                        chain.setChainID(newChainId);
                                                        chain.setInternalChainID(asym);
                                                        // set chain of all groups
                                                        for(Group g : chain.getAtomGroups()) {
                                                                ResidueNumber resNum = g.getResidueNumber();
                                                                if(resNum != null)
                                                                        resNum.setChainId(newChainId);
                                                        }
                                                        for(Group g : chain.getSeqResGroups()) {
                                                                ResidueNumber resNum = g.getResidueNumber();
                                                                if(resNum != null)
                                                                        resNum.setChainId(newChainId);
                                                        }
                                                        // several asym ids can map to the same author chain id
                                                        Chain known =  isKnownChain(chain.getChainID(), pdbChains);
                                                        if ( known == null ){
                                                                pdbChains.add(chain);
                                                        } else {
                                                                // and now we join the 2 chains together again, because in cif files the data can be split up...
                                                                // (only the atom groups are merged here)
                                                                for ( Group g : chain.getAtomGroups()){
                                                                        known.addGroup(g);
                                                                }
                                                        }

                                                        break;
                                                }
                                        }
                                }

                                structure.setModel(i,pdbChains);
                        }
                }
                else{
                        // Internal (asym) chain ids requested: the chain id is left untouched and
                        // the mapped author id is stored in the internal chain id field instead.
                        for (int i =0; i< structure.nrModels() ; i++){
                                List<Chain> model = structure.getModel(i);
                                for (Chain chain : model) {
                                        for (String asym : asymId2StrandIdFromAtomSites.keySet()) {
                                                if (chain.getChainID().equals(asym)){
                                                        String authChainId = asymId2StrandIdFromAtomSites.get(asym);
                                                        chain.setInternalChainID(authChainId);
                                                        break;
                                                }
                                        }
                                }
                        }
                }

                // NOTE bonds and charges can only be done at this point that the chain id mapping is properly sorted out
                if (!params.isHeaderOnly()) {
                        if ( params.shouldCreateAtomBonds()) {
                                addBonds();
                        }

                        if ( params.shouldCreateAtomCharges()) {
                                addCharges();
                        }
                }

                // compounds (entities)
                // In addCompounds above we created the compounds if they were present in the file
                // Now we need to make sure that they are linked to chains and also that if they are not present in the file we need to add them now
                linkCompounds();

                if (!params.isHeaderOnly()) {

                        // Do structure.setSites(sites) after any chain renaming to be like PDB.
                        addSites();
                }



                // set the oligomeric state info in the header...
                if (params.isParseBioAssembly()) {

                        // the more detailed mapping of chains to rotation operations happens in StructureIO...

                        // bio assemblies keyed by their numerical id
                        Map<Integer,BioAssemblyInfo> bioAssemblies = new HashMap<Integer, BioAssemblyInfo>();

                        for ( PdbxStructAssembly psa : strucAssemblies){

                                // collect the assembly generators that belong to this assembly
                                List<PdbxStructAssemblyGen> psags = new ArrayList<PdbxStructAssemblyGen>(1);

                                for ( PdbxStructAssemblyGen psag: strucAssemblyGens ) {
                                        if ( psag.getAssembly_id().equals(psa.getId())) {
                                                psags.add(psag);
                                        }
                                }

                                BiologicalAssemblyBuilder builder = new BiologicalAssemblyBuilder();

                                // these are the transformations that need to be applied to our model
                                List<BiologicalAssemblyTransformation> transformations = builder.getBioUnitTransformationList(psa, psags, structOpers);

                                // defaults used when the values can't be parsed: size 0, id -1 (= non-numerical id)
                                int mmSize = 0;
                                int bioAssemblyId = -1;
                                try {
                                        bioAssemblyId = Integer.parseInt(psa.getId());
                                } catch (NumberFormatException e) {
                                        logger.info("Could not parse a numerical bio assembly id from '{}'",psa.getId());
                                }
                                try {
                                        mmSize = Integer.parseInt(psa.getOligomeric_count());
                                } catch (NumberFormatException e) {
                                        if (bioAssemblyId!=-1)
                                                // if we have a numerical id, then it's unusual to have no oligomeric size: we warn about it
                                                logger.warn("Could not parse oligomeric count from '{}' for biological assembly id {}",
                                                        psa.getOligomeric_count(),psa.getId());
                                        else
                                                // no numerical id (PAU,XAU in virus entries), it's normal to have no oligomeric size
                                                logger.info("Could not parse oligomeric count from '{}' for biological assembly id {}",
                                                                psa.getOligomeric_count(),psa.getId());
                                }

                                // if bioassembly id is not numerical we throw it away
                                // this happens usually for viral capsid entries, like 1ei7
                                // see issue #230 in github
                                if (bioAssemblyId!=-1) {
                                        BioAssemblyInfo bioAssembly = new BioAssemblyInfo();
                                        bioAssembly.setId(bioAssemblyId);
                                        bioAssembly.setMacromolecularSize(mmSize);
                                        bioAssembly.setTransforms(transformations);
                                        bioAssemblies.put(bioAssemblyId,bioAssembly);
                                }

                        }
                        structure.getPDBHeader().setBioAssemblies(bioAssemblies);
                }

                setStructNcsOps();
                
                setCrystallographicInfoMetadata();


                // sequence mismatches from the parsed StructRefSeqDif records, grouped by pdbx_pdb_strand_id
                Map<String,List<SeqMisMatch>> misMatchMap = new HashMap<String, List<SeqMisMatch>>();
                for (StructRefSeqDif sdif : sequenceDifs) {
                        SeqMisMatch misMatch = new SeqMisMatchImpl();
                        misMatch.setDetails(sdif.getDetails());

                        // "?" is stored as a null insertion code
                        String insCode = sdif.getPdbx_pdb_ins_code();
                        if ( insCode != null && insCode.equals("?"))
                                insCode = null;
                        misMatch.setInsCode(insCode);
                        misMatch.setOrigGroup(sdif.getDb_mon_id());
                        misMatch.setPdbGroup(sdif.getMon_id());
                        misMatch.setPdbResNum(sdif.getPdbx_auth_seq_num());
                        misMatch.setUniProtId(sdif.getPdbx_seq_db_accession_code());
                        misMatch.setSeqNum(sdif.getSeq_num());


                        // get-or-create the list of mismatches for this strand id
                        List<SeqMisMatch> mms = misMatchMap.get(sdif.getPdbx_pdb_strand_id());
                        if ( mms == null) {
                                mms = new ArrayList<SeqMisMatch>();
                                misMatchMap.put(sdif.getPdbx_pdb_strand_id(),mms);
                        }
                        mms.add(misMatch);

                }

                // attach the mismatches to the chains, looked up via getChainByPDB with the strand id
                for (String chainId : misMatchMap.keySet()){
                        try {
                                Chain c = structure.getChainByPDB(chainId);
                                c.setSeqMisMatches(misMatchMap.get(chainId));
                        } catch (Exception e){
                                // best effort: any failure for a chain is only logged (without the exception details)
                                logger.warn("could not set mismatches for chain " + chainId);

                        }
                }

        }
918
919        /**
920         * Here we link compounds (entities) to chains.
921         * Also if compounds are not present in file, this initialises the compounds with some heuristics, see {@link CompoundFinder}
922         */
923        private void linkCompounds() {
924
925
926                for (int i =0; i< structure.nrModels() ; i++){
927                        for (Chain chain : structure.getModel(i)) {
928                                String entityId;
929                                if( params.isUseInternalChainId()){
930                                        entityId = asymId2entityId.get(chain.getChainID());
931                                }
932                                else{
933                                        entityId = asymId2entityId.get(chain.getInternalChainID());
934                                }
935                                if (entityId==null) {
936                                        // this can happen for instance if the cif file didn't have _struct_asym category at all
937                                        // and thus we have no asymId2entityId mapping at all
938                                        logger.warn("No entity id could be found for chain {}", chain.getInternalChainID());
939                                        continue;
940                                }
941                                int eId = Integer.parseInt(entityId);
942
943                                // Compounds are not added for non-polymeric entities, if a chain is non-polymeric its compound won't be found.
944                                // TODO: add all entities and unique compounds and add methods to directly get polymer or non-polymer
945                                // asyms (chains).  Either create a unique StructureImpl or modify existing for a better representation of the
946                                // mmCIF internal data structures but is compatible with Structure interface.
947                                // Some examples of PDB entries with this kind of problem:
948                                //   - 2uub: asym_id X, chainId Z, entity_id 24: fully non-polymeric but still with its own chainId
949                                //   - 3o6j: asym_id K, chainId Z, entity_id 6 : a single water molecule
950                                //   - 1dz9: asym_id K, chainId K, entity_id 6 : a potassium ion alone
951
952                                Compound compound = structure.getCompoundById(eId);
953                                if (compound==null) {
954                                        // Supports the case where the only chain members were from non-polymeric entity that is missing.
955                                        // Solved by creating a new Compound(entity) to which this chain will belong.
956                                        logger.warn("Could not find a compound for entity_id {}, for chain id {}, creating a new compound.",
957                                                        eId, chain.getChainID());
958                                        compound = new Compound();
959                                        compound.setMolId(eId);
960                                        compound.addChain(chain);
961                                        chain.setCompound(compound);
962                                        structure.addCompound(compound);
963                                } else {
964                                        logger.debug("Adding chain with chain id {} (asym id {}) to compound with entity_id {}",
965                                                        chain.getChainID(), chain.getInternalChainID(), eId);
966                                        compound.addChain(chain);
967                                        chain.setCompound(compound);
968                                }
969
970                        }
971
972                }
973
974                // to make sure we have Compounds linked to chains, we call getCompounds() which will lazily initialise the
975                // compounds using heuristics (see CompoundFinder) in the case that they were not explicitly present in the file
976                List<Compound> compounds = structure.getCompounds();
977
978                // final sanity check: it can happen that from the annotated compounds some are not linked to any chains
979                // e.g. 3s26: a sugar entity does not have any chains associated to it (it seems to be happening with many sugar compounds)
980                // we simply log it, this can sign some other problems if the compounds are used down the line
981                for (Compound compound:compounds) {
982                        if (compound.getChains().isEmpty()) {
983                                logger.info("Compound {} '{}' has no chains associated to it",
984                                                compound.getId()==null?"with no entity id":compound.getId(), compound.getMolName());
985                        }
986                }
987
988        }
989
990        private void addCharges() {
991                ChargeAdder.addCharges(structure);
992        }
993
994        /**
995         * The method will return a new reference to a Chain with any consecutive groups
996         * having same residue numbers removed.
997         * This is necessary to solve the microheterogeneity issue in entries like 3u7t (see github issue #160)
998         * @param c
999         * @return
1000         */
1001        private Chain removeSeqResHeterogeneity(Chain c) {
1002
1003                Chain trimmedChain = new ChainImpl();
1004
1005                ResidueNumber lastResNum = null;
1006
1007                for (Group g:c.getAtomGroups()) {
1008
1009                        // note we have to deep copy this, otherwise they stay linked and would get altered in addGroup(g)
1010                        ResidueNumber currentResNum = new ResidueNumber(
1011                                        g.getResidueNumber().getChainId(),
1012                                        g.getResidueNumber().getSeqNum(),
1013                                        g.getResidueNumber().getInsCode());
1014
1015                        if (lastResNum == null || !lastResNum.equals(currentResNum) ) {
1016                                trimmedChain.addGroup(g);
1017                        } else {
1018                                logger.debug("Removing seqres group because it seems to be repeated in entity_poly_seq, most likely has hetero='y': "+g);
1019                        }
1020
1021                        lastResNum = currentResNum;
1022
1023                }
1024                return trimmedChain;
1025        }
1026
1027        private void addBonds() {
1028                BondMaker maker = new BondMaker(structure, params);
1029                maker.makeBonds();
1030                maker.formBondsFromStructConn(structConn);
1031        }
1032
1033        private void alignSeqRes() {
1034
1035                logger.debug("Parsing mode align_seqres, will align to ATOM to SEQRES sequence");
1036
1037                // fix SEQRES residue numbering for all models
1038
1039                for (int model=0;model<structure.nrModels();model++) {
1040
1041                        List<Chain> atomList   = structure.getModel(model);
1042
1043                        for (Chain seqResChain: seqResChains){
1044
1045                                // this extracts the matching atom chain from atomList
1046                                Chain atomChain = SeqRes2AtomAligner.getMatchingAtomRes(seqResChain, atomList);
1047
1048                                if (atomChain == null) {
1049                                        // most likely there's no observed residues at all for the seqres chain: can't map
1050                                        // e.g. 3zyb: chains with asym_id L,M,N,O,P have no observed residues
1051                                        logger.warn("Could not map SEQRES chain with asym_id={} to any ATOM chain. Most likely there's no observed residues in the chain.",
1052                                                        seqResChain.getChainID());
1053                                        continue;
1054                                }
1055
1056                                //map the atoms to the seqres...
1057
1058                                // we need to first clone the seqres so that they stay independent for different models
1059                                List<Group> seqResGroups = new ArrayList<Group>();
1060                                for (int i=0;i<seqResChain.getAtomGroups().size();i++) {
1061                                        seqResGroups.add((Group)seqResChain.getAtomGroups().get(i).clone());
1062                                }
1063
1064                                for ( int seqResPos = 0 ; seqResPos < seqResGroups.size(); seqResPos++) {
1065                                        Group seqresG = seqResGroups.get(seqResPos);
1066                                        boolean found = false;
1067                                        for ( Group atomG: atomChain.getAtomGroups()) {
1068
1069                                                int internalNr = getInternalNr (atomG);
1070
1071                                                if (seqresG.getResidueNumber().getSeqNum() == internalNr ) {
1072                                                        seqResGroups.set(seqResPos, atomG);
1073                                                        found = true;
1074                                                        break;
1075                                                }
1076
1077
1078                                        }
1079                                        if ( ! found)
1080                                                // so far the residue number has tracked internal numbering.
1081                                                // however there are no atom records, as such this can't be a PDB residue number...
1082                                                seqresG.setResidueNumber(null);
1083                                }
1084                                atomChain.setSeqResGroups(seqResGroups);
1085
1086                        }
1087                }
1088        }
1089
1090        private int getInternalNr(Group atomG) {
1091                if ( atomG.getType().equals(GroupType.AMINOACID)) {
1092                        AminoAcidImpl aa = (AminoAcidImpl) atomG;
1093                        return new Long(aa.getId()).intValue();
1094                } else if ( atomG.getType().equals(GroupType.NUCLEOTIDE)) {
1095                        NucleotideImpl nu = (NucleotideImpl) atomG;
1096                        return new Long(nu.getId()).intValue();
1097                } else {
1098                        HetatomImpl he = (HetatomImpl) atomG;
1099                        return new Long(he.getId()).intValue();
1100                }
1101        }
1102
1103        private void addCompounds(StructAsym asym) {
1104                int eId = 0;
1105                try {
1106                        eId = Integer.parseInt(asym.getEntity_id());
1107                } catch (NumberFormatException e) {
1108                        logger.warn("Could not parse mol_id from string {}. Will use 0 for creating Compound",asym.getEntity_id());
1109                }
1110                Entity e = getEntity(eId);
1111
1112                for (EntitySrcGen esg : entitySrcGens) {
1113
1114                        if (! esg.getEntity_id().equals(asym.getEntity_id()))
1115                                continue;
1116
1117                        // found the matching EntitySrcGen
1118                        // get the corresponding Entity
1119                        Compound c = structure.getCompoundById(eId);
1120                        if ( c == null){
1121                                if (e!=null) {
1122                                        if (e.getType().equals("polymer")) {
1123                                                c = createNewCompoundFromESG(esg, eId);
1124                                                c.setMolName(e.getPdbx_description());
1125                                                structure.addCompound(c);
1126                                                logger.debug("Adding Compound with entity id {} from _entity_src_syn, with name: {}",eId,c.getMolName());
1127                                        } else if (e.getType().equals("non-solvent")) {
1128                                                // TODO handle non-polymer compounds.
1129                                        } else if (e.getType().equals("water")) {
1130                                                // TODO handle solvent entity.
1131                                        } else {
1132                                                logger.warn("Could not add entity id " + esg.getEntity_id() + " that has unknown _entity.type");
1133                                        }
1134                                }
1135                        }
1136
1137                }
1138
1139                for (EntitySrcNat esn : entitySrcNats) {
1140                        if (! esn.getEntity_id().equals(asym.getEntity_id()))
1141                                continue;
1142
1143                        // found the matching EntitySrcGen
1144                        // get the corresponding Entity
1145                        Compound c = structure.getCompoundById(eId);
1146                        if ( c == null){
1147                                if (e!=null && e.getType().equals("polymer")) {
1148                                        c = createNewCompoundFromESN(esn, eId);
1149                                        c.setMolName(e.getPdbx_description());
1150                                        structure.addCompound(c);
1151                                        logger.debug("Adding Compound with entity id {} from _entity_src_syn, with name: {}",eId,c.getMolName());
1152                                }
1153                        }
1154
1155                }
1156
1157                for (EntitySrcSyn ess : entitySrcSyns) {
1158                        if (! ess.getEntity_id().equals(asym.getEntity_id()))
1159                                continue;
1160
1161                        // found the matching EntitySrcGen
1162                        // get the corresponding Entity
1163                        Compound c = structure.getCompoundById(eId);
1164                        if ( c == null){
1165                                if (e!=null && e.getType().equals("polymer")) {
1166                                        c = createNewCompoundFromESS(ess, eId);
1167                                        c.setMolName(e.getPdbx_description());
1168                                        structure.addCompound(c);
1169                                        logger.debug("Adding Compound with entity id {} from _entity_src_syn, with name: {}",eId,c.getMolName());
1170                                }
1171                        }
1172                }
1173
1174                // for some mmCIF files like 1yrm all 3 of _entity_src_gen, _entity_src_nat and _pdbx_entity_src_syn are missing
1175                // we need to fill the Compounds in some other way:
1176
1177                Compound c = structure.getCompoundById(eId);
1178
1179                if (c==null) {
1180                        c = new Compound();
1181                        c.setMolId(eId);
1182
1183                        // we only add the compound if a polymeric one (to match what the PDB parser does)
1184                        if (e!=null && e.getType().equals("polymer")) {
1185                                c.setMolName(e.getPdbx_description());
1186                                structure.addCompound(c);
1187                                logger.debug("Adding Compound with entity id {} from _entity, with name: {}",eId, c.getMolName());
1188                        }
1189                }
1190        }
1191
1192        private Compound createNewCompoundFromESG(EntitySrcGen esg, int eId) {
1193
1194                Compound c = new Compound();
1195                c.setMolId(eId);
1196                c.setAtcc(esg.getPdbx_gene_src_atcc());
1197                c.setCell(esg.getPdbx_gene_src_cell());
1198                c.setOrganismCommon(esg.getGene_src_common_name());
1199                c.setOrganismScientific(esg.getPdbx_gene_src_scientific_name());
1200                c.setOrganismTaxId(esg.getPdbx_gene_src_ncbi_taxonomy_id());
1201                c.setExpressionSystemTaxId(esg.getPdbx_host_org_ncbi_taxonomy_id());
1202                c.setExpressionSystem(esg.getPdbx_host_org_scientific_name());
1203                return c;
1204
1205        }
1206
1207        private Compound createNewCompoundFromESN(EntitySrcNat esn, int eId) {
1208
1209                Compound c = new Compound();
1210
1211                c.setMolId(eId);
1212                c.setAtcc(esn.getPdbx_atcc());
1213                c.setCell(esn.getPdbx_cell());
1214                c.setOrganismCommon(esn.getCommon_name());
1215                c.setOrganismScientific(esn.getPdbx_organism_scientific());
1216                c.setOrganismTaxId(esn.getPdbx_ncbi_taxonomy_id());
1217
1218                return c;
1219
1220        }
1221
1222        private Compound createNewCompoundFromESS(EntitySrcSyn ess, int eId) {
1223
1224                Compound c = new Compound();
1225
1226                c.setMolId(eId);
1227                c.setOrganismCommon(ess.getOrganism_common_name());
1228                c.setOrganismScientific(ess.getOrganism_scientific());
1229                c.setOrganismTaxId(ess.getNcbi_taxonomy_id());
1230
1231
1232                return c;
1233
1234        }
1235        
1236        private void setStructNcsOps() {
1237                
1238                ArrayList<Matrix4d> ncsOperators = new ArrayList<Matrix4d>();
1239                
1240                for (StructNcsOper sNcsOper:structNcsOper) {
1241                        
1242                        if (!sNcsOper.getCode().equals("generate")) continue;
1243                        
1244                        try {
1245                                Matrix4d op = new Matrix4d();
1246                                op.setElement(3, 0, 0.0);
1247                                op.setElement(3, 1, 0.0);
1248                                op.setElement(3, 2, 0.0);
1249                                op.setElement(3, 3, 1.0);
1250
1251
1252                                op.setElement(0, 0, Double.parseDouble(sNcsOper.getMatrix11()));
1253                                op.setElement(0, 1, Double.parseDouble(sNcsOper.getMatrix12()));
1254                                op.setElement(0, 2, Double.parseDouble(sNcsOper.getMatrix13()));
1255
1256                                op.setElement(1, 0, Double.parseDouble(sNcsOper.getMatrix21()));
1257                                op.setElement(1, 1, Double.parseDouble(sNcsOper.getMatrix22()));
1258                                op.setElement(1, 2, Double.parseDouble(sNcsOper.getMatrix23()));
1259
1260                                op.setElement(2, 0, Double.parseDouble(sNcsOper.getMatrix31()));
1261                                op.setElement(2, 1, Double.parseDouble(sNcsOper.getMatrix32()));
1262                                op.setElement(2, 2, Double.parseDouble(sNcsOper.getMatrix33()));
1263
1264                                op.setElement(0, 3, Double.parseDouble(sNcsOper.getVector1()));
1265                                op.setElement(1, 3, Double.parseDouble(sNcsOper.getVector2()));
1266                                op.setElement(2, 3, Double.parseDouble(sNcsOper.getVector3()));
1267
1268                                ncsOperators.add(op);
1269                                
1270                        } catch (NumberFormatException e) {
1271                                logger.warn("Error parsing doubles in NCS operator list, skipping operator {}", structNcsOper.indexOf(sNcsOper)+1); 
1272                        }
1273
1274                }
1275                
1276                // we only set it if not empty, otherwise remains null
1277                if (ncsOperators.size()>0) {
1278                        structure.getCrystallographicInfo().setNcsOperators(
1279                                        ncsOperators.toArray(new Matrix4d[ncsOperators.size()]));
1280                }
1281        }
1282        
1283        private void setCrystallographicInfoMetadata() {
1284                if (parsedScaleMatrix!=null) {
1285                        
1286                        PDBCrystallographicInfo crystalInfo = structure.getCrystallographicInfo();
1287                        
1288                        boolean nonStd = false;
1289                        if (crystalInfo.getCrystalCell()!=null && !crystalInfo.getCrystalCell().checkScaleMatrix(parsedScaleMatrix)) {
1290                                nonStd = true;
1291                        }
1292                        
1293                        crystalInfo.setNonStandardCoordFrameConvention(nonStd); 
1294                }
1295        }
1296
1297        /** This method will return the parsed protein structure, once the parsing has been finished
1298         *
1299         * @return a BioJava protein structure object
1300         */
1301        public Structure getStructure() {
1302
1303                return structure;
1304        }
1305
1306        @Override
1307        public void newDatabasePDBrevRecord(DatabasePdbrevRecord record) {
1308
1309                PDBHeader header = structure.getPDBHeader();
1310
1311                if ( header == null) {
1312                        header = new PDBHeader();
1313                        structure.setPDBHeader(header);
1314                }
1315
1316                List<DatabasePdbrevRecord> revRecords = header.getRevisionRecords();
1317                if ( revRecords == null) {
1318                        revRecords = new ArrayList<DatabasePdbrevRecord>();
1319                        header.setRevisionRecords(revRecords);
1320                }
1321                revRecords.add(record);
1322
1323
1324        }
1325
1326
1327        @Override
1328        public void newDatabasePDBrev(DatabasePDBrev dbrev) {
1329                //System.out.println("got a database revision:" + dbrev);
1330                SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd",Locale.US);
1331                PDBHeader header = structure.getPDBHeader();
1332
1333                if ( header == null) {
1334                        header = new PDBHeader();
1335                }
1336
1337
1338                if (dbrev.getNum().equals("1")){
1339
1340                        try {
1341                                Date dep = dateFormat.parse(dbrev.getDate_original());
1342                                header.setDepDate(dep);
1343
1344                        } catch (ParseException e){
1345                                logger.warn("Could not parse date string '{}', deposition date will be unavailable", dbrev.getDate_original());
1346                        }
1347
1348                        try {
1349                                Date mod = dateFormat.parse(dbrev.getDate());
1350                                header.setModDate(mod);
1351
1352                        } catch (ParseException e){
1353                                logger.warn("Could not parse date string '{}', modification date will be unavailable", dbrev.getDate());
1354                        }
1355
1356
1357                } else {
1358                        try {
1359
1360                                Date mod = dateFormat.parse(dbrev.getDate());
1361                                header.setModDate(mod);
1362
1363                        } catch (ParseException e){
1364                                logger.warn("Could not parse date string '{}', modification date will be unavailable", dbrev.getDate());
1365                        }
1366                }
1367
1368                structure.setPDBHeader(header);
1369        }
1370
1371        @Override
1372        public void newDatabasePDBremark(DatabasePDBremark remark) {
1373                //System.out.println(remark);
1374                String id = remark.getId();
1375                if (id.equals("2")){
1376
1377                        //this remark field contains the resolution information:
1378                        String line = remark.getText();
1379
1380                        int i = line.indexOf("ANGSTROM");
1381                        if ( i > 5) {
1382                                // line contains ANGSTROM info...
1383                                String resolution = line.substring(i-5,i).trim();
1384                                // convert string to float
1385                                float res = 99 ;
1386                                try {
1387                                        res = Float.parseFloat(resolution);
1388
1389                                } catch (NumberFormatException e) {
1390                                        logger.info("could not parse resolution from line and ignoring it " + line);
1391                                        return ;
1392
1393
1394                                }
1395                                // support for old style header
1396
1397                                PDBHeader pdbHeader = structure.getPDBHeader();
1398                                pdbHeader.setResolution(res);
1399
1400                        }
1401
1402                }
1403        }
1404
1405        @Override
1406        public void newRefine(Refine r){
1407
1408                PDBHeader pdbHeader = structure.getPDBHeader();
1409                // RESOLUTION
1410                // in very rare cases (for instance hybrid methods x-ray + neutron diffraction, e.g. 3ins, 4n9m)
1411                // there are 2 resolution values, one for each method
1412                // we take the last one found so that behaviour is like in PDB file parsing
1413                if (pdbHeader.getResolution()!=PDBHeader.DEFAULT_RESOLUTION) {
1414                        logger.warn("More than 1 resolution value present, will use last one {} and discard previous {} "
1415                                        ,r.getLs_d_res_high(), String.format("%4.2f",pdbHeader.getResolution()));
1416                }
1417                try {
1418                        pdbHeader.setResolution(Float.parseFloat(r.getLs_d_res_high()));
1419                } catch (NumberFormatException e){
1420                        logger.info("Could not parse resolution from " + r.getLs_d_res_high() + " " + e.getMessage());
1421                }
1422
1423
1424                // RFREE
1425                if (pdbHeader.getRfree()!=PDBHeader.DEFAULT_RFREE) {
1426                        logger.warn("More than 1 Rfree value present, will use last one {} and discard previous {} ",
1427                                        r.getLs_R_factor_R_free(), String.format("%4.2f",pdbHeader.getRfree()));
1428                }
1429                if (r.getLs_R_factor_R_free()==null) {
1430                        // some entries like 2ifo haven't got this field at all
1431                        logger.info("_refine.ls_R_factor_R_free not present, not parsing Rfree value");
1432                } else {
1433                        try {
1434                                pdbHeader.setRfree(Float.parseFloat(r.getLs_R_factor_R_free()));
1435                        } catch (NumberFormatException e){
1436                                // no rfree present ('?') is very usual, that's why we set it to debug
1437                                logger.debug("Could not parse Rfree from string '{}'", r.getLs_R_factor_R_free());
1438                        }
1439                }
1440
1441        }
1442
1443
1444        @Override
1445        public void newAuditAuthor(AuditAuthor aa){
1446
1447                String name =  aa.getName();
1448
1449                StringBuffer famName = new StringBuffer();
1450                StringBuffer initials = new StringBuffer();
1451                boolean afterComma = false;
1452                for ( char c: name.toCharArray()) {
1453                        if ( c == ' ')
1454                                continue;
1455                        if ( c == ','){
1456                                afterComma = true;
1457                                continue;
1458                        }
1459
1460                        if ( afterComma)
1461                                initials.append(c);
1462                        else
1463                                famName.append(c);
1464                }
1465
1466                StringBuffer newaa = new StringBuffer();
1467                newaa.append(initials);
1468                newaa.append(famName);
1469
1470                PDBHeader header = structure.getPDBHeader();
1471                String auth = header.getAuthors();
1472                if (auth == null) {
1473                        header.setAuthors(newaa.toString());
1474                }else {
1475                        auth += "," + newaa.toString();
1476                        header.setAuthors(auth);
1477
1478                }
1479        }
1480
1481        @Override
1482        public void newExptl(Exptl exptl) {
1483
1484                PDBHeader pdbHeader = structure.getPDBHeader();
1485                String method = exptl.getMethod();
1486                pdbHeader.setExperimentalTechnique(method);
1487
1488        }
1489
1490        @Override
1491        public void newCell(Cell cell) {
1492
1493                try {
1494                        float a = Float.parseFloat(cell.getLength_a());
1495                        float b = Float.parseFloat(cell.getLength_b());
1496                        float c = Float.parseFloat(cell.getLength_c());
1497                        float alpha = Float.parseFloat(cell.getAngle_alpha());
1498                        float beta = Float.parseFloat(cell.getAngle_beta());
1499                        float gamma = Float.parseFloat(cell.getAngle_gamma());
1500
1501                        CrystalCell xtalCell = new CrystalCell();
1502                        xtalCell.setA(a);
1503                        xtalCell.setB(b);
1504                        xtalCell.setC(c);
1505                        xtalCell.setAlpha(alpha);
1506                        xtalCell.setBeta(beta);
1507                        xtalCell.setGamma(gamma);
1508
1509                        if (!xtalCell.isCellReasonable()) {
1510                                // If the entry describes a structure determined by a technique other than X-ray crystallography,
1511                            // cell is (sometimes!) a = b = c = 1.0, alpha = beta = gamma = 90 degrees
1512                                // if so we don't add and CrystalCell will be null
1513                                logger.debug("The crystal cell read from file does not have reasonable dimensions (at least one dimension is below {}), discarding it.",
1514                                                CrystalCell.MIN_VALID_CELL_SIZE);
1515                                return;
1516                        }
1517
1518                        structure.getPDBHeader().getCrystallographicInfo().setCrystalCell(xtalCell);
1519
1520                } catch (NumberFormatException e){
1521                        structure.getPDBHeader().getCrystallographicInfo().setCrystalCell(null);
1522                        logger.info("could not parse some cell parameters ("+e.getMessage()+"), ignoring _cell ");
1523                }
1524        }
1525
1526        @Override
1527        public void newSymmetry(Symmetry symmetry) {
1528                String spaceGroup = symmetry.getSpace_group_name_H_M();
1529                SpaceGroup sg = SymoplibParser.getSpaceGroup(spaceGroup);
1530                if (sg==null) {
1531                        logger.warn("Space group '"+spaceGroup+"' not recognised as a standard space group");
1532                        structure.getPDBHeader().getCrystallographicInfo().setNonStandardSg(true);
1533                } else {
1534                        structure.getPDBHeader().getCrystallographicInfo().setSpaceGroup(sg);
1535                        structure.getPDBHeader().getCrystallographicInfo().setNonStandardSg(false);
1536                }
1537        }
1538
1539        @Override
1540        public void newStructNcsOper(StructNcsOper sNcsOper) {
1541                structNcsOper.add(sNcsOper);
1542        }
1543        
1544        public void newAtomSites(AtomSites atomSites) {
1545                
1546                try {
1547                        Matrix4d m = new Matrix4d(
1548                                Double.parseDouble(atomSites.getFract_transf_matrix11()), Double.parseDouble(atomSites.getFract_transf_matrix12()), Double.parseDouble(atomSites.getFract_transf_matrix13()), Double.parseDouble(atomSites.getFract_transf_vector1()),
1549                                Double.parseDouble(atomSites.getFract_transf_matrix21()), Double.parseDouble(atomSites.getFract_transf_matrix22()), Double.parseDouble(atomSites.getFract_transf_matrix23()), Double.parseDouble(atomSites.getFract_transf_vector2()),
1550                                Double.parseDouble(atomSites.getFract_transf_matrix31()), Double.parseDouble(atomSites.getFract_transf_matrix32()), Double.parseDouble(atomSites.getFract_transf_matrix33()), Double.parseDouble(atomSites.getFract_transf_vector3()),
1551                                0,0,0,1);
1552
1553                        parsedScaleMatrix = m;
1554                
1555                } catch (NumberFormatException e) {
1556                        logger.warn("Some values in _atom_sites.fract_transf_matrix or _atom_sites.fract_transf_vector could not be parsed as numbers. Can't check whether coordinate frame convention is correct! Error: {}", e.getMessage());
1557                        structure.getPDBHeader().getCrystallographicInfo().setNonStandardCoordFrameConvention(false);
1558                        
1559                        // in this case parsedScaleMatrix stays null and can't be used in documentEnd()
1560                }
1561        }
1562
1563        @Override
1564        public void newStructRef(StructRef sref) {
1565                logger.debug(sref.toString());
1566                strucRefs.add(sref);
1567        }
1568
1569        private StructRef getStructRef(String ref_id){
1570                for (StructRef structRef : strucRefs) {
1571
1572                        if (structRef.getId().equals(ref_id)){
1573                                return structRef;
1574                        }
1575
1576                }
1577                return null;
1578
1579        }
1580
1581        /**
1582         * create a DBRef record from the StrucRefSeq record:
1583         * <pre>
1584         * PDB record                    DBREF
1585         * Field Name                    mmCIF Data Item
1586         * Section                       n.a.
1587         * PDB_ID_Code                   _struct_ref_seq.pdbx_PDB_id_code
1588         * Strand_ID                     _struct_ref_seq.pdbx_strand_id
1589         * Begin_Residue_Number          _struct_ref_seq.pdbx_auth_seq_align_beg
1590         * Begin_Ins_Code                _struct_ref_seq.pdbx_seq_align_beg_ins_code
1591         * End_Residue_Number            _struct_ref_seq.pdbx_auth_seq_align_end
1592         * End_Ins_Code                  _struct_ref_seq.pdbx_seq_align_end_ins_code
1593         * Database                      _struct_ref.db_name
1594         * Database_Accession_No         _struct_ref_seq.pdbx_db_accession
1595         * Database_ID_Code              _struct_ref.db_code
1596         * Database_Begin_Residue_Number _struct_ref_seq.db_align_beg
1597         * Databaes_Begin_Ins_Code       _struct_ref_seq.pdbx_db_align_beg_ins_code
1598         * Database_End_Residue_Number   _struct_ref_seq.db_align_end
1599         * Databaes_End_Ins_Code         _struct_ref_seq.pdbx_db_align_end_ins_code
1600         * </pre>
1601         *
1602         *
1603         */
1604        @Override
1605        public void newStructRefSeq(StructRefSeq sref) {
1606                //if (DEBUG)
1607                //      System.out.println(sref);
1608                DBRef r = new DBRef();
1609
1610
1611                //if (DEBUG)
1612                //      System.out.println( " " + sref.getPdbx_PDB_id_code() + " " + sref.getPdbx_db_accession());
1613                r.setIdCode(sref.getPdbx_PDB_id_code());
1614                r.setDbAccession(sref.getPdbx_db_accession());
1615                r.setDbIdCode(sref.getPdbx_db_accession());
1616
1617                r.setChainId(sref.getPdbx_strand_id());
1618                StructRef structRef = getStructRef(sref.getRef_id());
1619                if (structRef == null){
1620                        logger.warn("could not find StructRef " + sref.getRef_id() + " for StructRefSeq " + sref);
1621                } else {
1622                        r.setDatabase(structRef.getDb_name());
1623                        r.setDbIdCode(structRef.getDb_code());
1624                }
1625
1626
1627                int seqbegin = Integer.parseInt(sref.getPdbx_auth_seq_align_beg());
1628                int seqend   = Integer.parseInt(sref.getPdbx_auth_seq_align_end());
1629                Character begin_ins_code = new Character(sref.getPdbx_seq_align_beg_ins_code().charAt(0));
1630                Character end_ins_code   = new Character(sref.getPdbx_seq_align_end_ins_code().charAt(0));
1631
1632                if (begin_ins_code == '?')
1633                        begin_ins_code = ' ';
1634
1635                if (end_ins_code == '?')
1636                        end_ins_code = ' ';
1637
1638                r.setSeqBegin(seqbegin);
1639                r.setInsertBegin(begin_ins_code);
1640
1641                r.setSeqEnd(seqend);
1642                r.setInsertEnd(end_ins_code);
1643
1644                int dbseqbegin = Integer.parseInt(sref.getDb_align_beg());
1645                int dbseqend   = Integer.parseInt(sref.getDb_align_end());
1646                Character db_begin_in_code = new Character(sref.getPdbx_db_align_beg_ins_code().charAt(0));
1647                Character db_end_in_code   = new Character(sref.getPdbx_db_align_end_ins_code().charAt(0));
1648
1649                if (db_begin_in_code == '?')
1650                        db_begin_in_code = ' ';
1651
1652                if (db_end_in_code == '?')
1653                        db_end_in_code = ' ';
1654
1655
1656                r.setDbSeqBegin(dbseqbegin);
1657                r.setIdbnsBegin(db_begin_in_code);
1658
1659                r.setDbSeqEnd(dbseqend);
1660                r.setIdbnsEnd(db_end_in_code);
1661
1662                List<DBRef> dbrefs = structure.getDBRefs();
1663                if ( dbrefs == null)
1664                        dbrefs = new ArrayList<DBRef>();
1665                dbrefs.add(r);
1666
1667                logger.debug(r.toPDB());
1668
1669                structure.setDBRefs(dbrefs);
1670
1671        }
1672
1673        @Override
1674        public void newStructRefSeqDif(StructRefSeqDif sref) {
1675                sequenceDifs.add(sref);
1676        }
1677
1678        private Chain getEntityChain(String entity_id){
1679
1680                for (Chain chain : entityChains) {
1681                        if ( chain.getChainID().equals(entity_id)){
1682
1683                                return chain;
1684                        }
1685                }
1686                // does not exist yet, so create...
1687
1688                Chain   chain = new ChainImpl();
1689                chain.setChainID(entity_id);
1690                entityChains.add(chain);
1691
1692                return chain;
1693
1694        }
1695
1696        //private Chain getSeqResChain(String chainID){
1697        //      return getChainFromList(seqResChains, chainID);
1698        //}
1699
1700
1701        /**
1702         * Data items in the ENTITY_SRC_GEN category record details of
1703         * the source from which the entity was obtained in cases
1704         * where the source was genetically manipulated.  The
1705         * following are treated separately:  items pertaining to the tissue
1706         * from which the gene was obtained, items pertaining to the host
1707         * organism for gene expression and items pertaining to the actual
1708         * producing organism (plasmid).
1709         */
1710        @Override
1711        public void newEntitySrcGen(EntitySrcGen entitySrcGen){
1712
1713                // add to internal list. Map to Compound object later on...
1714                entitySrcGens.add(entitySrcGen);
1715        }
1716
1717        @Override
1718        public void newEntitySrcNat(EntitySrcNat entitySrcNat){
1719
1720                // add to internal list. Map to Compound object later on...
1721                entitySrcNats.add(entitySrcNat);
1722        }
1723
1724        @Override
1725        public void newEntitySrcSyn(EntitySrcSyn entitySrcSyn){
1726
1727                // add to internal list. Map to Compound object later on...
1728                entitySrcSyns.add(entitySrcSyn);
1729        }
1730
1731        /**
1732         * The EntityPolySeq object provide the amino acid sequence objects for the Entities.
1733         * Later on the entities are mapped to the BioJava Chain and Compound objects.
1734         * @param epolseq the EntityPolySeq record for one amino acid
1735         */
1736        @Override
1737        public void newEntityPolySeq(EntityPolySeq epolseq) {
1738
1739                logger.debug("NEW entity poly seq " + epolseq);
1740
1741                int eId = -1;
1742                try {
1743                        eId = Integer.parseInt(epolseq.getEntity_id());
1744                } catch (NumberFormatException e) {
1745                        logger.warn("Could not parse entity id from EntityPolySeq: "+e.getMessage());
1746                }
1747                Entity e = getEntity(eId);
1748
1749                if (e == null){
1750                        logger.info("Could not find entity "+ epolseq.getEntity_id()+". Can not match sequence to it.");
1751                        return;
1752                }
1753
1754                Chain entityChain = getEntityChain(epolseq.getEntity_id());
1755
1756
1757                // first we check through the chemcomp provider, if it fails we do some heuristics to guess the type of group
1758                // TODO some of this code is analogous to getNewGroup() and we should try to unify them - JD 2016-03-08
1759
1760                Group g = ChemCompGroupFactory.getGroupFromChemCompDictionary(epolseq.getMon_id());
1761                //int seqId = Integer.parseInt(epolseq.getNum());
1762                if ( g != null && !g.getChemComp().isEmpty()) {
1763                        if ( g instanceof AminoAcidImpl) {
1764                                AminoAcidImpl aa = (AminoAcidImpl) g;
1765                                aa.setRecordType(AminoAcid.SEQRESRECORD);
1766                                //aa.setId(seqId);
1767                        }
1768                } else {
1769
1770                        if (epolseq.getMon_id().length()==3 && StructureTools.get1LetterCodeAmino(epolseq.getMon_id())!=null){
1771                                AminoAcidImpl a = new AminoAcidImpl();
1772                                a.setRecordType(AminoAcid.SEQRESRECORD);
1773                                Character code1 = StructureTools.get1LetterCodeAmino(epolseq.getMon_id());
1774                                a.setAminoType(code1);
1775                                g = a;
1776
1777                        } else if ( StructureTools.isNucleotide(epolseq.getMon_id())) {
1778                                // the group is actually a nucleotide group...
1779                                NucleotideImpl n = new NucleotideImpl();
1780                                g = n;
1781
1782                        } else {
1783                                logger.debug("Residue {} {} is not a standard aminoacid or nucleotide, will create a het group for it", epolseq.getNum(),epolseq.getMon_id());
1784                                HetatomImpl h = new HetatomImpl();
1785                                g = h;
1786
1787                        }
1788
1789
1790                }
1791                // at this stage we don't know about author residue numbers (insertion codes)
1792                // we abuse now the ResidueNumber field setting the internal residue numbers (label_seq_id, strictly sequential and follow the seqres sequence 1 to n)
1793                // later the actual ResidueNumbers (author residue numbers) have to be corrected in alignSeqRes()
1794                g.setResidueNumber(ResidueNumber.fromString(epolseq.getNum()));
1795
1796                g.setPDBName(epolseq.getMon_id());
1797
1798                entityChain.addGroup(g);
1799
1800        }
1801
1802        @Override
1803        public void newPdbxPolySeqScheme(PdbxPolySeqScheme ppss) {
1804
1805                //if ( headerOnly)
1806                //      return;
1807
1808                // replace the group asym ids with the real PDB ids!
1809                // replaceGroupSeqPos(ppss);  // This might be incorrect in some pdb, to use auth_seq_id of the pdbx_poly_seq_scheme.
1810
1811                // merge the EntityPolySeq info and the AtomSite chains into one...
1812                //already known ignore:
1813                if (asymStrandId.containsKey(ppss.getAsym_id()))
1814                        return;
1815
1816                // this is one of the internal mmcif rules it seems...
1817                if ( ppss.getPdb_strand_id() == null) {
1818                        asymStrandId.put(ppss.getAsym_id(), ppss.getAuth_mon_id());
1819                        return;
1820                }
1821
1822                //System.out.println(ppss.getAsym_id() + " = " + ppss.getPdb_strand_id());
1823
1824                asymStrandId.put(ppss.getAsym_id(), ppss.getPdb_strand_id());
1825
1826        }
1827
1828
1829        @Override
1830        public void newPdbxNonPolyScheme(PdbxNonPolyScheme ppss) {
1831
1832                //if (headerOnly)
1833                //      return;
1834
1835                // merge the EntityPolySeq info and the AtomSite chains into one...
1836                //already known ignore:
1837                if (asymStrandId.containsKey(ppss.getAsym_id()))
1838                        return;
1839
1840                // this is one of the interal mmcif rules it seems...
1841                if ( ppss.getPdb_strand_id() == null) {
1842                        asymStrandId.put(ppss.getAsym_id(), ppss.getAsym_id());
1843                        return;
1844                }
1845
1846                asymStrandId.put(ppss.getAsym_id(), ppss.getPdb_strand_id());
1847
1848        }
1849
1850        @Override
1851        public void newPdbxEntityNonPoly(PdbxEntityNonPoly pen){
1852                // TODO: do something with them...
1853                // not implemented yet...
1854                //System.out.println(pen.getEntity_id() + " " + pen.getName() + " " + pen.getComp_id());
1855        }
1856
1857        @Override
1858        public void newChemComp(ChemComp c) {
1859                // TODO: do something with them...
1860
1861        }
1862
1863        @Override
1864        public void newGenericData(String category, List<String> loopFields,
1865                        List<String> lineData) {
1866
1867                //logger.debug("unhandled category so far: " + category);
1868        }
1869
1870        @Override
1871        public FileParsingParameters getFileParsingParameters()
1872        {
1873                return params;
1874        }
1875
1876        @Override
1877        public void setFileParsingParameters(FileParsingParameters params)
1878        {
1879                this.params = params;
1880
1881        }
1882
1883        @Override
1884        public void newChemCompDescriptor(ChemCompDescriptor ccd) {
1885
1886                // TODO nothing happening here yet.
1887
1888        }
1889
1890
1891
1892        public List<PdbxStructOperList> getStructOpers() {
1893                return structOpers;
1894        }
1895
1896        @Override
1897        public void newPdbxStrucAssembly(PdbxStructAssembly strucAssembly) {
1898                strucAssemblies.add(strucAssembly);
1899
1900        }
1901
1902        public List<PdbxStructAssembly> getStructAssemblies(){
1903                return strucAssemblies;
1904        }
1905
1906        @Override
1907        public void newPdbxStrucAssemblyGen(PdbxStructAssemblyGen strucAssembly) {
1908                strucAssemblyGens.add(strucAssembly);
1909
1910        }
1911
1912        public List<PdbxStructAssemblyGen> getStructAssemblyGens(){
1913                return strucAssemblyGens;
1914        }
1915
1916        @Override
1917        public void newChemCompAtom(ChemCompAtom atom) {
1918
1919        }
1920
1921        @Override
1922        public void newPdbxChemCompIndentifier(PdbxChemCompIdentifier id) {
1923
1924        }
1925
1926        @Override
1927        public void newChemCompBond(ChemCompBond bond) {
1928
1929        }
1930
1931        @Override
1932        public void newPdbxChemCompDescriptor(PdbxChemCompDescriptor desc) {
1933
1934        }
1935
1936        @Override
1937        public void newStructConn(StructConn structConn) {
1938                this.structConn.add(structConn);
1939        }
1940
1941        @Override
1942        public void newStructSiteGen(StructSiteGen siteGen) { this.structSiteGens.add(siteGen); }
1943
1944        @Override
1945        public void newStructSite(StructSite structSite) {
1946
1947                if (params.isHeaderOnly()) {
1948                        return;
1949                }
1950
1951                // Simply implement the method.
1952                List<Site> sites = structure.getSites();
1953                if (sites == null) sites = new ArrayList<Site>();
1954
1955                Site site = null;
1956                for (Site asite : sites) {
1957                        if (asite.getSiteID().equals(structSite.getId())) {
1958                                site = asite;           // Prevent duplicate siteIds
1959                        }
1960                }
1961                boolean addSite = false;
1962                if (site == null) { site = new Site(); addSite = true; }
1963                site.setSiteID(structSite.getId());
1964                site.setDescription(structSite.getDetails());
1965                // site.setPdbxEvidenceCode(structSite.getPdbxEvidenceCode()); // TODO - add addition fields in Sites
1966                if (addSite) sites.add(site);
1967
1968                structure.setSites(sites);
1969        }
1970
1971        /**
1972         * Build sites in a BioJava Structure using the original author chain id & residue numbers.
1973         * Sites are built from struct_site_gen records that have been parsed.
1974         */
1975        private void addSites() {
1976                List<Site> sites = structure.getSites();
1977                if (sites == null) sites = new ArrayList<Site>();
1978
1979                for (StructSiteGen siteGen : structSiteGens) {
1980                                // For each StructSiteGen, find the residues involved, if they exist then
1981                                String site_id = siteGen.getSite_id(); // multiple could be in same site.
1982                                if (site_id == null) site_id = "";
1983                                String comp_id = siteGen.getLabel_comp_id();  // PDBName
1984                                // Assumption: the author chain ID and residue number for the site is consistent with the original
1985                                // author chain id and residue numbers.
1986                                String chain_id;
1987                                if (params.isUseInternalChainId()){
1988                                                chain_id = siteGen.getLabel_asym_id();
1989                                }
1990                                else {
1991                                        chain_id = siteGen.getAuth_asym_id(); // ChainID
1992                                }
1993                                String auth_seq_id = siteGen.getAuth_seq_id(); // Res num
1994
1995                                String insCode = siteGen.getPdbx_auth_ins_code();
1996                                if ( insCode != null && insCode.equals("?"))
1997                                        insCode = null;
1998
1999                                // Look for asymID = chainID and seqID = seq_ID.  Check that comp_id matches the resname.
2000                                Group g = null;
2001                                try {
2002                                        Chain chain = structure.getChainByPDB(chain_id);
2003                                        if (null != chain) {
2004                                                try {
2005                                                        Character insChar = null;
2006                                                        if (null != insCode && insCode.length() > 0) insChar = insCode.charAt(0);
2007                                                        g = chain.getGroupByPDB(new ResidueNumber(chain_id, Integer.parseInt(auth_seq_id), insChar));
2008                                                } catch (NumberFormatException e) {
2009                                                        logger.warn("Could not lookup residue : " + chain_id + auth_seq_id);
2010                                                }
2011                                        }
2012                                } catch (StructureException e) {
2013                                        logger.warn("Problem finding residue in site entry " + siteGen.getSite_id() + " - " + e.getMessage(), e.getMessage());
2014                                }
2015
2016                                if (g != null) {
2017                                        // 2. find the site_id, if not existing, create anew.
2018                                        Site site = null;
2019                                        for (Site asite: sites) {
2020                                                if (site_id.equals(asite.getSiteID())) site = asite;
2021                                        }
2022
2023                                        boolean addSite = false;
2024
2025                                        // 3. add this residue to the site.
2026                                        if (site == null) {
2027                                                addSite = true;
2028                                                site = new Site();
2029                                                site.setSiteID(site_id);
2030                                        }
2031
2032                                        List<Group> groups = site.getGroups();
2033                                        if (groups == null) groups = new ArrayList<Group>();
2034
2035                                        // Check the self-consistency of the residue reference from auth_seq_id and chain_id
2036                                        if (!comp_id.equals(g.getPDBName())) {
2037                                                logger.warn("comp_id doesn't match the residue at " + chain_id + auth_seq_id + " - skipping");
2038                                        } else {
2039                                                groups.add(g);
2040                                                site.setGroups(groups);
2041                                        }
2042                                        if (addSite) sites.add(site);
2043                                }
2044                }
2045                structure.setSites(sites);
2046        }
2047}