001package org.biojava.nbio.structure.io.cif;
002
003import java.time.LocalDate;
004import java.time.ZoneId;
005import java.time.format.DateTimeFormatter;
006import java.time.format.DateTimeFormatterBuilder;
007import java.util.ArrayList;
008import java.util.Date;
009import java.util.HashMap;
010import java.util.LinkedHashMap;
011import java.util.List;
012import java.util.Locale;
013import java.util.Map;
014import java.util.NoSuchElementException;
015import java.util.Optional;
016import java.util.OptionalInt;
017import java.util.stream.Collectors;
018import java.util.stream.IntStream;
019
020import javax.vecmath.Matrix4d;
021
022import org.biojava.nbio.structure.AminoAcid;
023import org.biojava.nbio.structure.AminoAcidImpl;
024import org.biojava.nbio.structure.Atom;
025import org.biojava.nbio.structure.AtomImpl;
026import org.biojava.nbio.structure.Chain;
027import org.biojava.nbio.structure.ChainImpl;
028import org.biojava.nbio.structure.DBRef;
029import org.biojava.nbio.structure.Element;
030import org.biojava.nbio.structure.EntityInfo;
031import org.biojava.nbio.structure.EntityType;
032import org.biojava.nbio.structure.Group;
033import org.biojava.nbio.structure.GroupType;
034import org.biojava.nbio.structure.HetatomImpl;
035import org.biojava.nbio.structure.NucleotideImpl;
036import org.biojava.nbio.structure.PDBCrystallographicInfo;
037import org.biojava.nbio.structure.PDBHeader;
038import org.biojava.nbio.structure.PdbId;
039import org.biojava.nbio.structure.ResidueNumber;
040import org.biojava.nbio.structure.SeqMisMatch;
041import org.biojava.nbio.structure.SeqMisMatchImpl;
042import org.biojava.nbio.structure.Site;
043import org.biojava.nbio.structure.Structure;
044import org.biojava.nbio.structure.StructureException;
045import org.biojava.nbio.structure.StructureImpl;
046import org.biojava.nbio.structure.StructureTools;
047import org.biojava.nbio.structure.chem.ChemCompGroupFactory;
048import org.biojava.nbio.structure.io.BondMaker;
049import org.biojava.nbio.structure.io.ChargeAdder;
050import org.biojava.nbio.structure.io.EntityFinder;
051import org.biojava.nbio.structure.io.FileParsingParameters;
052import org.biojava.nbio.structure.io.SeqRes2AtomAligner;
053import org.biojava.nbio.structure.quaternary.BioAssemblyInfo;
054import org.biojava.nbio.structure.quaternary.BiologicalAssemblyBuilder;
055import org.biojava.nbio.structure.quaternary.BiologicalAssemblyTransformation;
056import org.biojava.nbio.structure.xtal.CrystalCell;
057import org.biojava.nbio.structure.xtal.SpaceGroup;
058import org.biojava.nbio.structure.xtal.SymoplibParser;
059import org.rcsb.cif.model.FloatColumn;
060import org.rcsb.cif.model.IntColumn;
061import org.rcsb.cif.model.StrColumn;
062import org.rcsb.cif.model.ValueKind;
063import org.rcsb.cif.schema.mm.AtomSite;
064import org.rcsb.cif.schema.mm.AtomSites;
065import org.rcsb.cif.schema.mm.AuditAuthor;
066import org.rcsb.cif.schema.mm.Cell;
067import org.rcsb.cif.schema.mm.ChemComp;
068import org.rcsb.cif.schema.mm.ChemCompBond;
069import org.rcsb.cif.schema.mm.DatabasePDBRemark;
070import org.rcsb.cif.schema.mm.DatabasePDBRev;
071import org.rcsb.cif.schema.mm.DatabasePDBRevRecord;
072import org.rcsb.cif.schema.mm.Em3dReconstruction;
073import org.rcsb.cif.schema.mm.Entity;
074import org.rcsb.cif.schema.mm.EntityPoly;
075import org.rcsb.cif.schema.mm.EntityPolySeq;
076import org.rcsb.cif.schema.mm.EntitySrcGen;
077import org.rcsb.cif.schema.mm.EntitySrcNat;
078import org.rcsb.cif.schema.mm.Exptl;
079import org.rcsb.cif.schema.mm.PdbxAuditRevisionHistory;
080import org.rcsb.cif.schema.mm.PdbxChemCompIdentifier;
081import org.rcsb.cif.schema.mm.PdbxDatabaseStatus;
082import org.rcsb.cif.schema.mm.PdbxEntityBranchDescriptor;
083import org.rcsb.cif.schema.mm.PdbxEntitySrcSyn;
084import org.rcsb.cif.schema.mm.PdbxMolecule;
085import org.rcsb.cif.schema.mm.PdbxMoleculeFeatures;
086import org.rcsb.cif.schema.mm.PdbxNonpolyScheme;
087import org.rcsb.cif.schema.mm.PdbxReferenceEntityLink;
088import org.rcsb.cif.schema.mm.PdbxReferenceEntityList;
089import org.rcsb.cif.schema.mm.PdbxReferenceEntityPolyLink;
090import org.rcsb.cif.schema.mm.PdbxStructAssembly;
091import org.rcsb.cif.schema.mm.PdbxStructAssemblyGen;
092import org.rcsb.cif.schema.mm.PdbxStructModResidue;
093import org.rcsb.cif.schema.mm.PdbxStructOperList;
094import org.rcsb.cif.schema.mm.Refine;
095import org.rcsb.cif.schema.mm.Struct;
096import org.rcsb.cif.schema.mm.StructAsym;
097import org.rcsb.cif.schema.mm.StructConf;
098import org.rcsb.cif.schema.mm.StructConn;
099import org.rcsb.cif.schema.mm.StructConnType;
100import org.rcsb.cif.schema.mm.StructKeywords;
101import org.rcsb.cif.schema.mm.StructNcsOper;
102import org.rcsb.cif.schema.mm.StructRef;
103import org.rcsb.cif.schema.mm.StructRefSeq;
104import org.rcsb.cif.schema.mm.StructRefSeqDif;
105import org.rcsb.cif.schema.mm.StructSheetRange;
106import org.rcsb.cif.schema.mm.StructSite;
107import org.rcsb.cif.schema.mm.StructSiteGen;
108import org.rcsb.cif.schema.mm.Symmetry;
109import org.slf4j.Logger;
110import org.slf4j.LoggerFactory;
111
112/**
113 * An implementation of a CifFileConsumer for BioJava. Will process the information provided by a CifFile instance and
114 * use it to build up a {@link Structure} object.
115 * @author Sebastian Bittrich
116 * @since 6.0.0
117 */
118public class CifStructureConsumerImpl implements CifStructureConsumer {
    private static final Logger logger = LoggerFactory.getLogger(CifStructureConsumerImpl.class);
    // Case-insensitive "yyyy-MM-dd" formatter shared by all date parsing in this class.
    private static final DateTimeFormatter DATE_FORMAT = new DateTimeFormatterBuilder()
            .parseCaseInsensitive()
            .appendPattern("yyyy-MM-dd")
            .toFormatter(Locale.US);

    // The structure being built; created in prepare().
    private Structure structure;
    // Cursor of consumeAtomSite(): the chain and the group that atoms are currently appended to.
    private Chain currentChain;
    private Group currentGroup;
    // Models that consumeAtomSite() has closed so far (a model is closed when the model number changes).
    private List<List<Chain>> allModels;
    // Chains of the model that consumeAtomSite() is currently filling.
    private List<Chain> currentModel;
    // Header attached to the structure in prepare(); most consume* methods write into it.
    private PDBHeader pdbHeader;
    // Model number of the most recently read atom_site row, null until the first row is read.
    private String currentNmrModelNumber;
    // Stored by consumeEm3dReconstruction().
        private Em3dReconstruction em3dReconstruction;
    // One chain per entity id, created on demand by getEntityChain() and filled by consumeEntityPolySeq().
    private List<Chain> entityChains;

    // Categories kept as-is by their consume* methods. Only the setters for entity, entityPoly, entitySrcGen,
    // entitySrcNat and entitySrcSyn are visible in this part of the file; the remaining fields are presumably
    // assigned and evaluated further down - TODO confirm.
    private Entity entity;
    private EntityPoly entityPoly;
    private EntitySrcGen entitySrcGen;
    private EntitySrcNat entitySrcNat;
    private PdbxEntitySrcSyn entitySrcSyn;
    // Initialised to an empty list in prepare().
    private List<Chain> seqResChains;
    private PdbxStructAssembly structAssembly;
    private PdbxStructAssemblyGen structAssemblyGen;
    private StructAsym structAsym;
    private StructConn structConn;
    private StructNcsOper structNcsOper;
    private PdbxStructOperList structOpers;
    private StructRef structRef;
    private StructRefSeqDif structRefSeqDif;
    private StructSiteGen structSiteGen;

    // Lookup tables keyed by asym id; initialised empty in prepare().
    private Map<String, String> asymId2entityId;
    private Map<String, String> asymId2authorId;
    // Fractional transformation matrix read by consumeAtomSites(); stays unset if the values cannot be parsed.
    private Matrix4d parsedScaleMatrix;

    // Parsing options handed to the constructor (e.g. header-only and CA-only switches).
    private final FileParsingParameters params;
156
157    public CifStructureConsumerImpl(FileParsingParameters params) {
158        this.params = params;
159    }
160
161    @Override
162    public void prepare() {
163        this.structure = new StructureImpl();
164        this.pdbHeader = new PDBHeader();
165        structure.setPDBHeader(pdbHeader);
166
167        this.allModels = new ArrayList<>();
168        this.currentModel = new ArrayList<>();
169
170        this.seqResChains  = new ArrayList<>();
171        this.asymId2entityId = new HashMap<>();
172        this.asymId2authorId = new HashMap<>();
173
174        this.entityChains = new ArrayList<>();
175    }
176
177    @Override
178    public void consumeAtomSite(AtomSite atomSite) {
179        if (params.isHeaderOnly()) {
180            return;
181        }
182
183        StrColumn labelAsymId = atomSite.getLabelAsymId();
184        StrColumn authAsymId = atomSite.getAuthAsymId();
185
186        StrColumn groupPDB = atomSite.getGroupPDB();
187        IntColumn authSeqId = atomSite.getAuthSeqId();
188
189        StrColumn labelCompId = atomSite.getLabelCompId();
190
191        IntColumn id = atomSite.getId();
192        StrColumn labelAtomId = atomSite.getLabelAtomId();
193
194        FloatColumn cartnX = atomSite.getCartnX();
195        FloatColumn cartnY = atomSite.getCartnY();
196        FloatColumn cartnZ = atomSite.getCartnZ();
197
198        FloatColumn occupancy = atomSite.getOccupancy();
199        FloatColumn bIsoOrEquiv = atomSite.getBIsoOrEquiv();
200
201        StrColumn labelAltId = atomSite.getLabelAltId();
202        StrColumn typeSymbol = atomSite.getTypeSymbol();
203
204        StrColumn pdbxPDBInsCode = atomSite.getPdbxPDBInsCode();
205        IntColumn labelSeqId = atomSite.getLabelSeqId();
206        IntColumn pdbx_pdb_model_num = atomSite.getPdbxPDBModelNum();
207
208        for (int atomIndex = 0; atomIndex < atomSite.getRowCount(); atomIndex++) {
209            boolean startOfNewChain = false;
210            Character oneLetterCode = StructureTools.get1LetterCodeAmino(labelCompId.get(atomIndex));
211
212            boolean isHetAtmInFile = false;
213            if (!"ATOM".equals(groupPDB.get(atomIndex))) {
214                if (oneLetterCode != null && oneLetterCode.equals(StructureTools.UNKNOWN_GROUP_LABEL)) {
215                    oneLetterCode = null;
216                }
217
218                isHetAtmInFile = true;
219            }
220
221            String insCodeString = pdbxPDBInsCode.get(atomIndex);
222            Character insCode = null;
223            if (insCodeString != null && !insCodeString.isEmpty() && !"?".equals(insCodeString)) {
224                insCode = insCodeString.charAt(0);
225            }
226
227            // non polymer chains (ligands and small molecules) will have a label_seq_id set to '.'
228            long seqId = labelSeqId.get(atomIndex);
229
230            String nmrModelNumber = pdbx_pdb_model_num.getStringData(atomIndex);
231
232            if (currentNmrModelNumber == null) {
233                currentNmrModelNumber = nmrModelNumber;
234            }
235            if (!currentNmrModelNumber.equals(nmrModelNumber)) {
236                currentNmrModelNumber = nmrModelNumber;
237
238                if (currentChain != null) {
239                    currentChain.addGroup(currentGroup);
240                    currentGroup.trimToSize();
241                }
242
243                allModels.add(currentModel);
244                currentModel = new ArrayList<>();
245                currentChain = null;
246                currentGroup = null;
247            }
248
249            String asymId = labelAsymId.get(atomIndex);
250            String authId = authAsymId.get(atomIndex);
251            if (currentChain == null) {
252                currentChain = new ChainImpl();
253                currentChain.setName(authId);
254                currentChain.setId(asymId);
255                currentModel.add(currentChain);
256                startOfNewChain = true;
257            }
258
259            if (!asymId.equals(currentChain.getId())) {
260                startOfNewChain = true;
261
262                currentChain.addGroup(currentGroup);
263
264                Optional<Chain> testChain = currentModel.stream()
265                        .filter(chain -> chain.getId().equals(asymId))
266                        .findFirst();
267
268                if (testChain.isPresent()) {
269                    currentChain = testChain.get();
270                } else {
271                    currentChain = new ChainImpl();
272                    currentChain.setName(authId);
273                    currentChain.setId(asymId);
274                }
275
276                if (!currentModel.contains(currentChain)) {
277                    currentModel.add(currentChain);
278                }
279            }
280
281            ResidueNumber residueNumber = new ResidueNumber(authId, authSeqId.get(atomIndex), insCode);
282
283            String recordName = groupPDB.get(atomIndex);
284            String compId = labelCompId.get(atomIndex);
285            if (currentGroup == null) {
286                currentGroup = createGroup(recordName, oneLetterCode, compId, seqId);
287                currentGroup.setResidueNumber(residueNumber);
288                currentGroup.setPDBName(compId);
289                currentGroup.setHetAtomInFile(isHetAtmInFile);
290            }
291
292            Group altGroup = null;
293            String altLocation = labelAltId.get(atomIndex);
294
295            if (startOfNewChain) {
296                currentGroup = createGroup(recordName, oneLetterCode, compId, seqId);
297                currentGroup.setResidueNumber(residueNumber);
298                currentGroup.setPDBName(compId);
299                currentGroup.setHetAtomInFile(isHetAtmInFile);
300            } else {
301                if (!residueNumber.equals(currentGroup.getResidueNumber())) {
302                    currentChain.addGroup(currentGroup);
303                    currentGroup.trimToSize();
304                    currentGroup = createGroup(recordName, oneLetterCode, compId, seqId);
305                    currentGroup.setPDBName(compId);
306                    currentGroup.setResidueNumber(residueNumber);
307                    currentGroup.setHetAtomInFile(isHetAtmInFile);
308                } else {
309                    if (altLocation != null && !altLocation.isEmpty() && !altLocation.equals(".")) {
310                        altGroup = getAltLocGroup(recordName, altLocation.charAt(0), oneLetterCode, compId, seqId);
311                        if (altGroup.getChain() == null) {
312                            altGroup.setChain(currentChain);
313                        }
314                    }
315                }
316            }
317
318            if (params.isParseCAOnly()) {
319                if (!labelAtomId.get(atomIndex).equals(StructureTools.CA_ATOM_NAME) && "C".equals(typeSymbol.get(atomIndex))) {
320                    continue;
321                }
322            }
323
324            Atom atom = new AtomImpl();
325
326            atom.setPDBserial(id.get(atomIndex));
327            atom.setName(labelAtomId.get(atomIndex));
328
329            atom.setX(cartnX.get(atomIndex));
330            atom.setY(cartnY.get(atomIndex));
331            atom.setZ(cartnZ.get(atomIndex));
332
333            atom.setOccupancy((float) occupancy.get(atomIndex));
334            atom.setTempFactor((float) bIsoOrEquiv.get(atomIndex));
335
336            if (altLocation == null || altLocation.isEmpty() || altLocation.equals(".")) {
337                atom.setAltLoc(' ');
338            } else {
339                atom.setAltLoc(altLocation.charAt(0));
340            }
341
342            String ts = typeSymbol.get(atomIndex);
343            try {
344                Element element = Element.valueOfIgnoreCase(ts);
345                atom.setElement(element);
346            }  catch (IllegalArgumentException e) {
347                logger.info("Element {} was not recognised as a BioJava-known element, the element will be " +
348                        "represented as the generic element {}", ts, Element.R.name());
349                atom.setElement(Element.R);
350            }
351
352            if (altGroup != null) {
353                altGroup.addAtom(atom);
354            } else {
355                currentGroup.addAtom(atom);
356            }
357
358            String atomName = atom.getName();
359            if (!currentGroup.hasAtom(atomName)) {
360                if (currentGroup.getPDBName().equals(atom.getGroup().getPDBName())) {
361                    if (!StructureTools.hasNonDeuteratedEquiv(atom, currentGroup)) {
362                        currentGroup.addAtom(atom);
363                    }
364                }
365            }
366        }
367    }
368
369    private Group getAltLocGroup(String recordName, Character altLoc, Character oneLetterCode, String threeLetterCode,
370                                 long seqId) {
371        List<Atom> atoms = currentGroup.getAtoms();
372        if (atoms.size() > 0) {
373            if (atoms.get(0).getAltLoc().equals(altLoc)) {
374                return currentGroup;
375            }
376        }
377
378        List<Group> altLocs = currentGroup.getAltLocs();
379        for (Group altLocGroup : altLocs) {
380            atoms = altLocGroup.getAtoms();
381            if (atoms.size() > 0) {
382                for (Atom a1 : atoms) {
383                    if (a1.getAltLoc().equals(altLoc)) {
384                        return altLocGroup;
385                    }
386                }
387            }
388        }
389
390        if (threeLetterCode.equals(currentGroup.getPDBName())) {
391            if (currentGroup.getAtoms().isEmpty()) {
392                return currentGroup;
393            }
394
395            Group altLocGroup = (Group) currentGroup.clone();
396            altLocGroup.setAtoms(new ArrayList<>());
397            altLocGroup.getAltLocs().clear();
398            currentGroup.addAltLoc(altLocGroup);
399            return altLocGroup;
400        }
401
402        Group altLocGroup = createGroup(recordName, oneLetterCode, threeLetterCode, seqId);
403        altLocGroup.setPDBName(threeLetterCode);
404        altLocGroup.setResidueNumber(currentGroup.getResidueNumber());
405        currentGroup.addAltLoc(altLocGroup);
406        return altLocGroup;
407    }
408
409    private Group createGroup(String record, Character oneLetterCode, String threeLetterCode, long seqId) {
410        Group group = ChemCompGroupFactory.getGroupFromChemCompDictionary(threeLetterCode);
411        if (group != null && !group.getChemComp().isEmpty()) {
412            if (group instanceof AminoAcidImpl) {
413                AminoAcidImpl aminoAcid = (AminoAcidImpl) group;
414                aminoAcid.setId(seqId);
415            } else if (group instanceof NucleotideImpl) {
416                NucleotideImpl nucleotide = (NucleotideImpl) group;
417                nucleotide.setId(seqId);
418            } else if (group instanceof HetatomImpl) {
419                HetatomImpl hetatom = (HetatomImpl) group;
420                hetatom.setId(seqId);
421            }
422            return group;
423        }
424
425        if ("ATOM".equals(record)) {
426            if (StructureTools.isNucleotide(threeLetterCode)) {
427                NucleotideImpl nucleotide = new NucleotideImpl();
428                group = nucleotide;
429                nucleotide.setId(seqId);
430            } else if (oneLetterCode == null || oneLetterCode == StructureTools.UNKNOWN_GROUP_LABEL) {
431                HetatomImpl hetatom = new HetatomImpl();
432                group = hetatom;
433                hetatom.setId(seqId);
434            } else {
435                AminoAcidImpl aminoAcid = new AminoAcidImpl();
436                group = aminoAcid;
437                aminoAcid.setAminoType(oneLetterCode);
438                aminoAcid.setId(seqId);
439            }
440        } else {
441            if (StructureTools.isNucleotide(threeLetterCode)) {
442                NucleotideImpl nucleotide = new NucleotideImpl();
443                group = nucleotide;
444                nucleotide.setId(seqId);
445            } else if (oneLetterCode != null) {
446                AminoAcidImpl aminoAcid = new AminoAcidImpl();
447                group = aminoAcid;
448                aminoAcid.setAminoType(oneLetterCode);
449                aminoAcid.setId(seqId);
450            } else {
451                HetatomImpl hetatom = new HetatomImpl();
452                hetatom.setId(seqId);
453                group = hetatom;
454            }
455        }
456        return group;
457    }
458
459    @Override
460    public void consumeAtomSites(AtomSites atomSites) {
461        // no atom sites present
462        if (!atomSites.isDefined() || atomSites.getRowCount() == 0) {
463            return;
464        }
465
466        try {
467            parsedScaleMatrix = new Matrix4d(
468                    atomSites.getFractTransfMatrix11().get(0),
469                    atomSites.getFractTransfMatrix12().get(0),
470                    atomSites.getFractTransfMatrix13().get(0),
471                    atomSites.getFractTransfVector1().get(0),
472
473                    atomSites.getFractTransfMatrix21().get(0),
474                    atomSites.getFractTransfMatrix22().get(0),
475                    atomSites.getFractTransfMatrix23().get(0),
476                    atomSites.getFractTransfVector2().get(0),
477
478                    atomSites.getFractTransfMatrix31().get(0),
479                    atomSites.getFractTransfMatrix32().get(0),
480                    atomSites.getFractTransfMatrix33().get(0),
481                    atomSites.getFractTransfVector3().get(0),
482
483                    0,
484                    0,
485                    0,
486                    1
487            );
488        } catch (NumberFormatException e) {
489            logger.warn("Some values in _atom_sites.fract_transf_matrix or _atom_sites.fract_transf_vector could not " +
490                    "be parsed as numbers. Can't check whether coordinate frame convention is correct! Error: {}",
491                    e.getMessage());
492            structure.getPDBHeader().getCrystallographicInfo().setNonStandardCoordFrameConvention(false);
493        }
494    }
495
496    @Override
497    public void consumeAuditAuthor(AuditAuthor auditAuthor) {
498        for (int rowIndex = 0; rowIndex < auditAuthor.getRowCount(); rowIndex++) {
499            String name = auditAuthor.getName().get(rowIndex);
500
501            StringBuilder last = new StringBuilder();
502            StringBuilder initials = new StringBuilder();
503            boolean afterComma = false;
504            for (char c : name.toCharArray()) {
505                if (c == ' ') {
506                    continue;
507                }
508                if (c == ',') {
509                    afterComma = true;
510                    continue;
511                }
512
513                if (afterComma) {
514                    initials.append(c);
515                } else {
516                    last.append(c);
517                }
518            }
519
520            StringBuilder newaa = new StringBuilder();
521            newaa.append(initials);
522            newaa.append(last);
523
524            String auth = pdbHeader.getAuthors();
525            if (auth == null) {
526                pdbHeader.setAuthors(newaa.toString());
527            } else {
528                auth += "," + newaa.toString();
529                pdbHeader.setAuthors(auth);
530            }
531        }
532    }
533
534    @Override
535    public void consumeCell(Cell cell) {
536        if (!cell.isDefined() || cell.getRowCount() == 0) {
537            return;
538        }
539
540        try {
541            float a = (float) cell.getLengthA().get(0);
542            float b = (float) cell.getLengthB().get(0);
543            float c = (float) cell.getLengthC().get(0);
544            float alpha = (float) cell.getAngleAlpha().get(0);
545            float beta = (float) cell.getAngleBeta().get(0);
546            float gamma = (float) cell.getAngleGamma().get(0);
547
548            CrystalCell crystalCell = new CrystalCell();
549            crystalCell.setA(a);
550            crystalCell.setB(b);
551            crystalCell.setC(c);
552            crystalCell.setAlpha(alpha);
553            crystalCell.setBeta(beta);
554            crystalCell.setGamma(gamma);
555
556            if (!crystalCell.isCellReasonable()) {
557                // If the entry describes a structure determined by a technique other than X-ray crystallography,
558                // cell is (sometimes!) a = b = c = 1.0, alpha = beta = gamma = 90 degrees
559                // if so we don't add and CrystalCell will be null
560                logger.debug("The crystal cell read from file does not have reasonable dimensions (at least one " +
561                        "dimension is below {}), discarding it.", CrystalCell.MIN_VALID_CELL_SIZE);
562                return;
563            }
564
565            structure.getPDBHeader()
566                    .getCrystallographicInfo()
567                    .setCrystalCell(crystalCell);
568
569        } catch (NumberFormatException e){
570            structure.getPDBHeader()
571                    .getCrystallographicInfo()
572                    .setCrystalCell(null);
573            logger.info("could not parse some cell parameters ({}), ignoring _cell", e.getMessage());
574        }
575    }
576
577    @Override
578    public void consumeChemComp(ChemComp chemComp) {
579        // TODO not impled in ref
580    }
581
582    @Override
583    public void consumeChemCompBond(ChemCompBond chemCompBond) {
584        // TODO not impled in ref
585    }
586
587    @Override
588    public void consumeDatabasePDBRemark(DatabasePDBRemark databasePDBremark) {
589        for (int rowIndex = 0; rowIndex < databasePDBremark.getRowCount(); rowIndex++) {
590            int id = databasePDBremark.getId().get(rowIndex);
591            if (id == 2) {
592                String line = databasePDBremark.getText().get(rowIndex);
593                int i = line.indexOf("ANGSTROM");
594
595                if (i > 5) {
596                    // line contains ANGSTROM info...
597                    String resolution = line.substring(i - 5, i).trim();
598                    // convert string to float
599                    try {
600                        float res = Float.parseFloat(resolution);
601                        pdbHeader.setResolution(res);
602                    } catch (NumberFormatException e) {
603                        logger.info("could not parse resolution from line and ignoring it {}", line);
604                        return;
605                    }
606                }
607            }
608        }
609    }
610
611    private Date convert(LocalDate localDate) {
612        return Date.from(localDate.atStartOfDay().atZone(ZoneId.systemDefault()).toInstant());
613    }
614
615    @Override
616    public void consumeDatabasePDBRev(DatabasePDBRev databasePDBrev) {
617        logger.debug("got a database revision:" + databasePDBrev);
618
619        Date modDate = null;
620        for (int rowIndex = 0; rowIndex < databasePDBrev.getRowCount(); rowIndex++) {
621            if (databasePDBrev.getNum().get(rowIndex) == 1) {
622                String dateOriginal = databasePDBrev.getDateOriginal().get(rowIndex);
623                pdbHeader.setDepDate(convert(LocalDate.parse(dateOriginal, DATE_FORMAT)));
624
625                String date = databasePDBrev.getDate().get(rowIndex);
626                final Date relDate = convert(LocalDate.parse(date, DATE_FORMAT));
627                pdbHeader.setRelDate(relDate);
628                modDate = relDate;
629            } else {
630                String dbrev = databasePDBrev.getDate().get(rowIndex);
631                modDate = convert(LocalDate.parse(dbrev, DATE_FORMAT));
632            }
633            pdbHeader.setModDate(modDate);
634        }
635    }
636
637    @Override
638    public void consumeDatabasePDBRevRecord(DatabasePDBRevRecord databasePDBrevRecord) {
639        List<org.biojava.nbio.structure.DatabasePDBRevRecord> revRecords = pdbHeader.getRevisionRecords();
640        if (revRecords == null) {
641            revRecords = new ArrayList<>();
642            pdbHeader.setRevisionRecords(revRecords);
643        }
644
645        for (int i = 0; i < databasePDBrevRecord.getRowCount(); i++) {
646            revRecords.add(new org.biojava.nbio.structure.DatabasePDBRevRecord(databasePDBrevRecord, i));
647        }
648    }
649    
650    @Override
651    public void consumeEm3dReconstruction(Em3dReconstruction em3dReconstruction) {
652        this.em3dReconstruction = em3dReconstruction;
653        
654        for (int rowIndex = 0; rowIndex < em3dReconstruction.getRowCount(); rowIndex++) { //can it have more than 1 value?
655                final FloatColumn resolution = em3dReconstruction.getResolution();
656                        if (ValueKind.PRESENT.equals(resolution.getValueKind(rowIndex)))
657                        pdbHeader.setResolution((float) resolution.get(rowIndex));
658        }
659        //TODO other fields (maybe RFree)?
660    }
661
662    @Override
663    public void consumeEntity(Entity entity) {
664        this.entity = entity;
665    }
666
667    @Override
668    public void consumeEntityPoly(EntityPoly entityPoly) {
669        this.entityPoly = entityPoly;
670    }
671
672    @Override
673    public void consumeEntitySrcGen(EntitySrcGen entitySrcGen) {
674        this.entitySrcGen = entitySrcGen;
675    }
676
677    @Override
678    public void consumeEntitySrcNat(EntitySrcNat entitySrcNat) {
679        this.entitySrcNat = entitySrcNat;
680    }
681
682    @Override
683    public void consumeEntitySrcSyn(PdbxEntitySrcSyn entitySrcSyn) {
684        this.entitySrcSyn = entitySrcSyn;
685    }
686
687    @Override
688    public void consumeEntityPolySeq(EntityPolySeq entityPolySeq) {
689        for (int rowIndex = 0; rowIndex < entityPolySeq.getRowCount(); rowIndex++) {
690            Chain entityChain = getEntityChain(entityPolySeq.getEntityId().get(rowIndex));
691
692            // first we check through the chemcomp provider, if it fails we do some heuristics to guess the type of group
693            // TODO some of this code is analogous to getNewGroup() and we should try to unify them - JD 2016-03-08
694
695            Group g = ChemCompGroupFactory.getGroupFromChemCompDictionary(entityPolySeq.getMonId().get(rowIndex));
696            //int seqId = Integer.parseInt(entityPolySeq.getNum());
697            if (g != null && !g.getChemComp().isEmpty()) {
698                if (g instanceof AminoAcidImpl) {
699                    AminoAcidImpl aa = (AminoAcidImpl) g;
700                    aa.setRecordType(AminoAcid.SEQRESRECORD);
701                }
702            } else {
703                if (entityPolySeq.getMonId().get(rowIndex).length() == 3 &&
704                        StructureTools.get1LetterCodeAmino(entityPolySeq.getMonId().get(rowIndex)) != null) {
705                    AminoAcidImpl a = new AminoAcidImpl();
706                    a.setRecordType(AminoAcid.SEQRESRECORD);
707                    Character code1 = StructureTools.get1LetterCodeAmino(entityPolySeq.getMonId().get(rowIndex));
708                    a.setAminoType(code1);
709                    g = a;
710
711                } else if (StructureTools.isNucleotide(entityPolySeq.getMonId().get(rowIndex))) {
712                    // the group is actually a nucleotide group...
713                    g = new NucleotideImpl();
714                } else {
715                    logger.debug("Residue {} {} is not a standard aminoacid or nucleotide, will create a het group " +
716                            "for it", entityPolySeq.getNum().get(rowIndex), entityPolySeq.getMonId().get(rowIndex));
717                    g = new HetatomImpl();
718                }
719            }
720            // at this stage we don't know about author residue numbers (insertion codes)
721            // we abuse now the ResidueNumber field setting the internal residue numbers (label_seq_id, strictly
722            // sequential and follow the seqres sequence 1 to n)
723            // later the actual ResidueNumbers (author residue numbers) have to be corrected in alignSeqRes()
724            g.setResidueNumber(ResidueNumber.fromString(entityPolySeq.getNum().getStringData(rowIndex)));
725            g.setPDBName(entityPolySeq.getMonId().get(rowIndex));
726            entityChain.addGroup(g);
727        }
728    }
729
730    private Chain getEntityChain(String entityId) {
731        for (Chain chain : entityChains) {
732            if (chain.getId().equals(entityId)) {
733                return chain;
734            }
735        }
736
737        // does not exist yet, so create...
738        Chain chain = new ChainImpl();
739        chain.setId(entityId);
740        entityChains.add(chain);
741
742        return chain;
743    }
744
745    @Override
746    public void consumeExptl(Exptl exptl) {
747        for (int rowIndex = 0; rowIndex < exptl.getRowCount(); rowIndex++) {
748            pdbHeader.setExperimentalTechnique(exptl.getMethod().get(rowIndex));
749        }
750    }
751
752    @Override
753    public void consumePdbxAuditRevisionHistory(PdbxAuditRevisionHistory pdbxAuditRevisionHistory) {
754        Date date = null;
755        for (int rowIndex = 0; rowIndex < pdbxAuditRevisionHistory.getRowCount(); rowIndex++) {
756            // first entry in revision history is the release date
757            if (pdbxAuditRevisionHistory.getOrdinal().get(rowIndex) == 1) {
758                String release = pdbxAuditRevisionHistory.getRevisionDate().get(rowIndex);
759                date = convert(LocalDate.parse(release, DATE_FORMAT));
760                pdbHeader.setRelDate(date);
761            } else {
762                // all other dates are revision dates;
763                // since this method may be called multiple times,
764                // the last revision date will "stick"
765                String revision = pdbxAuditRevisionHistory.getRevisionDate().get(rowIndex);
766                date = convert(LocalDate.parse(revision, DATE_FORMAT));
767            }
768            pdbHeader.setModDate(date);
769        }
770    }
771
772    @Override
773    public void consumePdbxChemCompIdentifier(PdbxChemCompIdentifier pdbxChemCompIdentifier) {
774        // TODO not impled in ref
775    }
776
777    @Override
778    public void consumePdbxDatabaseStatus(PdbxDatabaseStatus pdbxDatabaseStatus) {
779        for (int rowIndex = 0; rowIndex < pdbxDatabaseStatus.getRowCount(); rowIndex++) {
780            // the deposition date field is only available in mmCIF 5.0
781            StrColumn recvdInitialDepositionDate = pdbxDatabaseStatus.getRecvdInitialDepositionDate();
782            if (recvdInitialDepositionDate.isDefined()) {
783                String deposition = recvdInitialDepositionDate.get(rowIndex);
784                pdbHeader.setDepDate(convert(LocalDate.parse(deposition, DATE_FORMAT)));
785            }
786        }
787    }
788
789    @Override
790    public void consumePdbxEntityBranchDescriptor(PdbxEntityBranchDescriptor pdbxEntityBranchDescriptor) {
791        // TODO not considered in ref
792    }
793
794    @Override
795    public void consumePdbxMolecule(PdbxMolecule pdbxMolecule) {
796        // TODO not considered in ref
797    }
798
799    @Override
800    public void consumePdbxMoleculeFeatures(PdbxMoleculeFeatures pdbxMoleculeFeatures) {
801        // TODO not considered in ref
802    }
803
804    @Override
805    public void consumePdbxNonpolyScheme(PdbxNonpolyScheme pdbxNonpolyScheme) {
806        // TODO not impled in ref
807    }
808
809    @Override
810    public void consumePdbxReferenceEntityLink(PdbxReferenceEntityLink pdbxReferenceEntityLink) {
811        // TODO not considered in ref
812    }
813
814    @Override
815    public void consumePdbxReferenceEntityList(PdbxReferenceEntityList pdbxReferenceEntityList) {
816        // TODO not considered in ref
817    }
818
819    @Override
820    public void consumePdbxReferenceEntityPolyLink(PdbxReferenceEntityPolyLink pdbxReferenceEntityPolyLink) {
821        // TODO not considered in ref
822    }
823
824    @Override
825    public void consumePdbxStructAssembly(PdbxStructAssembly pdbxStructAssembly) {
826        this.structAssembly = pdbxStructAssembly;
827    }
828
829    @Override
830    public void consumePdbxStructAssemblyGen(PdbxStructAssemblyGen pdbxStructAssemblyGen) {
831        this.structAssemblyGen = pdbxStructAssemblyGen;
832    }
833
834    @Override
835    public void consumePdbxStructModResidue(PdbxStructModResidue pdbxStructModResidue) {
836        // TODO not considered in ref
837    }
838
839    @Override
840    public void consumePdbxStructOperList(PdbxStructOperList pdbxStructOperList) {
841        this.structOpers = pdbxStructOperList;
842    }
843
844    @Override
845    public void consumeRefine(Refine refine) {
846        for (int rowIndex = 0; rowIndex < refine.getRowCount(); rowIndex++) {
847            // RESOLUTION
848                ValueKind valueKind = refine.getLsDResHigh().getValueKind(rowIndex);
849                if (! ValueKind.PRESENT.equals(valueKind)) {
850                        continue;
851                }
852            // in very rare cases (for instance hybrid methods x-ray + neutron diffraction, e.g. 3ins, 4n9m)
853            // there are 2 resolution values, one for each method
854            // we take the last one found so that behaviour is like in PDB file parsing
855            double lsDResHigh = refine.getLsDResHigh().get(rowIndex);
856            // TODO this could use a check to keep reasonable values - 1.5 may be overwritten by 0.0
857            if (pdbHeader.getResolution() != PDBHeader.DEFAULT_RESOLUTION) {
858                logger.warn("More than 1 resolution value present, will use last one {} and discard previous {}",
859                        lsDResHigh, String.format("%4.2f",pdbHeader.getResolution()));
860            }
861            pdbHeader.setResolution((float) lsDResHigh);
862
863            FloatColumn lsRFactorRFree = refine.getLsRFactorRFree();
864            // RFREE
865            if (pdbHeader.getRfree() != PDBHeader.DEFAULT_RFREE) {
866                logger.warn("More than 1 Rfree value present, will use last one {} and discard previous {}",
867                        lsRFactorRFree, String.format("%4.2f",pdbHeader.getRfree()));
868            }
869            if (lsRFactorRFree.isDefined() && lsRFactorRFree.getValueKind(rowIndex) == ValueKind.PRESENT) {
870                pdbHeader.setRfree((float) lsRFactorRFree.get(rowIndex));
871            } else {
872                // some entries like 2ifo haven't got this field at all
873                logger.info("_refine.ls_R_factor_R_free not present, not parsing Rfree value");
874            }
875
876            // RWORK
877            FloatColumn lsRFactorRWork = refine.getLsRFactorRWork();
878            if(pdbHeader.getRwork() != PDBHeader.DEFAULT_RFREE) {
879                logger.warn("More than 1 R work value present, will use last one {} and discard previous {} ",
880                        lsRFactorRWork, String.format("%4.2f",pdbHeader.getRwork()));
881            }
882            if (lsRFactorRWork.isDefined() && lsRFactorRWork.getValueKind(rowIndex) == ValueKind.PRESENT) {
883                pdbHeader.setRwork((float) lsRFactorRWork.get(rowIndex));
884            } else {
885                logger.info("_refine.ls_R_factor_R_work not present, not parsing R-work value");
886            }
887        }
888    }
889
890    @Override
891    public void consumeStruct(Struct struct) {
892        if (struct.isDefined() && struct.getTitle().isDefined()) {
893            pdbHeader.setTitle(struct.getTitle().get(0));
894        }
895
896        if (struct.isDefined() && struct.getEntryId().isDefined()) {
897            PdbId pdbId;
898            String pdbCode = struct.getEntryId().get(0);
899                        try {
900                                pdbId = new PdbId(pdbCode);
901                        } catch (IllegalArgumentException e) {
902                                logger.info("Malformed (or null) PDB ID {}. setting PdbId to null", pdbCode);
903                                pdbId = null;
904                        }
905            pdbHeader.setPdbId(pdbId);
906            structure.setPdbId(pdbId);
907        }
908    }
909
910    @Override
911    public void consumeStructAsym(StructAsym structAsym) {
912        this.structAsym = structAsym;
913    }
914
915    @Override
916    public void consumeStructConf(StructConf structConf) {
917        // TODO not considered in ref
918    }
919
920    @Override
921    public void consumeStructConn(StructConn structConn) {
922        this.structConn = structConn;
923    }
924
925    @Override
926    public void consumeStructConnType(StructConnType structConnType) {
927        // TODO not considered in ref
928    }
929
930    @Override
931    public void consumeStructKeywords(StructKeywords structKeywords) {
932        ArrayList<String> keywordsList = new ArrayList<String>();
933
934        StrColumn text = structKeywords.getText();
935        if (text.isDefined()) {
936            String keywords = text.get(0);
937            String[] strings = keywords.split(" *, *");
938            for (String string : strings) {
939                keywordsList.add(string.trim());
940            }
941        }
942        structure.getPDBHeader().setKeywords(keywordsList);
943
944        StrColumn pdbxKeywords = structKeywords.getPdbxKeywords();
945        if (pdbxKeywords.isDefined()) {
946            String keywords = pdbxKeywords.get(0);
947            pdbHeader.setClassification(keywords);
948            //This field should be left empty. TODO The next line should be removed later
949            pdbHeader.setDescription(keywords);
950        }
951    }
952
953    @Override
954    public void consumeStructNcsOper(StructNcsOper structNcsOper) {
955        this.structNcsOper = structNcsOper;
956    }
957
958    @Override
959    public void consumeStructRef(StructRef structRef) {
960        this.structRef = structRef;
961    }
962
963    @Override
964    public void consumeStructRefSeq(StructRefSeq structRefSeq) {
965        for (int rowIndex = 0; rowIndex < structRefSeq.getRowCount(); rowIndex++) {
966            String refId = structRefSeq.getRefId().get(rowIndex);
967
968            DBRef dbRef = new DBRef();
969
970            dbRef.setIdCode(structRefSeq.getPdbxPDBIdCode().get(rowIndex));
971            dbRef.setDbAccession(structRefSeq.getPdbxDbAccession().get(rowIndex));
972            dbRef.setDbIdCode(structRefSeq.getPdbxDbAccession().get(rowIndex));
973            dbRef.setChainName(structRefSeq.getPdbxStrandId().get(rowIndex));
974
975            OptionalInt structRefRowIndex = IntStream.range(0, structRef.getRowCount())
976                    .filter(i -> structRef.getId().get(i).equals(refId))
977                    .findFirst();
978
979            if (structRefRowIndex.isPresent()) {
980                dbRef.setDatabase(structRef.getDbName().get(structRefRowIndex.getAsInt()));
981                dbRef.setDbIdCode(structRef.getDbCode().get(structRefRowIndex.getAsInt()));
982            } else {
983                logger.info("could not find StructRef `{} for StructRefSeq {}", refId, rowIndex);
984            }
985
986            int seqBegin;
987            int seqEnd;
988
989            try {
990                seqBegin = Integer.parseInt(structRefSeq.getPdbxAuthSeqAlignBeg().get(rowIndex));
991                seqEnd = Integer.parseInt(structRefSeq.getPdbxAuthSeqAlignEnd().get(rowIndex));
992            } catch (NumberFormatException e) {
993                // this happens in a few entries, annotation error? e.g. 6eoj
994                logger.warn("Couldn't parse pdbx_auth_seq_align_beg/end in _struct_ref_seq. Will not store dbref " +
995                        "alignment info for accession {}. Error: {}", dbRef.getDbAccession(), e.getMessage());
996                return;
997            }
998
999            char beginInsCode = ' ';
1000            String pdbxSeqAlignBegInsCode = structRefSeq.getPdbxSeqAlignBegInsCode().get(rowIndex);
1001            if (pdbxSeqAlignBegInsCode.length() > 0) {
1002                beginInsCode = pdbxSeqAlignBegInsCode.charAt(0);
1003            }
1004
1005            char endInsCode = ' ';
1006            String pdbxSeqAlignEndInsCode = structRefSeq.getPdbxSeqAlignEndInsCode().get(rowIndex);
1007            if (pdbxSeqAlignEndInsCode.length() > 0) {
1008                endInsCode = pdbxSeqAlignEndInsCode.charAt(0);
1009            }
1010
1011            if (beginInsCode == '?') {
1012                beginInsCode = ' ';
1013            }
1014            if (endInsCode == '?') {
1015                endInsCode = ' ';
1016            }
1017
1018            dbRef.setSeqBegin(seqBegin);
1019            dbRef.setInsertBegin(beginInsCode);
1020            dbRef.setSeqEnd(seqEnd);
1021            dbRef.setInsertEnd(endInsCode);
1022
1023            int dbSeqBegin = structRefSeq.getDbAlignBeg().get(rowIndex);
1024            int dbSeqEnd = structRefSeq.getDbAlignEnd().get(rowIndex);
1025
1026            char dbBeginInsCode = ' ';
1027            StrColumn pdbxDbAlignBegInsCodeCol = structRefSeq.getPdbxDbAlignBegInsCode();
1028            if (pdbxDbAlignBegInsCodeCol.isDefined()) {
1029                String pdbxDbAlignBegInsCode = pdbxDbAlignBegInsCodeCol.get(rowIndex);
1030                if (pdbxDbAlignBegInsCode.length() > 0) {
1031                    dbBeginInsCode = pdbxDbAlignBegInsCode.charAt(0);
1032                }
1033            }
1034
1035            char dbEndInsCode = ' ';
1036            StrColumn pdbxDbAlignEndInsCodeCol = structRefSeq.getPdbxDbAlignEndInsCode();
1037            if (pdbxDbAlignEndInsCodeCol.isDefined()) {
1038                String pdbxDbAlignEndInsCode = pdbxDbAlignEndInsCodeCol.get(rowIndex);
1039                if (pdbxDbAlignEndInsCode.length() > 0) {
1040                    dbEndInsCode = pdbxDbAlignEndInsCode.charAt(0);
1041                }
1042            }
1043
1044            if (dbBeginInsCode == '?') {
1045                dbBeginInsCode = ' ';
1046            }
1047            if (dbEndInsCode == '?') {
1048                dbEndInsCode = ' ';
1049            }
1050
1051            dbRef.setDbSeqBegin(dbSeqBegin);
1052            dbRef.setIdbnsBegin(dbBeginInsCode);
1053            dbRef.setDbSeqEnd(dbSeqEnd);
1054            dbRef.setIdbnsEnd(dbEndInsCode);
1055
1056            List<DBRef> dbrefs = structure.getDBRefs();
1057            if (dbrefs == null) {
1058                dbrefs = new ArrayList<>();
1059            }
1060            dbrefs.add(dbRef);
1061
1062            logger.debug(dbRef.toPDB());
1063
1064            structure.setDBRefs(dbrefs);
1065        }
1066    }
1067
1068    @Override
1069    public void consumeStructRefSeqDif(StructRefSeqDif structRefSeqDif) {
1070        this.structRefSeqDif = structRefSeqDif;
1071    }
1072
1073    @Override
1074    public void consumeStructSheetRange(StructSheetRange structSheetRange) {
1075        // TODO not considered in ref
1076    }
1077
1078    @Override
1079    public void consumeStructSite(StructSite structSite) {
1080        if (params.isHeaderOnly()) {
1081            return;
1082        }
1083
1084        List<Site> sites = structure.getSites();
1085        if (sites == null) {
1086            sites = new ArrayList<>();
1087        }
1088
1089        for (int rowIndex = 0; rowIndex < structSite.getRowCount(); rowIndex++) {
1090            Site site = null;
1091            for (Site asite : sites) {
1092                if (asite.getSiteID().equals(structSite.getId().get(rowIndex))) {
1093                    site = asite; // prevent duplicate siteIds
1094                }
1095            }
1096
1097            boolean addSite = false;
1098            if (site == null) {
1099                site = new Site();
1100                addSite = true;
1101            }
1102
1103            site.setSiteID(structSite.getId().get(rowIndex));
1104            site.setDescription(structSite.getDetails().get(rowIndex));
1105            site.setEvCode(structSite.getPdbxEvidenceCode().get(rowIndex));
1106
1107            if (addSite) {
1108                sites.add(site);
1109            }
1110        }
1111
1112        structure.setSites(sites);
1113    }
1114
1115    @Override
1116    public void consumeStructSiteGen(StructSiteGen structSiteGen) {
1117        this.structSiteGen = structSiteGen;
1118    }
1119
1120    @Override
1121    public void consumeSymmetry(Symmetry symmetry) {
1122        for (int rowIndex = 0; rowIndex < symmetry.getRowCount(); rowIndex++) {
1123            String spaceGroupString = symmetry.getSpaceGroupNameH_M().get(rowIndex);
1124            SpaceGroup spaceGroup = SymoplibParser.getSpaceGroup(spaceGroupString);
1125            if (spaceGroup == null) {
1126                logger.warn("Space group '{}' not recognised as a standard space group", spaceGroupString);
1127                structure.getPDBHeader()
1128                        .getCrystallographicInfo()
1129                        .setNonStandardSg(true);
1130            } else {
1131                structure.getPDBHeader()
1132                        .getCrystallographicInfo()
1133                        .setSpaceGroup(spaceGroup);
1134                structure.getPDBHeader()
1135                        .getCrystallographicInfo()
1136                        .setNonStandardSg(false);
1137            }
1138        }
1139    }
1140
1141    @Override
1142    public void finish() {
1143        if (currentChain != null) {
1144            currentChain.addGroup(currentGroup);
1145
1146            Optional<Chain> testChain = currentModel.stream()
1147                    .filter(chain -> chain.getId().equals(currentChain.getId()))
1148                    .findFirst();
1149
1150            if (!testChain.isPresent()) {
1151                currentModel.add(currentChain);
1152            }
1153        } else if (!params.isHeaderOnly()) {
1154            logger.warn("current chain is null at end of document.");
1155        }
1156
1157        allModels.add(currentModel);
1158
1159        initMaps();
1160
1161        for (int rowIndex = 0; rowIndex < structAsym.getRowCount(); rowIndex++) {
1162            String id = structAsym.getId().get(rowIndex);
1163            String entityId = structAsym.getEntityId().get(rowIndex);
1164            logger.debug("Entity {} matches asym_id: {}", entityId, id);
1165
1166            Chain chain = getEntityChain(entityId);
1167            Chain seqRes = (Chain) chain.clone();
1168            // to solve issue #160 (e.g. 3u7t)
1169            seqRes = removeSeqResHeterogeneity(seqRes);
1170            seqRes.setId(id);
1171            seqRes.setName(asymId2authorId.getOrDefault(id, id));
1172
1173            EntityType type = EntityType.entityTypeFromString(getEntityType(entityId));
1174            if (type == null || type == EntityType.POLYMER) {
1175                seqResChains.add(seqRes);
1176            }
1177
1178            logger.debug(" seqres: {} {}<", id, seqRes);
1179            addEntity(rowIndex, entityId, getEntityDescription(entityId), getEntityType(entityId));
1180        }
1181
1182        if (!structAsym.isDefined() || structAsym.getRowCount() == 0) {
1183            logger.warn("No _struct_asym category in file, no SEQRES groups will be added.");
1184        }
1185
1186        // entities
1187        // In addEntities above we created the entities if they were present in the file
1188        // Now we need to make sure that they are linked to chains and also that if they are not present in the file we
1189        // need to add them now
1190        linkEntities();
1191
1192        // now that we know the entities, we can add all chains to structure so that they are stored
1193        // properly as polymer/nonpolymer/water chains inside structure
1194        allModels.forEach(structure::addModel);
1195
1196        // Only align if requested (default) and not when headerOnly mode with no Atoms.
1197        // Otherwise, we store the empty SeqRes Groups unchanged in the right chains.
1198        if (params.isAlignSeqRes() && !params.isHeaderOnly()){
1199            logger.debug("Parsing mode align_seqres, will parse SEQRES and align to ATOM sequence");
1200            alignSeqRes();
1201        } else {
1202            logger.debug("Parsing mode unalign_seqres, will parse SEQRES but not align it to ATOM sequence");
1203            SeqRes2AtomAligner.storeUnAlignedSeqRes(structure, seqResChains, params.isHeaderOnly());
1204        }
1205
1206        // Now make sure all altlocgroups have all the atoms in all the groups
1207        StructureTools.cleanUpAltLocs(structure);
1208
1209        // NOTE bonds and charges can only be done at this point that the chain id mapping is properly sorted out
1210        if (!params.isHeaderOnly()) {
1211            if (params.shouldCreateAtomBonds()) {
1212                addBonds();
1213            }
1214
1215            if (params.shouldCreateAtomCharges()) {
1216                addCharges();
1217            }
1218        }
1219
1220        if (!params.isHeaderOnly()) {
1221            addSites();
1222        }
1223
1224        // set the oligomeric state info in the header...
1225        if (params.isParseBioAssembly()) {
1226            // the more detailed mapping of chains to rotation operations happens in StructureIO...
1227
1228            Map<Integer, BioAssemblyInfo> bioAssemblies = new LinkedHashMap<>();
1229            for (int i = 0; i < structAssembly.getRowCount(); i++) {
1230                String assemblyId = structAssembly.getId().get(i);
1231                List<Integer> structAssemblyGenIndices = new ArrayList<>();
1232                for (int j = 0; j < structAssemblyGen.getRowCount(); j++) {
1233                    if (structAssemblyGen.getAssemblyId().get(j).equals(assemblyId)) {
1234                        structAssemblyGenIndices.add(j);
1235                    }
1236                }
1237                BiologicalAssemblyBuilder builder = new BiologicalAssemblyBuilder();
1238                // these are the transformations that need to be applied to our model
1239                List<BiologicalAssemblyTransformation> transformations = builder.getBioUnitTransformationList(structAssembly,
1240                        i, structAssemblyGen, structOpers);
1241
1242                int bioAssemblyId = -1;
1243                try {
1244                    bioAssemblyId = Integer.parseInt(assemblyId);
1245                } catch (NumberFormatException e) {
1246                    logger.info("Could not parse a numerical bio assembly id from '{}'", assemblyId);
1247                }
1248
1249                // if bioassembly id is not numerical we throw it away
1250                // this happens usually for viral capsid entries, like 1ei7
1251                // see issue #230 in github
1252                if (bioAssemblyId != -1) {
1253                    int mmSize = 0;
1254                    // note that the transforms contain asym ids of both polymers and non-polymers
1255                    // For the mmsize, we are only interested in the polymers
1256                    for (BiologicalAssemblyTransformation transf : transformations) {
1257                        Chain c = structure.getChain(transf.getChainId());
1258                        if (c == null) {
1259                            logger.info("Could not find asym id {} specified in struct_assembly_gen", transf.getChainId());
1260                            continue;
1261                        }
1262                        if (c.getEntityType() == EntityType.POLYMER &&
1263                                // for entries like 4kro, sugars are annotated as polymers but we
1264                                // don't want them in the macromolecularSize count
1265                                !c.getEntityInfo().getDescription().contains("SUGAR")) {
1266                            mmSize++;
1267                        }
1268                    }
1269
1270                    BioAssemblyInfo bioAssembly = new BioAssemblyInfo();
1271                    bioAssembly.setId(bioAssemblyId);
1272                    bioAssembly.setMacromolecularSize(mmSize);
1273                    bioAssembly.setTransforms(transformations);
1274                    bioAssemblies.put(bioAssemblyId, bioAssembly);
1275                }
1276
1277            }
1278            structure.getPDBHeader()
1279                    .setBioAssemblies(bioAssemblies);
1280        }
1281
1282        setStructNcsOps();
1283        setCrystallographicInfoMetadata();
1284
1285        Map<String, List<SeqMisMatch>> misMatchMap = new HashMap<>();
1286        for (int rowIndex = 0; rowIndex < structRefSeqDif.getRowCount(); rowIndex++) {
1287            SeqMisMatch seqMisMatch = new SeqMisMatchImpl();
1288            seqMisMatch.setDetails(structRefSeqDif.getDetails().get(rowIndex));
1289
1290            String insCode = structRefSeqDif.getPdbxPdbInsCode().get(rowIndex);
1291                if (insCode != null && insCode.equals("?")) {
1292                insCode = null;
1293            }
1294            seqMisMatch.setInsCode(insCode);
1295            seqMisMatch.setOrigGroup(structRefSeqDif.getDbMonId().get(rowIndex));
1296            seqMisMatch.setPdbGroup(structRefSeqDif.getMonId().get(rowIndex));
1297            seqMisMatch.setPdbResNum(structRefSeqDif.getPdbxAuthSeqNum().get(rowIndex));
1298            seqMisMatch.setUniProtId(structRefSeqDif.getPdbxSeqDbAccessionCode().get(rowIndex));
1299            seqMisMatch.setSeqNum(structRefSeqDif.getSeqNum().get(rowIndex));
1300
1301            String strandId = structRefSeqDif.getPdbxPdbStrandId().get(rowIndex);
1302            List<SeqMisMatch> seqMisMatches = misMatchMap.computeIfAbsent(strandId, k -> new ArrayList<>());
1303            seqMisMatches.add(seqMisMatch);
1304        }
1305
1306        for (String chainId : misMatchMap.keySet()){
1307            Chain chain = structure.getPolyChainByPDB(chainId);
1308            if (chain == null) {
1309                logger.warn("Could not set mismatches for chain with author id {}", chainId);
1310                continue;
1311            }
1312
1313            chain.setSeqMisMatches(misMatchMap.get(chainId));
1314        }
1315    }
1316
1317    private String getEntityType(String entityId) {
1318        return IntStream.range(0, entity.getRowCount())
1319                .filter(i -> entity.getId().get(i).equals(entityId))
1320                .mapToObj(i -> entity.getType().get(i))
1321                .findFirst()
1322                .orElseThrow(() -> new NoSuchElementException("could not find entity with id " + entityId));
1323    }
1324
1325    private String getEntityDescription(String entityId) {
1326        return IntStream.range(0, entity.getRowCount())
1327                .filter(i -> entity.getId().get(i).equals(entityId))
1328                .mapToObj(i -> entity.getPdbxDescription().get(i))
1329                .findFirst()
1330                .orElseThrow(() -> new NoSuchElementException("could not find entity with id " + entityId));
1331    }
1332
1333    private void addEntity(int asymRowIndex, String entityId, String pdbxDescription, String type) {
1334        int eId = 0;
1335        try {
1336            eId = Integer.parseInt(entityId);
1337        } catch (NumberFormatException e) {
1338            logger.warn("Could not parse mol_id from string {}. Will use 0 for creating Entity", entityId);
1339        }
1340
1341        int entityRowIndex = IntStream.range(0, entity.getRowCount())
1342                .filter(i -> entity.getId().get(i).equals(entityId))
1343                .findFirst()
1344                .orElse(-1);
1345
1346        EntityInfo entityInfo = structure.getEntityById(eId);
1347
1348        if (entityInfo == null) {
1349            entityInfo = new EntityInfo();
1350            entityInfo.setMolId(eId);
1351            // we only add the compound if a polymeric one (to match what the PDB parser does)
1352            if (entityRowIndex != -1) {
1353                entityInfo.setDescription(pdbxDescription);
1354
1355                EntityType eType = EntityType.entityTypeFromString(type);
1356                if (eType != null) {
1357                    entityInfo.setType(eType);
1358                } else {
1359                    logger.warn("Type '{}' is not recognised as a valid entity type for entity {}", type, eId);
1360                }
1361                addAncilliaryEntityData(asymRowIndex, entityInfo);
1362                structure.addEntityInfo(entityInfo);
1363                logger.debug("Adding Entity with entity id {} from _entity, with name: {}", eId,
1364                        entityInfo.getDescription());
1365            }
1366        }
1367    }
1368
1369    private void addAncilliaryEntityData(int asymRowIndex, EntityInfo entityInfo) {
1370        // Loop through each of the entity types and add the corresponding data
1371        // We're assuming if data is duplicated between sources it is consistent
1372        // This is a potentially huge assumption...
1373
1374        for (int rowIndex = 0; rowIndex < entitySrcGen.getRowCount(); rowIndex++) {
1375            if (!entitySrcGen.getEntityId().get(rowIndex).equals(structAsym.getEntityId().get(asymRowIndex))) {
1376                continue;
1377            }
1378
1379            addInformationFromEntitySrcGen(rowIndex, entityInfo);
1380        }
1381
1382        for (int rowIndex = 0; rowIndex < entitySrcNat.getRowCount(); rowIndex++) {
1383            if (!entitySrcNat.getEntityId().get(rowIndex).equals(structAsym.getEntityId().get(asymRowIndex))) {
1384                continue;
1385            }
1386
1387            addInformationFromEntitySrcNat(rowIndex, entityInfo);
1388        }
1389
1390        for (int rowIndex = 0; rowIndex < entitySrcSyn.getRowCount(); rowIndex++) {
1391            if (!entitySrcSyn.getEntityId().get(rowIndex).equals(structAsym.getEntityId().get(asymRowIndex))) {
1392                continue;
1393            }
1394
1395            addInformationFromEntitySrcSyn(rowIndex, entityInfo);
1396        }
1397    }
1398
1399    private void addInformationFromEntitySrcSyn(int rowIndex, EntityInfo entityInfo) {
1400        entityInfo.setOrganismCommon(entitySrcSyn.getOrganismCommonName().get(rowIndex));
1401        entityInfo.setOrganismScientific(entitySrcSyn.getOrganismScientific().get(rowIndex));
1402        entityInfo.setOrganismTaxId(entitySrcSyn.getNcbiTaxonomyId().get(rowIndex));
1403    }
1404
1405    private void addInformationFromEntitySrcNat(int rowIndex, EntityInfo entityInfo) {
1406        entityInfo.setAtcc(entitySrcNat.getPdbxAtcc().get(rowIndex));
1407        entityInfo.setCell(entitySrcNat.getPdbxCell().get(rowIndex));
1408        entityInfo.setOrganismCommon(entitySrcNat.getCommonName().get(rowIndex));
1409        entityInfo.setOrganismScientific(entitySrcNat.getPdbxOrganismScientific().get(rowIndex));
1410        entityInfo.setOrganismTaxId(entitySrcNat.getPdbxNcbiTaxonomyId().get(rowIndex));
1411    }
1412
1413    private void addInformationFromEntitySrcGen(int rowIndex, EntityInfo entityInfo) {
1414        entityInfo.setAtcc(entitySrcGen.getPdbxGeneSrcAtcc().get(rowIndex));
1415        entityInfo.setCell(entitySrcGen.getPdbxGeneSrcCell().get(rowIndex));
1416        entityInfo.setOrganismCommon(entitySrcGen.getGeneSrcCommonName().get(rowIndex));
1417        entityInfo.setOrganismScientific(entitySrcGen.getPdbxGeneSrcScientificName().get(rowIndex));
1418        entityInfo.setOrganismTaxId(entitySrcGen.getPdbxGeneSrcNcbiTaxonomyId().get(rowIndex));
1419        entityInfo.setExpressionSystemTaxId(entitySrcGen.getPdbxHostOrgNcbiTaxonomyId().get(rowIndex));
1420        entityInfo.setExpressionSystem(entitySrcGen.getPdbxHostOrgScientificName().get(rowIndex));
1421    }
1422
1423    private void setStructNcsOps() {
1424        List<Matrix4d> ncsOperators = new ArrayList<>();
1425
1426        for (int rowIndex = 0; rowIndex < structNcsOper.getRowCount(); rowIndex++) {
1427            if (!"generate".equals(structNcsOper.getCode().get(rowIndex))) {
1428                continue;
1429            }
1430
1431            try {
1432                Matrix4d operator = new Matrix4d();
1433
1434                operator.setElement(0, 0, structNcsOper.getMatrix11().get(rowIndex));
1435                operator.setElement(0, 1, structNcsOper.getMatrix12().get(rowIndex));
1436                operator.setElement(0, 2, structNcsOper.getMatrix13().get(rowIndex));
1437                operator.setElement(0, 3, structNcsOper.getVector1().get(rowIndex));
1438
1439                operator.setElement(1, 0, structNcsOper.getMatrix21().get(rowIndex));
1440                operator.setElement(1, 1, structNcsOper.getMatrix22().get(rowIndex));
1441                operator.setElement(1, 2, structNcsOper.getMatrix23().get(rowIndex));
1442                operator.setElement(1, 3, structNcsOper.getVector2().get(rowIndex));
1443
1444                operator.setElement(2, 0, structNcsOper.getMatrix31().get(rowIndex));
1445                operator.setElement(2, 1, structNcsOper.getMatrix32().get(rowIndex));
1446                operator.setElement(2, 2, structNcsOper.getMatrix33().get(rowIndex));
1447                operator.setElement(2, 3, structNcsOper.getVector3().get(rowIndex));
1448
1449                operator.setElement(3, 0, 0);
1450                operator.setElement(3, 1, 0);
1451                operator.setElement(3, 2, 0);
1452                operator.setElement(3, 3, 1);
1453
1454                ncsOperators.add(operator);
1455            } catch (NumberFormatException e) {
1456                logger.warn("Error parsing doubles in NCS operator list, skipping operator {}", rowIndex + 1);
1457            }
1458        }
1459
1460        if (ncsOperators.size() > 0) {
1461            structure.getCrystallographicInfo()
1462                    .setNcsOperators(ncsOperators.toArray(new Matrix4d[0]));
1463        }
1464    }
1465
1466    private void setCrystallographicInfoMetadata() {
1467        if (parsedScaleMatrix != null) {
1468            PDBCrystallographicInfo crystalInfo = structure.getCrystallographicInfo();
1469            boolean nonStd = false;
1470            if (crystalInfo.getCrystalCell() != null && !crystalInfo.getCrystalCell().checkScaleMatrix(parsedScaleMatrix)) {
1471                nonStd = true;
1472            }
1473
1474            crystalInfo.setNonStandardCoordFrameConvention(nonStd);
1475        }
1476    }
1477
1478    private void addSites() {
1479        List<Site> sites = structure.getSites();
1480        if (sites == null) sites = new ArrayList<>();
1481
1482        for (int rowIndex = 0; rowIndex < structSiteGen.getRowCount(); rowIndex++) {
1483            // For each StructSiteGen, find the residues involved, if they exist then
1484            String site_id = structSiteGen.getSiteId().get(rowIndex); // multiple could be in same site.
1485            if (site_id == null) {
1486                site_id = "";
1487            }
1488            String comp_id = structSiteGen.getLabelCompId().get(rowIndex);  // PDBName
1489
1490            // Assumption: the author chain ID and residue number for the site is consistent with the original
1491            // author chain id and residue numbers.
1492
1493            String asymId = structSiteGen.getLabelAsymId().get(rowIndex); // chain name
1494            String authId = structSiteGen.getAuthAsymId().get(rowIndex); // chain Id
1495            String auth_seq_id = structSiteGen.getAuthSeqId().get(rowIndex); // Res num
1496
1497            String insCode = structSiteGen.getPdbxAuthInsCode().get(rowIndex);
1498            if (insCode != null && insCode.equals("?")) {
1499                insCode = null;
1500            }
1501
1502            // Look for asymID = chainID and seqID = seq_ID.  Check that comp_id matches the resname.
1503            Group g = null;
1504            try {
1505                Chain chain = structure.getChain(asymId);
1506
1507                if (null != chain) {
1508                    try {
1509                        Character insChar = null;
1510                        if (null != insCode && insCode.length() > 0) {
1511                            insChar = insCode.charAt(0);
1512                        }
1513                        g = chain.getGroupByPDB(new ResidueNumber(null, Integer.parseInt(auth_seq_id), insChar));
1514                    } catch (NumberFormatException e) {
1515                        logger.warn("Could not lookup residue : {}{}", authId, auth_seq_id);
1516                    }
1517                }
1518            } catch (StructureException e) {
1519                logger.warn("Problem finding residue in site entry {} - {}",
1520                        structSiteGen.getSiteId().get(rowIndex), e.getMessage());
1521            }
1522
1523            if (g != null) {
1524                // 2. find the site_id, if not existing, create anew.
1525                Site site = null;
1526                for (Site asite : sites) {
1527                    if (site_id.equals(asite.getSiteID())) {
1528                        site = asite;
1529                    }
1530                }
1531
1532                boolean addSite = false;
1533
1534                // 3. add this residue to the site.
1535                if (site == null) {
1536                    addSite = true;
1537                    site = new Site();
1538                    site.setSiteID(site_id);
1539                }
1540
1541                List<Group> groups = site.getGroups();
1542                if (groups == null) {
1543                    groups = new ArrayList<>();
1544                }
1545
1546                // Check the self-consistency of the residue reference from auth_seq_id and chain_id
1547                if (!comp_id.equals(g.getPDBName())) {
1548                    logger.warn("comp_id doesn't match the residue at {} {} - skipping", authId, auth_seq_id);
1549                } else {
1550                    groups.add(g);
1551                    site.setGroups(groups);
1552                }
1553                if (addSite) {
1554                    sites.add(site);
1555                }
1556            }
1557        }
1558        structure.setSites(sites);
1559    }
1560
1561    private void addCharges() {
1562        ChargeAdder.addCharges(structure);
1563    }
1564
1565    /**
1566     * The method will return a new reference to a Chain with any consecutive groups
1567     * having same residue numbers removed.
1568     * This is necessary to solve the microheterogeneity issue in entries like 3u7t (see github issue #160)
1569     */
1570    private static Chain removeSeqResHeterogeneity(Chain c) {
1571        Chain trimmedChain = new ChainImpl();
1572        ResidueNumber lastResNum = null;
1573
1574        for (Group g : c.getAtomGroups()) {
1575            // note we have to deep copy this, otherwise they stay linked and would get altered in addGroup(g)
1576            ResidueNumber currentResNum = new ResidueNumber(
1577                    g.getResidueNumber().getChainName(),
1578                    g.getResidueNumber().getSeqNum(),
1579                    g.getResidueNumber().getInsCode());
1580
1581            if (lastResNum == null || !lastResNum.equals(currentResNum)) {
1582                trimmedChain.addGroup(g);
1583            } else {
1584                logger.debug("Removing seqres group because it seems to be repeated in entity_poly_seq, most likely " +
1585                        "has hetero='y': {}", g);
1586            }
1587            lastResNum = currentResNum;
1588
1589        }
1590        return trimmedChain;
1591    }
1592
1593    private void addBonds() {
1594        BondMaker maker = new BondMaker(structure, params);
1595        maker.makeBonds();
1596        maker.formBondsFromStructConn(structConn);
1597    }
1598
1599    private void alignSeqRes() {
1600        logger.debug("Parsing mode align_seqres, will align to ATOM to SEQRES sequence");
1601
1602        // fix SEQRES residue numbering for all models
1603
1604        for (int model = 0; model < structure.nrModels(); model++) {
1605            List<Chain> atomList   = structure.getModel(model);
1606
1607            for (Chain seqResChain : seqResChains){
1608
1609                // this extracts the matching atom chain from atomList
1610                Chain atomChain = SeqRes2AtomAligner.getMatchingAtomRes(seqResChain, atomList, true);
1611
1612                if (atomChain == null) {
1613                    // most likely there's no observed residues at all for the seqres chain: can't map
1614                    // e.g. 3zyb: chains with asym_id L,M,N,O,P have no observed residues
1615                    logger.info("Could not map SEQRES chain with asym_id={} to any ATOM chain. Most likely there's " +
1616                            "no observed residues in the chain.", seqResChain.getId());
1617                    continue;
1618                }
1619
1620                //map the atoms to the seqres...
1621
1622                // we need to first clone the seqres so that they stay independent for different models
1623                List<Group> seqResGroups = new ArrayList<>();
1624                for (int i = 0; i < seqResChain.getAtomGroups().size(); i++) {
1625                    seqResGroups.add((Group)seqResChain.getAtomGroups().get(i).clone());
1626                }
1627
1628                for (int seqResPos = 0 ; seqResPos < seqResGroups.size(); seqResPos++) {
1629                    Group seqresG = seqResGroups.get(seqResPos);
1630                    boolean found = false;
1631                    for (Group atomG : atomChain.getAtomGroups()) {
1632
1633                        int internalNr = getInternalNr(atomG);
1634
1635                        if (seqresG.getResidueNumber().getSeqNum() == internalNr) {
1636                            seqResGroups.set(seqResPos, atomG);
1637                            found = true;
1638                            break;
1639                        }
1640                    }
1641
1642                    if (!found)
1643                        // so far the residue number has tracked internal numbering.
1644                        // however there are no atom records, as such this can't be a PDB residue number...
1645                        seqresG.setResidueNumber(null);
1646                }
1647                atomChain.setSeqResGroups(seqResGroups);
1648            }
1649        }
1650    }
1651
1652    private int getInternalNr(Group atomG) {
1653        if (atomG.getType().equals(GroupType.AMINOACID)) {
1654            AminoAcidImpl aa = (AminoAcidImpl) atomG;
1655            return (int) aa.getId();
1656        } else if (atomG.getType().equals(GroupType.NUCLEOTIDE)) {
1657            NucleotideImpl nu = (NucleotideImpl) atomG;
1658            return (int) nu.getId();
1659        } else {
1660            HetatomImpl he = (HetatomImpl) atomG;
1661            return (int) he.getId();
1662        }
1663    }
1664
1665    private void linkEntities() {
1666        for (List<Chain> allModel : allModels) {
1667            for (Chain chain : allModel) {
1668                //logger.info("linking entities for " + chain.getId() + " "  + chain.getName());
1669                String entityId = asymId2entityId.get(chain.getId());
1670
1671                if (entityId == null) {
1672                    // this can happen for instance if the cif file didn't have _struct_asym category at all
1673                    // and thus we have no asymId2entityId mapping at all
1674                    logger.info("No entity id could be found for chain {}", chain.getId());
1675                    continue;
1676                }
1677
1678                int eId = Integer.parseInt(entityId);
1679
1680                // Entities are not added for non-polymeric entities, if a chain is non-polymeric its entity won't be found.
1681                // TODO: add all entities and unique compounds and add methods to directly get polymer or non-polymer
1682                // asyms (chains).  Either create a unique StructureImpl or modify existing for a better representation of the
1683                // mmCIF internal data structures but is compatible with Structure interface.
1684                // Some examples of PDB entries with this kind of problem:
1685                //   - 2uub: asym_id X, chainName Z, entity_id 24: fully non-polymeric but still with its own chainName
1686                //   - 3o6j: asym_id K, chainName Z, entity_id 6 : a single water molecule
1687                //   - 1dz9: asym_id K, chainName K, entity_id 6 : a potassium ion alone
1688
1689                EntityInfo entityInfo = structure.getEntityById(eId);
1690                if (entityInfo == null) {
1691                    // Supports the case where the only chain members were from non-polymeric entity that is missing.
1692                    // Solved by creating a new Compound(entity) to which this chain will belong.
1693                    logger.info("Could not find an Entity for entity_id {}, for chain id {}, creating a new Entity.",
1694                            eId, chain.getId());
1695                    entityInfo = new EntityInfo();
1696                    entityInfo.setMolId(eId);
1697                    entityInfo.addChain(chain);
1698                    if (chain.isWaterOnly()) {
1699                        entityInfo.setType(EntityType.WATER);
1700                    } else {
1701                        entityInfo.setType(EntityType.NONPOLYMER);
1702                    }
1703                    chain.setEntityInfo(entityInfo);
1704                    structure.addEntityInfo(entityInfo);
1705                } else {
1706                    logger.debug("Adding chain with chain id {} (auth id {}) to Entity with entity_id {}",
1707                            chain.getId(), chain.getName(), eId);
1708                    entityInfo.addChain(chain);
1709                    chain.setEntityInfo(entityInfo);
1710                }
1711
1712            }
1713
1714        }
1715
1716        // if no entity information was present in file we then go and find the entities heuristically with EntityFinder
1717        List<EntityInfo> entityInfos = structure.getEntityInfos();
1718        if (entityInfos == null || entityInfos.isEmpty()) {
1719            List<List<Chain>> polyModels = new ArrayList<>();
1720            List<List<Chain>> nonPolyModels = new ArrayList<>();
1721            List<List<Chain>> waterModels = new ArrayList<>();
1722
1723            for (List<Chain> model : allModels) {
1724                List<Chain> polyChains = new ArrayList<>();
1725                List<Chain> nonPolyChains = new ArrayList<>();
1726                List<Chain> waterChains = new ArrayList<>();
1727
1728                polyModels.add(polyChains);
1729                nonPolyModels.add(nonPolyChains);
1730                waterModels.add(waterChains);
1731
1732                for (Chain chain : model) {
1733                    // we only have entities for polymeric chains, all others are ignored for assigning entities
1734                    if (chain.isWaterOnly()) {
1735                        waterChains.add(chain);
1736                    } else if (chain.isPureNonPolymer()) {
1737                        nonPolyChains.add(chain);
1738                    } else {
1739                        polyChains.add(chain);
1740                    }
1741                }
1742            }
1743
1744            entityInfos = EntityFinder.findPolyEntities(polyModels);
1745            EntityFinder.createPurelyNonPolyEntities(nonPolyModels, waterModels, entityInfos);
1746
1747            structure.setEntityInfos(entityInfos);
1748        }
1749
1750        // final sanity check: it can happen that from the annotated entities some are not linked to any chains
1751        // e.g. 3s26: a sugar entity does not have any chains associated to it (it seems to be happening with many sugar compounds)
1752        // we simply log it, this can sign some other problems if the entities are used down the line
1753        for (EntityInfo e : entityInfos) {
1754            if (e.getChains().isEmpty()) {
1755                logger.info("Entity {} '{}' has no chains associated to it",
1756                        e.getMolId() < 0 ? "with no entity id" : e.getMolId(), e.getDescription());
1757            }
1758        }
1759    }
1760
1761    private void initMaps() {
1762        if (structAsym == null || !structAsym.isDefined() || structAsym.getRowCount() == 0) {
1763            logger.info("No _struct_asym category found in file. No asym id to entity_id mapping will be available");
1764            return;
1765        }
1766
1767        Map<String, List<String>> entityId2asymId = new HashMap<>();
1768        for (int rowIndex = 0; rowIndex < structAsym.getRowCount(); rowIndex++) {
1769            String id = structAsym.getId().get(rowIndex);
1770            String entityId = structAsym.getEntityId().get(rowIndex);
1771
1772            logger.debug("Entity {} matches asym_id: {}", entityId, id);
1773
1774            asymId2entityId.put(id, entityId);
1775
1776            if (entityId2asymId.containsKey(entityId)) {
1777                List<String> asymIds = entityId2asymId.get(entityId);
1778                asymIds.add(id);
1779            } else {
1780                List<String> asymIds = new ArrayList<>();
1781                asymIds.add(id);
1782                entityId2asymId.put(entityId, asymIds);
1783            }
1784        }
1785
1786        if (entityPoly == null || !entityPoly.isDefined() || entityPoly.getRowCount() == 0) {
1787            logger.info("No _entity_poly category found in file. No asym id to author id mapping will be available " +
1788                    "for header only parsing");
1789            return;
1790        }
1791
1792        for (int rowIndex = 0; rowIndex < entityPoly.getRowCount(); rowIndex++) {
1793            if (!entityPoly.getPdbxStrandId().isDefined()) {
1794                logger.info("_entity_poly.pdbx_strand_id is null for entity {}. Won't be able to map asym ids to " +
1795                        "author ids for this entity.", entityPoly.getEntityId().get(rowIndex));
1796                break;
1797            }
1798
1799            String[] chainNames = entityPoly.getPdbxStrandId().get(rowIndex).split(",");
1800            List<String> asymIds = entityId2asymId.get(entityPoly.getEntityId().get(rowIndex));
1801            if (chainNames.length != asymIds.size()) {
1802                logger.warn("The list of asym ids (from _struct_asym) and the list of author ids (from _entity_poly) " +
1803                        "for entity {} have different lengths! Can't provide a mapping from asym ids to author chain " +
1804                        "ids", entityPoly.getEntityId().get(rowIndex));
1805                break;
1806            }
1807
1808            for (int i = 0; i < chainNames.length; i++) {
1809                asymId2authorId.put(asymIds.get(i), chainNames[i]);
1810            }
1811        }
1812    }
1813
1814    @Override
1815    public Structure getContainer() {
1816        return structure;
1817    }
1818}