001package org.biojava.nbio.structure.io.cif;
002
003import java.time.LocalDate;
004import java.time.ZoneId;
005import java.time.format.DateTimeFormatter;
006import java.time.format.DateTimeFormatterBuilder;
007import java.util.ArrayList;
008import java.util.Date;
009import java.util.HashMap;
010import java.util.LinkedHashMap;
011import java.util.List;
012import java.util.Locale;
013import java.util.Map;
014import java.util.NoSuchElementException;
015import java.util.Optional;
016import java.util.OptionalInt;
017import java.util.stream.Collectors;
018import java.util.stream.IntStream;
019
020import javax.vecmath.Matrix4d;
021
022import org.biojava.nbio.structure.AminoAcid;
023import org.biojava.nbio.structure.AminoAcidImpl;
024import org.biojava.nbio.structure.Atom;
025import org.biojava.nbio.structure.AtomImpl;
026import org.biojava.nbio.structure.Chain;
027import org.biojava.nbio.structure.ChainImpl;
028import org.biojava.nbio.structure.DBRef;
029import org.biojava.nbio.structure.Element;
030import org.biojava.nbio.structure.EntityInfo;
031import org.biojava.nbio.structure.EntityType;
032import org.biojava.nbio.structure.Group;
033import org.biojava.nbio.structure.GroupType;
034import org.biojava.nbio.structure.HetatomImpl;
035import org.biojava.nbio.structure.NucleotideImpl;
036import org.biojava.nbio.structure.PDBCrystallographicInfo;
037import org.biojava.nbio.structure.PDBHeader;
038import org.biojava.nbio.structure.PdbId;
039import org.biojava.nbio.structure.ResidueNumber;
040import org.biojava.nbio.structure.SeqMisMatch;
041import org.biojava.nbio.structure.SeqMisMatchImpl;
042import org.biojava.nbio.structure.Site;
043import org.biojava.nbio.structure.Structure;
044import org.biojava.nbio.structure.StructureException;
045import org.biojava.nbio.structure.StructureImpl;
046import org.biojava.nbio.structure.StructureTools;
047import org.biojava.nbio.structure.chem.ChemCompGroupFactory;
048import org.biojava.nbio.structure.io.BondMaker;
049import org.biojava.nbio.structure.io.ChargeAdder;
050import org.biojava.nbio.structure.io.EntityFinder;
051import org.biojava.nbio.structure.io.FileParsingParameters;
052import org.biojava.nbio.structure.io.SeqRes2AtomAligner;
053import org.biojava.nbio.structure.quaternary.BioAssemblyInfo;
054import org.biojava.nbio.structure.quaternary.BiologicalAssemblyBuilder;
055import org.biojava.nbio.structure.quaternary.BiologicalAssemblyTransformation;
056import org.biojava.nbio.structure.xtal.CrystalCell;
057import org.biojava.nbio.structure.xtal.SpaceGroup;
058import org.biojava.nbio.structure.xtal.SymoplibParser;
059import org.rcsb.cif.model.FloatColumn;
060import org.rcsb.cif.model.IntColumn;
061import org.rcsb.cif.model.StrColumn;
062import org.rcsb.cif.model.ValueKind;
063import org.rcsb.cif.schema.mm.AtomSite;
064import org.rcsb.cif.schema.mm.AtomSites;
065import org.rcsb.cif.schema.mm.AuditAuthor;
066import org.rcsb.cif.schema.mm.Cell;
067import org.rcsb.cif.schema.mm.ChemComp;
068import org.rcsb.cif.schema.mm.ChemCompBond;
069import org.rcsb.cif.schema.mm.DatabasePDBRemark;
070import org.rcsb.cif.schema.mm.DatabasePDBRev;
071import org.rcsb.cif.schema.mm.DatabasePDBRevRecord;
072import org.rcsb.cif.schema.mm.Entity;
073import org.rcsb.cif.schema.mm.EntityPoly;
074import org.rcsb.cif.schema.mm.EntityPolySeq;
075import org.rcsb.cif.schema.mm.EntitySrcGen;
076import org.rcsb.cif.schema.mm.EntitySrcNat;
077import org.rcsb.cif.schema.mm.Exptl;
078import org.rcsb.cif.schema.mm.PdbxAuditRevisionHistory;
079import org.rcsb.cif.schema.mm.PdbxChemCompIdentifier;
080import org.rcsb.cif.schema.mm.PdbxDatabaseStatus;
081import org.rcsb.cif.schema.mm.PdbxEntityBranchDescriptor;
082import org.rcsb.cif.schema.mm.PdbxEntitySrcSyn;
083import org.rcsb.cif.schema.mm.PdbxMolecule;
084import org.rcsb.cif.schema.mm.PdbxMoleculeFeatures;
085import org.rcsb.cif.schema.mm.PdbxNonpolyScheme;
086import org.rcsb.cif.schema.mm.PdbxReferenceEntityLink;
087import org.rcsb.cif.schema.mm.PdbxReferenceEntityList;
088import org.rcsb.cif.schema.mm.PdbxReferenceEntityPolyLink;
089import org.rcsb.cif.schema.mm.PdbxStructAssembly;
090import org.rcsb.cif.schema.mm.PdbxStructAssemblyGen;
091import org.rcsb.cif.schema.mm.PdbxStructModResidue;
092import org.rcsb.cif.schema.mm.PdbxStructOperList;
093import org.rcsb.cif.schema.mm.Refine;
094import org.rcsb.cif.schema.mm.Struct;
095import org.rcsb.cif.schema.mm.StructAsym;
096import org.rcsb.cif.schema.mm.StructConf;
097import org.rcsb.cif.schema.mm.StructConn;
098import org.rcsb.cif.schema.mm.StructConnType;
099import org.rcsb.cif.schema.mm.StructKeywords;
100import org.rcsb.cif.schema.mm.StructNcsOper;
101import org.rcsb.cif.schema.mm.StructRef;
102import org.rcsb.cif.schema.mm.StructRefSeq;
103import org.rcsb.cif.schema.mm.StructRefSeqDif;
104import org.rcsb.cif.schema.mm.StructSheetRange;
105import org.rcsb.cif.schema.mm.StructSite;
106import org.rcsb.cif.schema.mm.StructSiteGen;
107import org.rcsb.cif.schema.mm.Symmetry;
108import org.slf4j.Logger;
109import org.slf4j.LoggerFactory;
110
111/**
112 * An implementation of a CifFileConsumer for BioJava. Will process the information provided by a CifFile instance and
113 * use it to build up a {@link Structure} object.
114 * @author Sebastian Bittrich
115 * @since 6.0.0
116 */
117public class CifStructureConsumerImpl implements CifStructureConsumer {
118    private static final Logger logger = LoggerFactory.getLogger(CifStructureConsumerImpl.class);
119    private static final DateTimeFormatter DATE_FORMAT = new DateTimeFormatterBuilder()
120            .parseCaseInsensitive()
121            .appendPattern("yyyy-MM-dd")
122            .toFormatter(Locale.US);
123
124    private Structure structure;
125    private Chain currentChain;
126    private Group currentGroup;
127    private List<List<Chain>> allModels;
128    private List<Chain> currentModel;
129    private PDBHeader pdbHeader;
130    private String currentNmrModelNumber;
131    private List<Chain> entityChains;
132
133    private Entity entity;
134    private EntityPoly entityPoly;
135    private EntitySrcGen entitySrcGen;
136    private EntitySrcNat entitySrcNat;
137    private PdbxEntitySrcSyn entitySrcSyn;
138    private List<Chain> seqResChains;
139    private PdbxStructAssembly structAssembly;
140    private PdbxStructAssemblyGen structAssemblyGen;
141    private StructAsym structAsym;
142    private StructConn structConn;
143    private StructNcsOper structNcsOper;
144    private PdbxStructOperList structOpers;
145    private StructRef structRef;
146    private StructRefSeqDif structRefSeqDif;
147    private StructSiteGen structSiteGen;
148
149    private Map<String, String> asymId2entityId;
150    private Map<String, String> asymId2authorId;
151    private Matrix4d parsedScaleMatrix;
152
153    private final FileParsingParameters params;
154
155    public CifStructureConsumerImpl(FileParsingParameters params) {
156        this.params = params;
157    }
158
159    @Override
160    public void prepare() {
161        this.structure = new StructureImpl();
162        this.pdbHeader = new PDBHeader();
163        structure.setPDBHeader(pdbHeader);
164
165        this.allModels = new ArrayList<>();
166        this.currentModel = new ArrayList<>();
167
168        this.seqResChains  = new ArrayList<>();
169        this.asymId2entityId = new HashMap<>();
170        this.asymId2authorId = new HashMap<>();
171
172        this.entityChains = new ArrayList<>();
173    }
174
175    @Override
176    public void consumeAtomSite(AtomSite atomSite) {
177        if (params.isHeaderOnly()) {
178            return;
179        }
180
181        StrColumn labelAsymId = atomSite.getLabelAsymId();
182        StrColumn authAsymId = atomSite.getAuthAsymId();
183
184        StrColumn groupPDB = atomSite.getGroupPDB();
185        IntColumn authSeqId = atomSite.getAuthSeqId();
186
187        StrColumn labelCompId = atomSite.getLabelCompId();
188
189        IntColumn id = atomSite.getId();
190        StrColumn labelAtomId = atomSite.getLabelAtomId();
191
192        FloatColumn cartnX = atomSite.getCartnX();
193        FloatColumn cartnY = atomSite.getCartnY();
194        FloatColumn cartnZ = atomSite.getCartnZ();
195
196        FloatColumn occupancy = atomSite.getOccupancy();
197        FloatColumn bIsoOrEquiv = atomSite.getBIsoOrEquiv();
198
199        StrColumn labelAltId = atomSite.getLabelAltId();
200        StrColumn typeSymbol = atomSite.getTypeSymbol();
201
202        StrColumn pdbxPDBInsCode = atomSite.getPdbxPDBInsCode();
203        IntColumn labelSeqId = atomSite.getLabelSeqId();
204        IntColumn pdbx_pdb_model_num = atomSite.getPdbxPDBModelNum();
205
206        for (int atomIndex = 0; atomIndex < atomSite.getRowCount(); atomIndex++) {
207            boolean startOfNewChain = false;
208            Character oneLetterCode = StructureTools.get1LetterCodeAmino(labelCompId.get(atomIndex));
209
210            boolean isHetAtmInFile = false;
211            if (!"ATOM".equals(groupPDB.get(atomIndex))) {
212                if (oneLetterCode != null && oneLetterCode.equals(StructureTools.UNKNOWN_GROUP_LABEL)) {
213                    oneLetterCode = null;
214                }
215
216                isHetAtmInFile = true;
217            }
218
219            String insCodeString = pdbxPDBInsCode.get(atomIndex);
220            Character insCode = null;
221            if (insCodeString != null && !insCodeString.isEmpty() && !"?".equals(insCodeString)) {
222                insCode = insCodeString.charAt(0);
223            }
224
225            // non polymer chains (ligands and small molecules) will have a label_seq_id set to '.'
226            long seqId = labelSeqId.get(atomIndex);
227
228            String nmrModelNumber = pdbx_pdb_model_num.getStringData(atomIndex);
229
230            if (currentNmrModelNumber == null) {
231                currentNmrModelNumber = nmrModelNumber;
232            }
233            if (!currentNmrModelNumber.equals(nmrModelNumber)) {
234                currentNmrModelNumber = nmrModelNumber;
235
236                if (currentChain != null) {
237                    currentChain.addGroup(currentGroup);
238                    currentGroup.trimToSize();
239                }
240
241                allModels.add(currentModel);
242                currentModel = new ArrayList<>();
243                currentChain = null;
244                currentGroup = null;
245            }
246
247            String asymId = labelAsymId.get(atomIndex);
248            String authId = authAsymId.get(atomIndex);
249            if (currentChain == null) {
250                currentChain = new ChainImpl();
251                currentChain.setName(authId);
252                currentChain.setId(asymId);
253                currentModel.add(currentChain);
254                startOfNewChain = true;
255            }
256
257            if (!asymId.equals(currentChain.getId())) {
258                startOfNewChain = true;
259
260                currentChain.addGroup(currentGroup);
261
262                Optional<Chain> testChain = currentModel.stream()
263                        .filter(chain -> chain.getId().equals(asymId))
264                        .findFirst();
265
266                if (testChain.isPresent()) {
267                    currentChain = testChain.get();
268                } else {
269                    currentChain = new ChainImpl();
270                    currentChain.setName(authId);
271                    currentChain.setId(asymId);
272                }
273
274                if (!currentModel.contains(currentChain)) {
275                    currentModel.add(currentChain);
276                }
277            }
278
279            ResidueNumber residueNumber = new ResidueNumber(authId, authSeqId.get(atomIndex), insCode);
280
281            String recordName = groupPDB.get(atomIndex);
282            String compId = labelCompId.get(atomIndex);
283            if (currentGroup == null) {
284                currentGroup = createGroup(recordName, oneLetterCode, compId, seqId);
285                currentGroup.setResidueNumber(residueNumber);
286                currentGroup.setPDBName(compId);
287                currentGroup.setHetAtomInFile(isHetAtmInFile);
288            }
289
290            Group altGroup = null;
291            String altLocation = labelAltId.get(atomIndex);
292
293            if (startOfNewChain) {
294                currentGroup = createGroup(recordName, oneLetterCode, compId, seqId);
295                currentGroup.setResidueNumber(residueNumber);
296                currentGroup.setPDBName(compId);
297                currentGroup.setHetAtomInFile(isHetAtmInFile);
298            } else {
299                if (!residueNumber.equals(currentGroup.getResidueNumber())) {
300                    currentChain.addGroup(currentGroup);
301                    currentGroup.trimToSize();
302                    currentGroup = createGroup(recordName, oneLetterCode, compId, seqId);
303                    currentGroup.setPDBName(compId);
304                    currentGroup.setResidueNumber(residueNumber);
305                    currentGroup.setHetAtomInFile(isHetAtmInFile);
306                } else {
307                    if (altLocation != null && !altLocation.isEmpty() && !altLocation.equals(".")) {
308                        altGroup = getAltLocGroup(recordName, altLocation.charAt(0), oneLetterCode, compId, seqId);
309                        if (altGroup.getChain() == null) {
310                            altGroup.setChain(currentChain);
311                        }
312                    }
313                }
314            }
315
316            if (params.isParseCAOnly()) {
317                if (!labelAtomId.get(atomIndex).equals(StructureTools.CA_ATOM_NAME) && "C".equals(typeSymbol.get(atomIndex))) {
318                    continue;
319                }
320            }
321
322            Atom atom = new AtomImpl();
323
324            atom.setPDBserial(id.get(atomIndex));
325            atom.setName(labelAtomId.get(atomIndex));
326
327            atom.setX(cartnX.get(atomIndex));
328            atom.setY(cartnY.get(atomIndex));
329            atom.setZ(cartnZ.get(atomIndex));
330
331            atom.setOccupancy((float) occupancy.get(atomIndex));
332            atom.setTempFactor((float) bIsoOrEquiv.get(atomIndex));
333
334            if (altLocation == null || altLocation.isEmpty() || altLocation.equals(".")) {
335                atom.setAltLoc(' ');
336            } else {
337                atom.setAltLoc(altLocation.charAt(0));
338            }
339
340            String ts = typeSymbol.get(atomIndex);
341            try {
342                Element element = Element.valueOfIgnoreCase(ts);
343                atom.setElement(element);
344            }  catch (IllegalArgumentException e) {
345                logger.info("Element {} was not recognised as a BioJava-known element, the element will be " +
346                        "represented as the generic element {}", ts, Element.R.name());
347                atom.setElement(Element.R);
348            }
349
350            if (altGroup != null) {
351                altGroup.addAtom(atom);
352            } else {
353                currentGroup.addAtom(atom);
354            }
355
356            String atomName = atom.getName();
357            if (!currentGroup.hasAtom(atomName)) {
358                if (currentGroup.getPDBName().equals(atom.getGroup().getPDBName())) {
359                    if (!StructureTools.hasNonDeuteratedEquiv(atom, currentGroup)) {
360                        currentGroup.addAtom(atom);
361                    }
362                }
363            }
364        }
365    }
366
367    private Group getAltLocGroup(String recordName, Character altLoc, Character oneLetterCode, String threeLetterCode,
368                                 long seqId) {
369        List<Atom> atoms = currentGroup.getAtoms();
370        if (atoms.size() > 0) {
371            if (atoms.get(0).getAltLoc().equals(altLoc)) {
372                return currentGroup;
373            }
374        }
375
376        List<Group> altLocs = currentGroup.getAltLocs();
377        for (Group altLocGroup : altLocs) {
378            atoms = altLocGroup.getAtoms();
379            if (atoms.size() > 0) {
380                for (Atom a1 : atoms) {
381                    if (a1.getAltLoc().equals(altLoc)) {
382                        return altLocGroup;
383                    }
384                }
385            }
386        }
387
388        if (threeLetterCode.equals(currentGroup.getPDBName())) {
389            if (currentGroup.getAtoms().isEmpty()) {
390                return currentGroup;
391            }
392
393            Group altLocGroup = (Group) currentGroup.clone();
394            altLocGroup.setAtoms(new ArrayList<>());
395            altLocGroup.getAltLocs().clear();
396            currentGroup.addAltLoc(altLocGroup);
397            return altLocGroup;
398        }
399
400        Group altLocGroup = createGroup(recordName, oneLetterCode, threeLetterCode, seqId);
401        altLocGroup.setPDBName(threeLetterCode);
402        altLocGroup.setResidueNumber(currentGroup.getResidueNumber());
403        currentGroup.addAltLoc(altLocGroup);
404        return altLocGroup;
405    }
406
407    private Group createGroup(String record, Character oneLetterCode, String threeLetterCode, long seqId) {
408        Group group = ChemCompGroupFactory.getGroupFromChemCompDictionary(threeLetterCode);
409        if (group != null && !group.getChemComp().isEmpty()) {
410            if (group instanceof AminoAcidImpl) {
411                AminoAcidImpl aminoAcid = (AminoAcidImpl) group;
412                aminoAcid.setId(seqId);
413            } else if (group instanceof NucleotideImpl) {
414                NucleotideImpl nucleotide = (NucleotideImpl) group;
415                nucleotide.setId(seqId);
416            } else if (group instanceof HetatomImpl) {
417                HetatomImpl hetatom = (HetatomImpl) group;
418                hetatom.setId(seqId);
419            }
420            return group;
421        }
422
423        if ("ATOM".equals(record)) {
424            if (StructureTools.isNucleotide(threeLetterCode)) {
425                NucleotideImpl nucleotide = new NucleotideImpl();
426                group = nucleotide;
427                nucleotide.setId(seqId);
428            } else if (oneLetterCode == null || oneLetterCode == StructureTools.UNKNOWN_GROUP_LABEL) {
429                HetatomImpl hetatom = new HetatomImpl();
430                group = hetatom;
431                hetatom.setId(seqId);
432            } else {
433                AminoAcidImpl aminoAcid = new AminoAcidImpl();
434                group = aminoAcid;
435                aminoAcid.setAminoType(oneLetterCode);
436                aminoAcid.setId(seqId);
437            }
438        } else {
439            if (StructureTools.isNucleotide(threeLetterCode)) {
440                NucleotideImpl nucleotide = new NucleotideImpl();
441                group = nucleotide;
442                nucleotide.setId(seqId);
443            } else if (oneLetterCode != null) {
444                AminoAcidImpl aminoAcid = new AminoAcidImpl();
445                group = aminoAcid;
446                aminoAcid.setAminoType(oneLetterCode);
447                aminoAcid.setId(seqId);
448            } else {
449                HetatomImpl hetatom = new HetatomImpl();
450                hetatom.setId(seqId);
451                group = hetatom;
452            }
453        }
454        return group;
455    }
456
457    @Override
458    public void consumeAtomSites(AtomSites atomSites) {
459        // no atom sites present
460        if (!atomSites.isDefined() || atomSites.getRowCount() == 0) {
461            return;
462        }
463
464        try {
465            parsedScaleMatrix = new Matrix4d(
466                    atomSites.getFractTransfMatrix11().get(0),
467                    atomSites.getFractTransfMatrix12().get(0),
468                    atomSites.getFractTransfMatrix13().get(0),
469                    atomSites.getFractTransfVector1().get(0),
470
471                    atomSites.getFractTransfMatrix21().get(0),
472                    atomSites.getFractTransfMatrix22().get(0),
473                    atomSites.getFractTransfMatrix23().get(0),
474                    atomSites.getFractTransfVector2().get(0),
475
476                    atomSites.getFractTransfMatrix31().get(0),
477                    atomSites.getFractTransfMatrix32().get(0),
478                    atomSites.getFractTransfMatrix33().get(0),
479                    atomSites.getFractTransfVector3().get(0),
480
481                    0,
482                    0,
483                    0,
484                    1
485            );
486        } catch (NumberFormatException e) {
487            logger.warn("Some values in _atom_sites.fract_transf_matrix or _atom_sites.fract_transf_vector could not " +
488                    "be parsed as numbers. Can't check whether coordinate frame convention is correct! Error: {}",
489                    e.getMessage());
490            structure.getPDBHeader().getCrystallographicInfo().setNonStandardCoordFrameConvention(false);
491        }
492    }
493
494    @Override
495    public void consumeAuditAuthor(AuditAuthor auditAuthor) {
496        for (int rowIndex = 0; rowIndex < auditAuthor.getRowCount(); rowIndex++) {
497            String name = auditAuthor.getName().get(rowIndex);
498
499            StringBuilder last = new StringBuilder();
500            StringBuilder initials = new StringBuilder();
501            boolean afterComma = false;
502            for (char c : name.toCharArray()) {
503                if (c == ' ') {
504                    continue;
505                }
506                if (c == ',') {
507                    afterComma = true;
508                    continue;
509                }
510
511                if (afterComma) {
512                    initials.append(c);
513                } else {
514                    last.append(c);
515                }
516            }
517
518            StringBuilder newaa = new StringBuilder();
519            newaa.append(initials);
520            newaa.append(last);
521
522            String auth = pdbHeader.getAuthors();
523            if (auth == null) {
524                pdbHeader.setAuthors(newaa.toString());
525            } else {
526                auth += "," + newaa.toString();
527                pdbHeader.setAuthors(auth);
528            }
529        }
530    }
531
532    @Override
533    public void consumeCell(Cell cell) {
534        if (!cell.isDefined() || cell.getRowCount() == 0) {
535            return;
536        }
537
538        try {
539            float a = (float) cell.getLengthA().get(0);
540            float b = (float) cell.getLengthB().get(0);
541            float c = (float) cell.getLengthC().get(0);
542            float alpha = (float) cell.getAngleAlpha().get(0);
543            float beta = (float) cell.getAngleBeta().get(0);
544            float gamma = (float) cell.getAngleGamma().get(0);
545
546            CrystalCell crystalCell = new CrystalCell();
547            crystalCell.setA(a);
548            crystalCell.setB(b);
549            crystalCell.setC(c);
550            crystalCell.setAlpha(alpha);
551            crystalCell.setBeta(beta);
552            crystalCell.setGamma(gamma);
553
554            if (!crystalCell.isCellReasonable()) {
555                // If the entry describes a structure determined by a technique other than X-ray crystallography,
556                // cell is (sometimes!) a = b = c = 1.0, alpha = beta = gamma = 90 degrees
557                // if so we don't add and CrystalCell will be null
558                logger.debug("The crystal cell read from file does not have reasonable dimensions (at least one " +
559                        "dimension is below {}), discarding it.", CrystalCell.MIN_VALID_CELL_SIZE);
560                return;
561            }
562
563            structure.getPDBHeader()
564                    .getCrystallographicInfo()
565                    .setCrystalCell(crystalCell);
566
567        } catch (NumberFormatException e){
568            structure.getPDBHeader()
569                    .getCrystallographicInfo()
570                    .setCrystalCell(null);
571            logger.info("could not parse some cell parameters ({}), ignoring _cell", e.getMessage());
572        }
573    }
574
575    @Override
576    public void consumeChemComp(ChemComp chemComp) {
577        // TODO not impled in ref
578    }
579
580    @Override
581    public void consumeChemCompBond(ChemCompBond chemCompBond) {
582        // TODO not impled in ref
583    }
584
585    @Override
586    public void consumeDatabasePDBRemark(DatabasePDBRemark databasePDBremark) {
587        for (int rowIndex = 0; rowIndex < databasePDBremark.getRowCount(); rowIndex++) {
588            int id = databasePDBremark.getId().get(rowIndex);
589            if (id == 2) {
590                String line = databasePDBremark.getText().get(rowIndex);
591                int i = line.indexOf("ANGSTROM");
592
593                if (i > 5) {
594                    // line contains ANGSTROM info...
595                    String resolution = line.substring(i - 5, i).trim();
596                    // convert string to float
597                    try {
598                        float res = Float.parseFloat(resolution);
599                        pdbHeader.setResolution(res);
600                    } catch (NumberFormatException e) {
601                        logger.info("could not parse resolution from line and ignoring it {}", line);
602                        return;
603                    }
604                }
605            }
606        }
607    }
608
609    private Date convert(LocalDate localDate) {
610        return Date.from(localDate.atStartOfDay().atZone(ZoneId.systemDefault()).toInstant());
611    }
612
613    @Override
614    public void consumeDatabasePDBRev(DatabasePDBRev databasePDBrev) {
615        logger.debug("got a database revision:" + databasePDBrev);
616
617        for (int rowIndex = 0; rowIndex < databasePDBrev.getRowCount(); rowIndex++) {
618            if (databasePDBrev.getNum().get(rowIndex) == 1) {
619                String dateOriginal = databasePDBrev.getDateOriginal().get(rowIndex);
620                pdbHeader.setDepDate(convert(LocalDate.parse(dateOriginal, DATE_FORMAT)));
621
622                String date = databasePDBrev.getDate().get(rowIndex);
623                pdbHeader.setRelDate(convert(LocalDate.parse(date, DATE_FORMAT)));
624            } else {
625                String dbrev = databasePDBrev.getDate().get(rowIndex);
626                pdbHeader.setModDate(convert(LocalDate.parse(dbrev, DATE_FORMAT)));
627            }
628        }
629    }
630
631    @Override
632    public void consumeDatabasePDBRevRecord(DatabasePDBRevRecord databasePDBrevRecord) {
633        List<org.biojava.nbio.structure.DatabasePDBRevRecord> revRecords = pdbHeader.getRevisionRecords();
634        if (revRecords == null) {
635            revRecords = new ArrayList<>();
636            pdbHeader.setRevisionRecords(revRecords);
637        }
638
639        for (int i = 0; i < databasePDBrevRecord.getRowCount(); i++) {
640            revRecords.add(new org.biojava.nbio.structure.DatabasePDBRevRecord(databasePDBrevRecord, i));
641        }
642    }
643
644    @Override
645    public void consumeEntity(Entity entity) {
646        this.entity = entity;
647    }
648
649    @Override
650    public void consumeEntityPoly(EntityPoly entityPoly) {
651        this.entityPoly = entityPoly;
652    }
653
654    @Override
655    public void consumeEntitySrcGen(EntitySrcGen entitySrcGen) {
656        this.entitySrcGen = entitySrcGen;
657    }
658
659    @Override
660    public void consumeEntitySrcNat(EntitySrcNat entitySrcNat) {
661        this.entitySrcNat = entitySrcNat;
662    }
663
664    @Override
665    public void consumeEntitySrcSyn(PdbxEntitySrcSyn entitySrcSyn) {
666        this.entitySrcSyn = entitySrcSyn;
667    }
668
669    @Override
670    public void consumeEntityPolySeq(EntityPolySeq entityPolySeq) {
671        for (int rowIndex = 0; rowIndex < entityPolySeq.getRowCount(); rowIndex++) {
672            Chain entityChain = getEntityChain(entityPolySeq.getEntityId().get(rowIndex));
673
674            // first we check through the chemcomp provider, if it fails we do some heuristics to guess the type of group
675            // TODO some of this code is analogous to getNewGroup() and we should try to unify them - JD 2016-03-08
676
677            Group g = ChemCompGroupFactory.getGroupFromChemCompDictionary(entityPolySeq.getMonId().get(rowIndex));
678            //int seqId = Integer.parseInt(entityPolySeq.getNum());
679            if (g != null && !g.getChemComp().isEmpty()) {
680                if (g instanceof AminoAcidImpl) {
681                    AminoAcidImpl aa = (AminoAcidImpl) g;
682                    aa.setRecordType(AminoAcid.SEQRESRECORD);
683                }
684            } else {
685                if (entityPolySeq.getMonId().get(rowIndex).length() == 3 &&
686                        StructureTools.get1LetterCodeAmino(entityPolySeq.getMonId().get(rowIndex)) != null) {
687                    AminoAcidImpl a = new AminoAcidImpl();
688                    a.setRecordType(AminoAcid.SEQRESRECORD);
689                    Character code1 = StructureTools.get1LetterCodeAmino(entityPolySeq.getMonId().get(rowIndex));
690                    a.setAminoType(code1);
691                    g = a;
692
693                } else if (StructureTools.isNucleotide(entityPolySeq.getMonId().get(rowIndex))) {
694                    // the group is actually a nucleotide group...
695                    g = new NucleotideImpl();
696                } else {
697                    logger.debug("Residue {} {} is not a standard aminoacid or nucleotide, will create a het group " +
698                            "for it", entityPolySeq.getNum().get(rowIndex), entityPolySeq.getMonId().get(rowIndex));
699                    g = new HetatomImpl();
700                }
701            }
702            // at this stage we don't know about author residue numbers (insertion codes)
703            // we abuse now the ResidueNumber field setting the internal residue numbers (label_seq_id, strictly
704            // sequential and follow the seqres sequence 1 to n)
705            // later the actual ResidueNumbers (author residue numbers) have to be corrected in alignSeqRes()
706            g.setResidueNumber(ResidueNumber.fromString(entityPolySeq.getNum().getStringData(rowIndex)));
707            g.setPDBName(entityPolySeq.getMonId().get(rowIndex));
708            entityChain.addGroup(g);
709        }
710    }
711
712    private Chain getEntityChain(String entityId) {
713        for (Chain chain : entityChains) {
714            if (chain.getId().equals(entityId)) {
715                return chain;
716            }
717        }
718
719        // does not exist yet, so create...
720        Chain chain = new ChainImpl();
721        chain.setId(entityId);
722        entityChains.add(chain);
723
724        return chain;
725    }
726
727    @Override
728    public void consumeExptl(Exptl exptl) {
729        for (int rowIndex = 0; rowIndex < exptl.getRowCount(); rowIndex++) {
730            pdbHeader.setExperimentalTechnique(exptl.getMethod().get(rowIndex));
731        }
732    }
733
734    @Override
735    public void consumePdbxAuditRevisionHistory(PdbxAuditRevisionHistory pdbxAuditRevisionHistory) {
736        for (int rowIndex = 0; rowIndex < pdbxAuditRevisionHistory.getRowCount(); rowIndex++) {
737            // first entry in revision history is the release date
738            if (pdbxAuditRevisionHistory.getOrdinal().get(rowIndex) == 1) {
739                String release = pdbxAuditRevisionHistory.getRevisionDate().get(rowIndex);
740                pdbHeader.setRelDate(convert(LocalDate.parse(release, DATE_FORMAT)));
741            } else {
742                // all other dates are revision dates;
743                // since this method may be called multiple times,
744                // the last revision date will "stick"
745                String revision = pdbxAuditRevisionHistory.getRevisionDate().get(rowIndex);
746                pdbHeader.setModDate(convert(LocalDate.parse(revision, DATE_FORMAT)));
747            }
748        }
749    }
750
751    @Override
752    public void consumePdbxChemCompIdentifier(PdbxChemCompIdentifier pdbxChemCompIdentifier) {
753        // TODO not impled in ref
754    }
755
756    @Override
757    public void consumePdbxDatabaseStatus(PdbxDatabaseStatus pdbxDatabaseStatus) {
758        for (int rowIndex = 0; rowIndex < pdbxDatabaseStatus.getRowCount(); rowIndex++) {
759            // the deposition date field is only available in mmCIF 5.0
760            StrColumn recvdInitialDepositionDate = pdbxDatabaseStatus.getRecvdInitialDepositionDate();
761            if (recvdInitialDepositionDate.isDefined()) {
762                String deposition = recvdInitialDepositionDate.get(rowIndex);
763                pdbHeader.setDepDate(convert(LocalDate.parse(deposition, DATE_FORMAT)));
764            }
765        }
766    }
767
768    @Override
769    public void consumePdbxEntityBranchDescriptor(PdbxEntityBranchDescriptor pdbxEntityBranchDescriptor) {
770        // TODO not considered in ref
771    }
772
773    @Override
774    public void consumePdbxMolecule(PdbxMolecule pdbxMolecule) {
775        // TODO not considered in ref
776    }
777
778    @Override
779    public void consumePdbxMoleculeFeatures(PdbxMoleculeFeatures pdbxMoleculeFeatures) {
780        // TODO not considered in ref
781    }
782
783    @Override
784    public void consumePdbxNonpolyScheme(PdbxNonpolyScheme pdbxNonpolyScheme) {
785        // TODO not impled in ref
786    }
787
788    @Override
789    public void consumePdbxReferenceEntityLink(PdbxReferenceEntityLink pdbxReferenceEntityLink) {
790        // TODO not considered in ref
791    }
792
793    @Override
794    public void consumePdbxReferenceEntityList(PdbxReferenceEntityList pdbxReferenceEntityList) {
795        // TODO not considered in ref
796    }
797
798    @Override
799    public void consumePdbxReferenceEntityPolyLink(PdbxReferenceEntityPolyLink pdbxReferenceEntityPolyLink) {
800        // TODO not considered in ref
801    }
802
803    @Override
804    public void consumePdbxStructAssembly(PdbxStructAssembly pdbxStructAssembly) {
805        this.structAssembly = pdbxStructAssembly;
806    }
807
808    @Override
809    public void consumePdbxStructAssemblyGen(PdbxStructAssemblyGen pdbxStructAssemblyGen) {
810        this.structAssemblyGen = pdbxStructAssemblyGen;
811    }
812
813    @Override
814    public void consumePdbxStructModResidue(PdbxStructModResidue pdbxStructModResidue) {
815        // TODO not considered in ref
816    }
817
818    @Override
819    public void consumePdbxStructOperList(PdbxStructOperList pdbxStructOperList) {
820        this.structOpers = pdbxStructOperList;
821    }
822
823    @Override
824    public void consumeRefine(Refine refine) {
825        for (int rowIndex = 0; rowIndex < refine.getRowCount(); rowIndex++) {
826            // RESOLUTION
827            // in very rare cases (for instance hybrid methods x-ray + neutron diffraction, e.g. 3ins, 4n9m)
828            // there are 2 resolution values, one for each method
829            // we take the last one found so that behaviour is like in PDB file parsing
830            double lsDResHigh = refine.getLsDResHigh().get(rowIndex);
831            // TODO this could use a check to keep reasonable values - 1.5 may be overwritten by 0.0
832            if (pdbHeader.getResolution() != PDBHeader.DEFAULT_RESOLUTION) {
833                logger.warn("More than 1 resolution value present, will use last one {} and discard previous {}",
834                        lsDResHigh, String.format("%4.2f",pdbHeader.getResolution()));
835            }
836            pdbHeader.setResolution((float) lsDResHigh);
837
838            FloatColumn lsRFactorRFree = refine.getLsRFactorRFree();
839            // RFREE
840            if (pdbHeader.getRfree() != PDBHeader.DEFAULT_RFREE) {
841                logger.warn("More than 1 Rfree value present, will use last one {} and discard previous {}",
842                        lsRFactorRFree, String.format("%4.2f",pdbHeader.getRfree()));
843            }
844            if (lsRFactorRFree.isDefined() && lsRFactorRFree.getValueKind(rowIndex) == ValueKind.PRESENT) {
845                pdbHeader.setRfree((float) lsRFactorRFree.get(rowIndex));
846            } else {
847                // some entries like 2ifo haven't got this field at all
848                logger.info("_refine.ls_R_factor_R_free not present, not parsing Rfree value");
849            }
850
851            // RWORK
852            FloatColumn lsRFactorRWork = refine.getLsRFactorRWork();
853            if(pdbHeader.getRwork() != PDBHeader.DEFAULT_RFREE) {
854                logger.warn("More than 1 R work value present, will use last one {} and discard previous {} ",
855                        lsRFactorRWork, String.format("%4.2f",pdbHeader.getRwork()));
856            }
857            if (lsRFactorRWork.isDefined() && lsRFactorRWork.getValueKind(rowIndex) == ValueKind.PRESENT) {
858                pdbHeader.setRwork((float) lsRFactorRWork.get(rowIndex));
859            } else {
860                logger.info("_refine.ls_R_factor_R_work not present, not parsing R-work value");
861            }
862        }
863    }
864
865    @Override
866    public void consumeStruct(Struct struct) {
867        if (struct.isDefined() && struct.getTitle().isDefined()) {
868            pdbHeader.setTitle(struct.getTitle().get(0));
869        }
870
871        if (struct.isDefined() && struct.getEntryId().isDefined()) {
872            PdbId pdbId;
873            String pdbCode = struct.getEntryId().get(0);
874                        try {
875                                pdbId = new PdbId(pdbCode);
876                        } catch (IllegalArgumentException e) {
877                                logger.info("Malformed (or null) PDB ID {}. setting PdbId to null", pdbCode);
878                                pdbId = null;
879                        }
880            pdbHeader.setPdbId(pdbId);
881            structure.setPdbId(pdbId);
882        }
883    }
884
885    @Override
886    public void consumeStructAsym(StructAsym structAsym) {
887        this.structAsym = structAsym;
888    }
889
890    @Override
891    public void consumeStructConf(StructConf structConf) {
892        // TODO not considered in ref
893    }
894
895    @Override
896    public void consumeStructConn(StructConn structConn) {
897        this.structConn = structConn;
898    }
899
900    @Override
901    public void consumeStructConnType(StructConnType structConnType) {
902        // TODO not considered in ref
903    }
904
905    @Override
906    public void consumeStructKeywords(StructKeywords structKeywords) {
907        ArrayList<String> keywordsList = new ArrayList<String>();
908
909        StrColumn text = structKeywords.getText();
910        if (text.isDefined()) {
911            String keywords = text.get(0);
912            String[] strings = keywords.split(" *, *");
913            for (String string : strings) {
914                keywordsList.add(string.trim());
915            }
916        }
917        structure.getPDBHeader().setKeywords(keywordsList);
918
919        StrColumn pdbxKeywords = structKeywords.getPdbxKeywords();
920        if (pdbxKeywords.isDefined()) {
921            String keywords = pdbxKeywords.get(0);
922            pdbHeader.setClassification(keywords);
923            //This field should be left empty. TODO The next line should be removed later
924            pdbHeader.setDescription(keywords);
925        }
926    }
927
928    @Override
929    public void consumeStructNcsOper(StructNcsOper structNcsOper) {
930        this.structNcsOper = structNcsOper;
931    }
932
933    @Override
934    public void consumeStructRef(StructRef structRef) {
935        this.structRef = structRef;
936    }
937
938    @Override
939    public void consumeStructRefSeq(StructRefSeq structRefSeq) {
940        for (int rowIndex = 0; rowIndex < structRefSeq.getRowCount(); rowIndex++) {
941            String refId = structRefSeq.getRefId().get(rowIndex);
942
943            DBRef dbRef = new DBRef();
944
945            dbRef.setIdCode(structRefSeq.getPdbxPDBIdCode().get(rowIndex));
946            dbRef.setDbAccession(structRefSeq.getPdbxDbAccession().get(rowIndex));
947            dbRef.setDbIdCode(structRefSeq.getPdbxDbAccession().get(rowIndex));
948            dbRef.setChainName(structRefSeq.getPdbxStrandId().get(rowIndex));
949
950            OptionalInt structRefRowIndex = IntStream.range(0, structRef.getRowCount())
951                    .filter(i -> structRef.getId().get(i).equals(refId))
952                    .findFirst();
953
954            if (structRefRowIndex.isPresent()) {
955                dbRef.setDatabase(structRef.getDbName().get(structRefRowIndex.getAsInt()));
956                dbRef.setDbIdCode(structRef.getDbCode().get(structRefRowIndex.getAsInt()));
957            } else {
958                logger.info("could not find StructRef `{} for StructRefSeq {}", refId, rowIndex);
959            }
960
961            int seqBegin;
962            int seqEnd;
963
964            try {
965                seqBegin = Integer.parseInt(structRefSeq.getPdbxAuthSeqAlignBeg().get(rowIndex));
966                seqEnd = Integer.parseInt(structRefSeq.getPdbxAuthSeqAlignEnd().get(rowIndex));
967            } catch (NumberFormatException e) {
968                // this happens in a few entries, annotation error? e.g. 6eoj
969                logger.warn("Couldn't parse pdbx_auth_seq_align_beg/end in _struct_ref_seq. Will not store dbref " +
970                        "alignment info for accession {}. Error: {}", dbRef.getDbAccession(), e.getMessage());
971                return;
972            }
973
974            char beginInsCode = ' ';
975            String pdbxSeqAlignBegInsCode = structRefSeq.getPdbxSeqAlignBegInsCode().get(rowIndex);
976            if (pdbxSeqAlignBegInsCode.length() > 0) {
977                beginInsCode = pdbxSeqAlignBegInsCode.charAt(0);
978            }
979
980            char endInsCode = ' ';
981            String pdbxSeqAlignEndInsCode = structRefSeq.getPdbxSeqAlignEndInsCode().get(rowIndex);
982            if (pdbxSeqAlignEndInsCode.length() > 0) {
983                endInsCode = pdbxSeqAlignEndInsCode.charAt(0);
984            }
985
986            if (beginInsCode == '?') {
987                beginInsCode = ' ';
988            }
989            if (endInsCode == '?') {
990                endInsCode = ' ';
991            }
992
993            dbRef.setSeqBegin(seqBegin);
994            dbRef.setInsertBegin(beginInsCode);
995            dbRef.setSeqEnd(seqEnd);
996            dbRef.setInsertEnd(endInsCode);
997
998            int dbSeqBegin = structRefSeq.getDbAlignBeg().get(rowIndex);
999            int dbSeqEnd = structRefSeq.getDbAlignEnd().get(rowIndex);
1000
1001            char dbBeginInsCode = ' ';
1002            StrColumn pdbxDbAlignBegInsCodeCol = structRefSeq.getPdbxDbAlignBegInsCode();
1003            if (pdbxDbAlignBegInsCodeCol.isDefined()) {
1004                String pdbxDbAlignBegInsCode = pdbxDbAlignBegInsCodeCol.get(rowIndex);
1005                if (pdbxDbAlignBegInsCode.length() > 0) {
1006                    dbBeginInsCode = pdbxDbAlignBegInsCode.charAt(0);
1007                }
1008            }
1009
1010            char dbEndInsCode = ' ';
1011            StrColumn pdbxDbAlignEndInsCodeCol = structRefSeq.getPdbxDbAlignEndInsCode();
1012            if (pdbxDbAlignEndInsCodeCol.isDefined()) {
1013                String pdbxDbAlignEndInsCode = pdbxDbAlignEndInsCodeCol.get(rowIndex);
1014                if (pdbxDbAlignEndInsCode.length() > 0) {
1015                    dbEndInsCode = pdbxDbAlignEndInsCode.charAt(0);
1016                }
1017            }
1018
1019            if (dbBeginInsCode == '?') {
1020                dbBeginInsCode = ' ';
1021            }
1022            if (dbEndInsCode == '?') {
1023                dbEndInsCode = ' ';
1024            }
1025
1026            dbRef.setDbSeqBegin(dbSeqBegin);
1027            dbRef.setIdbnsBegin(dbBeginInsCode);
1028            dbRef.setDbSeqEnd(dbSeqEnd);
1029            dbRef.setIdbnsEnd(dbEndInsCode);
1030
1031            List<DBRef> dbrefs = structure.getDBRefs();
1032            if (dbrefs == null) {
1033                dbrefs = new ArrayList<>();
1034            }
1035            dbrefs.add(dbRef);
1036
1037            logger.debug(dbRef.toPDB());
1038
1039            structure.setDBRefs(dbrefs);
1040        }
1041    }
1042
1043    @Override
1044    public void consumeStructRefSeqDif(StructRefSeqDif structRefSeqDif) {
1045        this.structRefSeqDif = structRefSeqDif;
1046    }
1047
1048    @Override
1049    public void consumeStructSheetRange(StructSheetRange structSheetRange) {
1050        // TODO not considered in ref
1051    }
1052
1053    @Override
1054    public void consumeStructSite(StructSite structSite) {
1055        if (params.isHeaderOnly()) {
1056            return;
1057        }
1058
1059        List<Site> sites = structure.getSites();
1060        if (sites == null) {
1061            sites = new ArrayList<>();
1062        }
1063
1064        for (int rowIndex = 0; rowIndex < structSite.getRowCount(); rowIndex++) {
1065            Site site = null;
1066            for (Site asite : sites) {
1067                if (asite.getSiteID().equals(structSite.getId().get(rowIndex))) {
1068                    site = asite; // prevent duplicate siteIds
1069                }
1070            }
1071
1072            boolean addSite = false;
1073            if (site == null) {
1074                site = new Site();
1075                addSite = true;
1076            }
1077
1078            site.setSiteID(structSite.getId().get(rowIndex));
1079            site.setDescription(structSite.getDetails().get(rowIndex));
1080            site.setEvCode(structSite.getPdbxEvidenceCode().get(rowIndex));
1081
1082            if (addSite) {
1083                sites.add(site);
1084            }
1085        }
1086
1087        structure.setSites(sites);
1088    }
1089
1090    @Override
1091    public void consumeStructSiteGen(StructSiteGen structSiteGen) {
1092        this.structSiteGen = structSiteGen;
1093    }
1094
1095    @Override
1096    public void consumeSymmetry(Symmetry symmetry) {
1097        for (int rowIndex = 0; rowIndex < symmetry.getRowCount(); rowIndex++) {
1098            String spaceGroupString = symmetry.getSpaceGroupNameH_M().get(rowIndex);
1099            SpaceGroup spaceGroup = SymoplibParser.getSpaceGroup(spaceGroupString);
1100            if (spaceGroup == null) {
1101                logger.warn("Space group '{}' not recognised as a standard space group", spaceGroupString);
1102                structure.getPDBHeader()
1103                        .getCrystallographicInfo()
1104                        .setNonStandardSg(true);
1105            } else {
1106                structure.getPDBHeader()
1107                        .getCrystallographicInfo()
1108                        .setSpaceGroup(spaceGroup);
1109                structure.getPDBHeader()
1110                        .getCrystallographicInfo()
1111                        .setNonStandardSg(false);
1112            }
1113        }
1114    }
1115
1116    @Override
1117    public void finish() {
1118        if (currentChain != null) {
1119            currentChain.addGroup(currentGroup);
1120
1121            Optional<Chain> testChain = currentModel.stream()
1122                    .filter(chain -> chain.getId().equals(currentChain.getId()))
1123                    .findFirst();
1124
1125            if (!testChain.isPresent()) {
1126                currentModel.add(currentChain);
1127            }
1128        } else if (!params.isHeaderOnly()) {
1129            logger.warn("current chain is null at end of document.");
1130        }
1131
1132        allModels.add(currentModel);
1133
1134        initMaps();
1135
1136        for (int rowIndex = 0; rowIndex < structAsym.getRowCount(); rowIndex++) {
1137            String id = structAsym.getId().get(rowIndex);
1138            String entityId = structAsym.getEntityId().get(rowIndex);
1139            logger.debug("Entity {} matches asym_id: {}", entityId, id);
1140
1141            Chain chain = getEntityChain(entityId);
1142            Chain seqRes = (Chain) chain.clone();
1143            // to solve issue #160 (e.g. 3u7t)
1144            seqRes = removeSeqResHeterogeneity(seqRes);
1145            seqRes.setId(id);
1146            seqRes.setName(asymId2authorId.getOrDefault(id, id));
1147
1148            EntityType type = EntityType.entityTypeFromString(getEntityType(entityId));
1149            if (type == null || type == EntityType.POLYMER) {
1150                seqResChains.add(seqRes);
1151            }
1152
1153            logger.debug(" seqres: {} {}<", id, seqRes);
1154            addEntity(rowIndex, entityId, getEntityDescription(entityId), getEntityType(entityId));
1155        }
1156
1157        if (!structAsym.isDefined() || structAsym.getRowCount() == 0) {
1158            logger.warn("No _struct_asym category in file, no SEQRES groups will be added.");
1159        }
1160
1161        // entities
1162        // In addEntities above we created the entities if they were present in the file
1163        // Now we need to make sure that they are linked to chains and also that if they are not present in the file we
1164        // need to add them now
1165        linkEntities();
1166
1167        // now that we know the entities, we can add all chains to structure so that they are stored
1168        // properly as polymer/nonpolymer/water chains inside structure
1169        allModels.forEach(structure::addModel);
1170
1171        // Only align if requested (default) and not when headerOnly mode with no Atoms.
1172        // Otherwise, we store the empty SeqRes Groups unchanged in the right chains.
1173        if (params.isAlignSeqRes() && !params.isHeaderOnly()){
1174            logger.debug("Parsing mode align_seqres, will parse SEQRES and align to ATOM sequence");
1175            alignSeqRes();
1176        } else {
1177            logger.debug("Parsing mode unalign_seqres, will parse SEQRES but not align it to ATOM sequence");
1178            SeqRes2AtomAligner.storeUnAlignedSeqRes(structure, seqResChains, params.isHeaderOnly());
1179        }
1180
1181        // Now make sure all altlocgroups have all the atoms in all the groups
1182        StructureTools.cleanUpAltLocs(structure);
1183
1184        // NOTE bonds and charges can only be done at this point that the chain id mapping is properly sorted out
1185        if (!params.isHeaderOnly()) {
1186            if (params.shouldCreateAtomBonds()) {
1187                addBonds();
1188            }
1189
1190            if (params.shouldCreateAtomCharges()) {
1191                addCharges();
1192            }
1193        }
1194
1195        if (!params.isHeaderOnly()) {
1196            addSites();
1197        }
1198
1199        // set the oligomeric state info in the header...
1200        if (params.isParseBioAssembly()) {
1201            // the more detailed mapping of chains to rotation operations happens in StructureIO...
1202
1203            Map<Integer, BioAssemblyInfo> bioAssemblies = new LinkedHashMap<>();
1204            for (int i = 0; i < structAssembly.getRowCount(); i++) {
1205                String assemblyId = structAssembly.getId().get(i);
1206                List<Integer> structAssemblyGenIndices = new ArrayList<>();
1207                for (int j = 0; j < structAssemblyGen.getRowCount(); j++) {
1208                    if (structAssemblyGen.getAssemblyId().get(j).equals(assemblyId)) {
1209                        structAssemblyGenIndices.add(j);
1210                    }
1211                }
1212                BiologicalAssemblyBuilder builder = new BiologicalAssemblyBuilder();
1213                // these are the transformations that need to be applied to our model
1214                List<BiologicalAssemblyTransformation> transformations = builder.getBioUnitTransformationList(structAssembly,
1215                        i, structAssemblyGen, structOpers);
1216
1217                int bioAssemblyId = -1;
1218                try {
1219                    bioAssemblyId = Integer.parseInt(assemblyId);
1220                } catch (NumberFormatException e) {
1221                    logger.info("Could not parse a numerical bio assembly id from '{}'", assemblyId);
1222                }
1223
1224                // if bioassembly id is not numerical we throw it away
1225                // this happens usually for viral capsid entries, like 1ei7
1226                // see issue #230 in github
1227                if (bioAssemblyId != -1) {
1228                    int mmSize = 0;
1229                    // note that the transforms contain asym ids of both polymers and non-polymers
1230                    // For the mmsize, we are only interested in the polymers
1231                    for (BiologicalAssemblyTransformation transf : transformations) {
1232                        Chain c = structure.getChain(transf.getChainId());
1233                        if (c == null) {
1234                            logger.info("Could not find asym id {} specified in struct_assembly_gen", transf.getChainId());
1235                            continue;
1236                        }
1237                        if (c.getEntityType() == EntityType.POLYMER &&
1238                                // for entries like 4kro, sugars are annotated as polymers but we
1239                                // don't want them in the macromolecularSize count
1240                                !c.getEntityInfo().getDescription().contains("SUGAR")) {
1241                            mmSize++;
1242                        }
1243                    }
1244
1245                    BioAssemblyInfo bioAssembly = new BioAssemblyInfo();
1246                    bioAssembly.setId(bioAssemblyId);
1247                    bioAssembly.setMacromolecularSize(mmSize);
1248                    bioAssembly.setTransforms(transformations);
1249                    bioAssemblies.put(bioAssemblyId, bioAssembly);
1250                }
1251
1252            }
1253            structure.getPDBHeader()
1254                    .setBioAssemblies(bioAssemblies);
1255        }
1256
1257        setStructNcsOps();
1258        setCrystallographicInfoMetadata();
1259
1260        Map<String, List<SeqMisMatch>> misMatchMap = new HashMap<>();
1261        for (int rowIndex = 0; rowIndex < structRefSeqDif.getRowCount(); rowIndex++) {
1262            SeqMisMatch seqMisMatch = new SeqMisMatchImpl();
1263            seqMisMatch.setDetails(structRefSeqDif.getDetails().get(rowIndex));
1264
1265            String insCode = structRefSeqDif.getPdbxPdbInsCode().get(rowIndex);
1266                if (insCode != null && insCode.equals("?")) {
1267                insCode = null;
1268            }
1269            seqMisMatch.setInsCode(insCode);
1270            seqMisMatch.setOrigGroup(structRefSeqDif.getDbMonId().get(rowIndex));
1271            seqMisMatch.setPdbGroup(structRefSeqDif.getMonId().get(rowIndex));
1272            seqMisMatch.setPdbResNum(structRefSeqDif.getPdbxAuthSeqNum().get(rowIndex));
1273            seqMisMatch.setUniProtId(structRefSeqDif.getPdbxSeqDbAccessionCode().get(rowIndex));
1274            seqMisMatch.setSeqNum(structRefSeqDif.getSeqNum().get(rowIndex));
1275
1276            String strandId = structRefSeqDif.getPdbxPdbStrandId().get(rowIndex);
1277            List<SeqMisMatch> seqMisMatches = misMatchMap.computeIfAbsent(strandId, k -> new ArrayList<>());
1278            seqMisMatches.add(seqMisMatch);
1279        }
1280
1281        for (String chainId : misMatchMap.keySet()){
1282            Chain chain = structure.getPolyChainByPDB(chainId);
1283            if (chain == null) {
1284                logger.warn("Could not set mismatches for chain with author id {}", chainId);
1285                continue;
1286            }
1287
1288            chain.setSeqMisMatches(misMatchMap.get(chainId));
1289        }
1290    }
1291
1292    private String getEntityType(String entityId) {
1293        return IntStream.range(0, entity.getRowCount())
1294                .filter(i -> entity.getId().get(i).equals(entityId))
1295                .mapToObj(i -> entity.getType().get(i))
1296                .findFirst()
1297                .orElseThrow(() -> new NoSuchElementException("could not find entity with id " + entityId));
1298    }
1299
1300    private String getEntityDescription(String entityId) {
1301        return IntStream.range(0, entity.getRowCount())
1302                .filter(i -> entity.getId().get(i).equals(entityId))
1303                .mapToObj(i -> entity.getPdbxDescription().get(i))
1304                .findFirst()
1305                .orElseThrow(() -> new NoSuchElementException("could not find entity with id " + entityId));
1306    }
1307
1308    private void addEntity(int asymRowIndex, String entityId, String pdbxDescription, String type) {
1309        int eId = 0;
1310        try {
1311            eId = Integer.parseInt(entityId);
1312        } catch (NumberFormatException e) {
1313            logger.warn("Could not parse mol_id from string {}. Will use 0 for creating Entity", entityId);
1314        }
1315
1316        int entityRowIndex = IntStream.range(0, entity.getRowCount())
1317                .filter(i -> entity.getId().get(i).equals(entityId))
1318                .findFirst()
1319                .orElse(-1);
1320
1321        EntityInfo entityInfo = structure.getEntityById(eId);
1322
1323        if (entityInfo == null) {
1324            entityInfo = new EntityInfo();
1325            entityInfo.setMolId(eId);
1326            // we only add the compound if a polymeric one (to match what the PDB parser does)
1327            if (entityRowIndex != -1) {
1328                entityInfo.setDescription(pdbxDescription);
1329
1330                EntityType eType = EntityType.entityTypeFromString(type);
1331                if (eType != null) {
1332                    entityInfo.setType(eType);
1333                } else {
1334                    logger.warn("Type '{}' is not recognised as a valid entity type for entity {}", type, eId);
1335                }
1336                addAncilliaryEntityData(asymRowIndex, entityInfo);
1337                structure.addEntityInfo(entityInfo);
1338                logger.debug("Adding Entity with entity id {} from _entity, with name: {}", eId,
1339                        entityInfo.getDescription());
1340            }
1341        }
1342    }
1343
1344    private void addAncilliaryEntityData(int asymRowIndex, EntityInfo entityInfo) {
1345        // Loop through each of the entity types and add the corresponding data
1346        // We're assuming if data is duplicated between sources it is consistent
1347        // This is a potentially huge assumption...
1348
1349        for (int rowIndex = 0; rowIndex < entitySrcGen.getRowCount(); rowIndex++) {
1350            if (!entitySrcGen.getEntityId().get(rowIndex).equals(structAsym.getEntityId().get(asymRowIndex))) {
1351                continue;
1352            }
1353
1354            addInformationFromEntitySrcGen(rowIndex, entityInfo);
1355        }
1356
1357        for (int rowIndex = 0; rowIndex < entitySrcNat.getRowCount(); rowIndex++) {
1358            if (!entitySrcNat.getEntityId().get(rowIndex).equals(structAsym.getEntityId().get(asymRowIndex))) {
1359                continue;
1360            }
1361
1362            addInformationFromEntitySrcNat(rowIndex, entityInfo);
1363        }
1364
1365        for (int rowIndex = 0; rowIndex < entitySrcSyn.getRowCount(); rowIndex++) {
1366            if (!entitySrcSyn.getEntityId().get(rowIndex).equals(structAsym.getEntityId().get(asymRowIndex))) {
1367                continue;
1368            }
1369
1370            addInformationFromEntitySrcSyn(rowIndex, entityInfo);
1371        }
1372    }
1373
1374    private void addInformationFromEntitySrcSyn(int rowIndex, EntityInfo entityInfo) {
1375        entityInfo.setOrganismCommon(entitySrcSyn.getOrganismCommonName().get(rowIndex));
1376        entityInfo.setOrganismScientific(entitySrcSyn.getOrganismScientific().get(rowIndex));
1377        entityInfo.setOrganismTaxId(entitySrcSyn.getNcbiTaxonomyId().get(rowIndex));
1378    }
1379
1380    private void addInformationFromEntitySrcNat(int rowIndex, EntityInfo entityInfo) {
1381        entityInfo.setAtcc(entitySrcNat.getPdbxAtcc().get(rowIndex));
1382        entityInfo.setCell(entitySrcNat.getPdbxCell().get(rowIndex));
1383        entityInfo.setOrganismCommon(entitySrcNat.getCommonName().get(rowIndex));
1384        entityInfo.setOrganismScientific(entitySrcNat.getPdbxOrganismScientific().get(rowIndex));
1385        entityInfo.setOrganismTaxId(entitySrcNat.getPdbxNcbiTaxonomyId().get(rowIndex));
1386    }
1387
1388    private void addInformationFromEntitySrcGen(int rowIndex, EntityInfo entityInfo) {
1389        entityInfo.setAtcc(entitySrcGen.getPdbxGeneSrcAtcc().get(rowIndex));
1390        entityInfo.setCell(entitySrcGen.getPdbxGeneSrcCell().get(rowIndex));
1391        entityInfo.setOrganismCommon(entitySrcGen.getGeneSrcCommonName().get(rowIndex));
1392        entityInfo.setOrganismScientific(entitySrcGen.getPdbxGeneSrcScientificName().get(rowIndex));
1393        entityInfo.setOrganismTaxId(entitySrcGen.getPdbxGeneSrcNcbiTaxonomyId().get(rowIndex));
1394        entityInfo.setExpressionSystemTaxId(entitySrcGen.getPdbxHostOrgNcbiTaxonomyId().get(rowIndex));
1395        entityInfo.setExpressionSystem(entitySrcGen.getPdbxHostOrgScientificName().get(rowIndex));
1396    }
1397
1398    private void setStructNcsOps() {
1399        List<Matrix4d> ncsOperators = new ArrayList<>();
1400
1401        for (int rowIndex = 0; rowIndex < structNcsOper.getRowCount(); rowIndex++) {
1402            if (!"generate".equals(structNcsOper.getCode().get(rowIndex))) {
1403                continue;
1404            }
1405
1406            try {
1407                Matrix4d operator = new Matrix4d();
1408
1409                operator.setElement(0, 0, structNcsOper.getMatrix11().get(rowIndex));
1410                operator.setElement(0, 1, structNcsOper.getMatrix12().get(rowIndex));
1411                operator.setElement(0, 2, structNcsOper.getMatrix13().get(rowIndex));
1412                operator.setElement(0, 3, structNcsOper.getVector1().get(rowIndex));
1413
1414                operator.setElement(1, 0, structNcsOper.getMatrix21().get(rowIndex));
1415                operator.setElement(1, 1, structNcsOper.getMatrix22().get(rowIndex));
1416                operator.setElement(1, 2, structNcsOper.getMatrix23().get(rowIndex));
1417                operator.setElement(1, 3, structNcsOper.getVector2().get(rowIndex));
1418
1419                operator.setElement(2, 0, structNcsOper.getMatrix31().get(rowIndex));
1420                operator.setElement(2, 1, structNcsOper.getMatrix32().get(rowIndex));
1421                operator.setElement(2, 2, structNcsOper.getMatrix33().get(rowIndex));
1422                operator.setElement(2, 3, structNcsOper.getVector3().get(rowIndex));
1423
1424                operator.setElement(3, 0, 0);
1425                operator.setElement(3, 1, 0);
1426                operator.setElement(3, 2, 0);
1427                operator.setElement(3, 3, 1);
1428
1429                ncsOperators.add(operator);
1430            } catch (NumberFormatException e) {
1431                logger.warn("Error parsing doubles in NCS operator list, skipping operator {}", rowIndex + 1);
1432            }
1433        }
1434
1435        if (ncsOperators.size() > 0) {
1436            structure.getCrystallographicInfo()
1437                    .setNcsOperators(ncsOperators.toArray(new Matrix4d[0]));
1438        }
1439    }
1440
1441    private void setCrystallographicInfoMetadata() {
1442        if (parsedScaleMatrix != null) {
1443            PDBCrystallographicInfo crystalInfo = structure.getCrystallographicInfo();
1444            boolean nonStd = false;
1445            if (crystalInfo.getCrystalCell() != null && !crystalInfo.getCrystalCell().checkScaleMatrix(parsedScaleMatrix)) {
1446                nonStd = true;
1447            }
1448
1449            crystalInfo.setNonStandardCoordFrameConvention(nonStd);
1450        }
1451    }
1452
1453    private void addSites() {
1454        List<Site> sites = structure.getSites();
1455        if (sites == null) sites = new ArrayList<>();
1456
1457        for (int rowIndex = 0; rowIndex < structSiteGen.getRowCount(); rowIndex++) {
1458            // For each StructSiteGen, find the residues involved, if they exist then
1459            String site_id = structSiteGen.getSiteId().get(rowIndex); // multiple could be in same site.
1460            if (site_id == null) {
1461                site_id = "";
1462            }
1463            String comp_id = structSiteGen.getLabelCompId().get(rowIndex);  // PDBName
1464
1465            // Assumption: the author chain ID and residue number for the site is consistent with the original
1466            // author chain id and residue numbers.
1467
1468            String asymId = structSiteGen.getLabelAsymId().get(rowIndex); // chain name
1469            String authId = structSiteGen.getAuthAsymId().get(rowIndex); // chain Id
1470            String auth_seq_id = structSiteGen.getAuthSeqId().get(rowIndex); // Res num
1471
1472            String insCode = structSiteGen.getPdbxAuthInsCode().get(rowIndex);
1473            if (insCode != null && insCode.equals("?")) {
1474                insCode = null;
1475            }
1476
1477            // Look for asymID = chainID and seqID = seq_ID.  Check that comp_id matches the resname.
1478            Group g = null;
1479            try {
1480                Chain chain = structure.getChain(asymId);
1481
1482                if (null != chain) {
1483                    try {
1484                        Character insChar = null;
1485                        if (null != insCode && insCode.length() > 0) {
1486                            insChar = insCode.charAt(0);
1487                        }
1488                        g = chain.getGroupByPDB(new ResidueNumber(null, Integer.parseInt(auth_seq_id), insChar));
1489                    } catch (NumberFormatException e) {
1490                        logger.warn("Could not lookup residue : {}{}", authId, auth_seq_id);
1491                    }
1492                }
1493            } catch (StructureException e) {
1494                logger.warn("Problem finding residue in site entry {} - {}",
1495                        structSiteGen.getSiteId().get(rowIndex), e.getMessage());
1496            }
1497
1498            if (g != null) {
1499                // 2. find the site_id, if not existing, create anew.
1500                Site site = null;
1501                for (Site asite : sites) {
1502                    if (site_id.equals(asite.getSiteID())) {
1503                        site = asite;
1504                    }
1505                }
1506
1507                boolean addSite = false;
1508
1509                // 3. add this residue to the site.
1510                if (site == null) {
1511                    addSite = true;
1512                    site = new Site();
1513                    site.setSiteID(site_id);
1514                }
1515
1516                List<Group> groups = site.getGroups();
1517                if (groups == null) {
1518                    groups = new ArrayList<>();
1519                }
1520
1521                // Check the self-consistency of the residue reference from auth_seq_id and chain_id
1522                if (!comp_id.equals(g.getPDBName())) {
1523                    logger.warn("comp_id doesn't match the residue at {} {} - skipping", authId, auth_seq_id);
1524                } else {
1525                    groups.add(g);
1526                    site.setGroups(groups);
1527                }
1528                if (addSite) {
1529                    sites.add(site);
1530                }
1531            }
1532        }
1533        structure.setSites(sites);
1534    }
1535
1536    private void addCharges() {
1537        ChargeAdder.addCharges(structure);
1538    }
1539
1540    /**
1541     * The method will return a new reference to a Chain with any consecutive groups
1542     * having same residue numbers removed.
1543     * This is necessary to solve the microheterogeneity issue in entries like 3u7t (see github issue #160)
1544     */
1545    private static Chain removeSeqResHeterogeneity(Chain c) {
1546        Chain trimmedChain = new ChainImpl();
1547        ResidueNumber lastResNum = null;
1548
1549        for (Group g : c.getAtomGroups()) {
1550            // note we have to deep copy this, otherwise they stay linked and would get altered in addGroup(g)
1551            ResidueNumber currentResNum = new ResidueNumber(
1552                    g.getResidueNumber().getChainName(),
1553                    g.getResidueNumber().getSeqNum(),
1554                    g.getResidueNumber().getInsCode());
1555
1556            if (lastResNum == null || !lastResNum.equals(currentResNum)) {
1557                trimmedChain.addGroup(g);
1558            } else {
1559                logger.debug("Removing seqres group because it seems to be repeated in entity_poly_seq, most likely " +
1560                        "has hetero='y': {}", g);
1561            }
1562            lastResNum = currentResNum;
1563
1564        }
1565        return trimmedChain;
1566    }
1567
1568    private void addBonds() {
1569        BondMaker maker = new BondMaker(structure, params);
1570        maker.makeBonds();
1571        maker.formBondsFromStructConn(structConn);
1572    }
1573
1574    private void alignSeqRes() {
1575        logger.debug("Parsing mode align_seqres, will align to ATOM to SEQRES sequence");
1576
1577        // fix SEQRES residue numbering for all models
1578
1579        for (int model = 0; model < structure.nrModels(); model++) {
1580            List<Chain> atomList   = structure.getModel(model);
1581
1582            for (Chain seqResChain : seqResChains){
1583
1584                // this extracts the matching atom chain from atomList
1585                Chain atomChain = SeqRes2AtomAligner.getMatchingAtomRes(seqResChain, atomList, true);
1586
1587                if (atomChain == null) {
1588                    // most likely there's no observed residues at all for the seqres chain: can't map
1589                    // e.g. 3zyb: chains with asym_id L,M,N,O,P have no observed residues
1590                    logger.info("Could not map SEQRES chain with asym_id={} to any ATOM chain. Most likely there's " +
1591                            "no observed residues in the chain.", seqResChain.getId());
1592                    continue;
1593                }
1594
1595                //map the atoms to the seqres...
1596
1597                // we need to first clone the seqres so that they stay independent for different models
1598                List<Group> seqResGroups = new ArrayList<>();
1599                for (int i = 0; i < seqResChain.getAtomGroups().size(); i++) {
1600                    seqResGroups.add((Group)seqResChain.getAtomGroups().get(i).clone());
1601                }
1602
1603                for (int seqResPos = 0 ; seqResPos < seqResGroups.size(); seqResPos++) {
1604                    Group seqresG = seqResGroups.get(seqResPos);
1605                    boolean found = false;
1606                    for (Group atomG : atomChain.getAtomGroups()) {
1607
1608                        int internalNr = getInternalNr(atomG);
1609
1610                        if (seqresG.getResidueNumber().getSeqNum() == internalNr) {
1611                            seqResGroups.set(seqResPos, atomG);
1612                            found = true;
1613                            break;
1614                        }
1615                    }
1616
1617                    if (!found)
1618                        // so far the residue number has tracked internal numbering.
1619                        // however there are no atom records, as such this can't be a PDB residue number...
1620                        seqresG.setResidueNumber(null);
1621                }
1622                atomChain.setSeqResGroups(seqResGroups);
1623            }
1624        }
1625    }
1626
1627    private int getInternalNr(Group atomG) {
1628        if (atomG.getType().equals(GroupType.AMINOACID)) {
1629            AminoAcidImpl aa = (AminoAcidImpl) atomG;
1630            return (int) aa.getId();
1631        } else if (atomG.getType().equals(GroupType.NUCLEOTIDE)) {
1632            NucleotideImpl nu = (NucleotideImpl) atomG;
1633            return (int) nu.getId();
1634        } else {
1635            HetatomImpl he = (HetatomImpl) atomG;
1636            return (int) he.getId();
1637        }
1638    }
1639
1640    private void linkEntities() {
1641        for (List<Chain> allModel : allModels) {
1642            for (Chain chain : allModel) {
1643                //logger.info("linking entities for " + chain.getId() + " "  + chain.getName());
1644                String entityId = asymId2entityId.get(chain.getId());
1645
1646                if (entityId == null) {
1647                    // this can happen for instance if the cif file didn't have _struct_asym category at all
1648                    // and thus we have no asymId2entityId mapping at all
1649                    logger.info("No entity id could be found for chain {}", chain.getId());
1650                    continue;
1651                }
1652
1653                int eId = Integer.parseInt(entityId);
1654
1655                // Entities are not added for non-polymeric entities, if a chain is non-polymeric its entity won't be found.
1656                // TODO: add all entities and unique compounds and add methods to directly get polymer or non-polymer
1657                // asyms (chains).  Either create a unique StructureImpl or modify existing for a better representation of the
1658                // mmCIF internal data structures but is compatible with Structure interface.
1659                // Some examples of PDB entries with this kind of problem:
1660                //   - 2uub: asym_id X, chainName Z, entity_id 24: fully non-polymeric but still with its own chainName
1661                //   - 3o6j: asym_id K, chainName Z, entity_id 6 : a single water molecule
1662                //   - 1dz9: asym_id K, chainName K, entity_id 6 : a potassium ion alone
1663
1664                EntityInfo entityInfo = structure.getEntityById(eId);
1665                if (entityInfo == null) {
1666                    // Supports the case where the only chain members were from non-polymeric entity that is missing.
1667                    // Solved by creating a new Compound(entity) to which this chain will belong.
1668                    logger.info("Could not find an Entity for entity_id {}, for chain id {}, creating a new Entity.",
1669                            eId, chain.getId());
1670                    entityInfo = new EntityInfo();
1671                    entityInfo.setMolId(eId);
1672                    entityInfo.addChain(chain);
1673                    if (chain.isWaterOnly()) {
1674                        entityInfo.setType(EntityType.WATER);
1675                    } else {
1676                        entityInfo.setType(EntityType.NONPOLYMER);
1677                    }
1678                    chain.setEntityInfo(entityInfo);
1679                    structure.addEntityInfo(entityInfo);
1680                } else {
1681                    logger.debug("Adding chain with chain id {} (auth id {}) to Entity with entity_id {}",
1682                            chain.getId(), chain.getName(), eId);
1683                    entityInfo.addChain(chain);
1684                    chain.setEntityInfo(entityInfo);
1685                }
1686
1687            }
1688
1689        }
1690
1691        // if no entity information was present in file we then go and find the entities heuristically with EntityFinder
1692        List<EntityInfo> entityInfos = structure.getEntityInfos();
1693        if (entityInfos == null || entityInfos.isEmpty()) {
1694            List<List<Chain>> polyModels = new ArrayList<>();
1695            List<List<Chain>> nonPolyModels = new ArrayList<>();
1696            List<List<Chain>> waterModels = new ArrayList<>();
1697
1698            for (List<Chain> model : allModels) {
1699                List<Chain> polyChains = new ArrayList<>();
1700                List<Chain> nonPolyChains = new ArrayList<>();
1701                List<Chain> waterChains = new ArrayList<>();
1702
1703                polyModels.add(polyChains);
1704                nonPolyModels.add(nonPolyChains);
1705                waterModels.add(waterChains);
1706
1707                for (Chain chain : model) {
1708                    // we only have entities for polymeric chains, all others are ignored for assigning entities
1709                    if (chain.isWaterOnly()) {
1710                        waterChains.add(chain);
1711                    } else if (chain.isPureNonPolymer()) {
1712                        nonPolyChains.add(chain);
1713                    } else {
1714                        polyChains.add(chain);
1715                    }
1716                }
1717            }
1718
1719            entityInfos = EntityFinder.findPolyEntities(polyModels);
1720            EntityFinder.createPurelyNonPolyEntities(nonPolyModels, waterModels, entityInfos);
1721
1722            structure.setEntityInfos(entityInfos);
1723        }
1724
1725        // final sanity check: it can happen that from the annotated entities some are not linked to any chains
1726        // e.g. 3s26: a sugar entity does not have any chains associated to it (it seems to be happening with many sugar compounds)
1727        // we simply log it, this can sign some other problems if the entities are used down the line
1728        for (EntityInfo e : entityInfos) {
1729            if (e.getChains().isEmpty()) {
1730                logger.info("Entity {} '{}' has no chains associated to it",
1731                        e.getMolId() < 0 ? "with no entity id" : e.getMolId(), e.getDescription());
1732            }
1733        }
1734    }
1735
1736    private void initMaps() {
1737        if (structAsym == null || !structAsym.isDefined() || structAsym.getRowCount() == 0) {
1738            logger.info("No _struct_asym category found in file. No asym id to entity_id mapping will be available");
1739            return;
1740        }
1741
1742        Map<String, List<String>> entityId2asymId = new HashMap<>();
1743        for (int rowIndex = 0; rowIndex < structAsym.getRowCount(); rowIndex++) {
1744            String id = structAsym.getId().get(rowIndex);
1745            String entityId = structAsym.getEntityId().get(rowIndex);
1746
1747            logger.debug("Entity {} matches asym_id: {}", entityId, id);
1748
1749            asymId2entityId.put(id, entityId);
1750
1751            if (entityId2asymId.containsKey(entityId)) {
1752                List<String> asymIds = entityId2asymId.get(entityId);
1753                asymIds.add(id);
1754            } else {
1755                List<String> asymIds = new ArrayList<>();
1756                asymIds.add(id);
1757                entityId2asymId.put(entityId, asymIds);
1758            }
1759        }
1760
1761        if (entityPoly == null || !entityPoly.isDefined() || entityPoly.getRowCount() == 0) {
1762            logger.info("No _entity_poly category found in file. No asym id to author id mapping will be available " +
1763                    "for header only parsing");
1764            return;
1765        }
1766
1767        for (int rowIndex = 0; rowIndex < entityPoly.getRowCount(); rowIndex++) {
1768            if (!entityPoly.getPdbxStrandId().isDefined()) {
1769                logger.info("_entity_poly.pdbx_strand_id is null for entity {}. Won't be able to map asym ids to " +
1770                        "author ids for this entity.", entityPoly.getEntityId().get(rowIndex));
1771                break;
1772            }
1773
1774            String[] chainNames = entityPoly.getPdbxStrandId().get(rowIndex).split(",");
1775            List<String> asymIds = entityId2asymId.get(entityPoly.getEntityId().get(rowIndex));
1776            if (chainNames.length != asymIds.size()) {
1777                logger.warn("The list of asym ids (from _struct_asym) and the list of author ids (from _entity_poly) " +
1778                        "for entity {} have different lengths! Can't provide a mapping from asym ids to author chain " +
1779                        "ids", entityPoly.getEntityId().get(rowIndex));
1780                break;
1781            }
1782
1783            for (int i = 0; i < chainNames.length; i++) {
1784                asymId2authorId.put(asymIds.get(i), chainNames[i]);
1785            }
1786        }
1787    }
1788
1789    @Override
1790    public Structure getContainer() {
1791        return structure;
1792    }
1793}