001package org.biojava.nbio.structure.io.cif;
002
003import java.time.LocalDate;
004import java.time.ZoneId;
005import java.time.format.DateTimeFormatter;
006import java.time.format.DateTimeFormatterBuilder;
007import java.util.ArrayList;
008import java.util.Date;
009import java.util.HashMap;
010import java.util.LinkedHashMap;
011import java.util.List;
012import java.util.Locale;
013import java.util.Map;
014import java.util.NoSuchElementException;
015import java.util.Optional;
016import java.util.OptionalInt;
017import java.util.stream.IntStream;
018
019import javax.vecmath.Matrix4d;
020
021import org.biojava.nbio.structure.AminoAcid;
022import org.biojava.nbio.structure.AminoAcidImpl;
023import org.biojava.nbio.structure.Atom;
024import org.biojava.nbio.structure.AtomImpl;
025import org.biojava.nbio.structure.Chain;
026import org.biojava.nbio.structure.ChainImpl;
027import org.biojava.nbio.structure.DBRef;
028import org.biojava.nbio.structure.Element;
029import org.biojava.nbio.structure.EntityInfo;
030import org.biojava.nbio.structure.EntityType;
031import org.biojava.nbio.structure.Group;
032import org.biojava.nbio.structure.GroupType;
033import org.biojava.nbio.structure.HetatomImpl;
034import org.biojava.nbio.structure.NucleotideImpl;
035import org.biojava.nbio.structure.PDBCrystallographicInfo;
036import org.biojava.nbio.structure.PDBHeader;
037import org.biojava.nbio.structure.PdbId;
038import org.biojava.nbio.structure.ResidueNumber;
039import org.biojava.nbio.structure.SeqMisMatch;
040import org.biojava.nbio.structure.SeqMisMatchImpl;
041import org.biojava.nbio.structure.Site;
042import org.biojava.nbio.structure.Structure;
043import org.biojava.nbio.structure.StructureException;
044import org.biojava.nbio.structure.StructureImpl;
045import org.biojava.nbio.structure.StructureTools;
046import org.biojava.nbio.structure.chem.ChemCompGroupFactory;
047import org.biojava.nbio.structure.io.BondMaker;
048import org.biojava.nbio.structure.io.ChargeAdder;
049import org.biojava.nbio.structure.io.EntityFinder;
050import org.biojava.nbio.structure.io.FileParsingParameters;
051import org.biojava.nbio.structure.io.SeqRes2AtomAligner;
052import org.biojava.nbio.structure.quaternary.BioAssemblyInfo;
053import org.biojava.nbio.structure.quaternary.BiologicalAssemblyBuilder;
054import org.biojava.nbio.structure.quaternary.BiologicalAssemblyTransformation;
055import org.biojava.nbio.structure.xtal.CrystalCell;
056import org.biojava.nbio.structure.xtal.SpaceGroup;
057import org.biojava.nbio.structure.xtal.SymoplibParser;
058import org.rcsb.cif.model.FloatColumn;
059import org.rcsb.cif.model.IntColumn;
060import org.rcsb.cif.model.StrColumn;
061import org.rcsb.cif.model.ValueKind;
062import org.rcsb.cif.schema.mm.AtomSite;
063import org.rcsb.cif.schema.mm.AtomSites;
064import org.rcsb.cif.schema.mm.AuditAuthor;
065import org.rcsb.cif.schema.mm.Cell;
066import org.rcsb.cif.schema.mm.ChemComp;
067import org.rcsb.cif.schema.mm.ChemCompBond;
068import org.rcsb.cif.schema.mm.DatabasePDBRemark;
069import org.rcsb.cif.schema.mm.DatabasePDBRev;
070import org.rcsb.cif.schema.mm.DatabasePDBRevRecord;
071import org.rcsb.cif.schema.mm.Em3dReconstruction;
072import org.rcsb.cif.schema.mm.Entity;
073import org.rcsb.cif.schema.mm.EntityPoly;
074import org.rcsb.cif.schema.mm.EntityPolySeq;
075import org.rcsb.cif.schema.mm.EntitySrcGen;
076import org.rcsb.cif.schema.mm.EntitySrcNat;
077import org.rcsb.cif.schema.mm.Exptl;
078import org.rcsb.cif.schema.mm.PdbxAuditRevisionHistory;
079import org.rcsb.cif.schema.mm.PdbxChemCompIdentifier;
080import org.rcsb.cif.schema.mm.PdbxDatabaseStatus;
081import org.rcsb.cif.schema.mm.PdbxEntityBranchDescriptor;
082import org.rcsb.cif.schema.mm.PdbxEntitySrcSyn;
083import org.rcsb.cif.schema.mm.PdbxMolecule;
084import org.rcsb.cif.schema.mm.PdbxMoleculeFeatures;
085import org.rcsb.cif.schema.mm.PdbxNonpolyScheme;
086import org.rcsb.cif.schema.mm.PdbxReferenceEntityLink;
087import org.rcsb.cif.schema.mm.PdbxReferenceEntityList;
088import org.rcsb.cif.schema.mm.PdbxReferenceEntityPolyLink;
089import org.rcsb.cif.schema.mm.PdbxStructAssembly;
090import org.rcsb.cif.schema.mm.PdbxStructAssemblyGen;
091import org.rcsb.cif.schema.mm.PdbxStructModResidue;
092import org.rcsb.cif.schema.mm.PdbxStructOperList;
093import org.rcsb.cif.schema.mm.Refine;
094import org.rcsb.cif.schema.mm.Struct;
095import org.rcsb.cif.schema.mm.StructAsym;
096import org.rcsb.cif.schema.mm.StructConf;
097import org.rcsb.cif.schema.mm.StructConn;
098import org.rcsb.cif.schema.mm.StructConnType;
099import org.rcsb.cif.schema.mm.StructKeywords;
100import org.rcsb.cif.schema.mm.StructNcsOper;
101import org.rcsb.cif.schema.mm.StructRef;
102import org.rcsb.cif.schema.mm.StructRefSeq;
103import org.rcsb.cif.schema.mm.StructRefSeqDif;
104import org.rcsb.cif.schema.mm.StructSheetRange;
105import org.rcsb.cif.schema.mm.StructSite;
106import org.rcsb.cif.schema.mm.StructSiteGen;
107import org.rcsb.cif.schema.mm.Symmetry;
108import org.slf4j.Logger;
109import org.slf4j.LoggerFactory;
110
111/**
112 * An implementation of a CifFileConsumer for BioJava. Will process the information provided by a CifFile instance and
113 * use it to build up a {@link Structure} object.
114 * @author Sebastian Bittrich
115 * @since 6.0.0
116 */
117public class CifStructureConsumerImpl implements CifStructureConsumer {
118    private static final Logger logger = LoggerFactory.getLogger(CifStructureConsumerImpl.class);
119    private static final DateTimeFormatter DATE_FORMAT = new DateTimeFormatterBuilder()
120            .parseCaseInsensitive()
121            .appendPattern("yyyy-MM-dd")
122            .toFormatter(Locale.US);
123
124    private Structure structure;
125    private Chain currentChain;
126    private Group currentGroup;
127    private List<List<Chain>> allModels;
128    private List<Chain> currentModel;
129    private PDBHeader pdbHeader;
130    private String currentNmrModelNumber;
131        private Em3dReconstruction em3dReconstruction;
132    private List<Chain> entityChains;
133
134    private Entity entity;
135    private EntityPoly entityPoly;
136    private EntitySrcGen entitySrcGen;
137    private EntitySrcNat entitySrcNat;
138    private PdbxEntitySrcSyn entitySrcSyn;
139    private List<Chain> seqResChains;
140    private PdbxStructAssembly structAssembly;
141    private PdbxStructAssemblyGen structAssemblyGen;
142    private StructAsym structAsym;
143    private StructConn structConn;
144    private StructNcsOper structNcsOper;
145    private PdbxStructOperList structOpers;
146    private StructRef structRef;
147    private StructRefSeqDif structRefSeqDif;
148    private StructSiteGen structSiteGen;
149
150    private Map<String, String> asymId2entityId;
151    private Map<String, String> asymId2authorId;
152    private Matrix4d parsedScaleMatrix;
153
154    private final FileParsingParameters params;
155
/**
 * Creates a consumer that is driven by the given parsing parameters.
 *
 * @param params the parsing options consulted while categories are consumed (for instance the header-only
 *               and CA-only switches read when atom sites are processed)
 */
public CifStructureConsumerImpl(FileParsingParameters params) {
    this.params = params;
}
159
160    @Override
161    public void prepare() {
162        this.structure = new StructureImpl();
163        this.pdbHeader = new PDBHeader();
164        structure.setPDBHeader(pdbHeader);
165
166        this.allModels = new ArrayList<>();
167        this.currentModel = new ArrayList<>();
168
169        this.seqResChains  = new ArrayList<>();
170        this.asymId2entityId = new HashMap<>();
171        this.asymId2authorId = new HashMap<>();
172
173        this.entityChains = new ArrayList<>();
174    }
175
176    @Override
177    public void consumeAtomSite(AtomSite atomSite) {
178        if (params.isHeaderOnly()) {
179            return;
180        }
181
182        StrColumn labelAsymId = atomSite.getLabelAsymId();
183        StrColumn authAsymId = atomSite.getAuthAsymId();
184
185        StrColumn groupPDB = atomSite.getGroupPDB();
186        IntColumn authSeqId = atomSite.getAuthSeqId();
187
188        StrColumn labelCompId = atomSite.getLabelCompId();
189
190        IntColumn id = atomSite.getId();
191        StrColumn labelAtomId = atomSite.getLabelAtomId();
192
193        FloatColumn cartnX = atomSite.getCartnX();
194        FloatColumn cartnY = atomSite.getCartnY();
195        FloatColumn cartnZ = atomSite.getCartnZ();
196
197        FloatColumn occupancy = atomSite.getOccupancy();
198        FloatColumn bIsoOrEquiv = atomSite.getBIsoOrEquiv();
199
200        StrColumn labelAltId = atomSite.getLabelAltId();
201        StrColumn typeSymbol = atomSite.getTypeSymbol();
202
203        StrColumn pdbxPDBInsCode = atomSite.getPdbxPDBInsCode();
204        IntColumn labelSeqId = atomSite.getLabelSeqId();
205        IntColumn pdbx_pdb_model_num = atomSite.getPdbxPDBModelNum();
206
207        for (int atomIndex = 0; atomIndex < atomSite.getRowCount(); atomIndex++) {
208            boolean startOfNewChain = false;
209            Character oneLetterCode = StructureTools.get1LetterCodeAmino(labelCompId.get(atomIndex));
210
211            boolean isHetAtmInFile = false;
212            if (!"ATOM".equals(groupPDB.get(atomIndex))) {
213                if (oneLetterCode != null && oneLetterCode.equals(StructureTools.UNKNOWN_GROUP_LABEL)) {
214                    oneLetterCode = null;
215                }
216
217                isHetAtmInFile = true;
218            }
219
220            String insCodeString = pdbxPDBInsCode.isDefined()? pdbxPDBInsCode.get(atomIndex) : null;
221
222            Character insCode = null;
223            if (insCodeString != null && !insCodeString.isEmpty() && !"?".equals(insCodeString)) {
224                insCode = insCodeString.charAt(0);
225            }
226
227            // non polymer chains (ligands and small molecules) will have a label_seq_id set to '.'
228            long seqId = labelSeqId.get(atomIndex);
229
230            String nmrModelNumber = pdbx_pdb_model_num.getStringData(atomIndex);
231
232            if (currentNmrModelNumber == null) {
233                currentNmrModelNumber = nmrModelNumber;
234            }
235            if (!currentNmrModelNumber.equals(nmrModelNumber)) {
236                currentNmrModelNumber = nmrModelNumber;
237
238                if (currentChain != null) {
239                    currentChain.addGroup(currentGroup);
240                    currentGroup.trimToSize();
241                }
242
243                allModels.add(currentModel);
244                currentModel = new ArrayList<>();
245                currentChain = null;
246                currentGroup = null;
247            }
248
249            String asymId = labelAsymId.get(atomIndex);
250            String authId = authAsymId.isDefined()? authAsymId.get(atomIndex) : asymId;
251
252            if (currentChain == null) {
253                currentChain = new ChainImpl();
254                currentChain.setName(authId);
255                currentChain.setId(asymId);
256                currentModel.add(currentChain);
257                startOfNewChain = true;
258            }
259
260            if (!asymId.equals(currentChain.getId())) {
261                startOfNewChain = true;
262
263                currentChain.addGroup(currentGroup);
264
265                Optional<Chain> testChain = currentModel.stream()
266                        .filter(chain -> chain.getId().equals(asymId))
267                        .findFirst();
268
269                if (testChain.isPresent()) {
270                    currentChain = testChain.get();
271                } else {
272                    currentChain = new ChainImpl();
273                    currentChain.setName(authId);
274                    currentChain.setId(asymId);
275                }
276
277                if (!currentModel.contains(currentChain)) {
278                    currentModel.add(currentChain);
279                }
280            }
281
282            int authSeqIdInt = authSeqId.isDefined()? authSeqId.get(atomIndex) : (int)seqId;
283
284            ResidueNumber residueNumber = new ResidueNumber(authId, authSeqIdInt, insCode);
285
286            String recordName = groupPDB.get(atomIndex);
287            String compId = labelCompId.get(atomIndex);
288            if (currentGroup == null) {
289                currentGroup = createGroup(recordName, oneLetterCode, compId, seqId);
290                currentGroup.setResidueNumber(residueNumber);
291                currentGroup.setPDBName(compId);
292                currentGroup.setHetAtomInFile(isHetAtmInFile);
293            }
294
295            Group altGroup = null;
296            String altLocation = labelAltId.isDefined()? labelAltId.get(atomIndex) : null;
297
298            if (startOfNewChain) {
299                currentGroup = createGroup(recordName, oneLetterCode, compId, seqId);
300                currentGroup.setResidueNumber(residueNumber);
301                currentGroup.setPDBName(compId);
302                currentGroup.setHetAtomInFile(isHetAtmInFile);
303            } else {
304                if (!residueNumber.equals(currentGroup.getResidueNumber())) {
305                    currentChain.addGroup(currentGroup);
306                    currentGroup.trimToSize();
307                    currentGroup = createGroup(recordName, oneLetterCode, compId, seqId);
308                    currentGroup.setPDBName(compId);
309                    currentGroup.setResidueNumber(residueNumber);
310                    currentGroup.setHetAtomInFile(isHetAtmInFile);
311                } else {
312                    if (altLocation != null && !altLocation.isEmpty() && !".".equals(altLocation)) {
313                        altGroup = getAltLocGroup(recordName, altLocation.charAt(0), oneLetterCode, compId, seqId);
314                        if (altGroup.getChain() == null) {
315                            altGroup.setChain(currentChain);
316                        }
317                    }
318                }
319            }
320
321            if (params.isParseCAOnly()) {
322                if (!labelAtomId.get(atomIndex).equals(StructureTools.CA_ATOM_NAME) && "C".equals(typeSymbol.get(atomIndex))) {
323                    continue;
324                }
325            }
326
327            Atom atom = new AtomImpl();
328
329            atom.setPDBserial(id.get(atomIndex));
330            atom.setName(labelAtomId.get(atomIndex));
331
332            atom.setX(cartnX.get(atomIndex));
333            atom.setY(cartnY.get(atomIndex));
334            atom.setZ(cartnZ.get(atomIndex));
335
336            atom.setOccupancy((float) occupancy.get(atomIndex));
337            atom.setTempFactor((float) bIsoOrEquiv.get(atomIndex));
338
339            if (altLocation == null || altLocation.isEmpty() || ".".equals(altLocation)) {
340                atom.setAltLoc(' ');
341            } else {
342                atom.setAltLoc(altLocation.charAt(0));
343            }
344
345            String ts = typeSymbol.get(atomIndex);
346            try {
347                Element element = Element.valueOfIgnoreCase(ts);
348                atom.setElement(element);
349            }  catch (IllegalArgumentException e) {
350                logger.info("Element {} was not recognised as a BioJava-known element, the element will be " +
351                        "represented as the generic element {}", ts, Element.R.name());
352                atom.setElement(Element.R);
353            }
354
355            if (altGroup != null) {
356                altGroup.addAtom(atom);
357            } else {
358                currentGroup.addAtom(atom);
359            }
360
361            String atomName = atom.getName();
362            if (!currentGroup.hasAtom(atomName)) {
363                if (currentGroup.getPDBName().equals(atom.getGroup().getPDBName())) {
364                    if (!StructureTools.hasNonDeuteratedEquiv(atom, currentGroup)) {
365                        currentGroup.addAtom(atom);
366                    }
367                }
368            }
369        }
370    }
371
372    private Group getAltLocGroup(String recordName, Character altLoc, Character oneLetterCode, String threeLetterCode,
373                                 long seqId) {
374        List<Atom> atoms = currentGroup.getAtoms();
375        if (atoms.size() > 0) {
376            if (atoms.get(0).getAltLoc().equals(altLoc)) {
377                return currentGroup;
378            }
379        }
380
381        List<Group> altLocs = currentGroup.getAltLocs();
382        for (Group altLocGroup : altLocs) {
383            atoms = altLocGroup.getAtoms();
384            if (atoms.size() > 0) {
385                for (Atom a1 : atoms) {
386                    if (a1.getAltLoc().equals(altLoc)) {
387                        return altLocGroup;
388                    }
389                }
390            }
391        }
392
393        if (threeLetterCode.equals(currentGroup.getPDBName())) {
394            if (currentGroup.getAtoms().isEmpty()) {
395                return currentGroup;
396            }
397
398            Group altLocGroup = (Group) currentGroup.clone();
399            altLocGroup.setAtoms(new ArrayList<>());
400            altLocGroup.getAltLocs().clear();
401            currentGroup.addAltLoc(altLocGroup);
402            return altLocGroup;
403        }
404
405        Group altLocGroup = createGroup(recordName, oneLetterCode, threeLetterCode, seqId);
406        altLocGroup.setPDBName(threeLetterCode);
407        altLocGroup.setResidueNumber(currentGroup.getResidueNumber());
408        currentGroup.addAltLoc(altLocGroup);
409        return altLocGroup;
410    }
411
412    private Group createGroup(String record, Character oneLetterCode, String threeLetterCode, long seqId) {
413        Group group = ChemCompGroupFactory.getGroupFromChemCompDictionary(threeLetterCode);
414        if (group != null && !group.getChemComp().isEmpty()) {
415            if (group instanceof AminoAcidImpl) {
416                AminoAcidImpl aminoAcid = (AminoAcidImpl) group;
417                aminoAcid.setId(seqId);
418            } else if (group instanceof NucleotideImpl) {
419                NucleotideImpl nucleotide = (NucleotideImpl) group;
420                nucleotide.setId(seqId);
421            } else if (group instanceof HetatomImpl) {
422                HetatomImpl hetatom = (HetatomImpl) group;
423                hetatom.setId(seqId);
424            }
425            return group;
426        }
427
428        if ("ATOM".equals(record)) {
429            if (StructureTools.isNucleotide(threeLetterCode)) {
430                NucleotideImpl nucleotide = new NucleotideImpl();
431                group = nucleotide;
432                nucleotide.setId(seqId);
433            } else if (oneLetterCode == null || oneLetterCode == StructureTools.UNKNOWN_GROUP_LABEL) {
434                HetatomImpl hetatom = new HetatomImpl();
435                group = hetatom;
436                hetatom.setId(seqId);
437            } else {
438                AminoAcidImpl aminoAcid = new AminoAcidImpl();
439                group = aminoAcid;
440                aminoAcid.setAminoType(oneLetterCode);
441                aminoAcid.setId(seqId);
442            }
443        } else {
444            if (StructureTools.isNucleotide(threeLetterCode)) {
445                NucleotideImpl nucleotide = new NucleotideImpl();
446                group = nucleotide;
447                nucleotide.setId(seqId);
448            } else if (oneLetterCode != null) {
449                AminoAcidImpl aminoAcid = new AminoAcidImpl();
450                group = aminoAcid;
451                aminoAcid.setAminoType(oneLetterCode);
452                aminoAcid.setId(seqId);
453            } else {
454                HetatomImpl hetatom = new HetatomImpl();
455                hetatom.setId(seqId);
456                group = hetatom;
457            }
458        }
459        return group;
460    }
461
462    @Override
463    public void consumeAtomSites(AtomSites atomSites) {
464        // no atom sites present
465        if (!atomSites.isDefined() || atomSites.getRowCount() == 0) {
466            return;
467        }
468
469        try {
470            parsedScaleMatrix = new Matrix4d(
471                    atomSites.getFractTransfMatrix11().get(0),
472                    atomSites.getFractTransfMatrix12().get(0),
473                    atomSites.getFractTransfMatrix13().get(0),
474                    atomSites.getFractTransfVector1().get(0),
475
476                    atomSites.getFractTransfMatrix21().get(0),
477                    atomSites.getFractTransfMatrix22().get(0),
478                    atomSites.getFractTransfMatrix23().get(0),
479                    atomSites.getFractTransfVector2().get(0),
480
481                    atomSites.getFractTransfMatrix31().get(0),
482                    atomSites.getFractTransfMatrix32().get(0),
483                    atomSites.getFractTransfMatrix33().get(0),
484                    atomSites.getFractTransfVector3().get(0),
485
486                    0,
487                    0,
488                    0,
489                    1
490            );
491        } catch (NumberFormatException e) {
492            logger.warn("Some values in _atom_sites.fract_transf_matrix or _atom_sites.fract_transf_vector could not " +
493                    "be parsed as numbers. Can't check whether coordinate frame convention is correct! Error: {}",
494                    e.getMessage());
495            structure.getPDBHeader().getCrystallographicInfo().setNonStandardCoordFrameConvention(false);
496        }
497    }
498
499    @Override
500    public void consumeAuditAuthor(AuditAuthor auditAuthor) {
501        for (int rowIndex = 0; rowIndex < auditAuthor.getRowCount(); rowIndex++) {
502            String name = auditAuthor.getName().get(rowIndex);
503
504            StringBuilder last = new StringBuilder();
505            StringBuilder initials = new StringBuilder();
506            boolean afterComma = false;
507            for (char c : name.toCharArray()) {
508                if (c == ' ') {
509                    continue;
510                }
511                if (c == ',') {
512                    afterComma = true;
513                    continue;
514                }
515
516                if (afterComma) {
517                    initials.append(c);
518                } else {
519                    last.append(c);
520                }
521            }
522
523            StringBuilder newaa = new StringBuilder();
524            newaa.append(initials);
525            newaa.append(last);
526
527            String auth = pdbHeader.getAuthors();
528            if (auth == null) {
529                pdbHeader.setAuthors(newaa.toString());
530            } else {
531                auth += "," + newaa.toString();
532                pdbHeader.setAuthors(auth);
533            }
534        }
535    }
536
537    @Override
538    public void consumeCell(Cell cell) {
539        if (!cell.isDefined() || cell.getRowCount() == 0) {
540            return;
541        }
542
543        try {
544            float a = (float) cell.getLengthA().get(0);
545            float b = (float) cell.getLengthB().get(0);
546            float c = (float) cell.getLengthC().get(0);
547            float alpha = (float) cell.getAngleAlpha().get(0);
548            float beta = (float) cell.getAngleBeta().get(0);
549            float gamma = (float) cell.getAngleGamma().get(0);
550
551            CrystalCell crystalCell = new CrystalCell();
552            crystalCell.setA(a);
553            crystalCell.setB(b);
554            crystalCell.setC(c);
555            crystalCell.setAlpha(alpha);
556            crystalCell.setBeta(beta);
557            crystalCell.setGamma(gamma);
558
559            if (!crystalCell.isCellReasonable()) {
560                // If the entry describes a structure determined by a technique other than X-ray crystallography,
561                // cell is (sometimes!) a = b = c = 1.0, alpha = beta = gamma = 90 degrees
562                // if so we don't add and CrystalCell will be null
563                logger.debug("The crystal cell read from file does not have reasonable dimensions (at least one dimension is below {}), discarding it.", CrystalCell.MIN_VALID_CELL_SIZE);
564                return;
565            }
566
567            structure.getPDBHeader()
568                    .getCrystallographicInfo()
569                    .setCrystalCell(crystalCell);
570
571        } catch (NumberFormatException e){
572            structure.getPDBHeader()
573                    .getCrystallographicInfo()
574                    .setCrystalCell(null);
575            logger.info("could not parse some cell parameters ({}), ignoring _cell", e.getMessage());
576        }
577    }
578
579    @Override
580    public void consumeChemComp(ChemComp chemComp) {
581        // TODO not impled in ref
582    }
583
584    @Override
585    public void consumeChemCompBond(ChemCompBond chemCompBond) {
586        // TODO not impled in ref
587    }
588
589    @Override
590    public void consumeDatabasePDBRemark(DatabasePDBRemark databasePDBremark) {
591        for (int rowIndex = 0; rowIndex < databasePDBremark.getRowCount(); rowIndex++) {
592            int id = databasePDBremark.getId().get(rowIndex);
593            if (id == 2) {
594                String line = databasePDBremark.getText().get(rowIndex);
595                int i = line.indexOf("ANGSTROM");
596
597                if (i > 5) {
598                    // line contains ANGSTROM info...
599                    String resolution = line.substring(i - 5, i).trim();
600                    // convert string to float
601                    try {
602                        float res = Float.parseFloat(resolution);
603                        pdbHeader.setResolution(res);
604                    } catch (NumberFormatException e) {
605                        logger.info("could not parse resolution from line and ignoring it {}", line);
606                        return;
607                    }
608                }
609            }
610        }
611    }
612
613    private Date convert(LocalDate localDate) {
614        return Date.from(localDate.atStartOfDay().atZone(ZoneId.systemDefault()).toInstant());
615    }
616
617    @Override
618    public void consumeDatabasePDBRev(DatabasePDBRev databasePDBrev) {
619        logger.debug("got a database revision:{}", databasePDBrev);
620
621        Date modDate = null;
622        for (int rowIndex = 0; rowIndex < databasePDBrev.getRowCount(); rowIndex++) {
623            if (databasePDBrev.getNum().get(rowIndex) == 1) {
624                String dateOriginal = databasePDBrev.getDateOriginal().get(rowIndex);
625                pdbHeader.setDepDate(convert(LocalDate.parse(dateOriginal, DATE_FORMAT)));
626
627                String date = databasePDBrev.getDate().get(rowIndex);
628                final Date relDate = convert(LocalDate.parse(date, DATE_FORMAT));
629                pdbHeader.setRelDate(relDate);
630                modDate = relDate;
631            } else {
632                String dbrev = databasePDBrev.getDate().get(rowIndex);
633                modDate = convert(LocalDate.parse(dbrev, DATE_FORMAT));
634            }
635            pdbHeader.setModDate(modDate);
636        }
637    }
638
639    @Override
640    public void consumeDatabasePDBRevRecord(DatabasePDBRevRecord databasePDBrevRecord) {
641        List<org.biojava.nbio.structure.DatabasePDBRevRecord> revRecords = pdbHeader.getRevisionRecords();
642        if (revRecords == null) {
643            revRecords = new ArrayList<>();
644            pdbHeader.setRevisionRecords(revRecords);
645        }
646
647        for (int i = 0; i < databasePDBrevRecord.getRowCount(); i++) {
648            revRecords.add(new org.biojava.nbio.structure.DatabasePDBRevRecord(databasePDBrevRecord, i));
649        }
650    }
651
652    @Override
653    public void consumeEm3dReconstruction(Em3dReconstruction em3dReconstruction) {
654        this.em3dReconstruction = em3dReconstruction;
655
656        for (int rowIndex = 0; rowIndex < em3dReconstruction.getRowCount(); rowIndex++) { //can it have more than 1 value?
657                final FloatColumn resolution = em3dReconstruction.getResolution();
658                        if (ValueKind.PRESENT.equals(resolution.getValueKind(rowIndex)))
659                        pdbHeader.setResolution((float) resolution.get(rowIndex));
660        }
661        //TODO other fields (maybe RFree)?
662    }
663
664    @Override
665    public void consumeEntity(Entity entity) {
666        this.entity = entity;
667    }
668
669    @Override
670    public void consumeEntityPoly(EntityPoly entityPoly) {
671        this.entityPoly = entityPoly;
672    }
673
674    @Override
675    public void consumeEntitySrcGen(EntitySrcGen entitySrcGen) {
676        this.entitySrcGen = entitySrcGen;
677    }
678
679    @Override
680    public void consumeEntitySrcNat(EntitySrcNat entitySrcNat) {
681        this.entitySrcNat = entitySrcNat;
682    }
683
684    @Override
685    public void consumeEntitySrcSyn(PdbxEntitySrcSyn entitySrcSyn) {
686        this.entitySrcSyn = entitySrcSyn;
687    }
688
689    @Override
690    public void consumeEntityPolySeq(EntityPolySeq entityPolySeq) {
691        for (int rowIndex = 0; rowIndex < entityPolySeq.getRowCount(); rowIndex++) {
692            Chain entityChain = getEntityChain(entityPolySeq.getEntityId().get(rowIndex));
693
694            // first we check through the chemcomp provider, if it fails we do some heuristics to guess the type of group
695            // TODO some of this code is analogous to getNewGroup() and we should try to unify them - JD 2016-03-08
696
697            Group g = ChemCompGroupFactory.getGroupFromChemCompDictionary(entityPolySeq.getMonId().get(rowIndex));
698            //int seqId = Integer.parseInt(entityPolySeq.getNum());
699            if (g != null && !g.getChemComp().isEmpty()) {
700                if (g instanceof AminoAcidImpl) {
701                    AminoAcidImpl aa = (AminoAcidImpl) g;
702                    aa.setRecordType(AminoAcid.SEQRESRECORD);
703                }
704            } else {
705                if (entityPolySeq.getMonId().get(rowIndex).length() == 3 &&
706                        StructureTools.get1LetterCodeAmino(entityPolySeq.getMonId().get(rowIndex)) != null) {
707                    AminoAcidImpl a = new AminoAcidImpl();
708                    a.setRecordType(AminoAcid.SEQRESRECORD);
709                    Character code1 = StructureTools.get1LetterCodeAmino(entityPolySeq.getMonId().get(rowIndex));
710                    a.setAminoType(code1);
711                    g = a;
712
713                } else if (StructureTools.isNucleotide(entityPolySeq.getMonId().get(rowIndex))) {
714                    // the group is actually a nucleotide group...
715                    g = new NucleotideImpl();
716                } else {
717                    logger.debug("Residue {} {} is not a standard aminoacid or nucleotide, will create a het group for it", entityPolySeq.getNum().get(rowIndex), entityPolySeq.getMonId().get(rowIndex));
718                    g = new HetatomImpl();
719                }
720            }
721            // at this stage we don't know about author residue numbers (insertion codes)
722            // we abuse now the ResidueNumber field setting the internal residue numbers (label_seq_id, strictly
723            // sequential and follow the seqres sequence 1 to n)
724            // later the actual ResidueNumbers (author residue numbers) have to be corrected in alignSeqRes()
725            g.setResidueNumber(ResidueNumber.fromString(entityPolySeq.getNum().getStringData(rowIndex)));
726            g.setPDBName(entityPolySeq.getMonId().get(rowIndex));
727            entityChain.addGroup(g);
728        }
729    }
730
731    private Chain getEntityChain(String entityId) {
732        for (Chain chain : entityChains) {
733            if (chain.getId().equals(entityId)) {
734                return chain;
735            }
736        }
737
738        // does not exist yet, so create...
739        Chain chain = new ChainImpl();
740        chain.setId(entityId);
741        entityChains.add(chain);
742
743        return chain;
744    }
745
746    @Override
747    public void consumeExptl(Exptl exptl) {
748        for (int rowIndex = 0; rowIndex < exptl.getRowCount(); rowIndex++) {
749            pdbHeader.setExperimentalTechnique(exptl.getMethod().get(rowIndex));
750        }
751    }
752
753    @Override
754    public void consumePdbxAuditRevisionHistory(PdbxAuditRevisionHistory pdbxAuditRevisionHistory) {
755        Date date = null;
756        for (int rowIndex = 0; rowIndex < pdbxAuditRevisionHistory.getRowCount(); rowIndex++) {
757            // first entry in revision history is the release date
758            if (pdbxAuditRevisionHistory.getOrdinal().get(rowIndex) == 1) {
759                String release = pdbxAuditRevisionHistory.getRevisionDate().get(rowIndex);
760                date = convert(LocalDate.parse(release, DATE_FORMAT));
761                pdbHeader.setRelDate(date);
762            } else {
763                // all other dates are revision dates;
764                // since this method may be called multiple times,
765                // the last revision date will "stick"
766                String revision = pdbxAuditRevisionHistory.getRevisionDate().get(rowIndex);
767                date = convert(LocalDate.parse(revision, DATE_FORMAT));
768            }
769            pdbHeader.setModDate(date);
770        }
771    }
772
773    @Override
774    public void consumePdbxChemCompIdentifier(PdbxChemCompIdentifier pdbxChemCompIdentifier) {
775        // TODO not impled in ref
776    }
777
778    @Override
779    public void consumePdbxDatabaseStatus(PdbxDatabaseStatus pdbxDatabaseStatus) {
780        for (int rowIndex = 0; rowIndex < pdbxDatabaseStatus.getRowCount(); rowIndex++) {
781            // the deposition date field is only available in mmCIF 5.0
782            StrColumn recvdInitialDepositionDate = pdbxDatabaseStatus.getRecvdInitialDepositionDate();
783            if (recvdInitialDepositionDate.isDefined()) {
784                String deposition = recvdInitialDepositionDate.get(rowIndex);
785                pdbHeader.setDepDate(convert(LocalDate.parse(deposition, DATE_FORMAT)));
786            }
787        }
788    }
789
790    @Override
791    public void consumePdbxEntityBranchDescriptor(PdbxEntityBranchDescriptor pdbxEntityBranchDescriptor) {
792        // TODO not considered in ref
793    }
794
795    @Override
796    public void consumePdbxMolecule(PdbxMolecule pdbxMolecule) {
797        // TODO not considered in ref
798    }
799
800    @Override
801    public void consumePdbxMoleculeFeatures(PdbxMoleculeFeatures pdbxMoleculeFeatures) {
802        // TODO not considered in ref
803    }
804
805    @Override
806    public void consumePdbxNonpolyScheme(PdbxNonpolyScheme pdbxNonpolyScheme) {
807        // TODO not impled in ref
808    }
809
810    @Override
811    public void consumePdbxReferenceEntityLink(PdbxReferenceEntityLink pdbxReferenceEntityLink) {
812        // TODO not considered in ref
813    }
814
815    @Override
816    public void consumePdbxReferenceEntityList(PdbxReferenceEntityList pdbxReferenceEntityList) {
817        // TODO not considered in ref
818    }
819
820    @Override
821    public void consumePdbxReferenceEntityPolyLink(PdbxReferenceEntityPolyLink pdbxReferenceEntityPolyLink) {
822        // TODO not considered in ref
823    }
824
825    @Override
826    public void consumePdbxStructAssembly(PdbxStructAssembly pdbxStructAssembly) {
827        this.structAssembly = pdbxStructAssembly;
828    }
829
830    @Override
831    public void consumePdbxStructAssemblyGen(PdbxStructAssemblyGen pdbxStructAssemblyGen) {
832        this.structAssemblyGen = pdbxStructAssemblyGen;
833    }
834
835    @Override
836    public void consumePdbxStructModResidue(PdbxStructModResidue pdbxStructModResidue) {
837        // TODO not considered in ref
838    }
839
840    @Override
841    public void consumePdbxStructOperList(PdbxStructOperList pdbxStructOperList) {
842        this.structOpers = pdbxStructOperList;
843    }
844
845    @Override
846    public void consumeRefine(Refine refine) {
847        for (int rowIndex = 0; rowIndex < refine.getRowCount(); rowIndex++) {
848            // RESOLUTION
849                ValueKind valueKind = refine.getLsDResHigh().getValueKind(rowIndex);
850                if (! ValueKind.PRESENT.equals(valueKind)) {
851                        continue;
852                }
853            // in very rare cases (for instance hybrid methods x-ray + neutron diffraction, e.g. 3ins, 4n9m)
854            // there are 2 resolution values, one for each method
855            // we take the last one found so that behaviour is like in PDB file parsing
856            double lsDResHigh = refine.getLsDResHigh().get(rowIndex);
857            // TODO this could use a check to keep reasonable values - 1.5 may be overwritten by 0.0
858            if (pdbHeader.getResolution() != PDBHeader.DEFAULT_RESOLUTION) {
859                logger.warn("More than 1 resolution value present, will use last one {} and discard previous {}",
860                        lsDResHigh, String.format("%4.2f",pdbHeader.getResolution()));
861            }
862            pdbHeader.setResolution((float) lsDResHigh);
863
864            FloatColumn lsRFactorRFree = refine.getLsRFactorRFree();
865            // RFREE
866            if (pdbHeader.getRfree() != PDBHeader.DEFAULT_RFREE) {
867                logger.warn("More than 1 Rfree value present, will use last one {} and discard previous {}",
868                        lsRFactorRFree, String.format("%4.2f",pdbHeader.getRfree()));
869            }
870            if (lsRFactorRFree.isDefined() && lsRFactorRFree.getValueKind(rowIndex) == ValueKind.PRESENT) {
871                pdbHeader.setRfree((float) lsRFactorRFree.get(rowIndex));
872            } else {
873                // some entries like 2ifo haven't got this field at all
874                logger.info("_refine.ls_R_factor_R_free not present, not parsing Rfree value");
875            }
876
877            // RWORK
878            FloatColumn lsRFactorRWork = refine.getLsRFactorRWork();
879            if(pdbHeader.getRwork() != PDBHeader.DEFAULT_RFREE) {
880                logger.warn("More than 1 R work value present, will use last one {} and discard previous {} ",
881                        lsRFactorRWork, String.format("%4.2f",pdbHeader.getRwork()));
882            }
883            if (lsRFactorRWork.isDefined() && lsRFactorRWork.getValueKind(rowIndex) == ValueKind.PRESENT) {
884                pdbHeader.setRwork((float) lsRFactorRWork.get(rowIndex));
885            } else {
886                logger.info("_refine.ls_R_factor_R_work not present, not parsing R-work value");
887            }
888        }
889    }
890
891    @Override
892    public void consumeStruct(Struct struct) {
893        if (struct.isDefined() && struct.getTitle().isDefined()) {
894            pdbHeader.setTitle(struct.getTitle().get(0));
895        }
896
897        if (struct.isDefined() && struct.getEntryId().isDefined()) {
898            PdbId pdbId;
899            String pdbCode = struct.getEntryId().get(0);
900            if(pdbCode.isBlank()){
901                pdbId = null;
902            } else {
903                try {
904                    pdbId = new PdbId(pdbCode);
905                } catch (IllegalArgumentException e) {
906                    logger.warn("Malformed PDB ID {}. setting PdbId to null", pdbCode);
907                    pdbId = null;
908                }
909            }
910            pdbHeader.setPdbId(pdbId);
911            structure.setPdbId(pdbId);
912        }
913    }
914
915    @Override
916    public void consumeStructAsym(StructAsym structAsym) {
917        this.structAsym = structAsym;
918    }
919
920    @Override
921    public void consumeStructConf(StructConf structConf) {
922        // TODO not considered in ref
923    }
924
925    @Override
926    public void consumeStructConn(StructConn structConn) {
927        this.structConn = structConn;
928    }
929
930    @Override
931    public void consumeStructConnType(StructConnType structConnType) {
932        // TODO not considered in ref
933    }
934
935    @Override
936    public void consumeStructKeywords(StructKeywords structKeywords) {
937        ArrayList<String> keywordsList = new ArrayList<>();
938
939        StrColumn text = structKeywords.getText();
940        if (text.isDefined()) {
941            String keywords = text.get(0);
942            String[] strings = keywords.split(" *, *");
943            for (String string : strings) {
944                keywordsList.add(string.trim());
945            }
946        }
947        structure.getPDBHeader().setKeywords(keywordsList);
948
949        StrColumn pdbxKeywords = structKeywords.getPdbxKeywords();
950        if (pdbxKeywords.isDefined()) {
951            String keywords = pdbxKeywords.get(0);
952            pdbHeader.setClassification(keywords);
953            //This field should be left empty. TODO The next line should be removed later
954            pdbHeader.setDescription(keywords);
955        }
956    }
957
958    @Override
959    public void consumeStructNcsOper(StructNcsOper structNcsOper) {
960        this.structNcsOper = structNcsOper;
961    }
962
963    @Override
964    public void consumeStructRef(StructRef structRef) {
965        this.structRef = structRef;
966    }
967
968    @Override
969    public void consumeStructRefSeq(StructRefSeq structRefSeq) {
970        for (int rowIndex = 0; rowIndex < structRefSeq.getRowCount(); rowIndex++) {
971            String refId = structRefSeq.getRefId().get(rowIndex);
972
973            DBRef dbRef = new DBRef();
974
975            dbRef.setIdCode(structRefSeq.getPdbxPDBIdCode().isDefined()? structRefSeq.getPdbxPDBIdCode().get(rowIndex):null);
976            dbRef.setDbAccession(structRefSeq.getPdbxDbAccession().isDefined()? structRefSeq.getPdbxDbAccession().get(rowIndex):null);
977            dbRef.setDbIdCode(structRefSeq.getPdbxDbAccession().isDefined()? structRefSeq.getPdbxDbAccession().get(rowIndex):null);
978            dbRef.setChainName(structRefSeq.getPdbxStrandId().get(rowIndex));
979
980            OptionalInt structRefRowIndex = IntStream.range(0, structRef.getRowCount())
981                    .filter(i -> structRef.getId().get(i).equals(refId))
982                    .findFirst();
983
984            if (structRefRowIndex.isPresent()) {
985                dbRef.setDatabase(structRef.getDbName().get(structRefRowIndex.getAsInt()));
986                dbRef.setDbIdCode(structRef.getDbCode().get(structRefRowIndex.getAsInt()));
987            } else {
988                logger.info("could not find StructRef `{} for StructRefSeq {}", refId, rowIndex);
989            }
990
991            int seqBegin;
992            int seqEnd;
993
994            try {
995                seqBegin = Integer.parseInt(structRefSeq.getPdbxAuthSeqAlignBeg().get(rowIndex));
996                seqEnd = Integer.parseInt(structRefSeq.getPdbxAuthSeqAlignEnd().get(rowIndex));
997            } catch (NumberFormatException e) {
998                // this happens in a few entries, annotation error? e.g. 6eoj
999                logger.warn("Couldn't parse pdbx_auth_seq_align_beg/end in _struct_ref_seq. Will not store dbref " +
1000                        "alignment info for accession {}. Error: {}", dbRef.getDbAccession(), e.getMessage());
1001                return;
1002            }
1003
1004            char beginInsCode = ' ';
1005            String pdbxSeqAlignBegInsCode = structRefSeq.getPdbxSeqAlignBegInsCode().get(rowIndex);
1006            if (pdbxSeqAlignBegInsCode.length() > 0) {
1007                beginInsCode = pdbxSeqAlignBegInsCode.charAt(0);
1008            }
1009
1010            char endInsCode = ' ';
1011            String pdbxSeqAlignEndInsCode = structRefSeq.getPdbxSeqAlignEndInsCode().get(rowIndex);
1012            if (pdbxSeqAlignEndInsCode.length() > 0) {
1013                endInsCode = pdbxSeqAlignEndInsCode.charAt(0);
1014            }
1015
1016            if (beginInsCode == '?') {
1017                beginInsCode = ' ';
1018            }
1019            if (endInsCode == '?') {
1020                endInsCode = ' ';
1021            }
1022
1023            dbRef.setSeqBegin(seqBegin);
1024            dbRef.setInsertBegin(beginInsCode);
1025            dbRef.setSeqEnd(seqEnd);
1026            dbRef.setInsertEnd(endInsCode);
1027
1028            int dbSeqBegin = structRefSeq.getDbAlignBeg().get(rowIndex);
1029            int dbSeqEnd = structRefSeq.getDbAlignEnd().get(rowIndex);
1030
1031            char dbBeginInsCode = ' ';
1032            StrColumn pdbxDbAlignBegInsCodeCol = structRefSeq.getPdbxDbAlignBegInsCode();
1033            if (pdbxDbAlignBegInsCodeCol.isDefined()) {
1034                String pdbxDbAlignBegInsCode = pdbxDbAlignBegInsCodeCol.get(rowIndex);
1035                if (pdbxDbAlignBegInsCode.length() > 0) {
1036                    dbBeginInsCode = pdbxDbAlignBegInsCode.charAt(0);
1037                }
1038            }
1039
1040            char dbEndInsCode = ' ';
1041            StrColumn pdbxDbAlignEndInsCodeCol = structRefSeq.getPdbxDbAlignEndInsCode();
1042            if (pdbxDbAlignEndInsCodeCol.isDefined()) {
1043                String pdbxDbAlignEndInsCode = pdbxDbAlignEndInsCodeCol.get(rowIndex);
1044                if (pdbxDbAlignEndInsCode.length() > 0) {
1045                    dbEndInsCode = pdbxDbAlignEndInsCode.charAt(0);
1046                }
1047            }
1048
1049            if (dbBeginInsCode == '?') {
1050                dbBeginInsCode = ' ';
1051            }
1052            if (dbEndInsCode == '?') {
1053                dbEndInsCode = ' ';
1054            }
1055
1056            dbRef.setDbSeqBegin(dbSeqBegin);
1057            dbRef.setIdbnsBegin(dbBeginInsCode);
1058            dbRef.setDbSeqEnd(dbSeqEnd);
1059            dbRef.setIdbnsEnd(dbEndInsCode);
1060
1061            List<DBRef> dbrefs = structure.getDBRefs();
1062            if (dbrefs == null) {
1063                dbrefs = new ArrayList<>();
1064            }
1065            dbrefs.add(dbRef);
1066
1067            logger.debug(dbRef.toPDB());
1068
1069            structure.setDBRefs(dbrefs);
1070        }
1071    }
1072
1073    @Override
1074    public void consumeStructRefSeqDif(StructRefSeqDif structRefSeqDif) {
1075        this.structRefSeqDif = structRefSeqDif;
1076    }
1077
1078    @Override
1079    public void consumeStructSheetRange(StructSheetRange structSheetRange) {
1080        // TODO not considered in ref
1081    }
1082
1083    @Override
1084    public void consumeStructSite(StructSite structSite) {
1085        if (params.isHeaderOnly()) {
1086            return;
1087        }
1088
1089        List<Site> sites = structure.getSites();
1090        if (sites == null) {
1091            sites = new ArrayList<>();
1092        }
1093
1094        for (int rowIndex = 0; rowIndex < structSite.getRowCount(); rowIndex++) {
1095            Site site = null;
1096            for (Site asite : sites) {
1097                if (asite.getSiteID().equals(structSite.getId().get(rowIndex))) {
1098                    site = asite; // prevent duplicate siteIds
1099                }
1100            }
1101
1102            boolean addSite = false;
1103            if (site == null) {
1104                site = new Site();
1105                addSite = true;
1106            }
1107
1108            site.setSiteID(structSite.getId().get(rowIndex));
1109            site.setDescription(structSite.getDetails().get(rowIndex));
1110            site.setEvCode(structSite.getPdbxEvidenceCode().get(rowIndex));
1111
1112            if (addSite) {
1113                sites.add(site);
1114            }
1115        }
1116
1117        structure.setSites(sites);
1118    }
1119
1120    @Override
1121    public void consumeStructSiteGen(StructSiteGen structSiteGen) {
1122        this.structSiteGen = structSiteGen;
1123    }
1124
1125    @Override
1126    public void consumeSymmetry(Symmetry symmetry) {
1127        for (int rowIndex = 0; rowIndex < symmetry.getRowCount(); rowIndex++) {
1128            String spaceGroupString = symmetry.getSpaceGroupNameH_M().get(rowIndex);
1129            SpaceGroup spaceGroup = SymoplibParser.getSpaceGroup(spaceGroupString);
1130            if (spaceGroup == null) {
1131                logger.warn("Space group '{}' not recognised as a standard space group", spaceGroupString);
1132                structure.getPDBHeader()
1133                        .getCrystallographicInfo()
1134                        .setNonStandardSg(true);
1135            } else {
1136                structure.getPDBHeader()
1137                        .getCrystallographicInfo()
1138                        .setSpaceGroup(spaceGroup);
1139                structure.getPDBHeader()
1140                        .getCrystallographicInfo()
1141                        .setNonStandardSg(false);
1142            }
1143        }
1144    }
1145
1146    @Override
1147    public void finish() {
1148        if (currentChain != null) {
1149            currentChain.addGroup(currentGroup);
1150
1151            Optional<Chain> testChain = currentModel.stream()
1152                    .filter(chain -> chain.getId().equals(currentChain.getId()))
1153                    .findFirst();
1154
1155            if (!testChain.isPresent()) {
1156                currentModel.add(currentChain);
1157            }
1158        } else if (!params.isHeaderOnly()) {
1159            logger.warn("current chain is null at end of document.");
1160        }
1161
1162        allModels.add(currentModel);
1163
1164        initMaps();
1165
1166        for (int rowIndex = 0; rowIndex < structAsym.getRowCount(); rowIndex++) {
1167            String id = structAsym.getId().get(rowIndex);
1168            String entityId = structAsym.getEntityId().get(rowIndex);
1169            logger.debug("Entity {} matches asym_id: {}", entityId, id);
1170
1171            Chain chain = getEntityChain(entityId);
1172            Chain seqRes = (Chain) chain.clone();
1173            // to solve issue #160 (e.g. 3u7t)
1174            seqRes = removeSeqResHeterogeneity(seqRes);
1175            seqRes.setId(id);
1176            seqRes.setName(asymId2authorId.getOrDefault(id, id));
1177
1178            EntityType type = EntityType.entityTypeFromString(getEntityType(entityId));
1179            if (type == null || type == EntityType.POLYMER) {
1180                seqResChains.add(seqRes);
1181            }
1182
1183            logger.debug(" seqres: {} {}<", id, seqRes);
1184            addEntity(rowIndex, entityId, getEntityDescription(entityId), getEntityType(entityId));
1185        }
1186
1187        if (!structAsym.isDefined() || structAsym.getRowCount() == 0) {
1188            logger.warn("No _struct_asym category in file, no SEQRES groups will be added.");
1189        }
1190
1191        // entities
1192        // In addEntities above we created the entities if they were present in the file
1193        // Now we need to make sure that they are linked to chains and also that if they are not present in the file we
1194        // need to add them now
1195        linkEntities();
1196
1197        // now that we know the entities, we can add all chains to structure so that they are stored
1198        // properly as polymer/nonpolymer/water chains inside structure
1199        allModels.forEach(structure::addModel);
1200
1201        // Only align if requested (default) and not when headerOnly mode with no Atoms.
1202        // Otherwise, we store the empty SeqRes Groups unchanged in the right chains.
1203        if (params.isAlignSeqRes() && !params.isHeaderOnly()){
1204            logger.debug("Parsing mode align_seqres, will parse SEQRES and align to ATOM sequence");
1205            alignSeqRes();
1206        } else {
1207            logger.debug("Parsing mode unalign_seqres, will parse SEQRES but not align it to ATOM sequence");
1208            SeqRes2AtomAligner.storeUnAlignedSeqRes(structure, seqResChains, params.isHeaderOnly());
1209        }
1210
1211        // Now make sure all altlocgroups have all the atoms in all the groups
1212        StructureTools.cleanUpAltLocs(structure);
1213
1214        // NOTE bonds and charges can only be done at this point that the chain id mapping is properly sorted out
1215        if (!params.isHeaderOnly()) {
1216            if (params.shouldCreateAtomBonds()) {
1217                addBonds();
1218            }
1219
1220            if (params.shouldCreateAtomCharges()) {
1221                addCharges();
1222            }
1223        }
1224
1225        if (!params.isHeaderOnly()) {
1226            addSites();
1227        }
1228
1229        // set the oligomeric state info in the header...
1230        if (params.isParseBioAssembly()) {
1231            // the more detailed mapping of chains to rotation operations happens in StructureIO...
1232
1233            Map<Integer, BioAssemblyInfo> bioAssemblies = new LinkedHashMap<>();
1234            for (int i = 0; i < structAssembly.getRowCount(); i++) {
1235                String assemblyId = structAssembly.getId().get(i);
1236                List<Integer> structAssemblyGenIndices = new ArrayList<>();
1237                for (int j = 0; j < structAssemblyGen.getRowCount(); j++) {
1238                    if (structAssemblyGen.getAssemblyId().get(j).equals(assemblyId)) {
1239                        structAssemblyGenIndices.add(j);
1240                    }
1241                }
1242                BiologicalAssemblyBuilder builder = new BiologicalAssemblyBuilder();
1243                // these are the transformations that need to be applied to our model
1244                List<BiologicalAssemblyTransformation> transformations = builder.getBioUnitTransformationList(structAssembly,
1245                        i, structAssemblyGen, structOpers);
1246
1247                int bioAssemblyId = -1;
1248                try {
1249                    bioAssemblyId = Integer.parseInt(assemblyId);
1250                } catch (NumberFormatException e) {
1251                    logger.info("Could not parse a numerical bio assembly id from '{}'", assemblyId);
1252                }
1253
1254                // if bioassembly id is not numerical we throw it away
1255                // this happens usually for viral capsid entries, like 1ei7
1256                // see issue #230 in github
1257                if (bioAssemblyId != -1) {
1258                    int mmSize = 0;
1259                    // note that the transforms contain asym ids of both polymers and non-polymers
1260                    // For the mmsize, we are only interested in the polymers
1261                    for (BiologicalAssemblyTransformation transf : transformations) {
1262                        Chain c = structure.getChain(transf.getChainId());
1263                        if (c == null) {
1264                            logger.info("Could not find asym id {} specified in struct_assembly_gen", transf.getChainId());
1265                            continue;
1266                        }
1267                        if (c.getEntityType() == EntityType.POLYMER &&
1268                                // for entries like 4kro, sugars are annotated as polymers but we
1269                                // don't want them in the macromolecularSize count
1270                                !c.getEntityInfo().getDescription().contains("SUGAR")) {
1271                            mmSize++;
1272                        }
1273                    }
1274
1275                    BioAssemblyInfo bioAssembly = new BioAssemblyInfo();
1276                    bioAssembly.setId(bioAssemblyId);
1277                    bioAssembly.setMacromolecularSize(mmSize);
1278                    bioAssembly.setTransforms(transformations);
1279                    bioAssemblies.put(bioAssemblyId, bioAssembly);
1280                }
1281
1282            }
1283            structure.getPDBHeader()
1284                    .setBioAssemblies(bioAssemblies);
1285        }
1286
1287        setStructNcsOps();
1288        setCrystallographicInfoMetadata();
1289
1290        Map<String, List<SeqMisMatch>> misMatchMap = new HashMap<>();
1291        for (int rowIndex = 0; rowIndex < structRefSeqDif.getRowCount(); rowIndex++) {
1292            SeqMisMatch seqMisMatch = new SeqMisMatchImpl();
1293            seqMisMatch.setDetails(structRefSeqDif.getDetails().get(rowIndex));
1294
1295            String insCode = structRefSeqDif.getPdbxPdbInsCode().get(rowIndex);
1296                if ("?".equals(insCode)) {
1297                insCode = null;
1298            }
1299            seqMisMatch.setInsCode(insCode);
1300            seqMisMatch.setOrigGroup(structRefSeqDif.getDbMonId().get(rowIndex));
1301            seqMisMatch.setPdbGroup(structRefSeqDif.getMonId().get(rowIndex));
1302            seqMisMatch.setPdbResNum(structRefSeqDif.getPdbxAuthSeqNum().get(rowIndex));
1303            seqMisMatch.setUniProtId(structRefSeqDif.getPdbxSeqDbAccessionCode().get(rowIndex));
1304            seqMisMatch.setSeqNum(structRefSeqDif.getSeqNum().get(rowIndex));
1305
1306            String strandId = structRefSeqDif.getPdbxPdbStrandId().get(rowIndex);
1307            List<SeqMisMatch> seqMisMatches = misMatchMap.computeIfAbsent(strandId, k -> new ArrayList<>());
1308            seqMisMatches.add(seqMisMatch);
1309        }
1310
1311        for (String chainId : misMatchMap.keySet()){
1312            Chain chain = structure.getPolyChainByPDB(chainId);
1313            if (chain == null) {
1314                logger.warn("Could not set mismatches for chain with author id {}", chainId);
1315                continue;
1316            }
1317
1318            chain.setSeqMisMatches(misMatchMap.get(chainId));
1319        }
1320    }
1321
1322    private String getEntityType(String entityId) {
1323        return IntStream.range(0, entity.getRowCount())
1324                .filter(i -> entity.getId().get(i).equals(entityId))
1325                .mapToObj(i -> entity.getType().get(i))
1326                .findFirst()
1327                .orElseThrow(() -> new NoSuchElementException("could not find entity with id " + entityId));
1328    }
1329
1330    private String getEntityDescription(String entityId) {
1331        return IntStream.range(0, entity.getRowCount())
1332                .filter(i -> entity.getId().get(i).equals(entityId))
1333                .mapToObj(i -> entity.getPdbxDescription().isDefined()? entity.getPdbxDescription().get(i):"")
1334                .findFirst()
1335                .orElseThrow(() -> new NoSuchElementException("could not find entity with id " + entityId));
1336    }
1337
1338    private void addEntity(int asymRowIndex, String entityId, String pdbxDescription, String type) {
1339        int eId = 0;
1340        try {
1341            eId = Integer.parseInt(entityId);
1342        } catch (NumberFormatException e) {
1343            logger.warn("Could not parse mol_id from string {}. Will use 0 for creating Entity", entityId);
1344        }
1345
1346        int entityRowIndex = IntStream.range(0, entity.getRowCount())
1347                .filter(i -> entity.getId().get(i).equals(entityId))
1348                .findFirst()
1349                .orElse(-1);
1350
1351        EntityInfo entityInfo = structure.getEntityById(eId);
1352
1353        if (entityInfo == null) {
1354            entityInfo = new EntityInfo();
1355            entityInfo.setMolId(eId);
1356            // we only add the compound if a polymeric one (to match what the PDB parser does)
1357            if (entityRowIndex != -1) {
1358                entityInfo.setDescription(pdbxDescription);
1359
1360                EntityType eType = EntityType.entityTypeFromString(type);
1361                if (eType != null) {
1362                    entityInfo.setType(eType);
1363                } else {
1364                    logger.warn("Type '{}' is not recognised as a valid entity type for entity {}", type, eId);
1365                }
1366                addAncilliaryEntityData(asymRowIndex, entityInfo);
1367                structure.addEntityInfo(entityInfo);
1368                logger.debug("Adding Entity with entity id {} from _entity, with name: {}", eId,
1369                        entityInfo.getDescription());
1370            }
1371        }
1372    }
1373
1374    private void addAncilliaryEntityData(int asymRowIndex, EntityInfo entityInfo) {
1375        // Loop through each of the entity types and add the corresponding data
1376        // We're assuming if data is duplicated between sources it is consistent
1377        // This is a potentially huge assumption...
1378
1379        for (int rowIndex = 0; rowIndex < entitySrcGen.getRowCount(); rowIndex++) {
1380            if (!entitySrcGen.getEntityId().get(rowIndex).equals(structAsym.getEntityId().get(asymRowIndex))) {
1381                continue;
1382            }
1383
1384            addInformationFromEntitySrcGen(rowIndex, entityInfo);
1385        }
1386
1387        for (int rowIndex = 0; rowIndex < entitySrcNat.getRowCount(); rowIndex++) {
1388            if (!entitySrcNat.getEntityId().get(rowIndex).equals(structAsym.getEntityId().get(asymRowIndex))) {
1389                continue;
1390            }
1391
1392            addInformationFromEntitySrcNat(rowIndex, entityInfo);
1393        }
1394
1395        for (int rowIndex = 0; rowIndex < entitySrcSyn.getRowCount(); rowIndex++) {
1396            if (!entitySrcSyn.getEntityId().get(rowIndex).equals(structAsym.getEntityId().get(asymRowIndex))) {
1397                continue;
1398            }
1399
1400            addInformationFromEntitySrcSyn(rowIndex, entityInfo);
1401        }
1402    }
1403
1404    private void addInformationFromEntitySrcSyn(int rowIndex, EntityInfo entityInfo) {
1405        entityInfo.setOrganismCommon(getCifFieldNullAware(entitySrcSyn.getOrganismCommonName(), rowIndex, null));
1406        entityInfo.setOrganismScientific(getCifFieldNullAware(entitySrcSyn.getOrganismScientific(), rowIndex, null));
1407        entityInfo.setOrganismTaxId(getCifFieldNullAware(entitySrcSyn.getNcbiTaxonomyId(), rowIndex, null));
1408    }
1409
1410    private void addInformationFromEntitySrcNat(int rowIndex, EntityInfo entityInfo) {
1411        entityInfo.setAtcc(getCifFieldNullAware(entitySrcNat.getPdbxAtcc(), rowIndex, null));
1412        entityInfo.setCell(getCifFieldNullAware(entitySrcNat.getPdbxCell(), rowIndex, null));
1413        entityInfo.setOrganismCommon(getCifFieldNullAware(entitySrcNat.getCommonName(), rowIndex, null));
1414        entityInfo.setOrganismScientific(getCifFieldNullAware(entitySrcNat.getPdbxOrganismScientific(), rowIndex, null));
1415        entityInfo.setOrganismTaxId(getCifFieldNullAware(entitySrcNat.getPdbxNcbiTaxonomyId(), rowIndex, null));
1416    }
1417
1418    private void addInformationFromEntitySrcGen(int rowIndex, EntityInfo entityInfo) {
1419        entityInfo.setAtcc(getCifFieldNullAware(entitySrcGen.getPdbxGeneSrcAtcc(), rowIndex, null));
1420        entityInfo.setCell(getCifFieldNullAware(entitySrcGen.getPdbxGeneSrcCell(), rowIndex, null));
1421        entityInfo.setOrganismCommon(getCifFieldNullAware(entitySrcGen.getGeneSrcCommonName(), rowIndex, null));
1422        entityInfo.setOrganismScientific(getCifFieldNullAware(entitySrcGen.getPdbxGeneSrcScientificName(), rowIndex, null));
1423        entityInfo.setOrganismTaxId(getCifFieldNullAware(entitySrcGen.getPdbxGeneSrcNcbiTaxonomyId(), rowIndex, null));
1424        entityInfo.setExpressionSystemTaxId(getCifFieldNullAware(entitySrcGen.getPdbxHostOrgNcbiTaxonomyId(), rowIndex, null));
1425        entityInfo.setExpressionSystem(getCifFieldNullAware(entitySrcGen.getPdbxHostOrgScientificName(), rowIndex, null));
1426    }
1427
1428    private String getCifFieldNullAware(StrColumn column, int rowIndex, String defaultValue) {
1429        if (column.isDefined())
1430            return column.get(rowIndex);
1431        else
1432            return defaultValue;
1433    }
1434
1435    private void setStructNcsOps() {
1436        List<Matrix4d> ncsOperators = new ArrayList<>();
1437
1438        for (int rowIndex = 0; rowIndex < structNcsOper.getRowCount(); rowIndex++) {
1439            if (!"generate".equals(structNcsOper.getCode().get(rowIndex))) {
1440                continue;
1441            }
1442
1443            try {
1444                Matrix4d operator = new Matrix4d();
1445
1446                operator.setElement(0, 0, structNcsOper.getMatrix11().get(rowIndex));
1447                operator.setElement(0, 1, structNcsOper.getMatrix12().get(rowIndex));
1448                operator.setElement(0, 2, structNcsOper.getMatrix13().get(rowIndex));
1449                operator.setElement(0, 3, structNcsOper.getVector1().get(rowIndex));
1450
1451                operator.setElement(1, 0, structNcsOper.getMatrix21().get(rowIndex));
1452                operator.setElement(1, 1, structNcsOper.getMatrix22().get(rowIndex));
1453                operator.setElement(1, 2, structNcsOper.getMatrix23().get(rowIndex));
1454                operator.setElement(1, 3, structNcsOper.getVector2().get(rowIndex));
1455
1456                operator.setElement(2, 0, structNcsOper.getMatrix31().get(rowIndex));
1457                operator.setElement(2, 1, structNcsOper.getMatrix32().get(rowIndex));
1458                operator.setElement(2, 2, structNcsOper.getMatrix33().get(rowIndex));
1459                operator.setElement(2, 3, structNcsOper.getVector3().get(rowIndex));
1460
1461                operator.setElement(3, 0, 0);
1462                operator.setElement(3, 1, 0);
1463                operator.setElement(3, 2, 0);
1464                operator.setElement(3, 3, 1);
1465
1466                ncsOperators.add(operator);
1467            } catch (NumberFormatException e) {
1468                logger.warn("Error parsing doubles in NCS operator list, skipping operator {}", rowIndex + 1);
1469            }
1470        }
1471
1472        if (ncsOperators.size() > 0) {
1473            structure.getCrystallographicInfo()
1474                    .setNcsOperators(ncsOperators.toArray(new Matrix4d[0]));
1475        }
1476    }
1477
1478    private void setCrystallographicInfoMetadata() {
1479        if (parsedScaleMatrix != null) {
1480            PDBCrystallographicInfo crystalInfo = structure.getCrystallographicInfo();
1481            boolean nonStd = false;
1482            if (crystalInfo.getCrystalCell() != null && !crystalInfo.getCrystalCell().checkScaleMatrix(parsedScaleMatrix)) {
1483                nonStd = true;
1484            }
1485
1486            crystalInfo.setNonStandardCoordFrameConvention(nonStd);
1487        }
1488    }
1489
1490    private void addSites() {
1491        List<Site> sites = structure.getSites();
1492        if (sites == null) sites = new ArrayList<>();
1493
1494        for (int rowIndex = 0; rowIndex < structSiteGen.getRowCount(); rowIndex++) {
1495            // For each StructSiteGen, find the residues involved, if they exist then
1496            String site_id = structSiteGen.getSiteId().get(rowIndex); // multiple could be in same site.
1497            if (site_id == null) {
1498                site_id = "";
1499            }
1500            String comp_id = structSiteGen.getLabelCompId().get(rowIndex);  // PDBName
1501
1502            // Assumption: the author chain ID and residue number for the site is consistent with the original
1503            // author chain id and residue numbers.
1504
1505            String asymId = structSiteGen.getLabelAsymId().get(rowIndex); // chain name
1506            String authId = structSiteGen.getAuthAsymId().get(rowIndex); // chain Id
1507            String auth_seq_id = structSiteGen.getAuthSeqId().get(rowIndex); // Res num
1508
1509            String insCode = structSiteGen.getPdbxAuthInsCode().get(rowIndex);
1510            if ("?".equals(insCode)) {
1511                insCode = null;
1512            }
1513
1514            // Look for asymID = chainID and seqID = seq_ID.  Check that comp_id matches the resname.
1515            Group g = null;
1516            try {
1517                Chain chain = structure.getChain(asymId);
1518
1519                if (null != chain) {
1520                    try {
1521                        Character insChar = null;
1522                        if (null != insCode && insCode.length() > 0) {
1523                            insChar = insCode.charAt(0);
1524                        }
1525                        g = chain.getGroupByPDB(new ResidueNumber(null, Integer.parseInt(auth_seq_id), insChar));
1526                    } catch (NumberFormatException e) {
1527                        logger.warn("Could not lookup residue : {}{}", authId, auth_seq_id);
1528                    }
1529                }
1530            } catch (StructureException e) {
1531                logger.warn("Problem finding residue in site entry {} - {}",
1532                        structSiteGen.getSiteId().get(rowIndex), e.getMessage());
1533            }
1534
1535            if (g != null) {
1536                // 2. find the site_id, if not existing, create anew.
1537                Site site = null;
1538                for (Site asite : sites) {
1539                    if (site_id.equals(asite.getSiteID())) {
1540                        site = asite;
1541                    }
1542                }
1543
1544                boolean addSite = false;
1545
1546                // 3. add this residue to the site.
1547                if (site == null) {
1548                    addSite = true;
1549                    site = new Site();
1550                    site.setSiteID(site_id);
1551                }
1552
1553                List<Group> groups = site.getGroups();
1554                if (groups == null) {
1555                    groups = new ArrayList<>();
1556                }
1557
1558                // Check the self-consistency of the residue reference from auth_seq_id and chain_id
1559                if (!comp_id.equals(g.getPDBName())) {
1560                    logger.warn("comp_id doesn't match the residue at {} {} - skipping", authId, auth_seq_id);
1561                } else {
1562                    groups.add(g);
1563                    site.setGroups(groups);
1564                }
1565                if (addSite) {
1566                    sites.add(site);
1567                }
1568            }
1569        }
1570        structure.setSites(sites);
1571    }
1572
1573    private void addCharges() {
1574        ChargeAdder.addCharges(structure);
1575    }
1576
1577    /**
1578     * The method will return a new reference to a Chain with any consecutive groups
1579     * having same residue numbers removed.
1580     * This is necessary to solve the microheterogeneity issue in entries like 3u7t (see github issue #160)
1581     */
1582    private static Chain removeSeqResHeterogeneity(Chain c) {
1583        Chain trimmedChain = new ChainImpl();
1584        ResidueNumber lastResNum = null;
1585
1586        for (Group g : c.getAtomGroups()) {
1587            // note we have to deep copy this, otherwise they stay linked and would get altered in addGroup(g)
1588            ResidueNumber currentResNum = new ResidueNumber(
1589                    g.getResidueNumber().getChainName(),
1590                    g.getResidueNumber().getSeqNum(),
1591                    g.getResidueNumber().getInsCode());
1592
1593            if (lastResNum == null || !lastResNum.equals(currentResNum)) {
1594                trimmedChain.addGroup(g);
1595            } else {
1596                logger.debug("Removing seqres group because it seems to be repeated in entity_poly_seq, most likely has hetero='y': {}", g);
1597            }
1598            lastResNum = currentResNum;
1599
1600        }
1601        return trimmedChain;
1602    }
1603
1604    private void addBonds() {
1605        BondMaker maker = new BondMaker(structure, params);
1606        maker.makeBonds();
1607        maker.formBondsFromStructConn(structConn);
1608    }
1609
1610    private void alignSeqRes() {
1611        logger.debug("Parsing mode align_seqres, will align to ATOM to SEQRES sequence");
1612
1613        // fix SEQRES residue numbering for all models
1614
1615        for (int model = 0; model < structure.nrModels(); model++) {
1616            List<Chain> atomList   = structure.getPolyChains(model);
1617
1618            if (seqResChains.isEmpty()) {
1619                // in files without _entity, seqResChains object is empty: we replace by atomChains resulting below in a trivial alignment and a copy of atom groups to seqres groups
1620                seqResChains = atomList;
1621            }
1622
1623            for (Chain seqResChain : seqResChains){
1624
1625                // this extracts the matching atom chain from atomList
1626                Chain atomChain = SeqRes2AtomAligner.getMatchingAtomRes(seqResChain, atomList, true);
1627
1628                if (atomChain == null) {
1629                    // most likely there's no observed residues at all for the seqres chain: can't map
1630                    // e.g. 3zyb: chains with asym_id L,M,N,O,P have no observed residues
1631                    logger.info("Could not map SEQRES chain with asym_id={} to any ATOM chain. Most likely there's " +
1632                            "no observed residues in the chain.", seqResChain.getId());
1633                    continue;
1634                }
1635
1636                //map the atoms to the seqres...
1637
1638                // we need to first clone the seqres so that they stay independent for different models
1639                List<Group> seqResGroups = new ArrayList<>();
1640                for (int i = 0; i < seqResChain.getAtomGroups().size(); i++) {
1641                    seqResGroups.add((Group)seqResChain.getAtomGroups().get(i).clone());
1642                }
1643
1644                for (int seqResPos = 0 ; seqResPos < seqResGroups.size(); seqResPos++) {
1645                    Group seqresG = seqResGroups.get(seqResPos);
1646                    boolean found = false;
1647                    for (Group atomG : atomChain.getAtomGroups()) {
1648
1649                        int internalNr = getInternalNr(atomG);
1650
1651                        if (seqresG.getResidueNumber().getSeqNum() == internalNr) {
1652                            seqResGroups.set(seqResPos, atomG);
1653                            found = true;
1654                            break;
1655                        }
1656                    }
1657
1658                    if (!found)
1659                        // so far the residue number has tracked internal numbering.
1660                        // however there are no atom records, as such this can't be a PDB residue number...
1661                        seqresG.setResidueNumber(null);
1662                }
1663                atomChain.setSeqResGroups(seqResGroups);
1664            }
1665        }
1666    }
1667
1668    private int getInternalNr(Group atomG) {
1669        if (atomG.getType().equals(GroupType.AMINOACID)) {
1670            AminoAcidImpl aa = (AminoAcidImpl) atomG;
1671            return (int) aa.getId();
1672        } else if (atomG.getType().equals(GroupType.NUCLEOTIDE)) {
1673            NucleotideImpl nu = (NucleotideImpl) atomG;
1674            return (int) nu.getId();
1675        } else {
1676            HetatomImpl he = (HetatomImpl) atomG;
1677            return (int) he.getId();
1678        }
1679    }
1680
1681    private void linkEntities() {
1682        for (List<Chain> allModel : allModels) {
1683            for (Chain chain : allModel) {
1684                //logger.info("linking entities for " + chain.getId() + " "  + chain.getName());
1685                String entityId = asymId2entityId.get(chain.getId());
1686
1687                if (entityId == null) {
1688                    // this can happen for instance if the cif file didn't have _struct_asym category at all
1689                    // and thus we have no asymId2entityId mapping at all
1690                    logger.info("No entity id could be found for chain {}", chain.getId());
1691                    continue;
1692                }
1693
1694                int eId = Integer.parseInt(entityId);
1695
1696                // Entities are not added for non-polymeric entities, if a chain is non-polymeric its entity won't be found.
1697                // TODO: add all entities and unique compounds and add methods to directly get polymer or non-polymer
1698                // asyms (chains).  Either create a unique StructureImpl or modify existing for a better representation of the
1699                // mmCIF internal data structures but is compatible with Structure interface.
1700                // Some examples of PDB entries with this kind of problem:
1701                //   - 2uub: asym_id X, chainName Z, entity_id 24: fully non-polymeric but still with its own chainName
1702                //   - 3o6j: asym_id K, chainName Z, entity_id 6 : a single water molecule
1703                //   - 1dz9: asym_id K, chainName K, entity_id 6 : a potassium ion alone
1704
1705                EntityInfo entityInfo = structure.getEntityById(eId);
1706                if (entityInfo == null) {
1707                    // Supports the case where the only chain members were from non-polymeric entity that is missing.
1708                    // Solved by creating a new Compound(entity) to which this chain will belong.
1709                    logger.info("Could not find an Entity for entity_id {}, for chain id {}, creating a new Entity.",
1710                            eId, chain.getId());
1711                    entityInfo = new EntityInfo();
1712                    entityInfo.setMolId(eId);
1713                    entityInfo.addChain(chain);
1714                    if (chain.isWaterOnly()) {
1715                        entityInfo.setType(EntityType.WATER);
1716                    } else {
1717                        entityInfo.setType(EntityType.NONPOLYMER);
1718                    }
1719                    chain.setEntityInfo(entityInfo);
1720                    structure.addEntityInfo(entityInfo);
1721                } else {
1722                    logger.debug("Adding chain with chain id {} (auth id {}) to Entity with entity_id {}",
1723                            chain.getId(), chain.getName(), eId);
1724                    entityInfo.addChain(chain);
1725                    chain.setEntityInfo(entityInfo);
1726                }
1727
1728            }
1729
1730        }
1731
1732        // if no entity information was present in file we then go and find the entities heuristically with EntityFinder
1733        List<EntityInfo> entityInfos = structure.getEntityInfos();
1734        if (entityInfos == null || entityInfos.isEmpty()) {
1735            List<List<Chain>> polyModels = new ArrayList<>();
1736            List<List<Chain>> nonPolyModels = new ArrayList<>();
1737            List<List<Chain>> waterModels = new ArrayList<>();
1738
1739            for (List<Chain> model : allModels) {
1740                List<Chain> polyChains = new ArrayList<>();
1741                List<Chain> nonPolyChains = new ArrayList<>();
1742                List<Chain> waterChains = new ArrayList<>();
1743
1744                polyModels.add(polyChains);
1745                nonPolyModels.add(nonPolyChains);
1746                waterModels.add(waterChains);
1747
1748                for (Chain chain : model) {
1749                    // we only have entities for polymeric chains, all others are ignored for assigning entities
1750                    if (chain.isWaterOnly()) {
1751                        waterChains.add(chain);
1752                    } else if (chain.isPureNonPolymer()) {
1753                        nonPolyChains.add(chain);
1754                    } else {
1755                        polyChains.add(chain);
1756                    }
1757                }
1758            }
1759
1760            entityInfos = EntityFinder.findPolyEntities(polyModels);
1761            EntityFinder.createPurelyNonPolyEntities(nonPolyModels, waterModels, entityInfos);
1762
1763            structure.setEntityInfos(entityInfos);
1764        }
1765
1766        // final sanity check: it can happen that from the annotated entities some are not linked to any chains
1767        // e.g. 3s26: a sugar entity does not have any chains associated to it (it seems to be happening with many sugar compounds)
1768        // we simply log it, this can sign some other problems if the entities are used down the line
1769        for (EntityInfo e : entityInfos) {
1770            if (e.getChains().isEmpty()) {
1771                logger.info("Entity {} '{}' has no chains associated to it",
1772                        e.getMolId() < 0 ? "with no entity id" : e.getMolId(), e.getDescription());
1773            }
1774        }
1775    }
1776
1777    private void initMaps() {
1778        if (structAsym == null || !structAsym.isDefined() || structAsym.getRowCount() == 0) {
1779            logger.info("No _struct_asym category found in file. No asym id to entity_id mapping will be available");
1780            return;
1781        }
1782
1783        Map<String, List<String>> entityId2asymId = new HashMap<>();
1784        for (int rowIndex = 0; rowIndex < structAsym.getRowCount(); rowIndex++) {
1785            String id = structAsym.getId().get(rowIndex);
1786            String entityId = structAsym.getEntityId().get(rowIndex);
1787
1788            logger.debug("Entity {} matches asym_id: {}", entityId, id);
1789
1790            asymId2entityId.put(id, entityId);
1791
1792            if (entityId2asymId.containsKey(entityId)) {
1793                List<String> asymIds = entityId2asymId.get(entityId);
1794                asymIds.add(id);
1795            } else {
1796                List<String> asymIds = new ArrayList<>();
1797                asymIds.add(id);
1798                entityId2asymId.put(entityId, asymIds);
1799            }
1800        }
1801
1802        if (entityPoly == null || !entityPoly.isDefined() || entityPoly.getRowCount() == 0) {
1803            logger.info("No _entity_poly category found in file. No asym id to author id mapping will be available " +
1804                    "for header only parsing");
1805            return;
1806        }
1807
1808        for (int rowIndex = 0; rowIndex < entityPoly.getRowCount(); rowIndex++) {
1809            if (!entityPoly.getPdbxStrandId().isDefined()) {
1810                logger.info("_entity_poly.pdbx_strand_id is null for entity {}. Won't be able to map asym ids to " +
1811                        "author ids for this entity.", entityPoly.getEntityId().get(rowIndex));
1812                break;
1813            }
1814
1815            String[] chainNames = entityPoly.getPdbxStrandId().get(rowIndex).split(",");
1816            List<String> asymIds = entityId2asymId.get(entityPoly.getEntityId().get(rowIndex));
1817            if (chainNames.length != asymIds.size()) {
1818                logger.warn("The list of asym ids (from _struct_asym) and the list of author ids (from _entity_poly) " +
1819                        "for entity {} have different lengths! Can't provide a mapping from asym ids to author chain " +
1820                        "ids", entityPoly.getEntityId().get(rowIndex));
1821                break;
1822            }
1823
1824            for (int i = 0; i < chainNames.length; i++) {
1825                asymId2authorId.put(asymIds.get(i), chainNames[i]);
1826            }
1827        }
1828    }
1829
1830    @Override
1831    public Structure getContainer() {
1832        return structure;
1833    }
1834}