001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 * Created on 12.03.2004
021 * @author Andreas Prlic
022 *
023 */
024package org.biojava.nbio.structure;
025
026
027import org.biojava.nbio.structure.chem.ChemComp;
028import org.biojava.nbio.structure.chem.ChemCompGroupFactory;
029import org.biojava.nbio.structure.chem.PolymerType;
030import org.biojava.nbio.structure.io.FileConvert;
031import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
032import org.biojava.nbio.core.sequence.ProteinSequence;
033import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
034import org.biojava.nbio.core.sequence.template.Sequence;
035import org.slf4j.Logger;
036import org.slf4j.LoggerFactory;
037
038import java.util.*;
039
040
041/**
042 * A Chain in a PDB file. It contains several groups which can be of
043 * one of the types defined in the {@link GroupType} constants.
044 *
045 * @author Andreas Prlic
046 * @author Jules Jacobsen
047 * @since 1.4
048 */
049public class ChainImpl implements Chain {
050
051        private final static Logger logger = LoggerFactory.getLogger(ChainImpl.class);
052
053        private static final long serialVersionUID = 1990171805277911840L;
054
055        /**
056         * The default chain identifier used to be an empty space
057         */
058        private static final String DEFAULT_CHAIN_ID = "A";
059
060        private String authId; // the 'public' chain identifier as assigned by authors in PDB files
061        private String asymId; // the 'internal' chain identifier as used in mmCIF files
062
063        private List<Group> groups;
064        private List<Group> seqResGroups;
065
066        private EntityInfo entity;
067        private Structure parent;
068
069        private final Map<String, Integer> pdbResnumMap;
070
071        private List<SeqMisMatch> seqMisMatches;
072
073        /**
074         *  Constructs a ChainImpl object.
075         */
076        public ChainImpl() {
077                super();
078
079                authId = DEFAULT_CHAIN_ID;
080                groups = new ArrayList<>() ;
081
082                seqResGroups = new ArrayList<>();
083                pdbResnumMap = new HashMap<>();
084                asymId = null;
085
086        }
087
088        @Override
089        public String getId() {
090                return asymId;
091        }
092
093        @Override
094        public void setId(String asymId) {
095                this.asymId = asymId;
096        }
097
098        @Override
099        public String getName() { return authId; }
100
101        @Override
102        public void setName(String authId) { this.authId = authId; }
103
104        @Override
105        public void setStructure(Structure parent){
106                this.parent = parent;
107        }
108
109        @Override
110        public Structure getStructure() {
111
112                return parent;
113        }
114
115        @Override
116        public Object clone() {
117
118                // go through all groups and add to new Chain.
119                ChainImpl n = new ChainImpl();
120                // copy chain data:
121
122                n.setId(getId());
123                n.setName(getName());
124
125                // NOTE the EntityInfo will be reset at the parent level (Structure) if cloning is happening from parent level
126                // here we don't deep-copy it and just keep the same reference, in case the cloning is happening at the Chain level only
127                n.setEntityInfo(this.entity);
128
129
130                for (Group group : groups) {
131                        Group g = (Group) group.clone();
132                        n.addGroup(g);
133                        g.setChain(n);
134                }
135
136                if (seqResGroups!=null){
137
138                        List<Group> tmpSeqRes = new ArrayList<>();
139
140                        // cloning seqres and atom groups is ugly, due to their
141                        // nested relationship (some of the atoms can be in the seqres, but not all)
142
143                        for (Group seqResGroup : seqResGroups) {
144
145                                if (seqResGroup==null) {
146                                        tmpSeqRes.add(null);
147                                        continue;
148                                }
149
150                                int i = groups.indexOf(seqResGroup);
151
152                                Group g ;
153
154                                if (i!=-1) {
155                                        // group found in atom groups, we get the equivalent reference from the newly cloned atom groups
156                                        g = n.getAtomGroup(i);
157                                } else {
158                                        // group not found in atom groups, we clone the seqres group
159                                        g = (Group) seqResGroup.clone();
160                                }
161                                g.setChain(n);
162                                tmpSeqRes.add(g);
163                        }
164
165                        n.setSeqResGroups(tmpSeqRes);
166                }
167
168                return n ;
169        }
170
171        @Override
172        public void setEntityInfo(EntityInfo mol) {
173                this.entity = mol;
174        }
175
176        @Override
177        public EntityInfo getEntityInfo() {
178                return this.entity;
179        }
180
181        @Override
182        public void addGroup(Group group) {
183
184                group.setChain(this);
185
186                // Set the altlocs chain as well
187                for(Group g : group.getAltLocs()) {
188                        g.setChain(this);
189                }
190
191                groups.add(group);
192
193                // store the position internally for quick access of this group
194
195                String pdbResnum = null ;
196                ResidueNumber resNum = group.getResidueNumber();
197                if ( resNum != null)
198                        pdbResnum = resNum.toString();
199                if ( pdbResnum != null) {
200                        Integer pos = groups.size() - 1;
201                        // ARGH sometimes numbering in PDB files is confusing.
202                        // e.g. PDB: 1sfe
203                        /*
204                         * ATOM    620  N   GLY    93     -24.320  -6.591   4.210  1.00 46.82           N
205                         * ATOM    621  CA  GLY    93     -24.960  -6.849   5.497  1.00 47.35           C
206                         * ATOM    622  C   GLY    93     -26.076  -5.873   5.804  1.00 47.24           C
207                         * ATOM    623  O   GLY    93     -26.382  -4.986   5.006  1.00 47.56           O
208                         *    and ...
209                         * HETATM 1348  O   HOH    92     -21.853 -16.886  19.138  1.00 66.92           O
210                         * HETATM 1349  O   HOH    93     -26.126   1.226  29.069  1.00 71.69           O
211                         * HETATM 1350  O   HOH    94     -22.250 -18.060  -6.401  1.00 61.97           O
212                         */
213
214                        // this check is to give in this case the entry priority that is an AminoAcid / comes first...
215                        // a good example of same residue number for 2 residues is 3th3, chain T, residue 201 (a LYS and a sugar BGC covalently attached to it) - JD 2016-03-09
216                        if (  pdbResnumMap.containsKey(pdbResnum)) {
217
218                                logger.warn("Adding residue {}({}) to chain {} but a residue with same residue number is already present: {}({}). Will add only the aminoacid residue (if any) to the lookup, lookups for that residue number won't work properly.",
219                                                pdbResnum, group.getPDBName(), getId(), groups.get(pdbResnumMap.get(pdbResnum)).getResidueNumber(), groups.get(pdbResnumMap.get(pdbResnum)).getPDBName());
220                                if ( group instanceof AminoAcid)
221                                        pdbResnumMap.put(pdbResnum,pos);
222                        } else
223                                pdbResnumMap.put(pdbResnum,pos);
224                }
225
226        }
227
228        @Override
229        public Group getAtomGroup(int position) {
230
231                return groups.get(position);
232        }
233
234        @Override
235        public List<Group> getAtomGroups(GroupType type){
236
237                List<Group> tmp = new ArrayList<>() ;
238                for (Group g : groups) {
239                        if (g.getType().equals(type)) {
240                                tmp.add(g);
241                        }
242                }
243
244                return tmp ;
245        }
246
247        @Override
248        public List<Group> getAtomGroups(){
249                return groups ;
250        }
251
252        @Override
253        public void setAtomGroups(List<Group> groups){
254                for (Group g:groups){
255                        g.setChain(this);
256                }
257                this.groups = groups;
258        }
259
260        @Override
261        public Group[] getGroupsByPDB(ResidueNumber start, ResidueNumber end, boolean ignoreMissing)
262                        throws StructureException {
263                // Short-circut for include all groups
264                if(start == null && end == null) {
265                        return groups.toArray(new Group[0]);
266                }
267
268                List<Group> retlst = new ArrayList<>();
269
270                boolean adding, foundStart;
271                if( start == null ) {
272                        // start with first group
273                        adding = true;
274                        foundStart = true;
275                } else {
276                        adding = false;
277                        foundStart = false;
278                }
279
280                for (Group g: groups){
281
282                        // Check for start
283                        if (!adding && start.equalsPositional(g.getResidueNumber())) {
284                                adding = true;
285                                foundStart = true;
286                        }
287
288                        // Check if past start
289                        if ( ignoreMissing && ! (foundStart && adding) ) {
290                                ResidueNumber pos = g.getResidueNumber();
291
292                                if ( start != null && start.compareToPositional(pos) <= 0) {
293                                        foundStart = true;
294                                        adding = true;
295                                }
296                        }
297
298                        if ( adding)
299                                retlst.add(g);
300
301                        // check for end
302                        if ( end != null && end.equalsPositional(g.getResidueNumber())) {
303                                if ( ! adding)
304                                        throw new StructureException("did not find start PDB residue number " + start + " in chain " + authId);
305                                adding = false;
306                                break;
307                        }
308                        // check if past end
309                        if ( ignoreMissing && adding && end != null){
310
311                                ResidueNumber pos = g.getResidueNumber();
312                                if ( end.compareToPositional(pos) <= 0) {
313                                        adding = false;
314                                        break;
315                                }
316
317                        }
318                }
319
320                if ( ! foundStart){
321                        throw new StructureException("did not find start PDB residue number " + start + " in chain " + authId);
322                }
323                if ( end != null && adding && !ignoreMissing) {
324                        throw new StructureException("did not find end PDB residue number " + end + " in chain " + authId);
325                }
326
327                //not checking if the end has been found in this case...
328
329                return retlst.toArray(new Group[0]);
330        }
331
332        @Override
333        public Group getGroupByPDB(ResidueNumber resNum) throws StructureException {
334                String pdbresnum = resNum.toString();
335                if ( pdbResnumMap.containsKey(pdbresnum)) {
336                        Integer pos = pdbResnumMap.get(pdbresnum);
337                        return groups.get(pos);
338                } else {
339                        throw new StructureException("unknown PDB residue number " + pdbresnum + " in chain " + authId);
340                }
341        }
342
343        @Override
344        public Group[] getGroupsByPDB(ResidueNumber start, ResidueNumber end)
345                        throws StructureException {
346                return getGroupsByPDB(start, end, false);
347        }
348
349        @Override
350        public int getSeqResLength() {
351                //new method returns the length of the sequence defined in the SEQRES records
352                return seqResGroups.size();
353        }
354
355        @Override
356        public String toString(){
357                String newline = System.getProperty("line.separator");
358                StringBuilder str = new StringBuilder();
359                str.append("Chain asymId:").append(getId()).append(" authId:").append(getName()).append(newline);
360                if ( entity != null ){
361                        if ( entity.getDescription() != null){
362                                str.append(entity.getDescription()).append(newline);
363                        }
364                }
365                str.append("total SEQRES length: ").append(getSeqResGroups().size()).append(" total ATOM length:")
366                .append(getAtomLength()).append(" residues ").append(newline);
367
368                return str.toString() ;
369        }
370
371        @Override
372        public Sequence<?> getBJSequence()  {
373
374                String seq = getSeqResSequence();
375
376                Sequence<AminoAcidCompound> s = null;
377
378                try {
379                        s = new ProteinSequence(seq);
380                } catch (CompoundNotFoundException e) {
381                        logger.error("Could not create sequence object from seqres sequence. Some unknown compound: {}",e.getMessage());
382                }
383
384                //TODO: return a DNA sequence if the content is DNA...
385                return s;
386        }
387
388        @Override
389        public String getAtomSequence(){
390
391                List<Group> groups = getAtomGroups();
392                StringBuilder sequence = new StringBuilder() ;
393
394                for ( Group g: groups){
395                        ChemComp cc = g.getChemComp();
396
397                        if ( PolymerType.PROTEIN_ONLY.contains(cc.getPolymerType()) ||
398                                        PolymerType.POLYNUCLEOTIDE_ONLY.contains(cc.getPolymerType())){
399                                // an amino acid residue.. use for alignment
400                                String oneLetter= ChemCompGroupFactory.getOneLetterCode(cc);
401                                if ( oneLetter == null)
402                                        oneLetter = Character.toString(StructureTools.UNKNOWN_GROUP_LABEL);
403                                sequence.append(oneLetter);
404                        }
405
406                }
407                return sequence.toString();
408        }
409
410        @Override
411        public String getSeqResSequence(){
412
413                StringBuilder str = new StringBuilder();
414                for (Group g : seqResGroups) {
415                        ChemComp cc = g.getChemComp();
416                        if ( cc == null) {
417                                logger.warn("Could not load ChemComp for group: {}", g);
418                                str.append(StructureTools.UNKNOWN_GROUP_LABEL);
419                        } else if ( PolymerType.PROTEIN_ONLY.contains(cc.getPolymerType()) ||
420                                        PolymerType.POLYNUCLEOTIDE_ONLY.contains(cc.getPolymerType())){
421                                // an amino acid residue.. use for alignment
422                                String oneLetter= ChemCompGroupFactory.getOneLetterCode(cc);
423                                // AB oneLetter.length() should be one. e.g. in 1EMA it is 3 and this makes mapping residue to sequence impossible.
424                                if ( oneLetter == null || oneLetter.isEmpty() || "?".equals(oneLetter)) {
425                                        oneLetter = Character.toString(StructureTools.UNKNOWN_GROUP_LABEL);
426                                }
427                                str.append(oneLetter);
428                        } else {
429                                str.append(StructureTools.UNKNOWN_GROUP_LABEL);
430                        }
431                }
432                return str.toString();
433        }
434
435        /**
436         * Get the one letter sequence so that Sequence is guaranteed to
437         * be the same length as seqResGroups.
438         * Method related to https://github.com/biojava/biojava/issues/457
439         * @return a string of the sequence guaranteed to be the same length
440         * as seqResGroups.
441         */
442        public String getSeqResOneLetterSeq(){
443
444                StringBuilder str = new StringBuilder();
445                for (Group g : seqResGroups) {
446                        ChemComp cc = g.getChemComp();
447                        if ( cc == null) {
448                                logger.warn("Could not load ChemComp for group: {}", g);
449                                str.append(StructureTools.UNKNOWN_GROUP_LABEL);
450                        } else if ( PolymerType.PROTEIN_ONLY.contains(cc.getPolymerType()) ||
451                                        PolymerType.POLYNUCLEOTIDE_ONLY.contains(cc.getPolymerType())){
452                                // an amino acid residue.. use for alignment
453                                String oneLetter= ChemCompGroupFactory.getOneLetterCode(cc);
454                                // AB oneLetter.length() should be one. e.g. in 1EMA it is 3 and this makes mapping residue to sequence impossible.
455                                if ( oneLetter == null || oneLetter.isEmpty() || "?".equals(oneLetter) || oneLetter.length()!=1) {
456                                        oneLetter = Character.toString(StructureTools.UNKNOWN_GROUP_LABEL);
457                                }
458                                str.append(oneLetter);
459                        } else {
460                                str.append(StructureTools.UNKNOWN_GROUP_LABEL);
461                        }
462                }
463                return str.toString();
464        }
465
466        @Override
467        public Group getSeqResGroup(int position) {
468                return seqResGroups.get(position);
469        }
470
471        @Override
472        public List<Group> getSeqResGroups(GroupType type) {
473                List<Group> tmp = new ArrayList<>() ;
474                for (Group g : seqResGroups) {
475                        if (g.getType().equals(type)) {
476                                tmp.add(g);
477                        }
478                }
479
480                return tmp ;
481        }
482
483        @Override
484        public List<Group> getSeqResGroups() {
485                return seqResGroups;
486        }
487
488        @Override
489        public void setSeqResGroups(List<Group> groups){
490                for (Group g: groups){
491                        if (g != null) {
492                                g.setChain(this);
493                        }
494                }
495                this.seqResGroups = groups;
496        }
497
498        @Override
499        public int getAtomLength() {
500
501                return groups.size();
502        }
503
504        @Override
505        public String toPDB() {
506                return FileConvert.toPDB(this);
507        }
508
509        @Override
510        public String toMMCIF() {
511                return FileConvert.toMMCIF(this);
512        }
513
514        @Override
515        public void setSeqMisMatches(List<SeqMisMatch> seqMisMatches) {
516                this.seqMisMatches = seqMisMatches;
517        }
518
519        @Override
520        public List<SeqMisMatch> getSeqMisMatches() {
521                return seqMisMatches;
522        }
523
524        @Override
525        public EntityType getEntityType() {
526                if (getEntityInfo()==null) return null;
527                return getEntityInfo().getType();
528        }
529
530        @Override
531        public boolean isWaterOnly() {
532                for (Group g : getAtomGroups()) {
533                        if (!g.isWater())
534                                return false;
535                }
536                return true;
537        }
538
539        @Override
540        public boolean isPureNonPolymer() {
541                for (Group g : getAtomGroups()) {
542
543                        //ChemComp cc = g.getChemComp();
544
545                        if (    g.isPolymeric() &&
546                                        !g.isHetAtomInFile() ) {
547
548                                // important: the aminoacid or nucleotide residue can be in Atom records
549
550                                return false;
551                        }
552
553                }
554                return true;
555        }
556
557        @Override
558        public GroupType getPredominantGroupType(){
559
560                double ratioResiduesToTotal = StructureTools.RATIO_RESIDUES_TO_TOTAL;
561
562                int sizeAminos = getAtomGroups(GroupType.AMINOACID).size();
563                int sizeNucleotides = getAtomGroups(GroupType.NUCLEOTIDE).size();
564                List<Group> hetAtoms = getAtomGroups(GroupType.HETATM);
565                int sizeHetatoms = hetAtoms.size();
566                int sizeWaters = 0;
567                for (Group g : hetAtoms) {
568                        if (g.isWater())
569                                sizeWaters++;
570                }
571                int sizeHetatomsWithoutWater = sizeHetatoms - sizeWaters;
572
573                int fullSize = sizeAminos + sizeNucleotides + sizeHetatomsWithoutWater;
574
575                if ((double) sizeAminos / (double) fullSize > ratioResiduesToTotal)
576                        return GroupType.AMINOACID;
577
578                if ((double) sizeNucleotides / (double) fullSize > ratioResiduesToTotal)
579                        return GroupType.NUCLEOTIDE;
580
581                if ((double) (sizeHetatomsWithoutWater) / (double) fullSize > ratioResiduesToTotal)
582                        return GroupType.HETATM;
583
584                // finally if neither condition works, we try based on majority, but log
585                // it
586                GroupType max;
587                if (sizeNucleotides > sizeAminos) {
588                        if (sizeNucleotides > sizeHetatomsWithoutWater) {
589                                max = GroupType.NUCLEOTIDE;
590                        } else {
591                                max = GroupType.HETATM;
592                        }
593                } else {
594                        if (sizeAminos > sizeHetatomsWithoutWater) {
595                                max = GroupType.AMINOACID;
596                        } else {
597                                max = GroupType.HETATM;
598                        }
599                }
600                logger.debug("Ratio of residues to total for chain with asym_id {} is below {}. Assuming it is a {} chain. Counts: # aa residues: {}, # nuc residues: {}, # non-water het residues: {}, # waters: {}, ratio aa/total: {}, ratio nuc/total: {}{}{}{}{}", getId(), ratioResiduesToTotal, max, sizeAminos, sizeNucleotides, sizeHetatomsWithoutWater, sizeWaters, (double) sizeAminos, (double) fullSize, (double) sizeNucleotides, (double) fullSize);
601
602                return max;
603        }
604
605        @Override
606        public  boolean isProtein() {
607                return getPredominantGroupType() == GroupType.AMINOACID;
608        }
609
610        @Override
611        public  boolean isNucleicAcid() {
612                return getPredominantGroupType() == GroupType.NUCLEOTIDE;
613        }
614
615
616}
617