001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021package org.biojava.nbio.structure.io;
022
023import org.biojava.nbio.alignment.Alignments;
024import org.biojava.nbio.alignment.Alignments.PairwiseSequenceAlignerType;
025import org.biojava.nbio.alignment.SimpleGapPenalty;
026import org.biojava.nbio.core.alignment.matrices.SubstitutionMatrixHelper;
027import org.biojava.nbio.alignment.template.GapPenalty;
028import org.biojava.nbio.alignment.template.PairwiseSequenceAligner;
029import org.biojava.nbio.core.alignment.template.SequencePair;
030import org.biojava.nbio.core.alignment.template.SubstitutionMatrix;
031import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
032import org.biojava.nbio.core.sequence.DNASequence;
033import org.biojava.nbio.core.sequence.ProteinSequence;
034import org.biojava.nbio.core.sequence.RNASequence;
035import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
036import org.biojava.nbio.core.sequence.compound.NucleotideCompound;
037import org.biojava.nbio.structure.*;
038import org.slf4j.Logger;
039import org.slf4j.LoggerFactory;
040
041import java.util.ArrayList;
042import java.util.Collections;
043import java.util.Comparator;
044import java.util.HashMap;
045import java.util.List;
046import java.util.Map;
047import java.util.Set;
048import java.util.TreeMap;
049import java.util.TreeSet;
050
051/**
052 * Heuristical finding of Compounds (called Entities in mmCIF dictionary)
053 * in a given Structure. Compounds are the groups of sequence identical NCS-related polymer chains
054 * in the Structure.
055 *
056 * This is related to {@link SeqRes2AtomAligner} but it is intended for raw PDB files where
057 * possibly no SEQRES is given.
058 *
059 * @author duarte_j
060 */
061public class CompoundFinder {
062
063        private Structure s;
064
065        private static final Logger logger = LoggerFactory.getLogger(CompoundFinder.class);
066
067        /**
068         * Above this ratio of mismatching residue types for same residue numbers we
069         * consider the 2 chains to have mismatching residue numbers and warn about it
070         */
071        public static final double RATIO_GAPS_FOR_MISMATCH = 0.50;
072
073        /**
074         * Identity value for 2 chains to be considered part of same entity
075         */
076        public static final double IDENTITY_THRESHOLD = 0.99999;
077
078        /**
079         * Gap coverage value (num gaps over length of sequence) for each chain of the match:
080         * 2 chains with more gap coverage than this value will not be considered part of the same entity
081         */
082        public static final double GAP_COVERAGE_THRESHOLD = 0.3;
083
084
085        public CompoundFinder(Structure s) {
086                this.s = s;
087        }
088
089        /**
090         * Utility method that employs some heuristics to find the Compounds
091         * for this Structure in case the information is missing in PDB/mmCIF file
092         * @return
093         */
094        public List<Compound> findCompounds() {
095
096                TreeMap<String,Compound> chainIds2entities = findCompoundsFromAlignment();
097
098                List<Compound> compounds = findUniqueCompounds(chainIds2entities);
099
100                createPurelyNonPolyCompounds(compounds);
101
102                return compounds;
103        }
104
105        /**
106         * Utility method to obtain a list of unique entities from the chainIds2entities map
107         * @return
108         */
109        private static List<Compound> findUniqueCompounds(TreeMap<String,Compound> chainIds2entities) {
110
111                List<Compound> list = new ArrayList<Compound>();
112
113                for (Compound cluster:chainIds2entities.values()) {
114                        boolean present = false;
115                        for (Compound cl:list) {
116                                if (cl==cluster) {
117                                        present = true;
118                                        break;
119                                }
120                        }
121                        if (!present) list.add(cluster);
122                }
123                return list;
124        }
125
126        private void createPurelyNonPolyCompounds(List<Compound> compounds) {
127
128                List<Chain> pureNonPolymerChains = new ArrayList<Chain>();
129                for (int i=0;i<s.getChains().size();i++) {
130                        if (StructureTools.isChainPureNonPolymer(s.getChain(i))) {
131                                pureNonPolymerChains.add(s.getChain(i));
132                        }
133                }
134
135                if (!pureNonPolymerChains.isEmpty()) {
136
137                        int maxMolId = 0; // if we have no compounds at all we just use 0, so that first assignment is 1
138                        if (!compounds.isEmpty()) {
139                                Collections.max(compounds, new Comparator<Compound>() {
140                                        @Override
141                                        public int compare(Compound o1, Compound o2) {
142                                                return new Integer(o1.getMolId()).compareTo(o2.getMolId());
143                                        }
144                                }).getMolId();
145                        }
146
147                        int molId = maxMolId + 1;
148                        // for the purely non-polymeric chains we assign additional compounds
149                        for (Chain c: pureNonPolymerChains) {
150                                Compound comp = new Compound();
151                                comp.addChain(c);
152                                comp.setMolId(molId);
153                                logger.warn("Chain {} is purely non-polymeric, will assign a new Compound (entity) to it (entity id {})", c.getChainID(), molId);
154                                molId++;
155
156                                compounds.add(comp);
157                        }
158
159                }
160        }
161
162
163        private static boolean areResNumbersAligned(Chain c1, Chain c2) {
164
165                boolean isC1prot = StructureTools.isProtein(c1);
166                boolean isC2prot = StructureTools.isProtein(c2);
167
168                // different kind of chain: we won't try to align them
169                if (isC1prot != isC2prot ) return false;
170
171                List<Group> c1AtomGroups = null;
172                if (isC1prot) {
173                        c1AtomGroups = c1.getAtomGroups(GroupType.AMINOACID);
174                }
175                else {
176                        c1AtomGroups = c1.getAtomGroups(GroupType.NUCLEOTIDE);
177                }
178
179                int countNonExisting = 0;
180
181                for (Group g1:c1AtomGroups) {
182                        try {
183                                Group g2 = c2.getGroupByPDB(g1.getResidueNumber());
184                                if (!g2.getPDBName().equals(g1.getPDBName())) {
185                                        logger.debug("Mismatch of residues between chains {},{} for residue number {}: {} {}",
186                                                        c1.getChainID(),c2.getChainID(),g1.getResidueNumber(), g1.getPDBName(), g2.getPDBName());
187                                        return false;
188                                }
189                        } catch (StructureException e) {
190                                // the group doesn't exist (no density) in the chain, go on
191                                countNonExisting++;
192                                continue;
193                        }
194                }
195
196                if ((double)countNonExisting/(double)c1AtomGroups.size() > RATIO_GAPS_FOR_MISMATCH) {
197                        logger.debug("More than {} of the residues ({} out of {}) are not present in chain {} when comparing by residue numbers to chain {}.",
198                                        RATIO_GAPS_FOR_MISMATCH, countNonExisting, c1AtomGroups.size(), c2.getChainID(), c1.getChainID());
199                        return false;
200                }
201
202                return true;
203        }
204
205
206
207        private TreeMap<String,Compound> findCompoundsFromAlignment() {
208
209                // first we determine which chains to consider: anything not looking
210                // polymeric (protein or nucleotide chain) should be discarded
211                Set<Integer> polyChainIndices = new TreeSet<Integer>();
212                List<Chain> pureNonPolymerChains = new ArrayList<Chain>();
213                for (int i=0;i<s.getChains().size();i++) {
214                        if (StructureTools.isChainPureNonPolymer(s.getChain(i))) {
215                                pureNonPolymerChains.add(s.getChain(i));
216                        } else {
217                                polyChainIndices.add(i);
218                        }
219                }
220
221
222                TreeMap<String, Compound> chainIds2compounds = new TreeMap<String,Compound>();
223
224                int molId = 1;
225
226                outer:
227                        for (int i:polyChainIndices) {
228                                for (int j:polyChainIndices) {
229
230                                        if (j<=i) continue;
231
232                                        Chain c1 = s.getChain(i);
233                                        Chain c2 = s.getChain(j);
234
235                                        Map<Integer,Integer> positionIndex1 = new HashMap<Integer, Integer>();
236                                        Map<Integer,Integer> positionIndex2 = new HashMap<Integer, Integer>();
237                                        // here we use false, which means that X will be used for unknown compounds
238                                        String str1 = SeqRes2AtomAligner.getFullAtomSequence(c1.getAtomGroups(), positionIndex1, false);
239                                        String str2 = SeqRes2AtomAligner.getFullAtomSequence(c2.getAtomGroups(), positionIndex2, false);
240
241                                        int seq1Length = 0;
242                                        int seq2Length = 0;
243
244                                        SequencePair<?,?> pair = null;
245                                        if (isProteinSequence(str1) && isProteinSequence(str2)) {
246                                                ProteinSequence s1 = getProteinSequence(str1);
247                                                ProteinSequence s2 = getProteinSequence(str2);
248                                                seq1Length = s1.getLength();
249                                                seq2Length = s2.getLength();
250
251                                                pair = alignProtein(s1,s2);
252
253                                        } else if (isDNASequence(str1) && isDNASequence(str2)) {
254                                                DNASequence s1 = getDNASequence(str1);
255                                                DNASequence s2 = getDNASequence(str2);
256                                                seq1Length = s1.getLength();
257                                                seq2Length = s2.getLength();
258
259                                                pair = alignDNA(s1,s2);
260
261                                        } else if (isRNASequence(str1) && isRNASequence(str2)) {
262                                                RNASequence s1 = getRNASequence(str1);
263                                                RNASequence s2 = getRNASequence(str2);
264                                                seq1Length = s1.getLength();
265                                                seq2Length = s2.getLength();
266
267                                                pair = alignRNA(s1,s2);
268
269                                        } else {
270                                                logger.debug("Chains {},{} are either different kind of polymers or could not be recognized as protein or nucleotide polymers");
271                                                continue;
272                                        }
273
274
275                                        int numGaps = getNumGaps(pair);
276                                        int numGaps1 = getNumGapsQuery(pair);
277                                        int numGaps2 = getNumGapsTarget(pair);
278
279                                        int nonGaps = pair.getLength() - numGaps;
280
281                                        double identity = (double)pair.getNumIdenticals()/(double)nonGaps;
282                                        double gapCov1 = (double) numGaps1 / (double) seq1Length;
283                                        double gapCov2 = (double) numGaps2 / (double) seq2Length;
284
285                                        logger.debug("Alignment for chain pair {},{}: identity: {}, gap coverage 1: {}, gap coverage 2: {}",
286                                                        c1.getChainID(), c2.getChainID(), String.format("%4.2f",identity), String.format("%4.2f",gapCov1), String.format("%4.2f",gapCov2));
287                                        logger.debug("\n"+pair.toString(100));
288
289                                        if (identity > IDENTITY_THRESHOLD && gapCov1<GAP_COVERAGE_THRESHOLD && gapCov2<GAP_COVERAGE_THRESHOLD) {
290                                                if (    !chainIds2compounds.containsKey(c1.getChainID()) &&
291                                                                !chainIds2compounds.containsKey(c2.getChainID())) {
292                                                        logger.debug("Creating Compound with chains {},{}",c1.getChainID(),c2.getChainID());
293
294                                                        Compound ent = new Compound();
295                                                        ent.addChain(c1);
296                                                        ent.addChain(c2);
297                                                        ent.setMolId(molId++);
298                                                        chainIds2compounds.put(c1.getChainID(), ent);
299                                                        chainIds2compounds.put(c2.getChainID(), ent);
300
301
302                                                } else {
303                                                        Compound ent = chainIds2compounds.get(c1.getChainID());
304
305                                                        if (ent==null) {
306                                                                logger.debug("Adding chain {} to entity {}",c1.getChainID(),c2.getChainID());
307                                                                ent = chainIds2compounds.get(c2.getChainID());
308                                                                ent.addChain(c1);
309                                                                chainIds2compounds.put(c1.getChainID(), ent);
310
311                                                        } else {
312                                                                logger.debug("Adding chain {} to entity {}",c2.getChainID(),c1.getChainID());
313                                                                ent.addChain(c2);
314                                                                chainIds2compounds.put(c2.getChainID(), ent);
315
316                                                        }
317                                                }
318                                                if (!areResNumbersAligned(c1, c2)) {
319                                                        logger.warn("Including 100% identical chains {},{} in same Compound, although they have misaligned residue numbers",
320                                                                        c1.getChainID(),c2.getChainID());
321                                                }
322                                        }
323
324                                        if (identity>1) {
325                                                logger.warn("Identity for chains {},{} above 1. {} identicals out of {} non-gap-aligned residues (identity {})",
326                                                                c1.getChainID(),c2.getChainID(),pair.getNumIdenticals(),nonGaps,identity);
327                                                logger.warn("\n"+pair.toString(100));
328                                        }
329
330                                        if (chainIds2compounds.size()==polyChainIndices.size()) // we've got all chains in entities
331                                                break outer;
332                                }
333                        }
334
335                // anything not in a Compound will be its own Compound
336                for (int i:polyChainIndices) {
337                        Chain c = s.getChain(i);
338                        if (!chainIds2compounds.containsKey(c.getChainID())) {
339                                logger.debug("Creating a 1-member Compound for chain {}",c.getChainID());
340
341                                Compound ent = new Compound();
342                                ent.addChain(c);
343                                ent.setMolId(molId++);
344
345                                chainIds2compounds.put(c.getChainID(),ent);
346                        }
347                }
348
349
350                return chainIds2compounds;
351        }
352
353        private SequencePair<ProteinSequence, AminoAcidCompound> alignProtein(ProteinSequence s1, ProteinSequence s2) {
354                SubstitutionMatrix<AminoAcidCompound> matrix = SubstitutionMatrixHelper.getIdentity();
355
356                GapPenalty penalty = new SimpleGapPenalty(8, 1);
357
358                PairwiseSequenceAligner<ProteinSequence, AminoAcidCompound> nw =
359                                Alignments.getPairwiseAligner(s1, s2, PairwiseSequenceAlignerType.GLOBAL, penalty, matrix);
360
361                return nw.getPair();
362        }
363
364        private SequencePair<DNASequence, NucleotideCompound> alignDNA(DNASequence s1, DNASequence s2) {
365                SubstitutionMatrix<NucleotideCompound> matrix = SubstitutionMatrixHelper.getNuc4_4();
366
367                GapPenalty penalty = new SimpleGapPenalty(8, 1);
368
369                PairwiseSequenceAligner<DNASequence, NucleotideCompound> nw =
370                                Alignments.getPairwiseAligner(s1, s2, PairwiseSequenceAlignerType.GLOBAL, penalty, matrix);
371
372                return nw.getPair();
373        }
374
375        private SequencePair<RNASequence, NucleotideCompound> alignRNA(RNASequence s1, RNASequence s2) {
376                SubstitutionMatrix<NucleotideCompound> matrix = SubstitutionMatrixHelper.getNuc4_4();
377
378                GapPenalty penalty = new SimpleGapPenalty(8, 1);
379
380                PairwiseSequenceAligner<RNASequence, NucleotideCompound> nw =
381                                Alignments.getPairwiseAligner(s1, s2, PairwiseSequenceAlignerType.GLOBAL, penalty, matrix);
382
383                return nw.getPair();
384        }
385
386
387
388        private static int getNumGaps(SequencePair<?, ?> pair) {
389                int numGaps = 0;
390                for (int alignmentIndex=1;alignmentIndex<=pair.getLength();alignmentIndex++) {
391                        if (pair.hasGap(alignmentIndex)) numGaps++;
392                }
393                return numGaps;
394        }
395
396        private static int getNumGapsQuery(SequencePair<?, ?> pair) {
397                int numGaps = 0;
398                for (int alignmentIndex=1;alignmentIndex<=pair.getLength();alignmentIndex++) {
399                        if (pair.getCompoundInQueryAt(alignmentIndex).getShortName().equals("-")) {
400                                numGaps++;
401                        }
402                }
403                return numGaps;
404        }
405
406        private static int getNumGapsTarget(SequencePair<?, ?> pair) {
407                int numGaps = 0;
408                for (int alignmentIndex=1;alignmentIndex<=pair.getLength();alignmentIndex++) {
409                        if (pair.getCompoundInTargetAt(alignmentIndex).getShortName().equals("-")) {
410                                numGaps++;
411                        }
412                }
413                return numGaps;
414        }
415
416        private boolean isProteinSequence(String str) {
417                try {
418                        new ProteinSequence(str);
419                } catch (CompoundNotFoundException e) {
420
421                        return false;
422                }
423                return true;
424        }
425
426        private boolean isDNASequence(String str) {
427                try {
428                        new DNASequence(str);
429                } catch (CompoundNotFoundException e) {
430
431                        return false;
432                }
433                return true;
434        }
435
436        private boolean isRNASequence(String str) {
437                try {
438                        new RNASequence(str);
439                } catch (CompoundNotFoundException e) {
440
441                        return false;
442                }
443                return true;
444
445        }
446
447        /**
448         * Returns the ProteinSequence or null if one can't be created
449         * @param str
450         * @return
451         */
452        private ProteinSequence getProteinSequence(String str) {
453                try {
454                        ProteinSequence s = new ProteinSequence(str);
455                        return s;
456                } catch (CompoundNotFoundException e) {
457
458                        logger.error("Unexpected error when creating ProteinSequence",e);
459                }
460                return null;
461        }
462
463        /**
464         * Returns the DNASequence or null if one can't be created
465         * @param str
466         * @return
467         */
468        private DNASequence getDNASequence(String str) {
469                try {
470                        DNASequence s = new DNASequence(str);
471                        return s;
472                } catch (CompoundNotFoundException e) {
473
474                        logger.error("Unexpected error when creating DNASequence ",e);
475                }
476                return null;
477        }
478
479        /**
480         * Returns the RNASequence or null if one can't be created
481         * @param str
482         * @return
483         */
484        private RNASequence getRNASequence(String str) {
485                try {
486                        RNASequence s = new RNASequence(str);
487                        return s;
488                } catch (CompoundNotFoundException e) {
489
490                        logger.error("Unexpected error when creating RNASequence ",e);
491                }
492                return null;
493        }
494
495}