Source code

001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021package org.biojava.nbio.structure.io;
022
023import org.biojava.nbio.alignment.Alignments;
024import org.biojava.nbio.alignment.Alignments.PairwiseSequenceAlignerType;
025import org.biojava.nbio.alignment.SimpleGapPenalty;
026import org.biojava.nbio.core.alignment.matrices.SubstitutionMatrixHelper;
027import org.biojava.nbio.alignment.template.GapPenalty;
028import org.biojava.nbio.alignment.template.PairwiseSequenceAligner;
029import org.biojava.nbio.core.alignment.template.SequencePair;
030import org.biojava.nbio.core.alignment.template.SubstitutionMatrix;
031import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
032import org.biojava.nbio.core.sequence.DNASequence;
033import org.biojava.nbio.core.sequence.ProteinSequence;
034import org.biojava.nbio.core.sequence.RNASequence;
035import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
036import org.biojava.nbio.core.sequence.compound.NucleotideCompound;
037import org.biojava.nbio.structure.*;
038import org.slf4j.Logger;
039import org.slf4j.LoggerFactory;
040
041import java.util.ArrayList;
042import java.util.Collections;
043import java.util.Comparator;
044import java.util.HashMap;
045import java.util.List;
046import java.util.Map;
047import java.util.Set;
048import java.util.TreeMap;
049import java.util.TreeSet;
050
051/**
052 * Heuristical finding of Entities (called Compounds in legacy PDB format)
053 * in a given Structure. Entities are the groups of sequence identical NCS-related polymer chains
054 * in the Structure.
055 *
056 * This is related to {@link SeqRes2AtomAligner} but it is intended for raw PDB/mmCIF files where
057 * possibly no SEQRES is given.
058 *
059 * @author Jose Duarte
060 */
061public class EntityFinder {
062
063        private static final Logger logger = LoggerFactory.getLogger(EntityFinder.class);
064        
065        /**
066         * Above this ratio of mismatching residue types for same residue numbers we
067         * consider the 2 chains to have mismatching residue numbers and warn about it
068         */
069        public static final double RATIO_GAPS_FOR_MISMATCH = 0.50;
070
071        /**
072         * Identity value for 2 chains to be considered part of same entity
073         */
074        public static final double IDENTITY_THRESHOLD = 0.99999;
075
076        /**
077         * Gap coverage value (num gaps over length of sequence) for each chain of the match:
078         * 2 chains with more gap coverage than this value will not be considered part of the same entity
079         */
080        public static final double GAP_COVERAGE_THRESHOLD = 0.3;
081
082        
083        
084        
085        /**
086         * Utility method that employs some heuristics to find the {@link EntityInfo}s
087         * for the polymeric chains given in constructor. 
088         * To be used in case the information is missing in PDB/mmCIF file 
089         * @return
090         */
091        public static List<EntityInfo> findPolyEntities(List<List<Chain>> polyModels) {
092                TreeMap<String,EntityInfo> chainIds2entities = findEntitiesFromAlignment(polyModels);
093
094                List<EntityInfo> entities = findUniqueEntities(chainIds2entities);
095
096                return entities;
097        }
098
099        /**
100         * Utility method to obtain a list of unique entities from the chainIds2entities map
101         * @return
102         */
103        private static List<EntityInfo> findUniqueEntities(TreeMap<String,EntityInfo> chainIds2entities) {
104
105                List<EntityInfo> list = new ArrayList<EntityInfo>();
106
107                for (EntityInfo cluster:chainIds2entities.values()) {
108                        boolean present = false;
109                        for (EntityInfo cl:list) {
110                                if (cl==cluster) {
111                                        present = true;
112                                        break;
113                                }
114                        }
115                        if (!present) list.add(cluster);
116                }
117                return list;
118        }
119
120        /**
121         * Given all chains of all models find entities for the nonpolymers and water chains within them,
122         * assigning entity ids, types and descriptions to them. The result is written back to the passed entities List.
123         * @param nonPolyModels
124         * @param waterModels
125         * @param entities
126         */
127        public static void createPurelyNonPolyEntities(List<List<Chain>> nonPolyModels, List<List<Chain>> waterModels, List<EntityInfo> entities) {
128
129                if (nonPolyModels.isEmpty()) return;
130                
131                // let's find first the max entity id to assign entity ids to the newly found entities
132                int maxMolId = 0;
133                if (!entities.isEmpty()) {
134                        maxMolId = Collections.max(entities, new Comparator<EntityInfo>() {
135                                @Override
136                                public int compare(EntityInfo o1, EntityInfo o2) {
137                                        return new Integer(o1.getMolId()).compareTo(o2.getMolId());
138                                }
139                        }).getMolId();
140                }
141                // we go one over the max
142                int molId = maxMolId + 1;
143                                
144                if (!nonPolyModels.get(0).isEmpty()) {
145                        List<EntityInfo> nonPolyEntities = new ArrayList<>();
146                        
147                        for (List<Chain> model:nonPolyModels) {
148                                for (Chain c: model) {
149                                        // we assume there's only 1 group per non-poly chain
150                                        String molecPdbName = c.getAtomGroup(0).getPDBName();
151
152                                        EntityInfo nonPolyEntity = findNonPolyEntityWithDescription(molecPdbName, nonPolyEntities);
153                                        if (nonPolyEntity == null) {
154                                                nonPolyEntity = new EntityInfo();
155                                                nonPolyEntity.setDescription(molecPdbName);
156                                                nonPolyEntity.setType(EntityType.NONPOLYMER);
157                                                nonPolyEntity.setMolId(molId++);
158                                                nonPolyEntities.add(nonPolyEntity);
159
160                                        }
161                                        nonPolyEntity.addChain(c);
162                                        c.setEntityInfo(nonPolyEntity);                         
163                                }
164                        }
165                        entities.addAll(nonPolyEntities);
166                }
167                
168                if (!waterModels.get(0).isEmpty()) {
169                        EntityInfo waterEntity = new EntityInfo();
170                        waterEntity.setType(EntityType.WATER);
171                        waterEntity.setDescription("water");
172                        waterEntity.setMolId(molId);
173
174                        for (List<Chain> model:waterModels) { 
175                                for (Chain waterChain:model) {
176                                        waterEntity.addChain(waterChain);
177                                        waterChain.setEntityInfo(waterEntity);
178                                }
179                        }
180                        entities.add(waterEntity);
181                        
182                }
183                
184                
185        }
186        
187        private static EntityInfo findNonPolyEntityWithDescription(String description, List<EntityInfo> nonPolyEntities) {
188                for (EntityInfo e:nonPolyEntities) {
189                        if (e.getDescription().equals(description)) return e;
190                }
191                return null;
192        }
193
194
195        private static boolean areResNumbersAligned(Chain c1, Chain c2) {
196
197                boolean isC1prot = c1.isProtein();
198                boolean isC2prot = c2.isProtein();
199
200                // different kind of chain: we won't try to align them
201                if (isC1prot != isC2prot ) return false;
202
203                List<Group> c1AtomGroups = null;
204                if (isC1prot) {
205                        c1AtomGroups = c1.getAtomGroups(GroupType.AMINOACID);
206                }
207                else {
208                        c1AtomGroups = c1.getAtomGroups(GroupType.NUCLEOTIDE);
209                }
210
211                int countNonExisting = 0;
212
213                for (Group g1:c1AtomGroups) {
214                        try {
215                                Group g2 = c2.getGroupByPDB(g1.getResidueNumber());
216                                if (!g2.getPDBName().equals(g1.getPDBName())) {
217                                        logger.debug("Mismatch of residues between chains {},{} for residue number {}: {} {}",
218                                                        c1.getId(),c2.getId(),g1.getResidueNumber(), g1.getPDBName(), g2.getPDBName());
219                                        return false;
220                                }
221                        } catch (StructureException e) {
222                                // the group doesn't exist (no density) in the chain, go on
223                                countNonExisting++;
224                                continue;
225                        }
226                }
227
228                if ((double)countNonExisting/(double)c1AtomGroups.size() > RATIO_GAPS_FOR_MISMATCH) {
229                        logger.debug("More than {} of the residues ({} out of {}) are not present in chain {} when comparing by residue numbers to chain {}.",
230                                        RATIO_GAPS_FOR_MISMATCH, countNonExisting, c1AtomGroups.size(), c2.getId(), c1.getId());
231                        return false;
232                }
233
234                return true;
235        }
236
237
238
239        private static TreeMap<String,EntityInfo> findEntitiesFromAlignment(List<List<Chain>> polyModels) {
240
241
242
243                TreeMap<String, EntityInfo> chainIds2entities = new TreeMap<String,EntityInfo>();
244
245                if (polyModels.isEmpty()) return chainIds2entities;
246                
247                Set<Integer> polyChainIndices = new TreeSet<Integer>();
248                for (int i=0;i<polyModels.get(0).size();i++) {
249                        polyChainIndices.add(i);
250                }
251
252                
253                int molId = 1;
254
255                outer:
256                        for (int i:polyChainIndices) {
257                                for (int j:polyChainIndices) {
258
259                                        if (j<=i) continue;
260
261                                        Chain c1 = polyModels.get(0).get(i);
262                                        Chain c2 = polyModels.get(0).get(j);
263
264                                        Map<Integer,Integer> positionIndex1 = new HashMap<Integer, Integer>();
265                                        Map<Integer,Integer> positionIndex2 = new HashMap<Integer, Integer>();
266                                        // here we use false, which means that X will be used for unknown compounds
267                                        String str1 = SeqRes2AtomAligner.getFullAtomSequence(c1.getAtomGroups(), positionIndex1, false);
268                                        String str2 = SeqRes2AtomAligner.getFullAtomSequence(c2.getAtomGroups(), positionIndex2, false);
269
270                                        int seq1Length = 0;
271                                        int seq2Length = 0;
272
273                                        SequencePair<?,?> pair = null;
274                                        if (isProteinSequence(str1) && isProteinSequence(str2)) {
275                                                ProteinSequence s1 = getProteinSequence(str1);
276                                                ProteinSequence s2 = getProteinSequence(str2);
277                                                seq1Length = s1.getLength();
278                                                seq2Length = s2.getLength();
279
280                                                pair = alignProtein(s1,s2);
281
282                                        } else if (isDNASequence(str1) && isDNASequence(str2)) {
283                                                DNASequence s1 = getDNASequence(str1);
284                                                DNASequence s2 = getDNASequence(str2);
285                                                seq1Length = s1.getLength();
286                                                seq2Length = s2.getLength();
287
288                                                pair = alignDNA(s1,s2);
289
290                                        } else if (isRNASequence(str1) && isRNASequence(str2)) {
291                                                RNASequence s1 = getRNASequence(str1);
292                                                RNASequence s2 = getRNASequence(str2);
293                                                seq1Length = s1.getLength();
294                                                seq2Length = s2.getLength();
295
296                                                pair = alignRNA(s1,s2);
297
298                                        } else {
299                                                logger.debug("Chains {},{} are either different kind of polymers or could not be recognized as protein or nucleotide polymers");
300                                                continue;
301                                        }
302
303
304                                        int numGaps = getNumGaps(pair);
305                                        int numGaps1 = getNumGapsQuery(pair);
306                                        int numGaps2 = getNumGapsTarget(pair);
307
308                                        int nonGaps = pair.getLength() - numGaps;
309
310                                        double identity = (double)pair.getNumIdenticals()/(double)nonGaps;
311                                        double gapCov1 = (double) numGaps1 / (double) seq1Length;
312                                        double gapCov2 = (double) numGaps2 / (double) seq2Length;
313
314                                        logger.debug("Alignment for chain pair {},{}: identity: {}, gap coverage 1: {}, gap coverage 2: {}",
315                                                        c1.getId(), c2.getId(), String.format("%4.2f",identity), String.format("%4.2f",gapCov1), String.format("%4.2f",gapCov2));
316                                        logger.debug("\n"+pair.toString(100));
317
318                                        if (identity > IDENTITY_THRESHOLD && gapCov1<GAP_COVERAGE_THRESHOLD && gapCov2<GAP_COVERAGE_THRESHOLD) {
319                                                if (    !chainIds2entities.containsKey(c1.getId()) &&
320                                                                !chainIds2entities.containsKey(c2.getId())) {
321                                                        logger.debug("Creating Compound with chains {},{}",c1.getId(),c2.getId());
322
323                                                        EntityInfo ent = new EntityInfo();
324                                                        ent.addChain(c1);
325                                                        ent.addChain(c2);
326                                                        ent.setMolId(molId++);
327                                                        ent.setType(EntityType.POLYMER);
328                                                        c1.setEntityInfo(ent);
329                                                        c2.setEntityInfo(ent);
330                                                        chainIds2entities.put(c1.getId(), ent);
331                                                        chainIds2entities.put(c2.getId(), ent);
332
333
334                                                } else {
335                                                        EntityInfo ent = chainIds2entities.get(c1.getId());
336
337                                                        if (ent==null) {
338                                                                logger.debug("Adding chain {} to entity {}",c1.getId(),c2.getId());
339                                                                ent = chainIds2entities.get(c2.getId());
340                                                                ent.addChain(c1);
341                                                                c1.setEntityInfo(ent);
342                                                                chainIds2entities.put(c1.getId(), ent);
343
344                                                        } else {
345                                                                logger.debug("Adding chain {} to entity {}",c2.getId(),c1.getId());
346                                                                ent.addChain(c2);
347                                                                c2.setEntityInfo(ent);
348                                                                chainIds2entities.put(c2.getId(), ent);
349
350                                                        }
351                                                }
352                                                if (!areResNumbersAligned(c1, c2)) {
353                                                        logger.warn("Including 100% identical chains {},{} in same Compound, although they have misaligned residue numbers",
354                                                                        c1.getId(),c2.getId());
355                                                }
356                                        }
357
358                                        if (identity>1) {
359                                                logger.warn("Identity for chains {},{} above 1. {} identicals out of {} non-gap-aligned residues (identity {})",
360                                                                c1.getId(),c2.getId(),pair.getNumIdenticals(),nonGaps,identity);
361                                                logger.warn("\n"+pair.toString(100));
362                                        }
363
364                                        if (chainIds2entities.size()==polyChainIndices.size()) // we've got all chains in entities
365                                                break outer;
366                                }
367                        }
368
369                // anything not in an Entity will be its own Entity
370                for (int i:polyChainIndices) {
371                        Chain c = polyModels.get(0).get(i);
372                        if (!chainIds2entities.containsKey(c.getId())) {
373                                logger.debug("Creating a 1-member Compound for chain {}",c.getId());
374
375                                EntityInfo ent = new EntityInfo();
376                                ent.addChain(c);
377                                ent.setMolId(molId++);
378                                c.setEntityInfo(ent); 
379
380                                chainIds2entities.put(c.getId(),ent);
381                        }
382                }
383                
384                // now that we've found the entities for first model, let's do the same for the rest of the models
385                
386                for (int i=1;i<polyModels.size();i++) {
387                        for (Chain chain : polyModels.get(i)) {
388                                EntityInfo e = chainIds2entities.get(chain.getId());
389                                chain.setEntityInfo(e);
390                                e.addChain(chain);
391                        }
392                }
393
394
395                return chainIds2entities;
396        }
397
398        private static SequencePair<ProteinSequence, AminoAcidCompound> alignProtein(ProteinSequence s1, ProteinSequence s2) {
399                SubstitutionMatrix<AminoAcidCompound> matrix = SubstitutionMatrixHelper.getIdentity();
400
401                GapPenalty penalty = new SimpleGapPenalty(8, 1);
402
403                PairwiseSequenceAligner<ProteinSequence, AminoAcidCompound> nw =
404                                Alignments.getPairwiseAligner(s1, s2, PairwiseSequenceAlignerType.GLOBAL, penalty, matrix);
405
406                return nw.getPair();
407        }
408
409        private static SequencePair<DNASequence, NucleotideCompound> alignDNA(DNASequence s1, DNASequence s2) {
410                SubstitutionMatrix<NucleotideCompound> matrix = SubstitutionMatrixHelper.getNuc4_4();
411
412                GapPenalty penalty = new SimpleGapPenalty(8, 1);
413
414                PairwiseSequenceAligner<DNASequence, NucleotideCompound> nw =
415                                Alignments.getPairwiseAligner(s1, s2, PairwiseSequenceAlignerType.GLOBAL, penalty, matrix);
416
417                return nw.getPair();
418        }
419
420        private static SequencePair<RNASequence, NucleotideCompound> alignRNA(RNASequence s1, RNASequence s2) {
421                SubstitutionMatrix<NucleotideCompound> matrix = SubstitutionMatrixHelper.getNuc4_4();
422
423                GapPenalty penalty = new SimpleGapPenalty(8, 1);
424
425                PairwiseSequenceAligner<RNASequence, NucleotideCompound> nw =
426                                Alignments.getPairwiseAligner(s1, s2, PairwiseSequenceAlignerType.GLOBAL, penalty, matrix);
427
428                return nw.getPair();
429        }
430
431
432
433        private static int getNumGaps(SequencePair<?, ?> pair) {
434                int numGaps = 0;
435                for (int alignmentIndex=1;alignmentIndex<=pair.getLength();alignmentIndex++) {
436                        if (pair.hasGap(alignmentIndex)) numGaps++;
437                }
438                return numGaps;
439        }
440
441        private static int getNumGapsQuery(SequencePair<?, ?> pair) {
442                int numGaps = 0;
443                for (int alignmentIndex=1;alignmentIndex<=pair.getLength();alignmentIndex++) {
444                        if (pair.getCompoundInQueryAt(alignmentIndex).getShortName().equals("-")) {
445                                numGaps++;
446                        }
447                }
448                return numGaps;
449        }
450
451        private static int getNumGapsTarget(SequencePair<?, ?> pair) {
452                int numGaps = 0;
453                for (int alignmentIndex=1;alignmentIndex<=pair.getLength();alignmentIndex++) {
454                        if (pair.getCompoundInTargetAt(alignmentIndex).getShortName().equals("-")) {
455                                numGaps++;
456                        }
457                }
458                return numGaps;
459        }
460
461        private static boolean isProteinSequence(String str) {
462                try {
463                        new ProteinSequence(str);
464                } catch (CompoundNotFoundException e) {
465
466                        return false;
467                }
468                return true;
469        }
470
471        private static boolean isDNASequence(String str) {
472                try {
473                        new DNASequence(str);
474                } catch (CompoundNotFoundException e) {
475
476                        return false;
477                }
478                return true;
479        }
480
481        private static boolean isRNASequence(String str) {
482                try {
483                        new RNASequence(str);
484                } catch (CompoundNotFoundException e) {
485
486                        return false;
487                }
488                return true;
489
490        }
491
492        /**
493         * Returns the ProteinSequence or null if one can't be created
494         * @param str
495         * @return
496         */
497        private static ProteinSequence getProteinSequence(String str) {
498                try {
499                        ProteinSequence s = new ProteinSequence(str);
500                        return s;
501                } catch (CompoundNotFoundException e) {
502
503                        logger.error("Unexpected error when creating ProteinSequence",e);
504                }
505                return null;
506        }
507
508        /**
509         * Returns the DNASequence or null if one can't be created
510         * @param str
511         * @return
512         */
513        private static DNASequence getDNASequence(String str) {
514                try {
515                        DNASequence s = new DNASequence(str);
516                        return s;
517                } catch (CompoundNotFoundException e) {
518
519                        logger.error("Unexpected error when creating DNASequence ",e);
520                }
521                return null;
522        }
523
524        /**
525         * Returns the RNASequence or null if one can't be created
526         * @param str
527         * @return
528         */
529        private static RNASequence getRNASequence(String str) {
530                try {
531                        RNASequence s = new RNASequence(str);
532                        return s;
533                } catch (CompoundNotFoundException e) {
534
535                        logger.error("Unexpected error when creating RNASequence ",e);
536                }
537                return null;
538        }
539
540}