001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021package org.biojava.nbio.structure.io;
022
023import org.biojava.nbio.alignment.Alignments;
024import org.biojava.nbio.alignment.Alignments.PairwiseSequenceAlignerType;
025import org.biojava.nbio.alignment.SimpleGapPenalty;
026import org.biojava.nbio.core.alignment.matrices.SubstitutionMatrixHelper;
027import org.biojava.nbio.alignment.template.GapPenalty;
028import org.biojava.nbio.alignment.template.PairwiseSequenceAligner;
029import org.biojava.nbio.core.alignment.template.SequencePair;
030import org.biojava.nbio.core.alignment.template.SubstitutionMatrix;
031import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
032import org.biojava.nbio.core.sequence.DNASequence;
033import org.biojava.nbio.core.sequence.ProteinSequence;
034import org.biojava.nbio.core.sequence.RNASequence;
035import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
036import org.biojava.nbio.core.sequence.compound.NucleotideCompound;
037import org.biojava.nbio.structure.*;
038import org.slf4j.Logger;
039import org.slf4j.LoggerFactory;
040
041import java.util.ArrayList;
042import java.util.Collections;
043import java.util.Comparator;
044import java.util.HashMap;
045import java.util.List;
046import java.util.Set;
047import java.util.TreeMap;
048import java.util.TreeSet;
049
050/**
051 * Heuristical finding of Entities (called Compounds in legacy PDB format)
052 * in a given Structure. Entities are the groups of sequence identical NCS-related polymer chains
053 * in the Structure.
054 *
055 * This is related to {@link SeqRes2AtomAligner} but it is intended for raw PDB/mmCIF files where
056 * possibly no SEQRES is given.
057 *
058 * @author Jose Duarte
059 */
060public class EntityFinder {
061
062        private static final Logger logger = LoggerFactory.getLogger(EntityFinder.class);
063
064        /**
065         * Above this ratio of mismatching residue types for same residue numbers we
066         * consider the 2 chains to have mismatching residue numbers and warn about it
067         */
068        public static final double RATIO_GAPS_FOR_MISMATCH = 0.50;
069
070        /**
071         * Identity value for 2 chains to be considered part of same entity
072         */
073        public static final double IDENTITY_THRESHOLD = 0.99999;
074
075        /**
076         * Gap coverage value (num gaps over length of sequence) for each chain of the match:
077         * 2 chains with more gap coverage than this value will not be considered part of the same entity
078         */
079        public static final double GAP_COVERAGE_THRESHOLD = 0.3;
080
081
082
083
084        /**
085         * Utility method that employs some heuristics to find the {@link EntityInfo}s
086         * for the polymeric chains given in constructor.
087         * To be used in case the information is missing in PDB/mmCIF file
088         * @return
089         */
090        public static List<EntityInfo> findPolyEntities(List<List<Chain>> polyModels) {
091                TreeMap<String,EntityInfo> chainIds2entities = findEntitiesFromAlignment(polyModels);
092
093                return findUniqueEntities(chainIds2entities);
094        }
095
096        /**
097         * Utility method to obtain a list of unique entities from the chainIds2entities map
098         * @return
099         */
100        private static List<EntityInfo> findUniqueEntities(TreeMap<String,EntityInfo> chainIds2entities) {
101
102                List<EntityInfo> list = new ArrayList<>();
103
104                for (EntityInfo cluster:chainIds2entities.values()) {
105                        boolean present = false;
106                        for (EntityInfo cl:list) {
107                                if (cl==cluster) {
108                                        present = true;
109                                        break;
110                                }
111                        }
112                        if (!present) list.add(cluster);
113                }
114                return list;
115        }
116
117        /**
118         * Given all chains of all models find entities for the nonpolymers and water chains within them,
119         * assigning entity ids, types and descriptions to them. The result is written back to the passed entities List.
120         * @param nonPolyModels
121         * @param waterModels
122         * @param entities
123         */
124        public static void createPurelyNonPolyEntities(List<List<Chain>> nonPolyModels, List<List<Chain>> waterModels, List<EntityInfo> entities) {
125
126                if (nonPolyModels.isEmpty()) return;
127
128                // let's find first the max entity id to assign entity ids to the newly found entities
129                int maxMolId = 0;
130                if (!entities.isEmpty()) {
131                        maxMolId = Collections.max(entities, Comparator.comparingInt(EntityInfo::getMolId)).getMolId();
132                }
133                // we go one over the max
134                int molId = maxMolId + 1;
135
136                if (!nonPolyModels.get(0).isEmpty()) {
137                        List<EntityInfo> nonPolyEntities = new ArrayList<>();
138
139                        for (List<Chain> model:nonPolyModels) {
140                                for (Chain c: model) {
141                                        // we assume there's only 1 group per non-poly chain
142                                        String molecPdbName = c.getAtomGroup(0).getPDBName();
143
144                                        EntityInfo nonPolyEntity = findNonPolyEntityWithDescription(molecPdbName, nonPolyEntities);
145                                        if (nonPolyEntity == null) {
146                                                nonPolyEntity = new EntityInfo();
147                                                nonPolyEntity.setDescription(molecPdbName);
148                                                nonPolyEntity.setType(EntityType.NONPOLYMER);
149                                                nonPolyEntity.setMolId(molId++);
150                                                nonPolyEntities.add(nonPolyEntity);
151
152                                        }
153                                        nonPolyEntity.addChain(c);
154                                        c.setEntityInfo(nonPolyEntity);
155                                }
156                        }
157                        entities.addAll(nonPolyEntities);
158                }
159
160                if (!waterModels.get(0).isEmpty()) {
161                        EntityInfo waterEntity = new EntityInfo();
162                        waterEntity.setType(EntityType.WATER);
163                        waterEntity.setDescription("water");
164                        waterEntity.setMolId(molId);
165
166                        for (List<Chain> model:waterModels) {
167                                for (Chain waterChain:model) {
168                                        waterEntity.addChain(waterChain);
169                                        waterChain.setEntityInfo(waterEntity);
170                                }
171                        }
172                        entities.add(waterEntity);
173
174                }
175
176        }
177
178        private static EntityInfo findNonPolyEntityWithDescription(String description, List<EntityInfo> nonPolyEntities) {
179                for (EntityInfo e:nonPolyEntities) {
180                        if (e.getDescription().equals(description)) return e;
181                }
182                return null;
183        }
184
185
186        private static boolean areResNumbersAligned(Chain c1, Chain c2) {
187
188                boolean isC1prot = c1.isProtein();
189                boolean isC2prot = c2.isProtein();
190
191                // different kind of chain: we won't try to align them
192                if (isC1prot != isC2prot ) return false;
193
194                List<Group> c1AtomGroups = null;
195                if (isC1prot) {
196                        c1AtomGroups = c1.getAtomGroups(GroupType.AMINOACID);
197                }
198                else {
199                        c1AtomGroups = c1.getAtomGroups(GroupType.NUCLEOTIDE);
200                }
201
202                int countNonExisting = 0;
203
204                for (Group g1:c1AtomGroups) {
205                        try {
206                                Group g2 = c2.getGroupByPDB(g1.getResidueNumber());
207                                if (!g2.getPDBName().equals(g1.getPDBName())) {
208                                        logger.debug("Mismatch of residues between chains {},{} for residue number {}: {} {}",
209                                                        c1.getId(),c2.getId(),g1.getResidueNumber(), g1.getPDBName(), g2.getPDBName());
210                                        return false;
211                                }
212                        } catch (StructureException e) {
213                                // the group doesn't exist (no density) in the chain, go on
214                                countNonExisting++;
215                        }
216                }
217
218                if ((double)countNonExisting/(double)c1AtomGroups.size() > RATIO_GAPS_FOR_MISMATCH) {
219                        logger.debug("More than {} of the residues ({} out of {}) are not present in chain {} when comparing by residue numbers to chain {}.",
220                                        RATIO_GAPS_FOR_MISMATCH, countNonExisting, c1AtomGroups.size(), c2.getId(), c1.getId());
221                        return false;
222                }
223
224                return true;
225        }
226
227
228
229        private static TreeMap<String,EntityInfo> findEntitiesFromAlignment(List<List<Chain>> polyModels) {
230
231                TreeMap<String, EntityInfo> chainIds2entities = new TreeMap<>();
232
233                if (polyModels.isEmpty()) return chainIds2entities;
234
235                Set<Integer> polyChainIndices = new TreeSet<>();
236                for (int i=0;i<polyModels.get(0).size();i++) {
237                        polyChainIndices.add(i);
238                }
239
240                int molId = 1;
241
242                outer:
243                        for (int i:polyChainIndices) {
244                                Chain c1 = polyModels.get(0).get(i);
245                                // here we use false, which means that X will be used for unknown compounds
246                                String str1 = SeqRes2AtomAligner.getFullAtomSequence(c1.getAtomGroups(), new HashMap<>(), false);
247
248                                for (int j:polyChainIndices) {
249
250                                        if (j<=i) continue;
251
252                                        Chain c2 = polyModels.get(0).get(j);
253                                        String str2 = SeqRes2AtomAligner.getFullAtomSequence(c2.getAtomGroups(), new HashMap<>(), false);
254
255                                        int seq1Length = 0;
256                                        int seq2Length = 0;
257
258                                        SequencePair<?,?> pair;
259                                        if (isProteinSequence(str1) && isProteinSequence(str2)) {
260                                                ProteinSequence s1 = getProteinSequence(str1);
261                                                ProteinSequence s2 = getProteinSequence(str2);
262                                                seq1Length = s1.getLength();
263                                                seq2Length = s2.getLength();
264
265                                                pair = alignProtein(s1,s2);
266
267                                        } else if (isDNASequence(str1) && isDNASequence(str2)) {
268                                                DNASequence s1 = getDNASequence(str1);
269                                                DNASequence s2 = getDNASequence(str2);
270                                                seq1Length = s1.getLength();
271                                                seq2Length = s2.getLength();
272
273                                                pair = alignDNA(s1,s2);
274
275                                        } else if (isRNASequence(str1) && isRNASequence(str2)) {
276                                                RNASequence s1 = getRNASequence(str1);
277                                                RNASequence s2 = getRNASequence(str2);
278                                                seq1Length = s1.getLength();
279                                                seq2Length = s2.getLength();
280
281                                                pair = alignRNA(s1,s2);
282
283                                        } else {
284                                                logger.debug("Chains {},{} are either different kind of polymers or could not be recognized as protein or nucleotide polymers", c1.getId(), c2.getId());
285                                                continue;
286                                        }
287
288                                        int numGaps = getNumGaps(pair);
289                                        int numGaps1 = getNumGapsQuery(pair);
290                                        int numGaps2 = getNumGapsTarget(pair);
291
292                                        int nonGaps = pair.getLength() - numGaps;
293
294                                        double identity = (double)pair.getNumIdenticals()/(double)nonGaps;
295                                        double gapCov1 = (double) numGaps1 / (double) seq1Length;
296                                        double gapCov2 = (double) numGaps2 / (double) seq2Length;
297
298                                        logger.debug("Alignment for chain pair {},{}: identity: {}, gap coverage 1: {}, gap coverage 2: {}",
299                                                        c1.getId(), c2.getId(), String.format("%4.2f",identity), String.format("%4.2f",gapCov1), String.format("%4.2f",gapCov2));
300                                        logger.debug("\n"+pair.toString(100));
301
302                                        if (identity > IDENTITY_THRESHOLD && gapCov1<GAP_COVERAGE_THRESHOLD && gapCov2<GAP_COVERAGE_THRESHOLD) {
303                                                if (    !chainIds2entities.containsKey(c1.getId()) &&
304                                                                !chainIds2entities.containsKey(c2.getId())) {
305                                                        logger.debug("Creating entity with chains {},{}",c1.getId(),c2.getId());
306
307                                                        EntityInfo ent = new EntityInfo();
308                                                        ent.addChain(c1);
309                                                        ent.addChain(c2);
310                                                        ent.setMolId(molId++);
311                                                        ent.setType(EntityType.POLYMER);
312                                                        c1.setEntityInfo(ent);
313                                                        c2.setEntityInfo(ent);
314                                                        chainIds2entities.put(c1.getId(), ent);
315                                                        chainIds2entities.put(c2.getId(), ent);
316
317                                                } else {
318                                                        Chain chainToAdd;
319                                                        EntityInfo ent = chainIds2entities.get(c1.getId());
320                                                        if (ent==null) {
321                                                                ent = chainIds2entities.get(c2.getId());
322                                                                chainToAdd = c1;
323                                                        } else {
324                                                                chainToAdd = c2;
325                                                        }
326                                                        if (!chainIds2entities.containsKey(chainToAdd.getId())) {
327                                                                logger.debug("Adding chain {} to entity {}", chainToAdd.getId(), ent.getId());
328                                                                ent.addChain(chainToAdd);
329                                                                chainToAdd.setEntityInfo(ent);
330                                                                chainIds2entities.put(chainToAdd.getId(), ent);
331                                                        }
332                                                }
333                                                if (!areResNumbersAligned(c1, c2)) {
334                                                        logger.warn("Including 100% identical chains {},{} in same entity, although they have misaligned residue numbers",
335                                                                        c1.getId(),c2.getId());
336                                                }
337                                        }
338
339                                        if (identity>1) {
340                                                logger.warn("Identity for chains {},{} above 1. {} identicals out of {} non-gap-aligned residues (identity {})",
341                                                                c1.getId(),c2.getId(),pair.getNumIdenticals(),nonGaps,identity);
342                                                logger.warn("\n"+pair.toString(100));
343                                        }
344
345                                        if (chainIds2entities.size()==polyChainIndices.size()) // we've got all chains in entities
346                                                break outer;
347                                }
348                        }
349
350                // anything not in an Entity will be its own Entity
351                for (int i:polyChainIndices) {
352                        Chain c = polyModels.get(0).get(i);
353                        if (!chainIds2entities.containsKey(c.getId())) {
354                                logger.debug("Creating a 1-member entity for chain {}",c.getId());
355
356                                EntityInfo ent = new EntityInfo();
357                                ent.addChain(c);
358                                ent.setMolId(molId++);
359                                ent.setType(EntityType.POLYMER);
360                                c.setEntityInfo(ent);
361
362                                chainIds2entities.put(c.getId(),ent);
363                        }
364                }
365
366                // now that we've found the entities for first model, let's do the same for the rest of the models
367
368                for (int i=1;i<polyModels.size();i++) {
369                        for (Chain chain : polyModels.get(i)) {
370                                EntityInfo e = chainIds2entities.get(chain.getId());
371                                chain.setEntityInfo(e);
372                                e.addChain(chain);
373                        }
374                }
375
376                return chainIds2entities;
377        }
378
379        private static SequencePair<ProteinSequence, AminoAcidCompound> alignProtein(ProteinSequence s1, ProteinSequence s2) {
380                SubstitutionMatrix<AminoAcidCompound> matrix = SubstitutionMatrixHelper.getIdentity();
381
382                GapPenalty penalty = new SimpleGapPenalty(8, 1);
383
384                PairwiseSequenceAligner<ProteinSequence, AminoAcidCompound> nw =
385                                Alignments.getPairwiseAligner(s1, s2, PairwiseSequenceAlignerType.GLOBAL, penalty, matrix);
386
387                return nw.getPair();
388        }
389
390        private static SequencePair<DNASequence, NucleotideCompound> alignDNA(DNASequence s1, DNASequence s2) {
391                SubstitutionMatrix<NucleotideCompound> matrix = SubstitutionMatrixHelper.getNuc4_4();
392
393                GapPenalty penalty = new SimpleGapPenalty(8, 1);
394
395                PairwiseSequenceAligner<DNASequence, NucleotideCompound> nw =
396                                Alignments.getPairwiseAligner(s1, s2, PairwiseSequenceAlignerType.GLOBAL, penalty, matrix);
397
398                return nw.getPair();
399        }
400
401        private static SequencePair<RNASequence, NucleotideCompound> alignRNA(RNASequence s1, RNASequence s2) {
402                SubstitutionMatrix<NucleotideCompound> matrix = SubstitutionMatrixHelper.getNuc4_4();
403
404                GapPenalty penalty = new SimpleGapPenalty(8, 1);
405
406                PairwiseSequenceAligner<RNASequence, NucleotideCompound> nw =
407                                Alignments.getPairwiseAligner(s1, s2, PairwiseSequenceAlignerType.GLOBAL, penalty, matrix);
408
409                return nw.getPair();
410        }
411
412
413
414        private static int getNumGaps(SequencePair<?, ?> pair) {
415                int numGaps = 0;
416                for (int alignmentIndex=1;alignmentIndex<=pair.getLength();alignmentIndex++) {
417                        if (pair.hasGap(alignmentIndex)) numGaps++;
418                }
419                return numGaps;
420        }
421
422        private static int getNumGapsQuery(SequencePair<?, ?> pair) {
423                int numGaps = 0;
424                for (int alignmentIndex=1;alignmentIndex<=pair.getLength();alignmentIndex++) {
425                        if (pair.getCompoundInQueryAt(alignmentIndex).getShortName().equals("-")) {
426                                numGaps++;
427                        }
428                }
429                return numGaps;
430        }
431
432        private static int getNumGapsTarget(SequencePair<?, ?> pair) {
433                int numGaps = 0;
434                for (int alignmentIndex=1;alignmentIndex<=pair.getLength();alignmentIndex++) {
435                        if (pair.getCompoundInTargetAt(alignmentIndex).getShortName().equals("-")) {
436                                numGaps++;
437                        }
438                }
439                return numGaps;
440        }
441
442        private static boolean isProteinSequence(String str) {
443                try {
444                        new ProteinSequence(str);
445                } catch (CompoundNotFoundException e) {
446
447                        return false;
448                }
449                return true;
450        }
451
452        private static boolean isDNASequence(String str) {
453                try {
454                        new DNASequence(str);
455                } catch (CompoundNotFoundException e) {
456
457                        return false;
458                }
459                return true;
460        }
461
462        private static boolean isRNASequence(String str) {
463                try {
464                        new RNASequence(str);
465                } catch (CompoundNotFoundException e) {
466
467                        return false;
468                }
469                return true;
470
471        }
472
473        /**
474         * Returns the ProteinSequence or null if one can't be created
475         * @param str
476         * @return
477         */
478        private static ProteinSequence getProteinSequence(String str) {
479                try {
480                        ProteinSequence s = new ProteinSequence(str);
481                        return s;
482                } catch (CompoundNotFoundException e) {
483
484                        logger.error("Unexpected error when creating ProteinSequence",e);
485                }
486                return null;
487        }
488
489        /**
490         * Returns the DNASequence or null if one can't be created
491         * @param str
492         * @return
493         */
494        private static DNASequence getDNASequence(String str) {
495                try {
496                        DNASequence s = new DNASequence(str);
497                        return s;
498                } catch (CompoundNotFoundException e) {
499
500                        logger.error("Unexpected error when creating DNASequence ",e);
501                }
502                return null;
503        }
504
505        /**
506         * Returns the RNASequence or null if one can't be created
507         * @param str
508         * @return
509         */
510        private static RNASequence getRNASequence(String str) {
511                try {
512                        RNASequence s = new RNASequence(str);
513                        return s;
514                } catch (CompoundNotFoundException e) {
515
516                        logger.error("Unexpected error when creating RNASequence ",e);
517                }
518                return null;
519        }
520
521}