001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021package org.biojava.nbio.structure.io;
022
023import org.biojava.nbio.alignment.Alignments;
024import org.biojava.nbio.alignment.Alignments.PairwiseSequenceAlignerType;
025import org.biojava.nbio.alignment.SimpleGapPenalty;
026import org.biojava.nbio.core.alignment.matrices.SubstitutionMatrixHelper;
027import org.biojava.nbio.alignment.template.GapPenalty;
028import org.biojava.nbio.alignment.template.PairwiseSequenceAligner;
029import org.biojava.nbio.core.alignment.template.SequencePair;
030import org.biojava.nbio.core.alignment.template.SubstitutionMatrix;
031import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
032import org.biojava.nbio.core.sequence.DNASequence;
033import org.biojava.nbio.core.sequence.ProteinSequence;
034import org.biojava.nbio.core.sequence.RNASequence;
035import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
036import org.biojava.nbio.core.sequence.compound.NucleotideCompound;
037import org.biojava.nbio.structure.*;
038import org.slf4j.Logger;
039import org.slf4j.LoggerFactory;
040
041import java.util.ArrayList;
042import java.util.Collections;
043import java.util.Comparator;
044import java.util.HashMap;
045import java.util.List;
046import java.util.Map;
047import java.util.Set;
048import java.util.TreeMap;
049import java.util.TreeSet;
050
051/**
052 * Heuristical finding of Entities (called Compounds in legacy PDB format)
053 * in a given Structure. Entities are the groups of sequence identical NCS-related polymer chains
054 * in the Structure.
055 *
056 * This is related to {@link SeqRes2AtomAligner} but it is intended for raw PDB/mmCIF files where
057 * possibly no SEQRES is given.
058 *
059 * @author Jose Duarte
060 */
061public class EntityFinder {
062
063        private static final Logger logger = LoggerFactory.getLogger(EntityFinder.class);
064
065        /**
066         * Above this ratio of mismatching residue types for same residue numbers we
067         * consider the 2 chains to have mismatching residue numbers and warn about it
068         */
069        public static final double RATIO_GAPS_FOR_MISMATCH = 0.50;
070
071        /**
072         * Identity value for 2 chains to be considered part of same entity
073         */
074        public static final double IDENTITY_THRESHOLD = 0.99999;
075
076        /**
077         * Gap coverage value (num gaps over length of sequence) for each chain of the match:
078         * 2 chains with more gap coverage than this value will not be considered part of the same entity
079         */
080        public static final double GAP_COVERAGE_THRESHOLD = 0.3;
081
082
083
084
085        /**
086         * Utility method that employs some heuristics to find the {@link EntityInfo}s
087         * for the polymeric chains given in constructor.
088         * To be used in case the information is missing in PDB/mmCIF file
089         * @return
090         */
091        public static List<EntityInfo> findPolyEntities(List<List<Chain>> polyModels) {
092                TreeMap<String,EntityInfo> chainIds2entities = findEntitiesFromAlignment(polyModels);
093
094                List<EntityInfo> entities = findUniqueEntities(chainIds2entities);
095
096                return entities;
097        }
098
099        /**
100         * Utility method to obtain a list of unique entities from the chainIds2entities map
101         * @return
102         */
103        private static List<EntityInfo> findUniqueEntities(TreeMap<String,EntityInfo> chainIds2entities) {
104
105                List<EntityInfo> list = new ArrayList<EntityInfo>();
106
107                for (EntityInfo cluster:chainIds2entities.values()) {
108                        boolean present = false;
109                        for (EntityInfo cl:list) {
110                                if (cl==cluster) {
111                                        present = true;
112                                        break;
113                                }
114                        }
115                        if (!present) list.add(cluster);
116                }
117                return list;
118        }
119
120        /**
121         * Given all chains of all models find entities for the nonpolymers and water chains within them,
122         * assigning entity ids, types and descriptions to them. The result is written back to the passed entities List.
123         * @param nonPolyModels
124         * @param waterModels
125         * @param entities
126         */
127        public static void createPurelyNonPolyEntities(List<List<Chain>> nonPolyModels, List<List<Chain>> waterModels, List<EntityInfo> entities) {
128
129                if (nonPolyModels.isEmpty()) return;
130
131                // let's find first the max entity id to assign entity ids to the newly found entities
132                int maxMolId = 0;
133                if (!entities.isEmpty()) {
134                        maxMolId = Collections.max(entities, new Comparator<EntityInfo>() {
135                                @Override
136                                public int compare(EntityInfo o1, EntityInfo o2) {
137                                        return new Integer(o1.getMolId()).compareTo(o2.getMolId());
138                                }
139                        }).getMolId();
140                }
141                // we go one over the max
142                int molId = maxMolId + 1;
143
144                if (!nonPolyModels.get(0).isEmpty()) {
145                        List<EntityInfo> nonPolyEntities = new ArrayList<>();
146
147                        for (List<Chain> model:nonPolyModels) {
148                                for (Chain c: model) {
149                                        // we assume there's only 1 group per non-poly chain
150                                        String molecPdbName = c.getAtomGroup(0).getPDBName();
151
152                                        EntityInfo nonPolyEntity = findNonPolyEntityWithDescription(molecPdbName, nonPolyEntities);
153                                        if (nonPolyEntity == null) {
154                                                nonPolyEntity = new EntityInfo();
155                                                nonPolyEntity.setDescription(molecPdbName);
156                                                nonPolyEntity.setType(EntityType.NONPOLYMER);
157                                                nonPolyEntity.setMolId(molId++);
158                                                nonPolyEntities.add(nonPolyEntity);
159
160                                        }
161                                        nonPolyEntity.addChain(c);
162                                        c.setEntityInfo(nonPolyEntity);
163                                }
164                        }
165                        entities.addAll(nonPolyEntities);
166                }
167
168                if (!waterModels.get(0).isEmpty()) {
169                        EntityInfo waterEntity = new EntityInfo();
170                        waterEntity.setType(EntityType.WATER);
171                        waterEntity.setDescription("water");
172                        waterEntity.setMolId(molId);
173
174                        for (List<Chain> model:waterModels) {
175                                for (Chain waterChain:model) {
176                                        waterEntity.addChain(waterChain);
177                                        waterChain.setEntityInfo(waterEntity);
178                                }
179                        }
180                        entities.add(waterEntity);
181
182                }
183
184
185        }
186
187        private static EntityInfo findNonPolyEntityWithDescription(String description, List<EntityInfo> nonPolyEntities) {
188                for (EntityInfo e:nonPolyEntities) {
189                        if (e.getDescription().equals(description)) return e;
190                }
191                return null;
192        }
193
194
195        private static boolean areResNumbersAligned(Chain c1, Chain c2) {
196
197                boolean isC1prot = c1.isProtein();
198                boolean isC2prot = c2.isProtein();
199
200                // different kind of chain: we won't try to align them
201                if (isC1prot != isC2prot ) return false;
202
203                List<Group> c1AtomGroups = null;
204                if (isC1prot) {
205                        c1AtomGroups = c1.getAtomGroups(GroupType.AMINOACID);
206                }
207                else {
208                        c1AtomGroups = c1.getAtomGroups(GroupType.NUCLEOTIDE);
209                }
210
211                int countNonExisting = 0;
212
213                for (Group g1:c1AtomGroups) {
214                        try {
215                                Group g2 = c2.getGroupByPDB(g1.getResidueNumber());
216                                if (!g2.getPDBName().equals(g1.getPDBName())) {
217                                        logger.debug("Mismatch of residues between chains {},{} for residue number {}: {} {}",
218                                                        c1.getId(),c2.getId(),g1.getResidueNumber(), g1.getPDBName(), g2.getPDBName());
219                                        return false;
220                                }
221                        } catch (StructureException e) {
222                                // the group doesn't exist (no density) in the chain, go on
223                                countNonExisting++;
224                                continue;
225                        }
226                }
227
228                if ((double)countNonExisting/(double)c1AtomGroups.size() > RATIO_GAPS_FOR_MISMATCH) {
229                        logger.debug("More than {} of the residues ({} out of {}) are not present in chain {} when comparing by residue numbers to chain {}.",
230                                        RATIO_GAPS_FOR_MISMATCH, countNonExisting, c1AtomGroups.size(), c2.getId(), c1.getId());
231                        return false;
232                }
233
234                return true;
235        }
236
237
238
239        private static TreeMap<String,EntityInfo> findEntitiesFromAlignment(List<List<Chain>> polyModels) {
240
241
242
243                TreeMap<String, EntityInfo> chainIds2entities = new TreeMap<String,EntityInfo>();
244
245                if (polyModels.isEmpty()) return chainIds2entities;
246
247                Set<Integer> polyChainIndices = new TreeSet<Integer>();
248                for (int i=0;i<polyModels.get(0).size();i++) {
249                        polyChainIndices.add(i);
250                }
251
252
253                int molId = 1;
254
255                outer:
256                        for (int i:polyChainIndices) {
257                                for (int j:polyChainIndices) {
258
259                                        if (j<=i) continue;
260
261                                        Chain c1 = polyModels.get(0).get(i);
262                                        Chain c2 = polyModels.get(0).get(j);
263
264                                        Map<Integer,Integer> positionIndex1 = new HashMap<Integer, Integer>();
265                                        Map<Integer,Integer> positionIndex2 = new HashMap<Integer, Integer>();
266                                        // here we use false, which means that X will be used for unknown compounds
267                                        String str1 = SeqRes2AtomAligner.getFullAtomSequence(c1.getAtomGroups(), positionIndex1, false);
268                                        String str2 = SeqRes2AtomAligner.getFullAtomSequence(c2.getAtomGroups(), positionIndex2, false);
269
270                                        int seq1Length = 0;
271                                        int seq2Length = 0;
272
273                                        SequencePair<?,?> pair = null;
274                                        if (isProteinSequence(str1) && isProteinSequence(str2)) {
275                                                ProteinSequence s1 = getProteinSequence(str1);
276                                                ProteinSequence s2 = getProteinSequence(str2);
277                                                seq1Length = s1.getLength();
278                                                seq2Length = s2.getLength();
279
280                                                pair = alignProtein(s1,s2);
281
282                                        } else if (isDNASequence(str1) && isDNASequence(str2)) {
283                                                DNASequence s1 = getDNASequence(str1);
284                                                DNASequence s2 = getDNASequence(str2);
285                                                seq1Length = s1.getLength();
286                                                seq2Length = s2.getLength();
287
288                                                pair = alignDNA(s1,s2);
289
290                                        } else if (isRNASequence(str1) && isRNASequence(str2)) {
291                                                RNASequence s1 = getRNASequence(str1);
292                                                RNASequence s2 = getRNASequence(str2);
293                                                seq1Length = s1.getLength();
294                                                seq2Length = s2.getLength();
295
296                                                pair = alignRNA(s1,s2);
297
298                                        } else {
299                                                logger.debug("Chains {},{} are either different kind of polymers or could not be recognized as protein or nucleotide polymers");
300                                                continue;
301                                        }
302
303
304                                        int numGaps = getNumGaps(pair);
305                                        int numGaps1 = getNumGapsQuery(pair);
306                                        int numGaps2 = getNumGapsTarget(pair);
307
308                                        int nonGaps = pair.getLength() - numGaps;
309
310                                        double identity = (double)pair.getNumIdenticals()/(double)nonGaps;
311                                        double gapCov1 = (double) numGaps1 / (double) seq1Length;
312                                        double gapCov2 = (double) numGaps2 / (double) seq2Length;
313
314                                        logger.debug("Alignment for chain pair {},{}: identity: {}, gap coverage 1: {}, gap coverage 2: {}",
315                                                        c1.getId(), c2.getId(), String.format("%4.2f",identity), String.format("%4.2f",gapCov1), String.format("%4.2f",gapCov2));
316                                        logger.debug("\n"+pair.toString(100));
317
318                                        if (identity > IDENTITY_THRESHOLD && gapCov1<GAP_COVERAGE_THRESHOLD && gapCov2<GAP_COVERAGE_THRESHOLD) {
319                                                if (    !chainIds2entities.containsKey(c1.getId()) &&
320                                                                !chainIds2entities.containsKey(c2.getId())) {
321                                                        logger.debug("Creating Compound with chains {},{}",c1.getId(),c2.getId());
322
323                                                        EntityInfo ent = new EntityInfo();
324                                                        ent.addChain(c1);
325                                                        ent.addChain(c2);
326                                                        ent.setMolId(molId++);
327                                                        ent.setType(EntityType.POLYMER);
328                                                        c1.setEntityInfo(ent);
329                                                        c2.setEntityInfo(ent);
330                                                        chainIds2entities.put(c1.getId(), ent);
331                                                        chainIds2entities.put(c2.getId(), ent);
332
333
334                                                } else {
335                                                        EntityInfo ent = chainIds2entities.get(c1.getId());
336
337                                                        if (ent==null) {
338                                                                logger.debug("Adding chain {} to entity {}",c1.getId(),c2.getId());
339                                                                ent = chainIds2entities.get(c2.getId());
340                                                                ent.addChain(c1);
341                                                                c1.setEntityInfo(ent);
342                                                                chainIds2entities.put(c1.getId(), ent);
343
344                                                        } else {
345                                                                logger.debug("Adding chain {} to entity {}",c2.getId(),c1.getId());
346                                                                ent.addChain(c2);
347                                                                c2.setEntityInfo(ent);
348                                                                chainIds2entities.put(c2.getId(), ent);
349
350                                                        }
351                                                }
352                                                if (!areResNumbersAligned(c1, c2)) {
353                                                        logger.warn("Including 100% identical chains {},{} in same Compound, although they have misaligned residue numbers",
354                                                                        c1.getId(),c2.getId());
355                                                }
356                                        }
357
358                                        if (identity>1) {
359                                                logger.warn("Identity for chains {},{} above 1. {} identicals out of {} non-gap-aligned residues (identity {})",
360                                                                c1.getId(),c2.getId(),pair.getNumIdenticals(),nonGaps,identity);
361                                                logger.warn("\n"+pair.toString(100));
362                                        }
363
364                                        if (chainIds2entities.size()==polyChainIndices.size()) // we've got all chains in entities
365                                                break outer;
366                                }
367                        }
368
369                // anything not in an Entity will be its own Entity
370                for (int i:polyChainIndices) {
371                        Chain c = polyModels.get(0).get(i);
372                        if (!chainIds2entities.containsKey(c.getId())) {
373                                logger.debug("Creating a 1-member Compound for chain {}",c.getId());
374
375                                EntityInfo ent = new EntityInfo();
376                                ent.addChain(c);
377                                ent.setMolId(molId++);
378                                ent.setType(EntityType.POLYMER);
379                                c.setEntityInfo(ent);
380
381                                chainIds2entities.put(c.getId(),ent);
382                        }
383                }
384
385                // now that we've found the entities for first model, let's do the same for the rest of the models
386
387                for (int i=1;i<polyModels.size();i++) {
388                        for (Chain chain : polyModels.get(i)) {
389                                EntityInfo e = chainIds2entities.get(chain.getId());
390                                chain.setEntityInfo(e);
391                                e.addChain(chain);
392                        }
393                }
394
395
396                return chainIds2entities;
397        }
398
399        private static SequencePair<ProteinSequence, AminoAcidCompound> alignProtein(ProteinSequence s1, ProteinSequence s2) {
400                SubstitutionMatrix<AminoAcidCompound> matrix = SubstitutionMatrixHelper.getIdentity();
401
402                GapPenalty penalty = new SimpleGapPenalty(8, 1);
403
404                PairwiseSequenceAligner<ProteinSequence, AminoAcidCompound> nw =
405                                Alignments.getPairwiseAligner(s1, s2, PairwiseSequenceAlignerType.GLOBAL, penalty, matrix);
406
407                return nw.getPair();
408        }
409
410        private static SequencePair<DNASequence, NucleotideCompound> alignDNA(DNASequence s1, DNASequence s2) {
411                SubstitutionMatrix<NucleotideCompound> matrix = SubstitutionMatrixHelper.getNuc4_4();
412
413                GapPenalty penalty = new SimpleGapPenalty(8, 1);
414
415                PairwiseSequenceAligner<DNASequence, NucleotideCompound> nw =
416                                Alignments.getPairwiseAligner(s1, s2, PairwiseSequenceAlignerType.GLOBAL, penalty, matrix);
417
418                return nw.getPair();
419        }
420
421        private static SequencePair<RNASequence, NucleotideCompound> alignRNA(RNASequence s1, RNASequence s2) {
422                SubstitutionMatrix<NucleotideCompound> matrix = SubstitutionMatrixHelper.getNuc4_4();
423
424                GapPenalty penalty = new SimpleGapPenalty(8, 1);
425
426                PairwiseSequenceAligner<RNASequence, NucleotideCompound> nw =
427                                Alignments.getPairwiseAligner(s1, s2, PairwiseSequenceAlignerType.GLOBAL, penalty, matrix);
428
429                return nw.getPair();
430        }
431
432
433
434        private static int getNumGaps(SequencePair<?, ?> pair) {
435                int numGaps = 0;
436                for (int alignmentIndex=1;alignmentIndex<=pair.getLength();alignmentIndex++) {
437                        if (pair.hasGap(alignmentIndex)) numGaps++;
438                }
439                return numGaps;
440        }
441
442        private static int getNumGapsQuery(SequencePair<?, ?> pair) {
443                int numGaps = 0;
444                for (int alignmentIndex=1;alignmentIndex<=pair.getLength();alignmentIndex++) {
445                        if (pair.getCompoundInQueryAt(alignmentIndex).getShortName().equals("-")) {
446                                numGaps++;
447                        }
448                }
449                return numGaps;
450        }
451
452        private static int getNumGapsTarget(SequencePair<?, ?> pair) {
453                int numGaps = 0;
454                for (int alignmentIndex=1;alignmentIndex<=pair.getLength();alignmentIndex++) {
455                        if (pair.getCompoundInTargetAt(alignmentIndex).getShortName().equals("-")) {
456                                numGaps++;
457                        }
458                }
459                return numGaps;
460        }
461
462        private static boolean isProteinSequence(String str) {
463                try {
464                        new ProteinSequence(str);
465                } catch (CompoundNotFoundException e) {
466
467                        return false;
468                }
469                return true;
470        }
471
472        private static boolean isDNASequence(String str) {
473                try {
474                        new DNASequence(str);
475                } catch (CompoundNotFoundException e) {
476
477                        return false;
478                }
479                return true;
480        }
481
482        private static boolean isRNASequence(String str) {
483                try {
484                        new RNASequence(str);
485                } catch (CompoundNotFoundException e) {
486
487                        return false;
488                }
489                return true;
490
491        }
492
493        /**
494         * Returns the ProteinSequence or null if one can't be created
495         * @param str
496         * @return
497         */
498        private static ProteinSequence getProteinSequence(String str) {
499                try {
500                        ProteinSequence s = new ProteinSequence(str);
501                        return s;
502                } catch (CompoundNotFoundException e) {
503
504                        logger.error("Unexpected error when creating ProteinSequence",e);
505                }
506                return null;
507        }
508
509        /**
510         * Returns the DNASequence or null if one can't be created
511         * @param str
512         * @return
513         */
514        private static DNASequence getDNASequence(String str) {
515                try {
516                        DNASequence s = new DNASequence(str);
517                        return s;
518                } catch (CompoundNotFoundException e) {
519
520                        logger.error("Unexpected error when creating DNASequence ",e);
521                }
522                return null;
523        }
524
525        /**
526         * Returns the RNASequence or null if one can't be created
527         * @param str
528         * @return
529         */
530        private static RNASequence getRNASequence(String str) {
531                try {
532                        RNASequence s = new RNASequence(str);
533                        return s;
534                } catch (CompoundNotFoundException e) {
535
536                        logger.error("Unexpected error when creating RNASequence ",e);
537                }
538                return null;
539        }
540
541}