001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021package org.biojava.nbio.structure.io; 022 023import org.biojava.nbio.alignment.Alignments; 024import org.biojava.nbio.alignment.Alignments.PairwiseSequenceAlignerType; 025import org.biojava.nbio.alignment.SimpleGapPenalty; 026import org.biojava.nbio.core.alignment.matrices.SubstitutionMatrixHelper; 027import org.biojava.nbio.alignment.template.GapPenalty; 028import org.biojava.nbio.alignment.template.PairwiseSequenceAligner; 029import org.biojava.nbio.core.alignment.template.SequencePair; 030import org.biojava.nbio.core.alignment.template.SubstitutionMatrix; 031import org.biojava.nbio.core.exceptions.CompoundNotFoundException; 032import org.biojava.nbio.core.sequence.DNASequence; 033import org.biojava.nbio.core.sequence.ProteinSequence; 034import org.biojava.nbio.core.sequence.RNASequence; 035import org.biojava.nbio.core.sequence.compound.AminoAcidCompound; 036import org.biojava.nbio.core.sequence.compound.NucleotideCompound; 037import org.biojava.nbio.structure.*; 038import org.slf4j.Logger; 039import org.slf4j.LoggerFactory; 040 041import java.util.ArrayList; 042import java.util.Collections; 043import java.util.Comparator; 044import java.util.HashMap; 045import java.util.List; 046import java.util.Map; 047import java.util.Set; 048import java.util.TreeMap; 049import java.util.TreeSet; 050 051/** 052 * Heuristical finding of Compounds (called Entities in mmCIF dictionary) 053 * in a given Structure. Compounds are the groups of sequence identical NCS-related polymer chains 054 * in the Structure. 055 * 056 * This is related to {@link SeqRes2AtomAligner} but it is intended for raw PDB files where 057 * possibly no SEQRES is given. 058 * 059 * @author duarte_j 060 */ 061public class CompoundFinder { 062 063 private Structure s; 064 065 private static final Logger logger = LoggerFactory.getLogger(CompoundFinder.class); 066 067 /** 068 * Above this ratio of mismatching residue types for same residue numbers we 069 * consider the 2 chains to have mismatching residue numbers and warn about it 070 */ 071 public static final double RATIO_GAPS_FOR_MISMATCH = 0.50; 072 073 /** 074 * Identity value for 2 chains to be considered part of same entity 075 */ 076 public static final double IDENTITY_THRESHOLD = 0.99999; 077 078 /** 079 * Gap coverage value (num gaps over length of sequence) for each chain of the match: 080 * 2 chains with more gap coverage than this value will not be considered part of the same entity 081 */ 082 public static final double GAP_COVERAGE_THRESHOLD = 0.3; 083 084 085 public CompoundFinder(Structure s) { 086 this.s = s; 087 } 088 089 /** 090 * Utility method that employs some heuristics to find the Compounds 091 * for this Structure in case the information is missing in PDB/mmCIF file 092 * @return 093 */ 094 public List<Compound> findCompounds() { 095 096 TreeMap<String,Compound> chainIds2entities = findCompoundsFromAlignment(); 097 098 List<Compound> compounds = findUniqueCompounds(chainIds2entities); 099 100 createPurelyNonPolyCompounds(compounds); 101 102 return compounds; 103 } 104 105 /** 106 * Utility method to obtain a list of unique entities from the chainIds2entities map 107 * @return 108 */ 109 private static List<Compound> findUniqueCompounds(TreeMap<String,Compound> chainIds2entities) { 110 111 List<Compound> list = new ArrayList<Compound>(); 112 113 for (Compound cluster:chainIds2entities.values()) { 114 boolean present = false; 115 for (Compound cl:list) { 116 if (cl==cluster) { 117 present = true; 118 break; 119 } 120 } 121 if (!present) list.add(cluster); 122 } 123 return list; 124 } 125 126 private void createPurelyNonPolyCompounds(List<Compound> compounds) { 127 128 List<Chain> pureNonPolymerChains = new ArrayList<Chain>(); 129 for (int i=0;i<s.getChains().size();i++) { 130 if (StructureTools.isChainPureNonPolymer(s.getChain(i))) { 131 pureNonPolymerChains.add(s.getChain(i)); 132 } 133 } 134 135 if (!pureNonPolymerChains.isEmpty()) { 136 137 int maxMolId = 0; // if we have no compounds at all we just use 0, so that first assignment is 1 138 if (!compounds.isEmpty()) { 139 Collections.max(compounds, new Comparator<Compound>() { 140 @Override 141 public int compare(Compound o1, Compound o2) { 142 return new Integer(o1.getMolId()).compareTo(o2.getMolId()); 143 } 144 }).getMolId(); 145 } 146 147 int molId = maxMolId + 1; 148 // for the purely non-polymeric chains we assign additional compounds 149 for (Chain c: pureNonPolymerChains) { 150 Compound comp = new Compound(); 151 comp.addChain(c); 152 comp.setMolId(molId); 153 logger.warn("Chain {} is purely non-polymeric, will assign a new Compound (entity) to it (entity id {})", c.getChainID(), molId); 154 molId++; 155 156 compounds.add(comp); 157 } 158 159 } 160 } 161 162 163 private static boolean areResNumbersAligned(Chain c1, Chain c2) { 164 165 boolean isC1prot = StructureTools.isProtein(c1); 166 boolean isC2prot = StructureTools.isProtein(c2); 167 168 // different kind of chain: we won't try to align them 169 if (isC1prot != isC2prot ) return false; 170 171 List<Group> c1AtomGroups = null; 172 if (isC1prot) { 173 c1AtomGroups = c1.getAtomGroups(GroupType.AMINOACID); 174 } 175 else { 176 c1AtomGroups = c1.getAtomGroups(GroupType.NUCLEOTIDE); 177 } 178 179 int countNonExisting = 0; 180 181 for (Group g1:c1AtomGroups) { 182 try { 183 Group g2 = c2.getGroupByPDB(g1.getResidueNumber()); 184 if (!g2.getPDBName().equals(g1.getPDBName())) { 185 logger.debug("Mismatch of residues between chains {},{} for residue number {}: {} {}", 186 c1.getChainID(),c2.getChainID(),g1.getResidueNumber(), g1.getPDBName(), g2.getPDBName()); 187 return false; 188 } 189 } catch (StructureException e) { 190 // the group doesn't exist (no density) in the chain, go on 191 countNonExisting++; 192 continue; 193 } 194 } 195 196 if ((double)countNonExisting/(double)c1AtomGroups.size() > RATIO_GAPS_FOR_MISMATCH) { 197 logger.debug("More than {} of the residues ({} out of {}) are not present in chain {} when comparing by residue numbers to chain {}.", 198 RATIO_GAPS_FOR_MISMATCH, countNonExisting, c1AtomGroups.size(), c2.getChainID(), c1.getChainID()); 199 return false; 200 } 201 202 return true; 203 } 204 205 206 207 private TreeMap<String,Compound> findCompoundsFromAlignment() { 208 209 // first we determine which chains to consider: anything not looking 210 // polymeric (protein or nucleotide chain) should be discarded 211 Set<Integer> polyChainIndices = new TreeSet<Integer>(); 212 List<Chain> pureNonPolymerChains = new ArrayList<Chain>(); 213 for (int i=0;i<s.getChains().size();i++) { 214 if (StructureTools.isChainPureNonPolymer(s.getChain(i))) { 215 pureNonPolymerChains.add(s.getChain(i)); 216 } else { 217 polyChainIndices.add(i); 218 } 219 } 220 221 222 TreeMap<String, Compound> chainIds2compounds = new TreeMap<String,Compound>(); 223 224 int molId = 1; 225 226 outer: 227 for (int i:polyChainIndices) { 228 for (int j:polyChainIndices) { 229 230 if (j<=i) continue; 231 232 Chain c1 = s.getChain(i); 233 Chain c2 = s.getChain(j); 234 235 Map<Integer,Integer> positionIndex1 = new HashMap<Integer, Integer>(); 236 Map<Integer,Integer> positionIndex2 = new HashMap<Integer, Integer>(); 237 // here we use false, which means that X will be used for unknown compounds 238 String str1 = SeqRes2AtomAligner.getFullAtomSequence(c1.getAtomGroups(), positionIndex1, false); 239 String str2 = SeqRes2AtomAligner.getFullAtomSequence(c2.getAtomGroups(), positionIndex2, false); 240 241 int seq1Length = 0; 242 int seq2Length = 0; 243 244 SequencePair<?,?> pair = null; 245 if (isProteinSequence(str1) && isProteinSequence(str2)) { 246 ProteinSequence s1 = getProteinSequence(str1); 247 ProteinSequence s2 = getProteinSequence(str2); 248 seq1Length = s1.getLength(); 249 seq2Length = s2.getLength(); 250 251 pair = alignProtein(s1,s2); 252 253 } else if (isDNASequence(str1) && isDNASequence(str2)) { 254 DNASequence s1 = getDNASequence(str1); 255 DNASequence s2 = getDNASequence(str2); 256 seq1Length = s1.getLength(); 257 seq2Length = s2.getLength(); 258 259 pair = alignDNA(s1,s2); 260 261 } else if (isRNASequence(str1) && isRNASequence(str2)) { 262 RNASequence s1 = getRNASequence(str1); 263 RNASequence s2 = getRNASequence(str2); 264 seq1Length = s1.getLength(); 265 seq2Length = s2.getLength(); 266 267 pair = alignRNA(s1,s2); 268 269 } else { 270 logger.debug("Chains {},{} are either different kind of polymers or could not be recognized as protein or nucleotide polymers"); 271 continue; 272 } 273 274 275 int numGaps = getNumGaps(pair); 276 int numGaps1 = getNumGapsQuery(pair); 277 int numGaps2 = getNumGapsTarget(pair); 278 279 int nonGaps = pair.getLength() - numGaps; 280 281 double identity = (double)pair.getNumIdenticals()/(double)nonGaps; 282 double gapCov1 = (double) numGaps1 / (double) seq1Length; 283 double gapCov2 = (double) numGaps2 / (double) seq2Length; 284 285 logger.debug("Alignment for chain pair {},{}: identity: {}, gap coverage 1: {}, gap coverage 2: {}", 286 c1.getChainID(), c2.getChainID(), String.format("%4.2f",identity), String.format("%4.2f",gapCov1), String.format("%4.2f",gapCov2)); 287 logger.debug("\n"+pair.toString(100)); 288 289 if (identity > IDENTITY_THRESHOLD && gapCov1<GAP_COVERAGE_THRESHOLD && gapCov2<GAP_COVERAGE_THRESHOLD) { 290 if ( !chainIds2compounds.containsKey(c1.getChainID()) && 291 !chainIds2compounds.containsKey(c2.getChainID())) { 292 logger.debug("Creating Compound with chains {},{}",c1.getChainID(),c2.getChainID()); 293 294 Compound ent = new Compound(); 295 ent.addChain(c1); 296 ent.addChain(c2); 297 ent.setMolId(molId++); 298 chainIds2compounds.put(c1.getChainID(), ent); 299 chainIds2compounds.put(c2.getChainID(), ent); 300 301 302 } else { 303 Compound ent = chainIds2compounds.get(c1.getChainID()); 304 305 if (ent==null) { 306 logger.debug("Adding chain {} to entity {}",c1.getChainID(),c2.getChainID()); 307 ent = chainIds2compounds.get(c2.getChainID()); 308 ent.addChain(c1); 309 chainIds2compounds.put(c1.getChainID(), ent); 310 311 } else { 312 logger.debug("Adding chain {} to entity {}",c2.getChainID(),c1.getChainID()); 313 ent.addChain(c2); 314 chainIds2compounds.put(c2.getChainID(), ent); 315 316 } 317 } 318 if (!areResNumbersAligned(c1, c2)) { 319 logger.warn("Including 100% identical chains {},{} in same Compound, although they have misaligned residue numbers", 320 c1.getChainID(),c2.getChainID()); 321 } 322 } 323 324 if (identity>1) { 325 logger.warn("Identity for chains {},{} above 1. {} identicals out of {} non-gap-aligned residues (identity {})", 326 c1.getChainID(),c2.getChainID(),pair.getNumIdenticals(),nonGaps,identity); 327 logger.warn("\n"+pair.toString(100)); 328 } 329 330 if (chainIds2compounds.size()==polyChainIndices.size()) // we've got all chains in entities 331 break outer; 332 } 333 } 334 335 // anything not in a Compound will be its own Compound 336 for (int i:polyChainIndices) { 337 Chain c = s.getChain(i); 338 if (!chainIds2compounds.containsKey(c.getChainID())) { 339 logger.debug("Creating a 1-member Compound for chain {}",c.getChainID()); 340 341 Compound ent = new Compound(); 342 ent.addChain(c); 343 ent.setMolId(molId++); 344 345 chainIds2compounds.put(c.getChainID(),ent); 346 } 347 } 348 349 350 return chainIds2compounds; 351 } 352 353 private SequencePair<ProteinSequence, AminoAcidCompound> alignProtein(ProteinSequence s1, ProteinSequence s2) { 354 SubstitutionMatrix<AminoAcidCompound> matrix = SubstitutionMatrixHelper.getIdentity(); 355 356 GapPenalty penalty = new SimpleGapPenalty(8, 1); 357 358 PairwiseSequenceAligner<ProteinSequence, AminoAcidCompound> nw = 359 Alignments.getPairwiseAligner(s1, s2, PairwiseSequenceAlignerType.GLOBAL, penalty, matrix); 360 361 return nw.getPair(); 362 } 363 364 private SequencePair<DNASequence, NucleotideCompound> alignDNA(DNASequence s1, DNASequence s2) { 365 SubstitutionMatrix<NucleotideCompound> matrix = SubstitutionMatrixHelper.getNuc4_4(); 366 367 GapPenalty penalty = new SimpleGapPenalty(8, 1); 368 369 PairwiseSequenceAligner<DNASequence, NucleotideCompound> nw = 370 Alignments.getPairwiseAligner(s1, s2, PairwiseSequenceAlignerType.GLOBAL, penalty, matrix); 371 372 return nw.getPair(); 373 } 374 375 private SequencePair<RNASequence, NucleotideCompound> alignRNA(RNASequence s1, RNASequence s2) { 376 SubstitutionMatrix<NucleotideCompound> matrix = SubstitutionMatrixHelper.getNuc4_4(); 377 378 GapPenalty penalty = new SimpleGapPenalty(8, 1); 379 380 PairwiseSequenceAligner<RNASequence, NucleotideCompound> nw = 381 Alignments.getPairwiseAligner(s1, s2, PairwiseSequenceAlignerType.GLOBAL, penalty, matrix); 382 383 return nw.getPair(); 384 } 385 386 387 388 private static int getNumGaps(SequencePair<?, ?> pair) { 389 int numGaps = 0; 390 for (int alignmentIndex=1;alignmentIndex<=pair.getLength();alignmentIndex++) { 391 if (pair.hasGap(alignmentIndex)) numGaps++; 392 } 393 return numGaps; 394 } 395 396 private static int getNumGapsQuery(SequencePair<?, ?> pair) { 397 int numGaps = 0; 398 for (int alignmentIndex=1;alignmentIndex<=pair.getLength();alignmentIndex++) { 399 if (pair.getCompoundInQueryAt(alignmentIndex).getShortName().equals("-")) { 400 numGaps++; 401 } 402 } 403 return numGaps; 404 } 405 406 private static int getNumGapsTarget(SequencePair<?, ?> pair) { 407 int numGaps = 0; 408 for (int alignmentIndex=1;alignmentIndex<=pair.getLength();alignmentIndex++) { 409 if (pair.getCompoundInTargetAt(alignmentIndex).getShortName().equals("-")) { 410 numGaps++; 411 } 412 } 413 return numGaps; 414 } 415 416 private boolean isProteinSequence(String str) { 417 try { 418 new ProteinSequence(str); 419 } catch (CompoundNotFoundException e) { 420 421 return false; 422 } 423 return true; 424 } 425 426 private boolean isDNASequence(String str) { 427 try { 428 new DNASequence(str); 429 } catch (CompoundNotFoundException e) { 430 431 return false; 432 } 433 return true; 434 } 435 436 private boolean isRNASequence(String str) { 437 try { 438 new RNASequence(str); 439 } catch (CompoundNotFoundException e) { 440 441 return false; 442 } 443 return true; 444 445 } 446 447 /** 448 * Returns the ProteinSequence or null if one can't be created 449 * @param str 450 * @return 451 */ 452 private ProteinSequence getProteinSequence(String str) { 453 try { 454 ProteinSequence s = new ProteinSequence(str); 455 return s; 456 } catch (CompoundNotFoundException e) { 457 458 logger.error("Unexpected error when creating ProteinSequence",e); 459 } 460 return null; 461 } 462 463 /** 464 * Returns the DNASequence or null if one can't be created 465 * @param str 466 * @return 467 */ 468 private DNASequence getDNASequence(String str) { 469 try { 470 DNASequence s = new DNASequence(str); 471 return s; 472 } catch (CompoundNotFoundException e) { 473 474 logger.error("Unexpected error when creating DNASequence ",e); 475 } 476 return null; 477 } 478 479 /** 480 * Returns the RNASequence or null if one can't be created 481 * @param str 482 * @return 483 */ 484 private RNASequence getRNASequence(String str) { 485 try { 486 RNASequence s = new RNASequence(str); 487 return s; 488 } catch (CompoundNotFoundException e) { 489 490 logger.error("Unexpected error when creating RNASequence ",e); 491 } 492 return null; 493 } 494 495}