001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021package org.biojava.nbio.structure.io; 022 023import org.biojava.nbio.alignment.Alignments; 024import org.biojava.nbio.alignment.Alignments.PairwiseSequenceAlignerType; 025import org.biojava.nbio.alignment.SimpleGapPenalty; 026import org.biojava.nbio.core.alignment.matrices.SubstitutionMatrixHelper; 027import org.biojava.nbio.alignment.template.GapPenalty; 028import org.biojava.nbio.alignment.template.PairwiseSequenceAligner; 029import org.biojava.nbio.core.alignment.template.SequencePair; 030import org.biojava.nbio.core.alignment.template.SubstitutionMatrix; 031import org.biojava.nbio.core.exceptions.CompoundNotFoundException; 032import org.biojava.nbio.core.sequence.DNASequence; 033import org.biojava.nbio.core.sequence.ProteinSequence; 034import org.biojava.nbio.core.sequence.RNASequence; 035import org.biojava.nbio.core.sequence.compound.AminoAcidCompound; 036import org.biojava.nbio.core.sequence.compound.NucleotideCompound; 037import org.biojava.nbio.structure.*; 038import org.slf4j.Logger; 039import org.slf4j.LoggerFactory; 040 041import java.util.ArrayList; 042import java.util.Collections; 043import java.util.Comparator; 044import java.util.HashMap; 045import java.util.List; 046import java.util.Map; 047import java.util.Set; 048import java.util.TreeMap; 049import java.util.TreeSet; 050 051/** 052 * Heuristical finding of Entities (called Compounds in legacy PDB format) 053 * in a given Structure. Entities are the groups of sequence identical NCS-related polymer chains 054 * in the Structure. 055 * 056 * This is related to {@link SeqRes2AtomAligner} but it is intended for raw PDB/mmCIF files where 057 * possibly no SEQRES is given. 058 * 059 * @author Jose Duarte 060 */ 061public class EntityFinder { 062 063 private static final Logger logger = LoggerFactory.getLogger(EntityFinder.class); 064 065 /** 066 * Above this ratio of mismatching residue types for same residue numbers we 067 * consider the 2 chains to have mismatching residue numbers and warn about it 068 */ 069 public static final double RATIO_GAPS_FOR_MISMATCH = 0.50; 070 071 /** 072 * Identity value for 2 chains to be considered part of same entity 073 */ 074 public static final double IDENTITY_THRESHOLD = 0.99999; 075 076 /** 077 * Gap coverage value (num gaps over length of sequence) for each chain of the match: 078 * 2 chains with more gap coverage than this value will not be considered part of the same entity 079 */ 080 public static final double GAP_COVERAGE_THRESHOLD = 0.3; 081 082 083 084 085 /** 086 * Utility method that employs some heuristics to find the {@link EntityInfo}s 087 * for the polymeric chains given in constructor. 088 * To be used in case the information is missing in PDB/mmCIF file 089 * @return 090 */ 091 public static List<EntityInfo> findPolyEntities(List<List<Chain>> polyModels) { 092 TreeMap<String,EntityInfo> chainIds2entities = findEntitiesFromAlignment(polyModels); 093 094 List<EntityInfo> entities = findUniqueEntities(chainIds2entities); 095 096 return entities; 097 } 098 099 /** 100 * Utility method to obtain a list of unique entities from the chainIds2entities map 101 * @return 102 */ 103 private static List<EntityInfo> findUniqueEntities(TreeMap<String,EntityInfo> chainIds2entities) { 104 105 List<EntityInfo> list = new ArrayList<EntityInfo>(); 106 107 for (EntityInfo cluster:chainIds2entities.values()) { 108 boolean present = false; 109 for (EntityInfo cl:list) { 110 if (cl==cluster) { 111 present = true; 112 break; 113 } 114 } 115 if (!present) list.add(cluster); 116 } 117 return list; 118 } 119 120 /** 121 * Given all chains of all models find entities for the nonpolymers and water chains within them, 122 * assigning entity ids, types and descriptions to them. The result is written back to the passed entities List. 123 * @param nonPolyModels 124 * @param waterModels 125 * @param entities 126 */ 127 public static void createPurelyNonPolyEntities(List<List<Chain>> nonPolyModels, List<List<Chain>> waterModels, List<EntityInfo> entities) { 128 129 if (nonPolyModels.isEmpty()) return; 130 131 // let's find first the max entity id to assign entity ids to the newly found entities 132 int maxMolId = 0; 133 if (!entities.isEmpty()) { 134 maxMolId = Collections.max(entities, new Comparator<EntityInfo>() { 135 @Override 136 public int compare(EntityInfo o1, EntityInfo o2) { 137 return new Integer(o1.getMolId()).compareTo(o2.getMolId()); 138 } 139 }).getMolId(); 140 } 141 // we go one over the max 142 int molId = maxMolId + 1; 143 144 if (!nonPolyModels.get(0).isEmpty()) { 145 List<EntityInfo> nonPolyEntities = new ArrayList<>(); 146 147 for (List<Chain> model:nonPolyModels) { 148 for (Chain c: model) { 149 // we assume there's only 1 group per non-poly chain 150 String molecPdbName = c.getAtomGroup(0).getPDBName(); 151 152 EntityInfo nonPolyEntity = findNonPolyEntityWithDescription(molecPdbName, nonPolyEntities); 153 if (nonPolyEntity == null) { 154 nonPolyEntity = new EntityInfo(); 155 nonPolyEntity.setDescription(molecPdbName); 156 nonPolyEntity.setType(EntityType.NONPOLYMER); 157 nonPolyEntity.setMolId(molId++); 158 nonPolyEntities.add(nonPolyEntity); 159 160 } 161 nonPolyEntity.addChain(c); 162 c.setEntityInfo(nonPolyEntity); 163 } 164 } 165 entities.addAll(nonPolyEntities); 166 } 167 168 if (!waterModels.get(0).isEmpty()) { 169 EntityInfo waterEntity = new EntityInfo(); 170 waterEntity.setType(EntityType.WATER); 171 waterEntity.setDescription("water"); 172 waterEntity.setMolId(molId); 173 174 for (List<Chain> model:waterModels) { 175 for (Chain waterChain:model) { 176 waterEntity.addChain(waterChain); 177 waterChain.setEntityInfo(waterEntity); 178 } 179 } 180 entities.add(waterEntity); 181 182 } 183 184 185 } 186 187 private static EntityInfo findNonPolyEntityWithDescription(String description, List<EntityInfo> nonPolyEntities) { 188 for (EntityInfo e:nonPolyEntities) { 189 if (e.getDescription().equals(description)) return e; 190 } 191 return null; 192 } 193 194 195 private static boolean areResNumbersAligned(Chain c1, Chain c2) { 196 197 boolean isC1prot = c1.isProtein(); 198 boolean isC2prot = c2.isProtein(); 199 200 // different kind of chain: we won't try to align them 201 if (isC1prot != isC2prot ) return false; 202 203 List<Group> c1AtomGroups = null; 204 if (isC1prot) { 205 c1AtomGroups = c1.getAtomGroups(GroupType.AMINOACID); 206 } 207 else { 208 c1AtomGroups = c1.getAtomGroups(GroupType.NUCLEOTIDE); 209 } 210 211 int countNonExisting = 0; 212 213 for (Group g1:c1AtomGroups) { 214 try { 215 Group g2 = c2.getGroupByPDB(g1.getResidueNumber()); 216 if (!g2.getPDBName().equals(g1.getPDBName())) { 217 logger.debug("Mismatch of residues between chains {},{} for residue number {}: {} {}", 218 c1.getId(),c2.getId(),g1.getResidueNumber(), g1.getPDBName(), g2.getPDBName()); 219 return false; 220 } 221 } catch (StructureException e) { 222 // the group doesn't exist (no density) in the chain, go on 223 countNonExisting++; 224 continue; 225 } 226 } 227 228 if ((double)countNonExisting/(double)c1AtomGroups.size() > RATIO_GAPS_FOR_MISMATCH) { 229 logger.debug("More than {} of the residues ({} out of {}) are not present in chain {} when comparing by residue numbers to chain {}.", 230 RATIO_GAPS_FOR_MISMATCH, countNonExisting, c1AtomGroups.size(), c2.getId(), c1.getId()); 231 return false; 232 } 233 234 return true; 235 } 236 237 238 239 private static TreeMap<String,EntityInfo> findEntitiesFromAlignment(List<List<Chain>> polyModels) { 240 241 242 243 TreeMap<String, EntityInfo> chainIds2entities = new TreeMap<String,EntityInfo>(); 244 245 if (polyModels.isEmpty()) return chainIds2entities; 246 247 Set<Integer> polyChainIndices = new TreeSet<Integer>(); 248 for (int i=0;i<polyModels.get(0).size();i++) { 249 polyChainIndices.add(i); 250 } 251 252 253 int molId = 1; 254 255 outer: 256 for (int i:polyChainIndices) { 257 for (int j:polyChainIndices) { 258 259 if (j<=i) continue; 260 261 Chain c1 = polyModels.get(0).get(i); 262 Chain c2 = polyModels.get(0).get(j); 263 264 Map<Integer,Integer> positionIndex1 = new HashMap<Integer, Integer>(); 265 Map<Integer,Integer> positionIndex2 = new HashMap<Integer, Integer>(); 266 // here we use false, which means that X will be used for unknown compounds 267 String str1 = SeqRes2AtomAligner.getFullAtomSequence(c1.getAtomGroups(), positionIndex1, false); 268 String str2 = SeqRes2AtomAligner.getFullAtomSequence(c2.getAtomGroups(), positionIndex2, false); 269 270 int seq1Length = 0; 271 int seq2Length = 0; 272 273 SequencePair<?,?> pair = null; 274 if (isProteinSequence(str1) && isProteinSequence(str2)) { 275 ProteinSequence s1 = getProteinSequence(str1); 276 ProteinSequence s2 = getProteinSequence(str2); 277 seq1Length = s1.getLength(); 278 seq2Length = s2.getLength(); 279 280 pair = alignProtein(s1,s2); 281 282 } else if (isDNASequence(str1) && isDNASequence(str2)) { 283 DNASequence s1 = getDNASequence(str1); 284 DNASequence s2 = getDNASequence(str2); 285 seq1Length = s1.getLength(); 286 seq2Length = s2.getLength(); 287 288 pair = alignDNA(s1,s2); 289 290 } else if (isRNASequence(str1) && isRNASequence(str2)) { 291 RNASequence s1 = getRNASequence(str1); 292 RNASequence s2 = getRNASequence(str2); 293 seq1Length = s1.getLength(); 294 seq2Length = s2.getLength(); 295 296 pair = alignRNA(s1,s2); 297 298 } else { 299 logger.debug("Chains {},{} are either different kind of polymers or could not be recognized as protein or nucleotide polymers"); 300 continue; 301 } 302 303 304 int numGaps = getNumGaps(pair); 305 int numGaps1 = getNumGapsQuery(pair); 306 int numGaps2 = getNumGapsTarget(pair); 307 308 int nonGaps = pair.getLength() - numGaps; 309 310 double identity = (double)pair.getNumIdenticals()/(double)nonGaps; 311 double gapCov1 = (double) numGaps1 / (double) seq1Length; 312 double gapCov2 = (double) numGaps2 / (double) seq2Length; 313 314 logger.debug("Alignment for chain pair {},{}: identity: {}, gap coverage 1: {}, gap coverage 2: {}", 315 c1.getId(), c2.getId(), String.format("%4.2f",identity), String.format("%4.2f",gapCov1), String.format("%4.2f",gapCov2)); 316 logger.debug("\n"+pair.toString(100)); 317 318 if (identity > IDENTITY_THRESHOLD && gapCov1<GAP_COVERAGE_THRESHOLD && gapCov2<GAP_COVERAGE_THRESHOLD) { 319 if ( !chainIds2entities.containsKey(c1.getId()) && 320 !chainIds2entities.containsKey(c2.getId())) { 321 logger.debug("Creating Compound with chains {},{}",c1.getId(),c2.getId()); 322 323 EntityInfo ent = new EntityInfo(); 324 ent.addChain(c1); 325 ent.addChain(c2); 326 ent.setMolId(molId++); 327 ent.setType(EntityType.POLYMER); 328 c1.setEntityInfo(ent); 329 c2.setEntityInfo(ent); 330 chainIds2entities.put(c1.getId(), ent); 331 chainIds2entities.put(c2.getId(), ent); 332 333 334 } else { 335 EntityInfo ent = chainIds2entities.get(c1.getId()); 336 337 if (ent==null) { 338 logger.debug("Adding chain {} to entity {}",c1.getId(),c2.getId()); 339 ent = chainIds2entities.get(c2.getId()); 340 ent.addChain(c1); 341 c1.setEntityInfo(ent); 342 chainIds2entities.put(c1.getId(), ent); 343 344 } else { 345 logger.debug("Adding chain {} to entity {}",c2.getId(),c1.getId()); 346 ent.addChain(c2); 347 c2.setEntityInfo(ent); 348 chainIds2entities.put(c2.getId(), ent); 349 350 } 351 } 352 if (!areResNumbersAligned(c1, c2)) { 353 logger.warn("Including 100% identical chains {},{} in same Compound, although they have misaligned residue numbers", 354 c1.getId(),c2.getId()); 355 } 356 } 357 358 if (identity>1) { 359 logger.warn("Identity for chains {},{} above 1. {} identicals out of {} non-gap-aligned residues (identity {})", 360 c1.getId(),c2.getId(),pair.getNumIdenticals(),nonGaps,identity); 361 logger.warn("\n"+pair.toString(100)); 362 } 363 364 if (chainIds2entities.size()==polyChainIndices.size()) // we've got all chains in entities 365 break outer; 366 } 367 } 368 369 // anything not in an Entity will be its own Entity 370 for (int i:polyChainIndices) { 371 Chain c = polyModels.get(0).get(i); 372 if (!chainIds2entities.containsKey(c.getId())) { 373 logger.debug("Creating a 1-member Compound for chain {}",c.getId()); 374 375 EntityInfo ent = new EntityInfo(); 376 ent.addChain(c); 377 ent.setMolId(molId++); 378 ent.setType(EntityType.POLYMER); 379 c.setEntityInfo(ent); 380 381 chainIds2entities.put(c.getId(),ent); 382 } 383 } 384 385 // now that we've found the entities for first model, let's do the same for the rest of the models 386 387 for (int i=1;i<polyModels.size();i++) { 388 for (Chain chain : polyModels.get(i)) { 389 EntityInfo e = chainIds2entities.get(chain.getId()); 390 chain.setEntityInfo(e); 391 e.addChain(chain); 392 } 393 } 394 395 396 return chainIds2entities; 397 } 398 399 private static SequencePair<ProteinSequence, AminoAcidCompound> alignProtein(ProteinSequence s1, ProteinSequence s2) { 400 SubstitutionMatrix<AminoAcidCompound> matrix = SubstitutionMatrixHelper.getIdentity(); 401 402 GapPenalty penalty = new SimpleGapPenalty(8, 1); 403 404 PairwiseSequenceAligner<ProteinSequence, AminoAcidCompound> nw = 405 Alignments.getPairwiseAligner(s1, s2, PairwiseSequenceAlignerType.GLOBAL, penalty, matrix); 406 407 return nw.getPair(); 408 } 409 410 private static SequencePair<DNASequence, NucleotideCompound> alignDNA(DNASequence s1, DNASequence s2) { 411 SubstitutionMatrix<NucleotideCompound> matrix = SubstitutionMatrixHelper.getNuc4_4(); 412 413 GapPenalty penalty = new SimpleGapPenalty(8, 1); 414 415 PairwiseSequenceAligner<DNASequence, NucleotideCompound> nw = 416 Alignments.getPairwiseAligner(s1, s2, PairwiseSequenceAlignerType.GLOBAL, penalty, matrix); 417 418 return nw.getPair(); 419 } 420 421 private static SequencePair<RNASequence, NucleotideCompound> alignRNA(RNASequence s1, RNASequence s2) { 422 SubstitutionMatrix<NucleotideCompound> matrix = SubstitutionMatrixHelper.getNuc4_4(); 423 424 GapPenalty penalty = new SimpleGapPenalty(8, 1); 425 426 PairwiseSequenceAligner<RNASequence, NucleotideCompound> nw = 427 Alignments.getPairwiseAligner(s1, s2, PairwiseSequenceAlignerType.GLOBAL, penalty, matrix); 428 429 return nw.getPair(); 430 } 431 432 433 434 private static int getNumGaps(SequencePair<?, ?> pair) { 435 int numGaps = 0; 436 for (int alignmentIndex=1;alignmentIndex<=pair.getLength();alignmentIndex++) { 437 if (pair.hasGap(alignmentIndex)) numGaps++; 438 } 439 return numGaps; 440 } 441 442 private static int getNumGapsQuery(SequencePair<?, ?> pair) { 443 int numGaps = 0; 444 for (int alignmentIndex=1;alignmentIndex<=pair.getLength();alignmentIndex++) { 445 if (pair.getCompoundInQueryAt(alignmentIndex).getShortName().equals("-")) { 446 numGaps++; 447 } 448 } 449 return numGaps; 450 } 451 452 private static int getNumGapsTarget(SequencePair<?, ?> pair) { 453 int numGaps = 0; 454 for (int alignmentIndex=1;alignmentIndex<=pair.getLength();alignmentIndex++) { 455 if (pair.getCompoundInTargetAt(alignmentIndex).getShortName().equals("-")) { 456 numGaps++; 457 } 458 } 459 return numGaps; 460 } 461 462 private static boolean isProteinSequence(String str) { 463 try { 464 new ProteinSequence(str); 465 } catch (CompoundNotFoundException e) { 466 467 return false; 468 } 469 return true; 470 } 471 472 private static boolean isDNASequence(String str) { 473 try { 474 new DNASequence(str); 475 } catch (CompoundNotFoundException e) { 476 477 return false; 478 } 479 return true; 480 } 481 482 private static boolean isRNASequence(String str) { 483 try { 484 new RNASequence(str); 485 } catch (CompoundNotFoundException e) { 486 487 return false; 488 } 489 return true; 490 491 } 492 493 /** 494 * Returns the ProteinSequence or null if one can't be created 495 * @param str 496 * @return 497 */ 498 private static ProteinSequence getProteinSequence(String str) { 499 try { 500 ProteinSequence s = new ProteinSequence(str); 501 return s; 502 } catch (CompoundNotFoundException e) { 503 504 logger.error("Unexpected error when creating ProteinSequence",e); 505 } 506 return null; 507 } 508 509 /** 510 * Returns the DNASequence or null if one can't be created 511 * @param str 512 * @return 513 */ 514 private static DNASequence getDNASequence(String str) { 515 try { 516 DNASequence s = new DNASequence(str); 517 return s; 518 } catch (CompoundNotFoundException e) { 519 520 logger.error("Unexpected error when creating DNASequence ",e); 521 } 522 return null; 523 } 524 525 /** 526 * Returns the RNASequence or null if one can't be created 527 * @param str 528 * @return 529 */ 530 private static RNASequence getRNASequence(String str) { 531 try { 532 RNASequence s = new RNASequence(str); 533 return s; 534 } catch (CompoundNotFoundException e) { 535 536 logger.error("Unexpected error when creating RNASequence ",e); 537 } 538 return null; 539 } 540 541}