001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021package org.biojava.nbio.structure.io; 022 023import org.biojava.nbio.alignment.Alignments; 024import org.biojava.nbio.alignment.Alignments.PairwiseSequenceAlignerType; 025import org.biojava.nbio.alignment.SimpleGapPenalty; 026import org.biojava.nbio.core.alignment.matrices.SubstitutionMatrixHelper; 027import org.biojava.nbio.alignment.template.GapPenalty; 028import org.biojava.nbio.alignment.template.PairwiseSequenceAligner; 029import org.biojava.nbio.core.alignment.template.SequencePair; 030import org.biojava.nbio.core.alignment.template.SubstitutionMatrix; 031import org.biojava.nbio.core.exceptions.CompoundNotFoundException; 032import org.biojava.nbio.core.sequence.DNASequence; 033import org.biojava.nbio.core.sequence.ProteinSequence; 034import org.biojava.nbio.core.sequence.RNASequence; 035import org.biojava.nbio.core.sequence.compound.AminoAcidCompound; 036import org.biojava.nbio.core.sequence.compound.NucleotideCompound; 037import org.biojava.nbio.structure.*; 038import org.slf4j.Logger; 039import org.slf4j.LoggerFactory; 040 041import java.util.ArrayList; 042import java.util.Collections; 043import java.util.Comparator; 044import java.util.HashMap; 045import java.util.List; 046import java.util.Set; 047import java.util.TreeMap; 048import java.util.TreeSet; 049 050/** 051 * Heuristical finding of Entities (called Compounds in legacy PDB format) 052 * in a given Structure. Entities are the groups of sequence identical NCS-related polymer chains 053 * in the Structure. 054 * 055 * This is related to {@link SeqRes2AtomAligner} but it is intended for raw PDB/mmCIF files where 056 * possibly no SEQRES is given. 057 * 058 * @author Jose Duarte 059 */ 060public class EntityFinder { 061 062 private static final Logger logger = LoggerFactory.getLogger(EntityFinder.class); 063 064 /** 065 * Above this ratio of mismatching residue types for same residue numbers we 066 * consider the 2 chains to have mismatching residue numbers and warn about it 067 */ 068 public static final double RATIO_GAPS_FOR_MISMATCH = 0.50; 069 070 /** 071 * Identity value for 2 chains to be considered part of same entity 072 */ 073 public static final double IDENTITY_THRESHOLD = 0.99999; 074 075 /** 076 * Gap coverage value (num gaps over length of sequence) for each chain of the match: 077 * 2 chains with more gap coverage than this value will not be considered part of the same entity 078 */ 079 public static final double GAP_COVERAGE_THRESHOLD = 0.3; 080 081 082 083 084 /** 085 * Utility method that employs some heuristics to find the {@link EntityInfo}s 086 * for the polymeric chains given in constructor. 087 * To be used in case the information is missing in PDB/mmCIF file 088 * @return 089 */ 090 public static List<EntityInfo> findPolyEntities(List<List<Chain>> polyModels) { 091 TreeMap<String,EntityInfo> chainIds2entities = findEntitiesFromAlignment(polyModels); 092 093 return findUniqueEntities(chainIds2entities); 094 } 095 096 /** 097 * Utility method to obtain a list of unique entities from the chainIds2entities map 098 * @return 099 */ 100 private static List<EntityInfo> findUniqueEntities(TreeMap<String,EntityInfo> chainIds2entities) { 101 102 List<EntityInfo> list = new ArrayList<>(); 103 104 for (EntityInfo cluster:chainIds2entities.values()) { 105 boolean present = false; 106 for (EntityInfo cl:list) { 107 if (cl==cluster) { 108 present = true; 109 break; 110 } 111 } 112 if (!present) list.add(cluster); 113 } 114 return list; 115 } 116 117 /** 118 * Given all chains of all models find entities for the nonpolymers and water chains within them, 119 * assigning entity ids, types and descriptions to them. The result is written back to the passed entities List. 120 * @param nonPolyModels 121 * @param waterModels 122 * @param entities 123 */ 124 public static void createPurelyNonPolyEntities(List<List<Chain>> nonPolyModels, List<List<Chain>> waterModels, List<EntityInfo> entities) { 125 126 if (nonPolyModels.isEmpty()) return; 127 128 // let's find first the max entity id to assign entity ids to the newly found entities 129 int maxMolId = 0; 130 if (!entities.isEmpty()) { 131 maxMolId = Collections.max(entities, Comparator.comparingInt(EntityInfo::getMolId)).getMolId(); 132 } 133 // we go one over the max 134 int molId = maxMolId + 1; 135 136 if (!nonPolyModels.get(0).isEmpty()) { 137 List<EntityInfo> nonPolyEntities = new ArrayList<>(); 138 139 for (List<Chain> model:nonPolyModels) { 140 for (Chain c: model) { 141 // we assume there's only 1 group per non-poly chain 142 String molecPdbName = c.getAtomGroup(0).getPDBName(); 143 144 EntityInfo nonPolyEntity = findNonPolyEntityWithDescription(molecPdbName, nonPolyEntities); 145 if (nonPolyEntity == null) { 146 nonPolyEntity = new EntityInfo(); 147 nonPolyEntity.setDescription(molecPdbName); 148 nonPolyEntity.setType(EntityType.NONPOLYMER); 149 nonPolyEntity.setMolId(molId++); 150 nonPolyEntities.add(nonPolyEntity); 151 152 } 153 nonPolyEntity.addChain(c); 154 c.setEntityInfo(nonPolyEntity); 155 } 156 } 157 entities.addAll(nonPolyEntities); 158 } 159 160 if (!waterModels.get(0).isEmpty()) { 161 EntityInfo waterEntity = new EntityInfo(); 162 waterEntity.setType(EntityType.WATER); 163 waterEntity.setDescription("water"); 164 waterEntity.setMolId(molId); 165 166 for (List<Chain> model:waterModels) { 167 for (Chain waterChain:model) { 168 waterEntity.addChain(waterChain); 169 waterChain.setEntityInfo(waterEntity); 170 } 171 } 172 entities.add(waterEntity); 173 174 } 175 176 } 177 178 private static EntityInfo findNonPolyEntityWithDescription(String description, List<EntityInfo> nonPolyEntities) { 179 for (EntityInfo e:nonPolyEntities) { 180 if (e.getDescription().equals(description)) return e; 181 } 182 return null; 183 } 184 185 186 private static boolean areResNumbersAligned(Chain c1, Chain c2) { 187 188 boolean isC1prot = c1.isProtein(); 189 boolean isC2prot = c2.isProtein(); 190 191 // different kind of chain: we won't try to align them 192 if (isC1prot != isC2prot ) return false; 193 194 List<Group> c1AtomGroups = null; 195 if (isC1prot) { 196 c1AtomGroups = c1.getAtomGroups(GroupType.AMINOACID); 197 } 198 else { 199 c1AtomGroups = c1.getAtomGroups(GroupType.NUCLEOTIDE); 200 } 201 202 int countNonExisting = 0; 203 204 for (Group g1:c1AtomGroups) { 205 try { 206 Group g2 = c2.getGroupByPDB(g1.getResidueNumber()); 207 if (!g2.getPDBName().equals(g1.getPDBName())) { 208 logger.debug("Mismatch of residues between chains {},{} for residue number {}: {} {}", 209 c1.getId(),c2.getId(),g1.getResidueNumber(), g1.getPDBName(), g2.getPDBName()); 210 return false; 211 } 212 } catch (StructureException e) { 213 // the group doesn't exist (no density) in the chain, go on 214 countNonExisting++; 215 } 216 } 217 218 if ((double)countNonExisting/(double)c1AtomGroups.size() > RATIO_GAPS_FOR_MISMATCH) { 219 logger.debug("More than {} of the residues ({} out of {}) are not present in chain {} when comparing by residue numbers to chain {}.", 220 RATIO_GAPS_FOR_MISMATCH, countNonExisting, c1AtomGroups.size(), c2.getId(), c1.getId()); 221 return false; 222 } 223 224 return true; 225 } 226 227 228 229 private static TreeMap<String,EntityInfo> findEntitiesFromAlignment(List<List<Chain>> polyModels) { 230 231 TreeMap<String, EntityInfo> chainIds2entities = new TreeMap<>(); 232 233 if (polyModels.isEmpty()) return chainIds2entities; 234 235 Set<Integer> polyChainIndices = new TreeSet<>(); 236 for (int i=0;i<polyModels.get(0).size();i++) { 237 polyChainIndices.add(i); 238 } 239 240 int molId = 1; 241 242 outer: 243 for (int i:polyChainIndices) { 244 Chain c1 = polyModels.get(0).get(i); 245 // here we use false, which means that X will be used for unknown compounds 246 String str1 = SeqRes2AtomAligner.getFullAtomSequence(c1.getAtomGroups(), new HashMap<>(), false); 247 248 for (int j:polyChainIndices) { 249 250 if (j<=i) continue; 251 252 Chain c2 = polyModels.get(0).get(j); 253 String str2 = SeqRes2AtomAligner.getFullAtomSequence(c2.getAtomGroups(), new HashMap<>(), false); 254 255 int seq1Length = 0; 256 int seq2Length = 0; 257 258 SequencePair<?,?> pair; 259 if (isProteinSequence(str1) && isProteinSequence(str2)) { 260 ProteinSequence s1 = getProteinSequence(str1); 261 ProteinSequence s2 = getProteinSequence(str2); 262 seq1Length = s1.getLength(); 263 seq2Length = s2.getLength(); 264 265 pair = alignProtein(s1,s2); 266 267 } else if (isDNASequence(str1) && isDNASequence(str2)) { 268 DNASequence s1 = getDNASequence(str1); 269 DNASequence s2 = getDNASequence(str2); 270 seq1Length = s1.getLength(); 271 seq2Length = s2.getLength(); 272 273 pair = alignDNA(s1,s2); 274 275 } else if (isRNASequence(str1) && isRNASequence(str2)) { 276 RNASequence s1 = getRNASequence(str1); 277 RNASequence s2 = getRNASequence(str2); 278 seq1Length = s1.getLength(); 279 seq2Length = s2.getLength(); 280 281 pair = alignRNA(s1,s2); 282 283 } else { 284 logger.debug("Chains {},{} are either different kind of polymers or could not be recognized as protein or nucleotide polymers", c1.getId(), c2.getId()); 285 continue; 286 } 287 288 int numGaps = getNumGaps(pair); 289 int numGaps1 = getNumGapsQuery(pair); 290 int numGaps2 = getNumGapsTarget(pair); 291 292 int nonGaps = pair.getLength() - numGaps; 293 294 double identity = (double)pair.getNumIdenticals()/(double)nonGaps; 295 double gapCov1 = (double) numGaps1 / (double) seq1Length; 296 double gapCov2 = (double) numGaps2 / (double) seq2Length; 297 298 logger.debug("Alignment for chain pair {},{}: identity: {}, gap coverage 1: {}, gap coverage 2: {}", 299 c1.getId(), c2.getId(), String.format("%4.2f",identity), String.format("%4.2f",gapCov1), String.format("%4.2f",gapCov2)); 300 logger.debug("\n"+pair.toString(100)); 301 302 if (identity > IDENTITY_THRESHOLD && gapCov1<GAP_COVERAGE_THRESHOLD && gapCov2<GAP_COVERAGE_THRESHOLD) { 303 if ( !chainIds2entities.containsKey(c1.getId()) && 304 !chainIds2entities.containsKey(c2.getId())) { 305 logger.debug("Creating entity with chains {},{}",c1.getId(),c2.getId()); 306 307 EntityInfo ent = new EntityInfo(); 308 ent.addChain(c1); 309 ent.addChain(c2); 310 ent.setMolId(molId++); 311 ent.setType(EntityType.POLYMER); 312 c1.setEntityInfo(ent); 313 c2.setEntityInfo(ent); 314 chainIds2entities.put(c1.getId(), ent); 315 chainIds2entities.put(c2.getId(), ent); 316 317 } else { 318 Chain chainToAdd; 319 EntityInfo ent = chainIds2entities.get(c1.getId()); 320 if (ent==null) { 321 ent = chainIds2entities.get(c2.getId()); 322 chainToAdd = c1; 323 } else { 324 chainToAdd = c2; 325 } 326 if (!chainIds2entities.containsKey(chainToAdd.getId())) { 327 logger.debug("Adding chain {} to entity {}", chainToAdd.getId(), ent.getId()); 328 ent.addChain(chainToAdd); 329 chainToAdd.setEntityInfo(ent); 330 chainIds2entities.put(chainToAdd.getId(), ent); 331 } 332 } 333 if (!areResNumbersAligned(c1, c2)) { 334 logger.warn("Including 100% identical chains {},{} in same entity, although they have misaligned residue numbers", 335 c1.getId(),c2.getId()); 336 } 337 } 338 339 if (identity>1) { 340 logger.warn("Identity for chains {},{} above 1. {} identicals out of {} non-gap-aligned residues (identity {})", 341 c1.getId(),c2.getId(),pair.getNumIdenticals(),nonGaps,identity); 342 logger.warn("\n"+pair.toString(100)); 343 } 344 345 if (chainIds2entities.size()==polyChainIndices.size()) // we've got all chains in entities 346 break outer; 347 } 348 } 349 350 // anything not in an Entity will be its own Entity 351 for (int i:polyChainIndices) { 352 Chain c = polyModels.get(0).get(i); 353 if (!chainIds2entities.containsKey(c.getId())) { 354 logger.debug("Creating a 1-member entity for chain {}",c.getId()); 355 356 EntityInfo ent = new EntityInfo(); 357 ent.addChain(c); 358 ent.setMolId(molId++); 359 ent.setType(EntityType.POLYMER); 360 c.setEntityInfo(ent); 361 362 chainIds2entities.put(c.getId(),ent); 363 } 364 } 365 366 // now that we've found the entities for first model, let's do the same for the rest of the models 367 368 for (int i=1;i<polyModels.size();i++) { 369 for (Chain chain : polyModels.get(i)) { 370 EntityInfo e = chainIds2entities.get(chain.getId()); 371 chain.setEntityInfo(e); 372 e.addChain(chain); 373 } 374 } 375 376 return chainIds2entities; 377 } 378 379 private static SequencePair<ProteinSequence, AminoAcidCompound> alignProtein(ProteinSequence s1, ProteinSequence s2) { 380 SubstitutionMatrix<AminoAcidCompound> matrix = SubstitutionMatrixHelper.getIdentity(); 381 382 GapPenalty penalty = new SimpleGapPenalty(8, 1); 383 384 PairwiseSequenceAligner<ProteinSequence, AminoAcidCompound> nw = 385 Alignments.getPairwiseAligner(s1, s2, PairwiseSequenceAlignerType.GLOBAL, penalty, matrix); 386 387 return nw.getPair(); 388 } 389 390 private static SequencePair<DNASequence, NucleotideCompound> alignDNA(DNASequence s1, DNASequence s2) { 391 SubstitutionMatrix<NucleotideCompound> matrix = SubstitutionMatrixHelper.getNuc4_4(); 392 393 GapPenalty penalty = new SimpleGapPenalty(8, 1); 394 395 PairwiseSequenceAligner<DNASequence, NucleotideCompound> nw = 396 Alignments.getPairwiseAligner(s1, s2, PairwiseSequenceAlignerType.GLOBAL, penalty, matrix); 397 398 return nw.getPair(); 399 } 400 401 private static SequencePair<RNASequence, NucleotideCompound> alignRNA(RNASequence s1, RNASequence s2) { 402 SubstitutionMatrix<NucleotideCompound> matrix = SubstitutionMatrixHelper.getNuc4_4(); 403 404 GapPenalty penalty = new SimpleGapPenalty(8, 1); 405 406 PairwiseSequenceAligner<RNASequence, NucleotideCompound> nw = 407 Alignments.getPairwiseAligner(s1, s2, PairwiseSequenceAlignerType.GLOBAL, penalty, matrix); 408 409 return nw.getPair(); 410 } 411 412 413 414 private static int getNumGaps(SequencePair<?, ?> pair) { 415 int numGaps = 0; 416 for (int alignmentIndex=1;alignmentIndex<=pair.getLength();alignmentIndex++) { 417 if (pair.hasGap(alignmentIndex)) numGaps++; 418 } 419 return numGaps; 420 } 421 422 private static int getNumGapsQuery(SequencePair<?, ?> pair) { 423 int numGaps = 0; 424 for (int alignmentIndex=1;alignmentIndex<=pair.getLength();alignmentIndex++) { 425 if (pair.getCompoundInQueryAt(alignmentIndex).getShortName().equals("-")) { 426 numGaps++; 427 } 428 } 429 return numGaps; 430 } 431 432 private static int getNumGapsTarget(SequencePair<?, ?> pair) { 433 int numGaps = 0; 434 for (int alignmentIndex=1;alignmentIndex<=pair.getLength();alignmentIndex++) { 435 if (pair.getCompoundInTargetAt(alignmentIndex).getShortName().equals("-")) { 436 numGaps++; 437 } 438 } 439 return numGaps; 440 } 441 442 private static boolean isProteinSequence(String str) { 443 try { 444 new ProteinSequence(str); 445 } catch (CompoundNotFoundException e) { 446 447 return false; 448 } 449 return true; 450 } 451 452 private static boolean isDNASequence(String str) { 453 try { 454 new DNASequence(str); 455 } catch (CompoundNotFoundException e) { 456 457 return false; 458 } 459 return true; 460 } 461 462 private static boolean isRNASequence(String str) { 463 try { 464 new RNASequence(str); 465 } catch (CompoundNotFoundException e) { 466 467 return false; 468 } 469 return true; 470 471 } 472 473 /** 474 * Returns the ProteinSequence or null if one can't be created 475 * @param str 476 * @return 477 */ 478 private static ProteinSequence getProteinSequence(String str) { 479 try { 480 ProteinSequence s = new ProteinSequence(str); 481 return s; 482 } catch (CompoundNotFoundException e) { 483 484 logger.error("Unexpected error when creating ProteinSequence",e); 485 } 486 return null; 487 } 488 489 /** 490 * Returns the DNASequence or null if one can't be created 491 * @param str 492 * @return 493 */ 494 private static DNASequence getDNASequence(String str) { 495 try { 496 DNASequence s = new DNASequence(str); 497 return s; 498 } catch (CompoundNotFoundException e) { 499 500 logger.error("Unexpected error when creating DNASequence ",e); 501 } 502 return null; 503 } 504 505 /** 506 * Returns the RNASequence or null if one can't be created 507 * @param str 508 * @return 509 */ 510 private static RNASequence getRNASequence(String str) { 511 try { 512 RNASequence s = new RNASequence(str); 513 return s; 514 } catch (CompoundNotFoundException e) { 515 516 logger.error("Unexpected error when creating RNASequence ",e); 517 } 518 return null; 519 } 520 521}