001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 * Created on Aug 23, 2007 021 * 022 */ 023 024package org.biojava.nbio.structure.io; 025 026 027import java.util.ArrayList; 028import java.util.HashMap; 029import java.util.Iterator; 030import java.util.List; 031import java.util.Map; 032 033import org.biojava.nbio.alignment.Alignments; 034import org.biojava.nbio.alignment.Alignments.PairwiseSequenceAlignerType; 035import org.biojava.nbio.alignment.SimpleGapPenalty; 036import org.biojava.nbio.alignment.template.GapPenalty; 037import org.biojava.nbio.alignment.template.PairwiseSequenceAligner; 038import org.biojava.nbio.core.alignment.matrices.SubstitutionMatrixHelper; 039import org.biojava.nbio.core.alignment.template.SequencePair; 040import org.biojava.nbio.core.alignment.template.SubstitutionMatrix; 041import org.biojava.nbio.core.exceptions.CompoundNotFoundException; 042import org.biojava.nbio.core.sequence.DNASequence; 043import org.biojava.nbio.core.sequence.ProteinSequence; 044import org.biojava.nbio.core.sequence.RNASequence; 045import org.biojava.nbio.core.sequence.compound.AmbiguityDNACompoundSet; 046import org.biojava.nbio.core.sequence.compound.AmbiguityDNARNAHybridCompoundSet; 047import org.biojava.nbio.core.sequence.compound.AmbiguityRNACompoundSet; 048import org.biojava.nbio.core.sequence.compound.AminoAcidCompound; 049import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet; 050import org.biojava.nbio.core.sequence.compound.DNACompoundSet; 051import org.biojava.nbio.core.sequence.compound.NucleotideCompound; 052import org.biojava.nbio.core.sequence.template.Compound; 053import org.biojava.nbio.core.sequence.template.Sequence; 054import org.biojava.nbio.structure.AminoAcid; 055import org.biojava.nbio.structure.Atom; 056import org.biojava.nbio.structure.Chain; 057import org.biojava.nbio.structure.Group; 058import org.biojava.nbio.structure.GroupType; 059import org.biojava.nbio.structure.HetatomImpl; 060import org.biojava.nbio.structure.NucleotideImpl; 061import org.biojava.nbio.structure.ResidueNumber; 062import org.biojava.nbio.structure.Structure; 063import org.biojava.nbio.structure.io.mmcif.chem.PolymerType; 064import org.biojava.nbio.structure.io.mmcif.chem.ResidueType; 065import org.biojava.nbio.structure.io.mmcif.model.ChemComp; 066import org.slf4j.Logger; 067import org.slf4j.LoggerFactory; 068 069 070/** 071 * Aligns the SEQRES residues to the ATOM residues. 072 * The AminoAcids that can be matched between the two of them will be set in the SEQRES 073 * chains 074 * 075 * 076 * @author Andreas Prlic 077 * @author Jose Duarte 078 */ 079public class SeqRes2AtomAligner { 080 081 private static final Logger logger = LoggerFactory.getLogger(SeqRes2AtomAligner.class); 082 083 084 085 private String alignmentString; 086 087 public SeqRes2AtomAligner(){ 088 logger.debug("initialising SeqRes2AtomAligner"); 089 alignmentString = ""; 090 } 091 092 public String getAlignmentString() { 093 return alignmentString; 094 } 095 096 public static Chain getMatchingAtomRes(Chain seqRes, List<Chain> atomList) 097 { 098 Iterator<Chain> iter = atomList.iterator(); 099 while(iter.hasNext()){ 100 Chain atomChain = iter.next(); 101 if ( atomChain.getChainID().equals(seqRes.getChainID())){ 102 return atomChain; 103 } 104 } 105 106 logger.info("Could not match SEQRES chainID >" + seqRes.getChainID() + "< to ATOM chains!, size of atom chain: " + atomList.size()); 107 return null; 108 } 109 110 111 112 113 public void align(Structure s, List<Chain> seqResList){ 114 115 List<Chain> atomList = s.getModel(0); 116 117 118 for (Chain seqRes: seqResList){ 119 120 Chain atomRes = getMatchingAtomRes(seqRes,atomList); 121 if ( atomRes == null) 122 continue; 123 124 mapSeqresRecords(atomRes,seqRes); 125 126 127 } 128 129 } 130 131 /** 132 * Map the seqRes groups to the atomRes chain. 133 * Updates the atomRes chain object with the mapped data 134 * The seqRes chain should not be needed after this and atomRes should be further used. 135 * 136 * @param atomRes the chain containing ATOM groups (in atomGroups slot). This chain 137 * is modified to contain in its seqresGroups slot the mapped atom groups 138 * @param seqRes the chain containing SEQRES groups (in atomGroups slot). This chain 139 * is not modified 140 */ 141 public void mapSeqresRecords(Chain atomRes, Chain seqRes) { 142 List<Group> seqResGroups = seqRes.getAtomGroups(); 143 List<Group> atmResGroups = atomRes.getAtomGroups(); 144 145 146 147 logger.debug("Comparing ATOM {} ({} groups) to SEQRES {} ({} groups) ", 148 atomRes.getChainID(), atmResGroups.size(), seqRes.getChainID(), seqResGroups.size()); 149 150 151 List<Group> matchedGroups = trySimpleMatch(seqResGroups, atmResGroups); 152 153 if ( matchedGroups != null) { 154 // update the new SEQRES list 155 atomRes.setSeqResGroups(matchedGroups); 156 return; 157 } 158 159 logger.debug("Could not map SEQRES to ATOM records easily, need to align..."); 160 161 int numAminosSeqres = seqRes.getAtomGroups(GroupType.AMINOACID).size(); 162 int numNucleotidesSeqres = seqRes.getAtomGroups(GroupType.NUCLEOTIDE).size(); 163 164 if ( numAminosSeqres < 1) { 165 166 if ( numNucleotidesSeqres > 1) { 167 168 logger.debug("SEQRES chain {} is a nucleotide chain ({} nucleotides), aligning nucleotides...", seqRes.getChainID(), numNucleotidesSeqres); 169 170 alignNucleotideChains(seqRes,atomRes); 171 return; 172 } else { 173 174 logger.debug("SEQRES chain {} contains {} amino acids and {} nucleotides, ignoring...", seqRes.getChainID(),numAminosSeqres,numNucleotidesSeqres); 175 176 return; 177 } 178 } 179 180 if ( atomRes.getAtomGroups(GroupType.AMINOACID).size() < 1) { 181 logger.debug("ATOM chain {} does not contain amino acids, ignoring...", atomRes.getChainID()); 182 return; 183 } 184 185 logger.debug("Proceeding to do protein alignment for chain {}", atomRes.getChainID() ); 186 187 188 boolean noMatchFound = alignProteinChains(seqResGroups,atomRes.getAtomGroups()); 189 if ( ! noMatchFound){ 190 atomRes.setSeqResGroups(seqResGroups); 191 } 192 193 } 194 195 private void alignNucleotideChains(Chain seqRes, Chain atomRes) { 196 197 if ( atomRes.getAtomGroups(GroupType.NUCLEOTIDE).size() < 1) { 198 logger.debug("ATOM chain {} does not contain nucleotides, ignoring...", atomRes.getChainID()); 199 200 return; 201 } 202 logger.debug("Alignment for chain {}", atomRes.getChainID() ); 203 204 List<Group> seqResGroups = seqRes.getAtomGroups(); 205 boolean noMatchFound = alignNucleotideGroups(seqResGroups,atomRes.getAtomGroups()); 206 if ( ! noMatchFound){ 207 atomRes.setSeqResGroups(seqResGroups); 208 } 209 210 } 211 212 /** 213 * A simple matching approach that tries to do a 1:1 mapping between SEQRES and ATOM records 214 * 215 * @param seqRes 216 * @param atomList 217 * @return the matching or null if the matching didn't work 218 */ 219 private List<Group> trySimpleMatch(List<Group> seqResGroups,List<Group> atmResGroups) { 220 // by default first ATOM position is 1 221 // 222 223 @SuppressWarnings("unchecked") 224 List<Group> newSeqResGroups = (ArrayList<Group>)((ArrayList<Group>)seqResGroups).clone(); 225 226 boolean startAt1 = true; 227 228 for ( int atomResPos = 0 ; atomResPos < atmResGroups.size() ; atomResPos++){ 229 230 // let's try to match this case 231 Group atomResGroup = atmResGroups.get(atomResPos); 232 233 // let's ignore waters 234 if ( atomResGroup.isWater()){ 235 continue; 236 } 237 238 ResidueNumber atomResNum = atomResGroup.getResidueNumber(); 239 240 int seqResPos = atomResNum.getSeqNum(); 241 242 243 244 if ( seqResPos < 0) { 245 logger.debug("ATOM residue number < 0 : {}", seqResPos); 246 return null; 247 } 248 249 if ( seqResPos == 0){ 250 // make sure the first SEQRES is matching. 251 Group seqResGroup = seqResGroups.get(0); 252 if ( seqResGroup.getPDBName().equals(atomResGroup.getPDBName())){ 253 // they match! seems in this case the numbering starts with 0... 254 startAt1 = false; 255 } else { 256 257 logger.debug("SEQRES position 1 ({}) does not match ATOM PDB res num 0 ({})", 258 seqResGroup.getPDBName(), atomResGroup.getPDBName()); 259 260 261 return null; 262 263 } 264 } 265 266 267 if ( startAt1 ) 268 seqResPos--; 269 270 // another check that would require the alignment approach 271 if ( startAt1 && seqResPos >= seqResGroups.size() ) 272 { 273 274 // could be a HETATOM... 275 if ( atomResGroup instanceof AminoAcid) { 276 logger.debug(" ATOM residue nr: " + seqResPos + " > seqres! " + seqResGroups.size() + " " + atomResGroup); 277 return null; 278 } else if ( atomResGroup instanceof NucleotideImpl) { 279 logger.debug(" NUCLEOTIDE residue nr: " + seqResPos + " > seqres! " + seqResGroups.size() + " " + atomResGroup); 280 return null; 281 } else { 282 // we won't map HETATOM groups... 283 continue; 284 } 285 } 286 287 288 // if ( seqResPos < 0){ 289 // 290 // System.err.println("What is going on??? " + atomRes.getChainID() + " " + atomResGroup); 291 // } 292 293 if ( seqResPos >= seqResGroups.size()){ 294 logger.debug("seqres groups don't match atom indices " + seqResPos); 295 if ( atomResGroup instanceof AminoAcid ) 296 return null; 297 else 298 continue; 299 } 300 301 Group seqResGroup = seqResGroups.get(seqResPos ); 302 303 if ( ! seqResGroup.getPDBName().trim().equals(atomResGroup.getPDBName().trim())){ 304 // a mismatch! something is wrong in the mapping and we need to do an alignment 305 logger.debug("Mismatch of SEQRES pos " + seqResPos + " and ATOM record: " + atomResGroup + " | " + seqResGroup); 306 return null; 307 } 308 309 // the two groups are identical and we can merge them 310 // replace the SEQRES group with the ATOM group... 311 312 Group replacedGroup = newSeqResGroups.set(seqResPos, atomResGroup); 313 logger.debug("Merging index {}: replaced seqres group {} ({}) with atom group {} ({})", 314 seqResPos, 315 replacedGroup.getResidueNumber(), replacedGroup.getPDBName(), 316 atomResGroup.getResidueNumber(), atomResGroup.getPDBName()); 317 318 } 319 320 // all went ok. copy over the updated list to the original one. 321 // note: if something went wrong, we did not modifiy the original list. 322 //seqResGroups = newSeqResGroups; 323 324 325 // int pos = -1; 326 // for (Group g: seqResGroups){ 327 // pos++; 328 // logger.debug(pos + " " + g); 329 // } 330 331 //System.out.println("I:" + seqResGroups); 332 // all atom records could get matched correctly! 333 return newSeqResGroups; 334 335 } 336 337 /** 338 * Returns the full sequence of the Atom records of a parent 339 * with X instead of HETATMSs. The advantage of this is 340 * that it allows us to also align HETATM groups back to the SEQRES. 341 * @param groups the list of groups in a parent 342 * @param positionIndex a Map to keep track of which group is at which sequence position 343 * @param isNucleotideChain whether the atom groups are predominantly nucleotides (the groups represent a nucleotide chain), if true 344 * non-standard nucleotides will be represented with ambiguous letter 'N' instead of 'X', if false all non-standard residues will be 'X' 345 * @return string representations 346 */ 347 public static String getFullAtomSequence(List<Group> groups, Map<Integer, Integer> positionIndex, boolean isNucleotideChain){ 348 349 StringBuffer sequence = new StringBuffer() ; 350 int seqIndex = 0; // track sequence.length() 351 for ( int i=0 ; i< groups.size(); i++){ 352 Group g = groups.get(i); 353 354 if ( g instanceof AminoAcid ){ 355 AminoAcid a = (AminoAcid)g; 356 char oneLetter =a.getAminoType(); 357 if ( oneLetter == '?') 358 oneLetter = 'X'; 359 360 positionIndex.put(seqIndex,i); 361 sequence.append(oneLetter ); 362 seqIndex++; 363 } else { 364 365 // exclude solvents 366 if ( g.isWater()) 367 continue; 368 369 // exclude metals 370 if ( g.size() == 1 ) { 371 372 Atom a = g.getAtom(0); 373 if ( a == null) 374 continue; 375 if (a.getElement().isMetal()) 376 continue; 377 378 } 379 380 ChemComp cc = g.getChemComp(); 381 if ( cc == null) { 382 logger.debug("No chem comp available for group {}",g.toString()); 383 // not sure what to do in that case! 384 continue; 385 } 386 if ( 387 ResidueType.lPeptideLinking.equals(cc.getResidueType()) || 388 PolymerType.PROTEIN_ONLY.contains(cc.getPolymerType()) || 389 PolymerType.POLYNUCLEOTIDE_ONLY.contains(cc.getPolymerType()) 390 ) { 391 //System.out.println(cc.getOne_letter_code()); 392 String c = cc.getOne_letter_code(); 393 if ( c.equals("?")) { 394 if (isNucleotideChain && PolymerType.POLYNUCLEOTIDE_ONLY.contains(cc.getPolymerType())) { 395 // the way to represent unknown nucleotides is with 'N', see https://en.wikipedia.org/wiki/Nucleic_acid_notation 396 c = "N"; 397 } else { 398 c = "X"; 399 } 400 } 401 402 // For some unusual cases the het residue can map to 2 or more standard aas and thus give an 403 // insertion of length >1. 404 // e.g. 1: SUI maps to DG (in 1oew,A) 405 // e.g. 2: NRQ maps to MYG (in 3cfh,A) 406 if (c.length()>1) { 407 logger.info("Group '{}' maps to more than 1 standard aminoacid: {}.", 408 g.toString(), c); 409 } 410 // because of the mapping to more than 1 aminoacid, we have 411 // to loop through it (99% of cases c will have length 1 anyway) 412 for (int cIdx=0;cIdx<c.length();cIdx++) { 413 positionIndex.put(seqIndex,i); 414 sequence.append(c.charAt(cIdx)); 415 seqIndex++; 416 } 417 } else { 418 logger.debug("Group {} is not lPeptideLinked, nor PROTEIN_ONLY, nor POLYNUCLEOTIDE_ONLY",g.toString()); 419 continue; 420 } 421 422 423 //sequence.append("X"); 424 } 425 426 } 427 428 return sequence.toString(); 429 430 } 431 432 433 private boolean alignNucleotideGroups(List<Group> seqRes, List<Group> atomRes) { 434 435 Map<Integer,Integer> seqresIndexPosition = new HashMap<Integer, Integer>(); 436 Map<Integer,Integer> atomIndexPosition = new HashMap<Integer, Integer>(); 437 438 String seq1 = getFullAtomSequence(seqRes, seqresIndexPosition, true); 439 // 440 String seq2 = getFullAtomSequence(atomRes, atomIndexPosition, true); 441 442 if (seq1.isEmpty() || seq2.isEmpty()) { 443 logger.warn("Could not align nucleotide sequences, at least one of them is empty"); 444 return true; 445 } 446 447 logger.debug("align seq1 ("+ seq1.length()+") " + seq1); 448 logger.debug("align seq2 ("+ seq2.length()+") " + seq2); 449 450 Sequence<NucleotideCompound> s1 = getNucleotideSequence(seq1); 451 Sequence<NucleotideCompound> s2 = getNucleotideSequence(seq2); 452 453 if (s1==null || s2==null) return true; 454 455 if ( ! s1.getCompoundSet().equals(s2.getCompoundSet()) ) { 456 // e.g. trying to align a DNA and an RNA sequence... 457 // test PDB ID: 1A34 458 // try to make both RNA sequence... 459 if ( ! s1.getCompoundSet().equals(AmbiguityRNACompoundSet.getRNACompoundSet())) { 460 try { 461 s1 = new RNASequence(seq1,AmbiguityRNACompoundSet.getRNACompoundSet()); 462 } catch (CompoundNotFoundException ex) { 463 logger.warn("Could not align DNA and RNA compound sets: " + seq1); 464 return true; 465 } 466 } 467 468 if( ! s2.getCompoundSet().equals(AmbiguityRNACompoundSet.getRNACompoundSet())) { 469 try { 470 s2 = new RNASequence(seq2,AmbiguityRNACompoundSet.getRNACompoundSet()); 471 } catch (CompoundNotFoundException ex) { 472 logger.warn("Could not align DNA and RNA compound sets: " + seq2); 473 return true; 474 } 475 } 476 } 477 478 479 SubstitutionMatrix<NucleotideCompound> matrix = SubstitutionMatrixHelper.getNuc4_4(); 480 481 GapPenalty penalty = new SimpleGapPenalty(8,1); 482 483 PairwiseSequenceAligner<Sequence<NucleotideCompound>, NucleotideCompound> smithWaterman = 484 Alignments.getPairwiseAligner(s1, s2, PairwiseSequenceAlignerType.LOCAL, penalty, matrix); 485 486 487 488 SequencePair<Sequence<NucleotideCompound>, NucleotideCompound> pair = smithWaterman.getPair(); 489 490 491 492 if ( pair == null) { 493 logger.warn("Could not align nucleotide sequences. ATOM and SEQRES groups will not be aligned."); 494 logger.warn("Sequences: "); 495 logger.warn(seq1); 496 logger.warn(seq2); 497 return true; 498 499 } 500 501 502 503 logger.debug("Alignment:\n"+pair.toString(100)); 504 505 506 boolean noMatchFound = mapDNAChains(seqRes,atomRes,pair,seqresIndexPosition, atomIndexPosition ); 507 508 return noMatchFound; 509 510 } 511 512 private Sequence<NucleotideCompound> getNucleotideSequence(String seq) { 513 Sequence<NucleotideCompound> s = null; 514 515 // first we try DNA, then RNA, them hybrid 516 517 try { 518 519 s = new DNASequence(seq, AmbiguityDNACompoundSet.getDNACompoundSet()); 520 } catch (CompoundNotFoundException e){ 521 522 try { 523 s= new RNASequence(seq, AmbiguityRNACompoundSet.getRNACompoundSet()); 524 } catch (CompoundNotFoundException ex) { 525 526 try { 527 // it could still be a hybrid, e.g. 3hoz, chain T, what to do in that case? 528 s = new DNASequence(seq, AmbiguityDNARNAHybridCompoundSet.getDNARNAHybridCompoundSet()); 529 logger.warn("Hybrid RNA/DNA sequence detected for sequence {}", seq); 530 } catch (CompoundNotFoundException exc) { 531 // not DNA, nor RNA, nor hybrid 532 logger.warn("Could not determine compound set (neither DNA, RNA nor hybrid) for nucleotide sequence " + seq); 533 return null; 534 } 535 536 } 537 } 538 return s; 539 } 540 541 542 /** 543 * Aligns two chains of groups, where the first parent is representing the 544 * list of amino acids as obtained from the SEQRES records, and the second parent 545 * represents the groups obtained from the ATOM records (and containing the actual ATOM information). 546 * This does the actual alignment and if a group can be mapped to a position in the SEQRES then the corresponding 547 * position is replaced with the group that contains the atoms. 548 * 549 * @param seqRes 550 * @param atomRes 551 * @return true if no match has been found 552 */ 553 private boolean alignProteinChains(List<Group> seqRes, List<Group> atomRes) { 554 555 Map<Integer,Integer> seqresIndexPosition = new HashMap<Integer, Integer>(); 556 Map<Integer,Integer> atomIndexPosition = new HashMap<Integer, Integer>(); 557 558 String seq1 = getFullAtomSequence(seqRes, seqresIndexPosition, false); 559 // 560 String seq2 = getFullAtomSequence(atomRes, atomIndexPosition, false); 561 562 563 logger.debug("Protein seq1 to align (length "+ seq1.length()+"): " + seq1); 564 logger.debug("Protein seq2 to align (length "+ seq2.length()+"): " + seq2); 565 566 ProteinSequence s1; 567 ProteinSequence s2; 568 try { 569 s1 = new ProteinSequence(seq1); 570 s2 = new ProteinSequence(seq2); 571 } catch (CompoundNotFoundException e) { 572 logger.warn("Could not create protein sequences ({}) to align ATOM and SEQRES groups, they will remain unaligned.", e.getMessage()); 573 return true; 574 } 575 576 577 SubstitutionMatrix<AminoAcidCompound> matrix = SubstitutionMatrixHelper.getBlosum65(); 578 579 GapPenalty penalty = new SimpleGapPenalty(8, 1); 580 581 582 PairwiseSequenceAligner<ProteinSequence, AminoAcidCompound> smithWaterman = 583 Alignments.getPairwiseAligner(s1, s2, PairwiseSequenceAlignerType.LOCAL, penalty, matrix); 584 585 SequencePair<ProteinSequence, AminoAcidCompound> pair = smithWaterman.getPair(); 586 587 588 // sequences that are only X (e.g. 1jnv chain A) produced empty alignments, because nothing aligns to nothing and thus the local alignment is empty 589 // to avoid those empty alignments we catch them here with pair.getLength()==0 590 if ( pair == null || pair.getLength()==0) { 591 logger.warn("Could not align protein sequences. ATOM and SEQRES groups will not be aligned."); 592 logger.warn("Sequences: "); 593 logger.warn(seq1); 594 logger.warn(seq2); 595 return true; 596 } 597 598 599 logger.debug("Alignment:\n"+pair.toString(100)); 600 601 602 boolean noMatchFound = mapChains(seqRes,atomRes,pair,seqresIndexPosition, atomIndexPosition ); 603 604 return noMatchFound; 605 606 607 } 608 609 610 private boolean mapChains(List<Group> seqResGroups, List<Group> atomRes, 611 SequencePair<ProteinSequence, AminoAcidCompound> pair, 612 Map<Integer,Integer> seqresIndexPosition, 613 Map<Integer,Integer> atomIndexPosition ) { 614 615 616 617 // at the present stage the future seqRes are still stored as Atom groups in the seqRes parent... 618 619 620 int aligLength = pair.getLength(); 621 622 // make sure we actually find an alignment 623 boolean noMatchFound = true; 624 625 Compound gapSymbol = AminoAcidCompoundSet.getAminoAcidCompoundSet().getCompoundForString("-"); 626 627 mainLoop: 628 for (int i = 1; i <= aligLength ; i++) { 629 630 Compound s = pair.getCompoundAt(1, i); 631 Compound a = pair.getCompoundAt(2, i); 632 633 // alignment is using internal index start at 1... 634 int posSeq = pair.getIndexInQueryAt(i) - 1; 635 int posAtom = pair.getIndexInTargetAt(i) - 1; 636 637 if ( s.equals(gapSymbol) || a.equals(gapSymbol)){ 638 continue; 639 } 640 641 if ( s.equals(a)){ 642 643 // the atom record can be aligned to the SeqRes record! 644 // replace the SeqRes group with the Atom group! 645 646 Group s1 = seqResGroups.get(seqresIndexPosition.get(posSeq)); 647 Group a1 = atomRes.get(atomIndexPosition.get(posAtom)); 648 649 if ( s1 == null || a1 == null){ 650 /// can't map this position... 651 logger.warn("can't map " + i + ":" + s + " " + posSeq +" " + s1 + " atom: " + posAtom + " " + a1 ); 652 continue mainLoop; 653 } 654 655 // need to trim the names to allow matching e.g in 656 // pdb1b2m 657 String pdbNameS = s1.getPDBName(); 658 String pdbNameA = a1.getPDBName(); 659 660 if ( pdbNameS == null || pdbNameA == null ){ 661 logger.warn("null value for group.getPDBName found at {} when trying to align {} and {} {}",posSeq, s1, a1, posAtom); 662 logger.warn("ATOM and SEQRES sequences will not be aligned."); 663 return true; 664 } 665 666 if ( ! pdbNameA.trim().equals(pdbNameS.trim())) { 667 668 String msg = "'"+ s1 + "' (position " + posSeq + ") does not align with '" + a1+ "' (position " + posAtom + "), should be: " + s + " : " + a; 669 670 if ( s1.getType().equals(HetatomImpl.type) && a1.getType().equals(HetatomImpl.type)){ 671 logger.info(msg + ". They seem to be hetatoms, so ignoring mismatch."); 672 } 673 else { 674 logger.warn(msg + ". This could be a problem because they aren't both hetatoms"); 675 } 676 677 } 678 679 // do the actual replacing of the SEQRES group with the ATOM group 680 // if ( s1.getChain().getChainID().equals("A")) { 681 // System.out.println(" setting " + posSeq +" " + a1); 682 // } 683 seqResGroups.set(seqresIndexPosition.get(posSeq),a1); 684 noMatchFound = false; 685 } 686 } 687 688 689 // now we merge the two chains into one 690 // the Groups that can be aligned are now pointing to the 691 // groups in the Atom records. 692 if ( noMatchFound) { 693 694 logger.debug("no alignment found!"); 695 } 696 return noMatchFound; 697 698 } 699 700 private boolean mapDNAChains(List<Group> seqResGroups, List<Group> atomRes, 701 SequencePair<Sequence<NucleotideCompound>, NucleotideCompound> pair, 702 Map<Integer,Integer> seqresIndexPosition, 703 Map<Integer,Integer> atomIndexPosition) { 704 705 706 // at the present stage the future seqREs are still stored as Atom groups in the seqRes parent... 707 708 709 int aligLength = pair.getLength(); 710 711 // System.out.println("sequence length seqres:"); 712 // System.out.println(seqresIndexPosition.keySet().size()); 713 // System.out.println("alignment length: " + aligLength); 714 // System.out.println(gapSymbol.getName()); 715 716 // make sure we actually find an alignment 717 boolean noMatchFound = true; 718 719 Compound gapSymbol = DNACompoundSet.getDNACompoundSet().getCompoundForString("-"); 720 721 mainLoop: 722 for (int i = 1; i <= aligLength ; i++) { 723 724 Compound s = pair.getCompoundAt(1, i); 725 Compound a = pair.getCompoundAt(2,i); 726 727 // alignment is using internal index start at 1... 728 int posSeq = pair.getIndexInQueryAt(i) -1; 729 int posAtom = pair.getIndexInTargetAt(i) -1; 730 731 if ( s.equals(gapSymbol) || a.equals(gapSymbol)){ 732 continue; 733 } 734 735 if ( s.equals(a)){ 736 737 // the atom record can be aligned to the SeqRes record! 738 // replace the SeqRes group with the Atom group! 739 740 Group s1 = seqResGroups.get(seqresIndexPosition.get(posSeq)); 741 Group a1 = atomRes.get(atomIndexPosition.get(posAtom)); 742 743 if ( s1 == null || a1 == null){ 744 /// can't map this position... 745 logger.warn("can't map " + i + ":" + s + " " + posSeq +" " + s1 + " atom: " + posAtom + " " + a1 ); 746 continue mainLoop; 747 } 748 749 // need to trim the names to allow matching e.g in 750 // pdb1b2m 751 String pdbNameS = s1.getPDBName(); 752 String pdbNameA = a1.getPDBName(); 753 if ( pdbNameS == null || pdbNameA == null ){ 754 logger.warn("null value found for group.getPDBName() at " + posSeq + " when trying to align " + s1 + " and " + a1 + " " + posAtom); 755 logger.warn("ATOM and SEQRES sequences will not be aligned."); 756 } 757 if (! pdbNameA.equals(pdbNameS)){ 758 if ( ! pdbNameA.trim().equals(pdbNameS.trim())) { 759 logger.info(s1 + " " + posSeq + " does not align with " + a1+ " " + posAtom + " should be: " + s + " : " + a); 760 if ( s1.getType().equals(HetatomImpl.type) && a1.getType().equals(HetatomImpl.type)){ 761 logger.info("they seem to be hetatoms, so ignoring mismatch."); 762 } 763 else { 764 // System.exit(0);// for debug only 765 //System.out.println(lst1.seqString()); 766 //System.out.println(lst2.seqString()); 767 logger.warn("Could not match residues " + s1 + " " + a1); 768 } 769 770 } 771 } 772 773 // do the actual replacing of the SEQRES group with the ATOM group 774 // if ( s1.getChain().getChainID().equals("A")) { 775 // System.out.println(" setting " + posSeq +" " + a1); 776 // } 777 seqResGroups.set(seqresIndexPosition.get(posSeq),a1); 778 noMatchFound = false; 779 } 780 } 781 782 783 // now we merge the two chains into one 784 // the Groups that can be aligned are now pointing to the 785 // groups in the Atom records. 786 if ( noMatchFound) { 787 788 logger.debug("no alignment found!"); 789 } 790 return noMatchFound; 791 792 } 793 794 /** 795 * Storing unaligned SEQRES groups in a Structure. 796 * @param structure 797 * @param seqResChains 798 */ 799 public static void storeUnAlignedSeqRes(Structure structure, List<Chain> seqResChains, boolean headerOnly) { 800 801 for (int i = 0; i < structure.nrModels(); i++) { 802 List<Chain> atomChains = structure.getModel(i); 803 804 for (Chain seqRes: seqResChains){ 805 Chain atomRes; 806 807 if (headerOnly) { 808 // In header-only mode skip ATOM records. 809 // Here we store chains with SEQRES instead of AtomGroups. 810 seqRes.setSeqResGroups(seqRes.getAtomGroups()); 811 seqRes.setAtomGroups(new ArrayList<Group>()); // clear out the atom groups. 812 atomChains.add(seqRes); 813 } else { 814 // Otherwise, we find a chain with AtomGroups 815 // and set this as SEQRES groups. 816 atomRes = SeqRes2AtomAligner.getMatchingAtomRes(seqRes,atomChains); 817 if ( atomRes != null) 818 atomRes.setSeqResGroups(seqRes.getAtomGroups()); 819 else 820 logger.warn("Could not find atom records for chain " + seqRes.getChainID()); 821 } 822 } 823 if (headerOnly) { 824 structure.setChains(i, atomChains); 825 } 826 } 827 } 828}