001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 * Created on Aug 23, 2007 021 * 022 */ 023 024package org.biojava.nbio.structure.io; 025 026 027import java.util.ArrayList; 028import java.util.HashMap; 029import java.util.Iterator; 030import java.util.List; 031import java.util.Map; 032 033import org.biojava.nbio.alignment.Alignments; 034import org.biojava.nbio.alignment.Alignments.PairwiseSequenceAlignerType; 035import org.biojava.nbio.alignment.SimpleGapPenalty; 036import org.biojava.nbio.alignment.template.GapPenalty; 037import org.biojava.nbio.alignment.template.PairwiseSequenceAligner; 038import org.biojava.nbio.core.alignment.matrices.SubstitutionMatrixHelper; 039import org.biojava.nbio.core.alignment.template.SequencePair; 040import org.biojava.nbio.core.alignment.template.SubstitutionMatrix; 041import org.biojava.nbio.core.exceptions.CompoundNotFoundException; 042import org.biojava.nbio.core.sequence.DNASequence; 043import org.biojava.nbio.core.sequence.ProteinSequence; 044import org.biojava.nbio.core.sequence.RNASequence; 045import org.biojava.nbio.core.sequence.compound.AmbiguityDNACompoundSet; 046import org.biojava.nbio.core.sequence.compound.AmbiguityDNARNAHybridCompoundSet; 047import org.biojava.nbio.core.sequence.compound.AmbiguityRNACompoundSet; 048import org.biojava.nbio.core.sequence.compound.AminoAcidCompound; 049import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet; 050import org.biojava.nbio.core.sequence.compound.DNACompoundSet; 051import org.biojava.nbio.core.sequence.compound.NucleotideCompound; 052import org.biojava.nbio.core.sequence.template.Compound; 053import org.biojava.nbio.core.sequence.template.Sequence; 054import org.biojava.nbio.structure.AminoAcid; 055import org.biojava.nbio.structure.Atom; 056import org.biojava.nbio.structure.Chain; 057import org.biojava.nbio.structure.Group; 058import org.biojava.nbio.structure.GroupType; 059import org.biojava.nbio.structure.HetatomImpl; 060import org.biojava.nbio.structure.NucleotideImpl; 061import org.biojava.nbio.structure.ResidueNumber; 062import org.biojava.nbio.structure.Structure; 063import org.biojava.nbio.structure.chem.ChemComp; 064import org.biojava.nbio.structure.chem.PolymerType; 065import org.biojava.nbio.structure.chem.ResidueType; 066import org.slf4j.Logger; 067import org.slf4j.LoggerFactory; 068 069 070/** 071 * Aligns the SEQRES residues to the ATOM residues. 072 * The AminoAcids that can be matched between the two of them will be set in the SEQRES 073 * chains 074 * 075 * 076 * @author Andreas Prlic 077 * @author Jose Duarte 078 */ 079public class SeqRes2AtomAligner { 080 081 private static final Logger logger = LoggerFactory.getLogger(SeqRes2AtomAligner.class); 082 083 084 085 private String alignmentString; 086 087 public SeqRes2AtomAligner(){ 088 logger.debug("initialising SeqRes2AtomAligner"); 089 alignmentString = ""; 090 } 091 092 public String getAlignmentString() { 093 return alignmentString; 094 } 095 096 /** 097 * 098 * @param seqRes 099 * @param atomList 100 * @param useChainId if true chainId (Chain.getId) is used for matching, 101 * if false chainName (Chain.getName) is used 102 * @return 103 */ 104 public static Chain getMatchingAtomRes(Chain seqRes, List<Chain> atomList, boolean useChainId) 105 { 106 Iterator<Chain> iter = atomList.iterator(); 107 while(iter.hasNext()){ 108 Chain atomChain = iter.next(); 109 110 String atomChainId = null; 111 String seqResChainId = null; 112 if (useChainId) { 113 atomChainId = atomChain.getId(); 114 seqResChainId = seqRes.getId(); 115 } else { 116 atomChainId = atomChain.getName(); 117 seqResChainId = seqRes.getName(); 118 119 } 120 121 if ( atomChainId.equals(seqResChainId)){ 122 return atomChain; 123 } 124 125 } 126 127 logger.info("Could not match SEQRES chainID asymId:" + seqRes.getId() + " authId:"+ seqRes.getName() +" to ATOM chains!, size of atom chain: " + atomList.size()); 128 return null; 129 } 130 131 132 133 134 public void align(Structure s, List<Chain> seqResList){ 135 136 List<Chain> atomList = s.getModel(0); 137 138 139 for (Chain seqRes: seqResList){ 140 141 Chain atomRes = getMatchingAtomRes(seqRes,atomList,false); 142 if ( atomRes == null) 143 continue; 144 145 mapSeqresRecords(atomRes,seqRes); 146 147 148 } 149 150 } 151 152 /** 153 * Map the seqRes groups to the atomRes chain. 154 * Updates the atomRes chain object with the mapped data 155 * The seqRes chain should not be needed after this and atomRes should be further used. 156 * 157 * @param atomRes the chain containing ATOM groups (in atomGroups slot). This chain 158 * is modified to contain in its seqresGroups slot the mapped atom groups 159 * @param seqRes the chain containing SEQRES groups (in atomGroups slot). This chain 160 * is not modified 161 */ 162 public void mapSeqresRecords(Chain atomRes, Chain seqRes) { 163 List<Group> seqResGroups = seqRes.getAtomGroups(); 164 List<Group> atmResGroups = atomRes.getAtomGroups(); 165 166 167 168 logger.debug("Comparing ATOM {} ({} groups) to SEQRES {} ({} groups) ", 169 atomRes.getId(), atmResGroups.size(), seqRes.getId(), seqResGroups.size()); 170 171 172 List<Group> matchedGroups = trySimpleMatch(seqResGroups, atmResGroups); 173 174 if ( matchedGroups != null) { 175 // update the new SEQRES list 176 atomRes.setSeqResGroups(matchedGroups); 177 return; 178 } 179 180 logger.debug("Could not map SEQRES to ATOM records easily, need to align..."); 181 182 int numAminosSeqres = seqRes.getAtomGroups(GroupType.AMINOACID).size(); 183 int numNucleotidesSeqres = seqRes.getAtomGroups(GroupType.NUCLEOTIDE).size(); 184 185 if ( numAminosSeqres < 1) { 186 187 if ( numNucleotidesSeqres > 1) { 188 189 logger.debug("SEQRES chain {} is a nucleotide chain ({} nucleotides), aligning nucleotides...", seqRes.getId(), numNucleotidesSeqres); 190 191 alignNucleotideChains(seqRes,atomRes); 192 return; 193 } else { 194 195 logger.debug("SEQRES chain {} contains {} amino acids and {} nucleotides, ignoring...", seqRes.getId(),numAminosSeqres,numNucleotidesSeqres); 196 197 return; 198 } 199 } 200 201 if ( atomRes.getAtomGroups(GroupType.AMINOACID).size() < 1) { 202 logger.debug("ATOM chain {} does not contain amino acids, ignoring...", atomRes.getId()); 203 return; 204 } 205 206 logger.debug("Proceeding to do protein alignment for chain {}", atomRes.getId() ); 207 208 209 boolean noMatchFound = alignProteinChains(seqResGroups,atomRes.getAtomGroups()); 210 if ( ! noMatchFound){ 211 atomRes.setSeqResGroups(seqResGroups); 212 } 213 214 } 215 216 private void alignNucleotideChains(Chain seqRes, Chain atomRes) { 217 218 if ( atomRes.getAtomGroups(GroupType.NUCLEOTIDE).size() < 1) { 219 logger.debug("ATOM chain {} does not contain nucleotides, ignoring...", atomRes.getId()); 220 221 return; 222 } 223 logger.debug("Alignment for chain {}", atomRes.getId() ); 224 225 List<Group> seqResGroups = seqRes.getAtomGroups(); 226 boolean noMatchFound = alignNucleotideGroups(seqResGroups,atomRes.getAtomGroups()); 227 if ( ! noMatchFound){ 228 atomRes.setSeqResGroups(seqResGroups); 229 } 230 231 } 232 233 /** 234 * A simple matching approach that tries to do a 1:1 mapping between SEQRES and ATOM records 235 * 236 * @param seqResGroups list of seqREs groups 237 * @param atmResGroups list of atmRes Groups 238 * @return the matching or null if the matching didn't work 239 */ 240 private List<Group> trySimpleMatch(List<Group> seqResGroups,List<Group> atmResGroups) { 241 // by default first ATOM position is 1 242 // 243 244 @SuppressWarnings("unchecked") 245 List<Group> newSeqResGroups = (ArrayList<Group>)((ArrayList<Group>)seqResGroups).clone(); 246 247 boolean startAt1 = true; 248 249 for ( int atomResPos = 0 ; atomResPos < atmResGroups.size() ; atomResPos++){ 250 251 // let's try to match this case 252 Group atomResGroup = atmResGroups.get(atomResPos); 253 254 // let's ignore waters 255 if ( atomResGroup.isWater()){ 256 continue; 257 } 258 259 ResidueNumber atomResNum = atomResGroup.getResidueNumber(); 260 261 int seqResPos = atomResNum.getSeqNum(); 262 263 264 265 if ( seqResPos < 0) { 266 logger.debug("ATOM residue number < 0 : {}", seqResPos); 267 return null; 268 } 269 270 if ( seqResPos == 0){ 271 // make sure the first SEQRES is matching. 272 Group seqResGroup = seqResGroups.get(0); 273 if ( seqResGroup.getPDBName().equals(atomResGroup.getPDBName())){ 274 // they match! seems in this case the numbering starts with 0... 275 startAt1 = false; 276 } else { 277 278 logger.debug("SEQRES position 1 ({}) does not match ATOM PDB res num 0 ({})", 279 seqResGroup.getPDBName(), atomResGroup.getPDBName()); 280 281 282 return null; 283 284 } 285 } 286 287 288 if ( startAt1 ) 289 seqResPos--; 290 291 // another check that would require the alignment approach 292 if ( startAt1 && seqResPos >= seqResGroups.size() ) 293 { 294 295 // could be a HETATOM... 296 if ( atomResGroup instanceof AminoAcid) { 297 logger.debug(" ATOM residue nr: " + seqResPos + " > seqres! " + seqResGroups.size() + " " + atomResGroup); 298 return null; 299 } else if ( atomResGroup instanceof NucleotideImpl) { 300 logger.debug(" NUCLEOTIDE residue nr: " + seqResPos + " > seqres! " + seqResGroups.size() + " " + atomResGroup); 301 return null; 302 } else { 303 // we won't map HETATOM groups... 304 continue; 305 } 306 } 307 308 309 // if ( seqResPos < 0){ 310 // 311 // System.err.println("What is going on??? " + atomRes.getChainID() + " " + atomResGroup); 312 // } 313 314 if ( seqResPos >= seqResGroups.size()){ 315 logger.debug("seqres groups don't match atom indices {}", seqResPos); 316 if ( atomResGroup instanceof AminoAcid ) 317 return null; 318 else 319 continue; 320 } 321 322 Group seqResGroup = seqResGroups.get(seqResPos ); 323 324 if ( ! seqResGroup.getPDBName().trim().equals(atomResGroup.getPDBName().trim())){ 325 // a mismatch! something is wrong in the mapping and we need to do an alignment 326 logger.debug("Mismatch of SEQRES pos " + seqResPos + " and ATOM record: " + atomResGroup + " | " + seqResGroup); 327 return null; 328 } 329 330 // the two groups are identical and we can merge them 331 // replace the SEQRES group with the ATOM group... 332 333 Group replacedGroup = newSeqResGroups.set(seqResPos, atomResGroup); 334 logger.debug("Merging index {}: replaced seqres group {} ({}) with atom group {} ({})", 335 seqResPos, 336 replacedGroup.getResidueNumber(), replacedGroup.getPDBName(), 337 atomResGroup.getResidueNumber(), atomResGroup.getPDBName()); 338 339 } 340 341 // all went ok. copy over the updated list to the original one. 342 // note: if something went wrong, we did not modifiy the original list. 343 //seqResGroups = newSeqResGroups; 344 345 346 // int pos = -1; 347 // for (Group g: seqResGroups){ 348 // pos++; 349 // logger.debug(pos + " " + g); 350 // } 351 352 //System.out.println("I:" + seqResGroups); 353 // all atom records could get matched correctly! 354 return newSeqResGroups; 355 356 } 357 358 /** 359 * Returns the full sequence of the Atom records of a parent 360 * with X instead of HETATMSs. The advantage of this is 361 * that it allows us to also align HETATM groups back to the SEQRES. 362 * @param groups the list of groups in a parent 363 * @param positionIndex a Map to keep track of which group is at which sequence position 364 * @param isNucleotideChain whether the atom groups are predominantly nucleotides (the groups represent a nucleotide chain), if true 365 * non-standard nucleotides will be represented with ambiguous letter 'N' instead of 'X', if false all non-standard residues will be 'X' 366 * @return string representations 367 */ 368 public static String getFullAtomSequence(List<Group> groups, Map<Integer, Integer> positionIndex, boolean isNucleotideChain){ 369 370 StringBuffer sequence = new StringBuffer() ; 371 int seqIndex = 0; // track sequence.length() 372 for ( int i=0 ; i< groups.size(); i++){ 373 Group g = groups.get(i); 374 375 if ( g instanceof AminoAcid ){ 376 AminoAcid a = (AminoAcid)g; 377 char oneLetter =a.getAminoType(); 378 if ( oneLetter == '?') 379 oneLetter = 'X'; 380 381 positionIndex.put(seqIndex,i); 382 sequence.append(oneLetter ); 383 seqIndex++; 384 } else { 385 386 // exclude solvents 387 if ( g.isWater()) 388 continue; 389 390 // exclude metals 391 if ( g.size() == 1 ) { 392 393 Atom a = g.getAtom(0); 394 if ( a == null) 395 continue; 396 if (a.getElement().isMetal()) 397 continue; 398 399 } 400 401 ChemComp cc = g.getChemComp(); 402 if ( cc == null) { 403 logger.debug("No chem comp available for group {}",g.toString()); 404 // not sure what to do in that case! 405 continue; 406 } 407 if ( 408 ResidueType.lPeptideLinking.equals(cc.getResidueType()) || 409 PolymerType.PROTEIN_ONLY.contains(cc.getPolymerType()) || 410 PolymerType.POLYNUCLEOTIDE_ONLY.contains(cc.getPolymerType()) 411 ) { 412 //System.out.println(cc.getOne_letter_code()); 413 String c = cc.getOneLetterCode(); 414 if ( "?".equals(c)) { 415 if (isNucleotideChain && PolymerType.POLYNUCLEOTIDE_ONLY.contains(cc.getPolymerType())) { 416 // the way to represent unknown nucleotides is with 'N', see https://en.wikipedia.org/wiki/Nucleic_acid_notation 417 c = "N"; 418 } else { 419 c = "X"; 420 } 421 } 422 423 // For some unusual cases the het residue can map to 2 or more standard aas and thus give an 424 // insertion of length >1. 425 // e.g. 1: SUI maps to DG (in 1oew,A) 426 // e.g. 2: NRQ maps to MYG (in 3cfh,A) 427 if (c.length()>1) { 428 logger.info("Group '{}' maps to more than 1 standard aminoacid: {}.", 429 g.toString(), c); 430 } 431 // because of the mapping to more than 1 aminoacid, we have 432 // to loop through it (99% of cases c will have length 1 anyway) 433 for (int cIdx=0;cIdx<c.length();cIdx++) { 434 positionIndex.put(seqIndex,i); 435 sequence.append(c.charAt(cIdx)); 436 seqIndex++; 437 } 438 } else { 439 logger.debug("Group {} is not lPeptideLinked, nor PROTEIN_ONLY, nor POLYNUCLEOTIDE_ONLY",g.toString()); 440 continue; 441 } 442 443 444 //sequence.append("X"); 445 } 446 447 } 448 449 return sequence.toString(); 450 451 } 452 453 454 private boolean alignNucleotideGroups(List<Group> seqRes, List<Group> atomRes) { 455 456 Map<Integer,Integer> seqresIndexPosition = new HashMap<>(); 457 Map<Integer,Integer> atomIndexPosition = new HashMap<>(); 458 459 String seq1 = getFullAtomSequence(seqRes, seqresIndexPosition, true); 460 // 461 String seq2 = getFullAtomSequence(atomRes, atomIndexPosition, true); 462 463 if (seq1.isEmpty() || seq2.isEmpty()) { 464 logger.warn("Could not align nucleotide sequences, at least one of them is empty"); 465 return true; 466 } 467 468 logger.debug("align seq1 ("+ seq1.length()+") " + seq1); 469 logger.debug("align seq2 ("+ seq2.length()+") " + seq2); 470 471 Sequence<NucleotideCompound> s1 = getNucleotideSequence(seq1); 472 Sequence<NucleotideCompound> s2 = getNucleotideSequence(seq2); 473 474 if (s1==null || s2==null) return true; 475 476 if ( ! s1.getCompoundSet().equals(s2.getCompoundSet()) ) { 477 // e.g. trying to align a DNA and an RNA sequence... 478 // test PDB ID: 1A34 479 // try to make both RNA sequence... 480 if ( ! s1.getCompoundSet().equals(AmbiguityRNACompoundSet.getRNACompoundSet())) { 481 try { 482 s1 = new RNASequence(seq1,AmbiguityRNACompoundSet.getRNACompoundSet()); 483 } catch (CompoundNotFoundException ex) { 484 logger.warn("Could not align DNA and RNA compound sets: " + seq1); 485 return true; 486 } 487 } 488 489 if( ! s2.getCompoundSet().equals(AmbiguityRNACompoundSet.getRNACompoundSet())) { 490 try { 491 s2 = new RNASequence(seq2,AmbiguityRNACompoundSet.getRNACompoundSet()); 492 } catch (CompoundNotFoundException ex) { 493 logger.warn("Could not align DNA and RNA compound sets: " + seq2); 494 return true; 495 } 496 } 497 } 498 499 500 SubstitutionMatrix<NucleotideCompound> matrix = SubstitutionMatrixHelper.getNuc4_4(); 501 502 GapPenalty penalty = new SimpleGapPenalty(8,1); 503 504 PairwiseSequenceAligner<Sequence<NucleotideCompound>, NucleotideCompound> smithWaterman = 505 Alignments.getPairwiseAligner(s1, s2, PairwiseSequenceAlignerType.LOCAL, penalty, matrix); 506 507 508 509 SequencePair<Sequence<NucleotideCompound>, NucleotideCompound> pair = smithWaterman.getPair(); 510 511 512 513 if ( pair == null) { 514 logger.warn("Could not align nucleotide sequences. ATOM and SEQRES groups will not be aligned."); 515 logger.warn("Sequences: "); 516 logger.warn(seq1); 517 logger.warn(seq2); 518 return true; 519 520 } 521 522 523 524 logger.debug("Alignment:\n{}", pair.toString(100)); 525 526 527 boolean noMatchFound = mapDNAChains(seqRes,atomRes,pair,seqresIndexPosition, atomIndexPosition ); 528 529 return noMatchFound; 530 531 } 532 533 private Sequence<NucleotideCompound> getNucleotideSequence(String seq) { 534 Sequence<NucleotideCompound> s = null; 535 536 // first we try DNA, then RNA, them hybrid 537 538 try { 539 540 s = new DNASequence(seq, AmbiguityDNACompoundSet.getDNACompoundSet()); 541 } catch (CompoundNotFoundException e){ 542 543 try { 544 s= new RNASequence(seq, AmbiguityRNACompoundSet.getRNACompoundSet()); 545 } catch (CompoundNotFoundException ex) { 546 547 try { 548 // it could still be a hybrid, e.g. 3hoz, chain T, what to do in that case? 549 s = new DNASequence(seq, AmbiguityDNARNAHybridCompoundSet.getDNARNAHybridCompoundSet()); 550 logger.warn("Hybrid RNA/DNA sequence detected for sequence {}", seq); 551 } catch (CompoundNotFoundException exc) { 552 // not DNA, nor RNA, nor hybrid 553 logger.warn("Could not determine compound set (neither DNA, RNA nor hybrid) for nucleotide sequence " + seq); 554 return null; 555 } 556 557 } 558 } 559 return s; 560 } 561 562 563 /** 564 * Aligns two chains of groups, where the first parent is representing the 565 * list of amino acids as obtained from the SEQRES records, and the second parent 566 * represents the groups obtained from the ATOM records (and containing the actual ATOM information). 567 * This does the actual alignment and if a group can be mapped to a position in the SEQRES then the corresponding 568 * position is replaced with the group that contains the atoms. 569 * 570 * @param seqRes 571 * @param atomRes 572 * @return true if no match has been found 573 */ 574 private boolean alignProteinChains(List<Group> seqRes, List<Group> atomRes) { 575 576 Map<Integer,Integer> seqresIndexPosition = new HashMap<>(); 577 Map<Integer,Integer> atomIndexPosition = new HashMap<>(); 578 579 String seq1 = getFullAtomSequence(seqRes, seqresIndexPosition, false); 580 // 581 String seq2 = getFullAtomSequence(atomRes, atomIndexPosition, false); 582 583 584 logger.debug("Protein seq1 to align (length "+ seq1.length()+"): " + seq1); 585 logger.debug("Protein seq2 to align (length "+ seq2.length()+"): " + seq2); 586 587 ProteinSequence s1; 588 ProteinSequence s2; 589 try { 590 s1 = new ProteinSequence(seq1); 591 s2 = new ProteinSequence(seq2); 592 } catch (CompoundNotFoundException e) { 593 logger.warn("Could not create protein sequences ({}) to align ATOM and SEQRES groups, they will remain unaligned.", e.getMessage()); 594 return true; 595 } 596 597 598 SubstitutionMatrix<AminoAcidCompound> matrix = SubstitutionMatrixHelper.getBlosum65(); 599 600 GapPenalty penalty = new SimpleGapPenalty(8, 1); 601 602 603 PairwiseSequenceAligner<ProteinSequence, AminoAcidCompound> smithWaterman = 604 Alignments.getPairwiseAligner(s1, s2, PairwiseSequenceAlignerType.LOCAL, penalty, matrix); 605 606 SequencePair<ProteinSequence, AminoAcidCompound> pair = smithWaterman.getPair(); 607 608 609 // sequences that are only X (e.g. 1jnv chain A) produced empty alignments, because nothing aligns to nothing and thus the local alignment is empty 610 // to avoid those empty alignments we catch them here with pair.getLength()==0 611 if ( pair == null || pair.getLength()==0) { 612 logger.warn("Could not align protein sequences. ATOM and SEQRES groups will not be aligned."); 613 logger.warn("Sequences: "); 614 logger.warn(seq1); 615 logger.warn(seq2); 616 return true; 617 } 618 619 620 logger.debug("Alignment:\n{}", pair.toString(100)); 621 622 623 boolean noMatchFound = mapChains(seqRes,atomRes,pair,seqresIndexPosition, atomIndexPosition ); 624 625 return noMatchFound; 626 627 628 } 629 630 631 private boolean mapChains(List<Group> seqResGroups, List<Group> atomRes, 632 SequencePair<ProteinSequence, AminoAcidCompound> pair, 633 Map<Integer,Integer> seqresIndexPosition, 634 Map<Integer,Integer> atomIndexPosition ) { 635 636 637 638 // at the present stage the future seqRes are still stored as Atom groups in the seqRes parent... 639 640 641 int aligLength = pair.getLength(); 642 643 // make sure we actually find an alignment 644 boolean noMatchFound = true; 645 646 Compound gapSymbol = AminoAcidCompoundSet.getAminoAcidCompoundSet().getCompoundForString("-"); 647 648 mainLoop: 649 for (int i = 1; i <= aligLength ; i++) { 650 651 Compound s = pair.getCompoundAt(1, i); 652 Compound a = pair.getCompoundAt(2, i); 653 654 // alignment is using internal index start at 1... 655 int posSeq = pair.getIndexInQueryAt(i) - 1; 656 int posAtom = pair.getIndexInTargetAt(i) - 1; 657 658 if ( s.equals(gapSymbol) || a.equals(gapSymbol)){ 659 continue; 660 } 661 662 if ( s.equals(a)){ 663 664 // the atom record can be aligned to the SeqRes record! 665 // replace the SeqRes group with the Atom group! 666 667 Group s1 = seqResGroups.get(seqresIndexPosition.get(posSeq)); 668 Group a1 = atomRes.get(atomIndexPosition.get(posAtom)); 669 670 if ( s1 == null || a1 == null){ 671 /// can't map this position... 672 logger.warn("can't map " + i + ":" + s + " " + posSeq +" " + s1 + " atom: " + posAtom + " " + a1 ); 673 continue mainLoop; 674 } 675 676 // need to trim the names to allow matching e.g in 677 // pdb1b2m 678 String pdbNameS = s1.getPDBName(); 679 String pdbNameA = a1.getPDBName(); 680 681 if ( pdbNameS == null || pdbNameA == null ){ 682 logger.warn("null value for group.getPDBName found at {} when trying to align {} and {} {}",posSeq, s1, a1, posAtom); 683 logger.warn("ATOM and SEQRES sequences will not be aligned."); 684 return true; 685 } 686 687 if ( ! pdbNameA.trim().equals(pdbNameS.trim())) { 688 689 String msg = "'"+ s1 + "' (position " + posSeq + ") does not align with '" + a1+ "' (position " + posAtom + "), should be: " + s + " : " + a; 690 691 if ( s1.getType().equals(HetatomImpl.type) && a1.getType().equals(HetatomImpl.type)){ 692 logger.info(msg + ". They seem to be hetatoms, so ignoring mismatch."); 693 } 694 else { 695 logger.warn(msg + ". This could be a problem because they aren't both hetatoms"); 696 } 697 698 } 699 700 // do the actual replacing of the SEQRES group with the ATOM group 701 // if ( s1.getChain().getChainID().equals("A")) { 702 // System.out.println(" setting " + posSeq +" " + a1); 703 // } 704 seqResGroups.set(seqresIndexPosition.get(posSeq),a1); 705 noMatchFound = false; 706 } 707 } 708 709 710 // now we merge the two chains into one 711 // the Groups that can be aligned are now pointing to the 712 // groups in the Atom records. 713 if ( noMatchFound) { 714 715 logger.debug("no alignment found!"); 716 } 717 return noMatchFound; 718 719 } 720 721 private boolean mapDNAChains(List<Group> seqResGroups, List<Group> atomRes, 722 SequencePair<Sequence<NucleotideCompound>, NucleotideCompound> pair, 723 Map<Integer,Integer> seqresIndexPosition, 724 Map<Integer,Integer> atomIndexPosition) { 725 726 727 // at the present stage the future seqREs are still stored as Atom groups in the seqRes parent... 728 729 730 int aligLength = pair.getLength(); 731 732 // System.out.println("sequence length seqres:"); 733 // System.out.println(seqresIndexPosition.keySet().size()); 734 // System.out.println("alignment length: " + aligLength); 735 // System.out.println(gapSymbol.getName()); 736 737 // make sure we actually find an alignment 738 boolean noMatchFound = true; 739 740 Compound gapSymbol = DNACompoundSet.getDNACompoundSet().getCompoundForString("-"); 741 742 mainLoop: 743 for (int i = 1; i <= aligLength ; i++) { 744 745 Compound s = pair.getCompoundAt(1, i); 746 Compound a = pair.getCompoundAt(2,i); 747 748 // alignment is using internal index start at 1... 749 int posSeq = pair.getIndexInQueryAt(i) -1; 750 int posAtom = pair.getIndexInTargetAt(i) -1; 751 752 if ( s.equals(gapSymbol) || a.equals(gapSymbol)){ 753 continue; 754 } 755 756 if ( s.equals(a)){ 757 758 // the atom record can be aligned to the SeqRes record! 759 // replace the SeqRes group with the Atom group! 760 761 Group s1 = seqResGroups.get(seqresIndexPosition.get(posSeq)); 762 Group a1 = atomRes.get(atomIndexPosition.get(posAtom)); 763 764 if ( s1 == null || a1 == null){ 765 /// can't map this position... 766 logger.warn("can't map " + i + ":" + s + " " + posSeq +" " + s1 + " atom: " + posAtom + " " + a1 ); 767 continue mainLoop; 768 } 769 770 // need to trim the names to allow matching e.g in 771 // pdb1b2m 772 String pdbNameS = s1.getPDBName(); 773 String pdbNameA = a1.getPDBName(); 774 if ( pdbNameS == null || pdbNameA == null ){ 775 logger.warn("null value found for group.getPDBName() at " + posSeq + " when trying to align " + s1 + " and " + a1 + " " + posAtom); 776 logger.warn("ATOM and SEQRES sequences will not be aligned."); 777 } 778 if (! pdbNameA.equals(pdbNameS)){ 779 if ( ! pdbNameA.trim().equals(pdbNameS.trim())) { 780 logger.info(s1 + " " + posSeq + " does not align with " + a1+ " " + posAtom + " should be: " + s + " : " + a); 781 if ( s1.getType().equals(HetatomImpl.type) && a1.getType().equals(HetatomImpl.type)){ 782 logger.info("they seem to be hetatoms, so ignoring mismatch."); 783 } 784 else { 785 // System.exit(0);// for debug only 786 //System.out.println(lst1.seqString()); 787 //System.out.println(lst2.seqString()); 788 logger.warn("Could not match residues " + s1 + " " + a1); 789 } 790 791 } 792 } 793 794 // do the actual replacing of the SEQRES group with the ATOM group 795 // if ( s1.getChain().getChainID().equals("A")) { 796 // System.out.println(" setting " + posSeq +" " + a1); 797 // } 798 seqResGroups.set(seqresIndexPosition.get(posSeq),a1); 799 noMatchFound = false; 800 } 801 } 802 803 804 // now we merge the two chains into one 805 // the Groups that can be aligned are now pointing to the 806 // groups in the Atom records. 807 if ( noMatchFound) { 808 809 logger.debug("no alignment found!"); 810 } 811 return noMatchFound; 812 813 } 814 815 /** 816 * Storing unaligned SEQRES groups in a Structure. 817 * @param structure 818 * @param seqResChains 819 */ 820 public static void storeUnAlignedSeqRes(Structure structure, List<Chain> seqResChains, boolean headerOnly) { 821 822 if (headerOnly) { 823 List<Chain> atomChains = new ArrayList<>(); 824 for (Chain seqRes: seqResChains) { 825 // In header-only mode skip ATOM records. 826 // Here we store chains with SEQRES instead of AtomGroups. 827 seqRes.setSeqResGroups(seqRes.getAtomGroups()); 828 seqRes.setAtomGroups(new ArrayList<>()); // clear out the atom groups. 829 atomChains.add(seqRes); 830 } 831 structure.setChains(0, atomChains); 832 833 } else { 834 835 for (int i = 0; i < structure.nrModels(); i++) { 836 List<Chain> atomChains = structure.getModel(i); 837 if (seqResChains.isEmpty()) { 838 // in files without SEQRES, seqResChains object is empty: we replace by atomChains resulting below in a trivial alignment and a copy of atom groups to seqres groups 839 seqResChains = atomChains; 840 } 841 for (Chain seqRes: seqResChains){ 842 // Otherwise, we find a chain with AtomGroups 843 // and set this as SEQRES groups. 844 // TODO no idea if new parameter useChainId should be false or true here, used true as a guess - JD 2016-05-09 845 Chain atomRes = SeqRes2AtomAligner.getMatchingAtomRes(seqRes,atomChains,true); 846 if ( atomRes != null) 847 atomRes.setSeqResGroups(seqRes.getAtomGroups()); 848 else 849 logger.warn("Could not find atom records for chain " + seqRes.getId()); 850 } 851 } 852 } 853 } 854}