Source code

001/*
002 *                  BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 * Created on Aug 23, 2007
021 *
022 */
023
024package org.biojava.nbio.structure.io;
025
026
027import java.util.ArrayList;
028import java.util.HashMap;
029import java.util.Iterator;
030import java.util.List;
031import java.util.Map;
032
033import org.biojava.nbio.alignment.Alignments;
034import org.biojava.nbio.alignment.Alignments.PairwiseSequenceAlignerType;
035import org.biojava.nbio.alignment.SimpleGapPenalty;
036import org.biojava.nbio.alignment.template.GapPenalty;
037import org.biojava.nbio.alignment.template.PairwiseSequenceAligner;
038import org.biojava.nbio.core.alignment.matrices.SubstitutionMatrixHelper;
039import org.biojava.nbio.core.alignment.template.SequencePair;
040import org.biojava.nbio.core.alignment.template.SubstitutionMatrix;
041import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
042import org.biojava.nbio.core.sequence.DNASequence;
043import org.biojava.nbio.core.sequence.ProteinSequence;
044import org.biojava.nbio.core.sequence.RNASequence;
045import org.biojava.nbio.core.sequence.compound.AmbiguityDNACompoundSet;
046import org.biojava.nbio.core.sequence.compound.AmbiguityDNARNAHybridCompoundSet;
047import org.biojava.nbio.core.sequence.compound.AmbiguityRNACompoundSet;
048import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
049import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet;
050import org.biojava.nbio.core.sequence.compound.DNACompoundSet;
051import org.biojava.nbio.core.sequence.compound.NucleotideCompound;
052import org.biojava.nbio.core.sequence.template.Compound;
053import org.biojava.nbio.core.sequence.template.Sequence;
054import org.biojava.nbio.structure.AminoAcid;
055import org.biojava.nbio.structure.Atom;
056import org.biojava.nbio.structure.Chain;
057import org.biojava.nbio.structure.Group;
058import org.biojava.nbio.structure.GroupType;
059import org.biojava.nbio.structure.HetatomImpl;
060import org.biojava.nbio.structure.NucleotideImpl;
061import org.biojava.nbio.structure.ResidueNumber;
062import org.biojava.nbio.structure.Structure;
063import org.biojava.nbio.structure.chem.ChemComp;
064import org.biojava.nbio.structure.chem.PolymerType;
065import org.biojava.nbio.structure.chem.ResidueType;
066import org.slf4j.Logger;
067import org.slf4j.LoggerFactory;
068
069
070/**
071 * Aligns the SEQRES residues to the ATOM residues.
072 * The AminoAcids that can be matched between the two of them will be set in the SEQRES
073 * chains
074 *
075 *
076 * @author Andreas Prlic
077 * @author Jose Duarte
078 */
079public class SeqRes2AtomAligner {
080
081        private static final Logger logger = LoggerFactory.getLogger(SeqRes2AtomAligner.class);
082
083
084
085        private String alignmentString;
086
087        public SeqRes2AtomAligner(){
088                logger.debug("initialising SeqRes2AtomAligner");
089                alignmentString = "";
090        }
091
092        public String getAlignmentString() {
093                return alignmentString;
094        }
095
096        /**
097         *
098         * @param seqRes
099         * @param atomList
100         * @param useChainId if true chainId (Chain.getId) is used for matching,
101         * if false chainName (Chain.getName) is used
102         * @return
103         */
104        public static Chain getMatchingAtomRes(Chain seqRes, List<Chain> atomList, boolean useChainId)
105        {
106                Iterator<Chain> iter = atomList.iterator();
107                while(iter.hasNext()){
108                        Chain atomChain = iter.next();
109
110                        String atomChainId = null;
111                        String seqResChainId = null;
112                        if (useChainId) {
113                                atomChainId = atomChain.getId();
114                                seqResChainId = seqRes.getId();
115                        } else {
116                                atomChainId = atomChain.getName();
117                                seqResChainId = seqRes.getName();
118
119                        }
120
121                        if ( atomChainId.equals(seqResChainId)){
122                                return atomChain;
123                        }
124
125                }
126
127                logger.info("Could not match SEQRES chainID asymId:" + seqRes.getId() + " authId:"+ seqRes.getName() +"  to ATOM chains!, size of atom chain: " + atomList.size());
128                return null;
129        }
130
131
132
133
134        public void align(Structure s, List<Chain> seqResList){
135
136                List<Chain> atomList   = s.getModel(0);
137
138
139                for (Chain seqRes: seqResList){
140
141                                Chain atomRes = getMatchingAtomRes(seqRes,atomList,false);
142                                if ( atomRes == null)
143                                        continue;
144
145                                mapSeqresRecords(atomRes,seqRes);
146
147
148                }
149
150        }
151
152        /**
153         * Map the seqRes groups to the atomRes chain.
154         * Updates the atomRes chain object with the mapped data
155         * The seqRes chain should not be needed after this and atomRes should be further used.
156         *
157         * @param atomRes the chain containing ATOM groups (in atomGroups slot). This chain
158         * is modified to contain in its seqresGroups slot the mapped atom groups
159         * @param seqRes the chain containing SEQRES groups (in atomGroups slot). This chain
160         * is not modified
161         */
162        public void mapSeqresRecords(Chain atomRes, Chain seqRes) {
163                List<Group> seqResGroups = seqRes.getAtomGroups();
164                List<Group> atmResGroups = atomRes.getAtomGroups();
165
166
167
168                logger.debug("Comparing ATOM {} ({} groups) to SEQRES {} ({} groups) ",
169                                atomRes.getId(), atmResGroups.size(), seqRes.getId(), seqResGroups.size());
170
171
172                List<Group> matchedGroups = trySimpleMatch(seqResGroups, atmResGroups);
173
174                if ( matchedGroups != null) {
175                        // update the new SEQRES list
176                        atomRes.setSeqResGroups(matchedGroups);
177                        return;
178                }
179
180                logger.debug("Could not map SEQRES to ATOM records easily, need to align...");
181
182                int numAminosSeqres = seqRes.getAtomGroups(GroupType.AMINOACID).size();
183                int numNucleotidesSeqres = seqRes.getAtomGroups(GroupType.NUCLEOTIDE).size();
184
185                if ( numAminosSeqres < 1) {
186
187                        if ( numNucleotidesSeqres > 1) {
188
189                                logger.debug("SEQRES chain {} is a nucleotide chain ({} nucleotides), aligning nucleotides...", seqRes.getId(), numNucleotidesSeqres);
190
191                                alignNucleotideChains(seqRes,atomRes);
192                                return;
193                        } else {
194
195                                logger.debug("SEQRES chain {} contains {} amino acids and {} nucleotides, ignoring...", seqRes.getId(),numAminosSeqres,numNucleotidesSeqres);
196
197                                return;
198                        }
199                }
200
201                if ( atomRes.getAtomGroups(GroupType.AMINOACID).size() < 1) {
202                        logger.debug("ATOM chain {} does not contain amino acids, ignoring...", atomRes.getId());
203                        return;
204                }
205
206                logger.debug("Proceeding to do protein alignment for chain {}", atomRes.getId() );
207
208
209                boolean noMatchFound = alignProteinChains(seqResGroups,atomRes.getAtomGroups());
210                if ( ! noMatchFound){
211                        atomRes.setSeqResGroups(seqResGroups);
212                }
213
214        }
215
216        private void alignNucleotideChains(Chain seqRes, Chain atomRes) {
217
218                if ( atomRes.getAtomGroups(GroupType.NUCLEOTIDE).size() < 1) {
219                        logger.debug("ATOM chain {} does not contain nucleotides, ignoring...", atomRes.getId());
220
221                        return;
222                }
223                logger.debug("Alignment for chain {}", atomRes.getId() );
224
225                List<Group> seqResGroups = seqRes.getAtomGroups();
226                boolean noMatchFound = alignNucleotideGroups(seqResGroups,atomRes.getAtomGroups());
227                if ( ! noMatchFound){
228                        atomRes.setSeqResGroups(seqResGroups);
229                }
230
231        }
232
233        /**
234         * A simple matching approach that tries to do a 1:1 mapping between SEQRES and ATOM records
235         *
236         * @param seqResGroups list of seqREs groups
237         * @param atmResGroups list of atmRes Groups
238         * @return the matching or null if the matching didn't work
239         */
240        private List<Group> trySimpleMatch(List<Group> seqResGroups,List<Group> atmResGroups) {
241                // by default first ATOM position is 1
242                //
243
244                @SuppressWarnings("unchecked")
245                List<Group> newSeqResGroups = (ArrayList<Group>)((ArrayList<Group>)seqResGroups).clone();
246
247                boolean startAt1 = true;
248
249                for ( int atomResPos = 0 ; atomResPos < atmResGroups.size() ; atomResPos++){
250
251                        // let's try to match this case
252                        Group atomResGroup = atmResGroups.get(atomResPos);
253
254                        // let's ignore waters
255                        if ( atomResGroup.isWater()){
256                                continue;
257                        }
258
259                        ResidueNumber atomResNum = atomResGroup.getResidueNumber();
260
261                        int seqResPos = atomResNum.getSeqNum();
262
263
264
265                        if ( seqResPos < 0) {
266                                logger.debug("ATOM residue number < 0 : {}", seqResPos);
267                                return null;
268                        }
269
270                        if ( seqResPos == 0){
271                                // make sure the first SEQRES is matching.
272                                Group seqResGroup = seqResGroups.get(0);
273                                if (  seqResGroup.getPDBName().equals(atomResGroup.getPDBName())){
274                                        // they match! seems in this case the numbering starts with 0...
275                                        startAt1 = false;
276                                } else {
277
278                                        logger.debug("SEQRES position 1 ({}) does not match ATOM PDB res num 0 ({})",
279                                                        seqResGroup.getPDBName(), atomResGroup.getPDBName());
280
281
282                                        return null;
283
284                                }
285                        }
286
287
288                        if ( startAt1 )
289                                seqResPos--;
290
291                        // another check that would require the alignment approach
292                        if ( startAt1 && seqResPos >=  seqResGroups.size()  )
293                        {
294
295                                // could be a HETATOM...
296                                if ( atomResGroup instanceof AminoAcid) {
297                                        logger.debug(" ATOM residue nr: " + seqResPos + " > seqres! " + seqResGroups.size() + " " + atomResGroup);
298                                        return null;
299                                } else if ( atomResGroup instanceof NucleotideImpl) {
300                                        logger.debug(" NUCLEOTIDE residue nr: " + seqResPos + " > seqres! " + seqResGroups.size() + " " + atomResGroup);
301                                        return null;
302                                } else {
303                                        // we won't map HETATOM groups...
304                                        continue;
305                                }
306                        }
307
308
309                        //                      if ( seqResPos < 0){
310                        //
311                        //                              System.err.println("What is going on??? " + atomRes.getChainID() + " " + atomResGroup);
312                        //                      }
313
314                        if ( seqResPos >= seqResGroups.size()){
315                                logger.debug("seqres groups don't match atom indices " + seqResPos);
316                                if ( atomResGroup instanceof AminoAcid )
317                                        return null;
318                                else
319                                        continue;
320                        }
321
322                        Group seqResGroup = seqResGroups.get(seqResPos );
323
324                        if ( ! seqResGroup.getPDBName().trim().equals(atomResGroup.getPDBName().trim())){
325                                // a mismatch! something is wrong in the mapping and we need to do an alignment
326                                logger.debug("Mismatch of SEQRES pos " + seqResPos + " and ATOM record: " + atomResGroup + " | " + seqResGroup);
327                                return null;
328                        }
329
330                        // the two groups are identical and we can merge them
331                        // replace the SEQRES group with the ATOM group...
332
333                        Group replacedGroup = newSeqResGroups.set(seqResPos, atomResGroup);
334                        logger.debug("Merging index {}: replaced seqres group {} ({}) with atom group {} ({})",
335                                        seqResPos,
336                                        replacedGroup.getResidueNumber(), replacedGroup.getPDBName(),
337                                        atomResGroup.getResidueNumber(), atomResGroup.getPDBName());
338
339                }
340
341                // all went ok. copy over the updated list to the original one.
342                // note: if something went wrong, we did not modifiy the original list.
343                //seqResGroups = newSeqResGroups;
344
345
346                //                      int pos = -1;
347                //                      for (Group g: seqResGroups){
348                //                              pos++;
349                //                              logger.debug(pos + " " + g);
350                //                      }
351
352                //System.out.println("I:" + seqResGroups);
353                // all atom records could get matched correctly!
354                return newSeqResGroups;
355
356        }
357
358        /**
359         * Returns the full sequence of the Atom records of a parent
360         * with X instead of HETATMSs. The advantage of this is
361         * that it allows us to also align HETATM groups back to the SEQRES.
362         * @param groups the list of groups in a parent
363         * @param positionIndex a Map to keep track of which group is at which sequence position
364         * @param isNucleotideChain whether the atom groups are predominantly nucleotides (the groups represent a nucleotide chain), if true
365         * non-standard nucleotides will be represented with ambiguous letter 'N' instead of 'X', if false all non-standard residues will be 'X'
366         * @return string representations
367         */
368        public static String getFullAtomSequence(List<Group> groups, Map<Integer, Integer> positionIndex, boolean isNucleotideChain){
369
370                StringBuffer sequence = new StringBuffer() ;
371                int seqIndex = 0; // track sequence.length()
372                for ( int i=0 ; i< groups.size(); i++){
373                        Group g = groups.get(i);
374
375                        if ( g instanceof AminoAcid ){
376                                AminoAcid a = (AminoAcid)g;
377                                char oneLetter =a.getAminoType();
378                                if ( oneLetter == '?')
379                                        oneLetter = 'X';
380
381                                positionIndex.put(seqIndex,i);
382                                sequence.append(oneLetter );
383                                seqIndex++;
384                        } else {
385
386                                // exclude solvents
387                                if (  g.isWater())
388                                        continue;
389
390                                // exclude metals
391                                if ( g.size() == 1 ) {
392
393                                        Atom a = g.getAtom(0);
394                                        if ( a == null)
395                                                continue;
396                                        if (a.getElement().isMetal())
397                                                continue;
398
399                                }
400
401                                ChemComp cc = g.getChemComp();
402                                if ( cc == null) {
403                                        logger.debug("No chem comp available for group {}",g.toString());
404                                        // not sure what to do in that case!
405                                        continue;
406                                }
407                                if (
408                                                ResidueType.lPeptideLinking.equals(cc.getResidueType()) ||
409                                                PolymerType.PROTEIN_ONLY.contains(cc.getPolymerType())  ||
410                                                PolymerType.POLYNUCLEOTIDE_ONLY.contains(cc.getPolymerType())
411                                                ) {
412                                        //System.out.println(cc.getOne_letter_code());
413                                        String c = cc.getOneLetterCode();
414                                        if ( c.equals("?")) {
415                                                if (isNucleotideChain && PolymerType.POLYNUCLEOTIDE_ONLY.contains(cc.getPolymerType())) {
416                                                        // the way to represent unknown nucleotides is with 'N', see https://en.wikipedia.org/wiki/Nucleic_acid_notation
417                                                        c = "N";
418                                                } else {
419                                                        c = "X";
420                                                }
421                                        }
422
423                                        // For some unusual cases the het residue can map to 2 or more standard aas and thus give an
424                                        // insertion of length >1.
425                                        //      e.g. 1: SUI maps to DG  (in 1oew,A)
426                                        //              e.g. 2: NRQ maps to MYG (in 3cfh,A)
427                                        if (c.length()>1) {
428                                                logger.info("Group '{}' maps to more than 1 standard aminoacid: {}.",
429                                                                g.toString(), c);
430                                        }
431                                        // because of the mapping to more than 1 aminoacid, we have
432                                        // to loop through it (99% of cases c will have length 1 anyway)
433                                        for (int cIdx=0;cIdx<c.length();cIdx++) {
434                                                positionIndex.put(seqIndex,i);
435                                                sequence.append(c.charAt(cIdx));
436                                                seqIndex++;
437                                        }
438                                } else {
439                                        logger.debug("Group {} is not lPeptideLinked, nor PROTEIN_ONLY, nor POLYNUCLEOTIDE_ONLY",g.toString());
440                                        continue;
441                                }
442
443
444                                //sequence.append("X");
445                        }
446
447                }
448
449                return sequence.toString();
450
451        }
452
453
454        private boolean alignNucleotideGroups(List<Group> seqRes, List<Group> atomRes) {
455
456                Map<Integer,Integer> seqresIndexPosition = new HashMap<Integer, Integer>();
457                Map<Integer,Integer> atomIndexPosition   = new HashMap<Integer, Integer>();
458
459                String seq1 = getFullAtomSequence(seqRes, seqresIndexPosition, true);
460                //
461                String seq2 = getFullAtomSequence(atomRes, atomIndexPosition, true);
462
463                if (seq1.isEmpty() || seq2.isEmpty()) {
464                        logger.warn("Could not align nucleotide sequences, at least one of them is empty");
465                        return true;
466                }
467
468                logger.debug("align seq1 ("+ seq1.length()+") " + seq1);
469                logger.debug("align seq2 ("+ seq2.length()+") " + seq2);
470
471                Sequence<NucleotideCompound> s1 = getNucleotideSequence(seq1);
472                Sequence<NucleotideCompound> s2 = getNucleotideSequence(seq2);
473
474                if (s1==null || s2==null) return true;
475
476                if ( ! s1.getCompoundSet().equals(s2.getCompoundSet()) ) {
477                        // e.g. trying to align a DNA and an RNA sequence...
478                        // test PDB ID: 1A34
479                        // try to make both RNA sequence...
480                        if ( ! s1.getCompoundSet().equals(AmbiguityRNACompoundSet.getRNACompoundSet())) {
481                                try {
482                                        s1 = new RNASequence(seq1,AmbiguityRNACompoundSet.getRNACompoundSet());
483                                } catch (CompoundNotFoundException ex) {
484                                        logger.warn("Could not align DNA and RNA compound sets: " + seq1);
485                                        return true;
486                                }
487                        }
488
489                        if( ! s2.getCompoundSet().equals(AmbiguityRNACompoundSet.getRNACompoundSet())) {
490                                try {
491                                        s2 = new RNASequence(seq2,AmbiguityRNACompoundSet.getRNACompoundSet());
492                                } catch (CompoundNotFoundException ex) {
493                                        logger.warn("Could not align DNA and RNA compound sets: " + seq2);
494                                        return true;
495                                }
496                        }
497                }
498
499
500                SubstitutionMatrix<NucleotideCompound> matrix = SubstitutionMatrixHelper.getNuc4_4();
501
502                GapPenalty penalty = new SimpleGapPenalty(8,1);
503
504                PairwiseSequenceAligner<Sequence<NucleotideCompound>, NucleotideCompound> smithWaterman =
505                                Alignments.getPairwiseAligner(s1, s2, PairwiseSequenceAlignerType.LOCAL, penalty, matrix);
506
507
508
509                SequencePair<Sequence<NucleotideCompound>, NucleotideCompound> pair = smithWaterman.getPair();
510
511
512
513                if ( pair == null) {
514                        logger.warn("Could not align nucleotide sequences. ATOM and SEQRES groups will not be aligned.");
515                        logger.warn("Sequences: ");
516                        logger.warn(seq1);
517                        logger.warn(seq2);
518                        return true;
519
520                }
521
522
523
524                logger.debug("Alignment:\n"+pair.toString(100));
525
526
527                boolean noMatchFound = mapDNAChains(seqRes,atomRes,pair,seqresIndexPosition, atomIndexPosition );
528
529                return noMatchFound;
530
531        }
532
533        private Sequence<NucleotideCompound> getNucleotideSequence(String seq) {
534                Sequence<NucleotideCompound> s = null;
535
536                // first we try DNA, then RNA, them hybrid
537
538                try {
539
540                        s = new DNASequence(seq, AmbiguityDNACompoundSet.getDNACompoundSet());
541                } catch (CompoundNotFoundException e){
542
543                        try {
544                                s= new RNASequence(seq, AmbiguityRNACompoundSet.getRNACompoundSet());
545                        } catch (CompoundNotFoundException ex) {
546
547                                try {
548                                        // it could still be a hybrid, e.g. 3hoz, chain T, what to do in that case?
549                                        s = new DNASequence(seq, AmbiguityDNARNAHybridCompoundSet.getDNARNAHybridCompoundSet());
550                                        logger.warn("Hybrid RNA/DNA sequence detected for sequence {}", seq);
551                                } catch (CompoundNotFoundException exc) {
552                                        // not DNA, nor RNA, nor hybrid
553                                        logger.warn("Could not determine compound set (neither DNA, RNA nor hybrid) for nucleotide sequence " + seq);
554                                        return null;
555                                }
556
557                        }
558                }
559                return s;
560        }
561
562
563        /**
564         * Aligns two chains of groups, where the first parent is representing the
565         * list of amino acids as obtained from the SEQRES records, and the second parent
566         * represents the groups obtained from the ATOM records (and containing the actual ATOM information).
567         * This does the actual alignment and if a group can be mapped to a position in the SEQRES then the corresponding
568         * position is replaced with the group that contains the atoms.
569         *
570         * @param seqRes
571         * @param atomRes
572         * @return true if no match has been found
573         */
574        private boolean alignProteinChains(List<Group> seqRes, List<Group> atomRes) {
575
576                Map<Integer,Integer> seqresIndexPosition = new HashMap<Integer, Integer>();
577                Map<Integer,Integer> atomIndexPosition   = new HashMap<Integer, Integer>();
578
579                String seq1 = getFullAtomSequence(seqRes, seqresIndexPosition, false);
580                //
581                String seq2 = getFullAtomSequence(atomRes, atomIndexPosition, false);
582
583
584                logger.debug("Protein seq1 to align (length "+ seq1.length()+"): " + seq1);
585                logger.debug("Protein seq2 to align (length "+ seq2.length()+"): " + seq2);
586
587                ProteinSequence s1;
588                ProteinSequence s2;
589                try {
590                        s1 = new ProteinSequence(seq1);
591                        s2 = new ProteinSequence(seq2);
592                } catch (CompoundNotFoundException e) {
593                        logger.warn("Could not create protein sequences ({}) to align ATOM and SEQRES groups, they will remain unaligned.", e.getMessage());
594                        return true;
595                }
596
597
598                SubstitutionMatrix<AminoAcidCompound> matrix = SubstitutionMatrixHelper.getBlosum65();
599
600                GapPenalty penalty = new SimpleGapPenalty(8, 1);
601
602
603                PairwiseSequenceAligner<ProteinSequence, AminoAcidCompound> smithWaterman =
604                                Alignments.getPairwiseAligner(s1, s2, PairwiseSequenceAlignerType.LOCAL, penalty, matrix);
605
606                SequencePair<ProteinSequence, AminoAcidCompound> pair = smithWaterman.getPair();
607
608
609                // sequences that are only X (e.g. 1jnv chain A) produced empty alignments, because nothing aligns to nothing and thus the local alignment is empty
610                // to avoid those empty alignments we catch them here with pair.getLength()==0
611                if ( pair == null || pair.getLength()==0) {
612                        logger.warn("Could not align protein sequences. ATOM and SEQRES groups will not be aligned.");
613                        logger.warn("Sequences: ");
614                        logger.warn(seq1);
615                        logger.warn(seq2);
616                        return true;
617                }
618
619
620                logger.debug("Alignment:\n"+pair.toString(100));
621
622
623                boolean noMatchFound = mapChains(seqRes,atomRes,pair,seqresIndexPosition, atomIndexPosition );
624
625                return noMatchFound;
626
627
628        }
629
630
631        private boolean mapChains(List<Group> seqResGroups, List<Group> atomRes,
632                        SequencePair<ProteinSequence, AminoAcidCompound> pair,
633                        Map<Integer,Integer> seqresIndexPosition,
634                        Map<Integer,Integer> atomIndexPosition )   {
635
636
637
638                // at the present stage the future seqRes are still stored as Atom groups in the seqRes parent...
639
640
641                int aligLength = pair.getLength();
642
643                // make sure we actually find an alignment
644                boolean noMatchFound = true;
645
646                Compound gapSymbol =  AminoAcidCompoundSet.getAminoAcidCompoundSet().getCompoundForString("-");
647
648                mainLoop:
649                        for (int i = 1; i <= aligLength ; i++) {
650
651                                Compound s =  pair.getCompoundAt(1, i);
652                                Compound a =  pair.getCompoundAt(2, i);
653
654                                // alignment is using internal index start at 1...
655                                int posSeq  = pair.getIndexInQueryAt(i)  - 1;
656                                int posAtom = pair.getIndexInTargetAt(i) - 1;
657
658                                if (  s.equals(gapSymbol) || a.equals(gapSymbol)){
659                                        continue;
660                                }
661
662                                if ( s.equals(a)){
663
664                                        // the atom record can be aligned to the SeqRes record!
665                                        // replace the SeqRes group with the Atom group!
666
667                                        Group s1 = seqResGroups.get(seqresIndexPosition.get(posSeq));
668                                        Group a1 = atomRes.get(atomIndexPosition.get(posAtom));
669
670                                        if ( s1 == null || a1 == null){
671                                                /// can't map this position...
672                                                logger.warn("can't map " + i + ":" + s + " " + posSeq +" " + s1 + " atom: " + posAtom + " " + a1 );
673                                                continue mainLoop;
674                                        }
675
676                                        // need to trim the names to allow matching e.g in
677                                        // pdb1b2m
678                                        String pdbNameS = s1.getPDBName();
679                                        String pdbNameA = a1.getPDBName();
680
681                                        if ( pdbNameS == null || pdbNameA == null ){
682                                                logger.warn("null value for group.getPDBName found at {} when trying to align {} and {} {}",posSeq, s1, a1, posAtom);
683                                                logger.warn("ATOM and SEQRES sequences will not be aligned.");
684                                                return true;
685                                        }
686
687                                        if ( ! pdbNameA.trim().equals(pdbNameS.trim())) {
688
689                                                String msg = "'"+ s1 + "' (position " + posSeq + ") does not align with '" + a1+ "' (position " + posAtom + "), should be: " + s + " : " + a;
690
691                                                if ( s1.getType().equals(HetatomImpl.type) && a1.getType().equals(HetatomImpl.type)){
692                                                        logger.info(msg + ". They seem to be hetatoms, so ignoring mismatch.");
693                                                }
694                                                else {
695                                                        logger.warn(msg + ". This could be a problem because they aren't both hetatoms");
696                                                }
697
698                                        }
699
700                                        // do the actual replacing of the SEQRES group with the ATOM group
701                                        //                                      if ( s1.getChain().getChainID().equals("A")) {
702                                        //                                              System.out.println(" setting " + posSeq +" " + a1);
703                                        //                                      }
704                                        seqResGroups.set(seqresIndexPosition.get(posSeq),a1);
705                                        noMatchFound = false;
706                                }
707                        }
708
709
710                // now we merge the two chains into one
711                // the Groups that can be aligned are now pointing to the
712                // groups in the Atom records.
713                if (  noMatchFound) {
714
715                        logger.debug("no alignment found!");
716                }
717                return noMatchFound;
718
719        }
720
721        private boolean mapDNAChains(List<Group> seqResGroups, List<Group> atomRes,
722                        SequencePair<Sequence<NucleotideCompound>, NucleotideCompound> pair,
723                        Map<Integer,Integer> seqresIndexPosition,
724                        Map<Integer,Integer> atomIndexPosition)   {
725
726
727                // at the present stage the future seqREs are still stored as Atom groups in the seqRes parent...
728
729
730                int aligLength = pair.getLength();
731
732                //              System.out.println("sequence length seqres:");
733                //              System.out.println(seqresIndexPosition.keySet().size());
734                //              System.out.println("alignment length: " + aligLength);
735                // System.out.println(gapSymbol.getName());
736
737                // make sure we actually find an alignment
738                boolean noMatchFound = true;
739
740                Compound gapSymbol =  DNACompoundSet.getDNACompoundSet().getCompoundForString("-");
741
742                mainLoop:
743                        for (int i = 1; i <= aligLength ; i++) {
744
745                                Compound s =  pair.getCompoundAt(1, i);
746                                Compound a =  pair.getCompoundAt(2,i);
747
748                                // alignment is using internal index start at 1...
749                                int posSeq = pair.getIndexInQueryAt(i)  -1;
750                                int posAtom = pair.getIndexInTargetAt(i) -1;
751
752                                if (  s.equals(gapSymbol) || a.equals(gapSymbol)){
753                                        continue;
754                                }
755
756                                if ( s.equals(a)){
757
758                                        // the atom record can be aligned to the SeqRes record!
759                                        // replace the SeqRes group with the Atom group!
760
761                                        Group s1 = seqResGroups.get(seqresIndexPosition.get(posSeq));
762                                        Group a1 = atomRes.get(atomIndexPosition.get(posAtom));
763
764                                        if ( s1 == null || a1 == null){
765                                                /// can't map this position...
766                                                logger.warn("can't map " + i + ":" + s + " " + posSeq +" " + s1 + " atom: " + posAtom + " " + a1 );
767                                                continue mainLoop;
768                                        }
769
770                                        // need to trim the names to allow matching e.g in
771                                        // pdb1b2m
772                                        String pdbNameS = s1.getPDBName();
773                                        String pdbNameA = a1.getPDBName();
774                                        if ( pdbNameS == null || pdbNameA == null ){
775                                                logger.warn("null value found for group.getPDBName() at " + posSeq + " when trying to align " + s1 + " and " + a1 + " " + posAtom);
776                                                logger.warn("ATOM and SEQRES sequences will not be aligned.");
777                                        }
778                                        if (! pdbNameA.equals(pdbNameS)){
779                                                if ( ! pdbNameA.trim().equals(pdbNameS.trim())) {
780                                                        logger.info(s1 + " " + posSeq + " does not align with " + a1+ " " + posAtom + " should be: " + s + " : " + a);
781                                                        if ( s1.getType().equals(HetatomImpl.type) && a1.getType().equals(HetatomImpl.type)){
782                                                                logger.info("they seem to be hetatoms, so ignoring mismatch.");
783                                                        }
784                                                        else {
785                                                                //  System.exit(0);// for debug only
786                                                                //System.out.println(lst1.seqString());
787                                                                //System.out.println(lst2.seqString());
788                                                                logger.warn("Could not match residues " + s1 + " " + a1);
789                                                        }
790
791                                                }
792                                        }
793
794                                        // do the actual replacing of the SEQRES group with the ATOM group
795                                        //                                      if ( s1.getChain().getChainID().equals("A")) {
796                                        //                                              System.out.println(" setting " + posSeq +" " + a1);
797                                        //                                      }
798                                        seqResGroups.set(seqresIndexPosition.get(posSeq),a1);
799                                        noMatchFound = false;
800                                }
801                        }
802
803
804                // now we merge the two chains into one
805                // the Groups that can be aligned are now pointing to the
806                // groups in the Atom records.
807                if (  noMatchFound) {
808
809                        logger.debug("no alignment found!");
810                }
811                return noMatchFound;
812
813        }
814
815        /**
816         * Storing unaligned SEQRES groups in a Structure.
817         * @param structure
818         * @param seqResChains
819         */
820        public static void storeUnAlignedSeqRes(Structure structure, List<Chain> seqResChains, boolean headerOnly) {
821
822
823                if (headerOnly) {
824
825                        List<Chain> atomChains = new ArrayList<>();
826                        for (Chain seqRes: seqResChains) {
827                                // In header-only mode skip ATOM records.
828                                // Here we store chains with SEQRES instead of AtomGroups.
829                                seqRes.setSeqResGroups(seqRes.getAtomGroups());
830                                seqRes.setAtomGroups(new ArrayList<>()); // clear out the atom groups.
831
832                                atomChains.add(seqRes);
833
834                        }
835                        structure.setChains(0, atomChains);
836
837                } else {
838
839                        for (int i = 0; i < structure.nrModels(); i++) {
840                                List<Chain> atomChains   = structure.getModel(i);
841
842                                for (Chain seqRes: seqResChains){
843                                        Chain atomRes;
844
845                                        // Otherwise, we find a chain with AtomGroups
846                                        // and set this as SEQRES groups.
847                                        // TODO no idea if new parameter useChainId should be false or true here, used true as a guess - JD 2016-05-09
848                                        atomRes = SeqRes2AtomAligner.getMatchingAtomRes(seqRes,atomChains,true);
849                                        if ( atomRes != null)
850                                                atomRes.setSeqResGroups(seqRes.getAtomGroups());
851                                        else
852                                                logger.warn("Could not find atom records for chain " + seqRes.getId());
853                                }
854
855
856                        }
857                }
858        }
859}