001/**
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 * Created by Spencer Bliven
021 *
022 */
023package org.biojava.nbio.structure.io;
024
025import org.biojava.nbio.alignment.Alignments;
026import org.biojava.nbio.alignment.Alignments.PairwiseSequenceAlignerType;
027import org.biojava.nbio.alignment.SimpleGapPenalty;
028import org.biojava.nbio.core.alignment.matrices.SimpleSubstitutionMatrix;
029import org.biojava.nbio.core.alignment.template.AlignedSequence;
030import org.biojava.nbio.core.alignment.template.SequencePair;
031import org.biojava.nbio.core.alignment.template.SubstitutionMatrix;
032import org.biojava.nbio.structure.*;
033import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
034import org.biojava.nbio.core.sequence.ProteinSequence;
035import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
036import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet;
037import org.biojava.nbio.core.sequence.template.CompoundSet;
038import org.slf4j.Logger;
039import org.slf4j.LoggerFactory;
040
041import java.io.InputStreamReader;
042import java.util.Arrays;
043import java.util.HashMap;
044import java.util.List;
045import java.util.Map;
046
047
048/**
049 * A utility class with methods for matching ProteinSequences with
050 * Structures.
051 * @author Spencer Bliven
052 *
053 */
054public class StructureSequenceMatcher {
055
056        private static final Logger logger = LoggerFactory.getLogger(StructureSequenceMatcher.class);
057
058        /**
059         * Get a substructure of {@code wholeStructure} containing only the {@link Group Groups} that are included in
060         * {@code sequence}. The resulting structure will contain only {@code ATOM} residues; the SEQ-RES will be empty.
061         * The {@link Chain Chains} of the Structure will be new instances (cloned), but the {@link Group Groups} will not.
062         * @param sequence The input protein sequence
063         * @param wholeStructure The structure from which to take a substructure
064         * @return The resulting structure
065         * @throws StructureException
066         * @see {@link #matchSequenceToStructure(ProteinSequence, Structure)}
067         */
068        public static Structure getSubstructureMatchingProteinSequence(ProteinSequence sequence, Structure wholeStructure) {
069                ResidueNumber[] rns = matchSequenceToStructure(sequence, wholeStructure);
070                Structure structure = wholeStructure.clone();
071                structure.getChains().clear();
072//              structure.getHetGroups().clear();
073                Chain currentChain = null;
074                for (ResidueNumber rn : rns) {
075                        if (rn == null) continue;
076                        Group group; // note that we don't clone
077                        try {
078                                group = StructureTools.getGroupByPDBResidueNumber(wholeStructure, rn);
079                        } catch (StructureException e) {
080                                throw new IllegalArgumentException("Could not find residue " + rn + " in structure", e);
081                        }
082                        Chain chain = new ChainImpl();
083                        chain.setChainID(group.getChainId());
084                        if (currentChain == null || !currentChain.getChainID().equals(chain.getChainID())) {
085                                structure.addChain(chain);
086                                chain.setCompound(group.getChain().getCompound());
087                                chain.setStructure(structure);
088                                chain.setSwissprotId(group.getChain().getSwissprotId());
089                                chain.setInternalChainID(group.getChain().getInternalChainID());
090                                chain.setId(group.getChain().getId());
091                                currentChain = chain;
092                        }
093                        currentChain.addGroup(group);
094                }
095                return structure;
096        }
097
098        /**
099         * Generates a ProteinSequence corresponding to the sequence of struct,
100         * and maintains a mapping from the sequence back to the original groups.
101         *
102         * Chains are appended to one another. 'X' is used for heteroatoms.
103         *
104         * @param struct Input structure
105         * @param groupIndexPosition An empty map, which will be populated with
106         *  (residue index in returned ProteinSequence) -> (Group within struct)
107         * @return A ProteinSequence with the full sequence of struct. Chains are
108         *  concatenated in the same order as the input structures
109         *
110         * @see {@link SeqRes2AtomAligner#getFullAtomSequence(List, Map)}, which
111         *      does the heavy lifting.
112         *
113         */
114        public static ProteinSequence getProteinSequenceForStructure(Structure struct, Map<Integer,Group> groupIndexPosition ) {
115
116                if( groupIndexPosition != null) {
117                        groupIndexPosition.clear();
118                }
119
120                StringBuilder seqStr = new StringBuilder();
121
122                for(Chain chain : struct.getChains()) {
123                        List<Group> groups = chain.getAtomGroups();
124                        Map<Integer,Integer> chainIndexPosition = new HashMap<Integer, Integer>();
125                        int prevLen = seqStr.length();
126
127                        // get the sequence for this chain
128                        String chainSeq = SeqRes2AtomAligner.getFullAtomSequence(groups, chainIndexPosition, false);
129                        seqStr.append(chainSeq);
130
131
132                        // fix up the position to include previous chains, and map the value back to a Group
133                        for(Integer seqIndex : chainIndexPosition.keySet()) {
134                                Integer groupIndex = chainIndexPosition.get(seqIndex);
135                                groupIndexPosition.put(prevLen + seqIndex, groups.get(groupIndex));
136                        }
137                }
138
139                ProteinSequence s = null;
140                try {
141                        s = new ProteinSequence(seqStr.toString());
142                } catch (CompoundNotFoundException e) {
143                        // I believe this can't happen, please correct this if I'm wrong - JD 2014-10-24
144                        // we can log an error if it does, it would mean there's a bad bug somewhere
145                        logger.error("Could not create protein sequence, unknown compounds in string: {}", e.getMessage());
146                }
147
148                return s;
149        }
150
151        /**
152         * Given a sequence and the corresponding Structure, get the ResidueNumber
153         * for each residue in the sequence.
154         *
155         * <p>Smith-Waterman alignment is used to match the sequences. Residues
156         * in the sequence but not the structure or mismatched between sequence
157         * and structure will have a null atom, while
158         * residues in the structure but not the sequence are ignored with a warning.
159         * @param seq The protein sequence. Should match the sequence of struct very
160         *      closely.
161         * @param struct The corresponding protein structure
162         * @return A list of ResidueNumbers of the same length as seq, containing
163         *  either the corresponding residue or null.
164         */
165        public static ResidueNumber[] matchSequenceToStructure(ProteinSequence seq, Structure struct) {
166
167                //1. Create ProteinSequence for struct while remembering to which group each residue corresponds
168                Map<Integer,Group> atomIndexPosition   = new HashMap<Integer, Group>();
169
170                ProteinSequence structSeq = getProteinSequenceForStructure(struct,atomIndexPosition);
171
172                // TODO This should really be semi-global alignment, though global for either sequence OR structure (we don't know which)
173                //2. Run Smith-Waterman to get the alignment
174                // Identity substitution matrix with +1 for match, -1 for mismatch
175                // TODO
176                SubstitutionMatrix<AminoAcidCompound> matrix =
177                        new SimpleSubstitutionMatrix<AminoAcidCompound>(
178                                        AminoAcidCompoundSet.getAminoAcidCompoundSet(),
179                                        (short)1, (short)-1 );
180                matrix = new SimpleSubstitutionMatrix<AminoAcidCompound>(
181                                AminoAcidCompoundSet.getAminoAcidCompoundSet(),
182                                new InputStreamReader(
183                                                SimpleSubstitutionMatrix.class.getResourceAsStream("/matrices/blosum100.txt")),
184                "blosum100");
185                SequencePair<ProteinSequence, AminoAcidCompound> pair =
186                        Alignments.getPairwiseAlignment(seq, structSeq,
187                                        PairwiseSequenceAlignerType.GLOBAL, new SimpleGapPenalty(), matrix);
188
189                //System.out.print(pair.toString());
190
191                //3. Convert the alignment back to Atoms
192                AlignedSequence<ProteinSequence,AminoAcidCompound> alignedSeq = pair.getQuery();
193                AlignedSequence<ProteinSequence,AminoAcidCompound> alignedStruct = pair.getTarget();
194
195
196                assert(alignedSeq.getLength() == alignedStruct.getLength());
197
198//              System.out.println(pair.toString(80));
199//              System.out.format("%d/min{%d,%d}\n", pair.getNumIdenticals(),
200//                              alignedSeq.getLength()-alignedSeq.getNumGaps(),
201//                              alignedStruct.getLength()-alignedStruct.getNumGaps());
202
203                ResidueNumber[] ca = new ResidueNumber[seq.getLength()];
204
205                for( int pos = alignedSeq.getStart().getPosition(); pos <= alignedSeq.getEnd().getPosition(); pos++ ) { // 1-indexed
206                        //skip missing residues from sequence. These probably represent alignment errors
207                        if(alignedSeq.isGap(pos)) {
208                                int structIndex = alignedStruct.getSequenceIndexAt(pos)-1;
209                                assert(structIndex > 0);//should be defined since seq gap
210
211                                Group g = atomIndexPosition.get(structIndex);
212
213                                logger.warn("Chain {} residue {} in the Structure {} has no corresponding amino acid in the sequence.",
214                                                g.getChainId(),
215                                                g.getResidueNumber().toString(),
216                                                g.getChain().getStructure().getPDBCode() );
217                                continue;
218                        }
219
220                        if(! alignedStruct.isGap(pos) ) {
221                                int seqIndex = alignedSeq.getSequenceIndexAt(pos)-1;//1-indexed
222                                int structIndex = alignedStruct.getSequenceIndexAt(pos)-1;//1-indexed
223                                Group g = atomIndexPosition.get(structIndex);
224
225                                assert(0<=seqIndex && seqIndex < ca.length);
226
227                                ca[seqIndex] = g.getResidueNumber(); //remains null for gaps
228                        }
229                }
230                return ca;
231        }
232
233
234        /**
235         * Removes all gaps ('-') from a protein sequence
236         * @param gapped
237         * @return
238         */
239        public static ProteinSequence removeGaps(ProteinSequence gapped) {
240                final String[] gapStrings = {"-","."};
241
242                StringBuilder seq = new StringBuilder();
243
244                CompoundSet<AminoAcidCompound> aaSet = gapped.getCompoundSet();
245                AminoAcidCompound[] gaps = new AminoAcidCompound[gapStrings.length];
246
247                for(int i=0;i<gapStrings.length;i++) {
248                        gaps[i] = aaSet.getCompoundForString(gapStrings[i]);
249                }
250
251                for(int i=1; i<=gapped.getLength();i++) { //1-indexed
252                        AminoAcidCompound aa = gapped.getCompoundAt(i);
253                        boolean isGap = false;
254                        for(AminoAcidCompound gap : gaps) {
255                                if( aa.equals(gap)) {
256                                        isGap = true;
257                                        break;
258                                }
259                        }
260                        if(!isGap) {
261                                seq.append(aa.getShortName());
262                        }
263                }
264
265                ProteinSequence ungapped = null;
266                try {
267                        ungapped = new ProteinSequence(seq.toString());
268                } catch (CompoundNotFoundException e) {
269                        // this can't happen, if it does there's a bug somewhere
270                        logger.error("Could not create ungapped protein sequence, found unknown compounds: {}. This is most likely a bug.", e.getMessage());
271                }
272
273                return ungapped;
274        }
275
276        /**
277         * Creates a new list consisting of all columns of gapped where no row
278         * contained a null value.
279         *
280         * Here, "row" refers to the first index and "column" to the second, eg
281         * gapped.get(row).get(column)
282         * @param gapped A rectangular matrix containing null to mark gaps
283         * @return A new List without columns containing nulls
284         */
285        public static <T> T[][] removeGaps(final T[][] gapped) {
286                if(gapped == null ) return null;
287                if(gapped.length < 1) return Arrays.copyOf(gapped, gapped.length);
288
289                final int nProts = gapped.length;
290                final int protLen = gapped[0].length; // length of gapped proteins
291
292                // Verify that input is rectangular
293                for(int i=0;i<nProts;i++) {
294                        if(gapped[i].length != protLen) {
295                                throw new IllegalArgumentException(String.format(
296                                                "Expected a rectangular array, but row 0 has %d elements " +
297                                                "while row %d has %d.", protLen,i,gapped[i].length));
298
299                        }
300                }
301
302                // determine where gaps exist in any structures
303                boolean[] isGap = new boolean[protLen];
304                int gaps = 0;
305                for(int j=0;j<protLen;j++) {
306                        for(int i=0;i<nProts;i++) {
307                                if(gapped[i][j] == null ) {
308                                        isGap[j] = true;
309                                        gaps++;
310                                        break; //go to next position
311                                }
312                        }
313                }
314
315                // Create ungapped array
316                T[][] ungapped = Arrays.copyOf(gapped,nProts);
317                final int ungappedLen = protLen-gaps;
318                for(int i=0;i<nProts;i++) {
319                        ungapped[i] = Arrays.copyOf(gapped[i],ungappedLen);
320                        int k = 0;
321                        for(int j=0;j<protLen;j++) {
322                                if(!isGap[j]) { //skip gaps
323                                        assert(gapped[i][j] != null);
324                                        ungapped[i][k] = gapped[i][j];
325                                        k++;
326                                }
327                        }
328                        assert(k == ungappedLen);
329                }
330
331                return ungapped;
332        }
333}