001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021
022package org.biojava.bio.alignment;
023
024import org.biojava.bio.BioException;
025import org.biojava.bio.BioRuntimeException;
026import org.biojava.bio.SimpleAnnotation;
027import org.biojava.bio.seq.Sequence;
028import org.biojava.bio.seq.impl.SimpleGappedSequence;
029import org.biojava.bio.seq.impl.SimpleSequence;
030import org.biojava.bio.seq.io.SymbolTokenization;
031import org.biojava.bio.symbol.SimpleSymbolList;
032import org.biojava.bio.symbol.SymbolList;
033
034/**
035 * Smith and Waterman developed an efficient dynamic programming algorithm to
036 * perform local sequence alignments, which returns the most conserved region of
037 * two sequences (longest common substring with modifications). This algorithm
038 * is performed by the method <code>pairwiseAlignment</code> of this class. It
039 * uses affine gap penalties if and only if the expenses of a delete or insert
040 * operation are unequal to the expenses of gap extension. This uses
041 * significantly more memory (four times as much) and increases the runtime if
042 * swapping is performed.
043 * 
044 * @author Andreas Dr&auml;ger
045 * @author Gero Greiner
046 * @author Mark Schreiber
047 * @since 1.5
048 */
049public class SmithWaterman extends AlignmentAlgorithm {
050
051        private short match, replace, insert, delete, gapExt;
052        private SubstitutionMatrix subMatrix;
053
054        /**
055         * Default constructor.
056         */
057        public SmithWaterman() {
058                this.subMatrix = null;
059        }
060
061        /**
062         * Constructs the new SmithWaterman alignment object. Alignments are only
063         * performed, if the alphabet of the given <code>SubstitutionMatrix</code>
064         * equals the alphabet of both the query and the target
065         * <code>Sequence</code>. The alignment parameters here are expenses and not
066         * scores as they are in the <code>NeedlemanWunsch</code> object. scores are
067         * just given by multiplying the expenses with <code>(-1)</code>. For
068         * example you could use parameters like "-2, 5, 3, 3, 0". If the expenses
069         * for gap extension are equal to the cost of starting a gap (delete or
070         * insert), no affine gap penalties are used, which saves memory.
071         * 
072         * @param match
073         *            expenses for a match
074         * @param replace
075         *            expenses for a replace operation
076         * @param insert
077         *            expenses for a gap opening in the query sequence
078         * @param delete
079         *            expenses for a gap opening in the target sequence
080         * @param gapExtend
081         *            expenses for the extension of a gap which was started earlier.
082         * @param matrix
083         *            the <code>SubstitutionMatrix</code> object to use.
084         */
085        public SmithWaterman(short match, short replace, short insert,
086                        short delete, short gapExtend, SubstitutionMatrix matrix) {
087                this.match = (short) -match;
088                this.replace = (short) -replace;
089                this.insert = (short) -insert;
090                this.delete = (short) -delete;
091                this.gapExt = (short) -gapExtend;
092                this.subMatrix = matrix;
093        }
094
095        /**
096         * @return the delete
097         */
098        public short getDelete() {
099                return delete;
100        }
101
102        /**
103         * @return the gapExt
104         */
105        public short getGapExt() {
106                return gapExt;
107        }
108
109        /**
110         * @return the insert
111         */
112        public short getInsert() {
113                return insert;
114        }
115
116        /**
117         * @return the match
118         */
119        public short getMatch() {
120                return match;
121        }
122
123        /**
124         * @return the replace
125         */
126        public short getReplace() {
127                return replace;
128        }
129
130        /**
131         * @return the subMatrix
132         */
133        public SubstitutionMatrix getSubMatrix() {
134                return subMatrix;
135        }
136
137        /**
138         * This method computes the scores for the substitution of the i-th symbol
139         * of query by the j-th symbol of subject.
140         * 
141         * @param query
142         *            The query sequence
143         * @param subject
144         *            The target sequence
145         * @param i
146         *            The position of the symbol under consideration within the
147         *            query sequence (starting from one)
148         * @param j
149         *            The position of the symbol under consideration within the
150         *            target sequence
151         * @return The score for the given substitution.
152         */
153        private short matchReplace(Sequence query, Sequence subject, int i, int j) {
154                try {
155                        if (subMatrix != null)
156                                return subMatrix.getValueAt(query.symbolAt(i), subject
157                                                .symbolAt(j));
158                } catch (Exception exc) {
159                }
160                if (query.symbolAt(i).equals(subject.symbolAt(j)))
161                        return match;
162                return replace;
163        }
164
165        /**
166         * This just computes the maximum of four integers.
167         * 
168         * @param w
169         * @param x
170         * @param y
171         * @param z
172         * @return the maximum of four <code>int</code>s.
173         */
174        private int max(int w, int x, int y, int z) {
175                if ((w > x) && (w > y) && (w > z))
176                        return w;
177                if ((x > y) && (x > z))
178                        return x;
179                if ((y > z))
180                        return y;
181                return z;
182        }
183
184        /**
185         * Overrides the method inherited from the NeedlemanWunsch and performs only
186         * a local alignment. It finds only the longest common subsequence. This is
187         * good for the beginning, but it might be better to have a system to find
188         * more than only one hit within the score matrix. Therefore, one should
189         * only define the k-th best hit, where k is somehow related to the number
190         * of hits.
191         * 
192         * @see AlignmentAlgorithm#pairwiseAlignment(org.biojava.bio.symbol.SymbolList,
193         *      org.biojava.bio.symbol.SymbolList)
194         */
195        public AlignmentPair pairwiseAlignment(SymbolList query, SymbolList subject)
196                        throws BioRuntimeException {
197                int[][] scoreMatrix;
198                Sequence squery = null;
199                Sequence ssubject = null;
200
201                if (query instanceof Sequence) {
202                        squery = (Sequence) query;
203                } else {
204                        // make it a sequence
205                        squery = new SimpleSequence(query, "", "query",
206                                        new SimpleAnnotation());
207                }
208                if (subject instanceof Sequence) {
209                        ssubject = (Sequence) subject;
210                } else {
211                        // make it a sequence
212                        ssubject = new SimpleSequence(subject, "", "subject",
213                                        new SimpleAnnotation());
214                }
215
216                if (squery.getAlphabet().equals(ssubject.getAlphabet())
217                                && (subMatrix == null || squery.getAlphabet().equals(
218                                                subMatrix.getAlphabet()))) {
219                        StringBuffer[] align = { new StringBuffer(), new StringBuffer() };
220                        long time = System.currentTimeMillis();
221                        int i, j, maxI = 0, maxJ = 0, queryStart = 0, subjectStart = 0;
222                        scoreMatrix = new int[squery.length() + 1][ssubject.length() + 1];
223
224                        /*
225                         * Variables needed for traceback
226                         */
227
228                        SymbolTokenization st;
229                        try {
230                                st = squery.getAlphabet().getTokenization("default");
231                        } catch (BioException exc) {
232                                throw new BioRuntimeException(exc);
233                        }
234
235                        /*
236                         * Use affine gap panalties.
237                         */
238                        if ((gapExt != delete) || (gapExt != insert)) {
239
240                                int[][] E = new int[squery.length() + 1][ssubject.length() + 1]; // Inserts
241                                int[][] F = new int[squery.length() + 1][ssubject.length() + 1]; // Deletes
242
243                                scoreMatrix[0][0] = 0;
244                                E[0][0] = F[0][0] = Integer.MIN_VALUE; // Double.NEGATIVE_INFINITY;
245                                for (i = 1; i <= squery.length(); i++) {
246                                        scoreMatrix[i][0] = F[i][0] = 0;
247                                        E[i][0] = Integer.MIN_VALUE; // Double.NEGATIVE_INFINITY;
248                                }
249                                for (j = 1; j <= ssubject.length(); j++) {
250                                        scoreMatrix[0][j] = E[0][j] = 0;
251                                        F[0][j] = Integer.MIN_VALUE; // Double.NEGATIVE_INFINITY;
252                                }
253                                for (i = 1; i <= squery.length(); i++)
254                                        for (j = 1; j <= ssubject.length(); j++) {
255                                                E[i][j] = Math.max(E[i][j - 1], scoreMatrix[i][j - 1]
256                                                                + insert)
257                                                                + gapExt;
258                                                F[i][j] = Math.max(F[i - 1][j], scoreMatrix[i - 1][j]
259                                                                + delete)
260                                                                + gapExt;
261                                                scoreMatrix[i][j] = max(0, E[i][j], F[i][j],
262                                                                scoreMatrix[i - 1][j - 1]
263                                                                                + matchReplace(squery, ssubject, i, j));
264
265                                                if (scoreMatrix[i][j] > scoreMatrix[maxI][maxJ]) {
266                                                        maxI = i;
267                                                        maxJ = j;
268                                                }
269                                        }
270                                // System.out.println(printCostMatrix(G,
271                                // query.seqString().toCharArray(),
272                                // subject.seqString().toCharArray()));
273
274                                /*
275                                 * Here starts the traceback for affine gap penalities
276                                 */
277                                try {
278                                        boolean[] gap_extend = { false, false };
279                                        j = maxJ;
280                                        for (i = maxI; i > 0;) {
281                                                do {
282                                                        // only Deletes or Inserts or Replaces possible.
283                                                        // That's not what we want to have.
284                                                        if (scoreMatrix[i][j] == 0) {
285                                                                queryStart = i;
286                                                                subjectStart = j;
287                                                                i = j = 0;
288
289                                                                // Match/Replace
290                                                        } else if ((scoreMatrix[i][j] == scoreMatrix[i - 1][j - 1]
291                                                                        + matchReplace(squery, ssubject, i, j))
292                                                                        && !(gap_extend[0] || gap_extend[1])) {
293
294                                                                align[0].insert(0, st.tokenizeSymbol(squery
295                                                                                .symbolAt(i--)));
296                                                                align[1].insert(0, st.tokenizeSymbol(ssubject
297                                                                                .symbolAt(j--)));
298
299                                                                // Insert || finish gap if extended gap is
300                                                                // opened
301                                                        } else if (scoreMatrix[i][j] == E[i][j]
302                                                                        || gap_extend[0]) {
303                                                                // check if gap has been extended or freshly
304                                                                // opened
305                                                                gap_extend[0] = (E[i][j] != scoreMatrix[i][j - 1]
306                                                                                + insert + gapExt);
307
308                                                                align[0].insert(0, '-');
309                                                                align[1].insert(0, st.tokenizeSymbol(ssubject
310                                                                                .symbolAt(j--)));
311
312                                                                // Delete || finish gap if extended gap is
313                                                                // opened
314                                                        } else {
315                                                                // check if gap has been extended or freshly
316                                                                // opened
317                                                                gap_extend[1] = (F[i][j] != scoreMatrix[i - 1][j]
318                                                                                + delete + gapExt);
319
320                                                                align[0].insert(0, st.tokenizeSymbol(squery
321                                                                                .symbolAt(i--)));
322                                                                align[1].insert(0, '-');
323                                                        }
324                                                } while (j > 0);
325                                        }
326                                } catch (BioException exc) {
327                                        throw new BioRuntimeException(exc);
328                                }
329
330                                /*
331                                 * No affine gap penalties to save memory.
332                                 */
333                        } else {
334
335                                for (i = 0; i <= squery.length(); i++)
336                                        scoreMatrix[i][0] = 0;
337                                for (j = 0; j <= ssubject.length(); j++)
338                                        scoreMatrix[0][j] = 0;
339                                for (i = 1; i <= squery.length(); i++)
340                                        for (j = 1; j <= ssubject.length(); j++) {
341
342                                                scoreMatrix[i][j] = max(0, scoreMatrix[i - 1][j]
343                                                                + delete, scoreMatrix[i][j - 1] + insert,
344                                                                scoreMatrix[i - 1][j - 1]
345                                                                                + matchReplace(squery, ssubject, i, j));
346
347                                                if (scoreMatrix[i][j] > scoreMatrix[maxI][maxJ]) {
348                                                        maxI = i;
349                                                        maxJ = j;
350                                                }
351                                        }
352
353                                /*
354                                 * Here starts the traceback for non-affine gap penalities
355                                 */
356                                try {
357                                        j = maxJ;
358                                        for (i = maxI; i > 0;) {
359                                                do {
360                                                        // only Deletes or Inserts or Replaces possible.
361                                                        // That's not what
362                                                        // we want to have.
363                                                        if (scoreMatrix[i][j] == 0) {
364                                                                queryStart = i;
365                                                                subjectStart = j;
366                                                                i = j = 0;
367
368                                                                // Match/Replace
369                                                        } else if (scoreMatrix[i][j] == scoreMatrix[i - 1][j - 1]
370                                                                        + matchReplace(squery, ssubject, i, j)) {
371
372                                                                align[0].insert(0, st.tokenizeSymbol(squery
373                                                                                .symbolAt(i--)));
374                                                                align[1].insert(0, st.tokenizeSymbol(ssubject
375                                                                                .symbolAt(j--)));
376
377                                                                // Insert
378                                                        } else if (scoreMatrix[i][j] == scoreMatrix[i][j - 1]
379                                                                        + insert) {
380                                                                align[0].insert(0, '-');
381                                                                align[1].insert(0, st.tokenizeSymbol(ssubject
382                                                                                .symbolAt(j--)));
383
384                                                                // Delete
385                                                        } else {
386                                                                align[0].insert(0, st.tokenizeSymbol(squery
387                                                                                .symbolAt(i--)));
388                                                                align[1].insert(0, '-');
389                                                        }
390                                                } while (j > 0);
391                                        }
392                                } catch (BioException exc) {
393                                        throw new BioRuntimeException(exc);
394                                }
395                        }
396
397                        /*
398                         * From here both cases are equal again.
399                         */
400
401                        try {
402
403                                /*
404                                 * Make equal length alignments!
405                                 */
406                                for (i = 0; i < queryStart; i++)
407                                        align[0].insert(0, '-');
408                                for (i = align[0].length(); i < Math.max(maxI, maxJ); i++)
409                                        align[0].append('-');
410                                for (i = 0; i < subjectStart; i++)
411                                        align[1].insert(0, '-');
412                                for (i = align[1].length(); i < Math.max(maxJ, align[0]
413                                                .length()); i++)
414                                        align[1].append('-');
415                                for (i = align[0].length(); i < align[1].length(); i++)
416                                        align[0].append('-');
417
418                                squery = new SimpleGappedSequence(
419                                                new SimpleSequence(new SimpleSymbolList(squery
420                                                                .getAlphabet().getTokenization("token"),
421                                                                align[0].toString()), squery.getURN(), squery
422                                                                .getName(), squery.getAnnotation()));
423                                ssubject = new SimpleGappedSequence(
424                                                new SimpleSequence(new SimpleSymbolList(ssubject
425                                                                .getAlphabet().getTokenization("token"),
426                                                                align[1].toString()), ssubject.getURN(),
427                                                                ssubject.getName(), ssubject.getAnnotation()));
428
429                                AlignmentPair pairalign = new AlignmentPair(squery, ssubject,
430                                                queryStart + 1, maxI, subjectStart + 1, maxJ, subMatrix);
431                                pairalign.setComputationTime(System.currentTimeMillis() - time);
432                                pairalign.setScore(scoreMatrix[maxI][maxJ]);
433                                // Don't waste any memory.
434                                scoreMatrix = null;
435                                return pairalign;
436
437                        } catch (BioException exc) {
438                                throw new BioRuntimeException(exc);
439                        }
440                } else
441                        throw new BioRuntimeException(
442                                        "The alphabets of the sequences and the substitution matrix have to be equal.");
443        }
444
445        /**
446         * Overrides the method inherited from the NeedlemanWunsch and sets the
447         * penalty for a delete operation to the specified value. Reason: internally
448         * scores are used instead of penalties so that the value is muliplyed with
449         * -1.
450         * 
451         * @param del
452         *            costs for a single deletion operation
453         */
454        public void setDelete(short del) {
455                this.delete = (short) -del;
456        }
457
458        /**
459         * Overrides the method inherited from the NeedlemanWunsch and sets the
460         * penalty for an extension of any gap (insert or delete) to the specified
461         * value. Reason: internally scores are used instead of penalties so that
462         * the value is muliplyed with -1.
463         * 
464         * @param ge
465         *            costs for any gap extension
466         */
467        public void setGapExt(short ge) {
468                this.gapExt = (short) -ge;
469        }
470
471        /**
472         * Overrides the method inherited from the NeedlemanWunsch and sets the
473         * penalty for an insert operation to the specified value. Reason:
474         * internally scores are used instead of penalties so that the value is
475         * muliplyed with -1.
476         * 
477         * @param ins
478         *            costs for a single insert operation
479         */
480        public void setInsert(short ins) {
481                this.insert = (short) -ins;
482        }
483
484        /**
485         * Overrides the method inherited from the NeedlemanWunsch and sets the
486         * penalty for a match operation to the specified value. Reason: internally
487         * scores are used instead of penalties so that the value is muliplyed with
488         * -1.
489         * 
490         * @param ma
491         *            costs for a single match operation
492         */
493        public void setMatch(short ma) {
494                this.match = (short) -ma;
495        }
496
497        /**
498         * Overrides the method inherited from the NeedlemanWunsch and sets the
499         * penalty for a replace operation to the specified value. Reason:
500         * internally scores are used instead of penalties so that the value is
501         * muliplyed with -1.
502         * 
503         * @param rep
504         *            costs for a single replace operation
505         */
506        public void setReplace(short rep) {
507                this.replace = (short) -rep;
508        }
509
510        /**
511         * @param subMatrix
512         *            the subMatrix to set
513         */
514        public void setSubMatrix(SubstitutionMatrix subMatrix) {
515                this.subMatrix = subMatrix;
516        }
517
518}