001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021package org.biojava.bio.alignment;
022
023import org.biojava.bio.BioException;
024import org.biojava.bio.SimpleAnnotation;
025import org.biojava.bio.seq.Sequence;
026import org.biojava.bio.seq.impl.SimpleGappedSequence;
027import org.biojava.bio.seq.impl.SimpleSequence;
028import org.biojava.bio.seq.io.SymbolTokenization;
029import org.biojava.bio.symbol.SimpleSymbolList;
030import org.biojava.bio.symbol.SymbolList;
031
032/**
033 * Needleman and Wunsch defined the problem of global sequence alignments, from
034 * the first till the last symbol of a sequence. This class is able to perform
035 * such global sequence comparisons efficiently by dynamic programming. If
036 * inserts and deletes are equally expensive and as expensive as the extension
037 * of a gap, the alignment method of this class does not use affine gap
038 * penalties. Otherwise it does. Those costs need four times as much memory,
039 * which has significant effects on the run time, if the computer needs to swap.
040 * 
041 * @author Andreas Dräger
042 * @author Gero Greiner
043 * @author Mark Schreiber
044 * @since 1.5
045 */
046
047public class NeedlemanWunsch extends AlignmentAlgorithm {
048
049        /**
050         * This just computes the minimum of three integer values.
051         * 
052         * @param x
053         * @param y
054         * @param z
055         * @return Gives the minimum of three integers
056         */
057        protected static int min(int x, int y, int z) {
058                if ((x < y) && (x < z))
059                        return x;
060                if (y < z)
061                        return y;
062                return z;
063        }
064
065        /**
066         * Prints a String representation of the CostMatrix for the given Alignment
067         * on the screen. This can be used to get a better understanding of the
068         * algorithm. There is no other purpose. This method also works for all
069         * extensions of this class with all kinds of matrices.
070         * 
071         * @param CostMatrix
072         *            The matrix that contains all expenses for swapping symbols.
073         * @param queryChar
074         *            a character representation of the query sequence (
075         *            <code>mySequence.seqString().toCharArray()</code>).
076         * @param targetChar
077         *            a character representation of the target sequence.
078         * @return a String representation of the matrix.
079         */
080        public static String printCostMatrix(int[][] CostMatrix, char[] queryChar,
081                        char[] targetChar) {
082                int line, col;
083                StringBuilder output = new StringBuilder();
084                output.append('\t');
085                String ls = System.getProperty("line.separator");
086
087                for (col = 0; col <= targetChar.length; col++)
088                        if (col == 0) {
089                                output.append('[');
090                                output.append(col);
091                                output.append("]\t");
092                        } else {
093                                output.append('[');
094                                output.append(targetChar[col - 1]);
095                                output.append("]\t");
096                        }
097                for (line = 0; line <= queryChar.length; line++) {
098                        if (line == 0) {
099                                output.append(ls);
100                                output.append('[');
101                                output.append(line);
102                                output.append("]\t");
103                        } else {
104                                output.append(ls);
105                                output.append('[');
106                                output.append(queryChar[line - 1]);
107                                output.append("]\t");
108                        }
109                        for (col = 0; col <= targetChar.length; col++) {
110                                output.append(CostMatrix[line][col]);
111                                output.append('\t');
112                        }
113                }
114                output.append(ls);
115                output.append("delta[Edit] = ");
116                output.append(CostMatrix[line - 1][col - 1]);
117                output.append(ls);
118                return output.toString();
119        }
120
121        /**
122         * A matrix with the size length(sequence1) times length(sequence2)
123         */
124        protected int[][] CostMatrix;
125
126        /**
127         * Expenses for deletes.
128         */
129        private short delete;
130
131        /**
132         * Expenses for the extension of a gap.
133         */
134        private short gapExt;
135
136        /**
137         * Expenses for inserts.
138         */
139        private short insert;
140
141        /**
142         * Expenses for matches.
143         */
144        private short match;
145
146        /**
147         * Expenses for replaces.
148         */
149        private short replace;
150
151        /**
152         * A matrix with the size length(alphabet) times length(alphabet)
153         */
154        protected SubstitutionMatrix subMatrix;
155
156        /**
157         * Constructs a new Object with the given parameters based on the
158         * Needleman-Wunsch algorithm The alphabet of sequences to be aligned will
159         * be taken from the given substitution matrix.
160         * 
161         * @param match
162         *            This gives the costs for a match operation. It is only used,
163         *            if there is no entry for a certain match of two symbols in the
164         *            substitution matrix (default value).
165         * @param replace
166         *            This is like the match parameter just the default, if there is
167         *            no entry in the substitution matrix object.
168         * @param insert
169         *            The costs of a single insert operation.
170         * @param delete
171         *            The expenses of a single delete operation.
172         * @param gapExtend
173         *            The expenses of an extension of a existing gap (that is a
174         *            previous insert or delete. If the costs for insert and delete
175         *            are equal and also equal to gapExtend, no affine gap penalties
176         *            will be used, which saves a significant amount of memory.
177         * @param subMat
178         *            The substitution matrix object which gives the costs for
179         *            matches and replaces.
180         */
181        public NeedlemanWunsch(short match, short replace, short insert,
182                        short delete, short gapExtend, SubstitutionMatrix subMat) {
183                this.subMatrix = subMat;
184                this.insert = insert;
185                this.delete = delete;
186                this.gapExt = gapExtend;
187                this.match = match;
188                this.replace = replace;
189        }
190
191        /**
192         * Returns the current expenses of a single delete operation.
193         * 
194         * @return delete
195         */
196        public short getDelete() {
197                return delete;
198        }
199
200        /**
201         * This gives the edit distance according to the given parameters of this
202         * certain object. It returns just the last element of the internal cost
203         * matrix (left side down). So if you extend this class, you can just do the
204         * following:
205         * <code>int myDistanceValue = foo; this.CostMatrix = new int[1][1]; this.CostMatrix[0][0] = myDistanceValue;</code>
206         * 
207         * @return returns the edit_distance computed with the given parameters.
208         */
209        public int getEditDistance() {
210                return CostMatrix[CostMatrix.length - 1][CostMatrix[CostMatrix.length - 1].length - 1];
211        }
212
213        /**
214         * Returns the current expenses of any extension of a gap operation.
215         * 
216         * @return gapExt
217         */
218        public short getGapExt() {
219                return gapExt;
220        }
221
222        /**
223         * Returns the current expenses of a single insert operation.
224         * 
225         * @return insert
226         */
227        public short getInsert() {
228                return insert;
229        }
230
231        /**
232         * Returns the current expenses of a single match operation.
233         * 
234         * @return match
235         */
236        public short getMatch() {
237                return match;
238        }
239
240        /**
241         * Returns the current expenses of a single replace operation.
242         * 
243         * @return replace
244         */
245        public short getReplace() {
246                return replace;
247        }
248
249        /**
250         * This method computes the scores for the substitution of the i-th symbol
251         * of query by the j-th symbol of subject.
252         * 
253         * @param query
254         *            The query sequence
255         * @param subject
256         *            The target sequence
257         * @param i
258         *            The position of the symbol under consideration within the
259         *            query sequence (starting from one)
260         * @param j
261         *            The position of the symbol under consideration within the
262         *            target sequence
263         * @return The score for the given substitution.
264         */
265        private int matchReplace(Sequence query, Sequence subject, int i, int j) {
266                try {
267                        return subMatrix.getValueAt(query.symbolAt(i), subject.symbolAt(j));
268                } catch (Exception exc) {
269                        if (query.symbolAt(i).getMatches().contains(subject.symbolAt(j))
270                                        || subject.symbolAt(j).getMatches().contains(
271                                                        query.symbolAt(i)))
272                                return -match;
273                        return -replace;
274                }
275        }
276
277        /**
278         * Global pairwise sequence alignment of two BioJava-Sequence objects
279         * according to the Needleman-Wunsch-algorithm.
280         * 
281         * @see org.biojava.bio.alignment.AlignmentAlgorithm#pairwiseAlignment(org.biojava.bio.symbol.SymbolList,
282         *      org.biojava.bio.symbol.SymbolList)
283         */
284        @Override
285        public AlignmentPair pairwiseAlignment(SymbolList query, SymbolList subject)
286                        throws BioException {
287                Sequence squery = null;
288                Sequence ssubject = null;
289
290                if (query instanceof Sequence) {
291                        squery = (Sequence) query;
292                } else {
293                        // make it a sequence
294                        squery = new SimpleSequence(query, "", "query",
295                                        new SimpleAnnotation());
296                }
297                if (subject instanceof Sequence) {
298                        ssubject = (Sequence) subject;
299                } else {
300                        // make it a sequence
301                        ssubject = new SimpleSequence(subject, "", "subject",
302                                        new SimpleAnnotation());
303                }
304
305                SymbolTokenization st = null;
306                st = subMatrix.getAlphabet().getTokenization("default");
307
308                if (squery.getAlphabet().equals(ssubject.getAlphabet())
309                                && squery.getAlphabet().equals(subMatrix.getAlphabet())) {
310
311                        StringBuffer[] align = { new StringBuffer(), new StringBuffer() };
312
313                        long time = System.currentTimeMillis();
314                        int i, j;
315                        this.CostMatrix = new int[squery.length() + 1][ssubject.length() + 1];
316
317                        /*
318                         * Variables for the traceback
319                         */
320
321                        StringBuffer path = new StringBuffer();
322
323                        // construct the matrix:
324                        CostMatrix[0][0] = 0;
325
326                        /*
327                         * If we want to have affine gap penalties, we have to initialise
328                         * additional matrices: If this is not necessary, we won't do that
329                         * (because it's expensive).
330                         */
331                        if ((gapExt != delete) || (gapExt != insert)) {
332
333                                int[][] E = new int[squery.length() + 1][ssubject.length() + 1]; // Inserts
334                                int[][] F = new int[squery.length() + 1][ssubject.length() + 1]; // Deletes
335
336                                E[0][0] = F[0][0] = Integer.MAX_VALUE; // Double.MAX_VALUE;
337                                for (i = 1; i <= squery.length(); i++) {
338                                        // CostMatrix[i][0] = CostMatrix[i-1][0] + delete;
339                                        E[i][0] = Integer.MAX_VALUE; // Double.POSITIVE_INFINITY;
340                                        CostMatrix[i][0] = F[i][0] = delete + i * gapExt;
341                                }
342                                for (j = 1; j <= ssubject.length(); j++) {
343                                        // CostMatrix[0][j] = CostMatrix[0][j - 1] + insert;
344                                        F[0][j] = Integer.MAX_VALUE; // Double.POSITIVE_INFINITY;
345                                        CostMatrix[0][j] = E[0][j] = insert + j * gapExt;
346                                }
347                                for (i = 1; i <= squery.length(); i++)
348                                        for (j = 1; j <= ssubject.length(); j++) {
349                                                E[i][j] = Math.min(E[i][j - 1], CostMatrix[i][j - 1]
350                                                                + insert)
351                                                                + gapExt;
352                                                F[i][j] = Math.min(F[i - 1][j], CostMatrix[i - 1][j]
353                                                                + delete)
354                                                                + gapExt;
355                                                CostMatrix[i][j] = min(E[i][j], F[i][j],
356                                                                CostMatrix[i - 1][j - 1]
357                                                                                - matchReplace(squery, ssubject, i, j));
358                                        }
359
360                                /*
361                                 * Traceback for affine gap penalties.
362                                 */
363
364                                boolean[] gap_extend = { false, false };
365                                j = this.CostMatrix[CostMatrix.length - 1].length - 1;
366
367                                for (i = this.CostMatrix.length - 1; i > 0;) {
368                                        do {
369                                                // only Insert.
370                                                if (i == 0) {
371                                                        align[0].insert(0, '-');
372                                                        align[1].insert(0, st.tokenizeSymbol(ssubject
373                                                                        .symbolAt(j--)));
374                                                        path.insert(0, ' ');
375
376                                                        // only Delete.
377                                                } else if (j == 0) {
378                                                        align[0].insert(0, st.tokenizeSymbol(squery
379                                                                        .symbolAt(i--)));
380                                                        align[1].insert(0, '-');
381                                                        path.insert(0, ' ');
382
383                                                        // Match/Replace
384                                                } else if ((CostMatrix[i][j] == CostMatrix[i - 1][j - 1]
385                                                                - matchReplace(squery, ssubject, i, j))
386                                                                && !(gap_extend[0] || gap_extend[1])) {
387                                                        if (squery.symbolAt(i) == ssubject.symbolAt(j))
388                                                                path.insert(0, '|');
389                                                        else
390                                                                path.insert(0, ' ');
391                                                        align[0].insert(0, st.tokenizeSymbol(squery
392                                                                        .symbolAt(i--)));
393                                                        align[1].insert(0, st.tokenizeSymbol(ssubject
394                                                                        .symbolAt(j--)));
395
396                                                        // Insert || finish gap if extended gap is
397                                                        // opened
398                                                } else if (CostMatrix[i][j] == E[i][j] || gap_extend[0]) {
399                                                        // check if gap has been extended or freshly
400                                                        // opened
401                                                        gap_extend[0] = (E[i][j] != CostMatrix[i][j - 1]
402                                                                        + insert + gapExt);
403
404                                                        align[0].insert(0, '-');
405                                                        align[1].insert(0, st.tokenizeSymbol(ssubject
406                                                                        .symbolAt(j--)));
407                                                        path.insert(0, ' ');
408
409                                                        // Delete || finish gap if extended gap is
410                                                        // opened
411                                                } else {
412                                                        // check if gap has been extended or freshly
413                                                        // opened
414                                                        gap_extend[1] = (F[i][j] != CostMatrix[i - 1][j]
415                                                                        + delete + gapExt);
416
417                                                        align[0].insert(0, st.tokenizeSymbol(squery
418                                                                        .symbolAt(i--)));
419                                                        align[1].insert(0, '-');
420                                                        path.insert(0, ' ');
421                                                }
422                                        } while (j > 0);
423                                }
424
425                                /*
426                                 * No affine gap penalties, constant gap penalties, which is
427                                 * much faster and needs less memory.
428                                 */
429
430                        } else {
431
432                                for (i = 1; i <= squery.length(); i++)
433                                        CostMatrix[i][0] = CostMatrix[i - 1][0] + delete;
434                                for (j = 1; j <= ssubject.length(); j++)
435                                        CostMatrix[0][j] = CostMatrix[0][j - 1] + insert;
436                                for (i = 1; i <= squery.length(); i++)
437                                        for (j = 1; j <= ssubject.length(); j++) {
438                                                CostMatrix[i][j] = min(CostMatrix[i - 1][j] + delete,
439                                                                CostMatrix[i][j - 1] + insert,
440                                                                CostMatrix[i - 1][j - 1]
441                                                                                - matchReplace(squery, ssubject, i, j));
442                                        }
443
444                                /*
445                                 * Traceback for constant gap penalties.
446                                 */
447                                j = this.CostMatrix[CostMatrix.length - 1].length - 1;
448
449                                // System.out.println(printCostMatrix(CostMatrix,
450                                // query.seqString().toCharArray(),
451                                // subject.seqString().toCharArray()));
452
453                                for (i = this.CostMatrix.length - 1; i > 0;) {
454                                        do {
455                                                // only Insert.
456                                                if (i == 0) {
457                                                        align[0].insert(0, '-');
458                                                        align[1].insert(0, st.tokenizeSymbol(ssubject
459                                                                        .symbolAt(j--)));
460                                                        path.insert(0, ' ');
461
462                                                        // only Delete.
463                                                } else if (j == 0) {
464                                                        align[0].insert(0, st.tokenizeSymbol(squery
465                                                                        .symbolAt(i--)));
466                                                        align[1].insert(0, '-');
467                                                        path.insert(0, ' ');
468
469                                                        // Match/Replace
470                                                } else if (CostMatrix[i][j] == CostMatrix[i - 1][j - 1]
471                                                                - matchReplace(squery, ssubject, i, j)) {
472
473                                                        if (squery.symbolAt(i) == ssubject.symbolAt(j))
474                                                                path.insert(0, '|');
475                                                        else
476                                                                path.insert(0, ' ');
477                                                        align[0].insert(0, st.tokenizeSymbol(squery
478                                                                        .symbolAt(i--)));
479                                                        align[1].insert(0, st.tokenizeSymbol(ssubject
480                                                                        .symbolAt(j--)));
481
482                                                        // Insert
483                                                } else if (CostMatrix[i][j] == CostMatrix[i][j - 1]
484                                                                + insert) {
485                                                        align[0].insert(0, '-');
486                                                        align[1].insert(0, st.tokenizeSymbol(ssubject
487                                                                        .symbolAt(j--)));
488                                                        path.insert(0, ' ');
489
490                                                        // Delete
491                                                } else {
492                                                        align[0].insert(0, st.tokenizeSymbol(squery
493                                                                        .symbolAt(i--)));
494                                                        align[1].insert(0, '-');
495                                                        path.insert(0, ' ');
496                                                }
497                                        } while (j > 0);
498                                }
499
500                        }
501
502                        /*
503                         * From here both cases are equal again.
504                         */
505                        squery = new SimpleGappedSequence(new SimpleSequence(
506                                        new SimpleSymbolList(squery.getAlphabet().getTokenization(
507                                                        "token"), align[0].toString()), squery.getURN(),
508                                        squery.getName(), squery.getAnnotation()));
509                        ssubject = new SimpleGappedSequence(new SimpleSequence(
510                                        new SimpleSymbolList(ssubject.getAlphabet()
511                                                        .getTokenization("token"), align[1].toString()),
512                                        ssubject.getURN(), ssubject.getName(), ssubject
513                                                        .getAnnotation()));
514                        AlignmentPair pairalign = new AlignmentPair(squery, ssubject, 1,
515                                        squery.length(), 1, ssubject.length(), subMatrix);
516                        pairalign.setComputationTime(System.currentTimeMillis() - time);
517                        pairalign.setScore((-1) * getEditDistance());
518                        return pairalign;
519                } else {
520                        throw new BioException(
521                                        "Alphabet missmatch occured: sequences with different alphabet cannot be aligned.");
522                }
523        }
524
525        /**
526         * Sets the penalty for a delete operation to the specified value.
527         * 
528         * @param del
529         *            costs for a single deletion operation
530         */
531        public void setDelete(short del) {
532                this.delete = del;
533        }
534
535        /**
536         * Sets the penalty for an extension of any gap (insert or delete) to the
537         * specified value.
538         * 
539         * @param ge
540         *            costs for any gap extension
541         */
542        public void setGapExt(short ge) {
543                this.gapExt = ge;
544        }
545
546        /**
547         * Sets the penalty for an insert operation to the specified value.
548         * 
549         * @param ins
550         *            costs for a single insert operation
551         */
552        public void setInsert(short ins) {
553                this.insert = ins;
554        }
555
556        /**
557         * Sets the penalty for a match operation to the specified value.
558         * 
559         * @param ma
560         *            costs for a single match operation
561         */
562        public void setMatch(short ma) {
563                this.match = ma;
564        }
565
566        /**
567         * Sets the penalty for a replace operation to the specified value.
568         * 
569         * @param rep
570         *            costs for a single replace operation
571         */
572        public void setReplace(short rep) {
573                this.replace = rep;
574        }
575
576        /**
577         * Sets the substitution matrix to be used to the specified one. Afterwards
578         * it is only possible to align sequences of the alphabet of this
579         * substitution matrix.
580         * 
581         * @param matrix
582         *            an instance of a substitution matrix.
583         */
584        public void setSubstitutionMatrix(SubstitutionMatrix matrix) {
585                this.subMatrix = matrix;
586        }
587
588}