001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021package org.biojava.bio.alignment;
022
023import java.util.Formatter;
024import java.util.HashMap;
025import java.util.Map;
026
027import org.biojava.bio.BioException;
028import org.biojava.bio.seq.GappedSequence;
029import org.biojava.bio.seq.Sequence;
030import org.biojava.bio.seq.impl.SimpleGappedSequence;
031import org.biojava.bio.symbol.Symbol;
032import org.biojava.bio.symbol.SymbolList;
033
034/**
035 * This class stores the result of an alignment procedure that creates a
036 * pairwise alignment of two sequences. Currently, these sequences must have the
037 * identical length (this may be changed in the future). A format routine
038 * produces a BLAST-like output for the sequences but all necessary information
039 * to visualize the alignment are contained in this class.
040 * 
041 * @author Andreas Dräger
042 * 
043 */
044public class AlignmentPair extends SimpleAlignment {
045
046        /**
047         * Generated serial version identifier
048         */
049        private static final long serialVersionUID = -8834131912021612261L;
050
051        /**
052         * 
053         * @param query
054         * @param subject
055         * @param algorithm
056         * @return
057         * @throws Exception
058         */
059        public static AlignmentPair align(Sequence query, Sequence subject,
060                        AlignmentAlgorithm algorithm) throws Exception {
061                AlignmentPair pair = algorithm.pairwiseAlignment(query, subject);
062                return pair;
063        }
064
065        /**
066         * Creates the Map required by the super class.
067         * 
068         * @param s1
069         * @param s2
070         * @return
071         */
072        private static Map<String, SymbolList> createHashMap(Sequence s1,
073                        Sequence s2) {
074                Map<String, SymbolList> m = new HashMap<String, SymbolList>();
075                m.put(s1.getName(), s1);
076                m.put(s2.getName(), s2);
077                return m;
078        }
079
080        /**
081         * Number of identical elements in both sequences
082         */
083        private int identicals;
084
085        /**
086         * Start position in the query
087         */
088        private int queryStart;
089
090        /**
091         * @return the queryStart
092         */
093        public int getQueryStart() {
094                return queryStart;
095        }
096
097        /**
098         * @param queryStart
099         *            the queryStart to set
100         * @throws BioException
101         */
102        void setQueryStart(int queryStart) throws BioException {
103                this.queryStart = queryStart;
104                init();
105        }
106
107        /**
108         * End position in the query
109         */
110        private int queryEnd;
111
112        /**
113         * @return the queryEnd
114         */
115        public int getQueryEnd() {
116                return queryEnd;
117        }
118
119        /**
120         * @param queryEnd
121         *            the queryEnd to set
122         * @throws BioException
123         */
124        void setQueryEnd(int queryEnd) throws BioException {
125                this.queryEnd = queryEnd;
126                init();
127        }
128
129        /**
130         * Number of gaps in query
131         */
132        private int nGapsQ;
133
134        /**
135         * Number of gaps in subject
136         */
137        private int nGapsS;
138
139        /**
140         * Length of the query sequence
141         */
142        private final Sequence query;
143
144        /**
145         * Start position in the subject
146         */
147        private int subjectStart;
148
149        /**
150         * @return the subjectStart
151         */
152        public int getSubjectStart() {
153                return subjectStart;
154        }
155
156        /**
157         * @param subjectStart
158         *            the subjectStart to set
159         * @throws BioException
160         */
161        void setSubjectStart(int subjectStart) throws BioException {
162                this.subjectStart = subjectStart;
163                init();
164        }
165
166        /**
167         * End position in the subject
168         */
169        private int subjectEnd;
170
171        /**
172         * @return the subjectEnd
173         */
174        public int getSubjectEnd() {
175                return subjectEnd;
176        }
177
178        /**
179         * @param subjectEnd
180         *            the subjectEnd to set
181         * @throws BioException
182         */
183        void setSubjectEnd(int subjectEnd) throws BioException {
184                this.subjectEnd = subjectEnd;
185                init();
186        }
187
188        /**
189         * Number of similar symbols
190         */
191        private int similars;
192
193        /**
194         * The subject sequence.
195         */
196        private final Sequence subject;
197
198        /**
199         * Reference to the underlying substitution matrix of this alignment.
200         */
201        private final SubstitutionMatrix subMatrix;
202
203        /**
204         * Time consumption to create this alignment.
205         */
206        private long time;
207
208        /**
209         * 
210         * @param queryStart
211         *            the start position in the query, where the alignment starts.
212         *            For example zero for normal Needleman-Wunsch-Alignments.
213         * @param queryEnd
214         *            the end position, that means the sequence coordinate, which is
215         *            the last symbol of the query sequence. Counting starts at
216         *            zero!
217         * @param queryLength
218         *            The length of the query sequence without gaps.
219         * @param subjectStart
220         *            These are all the same for the target. Have a look at these
221         *            above.
222         * @param subjectEnd
223         * @param subMatrix
224         *            the subsitution Matrix used for calculating the alignment
225         * @throws IllegalArgumentException
226         * @throws BioException
227         */
228        public AlignmentPair(Sequence query, Sequence subject, int queryStart,
229                        int queryEnd, int subjectStart, int subjectEnd,
230                        SubstitutionMatrix subMatrix) throws IllegalArgumentException,
231                        BioException {
232                super(createHashMap(query, subject));
233                this.subMatrix = subMatrix;
234                this.query = query;
235                this.subject = subject;
236                this.queryStart = queryStart;
237                this.queryEnd = queryEnd;
238                this.subjectStart = subjectStart;
239                this.subjectEnd = subjectEnd;
240                init();
241        }
242
243        /**
244         * 
245         * @param query
246         * @param subject
247         * @param subMatrix
248         * @throws IllegalArgumentException
249         * @throws BioException
250         */
251        public AlignmentPair(Sequence query, Sequence subject,
252                        SubstitutionMatrix subMatrix) throws IllegalArgumentException,
253                        BioException {
254                this(query, subject, 1, query.length(), 1, subject.length(), subMatrix);
255        }
256
257        /**
258         * 
259         * @throws BioException
260         */
261        private void init() throws BioException {
262                similars = 0;
263                identicals = 0;
264                nGapsQ = 0;
265                nGapsS = 0;
266                for (int i = 0; i < Math.min(queryEnd - queryStart, subjectEnd
267                                - subjectStart); i++) {
268                        Symbol a = query.symbolAt(i + queryStart);
269                        Symbol b = subject.symbolAt(i + subjectStart);
270                        boolean gap = false;
271                        if (a.equals(b)) {
272                                identicals++;
273                        }
274
275                        // get score for this pair. if it is positive, they are similar...
276                        if (a.equals(query.getAlphabet().getGapSymbol())) {
277                                nGapsQ++;
278                                gap = true;
279                        }
280                        if (b.equals(subject.getAlphabet().getGapSymbol())) {
281                                nGapsS++;
282                                gap = true;
283                        }
284                        if (!gap && subMatrix != null && subMatrix.getValueAt(a, b) > 0) {
285                                similars++;
286                        }
287                }
288        }
289
290        /**
291         * @return the time
292         */
293        public long getComputationTime() {
294                return time;
295        }
296
297        /**
298         * @return the ngapsq
299         */
300        public int getNumGapsInQuery() {
301                return nGapsQ;
302        }
303
304        /**
305         * @return the ngapst
306         */
307        public int getNumGapsInSubject() {
308                return nGapsS;
309        }
310
311        /**
312         * @return the identicals
313         */
314        public int getNumIdenticals() {
315                return identicals;
316        }
317
318        /**
319         * @return the similars
320         */
321        public int getNumSimilars() {
322                return similars;
323        }
324
325        /**
326         * 
327         * @return
328         */
329        public float getPercentIdentityQuery() {
330                return identicals / (float) (query.length() - nGapsQ) * 100;
331        }
332
333        /**
334         * 
335         * @return
336         */
337        public float getPercentIdentitySubject() {
338                return identicals / (float) (subject.length() - nGapsS) * 100;
339        }
340
341        /**
342         * 
343         * @return
344         */
345        public float getPercentSimilarityQuery() {
346                return similars / (float) query.length() * 100;
347        }
348
349        /**
350         * 
351         * @return
352         */
353        public float getPercentSimilaritySubject() {
354                return similars / (float) subject.length() * 100;
355        }
356
357        /**
358         * 
359         * @return
360         */
361        public int getQueryLength() {
362                return query.length();
363        }
364
365        /**
366         * 
367         * @return
368         */
369        public int getSubjectLength() {
370                return subject.length();
371        }
372
373        /**
374         * @return the subMatrix
375         */
376        public SubstitutionMatrix getSubstitutionMatrix() {
377                return subMatrix;
378        }
379
380        /**
381         * @param time
382         *            The time in milliseconds, which was needed to generate the
383         *            alignment.
384         */
385        void setComputationTime(long time) {
386                this.time = time;
387        }
388
389        /**
390         * 
391         * @return
392         * @throws BioException
393         */
394        public String formatOutput() throws BioException {
395                return formatOutput(60);
396        }
397
398        /**
399         * This method provides a BLAST-like formated alignment from the given
400         * <code>String</code>s, in which the sequence coordinates and the
401         * information "Query" or "Sbjct", respectively is added to each line. Each
402         * line contains <code>width</code> sequence characters including the gap
403         * symbols plus the meta information. There is one white line between two
404         * pairs of sequences.
405         * 
406         * @param width
407         *            the number of symbols to be displayed per line.
408         * @return formated String.
409         * @throws BioException
410         */
411        public String formatOutput(int width) throws BioException {
412                int i, j;
413                /*
414                 * Highlights equal symbols within the alignment, String match/missmatch
415                 * representation
416                 */
417                StringBuilder path = new StringBuilder();
418                for (i = 0; i < Math.min(queryEnd - queryStart, subjectEnd
419                                - subjectStart) + 1; i++) {
420                        Symbol a = query.symbolAt(i + queryStart);
421                        Symbol b = subject.symbolAt(i + subjectStart);
422                        if (!a.equals(query.getAlphabet().getGapSymbol())
423                                        && !b.equals(subject.getAlphabet().getGapSymbol())
424                                        && ((subMatrix.getValueAt(a, b) >= 0) || a.equals(b))) {
425                                path.append('|');
426                        } else {
427                                path.append(' ');
428                        }
429                }
430
431                int maxLength = path.length();
432                /*
433                 * Math.max(queryEnd - queryStart, subjectEnd - subjectStart) + 1;
434                 */
435                Formatter output = new Formatter();
436                output.format("%n Time (ms):  %s%n", time);
437                output.format(" Length:     %d%n", maxLength);
438                output.format("  Score:     %d%n", getScore());
439                output.format("  Query:     %s, Length: %d%n", query.getName(), query
440                                .length()
441                                - nGapsQ);
442                output.format("  Sbjct:     %s, Length: %d%n", subject.getName(),
443                                subject.length() - nGapsS);
444                output.format(
445                                " Identities: %d/%d, i.e., %d %% (query) and %d %% (sbjct)%n",
446                                identicals, maxLength, Math.round(getPercentIdentityQuery()),
447                                Math.round(getPercentIdentitySubject()));
448                output.format(
449                                " Similars:   %d/%d, i.e., %d %% (query) and %d %% (sbjct)%n",
450                                similars, maxLength, Math.round(getPercentSimilarityQuery()),
451                                Math.round(getPercentSimilaritySubject()));
452                output.format(
453                                " No. gaps:   %d (%d %%) in query and %d (%d %%) in sbjct%n",
454                                nGapsQ, Math.round(getPercentGapsQuery()), nGapsS, Math
455                                                .round(getPercentGapsTarget()));
456
457                int queryLPos = queryStart, queryRPos, pathLPos = 0, pathRPos;
458                int subjectLPos = subjectStart, subjectRPos;
459                int ql = queryLPos - 1, qr = queryLPos - 1, qgaps;
460                int sl = subjectLPos - 1, sr = subjectLPos - 1, sgaps;
461
462                int widthLeft = String.valueOf(Math.max(queryStart, queryEnd)).length();
463                int widthRight = String.valueOf(Math.max(queryEnd, subjectEnd))
464                                .length() + 1;
465
466                // Take width of the meta information into account.
467                width = Math.max(width - widthLeft - widthRight - 12, 2);
468
469                for (i = 1; i <= Math.ceil((double) maxLength / width); i++) {
470
471                        // Query
472                        queryRPos = Math.min(queryStart + i * width - 1, Math.min(queryEnd,
473                                        subjectEnd - subjectStart + queryStart));
474                        qgaps = 0;
475                        for (j = queryLPos; j <= queryRPos; j++) {
476                                if (!query.symbolAt(j).equals(
477                                                query.getAlphabet().getGapSymbol())) {
478                                        qr++;
479                                } else {
480                                        qgaps++;
481                                }
482                        }
483                        if (qgaps <= queryRPos - queryLPos) {
484                                ql++;
485                        }
486                        output.format("%nQuery:   %" + widthLeft + "d ", ql);
487                        output.format("%s ", query.subStr(queryLPos, queryRPos));
488                        output.format("%-" + widthRight + "d%n", qr);
489                        queryLPos = queryRPos + 1;
490                        ql = qr;
491
492                        // Path
493                        pathRPos = Math.min(i * width, path.length());
494                        output.format("%-" + (widthLeft + 10) + "c%s", Character
495                                        .valueOf(' '), path.substring(pathLPos, pathRPos));
496                        pathLPos = pathRPos;
497
498                        // Sbjct
499                        subjectRPos = Math.min(subjectStart + i * width - 1, Math.min(
500                                        queryEnd - queryStart + subjectStart, subjectEnd));
501                        sgaps = 0;
502                        for (j = subjectLPos; j <= subjectRPos; j++) {
503                                if (!subject.symbolAt(j).equals(
504                                                subject.getAlphabet().getGapSymbol())) {
505                                        sr++;
506                                } else {
507                                        sgaps++;
508                                }
509                        }
510                        if (sgaps <= subjectRPos - subjectLPos) {
511                                sl++;
512                        }
513                        output.format("%nSbjct:   %" + widthLeft + "d ", sl);
514                        output.format("%s ", subject.subStr(subjectLPos, subjectRPos));
515                        output.format("%-" + widthRight + "d%n", sr);
516                        subjectLPos = subjectRPos + 1;
517                        sl = sr;
518                }
519                return output.toString();
520        }
521
522        /**
523         * 
524         * @return
525         */
526        public float getPercentGapsTarget() {
527                return nGapsS / (float) subject.length() * 100;
528        }
529
530        /**
531         * 
532         * @return
533         */
534        public float getPercentGapsQuery() {
535                return nGapsQ / (float) query.length() * 100;
536        }
537
538
539    /**
540     * Return the query sequence as a gapped sequence.
541     *
542     * @return the query sequence as a gapped sequence
543     */
544    public GappedSequence getQuery() {
545        return (query instanceof GappedSequence) ? (GappedSequence) query : new SimpleGappedSequence(query);
546    }
547
548    /**
549     * Return the subject sequence as a gapped sequence.
550     *
551     * @return the subject sequence as a gapped sequence
552     */
553    public GappedSequence getSubject() {
554        return (subject instanceof GappedSequence) ? (GappedSequence) subject : new SimpleGappedSequence(subject);
555    }
556}