001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 * Created on June 14, 2010
021 * Author: Mark Chapman
022 */
023
024package org.biojava.nbio.core.alignment;
025
026import org.biojava.nbio.core.alignment.matrices.SubstitutionMatrixHelper;
027import org.biojava.nbio.core.alignment.template.AlignedSequence;
028import org.biojava.nbio.core.alignment.template.AlignedSequence.Step;
029import org.biojava.nbio.core.alignment.template.Profile;
030import org.biojava.nbio.core.alignment.template.ProfileView;
031import org.biojava.nbio.core.alignment.template.SubstitutionMatrix;
032import org.biojava.nbio.core.sequence.AccessionID;
033import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
034import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet;
035import org.biojava.nbio.core.sequence.io.util.IOUtils;
036import org.biojava.nbio.core.sequence.location.template.Location;
037import org.biojava.nbio.core.sequence.template.Compound;
038import org.biojava.nbio.core.sequence.template.CompoundSet;
039import org.biojava.nbio.core.sequence.template.Sequence;
040
041import java.io.Serializable;
042import java.util.*;
043
044
045/**
046 * Implements a data structure for the results of sequence alignment.  Every {@link List} returned is unmodifiable.
047 *
048 * @author Mark Chapman
049 * @author Paolo Pavan
050 * @param <S> each element of the alignment {@link Profile} is of type S
051 * @param <C> each element of an {@link AlignedSequence} is a {@link Compound} of type C
052 */
053public class SimpleProfile<S extends Sequence<C>, C extends Compound> implements Serializable, Profile<S, C> {
054
055
056        private static final long serialVersionUID = 1L;
057
058        private List<AlignedSequence<S, C>> list;
059        private List<S> originals;
060        private int length;
061
062        /**
063         * Creates a pair profile for the given already aligned sequences.
064         *
065         * @param query the first sequence of the pair
066         * @param target the second sequence of the pair
067         * @throws IllegalArgumentException if sequences differ in size
068         */
069        protected SimpleProfile(AlignedSequence<S, C> query, AlignedSequence<S, C> target) {
070                if (query.getLength() != target.getLength()) {
071                        throw new IllegalArgumentException("Aligned sequences differ in size");
072                }
073                list = new ArrayList<>();
074                list.add(query);
075                list.add(target);
076                list = Collections.unmodifiableList(list);
077                originals = new ArrayList<>();
078                originals.add(query.getOriginalSequence());
079                originals.add(target.getOriginalSequence());
080                originals = Collections.unmodifiableList(originals);
081                length = query.getLength();
082        }
083
084        /**
085         * Creates a profile from a single sequence.
086         *
087         * @param sequence sequence to seed profile
088         */
089        public SimpleProfile(S sequence) {
090                List<Step> s = new ArrayList<>();
091                for (int i = 0; i < sequence.getLength(); i++) {
092                        s.add(Step.COMPOUND);
093                }
094                list = new ArrayList<>();
095                list.add(new SimpleAlignedSequence<S, C>(sequence, s));
096                list = Collections.unmodifiableList(list);
097                originals = new ArrayList<>();
098                originals.add(sequence);
099                originals = Collections.unmodifiableList(originals);
100                length = sequence.getLength();
101        }
102
103        /**
104         * Creates a pair profile for the given sequences.
105         *
106         * @param query the first sequence of the pair
107         * @param target the second sequence of the pair
108         * @param sx lists whether the query sequence aligns a {@link Compound} or gap at each index of the alignment
109         * @param xb number of {@link Compound}s skipped in the query sequence before the aligned region
110         * @param xa number of {@link Compound}s skipped in the query sequence after the aligned region
111         * @param sy lists whether the target sequence aligns a {@link Compound} or gap at each index of the alignment
112         * @param yb number of {@link Compound}s skipped in the target sequence before the aligned region
113         * @param ya number of {@link Compound}s skipped in the target sequence after the aligned region
114         * @throws IllegalArgumentException if alignments differ in size or given sequences do not fit in alignments
115         */
116        protected SimpleProfile(S query, S target, List<Step> sx, int xb, int xa, List<Step> sy, int yb, int ya) {
117                if (sx.size() != sy.size()) {
118                        throw new IllegalArgumentException("Alignments differ in size");
119                }
120                list = new ArrayList<>();
121                list.add(new SimpleAlignedSequence<S, C>(query, sx, xb, xa));
122                list.add(new SimpleAlignedSequence<S, C>(target, sy, yb, ya));
123                list = Collections.unmodifiableList(list);
124                originals = new ArrayList<>();
125                originals.add(query);
126                originals.add(target);
127                originals = Collections.unmodifiableList(originals);
128                length = sx.size();
129        }
130
131        /**
132         * Creates a pair profile for the given profiles.
133         *
134         * @param query the first profile of the pair
135         * @param target the second profile of the pair
136         * @param sx lists whether the query profile aligns a {@link Compound} or gap at each index of the alignment
137         * @param sy lists whether the target profile aligns a {@link Compound} or gap at each index of the alignment
138         * @throws IllegalArgumentException if alignments differ in size or given profiles do not fit in alignments
139         */
140        protected SimpleProfile(Profile<S, C> query, Profile<S, C> target, List<Step> sx, List<Step> sy) {
141                if (sx.size() != sy.size()) {
142                        throw new IllegalArgumentException("Alignments differ in size");
143                }
144                list = new ArrayList<>();
145                for (AlignedSequence<S, C> s : query) {
146                        list.add(new SimpleAlignedSequence<S, C>(s, sx));
147                }
148                for (AlignedSequence<S, C> s : target) {
149                        list.add(new SimpleAlignedSequence<S, C>(s, sy));
150                }
151                list = Collections.unmodifiableList(list);
152                originals = new ArrayList<>();
153                originals.addAll(query.getOriginalSequences());
154                originals.addAll(target.getOriginalSequences());
155                originals = Collections.unmodifiableList(originals);
156                length = sx.size();
157        }
158
159         /**
160         * Creates a profile for the already aligned sequences.
161         * @param alignedSequences the already aligned sequences
162         * @throws IllegalArgumentException if aligned sequences differ in length or
163         * collection is empty.
164         */
165        public SimpleProfile(Collection<AlignedSequence<S,C>> alignedSequences) {
166                list = new ArrayList<>();
167                originals = new ArrayList<>();
168
169                Iterator<AlignedSequence<S,C>> itr = alignedSequences.iterator();
170                if(!itr.hasNext()) {
171                        throw new IllegalArgumentException("alignedSequences must not be empty");
172                }
173
174                AlignedSequence<S, C> curAlignedSeq = itr.next();
175                length = curAlignedSeq.getLength();
176                list.add(curAlignedSeq);
177                originals.add(curAlignedSeq.getOriginalSequence());
178
179                while (itr.hasNext()) {
180                        curAlignedSeq = itr.next();
181                        if (curAlignedSeq.getLength() != length) {
182                                throw new IllegalArgumentException("Aligned sequences differ in size");
183                        }
184                        list.add(curAlignedSeq);
185                        originals.add(curAlignedSeq.getOriginalSequence());
186                }
187                list = Collections.unmodifiableList(list);
188                originals = Collections.unmodifiableList(originals);
189        }
190
191
192        // methods for Profile
193
194        @Override
195        public AlignedSequence<S, C> getAlignedSequence(int listIndex) {
196                return list.get(listIndex - 1);
197        }
198
199        @Override
200        public AlignedSequence<S, C> getAlignedSequence(S sequence) {
201                for (AlignedSequence<S, C> s : list) {
202                        if (s.equals(sequence) || s.getOriginalSequence().equals(sequence)) {
203                                return s;
204                        }
205                }
206                return null;
207        }
208
209        @Override
210        public List<AlignedSequence<S, C>> getAlignedSequences() {
211                return list;
212        }
213
214        @Override
215        public List<AlignedSequence<S, C>> getAlignedSequences(int... listIndices) {
216                List<AlignedSequence<S, C>> tempList = new ArrayList<>();
217                for (int i : listIndices) {
218                        tempList.add(getAlignedSequence(i));
219                }
220                return Collections.unmodifiableList(tempList);
221        }
222
223        @Override
224        public List<AlignedSequence<S, C>> getAlignedSequences(S... sequences) {
225                List<AlignedSequence<S, C>> tempList = new ArrayList<>();
226                for (S s : sequences) {
227                        tempList.add(getAlignedSequence(s));
228                }
229                return Collections.unmodifiableList(tempList);
230        }
231
232        @Override
233        public C getCompoundAt(int listIndex, int alignmentIndex) {
234                return getAlignedSequence(listIndex).getCompoundAt(alignmentIndex);
235        }
236
237        @Override
238        public C getCompoundAt(S sequence, int alignmentIndex) {
239                AlignedSequence<S, C> s = getAlignedSequence(sequence);
240                return (s == null) ? null : s.getCompoundAt(alignmentIndex);
241        }
242
243        @Override
244        public int[] getCompoundCountsAt(int alignmentIndex) {
245                return getCompoundCountsAt(alignmentIndex, getCompoundSet().getAllCompounds());
246        }
247
248        @Override
249        public int[] getCompoundCountsAt(int alignmentIndex, List<C> compounds) {
250                int[] counts = new int[compounds.size()];
251                C gap = getCompoundSet().getCompoundForString("-");
252                int igap = compounds.indexOf(gap);
253                for (C compound : getCompoundsAt(alignmentIndex)) {
254                        int i = compounds.indexOf(compound);
255                        if (i >= 0 && i != igap && !getCompoundSet().compoundsEquivalent(compound, gap)) {
256                                counts[i]++;
257                        }
258                }
259                return counts;
260        }
261
262        @Override
263        public List<C> getCompoundsAt(int alignmentIndex) {
264                // TODO handle circular alignments
265                List<C> column = new ArrayList<>();
266                for (AlignedSequence<S, C> s : list) {
267                        column.add(s.getCompoundAt(alignmentIndex));
268                }
269                return Collections.unmodifiableList(column);
270        }
271
272        @Override
273        public CompoundSet<C> getCompoundSet() {
274                return list.get(0).getCompoundSet();
275        }
276
277        @Override
278        public float[] getCompoundWeightsAt(int alignmentIndex) {
279                return getCompoundWeightsAt(alignmentIndex, getCompoundSet().getAllCompounds());
280        }
281
282        @Override
283        public float[] getCompoundWeightsAt(int alignmentIndex, List<C> compounds) {
284                float[] weights = new float[compounds.size()];
285                int[] counts = getCompoundCountsAt(alignmentIndex, compounds);
286                float total = 0.0f;
287                for (int i : counts) {
288                        total += i;
289                }
290                if (total > 0.0f) {
291                        for (int i = 0; i < weights.length; i++) {
292                                weights[i] = counts[i]/total;
293                        }
294                }
295                return weights;
296        }
297
298        @Override
299        public int getIndexOf(C compound) {
300                for (int i = 1; i <= length; i++) {
301                        if (getCompoundsAt(i).contains(compound)) {
302                                return i;
303                        }
304                }
305                return -1;
306        }
307
308        @Override
309        public int[] getIndicesAt(int alignmentIndex) {
310                int[] indices = new int[list.size()];
311                for (int i = 0; i < indices.length; i++) {
312                        indices[i] = list.get(i).getSequenceIndexAt(alignmentIndex);
313                }
314                return indices;
315        }
316
317        @Override
318        public int getLastIndexOf(C compound) {
319                for (int i = length; i >= 1; i--) {
320                        if (getCompoundsAt(i).contains(compound)) {
321                                return i;
322                        }
323                }
324                return -1;
325        }
326
327        @Override
328        public int getLength() {
329                return length;
330        }
331
332        @Override
333        public List<S> getOriginalSequences() {
334                return originals;
335        }
336
337        @Override
338        public int getSize() {
339                int size = 0;
340                for (AlignedSequence<S, C> s : list) {
341                        size += s.getOverlapCount();
342                }
343                return size;
344        }
345
346        @Override
347        public ProfileView<S, C> getSubProfile(Location location) {
348                // TODO ProfileView<S, C> getSubProfile(Location)
349                return null;
350        }
351
352        @Override
353        public boolean hasGap(int alignmentIndex) {
354                C gap = getCompoundSet().getCompoundForString("-");
355                for (C compound : getCompoundsAt(alignmentIndex)) {
356                        if (getCompoundSet().compoundsEquivalent(compound, gap)) {
357                                return true;
358                        }
359                }
360                return false;
361        }
362
363        @Override
364        public boolean isCircular() {
365                for (AlignedSequence<S, C> s : list) {
366                        if (s.isCircular()) {
367                                return true;
368                        }
369                }
370                return false;
371        }
372
373        @Override
374        public String toString(int width) {
375                return toString(width, null, IOUtils.getIDFormat(list), true, true, true, true, true, false);
376        }
377
378        @Override
379        public String toString(StringFormat format) {
380                switch (format) {
381                case ALN:
382                case CLUSTALW:
383                default:
384                        return toString(60, String.format("CLUSTAL W MSA from BioJava%n%n"), IOUtils.getIDFormat(list) + "   ",
385                                        false, true, true, false, true, false);
386                case FASTA:
387                        return toString(60, null, ">%s%n", false, false, false, false, false, false);
388                case GCG:
389                case MSF:
390                        return toString(50, IOUtils.getGCGHeader(list), IOUtils.getIDFormat(list), false, false, true, false,
391                                        false, false);
392                case PDBWEB:
393                        return toString(60, null, "%10s", true, true, true, false, true, true);
394                }
395        }
396
397        // method from Object
398
399        @Override
400        public String toString() {
401                return toString(getLength(), null, null, false, false, false, false, false, false);
402        }
403
404        // method for Iterable
405
406        @Override
407        public Iterator<AlignedSequence<S, C>> iterator() {
408                return list.iterator();
409        }
410
411        // helper methods
412
413        // creates formatted String
414        private String toString(int width, String header, String idFormat, boolean seqIndexPre, boolean seqIndexPost,
415                        boolean interlaced, boolean aligIndices, boolean aligConservation, boolean webDisplay) {
416
417                // TODO handle circular alignments
418                StringBuilder s = (header == null) ? new StringBuilder() : new StringBuilder(header);
419
420                if ( webDisplay && list.size() == 2){
421                        s.append("<div><pre>");
422                }
423
424                width = Math.max(1, width);
425                int seqIndexPad = (int) (Math.floor(Math.log10(getLength())) + 2);
426                String seqIndexFormatPre = "%" + seqIndexPad + "d ", seqIndexFormatPost = "%" + seqIndexPad + "d";
427                if (interlaced) {
428                        String aligIndFormat = "%-" + Math.max(1, width / 2) + "d %" + Math.max(1, width - (width / 2) - 1) +
429                        "d%n";
430                        for (int i = 0; i < getLength(); i += width) {
431                                int start = i + 1, end = Math.min(getLength(), i + width);
432                                if (i > 0) {
433                                        s.append(String.format("%n"));
434                                }
435                                if (aligIndices) {
436                                        if (end < i + width) {
437                                                int line = end - start + 1;
438                                                aligIndFormat = "%-" + Math.max(1, line / 2) + "d %" + Math.max(1, line - (line / 2) - 1) +
439                                                "d%n";
440                                        }
441                                        if (idFormat != null) {
442                                                s.append(String.format(idFormat, ""));
443                                        }
444                                        if (seqIndexPre) {
445                                                s.append(String.format("%" + (seqIndexPad + 1) + "s", ""));
446                                        }
447                                        s.append(String.format(aligIndFormat, start, end));
448                                }
449
450                                int counter = 0;
451                                for (AlignedSequence<S, C> as : list) {
452                                        counter++;
453
454                                        if ( webDisplay && list.size() == 2){
455                                                printSequenceAlignmentWeb(s, counter, idFormat, seqIndexPre, seqIndexFormatPre, seqIndexPost,
456                                                                seqIndexFormatPost, start, end);
457                                        } else {
458                                                if (idFormat != null) {
459                                                        s.append(String.format(idFormat, as.getAccession()));
460                                                }
461                                                if (seqIndexPre) {
462                                                        s.append(String.format(seqIndexFormatPre, as.getSequenceIndexAt(start)));
463                                                }
464
465                                                s.append(as.getSubSequence(start, end).getSequenceAsString());
466
467                                                if (seqIndexPost) {
468                                                        s.append(String.format(seqIndexFormatPost, as.getSequenceIndexAt(end)));
469                                                }
470                                                s.append(String.format("%n"));
471                                        }
472
473                                        if (aligConservation && list.size() == 2 && counter == 1) {
474                                                printConservation(s, idFormat, seqIndexPad, seqIndexPre, start, end, webDisplay);
475                                        }
476                                }
477
478                        }
479                } else {
480                        for (AlignedSequence<S, C> as : list) {
481                                if (idFormat != null) {
482                                        s.append(String.format(idFormat, as.getAccession()));
483                                }
484                                for (int i = 0; i < getLength(); i += width) {
485                                        int start = i + 1, end = Math.min(getLength(), i + width);
486                                        if (seqIndexPre) {
487                                                s.append(String.format(seqIndexFormatPre, as.getSequenceIndexAt(start)));
488                                        }
489                                        s.append(as.getSubSequence(start, end).getSequenceAsString());
490                                        if (seqIndexPost) {
491                                                s.append(String.format(seqIndexFormatPost, as.getSequenceIndexAt(end)));
492                                        }
493                                        s.append(String.format("%n"));
494                                }
495                        }
496                }
497
498                if (webDisplay && aligConservation && list.size() == 2) {
499                        s.append(IOUtils.getPDBLegend());
500                }
501                return s.toString();
502        }
503
504        private void printSequenceAlignmentWeb(StringBuilder s, int counter, String idFormat, boolean seqIndexPre,
505                        String seqIndexFormatPre, boolean seqIndexPost, String seqIndexFormatPost, int start, int end) {
506                AlignedSequence<S,C> as1 = list.get(0), as2 = list.get(1), as = list.get(counter - 1);
507
508                if (idFormat != null) {
509                        s.append(String.format(idFormat, as.getAccession()));
510                }
511                if (seqIndexPre) {
512                        s.append(String.format(seqIndexFormatPre, as.getSequenceIndexAt(start)));
513                }
514
515                String mySeq = as.getSubSequence(start, end).getSequenceAsString();
516                String s1 = as1.getSubSequence(start, end).getSequenceAsString();
517                String s2 = as2.getSubSequence(start, end).getSequenceAsString();
518
519                for (int i = 0; i < s1.length(); i++) {
520                        if (i >= s2.length() || i >= mySeq.length())
521                                break;
522
523                        char c1 = s1.charAt(i);
524                        char c2 = s2.charAt(i);
525                        char c = mySeq.charAt(i);
526                        s.append(IOUtils.getPDBCharacter(true, c1, c2, isSimilar(c1, c2), c));
527                }
528
529                if (seqIndexPost) {
530                        s.append(String.format(seqIndexFormatPost, as.getSequenceIndexAt(end)));
531                }
532                s.append(String.format("%n"));
533
534        }
535
536        private void printConservation(StringBuilder s, String idFormat, int seqIndexPad, boolean seqIndexPre, int start,
537                        int end, boolean webDisplay) {
538                AlignedSequence<S,C> as1 = list.get(0), as2 = list.get(1);
539
540                if (idFormat != null) {
541                        AccessionID ac1 = as1.getAccession();
542                        String id1 = (ac1 == null) ? "null" : ac1.getID();
543                        id1 = id1.replaceAll(".", " ");
544                        s.append(String.format(idFormat, id1));
545                }
546
547                if (seqIndexPre) {
548                        s.append(String.format("%" + (seqIndexPad + 1) + "s", ""));
549                }
550
551                String subseq1 = as1.getSubSequence(start, end).getSequenceAsString();
552                String subseq2 = as2.getSubSequence(start, end).getSequenceAsString();
553
554                for ( int ii =0 ; ii < subseq1.length() ; ii++){
555                        if ( ii >= subseq2.length())
556                                break;
557                        char c1 = subseq1.charAt(ii);
558                        char c2 = subseq2.charAt(ii);
559                        s.append(IOUtils.getPDBConservation(webDisplay, c1, c2, isSimilar(c1, c2)));
560                }
561
562                s.append(String.format("%n"));
563        }
564
565        protected static final SubstitutionMatrix<AminoAcidCompound> matrix = SubstitutionMatrixHelper.getBlosum65();
566
567        private boolean isSimilar(char c1, char c2) {
568                AminoAcidCompoundSet set = AminoAcidCompoundSet.getAminoAcidCompoundSet();
569
570                AminoAcidCompound aa1 = set.getCompoundForString(String.valueOf(c1));
571                AminoAcidCompound aa2 = set.getCompoundForString(String.valueOf(c2));
572
573                short val = matrix.getValue(aa1,aa2);
574                return val > 0;
575        }
576
577}