001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021package org.biojava.nbio.core.sequence.template;
022
023import org.biojava.nbio.core.sequence.compound.NucleotideCompound;
024import org.biojava.nbio.core.sequence.storage.ArrayListSequenceReader;
025import org.biojava.nbio.core.sequence.views.ComplementSequenceView;
026import org.biojava.nbio.core.sequence.views.ReversedSequenceView;
027import org.biojava.nbio.core.sequence.views.WindowedSequence;
028import org.biojava.nbio.core.util.CRC64Checksum;
029
030import java.io.IOException;
031import java.util.*;
032
033/**
034 * Provides a set of static methods to be used as static imports when needed
035 * across multiple Sequence implementations but inheritance gets in the way.
036 *
037 * It also provides a place to put utility methods whose application can
038 * be to a single class of Sequence e.g. {@link NucleotideCompound}
039 * {@link Sequence}; or to any Sequence e.g. looking for the
040 * {@link #getComposition(Sequence)} or {@link #getDistribution(Sequence)}
041 * for any type of Sequence.
042 *
043 * All of these methods assume that you can use the {@link Iterable} interface
044 * offered by the implementations of {@link Sequence} to provide all the
045 * compounds that implementation allows you to see. Since sequence should know
046 * nothing about its backing stores (apart from calling out to it) this should
047 * be true.
048 *
049 * @author ayates
050 */
051public class SequenceMixin {
052
053        /**
054         * For the given vargs of compounds this method counts the number of
055         * times those compounds appear in the given sequence
056         *
057         * @param sequence The {@link Sequence} to perform the count on
058         * @param compounds The compounds to look for
059         * @param <C> The type of compound we are looking for
060         * @return The number of times the given compounds appear in this Sequence
061         */
062        public static <C extends Compound> int countCompounds(
063                        Sequence<C> sequence, C... compounds) {
064                int count = 0;
065                Map<C, Integer> compositon = getComposition(sequence);
066                for (C compound : compounds) {
067                        if(compositon.containsKey(compound)) {
068                                count = compositon.get(compound) + count;
069                        }
070                }
071                return count;
072        }
073
074        /**
075         * Returns the count of GC in the given sequence
076         *
077         * @param sequence The {@link NucleotideCompound} {@link Sequence} to perform
078         * the GC analysis on
079         * @return The number of GC compounds in the sequence
080         */
081        public static int countGC(Sequence<NucleotideCompound> sequence) {
082                CompoundSet<NucleotideCompound> cs = sequence.getCompoundSet();
083                NucleotideCompound G = cs.getCompoundForString("G");
084                NucleotideCompound C = cs.getCompoundForString("C");
085                NucleotideCompound g = cs.getCompoundForString("g");
086                NucleotideCompound c = cs.getCompoundForString("c");
087                return countCompounds(sequence, G, C, g, c);
088        }
089
090        /**
091         * Returns the count of AT in the given sequence
092         *
093         * @param sequence The {@link NucleotideCompound} {@link Sequence} to perform
094         * the AT analysis on
095         * @return The number of AT compounds in the sequence
096         */
097        public static int countAT(Sequence<NucleotideCompound> sequence) {
098                CompoundSet<NucleotideCompound> cs = sequence.getCompoundSet();
099                NucleotideCompound A = cs.getCompoundForString("A");
100                NucleotideCompound T = cs.getCompoundForString("T");
101                NucleotideCompound a = cs.getCompoundForString("a");
102                NucleotideCompound t = cs.getCompoundForString("t");
103                return countCompounds(sequence, A, T, a, t);
104        }
105
106        /**
107         * Analogous to {@link #getComposition(Sequence)} but returns the
108         * distribution of that {@link Compound} over the given sequence.
109         *
110         * @param <C> The type of compound to look for
111         * @param sequence The type of sequence to look over
112         * @return Returns the decimal fraction of the compounds in the given
113         * sequence. Any compound not in the Map will return a fraction of 0.
114         */
115        public static <C extends Compound> Map<C, Double> getDistribution(Sequence<C> sequence) {
116                Map<C, Double> results = new HashMap<>();
117                Map<C, Integer> composition = getComposition(sequence);
118                double length = sequence.getLength();
119                for (Map.Entry<C, Integer> entry : composition.entrySet()) {
120                        double dist = entry.getValue().doubleValue() / length;
121                        results.put(entry.getKey(), dist);
122                }
123                return results;
124        }
125
126        /**
127         * Does a linear scan over the given Sequence and records the number of
128         * times each base appears. The returned map will return 0 if a compound
129         * is asked for and the Map has no record of it.
130         *
131         * @param <C> The type of compound to look for
132         * @param sequence The type of sequence to look over
133         * @return Counts for the instances of all compounds in the sequence
134         */
135        public static <C extends Compound> Map<C, Integer> getComposition(Sequence<C> sequence) {
136                Map<C, Integer> results = new HashMap<>();
137
138                for (C currentCompound : sequence) {
139                        Integer currentInteger = results.get(currentCompound);
140                        if ( currentInteger == null)
141                                currentInteger = 0;
142                        currentInteger++;
143                        results.put(currentCompound, currentInteger);
144                }
145                return results;
146        }
147
148        /**
149         * Used as a way of sending a Sequence to a writer without the cost of
150         * converting to a full length String and then writing the data out
151         *
152         * @param <C> Type of compound
153         * @param appendable The writer to send data to
154         * @param sequence The sequence to write out
155         * @throws IOException Thrown if we encounter a problem
156         */
157        public static <C extends Compound> void write(Appendable appendable, Sequence<C> sequence) throws IOException {
158                for(C compound: sequence) {
159                        appendable.append(compound.toString());
160                }
161        }
162
163        /**
164         * For the given Sequence this will return a {@link StringBuilder} object
165         * filled with the results of {@link Compound#toString()}. Does not
166         * used {@link #write(java.lang.Appendable, org.biojava.nbio.core.sequence.template.Sequence) }
167         * because of its {@link IOException} signature.
168         */
169        public static <C extends Compound> StringBuilder toStringBuilder(Sequence<C> sequence) {
170                StringBuilder sb = new StringBuilder(sequence.getLength());
171                for (C compound : sequence) {
172                        sb.append(compound.toString());
173                }
174                return sb;
175        }
176
177        /**
178         * Shortcut to {@link #toStringBuilder(org.biojava.nbio.core.sequence.template.Sequence)}
179         * which calls toString() on the resulting object.
180         */
181        public static <C extends Compound> String toString(Sequence<C> sequence) {
182                return toStringBuilder(sequence).toString();
183        }
184
185        /**
186         * For the given {@link Sequence} this will return a {@link List} filled with
187         * the Compounds of that {@link Sequence}.
188         */
189        public static <C extends Compound> List<C> toList(Sequence<C> sequence) {
190                List<C> list = new ArrayList<>(sequence.getLength());
191                for (C compound : sequence) {
192                        list.add(compound);
193                }
194                return list;
195        }
196
197        /**
198         * Performs a linear search of the given Sequence for the given compound.
199         * Once we find the compound we return the position.
200         */
201        public static <C extends Compound> int indexOf(Sequence<C> sequence,
202                        C compound) {
203                int index = 1;
204                for (C currentCompound : sequence) {
205                        if (currentCompound.equals(compound)) {
206                                return index;
207                        }
208                        index++;
209                }
210                return 0;
211        }
212
213        /**
214         * Performs a reversed linear search of the given Sequence by wrapping
215         * it in a {@link ReversedSequenceView} and passing it into
216         * {@link #indexOf(Sequence, Compound)}. We then inverse the index coming
217         * out of it.
218         */
219        public static <C extends Compound> int lastIndexOf(Sequence<C> sequence,
220                        C compound) {
221                int index = indexOf(new ReversedSequenceView<C>(sequence), compound);
222                return (sequence.getLength() - index)+1;
223        }
224
225        /**
226         * Creates a simple sequence iterator which moves through a sequence going
227         * from 1 to the length of the Sequence. Modification of the Sequence is not
228         * allowed.
229         */
230        public static <C extends Compound> Iterator<C> createIterator(
231                        Sequence<C> sequence) {
232                return new SequenceIterator<>(sequence);
233        }
234
235        /**
236         * Creates a simple sub sequence view delimited by the given start and end.
237         */
238        public static <C extends Compound> SequenceView<C> createSubSequence(
239                        Sequence<C> sequence, int start, int end) {
240                return new SequenceProxyView<>(sequence, start, end);
241        }
242
243        /**
244         * Implements sequence shuffling by first materializing the given
245         * {@link Sequence} into a {@link List}, applying
246         * {@link Collections#shuffle(List)} and then returning the shuffled
247         * elements in a new instance of {@link Sequence} which behaves
248         * as a {@link Sequence}.
249         */
250        public static <C extends Compound> Sequence<C> shuffle(Sequence<C> sequence) {
251                List<C> compounds = sequence.getAsList();
252                Collections.shuffle(compounds);
253                return new ArrayListSequenceReader<>(compounds,
254                                sequence.getCompoundSet());
255        }
256
257        /**
258         * Performs a simple CRC64 checksum on any given sequence.
259         */
260        public static <C extends Compound> String checksum(Sequence<C> sequence) {
261                CRC64Checksum checksum = new CRC64Checksum();
262                for (C compound : sequence) {
263                        checksum.update(compound.getShortName());
264                }
265                return checksum.toString();
266        }
267
268        /**
269         * Produces kmers of the specified size e.g. ATGTGA returns two views which
270         * have ATG TGA
271         *
272         * @param <C> Compound to use
273         * @param sequence Sequence to build from
274         * @param kmer Kmer size
275         * @return The list of non-overlapping K-mers
276         */
277        public static <C extends Compound> List<SequenceView<C>> nonOverlappingKmers(Sequence<C> sequence, int kmer) {
278                List<SequenceView<C>> l = new ArrayList<>();
279                WindowedSequence<C> w = new WindowedSequence<>(sequence, kmer);
280                for(SequenceView<C> view: w) {
281                        l.add(view);
282                }
283                return l;
284        }
285
286        /**
287         * Used to generate overlapping k-mers such i.e. ATGTA will give rise to
288         * ATG, TGT &amp; GTA
289         *
290         * @param <C> Compound to use
291         * @param sequence Sequence to build from
292         * @param kmer Kmer size
293         * @return The list of overlapping K-mers
294         */
295        public static <C extends Compound> List<SequenceView<C>> overlappingKmers(Sequence<C> sequence, int kmer) {
296                List<SequenceView<C>> l = new ArrayList<>();
297                List<Iterator<SequenceView<C>>> windows
298                                = new ArrayList<>();
299
300                for(int i=1; i<=kmer; i++) {
301                        if(i == 1) {
302                                windows.add(new WindowedSequence<C>(sequence, kmer).iterator());
303                        }
304                        else {
305                                SequenceView<C> sv = sequence.getSubSequence(i, sequence.getLength());
306                                windows.add(new WindowedSequence<C>(sv, kmer).iterator());
307                        }
308                }
309
310                OUTER: while(true) {
311                        for(int i=0; i<kmer; i++) {
312                                Iterator<SequenceView<C>> iterator = windows.get(i);
313                                boolean breakLoop=true;
314                                if(iterator.hasNext()) {
315                                        l.add(iterator.next());
316                                        breakLoop = false;
317                                }
318                                if(breakLoop) {
319                                        break OUTER;
320                                }
321                        }
322                }
323                return l;
324        }
325
326        /**
327         * A method which attempts to do the right thing when is comes to a
328         * reverse/reverse complement
329         *
330         * @param <C> The type of compound
331         * @param sequence The input sequence
332         * @return The inverted sequence which is optionally complemented
333         */
334        @SuppressWarnings({ "unchecked" })
335        public static <C extends Compound> SequenceView<C> inverse(Sequence<C> sequence) {
336                SequenceView<C> reverse = new ReversedSequenceView<>(sequence);
337                if(sequence.getCompoundSet().isComplementable()) {
338                        return new ComplementSequenceView(reverse);
339                }
340                return reverse;
341        }
342
343        /**
344         * A case-insensitive manner of comparing two sequence objects together.
345         * We will throw out any compounds which fail to match on their sequence
346         * length &amp; compound sets used. The code will also bail out the moment
347         * we find something is wrong with a Sequence. Cost to run is linear to
348         * the length of the Sequence.
349         *
350         * @param <C> The type of compound
351         * @param source Source sequence to assess
352         * @param target Target sequence to assess
353         * @return Boolean indicating if the sequences matched ignoring case
354         */
355        public static <C extends Compound> boolean sequenceEqualityIgnoreCase(Sequence<C> source, Sequence<C> target) {
356                return baseSequenceEquality(source, target, true);
357        }
358
359        /**
360         * A case-sensitive manner of comparing two sequence objects together.
361         * We will throw out any compounds which fail to match on their sequence
362         * length &amp; compound sets used. The code will also bail out the moment
363         * we find something is wrong with a Sequence. Cost to run is linear to
364         * the length of the Sequence.
365         *
366         * @param <C> The type of compound
367         * @param source Source sequence to assess
368         * @param target Target sequence to assess
369         * @return Boolean indicating if the sequences matched
370         */
371        public static <C extends Compound> boolean sequenceEquality(Sequence<C> source, Sequence<C> target) {
372                return baseSequenceEquality(source, target, false);
373        }
374
375        private static <C extends Compound> boolean baseSequenceEquality(Sequence<C> source, Sequence<C> target, boolean ignoreCase) {
376                boolean equal = true;
377                if(
378                                source.getLength() == target.getLength() &&
379                                source.getCompoundSet().equals(target.getCompoundSet())) {
380                        Iterator<C> sIter = source.iterator();
381                        Iterator<C> tIter = target.iterator();
382                        while(sIter.hasNext()) {
383                                C s = sIter.next();
384                                C t = tIter.next();
385                                boolean cEqual = (ignoreCase) ? s.equalsIgnoreCase(t) : s.equals(t);
386                                if(!cEqual) {
387                                        equal = false;
388                                        break;
389                                }
390                        }
391                }
392                else {
393                        equal = false;
394                }
395                return equal;
396        }
397
398        /**
399         * A basic sequence iterator which iterates over the given Sequence by
400         * biological index. This assumes your sequence supports random access
401         * and performs well when doing these operations.
402         *
403         * @author ayates
404         *
405         * @param <C> Type of compound to return
406         */
407        public static class SequenceIterator<C extends Compound>
408                        implements Iterator<C> {
409
410                private final Sequence<C> sequence;
411                private final int length;
412                private int currentPosition = 0;
413
414                public SequenceIterator(Sequence<C> sequence) {
415                        this.sequence = sequence;
416                        this.length = sequence.getLength();
417                }
418
419
420                @Override
421                public boolean hasNext() {
422                        return (currentPosition < length);
423                }
424
425
426                @Override
427                public C next() {
428                        if(!hasNext()) {
429                                throw new NoSuchElementException("Exhausted sequence of elements");
430                        }
431                        return sequence.getCompoundAt(++currentPosition);
432                }
433
434                @Override
435                public void remove() {
436                        throw new UnsupportedOperationException("Cannot remove() on a SequenceIterator");
437                }
438        }
439}