001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021package org.biojava.nbio.core.sequence.template; 022 023import org.biojava.nbio.core.sequence.compound.NucleotideCompound; 024import org.biojava.nbio.core.sequence.storage.ArrayListSequenceReader; 025import org.biojava.nbio.core.sequence.views.ComplementSequenceView; 026import org.biojava.nbio.core.sequence.views.ReversedSequenceView; 027import org.biojava.nbio.core.sequence.views.WindowedSequence; 028import org.biojava.nbio.core.util.CRC64Checksum; 029 030import java.io.IOException; 031import java.util.*; 032 033/** 034 * Provides a set of static methods to be used as static imports when needed 035 * across multiple Sequence implementations but inheritance gets in the way. 036 * 037 * It also provides a place to put utility methods whose application can 038 * be to a single class of Sequence e.g. {@link NucleotideCompound} 039 * {@link Sequence}; or to any Sequence e.g. looking for the 040 * {@link #getComposition(Sequence)} or {@link #getDistribution(Sequence)} 041 * for any type of Sequence. 042 * 043 * All of these methods assume that you can use the {@link Iterable} interface 044 * offered by the implementations of {@link Sequence} to provide all the 045 * compounds that implementation allows you to see. Since sequence should know 046 * nothing about its backing stores (apart from calling out to it) this should 047 * be true. 048 * 049 * @author ayates 050 */ 051public class SequenceMixin { 052 053 /** 054 * For the given vargs of compounds this method counts the number of 055 * times those compounds appear in the given sequence 056 * 057 * @param sequence The {@link Sequence} to perform the count on 058 * @param compounds The compounds to look for 059 * @param <C> The type of compound we are looking for 060 * @return The number of times the given compounds appear in this Sequence 061 */ 062 public static <C extends Compound> int countCompounds( 063 Sequence<C> sequence, C... compounds) { 064 int count = 0; 065 Map<C, Integer> compositon = getComposition(sequence); 066 for (C compound : compounds) { 067 if(compositon.containsKey(compound)) { 068 count = compositon.get(compound) + count; 069 } 070 } 071 return count; 072 } 073 074 /** 075 * Returns the count of GC in the given sequence 076 * 077 * @param sequence The {@link NucleotideCompound} {@link Sequence} to perform 078 * the GC analysis on 079 * @return The number of GC compounds in the sequence 080 */ 081 public static int countGC(Sequence<NucleotideCompound> sequence) { 082 CompoundSet<NucleotideCompound> cs = sequence.getCompoundSet(); 083 NucleotideCompound G = cs.getCompoundForString("G"); 084 NucleotideCompound C = cs.getCompoundForString("C"); 085 NucleotideCompound g = cs.getCompoundForString("g"); 086 NucleotideCompound c = cs.getCompoundForString("c"); 087 return countCompounds(sequence, G, C, g, c); 088 } 089 090 /** 091 * Returns the count of AT in the given sequence 092 * 093 * @param sequence The {@link NucleotideCompound} {@link Sequence} to perform 094 * the AT analysis on 095 * @return The number of AT compounds in the sequence 096 */ 097 public static int countAT(Sequence<NucleotideCompound> sequence) { 098 CompoundSet<NucleotideCompound> cs = sequence.getCompoundSet(); 099 NucleotideCompound A = cs.getCompoundForString("A"); 100 NucleotideCompound T = cs.getCompoundForString("T"); 101 NucleotideCompound a = cs.getCompoundForString("a"); 102 NucleotideCompound t = cs.getCompoundForString("t"); 103 return countCompounds(sequence, A, T, a, t); 104 } 105 106 /** 107 * Analogous to {@link #getComposition(Sequence)} but returns the 108 * distribution of that {@link Compound} over the given sequence. 109 * 110 * @param <C> The type of compound to look for 111 * @param sequence The type of sequence to look over 112 * @return Returns the decimal fraction of the compounds in the given 113 * sequence. Any compound not in the Map will return a fraction of 0. 114 */ 115 public static <C extends Compound> Map<C, Double> getDistribution(Sequence<C> sequence) { 116 Map<C, Double> results = new HashMap<>(); 117 Map<C, Integer> composition = getComposition(sequence); 118 double length = sequence.getLength(); 119 for (Map.Entry<C, Integer> entry : composition.entrySet()) { 120 double dist = entry.getValue().doubleValue() / length; 121 results.put(entry.getKey(), dist); 122 } 123 return results; 124 } 125 126 /** 127 * Does a linear scan over the given Sequence and records the number of 128 * times each base appears. The returned map will return 0 if a compound 129 * is asked for and the Map has no record of it. 130 * 131 * @param <C> The type of compound to look for 132 * @param sequence The type of sequence to look over 133 * @return Counts for the instances of all compounds in the sequence 134 */ 135 public static <C extends Compound> Map<C, Integer> getComposition(Sequence<C> sequence) { 136 Map<C, Integer> results = new HashMap<>(); 137 138 for (C currentCompound : sequence) { 139 Integer currentInteger = results.get(currentCompound); 140 if ( currentInteger == null) 141 currentInteger = 0; 142 currentInteger++; 143 results.put(currentCompound, currentInteger); 144 } 145 return results; 146 } 147 148 /** 149 * Used as a way of sending a Sequence to a writer without the cost of 150 * converting to a full length String and then writing the data out 151 * 152 * @param <C> Type of compound 153 * @param appendable The writer to send data to 154 * @param sequence The sequence to write out 155 * @throws IOException Thrown if we encounter a problem 156 */ 157 public static <C extends Compound> void write(Appendable appendable, Sequence<C> sequence) throws IOException { 158 for(C compound: sequence) { 159 appendable.append(compound.toString()); 160 } 161 } 162 163 /** 164 * For the given Sequence this will return a {@link StringBuilder} object 165 * filled with the results of {@link Compound#toString()}. Does not 166 * used {@link #write(java.lang.Appendable, org.biojava.nbio.core.sequence.template.Sequence) } 167 * because of its {@link IOException} signature. 168 */ 169 public static <C extends Compound> StringBuilder toStringBuilder(Sequence<C> sequence) { 170 StringBuilder sb = new StringBuilder(sequence.getLength()); 171 for (C compound : sequence) { 172 sb.append(compound.toString()); 173 } 174 return sb; 175 } 176 177 /** 178 * Shortcut to {@link #toStringBuilder(org.biojava.nbio.core.sequence.template.Sequence)} 179 * which calls toString() on the resulting object. 180 */ 181 public static <C extends Compound> String toString(Sequence<C> sequence) { 182 return toStringBuilder(sequence).toString(); 183 } 184 185 /** 186 * For the given {@link Sequence} this will return a {@link List} filled with 187 * the Compounds of that {@link Sequence}. 188 */ 189 public static <C extends Compound> List<C> toList(Sequence<C> sequence) { 190 List<C> list = new ArrayList<>(sequence.getLength()); 191 for (C compound : sequence) { 192 list.add(compound); 193 } 194 return list; 195 } 196 197 /** 198 * Performs a linear search of the given Sequence for the given compound. 199 * Once we find the compound we return the position. 200 */ 201 public static <C extends Compound> int indexOf(Sequence<C> sequence, 202 C compound) { 203 int index = 1; 204 for (C currentCompound : sequence) { 205 if (currentCompound.equals(compound)) { 206 return index; 207 } 208 index++; 209 } 210 return 0; 211 } 212 213 /** 214 * Performs a reversed linear search of the given Sequence by wrapping 215 * it in a {@link ReversedSequenceView} and passing it into 216 * {@link #indexOf(Sequence, Compound)}. We then inverse the index coming 217 * out of it. 218 */ 219 public static <C extends Compound> int lastIndexOf(Sequence<C> sequence, 220 C compound) { 221 int index = indexOf(new ReversedSequenceView<C>(sequence), compound); 222 return (sequence.getLength() - index)+1; 223 } 224 225 /** 226 * Creates a simple sequence iterator which moves through a sequence going 227 * from 1 to the length of the Sequence. Modification of the Sequence is not 228 * allowed. 229 */ 230 public static <C extends Compound> Iterator<C> createIterator( 231 Sequence<C> sequence) { 232 return new SequenceIterator<>(sequence); 233 } 234 235 /** 236 * Creates a simple sub sequence view delimited by the given start and end. 237 */ 238 public static <C extends Compound> SequenceView<C> createSubSequence( 239 Sequence<C> sequence, int start, int end) { 240 return new SequenceProxyView<>(sequence, start, end); 241 } 242 243 /** 244 * Implements sequence shuffling by first materializing the given 245 * {@link Sequence} into a {@link List}, applying 246 * {@link Collections#shuffle(List)} and then returning the shuffled 247 * elements in a new instance of {@link Sequence} which behaves 248 * as a {@link Sequence}. 249 */ 250 public static <C extends Compound> Sequence<C> shuffle(Sequence<C> sequence) { 251 List<C> compounds = sequence.getAsList(); 252 Collections.shuffle(compounds); 253 return new ArrayListSequenceReader<>(compounds, 254 sequence.getCompoundSet()); 255 } 256 257 /** 258 * Performs a simple CRC64 checksum on any given sequence. 259 */ 260 public static <C extends Compound> String checksum(Sequence<C> sequence) { 261 CRC64Checksum checksum = new CRC64Checksum(); 262 for (C compound : sequence) { 263 checksum.update(compound.getShortName()); 264 } 265 return checksum.toString(); 266 } 267 268 /** 269 * Produces kmers of the specified size e.g. ATGTGA returns two views which 270 * have ATG TGA 271 * 272 * @param <C> Compound to use 273 * @param sequence Sequence to build from 274 * @param kmer Kmer size 275 * @return The list of non-overlapping K-mers 276 */ 277 public static <C extends Compound> List<SequenceView<C>> nonOverlappingKmers(Sequence<C> sequence, int kmer) { 278 List<SequenceView<C>> l = new ArrayList<>(); 279 WindowedSequence<C> w = new WindowedSequence<>(sequence, kmer); 280 for(SequenceView<C> view: w) { 281 l.add(view); 282 } 283 return l; 284 } 285 286 /** 287 * Used to generate overlapping k-mers such i.e. ATGTA will give rise to 288 * ATG, TGT & GTA 289 * 290 * @param <C> Compound to use 291 * @param sequence Sequence to build from 292 * @param kmer Kmer size 293 * @return The list of overlapping K-mers 294 */ 295 public static <C extends Compound> List<SequenceView<C>> overlappingKmers(Sequence<C> sequence, int kmer) { 296 List<SequenceView<C>> l = new ArrayList<>(); 297 List<Iterator<SequenceView<C>>> windows 298 = new ArrayList<>(); 299 300 for(int i=1; i<=kmer; i++) { 301 if(i == 1) { 302 windows.add(new WindowedSequence<C>(sequence, kmer).iterator()); 303 } 304 else { 305 SequenceView<C> sv = sequence.getSubSequence(i, sequence.getLength()); 306 windows.add(new WindowedSequence<C>(sv, kmer).iterator()); 307 } 308 } 309 310 OUTER: while(true) { 311 for(int i=0; i<kmer; i++) { 312 Iterator<SequenceView<C>> iterator = windows.get(i); 313 boolean breakLoop=true; 314 if(iterator.hasNext()) { 315 l.add(iterator.next()); 316 breakLoop = false; 317 } 318 if(breakLoop) { 319 break OUTER; 320 } 321 } 322 } 323 return l; 324 } 325 326 /** 327 * A method which attempts to do the right thing when is comes to a 328 * reverse/reverse complement 329 * 330 * @param <C> The type of compound 331 * @param sequence The input sequence 332 * @return The inverted sequence which is optionally complemented 333 */ 334 @SuppressWarnings({ "unchecked" }) 335 public static <C extends Compound> SequenceView<C> inverse(Sequence<C> sequence) { 336 SequenceView<C> reverse = new ReversedSequenceView<>(sequence); 337 if(sequence.getCompoundSet().isComplementable()) { 338 return new ComplementSequenceView(reverse); 339 } 340 return reverse; 341 } 342 343 /** 344 * A case-insensitive manner of comparing two sequence objects together. 345 * We will throw out any compounds which fail to match on their sequence 346 * length & compound sets used. The code will also bail out the moment 347 * we find something is wrong with a Sequence. Cost to run is linear to 348 * the length of the Sequence. 349 * 350 * @param <C> The type of compound 351 * @param source Source sequence to assess 352 * @param target Target sequence to assess 353 * @return Boolean indicating if the sequences matched ignoring case 354 */ 355 public static <C extends Compound> boolean sequenceEqualityIgnoreCase(Sequence<C> source, Sequence<C> target) { 356 return baseSequenceEquality(source, target, true); 357 } 358 359 /** 360 * A case-sensitive manner of comparing two sequence objects together. 361 * We will throw out any compounds which fail to match on their sequence 362 * length & compound sets used. The code will also bail out the moment 363 * we find something is wrong with a Sequence. Cost to run is linear to 364 * the length of the Sequence. 365 * 366 * @param <C> The type of compound 367 * @param source Source sequence to assess 368 * @param target Target sequence to assess 369 * @return Boolean indicating if the sequences matched 370 */ 371 public static <C extends Compound> boolean sequenceEquality(Sequence<C> source, Sequence<C> target) { 372 return baseSequenceEquality(source, target, false); 373 } 374 375 private static <C extends Compound> boolean baseSequenceEquality(Sequence<C> source, Sequence<C> target, boolean ignoreCase) { 376 boolean equal = true; 377 if( 378 source.getLength() == target.getLength() && 379 source.getCompoundSet().equals(target.getCompoundSet())) { 380 Iterator<C> sIter = source.iterator(); 381 Iterator<C> tIter = target.iterator(); 382 while(sIter.hasNext()) { 383 C s = sIter.next(); 384 C t = tIter.next(); 385 boolean cEqual = (ignoreCase) ? s.equalsIgnoreCase(t) : s.equals(t); 386 if(!cEqual) { 387 equal = false; 388 break; 389 } 390 } 391 } 392 else { 393 equal = false; 394 } 395 return equal; 396 } 397 398 /** 399 * A basic sequence iterator which iterates over the given Sequence by 400 * biological index. This assumes your sequence supports random access 401 * and performs well when doing these operations. 402 * 403 * @author ayates 404 * 405 * @param <C> Type of compound to return 406 */ 407 public static class SequenceIterator<C extends Compound> 408 implements Iterator<C> { 409 410 private final Sequence<C> sequence; 411 private final int length; 412 private int currentPosition = 0; 413 414 public SequenceIterator(Sequence<C> sequence) { 415 this.sequence = sequence; 416 this.length = sequence.getLength(); 417 } 418 419 420 @Override 421 public boolean hasNext() { 422 return (currentPosition < length); 423 } 424 425 426 @Override 427 public C next() { 428 if(!hasNext()) { 429 throw new NoSuchElementException("Exhausted sequence of elements"); 430 } 431 return sequence.getCompoundAt(++currentPosition); 432 } 433 434 @Override 435 public void remove() { 436 throw new UnsupportedOperationException("Cannot remove() on a SequenceIterator"); 437 } 438 } 439}