001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021package org.biojava.bio.program.fastq; 022 023import java.util.List; 024 025import com.google.common.collect.Lists; 026import com.google.common.collect.ImmutableList; 027 028import org.biojava.bio.Annotation; 029 030import org.biojava.bio.dist.Distribution; 031 032import org.biojava.bio.program.phred.PhredSequence; 033import org.biojava.bio.program.phred.PhredTools; 034 035import org.biojava.bio.seq.DNATools; 036import org.biojava.bio.seq.Sequence; 037 038import org.biojava.bio.symbol.IllegalAlphabetException; 039import org.biojava.bio.symbol.IllegalSymbolException; 040import org.biojava.bio.symbol.IntegerAlphabet; 041import org.biojava.bio.symbol.IntegerAlphabet.SubIntegerAlphabet; 042import org.biojava.bio.symbol.SymbolList; 043import org.biojava.bio.symbol.SimpleSymbolList; 044 045/** 046 * Utility methods for FASTQ formatted sequences. 047 * 048 * @since 1.8.2 049 */ 050public final class FastqTools 051{ 052 053 /** 054 * Private no-arg constructor. 055 */ 056 private FastqTools() 057 { 058 // empty 059 } 060 061 062 /** 063 * Create and return a new DNA {@link SymbolList} from the specified FASTQ formatted sequence. 064 * 065 * @param fastq FASTQ formatted sequence, must not be null 066 * @return a new DNA {@link SymbolList} from the specified FASTQ formatted sequence 067 * @throws IllegalSymbolException if an illegal symbol is found 068 */ 069 public static SymbolList createDNA(final Fastq fastq) throws IllegalSymbolException 070 { 071 if (fastq == null) 072 { 073 throw new IllegalArgumentException("fastq must not be null"); 074 } 075 return DNATools.createDNA(fastq.getSequence()); 076 } 077 078 /** 079 * Create and return a new {@link SymbolList} of quality scores from the specified FASTQ formatted sequence. 080 * 081 * @param fastq FASTQ formatted sequence, must not be null 082 * @return a new {@link SymbolList} of quality scores from the specified FASTQ formatted sequence 083 * @throws IllegalSymbolException if an illegal symbol is found 084 */ 085 public static SymbolList createQualityScores(final Fastq fastq) throws IllegalSymbolException 086 { 087 if (fastq == null) 088 { 089 throw new IllegalArgumentException("fastq must not be null"); 090 } 091 FastqVariant variant = fastq.getVariant(); 092 SubIntegerAlphabet alphabet = IntegerAlphabet.getSubAlphabet(variant.minimumQualityScore(), variant.maximumQualityScore()); 093 SimpleSymbolList qualitySymbols = new SimpleSymbolList(alphabet); 094 for (int i = 0, size = fastq.getQuality().length(); i < size; i++) 095 { 096 char c = fastq.getQuality().charAt(i); 097 qualitySymbols.addSymbol(alphabet.getSymbol(variant.qualityScore(c))); 098 } 099 return qualitySymbols; 100 } 101 102 /** 103 * Create and return a new DNA {@link Sequence} from the specified FASTQ formatted sequence. 104 * 105 * @param fastq FASTQ formatted sequence, must not be null 106 * @return a new {@link Sequence} from the specified FASTQ formatted sequence 107 * @throws IllegalSymbolException if an illegal symbol is found 108 */ 109 public static Sequence createDNASequence(final Fastq fastq) throws IllegalSymbolException 110 { 111 if (fastq == null) 112 { 113 throw new IllegalArgumentException("fastq must not be null"); 114 } 115 return DNATools.createDNASequence(fastq.getSequence(), fastq.getDescription()); 116 } 117 118 /** 119 * Create and return a new {@link PhredSequence} from the specified FASTQ formatted sequence. 120 * Only Sanger variant FASTQ formatted sequences are supported. 121 * 122 * @param fastq FASTQ formatted sequence, must not be null and must be Sanger variant 123 * @return a new {@link PhredSequence} from the specified FASTQ formatted sequence 124 * @throws IllegalAlphabetException if an illegal alphabet is used 125 * @throws IllegalSymbolException if an illegal symbol is found 126 */ 127 public static PhredSequence createPhredSequence(final Fastq fastq) throws IllegalAlphabetException, IllegalSymbolException 128 { 129 if (fastq == null) 130 { 131 throw new IllegalArgumentException("fastq must not be null"); 132 } 133 if (!fastq.getVariant().isSanger()) 134 { 135 throw new IllegalArgumentException("fastq must be sanger variant, was " + fastq.getVariant()); 136 } 137 SymbolList dnaSymbols = createDNA(fastq); 138 139 // 0-99 subinteger alphabet required by PhredSequence, thus only Sanger variant is supported 140 SubIntegerAlphabet alphabet = IntegerAlphabet.getSubAlphabet(0, 99); 141 SimpleSymbolList qualitySymbols = new SimpleSymbolList(alphabet); 142 for (int i = 0, size = fastq.getQuality().length(); i < size; i++) 143 { 144 char c = fastq.getQuality().charAt(i); 145 qualitySymbols.addSymbol(alphabet.getSymbol(FastqVariant.FASTQ_SANGER.qualityScore(c))); 146 } 147 148 SymbolList phredSymbols = PhredTools.createPhred(dnaSymbols, qualitySymbols); 149 return new PhredSequence(phredSymbols, fastq.getDescription(), null, Annotation.EMPTY_ANNOTATION); 150 } 151 152 /** 153 * Create and return a new array of symbol {@link Distribution}s from the specified FASTQ formatted sequence. 154 * Only Sanger variant FASTQ formatted sequences are supported. 155 * 156 * @param fastq FASTQ formatted sequence, must not be null and must be Sanger variant 157 * @return a new array of symbol {@link Distribution}s from the specified FASTQ formatted sequence 158 * @throws IllegalAlphabetException if an illegal alphabet is used 159 * @throws IllegalSymbolException if an illegal symbol is found 160 */ 161 public static Distribution[] createSymbolDistribution(final Fastq fastq) throws IllegalAlphabetException, IllegalSymbolException 162 { 163 PhredSequence phredSequence = createPhredSequence(fastq); 164 return PhredTools.phredToDistArray(phredSequence); 165 } 166 167 /** 168 * Return the quality scores from the specified FASTQ formatted sequence. 169 * 170 * @param fastq FASTQ formatted sequence, must not be null 171 * @return the quality scores from the specified FASTQ formatted sequence 172 */ 173 public static Iterable<Integer> qualityScores(final Fastq fastq) 174 { 175 if (fastq == null) 176 { 177 throw new IllegalArgumentException("fastq must not be null"); 178 } 179 int size = fastq.getQuality().length(); 180 List<Integer> qualityScores = Lists.newArrayListWithExpectedSize(size); 181 FastqVariant variant = fastq.getVariant(); 182 for (int i = 0; i < size; i++) 183 { 184 char c = fastq.getQuality().charAt(i); 185 qualityScores.add(variant.qualityScore(c)); 186 } 187 return ImmutableList.copyOf(qualityScores); 188 } 189 190 /** 191 * Copy the quality scores from the specified FASTQ formatted sequence into the specified int array. 192 * 193 * @param fastq FASTQ formatted sequence, must not be null 194 * @param qualityScores int array of quality scores, must not be null and must be the same 195 * length as the FASTQ formatted sequence quality 196 * @return the specified int array of quality scores 197 */ 198 public static int[] qualityScores(final Fastq fastq, final int[] qualityScores) 199 { 200 if (fastq == null) 201 { 202 throw new IllegalArgumentException("fastq must not be null"); 203 } 204 if (qualityScores == null) 205 { 206 throw new IllegalArgumentException("qualityScores must not be null"); 207 } 208 int size = fastq.getQuality().length(); 209 if (qualityScores.length != size) 210 { 211 throw new IllegalArgumentException("qualityScores must be the same length as the FASTQ formatted sequence quality"); 212 } 213 FastqVariant variant = fastq.getVariant(); 214 for (int i = 0; i < size; i++) 215 { 216 char c = fastq.getQuality().charAt(i); 217 qualityScores[i] = variant.qualityScore(c); 218 } 219 return qualityScores; 220 } 221 222 /** 223 * Return the error probabilities from the specified FASTQ formatted sequence. 224 * 225 * @param fastq FASTQ formatted sequence, must not be null 226 * @return the error probabilities from the specified FASTQ formatted sequence 227 */ 228 public static Iterable<Double> errorProbabilities(final Fastq fastq) 229 { 230 if (fastq == null) 231 { 232 throw new IllegalArgumentException("fastq must not be null"); 233 } 234 int size = fastq.getQuality().length(); 235 List<Double> errorProbabilities = Lists.newArrayListWithExpectedSize(size); 236 FastqVariant variant = fastq.getVariant(); 237 for (int i = 0; i < size; i++) 238 { 239 char c = fastq.getQuality().charAt(i); 240 errorProbabilities.add(variant.errorProbability(c)); 241 } 242 return ImmutableList.copyOf(errorProbabilities); 243 } 244 245 /** 246 * Copy the error probabilities from the specified FASTQ formatted sequence into the specified double array. 247 * 248 * @param fastq FASTQ formatted sequence, must not be null 249 * @param errorProbabilities double array of error probabilities, must not be null and must be the same 250 * length as the FASTQ formatted sequence quality 251 * @return the specified double array of error probabilities 252 */ 253 public static double[] errorProbabilities(final Fastq fastq, final double[] errorProbabilities) 254 { 255 if (fastq == null) 256 { 257 throw new IllegalArgumentException("fastq must not be null"); 258 } 259 if (errorProbabilities == null) 260 { 261 throw new IllegalArgumentException("errorProbabilities must not be null"); 262 } 263 int size = fastq.getQuality().length(); 264 if (errorProbabilities.length != size) 265 { 266 throw new IllegalArgumentException("errorProbabilities must be the same length as the FASTQ formatted sequence quality"); 267 } 268 FastqVariant variant = fastq.getVariant(); 269 for (int i = 0; i < size; i++) 270 { 271 char c = fastq.getQuality().charAt(i); 272 errorProbabilities[i] = variant.errorProbability(c); 273 } 274 return errorProbabilities; 275 } 276 277 /** 278 * Convert the specified FASTQ formatted sequence to the 279 * specified FASTQ sequence format variant. 280 * 281 * @since 1.9.3 282 * @param fastq FASTQ formatted sequence, must not be null 283 * @param variant FASTQ sequence format variant, must not be null 284 * @return the specified FASTQ formatted sequence converted to the 285 * specified FASTQ sequence format variant 286 */ 287 public static Fastq convert(final Fastq fastq, final FastqVariant variant) 288 { 289 if (fastq == null) 290 { 291 throw new IllegalArgumentException("fastq must not be null"); 292 } 293 if (variant == null) 294 { 295 throw new IllegalArgumentException("variant must not be null"); 296 } 297 if (fastq.getVariant().equals(variant)) 298 { 299 return fastq; 300 } 301 return new Fastq(fastq.getDescription(), fastq.getSequence(), convertQualities(fastq, variant), variant); 302 } 303 304 /** 305 * Convert the qualities in the specified FASTQ formatted sequence to the 306 * specified FASTQ sequence format variant. 307 * 308 * @since 1.9.3 309 * @param fastq FASTQ formatted sequence, must not be null 310 * @param variant FASTQ sequence format variant, must not be null 311 * @return the qualities in the specified FASTQ formatted sequence converted to the 312 * specified FASTQ sequence format variant 313 */ 314 static String convertQualities(final Fastq fastq, final FastqVariant variant) 315 { 316 if (fastq == null) 317 { 318 throw new IllegalArgumentException("fastq must not be null"); 319 } 320 if (variant == null) 321 { 322 throw new IllegalArgumentException("variant must not be null"); 323 } 324 if (fastq.getVariant().equals(variant)) 325 { 326 return fastq.getQuality(); 327 } 328 int size = fastq.getQuality().length(); 329 double[] errorProbabilities = errorProbabilities(fastq, new double[size]); 330 StringBuilder sb = new StringBuilder(size); 331 for (int i = 0; i < size; i++) 332 { 333 sb.append(variant.quality(variant.qualityScore(errorProbabilities[i]))); 334 } 335 return sb.toString(); 336 } 337}