001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021package org.biojava.nbio.genome.io.fastq; 022 023import com.google.common.collect.ImmutableList; 024import com.google.common.collect.Lists; 025import org.biojava.nbio.core.exceptions.CompoundNotFoundException; 026import org.biojava.nbio.core.sequence.DNASequence; 027import org.biojava.nbio.core.sequence.compound.NucleotideCompound; 028import org.biojava.nbio.core.sequence.features.QualityFeature; 029import org.biojava.nbio.core.sequence.features.QuantityFeature; 030import org.biojava.nbio.core.sequence.template.AbstractSequence; 031 032import java.util.List; 033 034/** 035 * Utility methods for FASTQ formatted sequences. 036 * 037 * @since 3.0.3 038 */ 039public final class FastqTools 040{ 041 042 /** 043 * Private no-arg constructor. 044 */ 045 private FastqTools() 046 { 047 // empty 048 } 049 050 051 /** 052 * Create and return a new {@link DNASequence} from the specified FASTQ formatted sequence. 053 * 054 * @param fastq FASTQ formatted sequence, must not be null 055 * @return a new {@link DNASequence} from the specified FASTQ formatted sequence 056 * @throws CompoundNotFoundException if DNA sequence in fastq contains unrecognised compounds 057 */ 058 public static DNASequence createDNASequence(final Fastq fastq) throws CompoundNotFoundException 059 { 060 if (fastq == null) 061 { 062 throw new IllegalArgumentException("fastq must not be null"); 063 } 064 DNASequence sequence = new DNASequence(fastq.getSequence()); 065 sequence.setOriginalHeader(fastq.getDescription()); 066 return sequence; 067 } 068 069 /** 070 * Create and return a new {@link DNASequence} with quality scores from the specified 071 * FASTQ formatted sequence. The quality scores are stored in a {@link QualityFeature} 072 * with a type <code>"qualityScores"</code> the same length as the sequence. 073 * 074 * @param fastq FASTQ formatted sequence, must not be null 075 * @return a new {@link DNASequence} with quality scores from the specified FASTQ formatted sequence 076 * @throws CompoundNotFoundException if DNA sequence in fastq contains unrecognised compounds 077 */ 078 public static DNASequence createDNASequenceWithQualityScores(final Fastq fastq) throws CompoundNotFoundException 079 { 080 DNASequence sequence = createDNASequence(fastq); 081 sequence.addFeature(1, sequence.getLength(), createQualityScores(fastq)); 082 return sequence; 083 } 084 085 /** 086 * Create and return a new {@link DNASequence} with error probabilities from the specified 087 * FASTQ formatted sequence. The error probabilities are stored in a {@link QuantityFeature} 088 * with a type <code>"errorProbabilities"</code> the same length as the sequence. 089 * 090 * @param fastq FASTQ formatted sequence, must not be null 091 * @return a new {@link DNASequence} with error probabilities from the specified FASTQ formatted sequence 092 * @throws CompoundNotFoundException if DNA sequence in fastq contains unrecognised compounds 093 */ 094 public static DNASequence createDNASequenceWithErrorProbabilities(final Fastq fastq) throws CompoundNotFoundException 095 { 096 DNASequence sequence = createDNASequence(fastq); 097 sequence.addFeature(1, sequence.getLength(), createErrorProbabilities(fastq)); 098 return sequence; 099 } 100 101 /** 102 * Create and return a new {@link DNASequence} with quality scores and error probabilities from the 103 * specified FASTQ formatted sequence. The quality scores are stored in a {@link QualityFeature} 104 * with a type <code>"qualityScores"</code> the same length as the sequence and the error 105 * probabilities are stored in a {@link QuantityFeature} with a type <code>"errorProbabilities"</code> 106 * the same length as the sequence. 107 * 108 * @param fastq FASTQ formatted sequence, must not be null 109 * @return a new {@link DNASequence} with quality scores and error probabilities from the specified 110 * FASTQ formatted sequence 111 * @throws CompoundNotFoundException if DNA sequence in fastq contains unrecognised compounds 112 */ 113 public static DNASequence createDNASequenceWithQualityScoresAndErrorProbabilities(final Fastq fastq) throws CompoundNotFoundException 114 { 115 DNASequence sequence = createDNASequence(fastq); 116 sequence.addFeature(1, sequence.getLength(), createQualityScores(fastq)); 117 sequence.addFeature(1, sequence.getLength(), createErrorProbabilities(fastq)); 118 return sequence; 119 } 120 121 /** 122 * Create and return a new {@link QualityFeature} from the quality scores of the specified 123 * FASTQ formatted sequence. The quality scores feature has a type <code>"qualityScores"</code> 124 * and will be the same length as the sequence. 125 * 126 * @param fastq FASTQ formatted sequence, must not be null 127 * @return a new {@link QualityFeature} from the quality scores of the specified FASTQ 128 * formatted sequence 129 */ 130 public static QualityFeature<AbstractSequence<NucleotideCompound>, NucleotideCompound> createQualityScores(final Fastq fastq) 131 { 132 if (fastq == null) 133 { 134 throw new IllegalArgumentException("fastq must not be null"); 135 } 136 QualityFeature<AbstractSequence<NucleotideCompound>, NucleotideCompound> qualityScores = new QualityFeature<>("qualityScores", "sequencing"); 137 qualityScores.setQualities(toList(qualityScores(fastq))); 138 return qualityScores; 139 } 140 141 /** 142 * Create and return a new {@link QuantityFeature} from the error probabilities of the specified 143 * FASTQ formatted sequence. The error probabilities feature has a type <code>"errorProbabilities"</code> 144 * and will be the same length as the sequence. 145 * 146 * @param fastq FASTQ formatted sequence, must not be null 147 * @return a new {@link QualityFeature} from the error probabilities of the specified FASTQ 148 * formatted sequence 149 */ 150 public static QuantityFeature<AbstractSequence<NucleotideCompound>, NucleotideCompound> createErrorProbabilities(final Fastq fastq) 151 { 152 if (fastq == null) 153 { 154 throw new IllegalArgumentException("fastq must not be null"); 155 } 156 QuantityFeature<AbstractSequence<NucleotideCompound>, NucleotideCompound> errorProbabilities = new QuantityFeature<>("errorProbabilities", "sequencing"); 157 errorProbabilities.setQuantities(toList(errorProbabilities(fastq))); 158 return errorProbabilities; 159 } 160 161 /** 162 * Return the quality scores from the specified FASTQ formatted sequence. 163 * 164 * @param fastq FASTQ formatted sequence, must not be null 165 * @return the quality scores from the specified FASTQ formatted sequence 166 */ 167 public static Iterable<Number> qualityScores(final Fastq fastq) 168 { 169 if (fastq == null) 170 { 171 throw new IllegalArgumentException("fastq must not be null"); 172 } 173 int size = fastq.getQuality().length(); 174 List<Number> qualityScores = Lists.newArrayListWithExpectedSize(size); 175 FastqVariant variant = fastq.getVariant(); 176 for (int i = 0; i < size; i++) 177 { 178 char c = fastq.getQuality().charAt(i); 179 qualityScores.add(variant.qualityScore(c)); 180 } 181 return ImmutableList.copyOf(qualityScores); 182 } 183 184 /** 185 * Copy the quality scores from the specified FASTQ formatted sequence into the specified int array. 186 * 187 * @param fastq FASTQ formatted sequence, must not be null 188 * @param qualityScores int array of quality scores, must not be null and must be the same 189 * length as the FASTQ formatted sequence quality 190 * @return the specified int array of quality scores 191 */ 192 public static int[] qualityScores(final Fastq fastq, final int[] qualityScores) 193 { 194 if (fastq == null) 195 { 196 throw new IllegalArgumentException("fastq must not be null"); 197 } 198 if (qualityScores == null) 199 { 200 throw new IllegalArgumentException("qualityScores must not be null"); 201 } 202 int size = fastq.getQuality().length(); 203 if (qualityScores.length != size) 204 { 205 throw new IllegalArgumentException("qualityScores must be the same length as the FASTQ formatted sequence quality"); 206 } 207 FastqVariant variant = fastq.getVariant(); 208 for (int i = 0; i < size; i++) 209 { 210 char c = fastq.getQuality().charAt(i); 211 qualityScores[i] = variant.qualityScore(c); 212 } 213 return qualityScores; 214 } 215 216 /** 217 * Return the error probabilities from the specified FASTQ formatted sequence. 218 * 219 * @param fastq FASTQ formatted sequence, must not be null 220 * @return the error probabilities from the specified FASTQ formatted sequence 221 */ 222 public static Iterable<Number> errorProbabilities(final Fastq fastq) 223 { 224 if (fastq == null) 225 { 226 throw new IllegalArgumentException("fastq must not be null"); 227 } 228 int size = fastq.getQuality().length(); 229 List<Number> errorProbabilities = Lists.newArrayListWithExpectedSize(size); 230 FastqVariant variant = fastq.getVariant(); 231 for (int i = 0; i < size; i++) 232 { 233 char c = fastq.getQuality().charAt(i); 234 errorProbabilities.add(variant.errorProbability(c)); 235 } 236 return ImmutableList.copyOf(errorProbabilities); 237 } 238 239 /** 240 * Copy the error probabilities from the specified FASTQ formatted sequence into the specified double array. 241 * 242 * @param fastq FASTQ formatted sequence, must not be null 243 * @param errorProbabilities double array of error probabilities, must not be null and must be the same 244 * length as the FASTQ formatted sequence quality 245 * @return the specified double array of error probabilities 246 */ 247 public static double[] errorProbabilities(final Fastq fastq, final double[] errorProbabilities) 248 { 249 if (fastq == null) 250 { 251 throw new IllegalArgumentException("fastq must not be null"); 252 } 253 if (errorProbabilities == null) 254 { 255 throw new IllegalArgumentException("errorProbabilities must not be null"); 256 } 257 int size = fastq.getQuality().length(); 258 if (errorProbabilities.length != size) 259 { 260 throw new IllegalArgumentException("errorProbabilities must be the same length as the FASTQ formatted sequence quality"); 261 } 262 FastqVariant variant = fastq.getVariant(); 263 for (int i = 0; i < size; i++) 264 { 265 char c = fastq.getQuality().charAt(i); 266 errorProbabilities[i] = variant.errorProbability(c); 267 } 268 return errorProbabilities; 269 } 270 271 /** 272 * Convert the specified FASTQ formatted sequence to the 273 * specified FASTQ sequence format variant. 274 * 275 * @since 4.2 276 * @param fastq FASTQ formatted sequence, must not be null 277 * @param variant FASTQ sequence format variant, must not be null 278 * @return the specified FASTQ formatted sequence converted to the 279 * specified FASTQ sequence format variant 280 */ 281 public static Fastq convert(final Fastq fastq, final FastqVariant variant) 282 { 283 if (fastq == null) 284 { 285 throw new IllegalArgumentException("fastq must not be null"); 286 } 287 if (variant == null) 288 { 289 throw new IllegalArgumentException("variant must not be null"); 290 } 291 if (fastq.getVariant().equals(variant)) 292 { 293 return fastq; 294 } 295 return new Fastq(fastq.getDescription(), fastq.getSequence(), convertQualities(fastq, variant), variant); 296 } 297 298 /** 299 * Convert the qualities in the specified FASTQ formatted sequence to the 300 * specified FASTQ sequence format variant. 301 * 302 * @since 4.2 303 * @param fastq FASTQ formatted sequence, must not be null 304 * @param variant FASTQ sequence format variant, must not be null 305 * @return the qualities in the specified FASTQ formatted sequence converted to the 306 * specified FASTQ sequence format variant 307 */ 308 static String convertQualities(final Fastq fastq, final FastqVariant variant) 309 { 310 if (fastq == null) 311 { 312 throw new IllegalArgumentException("fastq must not be null"); 313 } 314 if (variant == null) 315 { 316 throw new IllegalArgumentException("variant must not be null"); 317 } 318 if (fastq.getVariant().equals(variant)) 319 { 320 return fastq.getQuality(); 321 } 322 int size = fastq.getQuality().length(); 323 double[] errorProbabilities = errorProbabilities(fastq, new double[size]); 324 StringBuilder sb = new StringBuilder(size); 325 for (int i = 0; i < size; i++) 326 { 327 sb.append(variant.quality(variant.qualityScore(errorProbabilities[i]))); 328 } 329 return sb.toString(); 330 } 331 332 /** 333 * Return the specified iterable as a list. 334 * 335 * @paam <T> element type 336 * @param iterable iterable 337 * @return the specified iterable as a list 338 */ 339 @SuppressWarnings("unchecked") 340 static <T> List<T> toList(final Iterable<? extends T> iterable) 341 { 342 if (iterable instanceof List) 343 { 344 return (List<T>) iterable; 345 } 346 return ImmutableList.copyOf(iterable); 347 } 348}