001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021package org.biojava.bio.program.fastq;
022
023import java.util.List;
024
025import com.google.common.collect.Lists;
026import com.google.common.collect.ImmutableList;
027
028import org.biojava.bio.Annotation;
029
030import org.biojava.bio.dist.Distribution;
031
032import org.biojava.bio.program.phred.PhredSequence;
033import org.biojava.bio.program.phred.PhredTools;
034
035import org.biojava.bio.seq.DNATools;
036import org.biojava.bio.seq.Sequence;
037
038import org.biojava.bio.symbol.IllegalAlphabetException;
039import org.biojava.bio.symbol.IllegalSymbolException;
040import org.biojava.bio.symbol.IntegerAlphabet;
041import org.biojava.bio.symbol.IntegerAlphabet.SubIntegerAlphabet;
042import org.biojava.bio.symbol.SymbolList;
043import org.biojava.bio.symbol.SimpleSymbolList;
044
045/**
046 * Utility methods for FASTQ formatted sequences.
047 *
048 * @since 1.8.2
049 */
050public final class FastqTools
051{
052
053    /**
054     * Private no-arg constructor.
055     */
056    private FastqTools()
057    {
058        // empty
059    }
060
061
062    /**
063     * Create and return a new DNA {@link SymbolList} from the specified FASTQ formatted sequence.
064     *
065     * @param fastq FASTQ formatted sequence, must not be null
066     * @return a new DNA {@link SymbolList} from the specified FASTQ formatted sequence
067     * @throws IllegalSymbolException if an illegal symbol is found
068     */
069    public static SymbolList createDNA(final Fastq fastq) throws IllegalSymbolException
070    {
071        if (fastq == null)
072        {
073            throw new IllegalArgumentException("fastq must not be null");
074        }
075        return DNATools.createDNA(fastq.getSequence());
076    }
077
078    /**
079     * Create and return a new {@link SymbolList} of quality scores from the specified FASTQ formatted sequence.
080     *
081     * @param fastq FASTQ formatted sequence, must not be null
082     * @return a new {@link SymbolList} of quality scores from the specified FASTQ formatted sequence
083     * @throws IllegalSymbolException if an illegal symbol is found
084     */
085    public static SymbolList createQualityScores(final Fastq fastq) throws IllegalSymbolException
086    {
087        if (fastq == null)
088        {
089            throw new IllegalArgumentException("fastq must not be null");
090        }
091        FastqVariant variant = fastq.getVariant();
092        SubIntegerAlphabet alphabet = IntegerAlphabet.getSubAlphabet(variant.minimumQualityScore(), variant.maximumQualityScore());
093        SimpleSymbolList qualitySymbols = new SimpleSymbolList(alphabet);
094        for (int i = 0, size = fastq.getQuality().length(); i < size; i++)
095        {
096            char c = fastq.getQuality().charAt(i);
097            qualitySymbols.addSymbol(alphabet.getSymbol(variant.qualityScore(c)));
098        }
099        return qualitySymbols;
100    }
101
102    /**
103     * Create and return a new DNA {@link Sequence} from the specified FASTQ formatted sequence.
104     *
105     * @param fastq FASTQ formatted sequence, must not be null
106     * @return a new {@link Sequence} from the specified FASTQ formatted sequence
107     * @throws IllegalSymbolException if an illegal symbol is found
108     */
109    public static Sequence createDNASequence(final Fastq fastq) throws IllegalSymbolException
110    {
111        if (fastq == null)
112        {
113            throw new IllegalArgumentException("fastq must not be null");
114        }
115        return DNATools.createDNASequence(fastq.getSequence(), fastq.getDescription());
116    }
117
118    /**
119     * Create and return a new {@link PhredSequence} from the specified FASTQ formatted sequence.
120     * Only Sanger variant FASTQ formatted sequences are supported.
121     *
122     * @param fastq FASTQ formatted sequence, must not be null and must be Sanger variant
123     * @return a new {@link PhredSequence} from the specified FASTQ formatted sequence
124     * @throws IllegalAlphabetException if an illegal alphabet is used
125     * @throws IllegalSymbolException if an illegal symbol is found
126     */
127    public static PhredSequence createPhredSequence(final Fastq fastq) throws IllegalAlphabetException, IllegalSymbolException
128    {
129        if (fastq == null)
130        {
131            throw new IllegalArgumentException("fastq must not be null");
132        }
133        if (!fastq.getVariant().isSanger())
134        {
135            throw new IllegalArgumentException("fastq must be sanger variant, was " + fastq.getVariant());
136        }
137        SymbolList dnaSymbols = createDNA(fastq);
138
139        // 0-99 subinteger alphabet required by PhredSequence, thus only Sanger variant is supported
140        SubIntegerAlphabet alphabet = IntegerAlphabet.getSubAlphabet(0, 99);
141        SimpleSymbolList qualitySymbols = new SimpleSymbolList(alphabet);
142        for (int i = 0, size = fastq.getQuality().length(); i < size; i++)
143        {
144            char c = fastq.getQuality().charAt(i);
145            qualitySymbols.addSymbol(alphabet.getSymbol(FastqVariant.FASTQ_SANGER.qualityScore(c)));
146        }
147
148        SymbolList phredSymbols = PhredTools.createPhred(dnaSymbols, qualitySymbols);
149        return new PhredSequence(phredSymbols, fastq.getDescription(), null, Annotation.EMPTY_ANNOTATION);
150    }
151
152    /**
153     * Create and return a new array of symbol {@link Distribution}s from the specified FASTQ formatted sequence.
154     * Only Sanger variant FASTQ formatted sequences are supported.
155     *
156     * @param fastq FASTQ formatted sequence, must not be null and must be Sanger variant
157     * @return a new array of symbol {@link Distribution}s from the specified FASTQ formatted sequence
158     * @throws IllegalAlphabetException if an illegal alphabet is used
159     * @throws IllegalSymbolException if an illegal symbol is found
160     */
161    public static Distribution[] createSymbolDistribution(final Fastq fastq) throws IllegalAlphabetException, IllegalSymbolException
162    {
163        PhredSequence phredSequence = createPhredSequence(fastq);
164        return PhredTools.phredToDistArray(phredSequence);
165    }
166
167    /**
168     * Return the quality scores from the specified FASTQ formatted sequence.
169     *
170     * @param fastq FASTQ formatted sequence, must not be null
171     * @return the quality scores from the specified FASTQ formatted sequence
172     */
173    public static Iterable<Integer> qualityScores(final Fastq fastq)
174    {
175        if (fastq == null)
176        {
177            throw new IllegalArgumentException("fastq must not be null");
178        }
179        int size = fastq.getQuality().length();
180        List<Integer> qualityScores = Lists.newArrayListWithExpectedSize(size);
181        FastqVariant variant = fastq.getVariant();
182        for (int i = 0; i < size; i++)
183        {
184            char c = fastq.getQuality().charAt(i);
185            qualityScores.add(variant.qualityScore(c));
186        }
187        return ImmutableList.copyOf(qualityScores);
188    }
189
190    /**
191     * Copy the quality scores from the specified FASTQ formatted sequence into the specified int array.
192     *
193     * @param fastq FASTQ formatted sequence, must not be null
194     * @param qualityScores int array of quality scores, must not be null and must be the same
195     *    length as the FASTQ formatted sequence quality
196     * @return the specified int array of quality scores
197     */
198    public static int[] qualityScores(final Fastq fastq, final int[] qualityScores)
199    {
200        if (fastq == null)
201        {
202            throw new IllegalArgumentException("fastq must not be null");
203        }
204        if (qualityScores == null)
205        {
206            throw new IllegalArgumentException("qualityScores must not be null");
207        }
208        int size = fastq.getQuality().length();
209        if (qualityScores.length != size)
210        {
211            throw new IllegalArgumentException("qualityScores must be the same length as the FASTQ formatted sequence quality");
212        }
213        FastqVariant variant = fastq.getVariant();
214        for (int i = 0; i < size; i++)
215        {
216            char c = fastq.getQuality().charAt(i);
217            qualityScores[i] = variant.qualityScore(c);
218        }
219        return qualityScores;
220    }
221
222    /**
223     * Return the error probabilities from the specified FASTQ formatted sequence.
224     *
225     * @param fastq FASTQ formatted sequence, must not be null
226     * @return the error probabilities from the specified FASTQ formatted sequence
227     */
228    public static Iterable<Double> errorProbabilities(final Fastq fastq)
229    {
230        if (fastq == null)
231        {
232            throw new IllegalArgumentException("fastq must not be null");
233        }
234        int size = fastq.getQuality().length();
235        List<Double> errorProbabilities = Lists.newArrayListWithExpectedSize(size);
236        FastqVariant variant = fastq.getVariant();
237        for (int i = 0; i < size; i++)
238        {
239            char c = fastq.getQuality().charAt(i);
240            errorProbabilities.add(variant.errorProbability(c));
241        }
242        return ImmutableList.copyOf(errorProbabilities);
243    }
244
245    /**
246     * Copy the error probabilities from the specified FASTQ formatted sequence into the specified double array.
247     *
248     * @param fastq FASTQ formatted sequence, must not be null
249     * @param errorProbabilities double array of error probabilities, must not be null and must be the same
250     *    length as the FASTQ formatted sequence quality
251     * @return the specified double array of error probabilities
252     */
253    public static double[] errorProbabilities(final Fastq fastq, final double[] errorProbabilities)
254    {
255        if (fastq == null)
256        {
257            throw new IllegalArgumentException("fastq must not be null");
258        }
259        if (errorProbabilities == null)
260        {
261            throw new IllegalArgumentException("errorProbabilities must not be null");
262        }
263        int size = fastq.getQuality().length();
264        if (errorProbabilities.length != size)
265        {
266            throw new IllegalArgumentException("errorProbabilities must be the same length as the FASTQ formatted sequence quality");
267        }
268        FastqVariant variant = fastq.getVariant();
269        for (int i = 0; i < size; i++)
270        {
271            char c = fastq.getQuality().charAt(i);
272            errorProbabilities[i] = variant.errorProbability(c);
273        }
274        return errorProbabilities;
275    }
276}