/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 */
package org.biojava.nbio.genome.io.fastq;

import com.google.common.collect.ImmutableList;
import com.google.common.collect.Lists;
import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
import org.biojava.nbio.core.sequence.DNASequence;
import org.biojava.nbio.core.sequence.compound.NucleotideCompound;
import org.biojava.nbio.core.sequence.features.QualityFeature;
import org.biojava.nbio.core.sequence.features.QuantityFeature;
import org.biojava.nbio.core.sequence.template.AbstractSequence;

import java.util.List;
033
034/**
035 * Utility methods for FASTQ formatted sequences.
036 *
037 * @since 3.0.3
038 */
039public final class FastqTools
040{
041
042        /**
043         * Private no-arg constructor.
044         */
045        private FastqTools()
046        {
047                // empty
048        }
049
050
051        /**
052         * Create and return a new {@link DNASequence} from the specified FASTQ formatted sequence.
053         *
054         * @param fastq FASTQ formatted sequence, must not be null
055         * @return a new {@link DNASequence} from the specified FASTQ formatted sequence
056         * @throws CompoundNotFoundException if DNA sequence in fastq contains unrecognised compounds
057         */
058        public static DNASequence createDNASequence(final Fastq fastq) throws CompoundNotFoundException
059        {
060                if (fastq == null)
061                {
062                        throw new IllegalArgumentException("fastq must not be null");
063                }
064                DNASequence sequence = new DNASequence(fastq.getSequence());
065                sequence.setOriginalHeader(fastq.getDescription());
066                return sequence;
067        }
068
069        /**
070         * Create and return a new {@link DNASequence} with quality scores from the specified
071         * FASTQ formatted sequence.  The quality scores are stored in a {@link QualityFeature}
072         * with a type <code>"qualityScores"</code> the same length as the sequence.
073         *
074         * @param fastq FASTQ formatted sequence, must not be null
075         * @return a new {@link DNASequence} with quality scores from the specified FASTQ formatted sequence
076         * @throws CompoundNotFoundException if DNA sequence in fastq contains unrecognised compounds
077         */
078        public static DNASequence createDNASequenceWithQualityScores(final Fastq fastq) throws CompoundNotFoundException
079        {
080                DNASequence sequence = createDNASequence(fastq);
081                sequence.addFeature(1, sequence.getLength(), createQualityScores(fastq));
082                return sequence;
083        }
084
085        /**
086         * Create and return a new {@link DNASequence} with error probabilities from the specified
087         * FASTQ formatted sequence.  The error probabilities are stored in a {@link QuantityFeature}
088         * with a type <code>"errorProbabilities"</code> the same length as the sequence.
089         *
090         * @param fastq FASTQ formatted sequence, must not be null
091         * @return a new {@link DNASequence} with error probabilities from the specified FASTQ formatted sequence
092         * @throws CompoundNotFoundException if DNA sequence in fastq contains unrecognised compounds
093         */
094        public static DNASequence createDNASequenceWithErrorProbabilities(final Fastq fastq) throws CompoundNotFoundException
095        {
096                DNASequence sequence = createDNASequence(fastq);
097                sequence.addFeature(1, sequence.getLength(), createErrorProbabilities(fastq));
098                return sequence;
099        }
100
101        /**
102         * Create and return a new {@link DNASequence} with quality scores and error probabilities from the
103         * specified FASTQ formatted sequence.  The quality scores are stored in a {@link QualityFeature}
104         * with a type <code>"qualityScores"</code> the same length as the sequence and the error
105         * probabilities are stored in a {@link QuantityFeature} with a type <code>"errorProbabilities"</code>
106         * the same length as the sequence.
107         *
108         * @param fastq FASTQ formatted sequence, must not be null
109         * @return a new {@link DNASequence} with quality scores and error probabilities from the specified
110         *    FASTQ formatted sequence
111         * @throws CompoundNotFoundException if DNA sequence in fastq contains unrecognised compounds
112         */
113        public static DNASequence createDNASequenceWithQualityScoresAndErrorProbabilities(final Fastq fastq) throws CompoundNotFoundException
114        {
115                DNASequence sequence = createDNASequence(fastq);
116                sequence.addFeature(1, sequence.getLength(), createQualityScores(fastq));
117                sequence.addFeature(1, sequence.getLength(), createErrorProbabilities(fastq));
118                return sequence;
119        }
120
121        /**
122         * Create and return a new {@link QualityFeature} from the quality scores of the specified
123         * FASTQ formatted sequence.  The quality scores feature has a type <code>"qualityScores"</code>
124         * and will be the same length as the sequence.
125         *
126         * @param fastq FASTQ formatted sequence, must not be null
127         * @return a new {@link QualityFeature} from the quality scores of the specified FASTQ
128         *    formatted sequence
129         */
130        public static QualityFeature<AbstractSequence<NucleotideCompound>, NucleotideCompound> createQualityScores(final Fastq fastq)
131        {
132                if (fastq == null)
133                {
134                        throw new IllegalArgumentException("fastq must not be null");
135                }
136                QualityFeature<AbstractSequence<NucleotideCompound>, NucleotideCompound> qualityScores = new QualityFeature<>("qualityScores", "sequencing");
137                qualityScores.setQualities(toList(qualityScores(fastq)));
138                return qualityScores;
139        }
140
141        /**
142         * Create and return a new {@link QuantityFeature} from the error probabilities of the specified
143         * FASTQ formatted sequence.  The error probabilities feature has a type <code>"errorProbabilities"</code>
144         * and will be the same length as the sequence.
145         *
146         * @param fastq FASTQ formatted sequence, must not be null
147         * @return a new {@link QualityFeature} from the error probabilities of the specified FASTQ
148         *    formatted sequence
149         */
150        public static QuantityFeature<AbstractSequence<NucleotideCompound>, NucleotideCompound> createErrorProbabilities(final Fastq fastq)
151        {
152                if (fastq == null)
153                {
154                        throw new IllegalArgumentException("fastq must not be null");
155                }
156                QuantityFeature<AbstractSequence<NucleotideCompound>, NucleotideCompound> errorProbabilities = new QuantityFeature<>("errorProbabilities", "sequencing");
157                errorProbabilities.setQuantities(toList(errorProbabilities(fastq)));
158                return errorProbabilities;
159        }
160
161        /**
162         * Return the quality scores from the specified FASTQ formatted sequence.
163         *
164         * @param fastq FASTQ formatted sequence, must not be null
165         * @return the quality scores from the specified FASTQ formatted sequence
166         */
167        public static Iterable<Number> qualityScores(final Fastq fastq)
168        {
169                if (fastq == null)
170                {
171                        throw new IllegalArgumentException("fastq must not be null");
172                }
173                int size = fastq.getQuality().length();
174                List<Number> qualityScores = Lists.newArrayListWithExpectedSize(size);
175                FastqVariant variant = fastq.getVariant();
176                for (int i = 0; i < size; i++)
177                {
178                        char c = fastq.getQuality().charAt(i);
179                        qualityScores.add(variant.qualityScore(c));
180                }
181                return ImmutableList.copyOf(qualityScores);
182        }
183
184        /**
185         * Copy the quality scores from the specified FASTQ formatted sequence into the specified int array.
186         *
187         * @param fastq FASTQ formatted sequence, must not be null
188         * @param qualityScores int array of quality scores, must not be null and must be the same
189         *    length as the FASTQ formatted sequence quality
190         * @return the specified int array of quality scores
191         */
192        public static int[] qualityScores(final Fastq fastq, final int[] qualityScores)
193        {
194                if (fastq == null)
195                {
196                        throw new IllegalArgumentException("fastq must not be null");
197                }
198                if (qualityScores == null)
199                {
200                        throw new IllegalArgumentException("qualityScores must not be null");
201                }
202                int size = fastq.getQuality().length();
203                if (qualityScores.length != size)
204                {
205                        throw new IllegalArgumentException("qualityScores must be the same length as the FASTQ formatted sequence quality");
206                }
207                FastqVariant variant = fastq.getVariant();
208                for (int i = 0; i < size; i++)
209                {
210                        char c = fastq.getQuality().charAt(i);
211                        qualityScores[i] = variant.qualityScore(c);
212                }
213                return qualityScores;
214        }
215
216        /**
217         * Return the error probabilities from the specified FASTQ formatted sequence.
218         *
219         * @param fastq FASTQ formatted sequence, must not be null
220         * @return the error probabilities from the specified FASTQ formatted sequence
221         */
222        public static Iterable<Number> errorProbabilities(final Fastq fastq)
223        {
224                if (fastq == null)
225                {
226                        throw new IllegalArgumentException("fastq must not be null");
227                }
228                int size = fastq.getQuality().length();
229                List<Number> errorProbabilities = Lists.newArrayListWithExpectedSize(size);
230                FastqVariant variant = fastq.getVariant();
231                for (int i = 0; i < size; i++)
232                {
233                        char c = fastq.getQuality().charAt(i);
234                        errorProbabilities.add(variant.errorProbability(c));
235                }
236                return ImmutableList.copyOf(errorProbabilities);
237        }
238
239        /**
240         * Copy the error probabilities from the specified FASTQ formatted sequence into the specified double array.
241         *
242         * @param fastq FASTQ formatted sequence, must not be null
243         * @param errorProbabilities double array of error probabilities, must not be null and must be the same
244         *    length as the FASTQ formatted sequence quality
245         * @return the specified double array of error probabilities
246         */
247        public static double[] errorProbabilities(final Fastq fastq, final double[] errorProbabilities)
248        {
249                if (fastq == null)
250                {
251                        throw new IllegalArgumentException("fastq must not be null");
252                }
253                if (errorProbabilities == null)
254                {
255                        throw new IllegalArgumentException("errorProbabilities must not be null");
256                }
257                int size = fastq.getQuality().length();
258                if (errorProbabilities.length != size)
259                {
260                        throw new IllegalArgumentException("errorProbabilities must be the same length as the FASTQ formatted sequence quality");
261                }
262                FastqVariant variant = fastq.getVariant();
263                for (int i = 0; i < size; i++)
264                {
265                        char c = fastq.getQuality().charAt(i);
266                        errorProbabilities[i] = variant.errorProbability(c);
267                }
268                return errorProbabilities;
269        }
270
271        /**
272         * Convert the specified FASTQ formatted sequence to the
273         * specified FASTQ sequence format variant.
274         *
275         * @since 4.2
276         * @param fastq FASTQ formatted sequence, must not be null
277         * @param variant FASTQ sequence format variant, must not be null
278         * @return the specified FASTQ formatted sequence converted to the
279         *    specified FASTQ sequence format variant
280         */
281        public static Fastq convert(final Fastq fastq, final FastqVariant variant)
282        {
283                if (fastq == null)
284                {
285                        throw new IllegalArgumentException("fastq must not be null");
286                }
287                if (variant == null)
288                {
289                        throw new IllegalArgumentException("variant must not be null");
290                }
291                if (fastq.getVariant().equals(variant))
292                {
293                        return fastq;
294                }
295                return new Fastq(fastq.getDescription(), fastq.getSequence(), convertQualities(fastq, variant), variant);
296        }
297
298        /**
299         * Convert the qualities in the specified FASTQ formatted sequence to the
300         * specified FASTQ sequence format variant.
301         *
302         * @since 4.2
303         * @param fastq FASTQ formatted sequence, must not be null
304         * @param variant FASTQ sequence format variant, must not be null
305         * @return the qualities in the specified FASTQ formatted sequence converted to the
306         *    specified FASTQ sequence format variant
307         */
308        static String convertQualities(final Fastq fastq, final FastqVariant variant)
309        {
310                if (fastq == null)
311                {
312                        throw new IllegalArgumentException("fastq must not be null");
313                }
314                if (variant == null)
315                {
316                        throw new IllegalArgumentException("variant must not be null");
317                }
318                if (fastq.getVariant().equals(variant))
319                {
320                        return fastq.getQuality();
321                }
322                int size = fastq.getQuality().length();
323                double[] errorProbabilities = errorProbabilities(fastq, new double[size]);
324                StringBuilder sb = new StringBuilder(size);
325                for (int i = 0; i < size; i++)
326                {
327                        sb.append(variant.quality(variant.qualityScore(errorProbabilities[i])));
328                }
329                return sb.toString();
330        }
331
332        /**
333         * Return the specified iterable as a list.
334         *
335         * @paam <T> element type
336         * @param iterable iterable
337         * @return the specified iterable as a list
338         */
339        @SuppressWarnings("unchecked")
340        static <T> List<T> toList(final Iterable<? extends T> iterable)
341        {
342                if (iterable instanceof List)
343                {
344                        return (List<T>) iterable;
345                }
346                return ImmutableList.copyOf(iterable);
347        }
348}