001/*
002 * @(#)SequenceUtil.java 1.0 September 2009
003 *
004 * Copyright (c) 2009 Peter Troshin
005 *
006 *        BioJava development code
007 *
008 * This code may be freely distributed and modified under the
009 * terms of the GNU Lesser General Public Licence.  This should
010 * be distributed with the code.  If you do not have a copy,
011 * see:
012 *
013 *      http://www.gnu.org/copyleft/lesser.html
014 *
015 * Copyright for this code is held jointly by the individual
016 * authors.  These should be listed in @author doc comments.
017 *
018 * For more information on the BioJava project and its aims,
019 * or to join the biojava-l mailing list, visit the home page
020 * at:
021 *
022 *      http://www.biojava.org/
023 *
024 */
025
026package org.biojava.nbio.data.sequence;
027
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.*;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
036
037/**
038 * Utility class for operations on sequences
039 *
040 * @author Peter Troshin
041 * @version 1.0
042 * @since 3.0.2
043 */
044public final class SequenceUtil {
045
046        private static final Logger logger = LoggerFactory.getLogger(SequenceUtil.class);
047
048        /**
049         * A single whitespace character: [ \t\n\x0B\f\r]
050         */
051        public static final Pattern WHITE_SPACE = Pattern.compile("\\s");
052
053        /**
054         * A single digit: [0-9]
055         */
056        public static final Pattern DIGIT = Pattern.compile("\\d");
057
058        /**
059         * A single non-word character: [^a-zA-Z_0-9]
060         */
061        public static final Pattern NONWORD = Pattern.compile("\\W");
062
063        /**
064         * One or more valid amino acid letters, case-insensitive (X is not included)
065         */
066        public static final Pattern AA = Pattern.compile("[ARNDCQEGHILKMFPSTWYVUO]+",
067                Pattern.CASE_INSENSITIVE);
068
069        /**
070         * Inversion of the AMBIGUOUS_AA pattern: one or more characters that are neither an amino acid letter nor X
071         */
072        public static final Pattern NON_AA = Pattern.compile(
073                "[^ARNDCQEGHILKMFPSTWYVXUO]+", Pattern.CASE_INSENSITIVE);
074
075        /**
076         * Same as the AA pattern but with one additional letter - X
077         */
078        public static final Pattern AMBIGUOUS_AA = Pattern.compile(
079                "[ARNDCQEGHILKMFPSTWYVXUO]+", Pattern.CASE_INSENSITIVE);
080
081        /**
082         * One or more nucleotide letters a, t, g, c, u (case-insensitive)
083         */
084        public static final Pattern NUCLEOTIDE = Pattern.compile("[AGTCU]+",
085                Pattern.CASE_INSENSITIVE);
086
087        /**
088         * One or more nucleotide letters including the ambiguity codes (case-insensitive)
089         */
090        public static final Pattern AMBIGUOUS_NUCLEOTIDE = Pattern.compile(
091                "[AGTCRYMKSWHBVDNU]+", Pattern.CASE_INSENSITIVE); // see IUPAC
092        /**
093         * Inversion of the NUCLEOTIDE pattern: one or more characters other than a, t, g, c, u
094         */
095        public static final Pattern NON_NUCLEOTIDE = Pattern.compile("[^AGTCU]+",
096                Pattern.CASE_INSENSITIVE);
097
098        private SequenceUtil() {
099        } // static utility class: the private constructor prevents instantiation
100
101        /*
102         * public static void write_PirSeq(OutputStream os, FastaSequence seq)
103         * throws IOException { BufferedWriter pir_out = new BufferedWriter(new
104         * OutputStreamWriter(os)); pir_out.write(">P1;" + seq.getId() +
105         * SysPrefs.newlinechar); pir_out.write(seq.getSequence() +
106         * SysPrefs.newlinechar); pir_out.close(); }
107         *
108         * public static void write_FastaSeq(OutputStream os, FastaSequence seq)
109         * throws IOException { BufferedWriter fasta_out = new BufferedWriter( new
110         * OutputStreamWriter(os)); fasta_out.write(">" + seq.getId() +
111         * SysPrefs.newlinechar); fasta_out.write(seq.getSequence() +
112         * SysPrefs.newlinechar); fasta_out.close(); }
113         */
114
115        /**
116         * @return true is the sequence contains only letters a,c, t, g, u
117         */
118        public static boolean isNucleotideSequence(final FastaSequence s) {
119        return SequenceUtil.isNonAmbNucleotideSequence(s.getSequence());
120        }
121
122        /**
123         * Ambiguous DNA chars : AGTCRYMKSWHBVDN // differs from protein in only one
124         * (!) - B char
125         */
126        public static boolean isNonAmbNucleotideSequence(String sequence) {
127        sequence = SequenceUtil.cleanSequence(sequence);
128        if (SequenceUtil.DIGIT.matcher(sequence).find()) {
129                return false;
130        }
131        if (SequenceUtil.NON_NUCLEOTIDE.matcher(sequence).find()) {
132                return false;
133                /*
134                 * System.out.format("I found the text starting at " +
135                 * "index %d and ending at index %d.%n", nonDNAmatcher .start(),
136                 * nonDNAmatcher.end());
137                 */
138        }
139        final Matcher DNAmatcher = SequenceUtil.NUCLEOTIDE.matcher(sequence);
140        return DNAmatcher.find();
141        }
142
143        /**
144         * Removes all whitespace chars in the sequence string
145         *
146         * @param sequence
147         * @return cleaned up sequence
148         */
149        public static String cleanSequence(String sequence) {
150        assert sequence != null;
151        final Matcher m = SequenceUtil.WHITE_SPACE.matcher(sequence);
152        sequence = m.replaceAll("").toUpperCase();
153        return sequence;
154        }
155
156        /**
157         * Removes all special characters and digits as well as whitespace chars
158         * from the sequence
159         *
160         * @param sequence
161         * @return cleaned up sequence
162         */
163        public static String deepCleanSequence(String sequence) {
164        sequence = SequenceUtil.cleanSequence(sequence);
165        sequence = SequenceUtil.DIGIT.matcher(sequence).replaceAll("");
166        sequence = SequenceUtil.NONWORD.matcher(sequence).replaceAll("");
167        final Pattern othernonSeqChars = Pattern.compile("[_-]+");
168        sequence = othernonSeqChars.matcher(sequence).replaceAll("");
169        return sequence;
170        }
171
172        /**
173         *
174         * @param sequence
175         * @return true is the sequence is a protein sequence, false overwise
176         */
177        public static boolean isProteinSequence(String sequence) {
178        sequence = SequenceUtil.cleanSequence(sequence);
179        if (SequenceUtil.isNonAmbNucleotideSequence(sequence)) {
180                return false;
181        }
182        if (SequenceUtil.DIGIT.matcher(sequence).find()) {
183                return false;
184        }
185        if (SequenceUtil.NON_AA.matcher(sequence).find()) {
186                logger.info("Found non aa: {}", sequence);
187                return false;
188        }
189        final Matcher protmatcher = SequenceUtil.AA.matcher(sequence);
190        return protmatcher.find();
191        }
192
193        /**
194         * Check whether the sequence confirms to amboguous protein sequence
195         *
196         * @param sequence
197         * @return return true only if the sequence if ambiguous protein sequence
198         *         Return false otherwise. e.g. if the sequence is non-ambiguous
199         *         protein or DNA
200         */
201        public static boolean isAmbiguosProtein(String sequence) {
202        sequence = SequenceUtil.cleanSequence(sequence);
203        if (SequenceUtil.isNonAmbNucleotideSequence(sequence)) {
204                return false;
205        }
206        if (SequenceUtil.DIGIT.matcher(sequence).find()) {
207                return false;
208        }
209        if (SequenceUtil.NON_AA.matcher(sequence).find()) {
210                return false;
211        }
212        if (SequenceUtil.AA.matcher(sequence).find()) {
213                return false;
214        }
215        final Matcher amb_prot = SequenceUtil.AMBIGUOUS_AA.matcher(sequence);
216        return amb_prot.find();
217        }
218
219        /**
220         * Writes list of FastaSequeces into the outstream formatting the sequence
221         * so that it contains width chars on each line
222         *
223         * @param outstream
224         * @param sequences
225         * @param width
226         *            - the maximum number of characters to write in one line
227         * @throws IOException
228         */
229        public static void writeFasta(final OutputStream outstream,
230                final List<FastaSequence> sequences, final int width)
231                throws IOException {
232        final OutputStreamWriter writer = new OutputStreamWriter(outstream);
233        final BufferedWriter fastawriter = new BufferedWriter(writer);
234        for (final FastaSequence fs : sequences) {
235                fastawriter.write(fs.getFormatedSequence(width));
236        }
237        outstream.flush();
238        fastawriter.close();
239        writer.close();
240        }
241
242        /**
243         * Reads fasta sequences from inStream into the list of FastaSequence
244         * objects
245         *
246         * @param inStream
247         *            from
248         * @return list of FastaSequence objects
249         * @throws IOException
250         */
251        public static List<FastaSequence> readFasta(final InputStream inStream)
252                throws IOException {
253        final List<FastaSequence> seqs = new ArrayList<FastaSequence>();
254
255        final BufferedReader infasta = new BufferedReader(
256                new InputStreamReader(inStream, "UTF8"), 16000);
257        final Pattern pattern = Pattern.compile("//s+");
258
259        String line;
260        String sname = "", seqstr = null;
261        do {
262                line = infasta.readLine();
263                if ((line == null) || line.startsWith(">")) {
264                if (seqstr != null) {
265                        seqs.add(new FastaSequence(sname.substring(1), seqstr));
266                }
267                sname = line; // remove >
268                seqstr = "";
269                } else {
270                final String subseq = pattern.matcher(line).replaceAll("");
271                seqstr += subseq;
272                }
273        } while (line != null);
274
275        infasta.close();
276        return seqs;
277        }
278
279        /**
280         * Writes FastaSequence in the file, each sequence will take one line only
281         *
282         * @param os
283         * @param sequences
284         * @throws IOException
285         */
286        public static void writeFasta(final OutputStream os,
287                final List<FastaSequence> sequences) throws IOException {
288        final OutputStreamWriter outWriter = new OutputStreamWriter(os);
289        final BufferedWriter fasta_out = new BufferedWriter(outWriter);
290        for (final FastaSequence fs : sequences) {
291                fasta_out.write(fs.getOnelineFasta());
292        }
293        fasta_out.close();
294        outWriter.close();
295        }
296
297}