/*
 * @(#)SequenceUtil.java 1.0 September 2009
 *
 * Copyright (c) 2009 Peter Troshin
 *
 *        BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 */

package org.biojava.nbio.data.sequence;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.*;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Utility class for operations on sequences: classification of a raw
 * sequence string (nucleotide / protein / ambiguous protein), clean-up of
 * sequence strings, and reading/writing of FASTA formatted streams.
 *
 * @author Peter Troshin
 * @version 1.0
 * @since 3.0.2
 */
public final class SequenceUtil {

	private static final Logger logger = LoggerFactory.getLogger(SequenceUtil.class);

	/**
	 * Character encoding used for both reading and writing FASTA streams, so
	 * that data written by this class can be read back by it.
	 */
	private static final String FASTA_CHARSET = "UTF-8";

	/**
	 * Underscores and dashes. Compiled once instead of on every
	 * {@link #deepCleanSequence(String)} call.
	 */
	private static final Pattern UNDERSCORE_OR_DASH = Pattern.compile("[_-]+");

	/**
	 * A whitespace character: [ \t\n\x0B\f\r]
	 */
	public static final Pattern WHITE_SPACE = Pattern.compile("\\s");

	/**
	 * A digit
	 */
	public static final Pattern DIGIT = Pattern.compile("\\d");

	/**
	 * Non word character, i.e. anything other than a letter, a digit or an
	 * underscore
	 */
	public static final Pattern NONWORD = Pattern.compile("\\W");

	/**
	 * Valid amino acids (the ambiguity code X is NOT included)
	 */
	public static final Pattern AA = Pattern.compile("[ARNDCQEGHILKMFPSTWYVUO]+",
			Pattern.CASE_INSENSITIVE);

	/**
	 * Any run of characters that are neither amino acid letters nor X. Note
	 * that this is the inversion of {@link #AMBIGUOUS_AA}, not of {@link #AA}:
	 * the letter X does not match this pattern.
	 */
	public static final Pattern NON_AA = Pattern.compile(
			"[^ARNDCQEGHILKMFPSTWYVXUO]+", Pattern.CASE_INSENSITIVE);

	/**
	 * Same as the AA pattern but with one additional letter - X
	 */
	public static final Pattern AMBIGUOUS_AA = Pattern.compile(
			"[ARNDCQEGHILKMFPSTWYVXUO]+", Pattern.CASE_INSENSITIVE);

	/**
	 * Nucleotides a, t, g, c, u
	 */
	public static final Pattern NUCLEOTIDE = Pattern.compile("[AGTCU]+",
			Pattern.CASE_INSENSITIVE);

	/**
	 * Ambiguous nucleotide (see IUPAC codes)
	 */
	public static final Pattern AMBIGUOUS_NUCLEOTIDE = Pattern.compile(
			"[AGTCRYMKSWHBVDNU]+", Pattern.CASE_INSENSITIVE);

	/**
	 * Non nucleotide: any run of characters other than a, t, g, c, u
	 */
	public static final Pattern NON_NUCLEOTIDE = Pattern.compile("[^AGTCU]+",
			Pattern.CASE_INSENSITIVE);

	private SequenceUtil() {
		// utility class, no instantiation
	}

	/**
	 * Checks whether the sequence held by the given FastaSequence is a
	 * non-ambiguous nucleotide sequence.
	 *
	 * @param s
	 *            the sequence to test
	 * @return true if the sequence contains only letters a, c, t, g, u
	 */
	public static boolean isNucleotideSequence(final FastaSequence s) {
		return SequenceUtil.isNonAmbNucleotideSequence(s.getSequence());
	}

	/**
	 * Checks whether the sequence consists solely of the non-ambiguous
	 * nucleotide letters a, g, t, c, u (case insensitive). Whitespace is
	 * removed before the check.
	 *
	 * @param sequence
	 *            the sequence to test
	 * @return true if, after whitespace removal, the sequence is non-empty
	 *         and contains nothing but a, g, t, c, u; false otherwise
	 */
	public static boolean isNonAmbNucleotideSequence(String sequence) {
		sequence = SequenceUtil.cleanSequence(sequence);
		if (SequenceUtil.DIGIT.matcher(sequence).find()) {
			return false;
		}
		if (SequenceUtil.NON_NUCLEOTIDE.matcher(sequence).find()) {
			return false;
		}
		// No foreign character was found, so this only fails for an empty
		// sequence.
		final Matcher dnaMatcher = SequenceUtil.NUCLEOTIDE.matcher(sequence);
		return dnaMatcher.find();
	}

	/**
	 * Removes all whitespace chars in the sequence string and converts the
	 * result to upper case.
	 *
	 * @param sequence
	 *            the sequence to clean, must not be null
	 * @return cleaned up, upper-cased sequence
	 */
	public static String cleanSequence(String sequence) {
		assert sequence != null;
		final Matcher m = SequenceUtil.WHITE_SPACE.matcher(sequence);
		// Locale.ROOT keeps the conversion independent of the default
		// locale (e.g. Turkish would otherwise map 'i' to a dotted capital
		// I, which none of the sequence patterns recognise).
		return m.replaceAll("").toUpperCase(Locale.ROOT);
	}

	/**
	 * Removes all special characters and digits as well as whitespace chars
	 * from the sequence. The result is upper-cased, see
	 * {@link #cleanSequence(String)}.
	 *
	 * @param sequence
	 *            the sequence to clean, must not be null
	 * @return cleaned up sequence
	 */
	public static String deepCleanSequence(String sequence) {
		sequence = SequenceUtil.cleanSequence(sequence);
		sequence = SequenceUtil.DIGIT.matcher(sequence).replaceAll("");
		sequence = SequenceUtil.NONWORD.matcher(sequence).replaceAll("");
		// The underscore counts as a word character, so it survives the
		// NONWORD pass and has to be removed separately.
		sequence = SequenceUtil.UNDERSCORE_OR_DASH.matcher(sequence).replaceAll("");
		return sequence;
	}

	/**
	 * Checks whether the sequence is a protein sequence. Whitespace is
	 * removed before the check. The letter X is tolerated because it is not
	 * matched by {@link #NON_AA}.
	 *
	 * @param sequence
	 *            the sequence to test
	 * @return true if the sequence is a protein sequence, false otherwise,
	 *         e.g. if it is a non-ambiguous nucleotide sequence or contains
	 *         digits or other non amino acid characters
	 */
	public static boolean isProteinSequence(String sequence) {
		sequence = SequenceUtil.cleanSequence(sequence);
		if (SequenceUtil.isNonAmbNucleotideSequence(sequence)) {
			return false;
		}
		if (SequenceUtil.DIGIT.matcher(sequence).find()) {
			return false;
		}
		if (SequenceUtil.NON_AA.matcher(sequence).find()) {
			logger.info("Found non aa: {}", sequence);
			return false;
		}
		final Matcher protMatcher = SequenceUtil.AA.matcher(sequence);
		return protMatcher.find();
	}

	/**
	 * Checks whether the sequence conforms to an ambiguous protein sequence,
	 * i.e. it is made of amino acid letters and contains at least one
	 * ambiguity letter X.
	 *
	 * @param sequence
	 *            the sequence to test
	 * @return true only if the sequence is an ambiguous protein sequence.
	 *         Returns false otherwise, e.g. if the sequence is a
	 *         non-ambiguous protein or DNA
	 */
	public static boolean isAmbiguosProtein(String sequence) {
		sequence = SequenceUtil.cleanSequence(sequence);
		if (SequenceUtil.isNonAmbNucleotideSequence(sequence)) {
			return false;
		}
		if (SequenceUtil.DIGIT.matcher(sequence).find()) {
			return false;
		}
		if (SequenceUtil.NON_AA.matcher(sequence).find()) {
			return false;
		}
		// The WHOLE sequence must be non-ambiguous to be rejected here.
		// find() would reject any sequence containing at least one standard
		// residue, leaving only all-X sequences as "ambiguous".
		if (SequenceUtil.AA.matcher(sequence).matches()) {
			return false;
		}
		final Matcher ambProt = SequenceUtil.AMBIGUOUS_AA.matcher(sequence);
		return ambProt.find();
	}

	/**
	 * Writes list of FastaSequences into the outstream formatting the
	 * sequence so that it contains width chars on each line. The output is
	 * encoded as UTF-8 and the stream is closed when the method returns,
	 * whether or not writing succeeded.
	 *
	 * @param outstream
	 *            the stream to write to
	 * @param sequences
	 *            the sequences to write
	 * @param width
	 *            - the maximum number of characters to write in one line
	 * @throws IOException
	 *             if writing to the stream fails
	 */
	public static void writeFasta(final OutputStream outstream,
			final List<FastaSequence> sequences, final int width)
			throws IOException {
		final BufferedWriter fastaWriter = SequenceUtil.newFastaWriter(outstream);
		try {
			for (final FastaSequence fs : sequences) {
				fastaWriter.write(fs.getFormatedSequence(width));
			}
		} finally {
			// close() flushes the buffer and closes the underlying stream
			fastaWriter.close();
		}
	}

	/**
	 * Reads fasta sequences from inStream into the list of FastaSequence
	 * objects. The input is decoded as UTF-8. A record starts with a line
	 * beginning with '&gt;'; the rest of that line becomes the sequence id
	 * and all following lines up to the next header, stripped of whitespace,
	 * become the sequence. Blank lines before the first header are ignored.
	 * The stream is closed when the method returns.
	 *
	 * @param inStream
	 *            the stream to read from
	 * @return list of FastaSequence objects, empty if the input contains no
	 *         records
	 * @throws IOException
	 *             if reading fails or if sequence data is found before the
	 *             first header line
	 */
	public static List<FastaSequence> readFasta(final InputStream inStream)
			throws IOException {
		final List<FastaSequence> seqs = new ArrayList<FastaSequence>();

		final BufferedReader infasta = new BufferedReader(
				new InputStreamReader(inStream, FASTA_CHARSET), 16000);
		try {
			String header = null; // current header line, including '>'
			final StringBuilder residues = new StringBuilder();
			String line;
			while ((line = infasta.readLine()) != null) {
				if (line.startsWith(">")) {
					if (header != null) {
						seqs.add(new FastaSequence(header.substring(1),
								residues.toString()));
					}
					header = line;
					residues.setLength(0);
				} else if (header != null) {
					residues.append(SequenceUtil.WHITE_SPACE.matcher(line)
							.replaceAll(""));
				} else if (line.trim().length() > 0) {
					throw new IOException("Malformed FASTA input: sequence "
							+ "data found before the first '>' header line");
				}
			}
			// flush the last record, which has no following header
			if (header != null) {
				seqs.add(new FastaSequence(header.substring(1),
						residues.toString()));
			}
		} finally {
			infasta.close();
		}
		return seqs;
	}

	/**
	 * Writes FastaSequences into the stream, each sequence will take one
	 * line only. The output is encoded as UTF-8 and the stream is closed
	 * when the method returns, whether or not writing succeeded.
	 *
	 * @param os
	 *            the stream to write to
	 * @param sequences
	 *            the sequences to write
	 * @throws IOException
	 *             if writing to the stream fails
	 */
	public static void writeFasta(final OutputStream os,
			final List<FastaSequence> sequences) throws IOException {
		final BufferedWriter fastaWriter = SequenceUtil.newFastaWriter(os);
		try {
			for (final FastaSequence fs : sequences) {
				fastaWriter.write(fs.getOnelineFasta());
			}
		} finally {
			// close() flushes the buffer and closes the underlying stream
			fastaWriter.close();
		}
	}

	/**
	 * Wraps the stream into a buffered writer using the FASTA charset.
	 */
	private static BufferedWriter newFastaWriter(final OutputStream os)
			throws IOException {
		return new BufferedWriter(new OutputStreamWriter(os, FASTA_CHARSET));
	}

}