001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021 022package org.biojavax.bio.seq.io; 023 024import java.io.BufferedInputStream; 025import java.io.BufferedReader; 026import java.io.File; 027import java.io.FileReader; 028import java.io.IOException; 029import java.io.InputStreamReader; 030import java.io.PrintStream; 031import java.util.Map; 032import java.util.regex.Matcher; 033import java.util.regex.Pattern; 034 035import org.biojava.bio.seq.Sequence; 036import org.biojava.bio.seq.io.ParseException; 037import org.biojava.bio.seq.io.SeqIOListener; 038import org.biojava.bio.seq.io.SymbolTokenization; 039import org.biojava.bio.symbol.IllegalSymbolException; 040import org.biojava.bio.symbol.SimpleSymbolList; 041import org.biojava.bio.symbol.Symbol; 042import org.biojava.bio.symbol.SymbolList; 043import org.biojava.utils.ChangeVetoException; 044import org.biojavax.Namespace; 045import org.biojavax.RichObjectFactory; 046import org.biojavax.SimpleNamespace; 047import org.biojavax.bio.seq.RichSequence; 048 049 050/** 051 * Format object representing FASTA files. These files are almost pure 052 * sequence data. 053 * @author Thomas Down 054 * @author Matthew Pocock 055 * @author Greg Cox 056 * @author Lukas Kall 057 * @author Richard Holland 058 * @author Mark Schreiber 059 * @author Carl Masak 060 * @since 1.5 061 */ 062 063public class FastaFormat extends RichSequenceFormat.HeaderlessFormat { 064 065 // Register this format with the format auto-guesser. 066 static { 067 RichSequence.IOTools.registerFormat(FastaFormat.class); 068 } 069 070 /** 071 * The name of this format 072 */ 073 public static final String FASTA_FORMAT = "FASTA"; 074 075 // header line 076 protected static final Pattern hp = Pattern.compile(">\\s*(\\S+)(\\s+(.*))?"); 077 // description chunk 078 protected static final Pattern dp = Pattern.compile( "^(gi\\|(\\d+)\\|)?(\\w+)\\|(\\w+?)(\\.(\\d+))?\\|(\\w+)?$"); 079 080 protected static final Pattern readableFiles = Pattern.compile(".*(fa|fas)$"); 081 protected static final Pattern aminoAcids = Pattern.compile("[FLIPQE]", Pattern.CASE_INSENSITIVE); 082 083 private FastaHeader header = new FastaHeader(); 084 085 /** 086 * {@inheritDoc} 087 * A file is in FASTA format if the name ends with fa or fas, or the file starts with ">". 088 */ 089 @Override 090 public boolean canRead(File file) throws IOException { 091 if (readableFiles.matcher(file.getName()).matches()) return true; 092 BufferedReader br = new BufferedReader(new FileReader(file)); 093 String firstLine = br.readLine(); 094 boolean readable = firstLine!=null && firstLine.startsWith(">"); 095 br.close(); 096 return readable; 097 } 098 099 /** 100 * {@inheritDoc} 101 * Returns an protein parser if the first line of sequence contains any of F/L/I/P/Q/E, 102 * otherwise returns a DNA tokenizer. 103 */ 104 @Override 105 public SymbolTokenization guessSymbolTokenization(File file) throws IOException { 106 BufferedReader br = new BufferedReader(new FileReader(file)); 107 br.readLine(); // discard first line 108 boolean aa = aminoAcids.matcher(br.readLine()).find(); 109 br.close(); 110 if (aa) return RichSequence.IOTools.getProteinParser(); 111 else return RichSequence.IOTools.getDNAParser(); 112 } 113 114 /** 115 * {@inheritDoc} 116 * A stream is in FASTA format if the stream starts with ">". 117 */ 118 public boolean canRead(BufferedInputStream stream) throws IOException { 119 stream.mark(2000); // some streams may not support this 120 BufferedReader br = new BufferedReader(new InputStreamReader(stream)); 121 String firstLine = br.readLine(); 122 boolean readable = firstLine!=null && firstLine.startsWith(">"); 123 // don't close the reader as it'll close the stream too. 124 // br.close(); 125 stream.reset(); 126 return readable; 127 } 128 129 /** 130 * {@inheritDoc} 131 * Returns an protein parser if the first line of sequence contains any of F/L/I/P/Q/E, 132 * otherwise returns a DNA tokenizer. 133 */ 134 public SymbolTokenization guessSymbolTokenization(BufferedInputStream stream) throws IOException { 135 stream.mark(2000); // some streams may not support this 136 BufferedReader br = new BufferedReader(new InputStreamReader(stream)); 137 br.readLine(); // discard first line 138 boolean aa = aminoAcids.matcher(br.readLine()).find(); 139 // don't close the reader as it'll close the stream too. 140 // br.close(); 141 stream.reset(); 142 if (aa) return RichSequence.IOTools.getProteinParser(); 143 else return RichSequence.IOTools.getDNAParser(); 144 } 145 146 /** 147 * {@inheritDoc} 148 */ 149 public boolean readSequence( 150 BufferedReader reader, 151 SymbolTokenization symParser, 152 SeqIOListener listener 153 ) throws 154 IllegalSymbolException, 155 IOException, 156 ParseException { 157 if (!(listener instanceof RichSeqIOListener)) throw new IllegalArgumentException("Only accepting RichSeqIOListeners today"); 158 return this.readRichSequence(reader,symParser,(RichSeqIOListener)listener,null); 159 } 160 161 /** 162 * {@inheritDoc} 163 * If namespace is null, then the namespace of the sequence in the fasta is used. 164 * If the namespace is null and so is the namespace of the sequence in the fasta, 165 * then the default namespace is used. 166 */ 167 public boolean readRichSequence( 168 BufferedReader reader, 169 SymbolTokenization symParser, 170 RichSeqIOListener rsiol, 171 Namespace ns 172 ) throws 173 IllegalSymbolException, 174 IOException, 175 ParseException { 176 177 String line = reader.readLine(); 178 if (line == null) { 179 throw new IOException("Premature stream end"); 180 } 181 while(line.length() == 0) { 182 line = reader.readLine(); 183 if (line == null) { 184 throw new IOException("Premature stream end"); 185 } 186 } 187 if (!line.startsWith(">")) { 188 throw new IOException("Stream does not appear to contain FASTA formatted data: " + line); 189 } 190 191 rsiol.startSequence(); 192 193 processHeader(line,rsiol,ns); 194 195 StringBuffer seq = new StringBuffer(); 196 boolean hasMoreSeq = true; 197 while (hasMoreSeq) { 198 reader.mark(500); 199 line = reader.readLine(); 200 if (line!=null) { 201 line = line.trim(); 202 if (line.length() > 0 && line.charAt(0)=='>') { 203 reader.reset(); 204 hasMoreSeq = false; 205 } else { 206 seq.append(line); 207 } 208 } else { 209 hasMoreSeq = false; 210 } 211 } 212 if (!this.getElideSymbols()) { 213 try { 214 SymbolList sl = new SimpleSymbolList(symParser, 215 seq.toString().replaceAll("\\s+","").replaceAll("[\\.|~]","-")); 216 rsiol.addSymbols(symParser.getAlphabet(), 217 (Symbol[])(sl.toList().toArray(new Symbol[0])), 218 0, sl.length()); 219 } catch (Exception e) { 220 // do not know name and gi any longer, replace them with empty string. 221 // why does the rsiol only have setter methods, but not getter??? 222 String message = ParseException.newMessage(this.getClass(), "", "", "problem parsing symbols", seq.toString()); 223 throw new ParseException(e, message); 224 } 225 } 226 227 rsiol.endSequence(); 228 229 return line!=null; 230 } 231 232 /** Parse the Header information from the Fasta Description line 233 * 234 * @param line 235 * @param rsiol 236 * @param ns 237 * @throws IOException 238 * @throws ParseException 239 */ 240 public void processHeader(String line,RichSeqIOListener rsiol,Namespace ns) 241 throws IOException, ParseException { 242 Matcher m = hp.matcher(line); 243 if (!m.matches()) { 244 throw new IOException("Stream does not appear to contain FASTA formatted data: " + line); 245 } 246 247 String name = m.group(1); 248 String desc = m.group(3); 249 String gi = null; 250 251 m = dp.matcher(name); 252 if (m.matches()) { 253 gi = m.group(2); 254 String namespace = m.group(3); 255 String accession = m.group(4); 256 String verString = m.group(6); 257 int version = verString==null?0:Integer.parseInt(verString); 258 name = m.group(7); 259 if (name==null) name=accession; 260 261 rsiol.setAccession(accession); 262 rsiol.setVersion(version); 263 if (gi!=null) rsiol.setIdentifier(gi); 264 if (ns==null) rsiol.setNamespace((Namespace)RichObjectFactory.getObject(SimpleNamespace.class,new Object[]{namespace})); 265 else rsiol.setNamespace(ns); 266 } else { 267 rsiol.setAccession(name); 268 rsiol.setNamespace((ns==null?RichObjectFactory.getDefaultNamespace():ns)); 269 } 270 rsiol.setName(name); 271 if (!this.getElideComments()) rsiol.setDescription(desc); 272 273 } 274 275 /** 276 * {@inheritDoc} 277 */ 278 public void writeSequence(Sequence seq, PrintStream os) throws IOException { 279 if (this.getPrintStream()==null) this.setPrintStream(os); 280 this.writeSequence(seq, RichObjectFactory.getDefaultNamespace()); 281 } 282 283 /** 284 * {@inheritDoc} 285 */ 286 public void writeSequence(Sequence seq, String format, PrintStream os) throws IOException { 287 if (this.getPrintStream()==null) this.setPrintStream(os); 288 if (!format.equals(this.getDefaultFormat())) throw new IllegalArgumentException("Unknown format: "+format); 289 this.writeSequence(seq, RichObjectFactory.getDefaultNamespace()); 290 } 291 292 293 /** 294 * {@inheritDoc} 295 * If namespace is null, then the sequence's own namespace is used. 296 */ 297 public void writeSequence(Sequence seq, Namespace ns) throws IOException { 298 RichSequence rs; 299 try { 300 if (seq instanceof RichSequence) rs = (RichSequence)seq; 301 else rs = RichSequence.Tools.enrich(seq); 302 } catch (ChangeVetoException e) { 303 IOException e2 = new IOException("Unable to enrich sequence"); 304 e2.initCause(e); 305 throw e2; 306 } 307 308 StringBuilder sb = new StringBuilder(); 309 sb.append(">"); 310 311 String identifier = rs.getIdentifier(); 312 if (header.isShowIdentifier() && identifier!=null && !"".equals(identifier)) { 313 sb.append("gi|"); 314 sb.append(identifier); 315 sb.append("|"); 316 } 317 if(header.isShowNamespace()){ 318 sb.append((ns==null?rs.getNamespace().getName():ns.getName())); 319 sb.append("|"); 320 } 321 if(header.isShowAccession()){ 322 sb.append(rs.getAccession()); 323 if(header.isShowVersion()){ 324 sb.append("."); 325 } 326 } 327 if(header.isShowVersion()){ 328 sb.append(rs.getVersion()); 329 sb.append("|"); 330 } 331 if(header.isShowName()){ 332 sb.append(rs.getName()); 333 sb.append(" "); 334 }else{ 335 sb.append(" "); //in case the show the description there needs to be space 336 } 337 if(header.isShowDescription()){ 338 String desc = rs.getDescription(); 339 if (desc!=null && !"".equals(desc)) sb.append(desc.replaceAll("\\n"," ")); 340 } 341 if(sb.charAt(sb.length() -1) == '|'){ 342 sb.deleteCharAt(sb.length() -1); 343 } 344 this.getPrintStream().print(sb.toString()); 345 this.getPrintStream().println(); 346 347 int length = rs.length(); 348 349 for (int pos = 1; pos <= length; pos += this.getLineWidth()) { 350 int end = Math.min(pos + this.getLineWidth() - 1, length); 351 this.getPrintStream().println(rs.subStr(pos, end)); 352 } 353 } 354 355 /** 356 * {@inheritDoc} 357 */ 358 public String getDefaultFormat() { 359 return FASTA_FORMAT; 360 } 361 362 public FastaHeader getHeader() { 363 return header; 364 } 365 366 public void setHeader(FastaHeader header) { 367 this.header = header; 368 } 369}