001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021
022package org.biojava.bio.seq.io;
023
024import java.io.BufferedReader;
025import java.io.IOException;
026import java.io.PrintStream;
027import java.io.Serializable;
028import java.util.Vector;
029import java.util.regex.Matcher;
030import java.util.regex.Pattern;
031
032import org.biojava.bio.Annotation;
033import org.biojava.bio.seq.Sequence;
034import org.biojava.bio.symbol.IllegalSymbolException;
035import org.biojava.utils.ParseErrorEvent;
036import org.biojava.utils.ParseErrorListener;
037
038
039/**
040 * Format object representing FASTA files. These files are almost pure
041 * sequence data. The only `sequence property' reported by this parser
042 * is PROPERTY_DESCRIPTIONLINE, which is the contents of the
043 * sequence's description line (the line starting with a '>'
044 * character). Normally, the first word of this is a sequence ID. If
045 * you wish it to be interpreted as such, you should use
046 * FastaDescriptionLineParser as a SeqIO filter.
047 *
048 * If you pass it a RichSeqIOListener, you'll get RichSequence objects
049 * in return. Likewise, if you write RichSequence objects, you'll get
050 * absolutely correct FASTA formatted output.
051 *
052 * @author Thomas Down
053 * @author Matthew Pocock
054 * @author Greg Cox
055 * @author Lukas Kall
056 * @author Richard Holland
057 * @author Mark Schreiber
058 * @deprecated Use org.biojavax.bio.seq.io.FastaFormat
059 */
060
061public class FastaFormat implements SequenceFormat,
062        Serializable,
063        org.biojava.utils.ParseErrorListener,
064        org.biojava.utils.ParseErrorSource {
065    public static final String DEFAULT = "FASTA";
066    
067    /**
068     * Constant string which is the property key used to notify
069     * listeners of the description lines of FASTA sequences.
070     */
071    public final static String PROPERTY_DESCRIPTIONLINE = "description_line";
072    
073    protected Vector mListeners = new Vector();
074    
075    /**
076     * The line width for output.
077     */
078    protected int lineWidth = 60;
079    
080    /**
081     * Retrive the current line width.
082     *
083     * @return the line width
084     */
085    public int getLineWidth() {
086        return lineWidth;
087    }
088    
089    /**
090     * Set the line width.
091     * <p>
092     * When writing, the lines of sequence will never be longer than the line
093     * width.
094     *
095     * @param width the new line width
096     */
097    public void setLineWidth(int width) {
098        this.lineWidth = width;
099    }
100    
101    /**
102     * Reads information from a flatfile to a <code>SeqIOListener</code>
103     * using a <code>SymbolTokenizer</code> to convert sequence strings
104     * to <code>Symbol</code> objects.
105     * @param reader The reader that is the source of the information
106     * @param symParser converts text seqeunce to biojava objects
107     * @param siol The listener that listens for event callbacks from this class.
108     * The listener can be a <code>RichSeqIOListener</code>.
109     * @throws org.biojava.bio.symbol.IllegalSymbolException if <code>symParser</code>
110     * doesn't know how to convert the text sequence into biojava <code>Symbol</code>s 
111     * @throws java.io.IOException if there is a problem reading.
112     * @throws org.biojava.bio.seq.io.ParseException if the source cannot be parsed.
113     * @return true if there is another unread sequence in the source.
114     */
115    public boolean readSequence(
116            BufferedReader reader,
117            SymbolTokenization symParser,
118            SeqIOListener siol
119            )   throws
120            IllegalSymbolException,
121            IOException,
122            ParseException {
123        String line = reader.readLine();
124        if (line == null) {
125            throw new IOException("Premature stream end");
126        }
127        while(line.length() == 0) {
128            line = reader.readLine();
129            if (line == null) {
130                throw new IOException("Premature stream end");
131            }
132        }
133        if (!line.startsWith(">")) {
134            throw new IOException("Stream does not appear to contain FASTA formatted data: " + line);
135        }
136        
137        siol.startSequence();
138        
139        String description = line.substring(1).trim();
140        
141        String regex = "(\\S+)(\\s+(.*))*";
142        Pattern p = Pattern.compile(regex);
143        Matcher m = p.matcher(description);
144        if (!m.matches()) {
145            throw new IOException("Stream does not appear to contain FASTA formatted data: " + line);
146        }
147        
148        String name = m.group(1);
149        
150        siol.setName(name);
151        siol.addSequenceProperty(PROPERTY_DESCRIPTIONLINE, description);
152        
153        boolean seenEOF = readSequenceData(reader, symParser, siol);
154        siol.endSequence();
155        
156        return !seenEOF;
157    }
158    
159    private boolean readSequenceData(
160            BufferedReader r,
161            SymbolTokenization parser,
162            SeqIOListener listener
163            ) throws
164            IOException,
165            IllegalSymbolException {
166        char[] cache = new char[512];
167        boolean reachedEnd = false, seenEOF = false;
168        StreamParser sparser = parser.parseStream(listener);
169        
170        while (!reachedEnd) {
171            r.mark(cache.length + 1);
172            int bytesRead = r.read(cache, 0, cache.length);
173            if (bytesRead < 0) {
174                reachedEnd = seenEOF = true;
175            } else {
176                int parseStart = 0;
177                int parseEnd = 0;
178                while (!reachedEnd && parseStart < bytesRead && cache[parseStart] != '>') {
179                    parseEnd = parseStart;
180                    
181                    while (parseEnd < bytesRead &&
182                            cache[parseEnd] != '\n' &&
183                            cache[parseEnd] != '\r'
184                            ) {
185                        ++parseEnd;
186                    }
187                    
188                    sparser.characters(cache, parseStart, parseEnd - parseStart);
189                    
190                    parseStart = parseEnd + 1;
191                    while (parseStart < bytesRead &&
192                            (cache[parseStart] == '\n' ||
193                            cache[parseStart] == '\r') ) {
194                        ++parseStart;
195                    }
196                }
197                if (parseStart < bytesRead && cache[parseStart] == '>') {
198                    try {
199                        r.reset();
200                    } catch (IOException ioe) {
201                        throw new IOException(
202                                "Can't reset: " +
203                                ioe.getMessage() +
204                                " parseStart=" + parseStart +
205                                " bytesRead=" + bytesRead
206                                );
207                    }
208                    if (r.skip(parseStart) != parseStart) {
209                        throw new IOException("Couldn't reset to start of next sequence");
210                    }
211                    reachedEnd = true;
212                }
213            }
214        }
215        
216        sparser.close();
217        return seenEOF;
218    }
219    
220    /**
221     * Return a suitable description line for a Sequence. If the
222     * sequence's annotation bundle contains PROPERTY_DESCRIPTIONLINE,
223     * this is used verbatim.  Otherwise, the sequence's name is used.
224     */
225    protected String describeSequence(Sequence seq) {
226        String description = null;
227        
228        Annotation seqAnn = seq.getAnnotation();
229        
230        if(seqAnn.containsProperty(PROPERTY_DESCRIPTIONLINE)) {
231            description = (String) seqAnn.getProperty(PROPERTY_DESCRIPTIONLINE);
232        } else {
233            description = seq.getName();
234        }
235        
236        return description;
237    }
238    
239    /**
240     * Writes a <code>Sequence</code> or <code>RichSequence</code> to a 
241     * <code>PrintStream</code> in FASTA format. If the sequence is a 
242     * <code>RichSequence</code> the format of the header will be in line with
243     * the NCBI standard.
244     * @param seq the sequence to format
245     * @param os the stream to write the sequence to. To print to screen use
246     * <code>System.out</code>
247     * @throws java.io.IOException if data cannot be written to <code>os</code>
248     */
249    public void writeSequence(Sequence seq, PrintStream os)
250    throws IOException {
251        os.print(">");
252        os.println(describeSequence(seq));
253        
254        int length = seq.length();
255        
256        for (int pos = 1; pos <= length; pos += lineWidth) {
257            int end = Math.min(pos + lineWidth - 1, length);
258            os.println(seq.subStr(pos, end));
259        }
260    }
261    
262    /**
263     * <code>writeSequence</code> writes a sequence to the specified
264     * <code>PrintStream</code>, using the specified format.
265     *
266     * @param seq a <code>Sequence</code> to write out.
267     * @param format a <code>String</code> indicating which sub-format
268     * of those available from a particular
269     * <code>SequenceFormat</code> implemention to use when
270     * writing.
271     * @param os a <code>PrintStream</code> object.
272     *
273     * @exception IOException if an error occurs.
274     * @deprecated use writeSequence(Sequence seq, PrintStream os)
275     */
276    public void writeSequence(Sequence seq, String format, PrintStream os)
277    throws IOException {
278        if (! format.equalsIgnoreCase(getDefaultFormat()))
279            throw new IllegalArgumentException("Unknown format '"
280                    + format
281                    + "'");
282        writeSequence(seq, os);
283    }
284    
285    /**
286     * <code>getDefaultFormat</code> returns the String identifier for
287     * the default format.
288     *
289     * @return a <code>String</code>.
290     * @deprecated
291     */
292    public String getDefaultFormat() {
293        return DEFAULT;
294    }
295    
296    /**
297     * Adds a parse error listener to the list of listeners if it isn't already
298     * included.
299     *
300     * @param theListener Listener to be added.
301     */
302    public synchronized void addParseErrorListener(ParseErrorListener theListener) {
303        if (mListeners.contains(theListener) == false) {
304            mListeners.addElement(theListener);
305        }
306    }
307    
308    /**
309     * Removes a parse error listener from the list of listeners if it is
310     * included.
311     *
312     * @param theListener Listener to be removed.
313     */
314    public synchronized void removeParseErrorListener(ParseErrorListener theListener) {
315        if (mListeners.contains(theListener) == true) {
316            mListeners.removeElement(theListener);
317        }
318    }
319    
320    /**
321     * This method determines the behaviour when a bad line is processed.
322     * Some options are to log the error, throw an exception, ignore it
323     * completely, or pass the event through.
324     * <p>
325     * This method should be overwritten when different behavior is desired.
326     *
327     * @param theEvent The event that contains the bad line and token.
328     */
329    public void BadLineParsed(org.biojava.utils.ParseErrorEvent theEvent) {
330        notifyParseErrorEvent(theEvent);
331    }
332    
333    // Protected methods
334    /**
335     * Passes the event on to all the listeners registered for ParseErrorEvents.
336     *
337     * @param theEvent The event to be handed to the listeners.
338     */
339    protected void notifyParseErrorEvent(ParseErrorEvent theEvent) {
340        Vector listeners;
341        synchronized(this) {
342            listeners = (Vector)mListeners.clone();
343        }
344        
345        for (int index = 0; index < listeners.size(); index++) {
346            ParseErrorListener client = (ParseErrorListener)listeners.elementAt(index);
347            client.BadLineParsed(theEvent);
348        }
349    }
350}