Source code

001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021
022package org.biojavax.bio.seq.io;
023
024import java.io.BufferedInputStream;
025import java.io.BufferedReader;
026import java.io.File;
027import java.io.IOException;
028import java.io.PrintStream;
029
030import org.biojava.bio.BioException;
031import org.biojava.bio.seq.Sequence;
032import org.biojava.bio.seq.io.SequenceFormat;
033import org.biojava.bio.seq.io.SymbolTokenization;
034import org.biojava.bio.symbol.IllegalSymbolException;
035import org.biojavax.Namespace;
036import org.biojavax.bio.seq.RichSequence;
037
038
039/**
040 * Allows a file format to be read/written as RichSequences.
041 * @author Richard Holland
042 * @since 1.5
043 */
044public interface RichSequenceFormat extends SequenceFormat {
045    /**
046     * Check to see if a given file is in our format. Some formats may be
047     * able to determine this by filename, whilst others may have to open the
048     * file and read it to see what format it is in.
049     * @param file  the <code>File</code> to check.
050     * @return true if the file is readable by this format, false if not.
051     * @throws IOException in case the file is inaccessible.
052     */
053    public boolean canRead(File file) throws IOException;
054    
055    /**
056     * On the assumption that the file is readable by this format (not checked),
057     * attempt to guess which symbol tokenization we should use to read it.
058     * For formats that only accept one tokenization, just return it without
059     * checking the file. For formats that accept multiple tokenizations, it's
060     * up to you how you do it.
061     * @param file  the <code>File</code> object to guess the format of.
062     * @return a <code>SymbolTokenization</code> to read the file with.
063     * @throws IOException if the file is unrecognisable or inaccessible.
064     */
065    public SymbolTokenization guessSymbolTokenization(File file) throws IOException;
066    
067    /**
068     * Check to see if a given stream is in our format. 
069     * @param stream the <code>BufferedInputStream</code> to check.
070     * @return true if the stream is readable by this format, false if not.
071     * @throws IOException in case the stream is inaccessible.
072     */
073    public boolean canRead(BufferedInputStream stream) throws IOException;
074    
075    /**
076     * On the assumption that the stream is readable by this format (not checked),
077     * attempt to guess which symbol tokenization we should use to read it.
078     * For formats that only accept one tokenization, just return it without
079     * checking the stream. For formats that accept multiple tokenizations, it's
080     * up to you how you do it.
081     * @param stream the <code>BufferedInputStream</code> object to guess the format of.
082     * @return a <code>SymbolTokenization</code> to read the stream with.
083     * @throws IOException if the stream is unrecognisable or inaccessible.
084     */
085    public SymbolTokenization guessSymbolTokenization(BufferedInputStream stream) throws IOException;
086    
087    /**
088     * Sets the stream to write to.
089     * @param os the PrintStream to write to.
090     * @throws IllegalArgumentException if the stream is rejected, e.g. {@link BasicFormat} refuses null.
091     */
092    public void setPrintStream(PrintStream os);
093    
094    /**
095     * Gets the print stream currently being written to.
096     * @return the current print stream.
097     */
098    public PrintStream getPrintStream();
099    
100    /**
101     * Informs the writer that we want to start writing. This will do any initialisation
102     * required, such as writing the opening tags of an XML file that groups sequences together.
103     * @throws IOException if writing fails.
104     */
105    public void beginWriting() throws IOException;
106    
107    /**
108     * Informs the writer that we are done writing. This will do any finalisation
109     * required, such as writing the closing tags of an XML file that groups sequences together.
110     * @throws IOException if writing fails.
111     */
112    public void finishWriting() throws IOException;
113    
114    /**
115     * Reads a sequence from the given buffered reader using the given tokenizer to parse
116     * sequence symbols. Events are passed to the listener, and the namespace used
117     * for sequences read is the one given. If the namespace is null, then the default
118     * namespace for the parser is used, which may depend on individual implementations
119     * of this interface.
120     * @param reader the input source
121     * @param symParser the tokenizer which understands the sequence being read
122     * @param listener the listener to send sequence events to
123     * @param ns the namespace to read sequences into.
124     * @return true if there is more to read after this, false otherwise.
125     * @throws BioException in case of parsing errors.
126     * @throws IllegalSymbolException if the tokenizer couldn't understand one of the
127     * sequence symbols in the file.
128     * @throws IOException if there was a read error.
129     */
130    public boolean readRichSequence(BufferedReader reader, SymbolTokenization symParser,
131            RichSeqIOListener listener, Namespace ns) throws BioException, IllegalSymbolException, IOException;
132    
133    /**
134     * Writes a sequence out to the print stream given by setPrintStream() using the default format of the
135     * implementing class. If namespace is given, sequences will be written with that
136     * namespace, otherwise they will be written with the default namespace of the
137     * implementing class (which is usually the namespace of the sequence itself).
138     * If you pass this method a sequence which is not a RichSequence, it will attempt to
139     * convert it using RichSequence.Tools.enrich(). Obviously this is not going to guarantee
140     * a perfect conversion, so it's better if you just use RichSequences to start with!
141     * @param seq the sequence to write
142     * @param ns the namespace to write it with
143     * @throws IOException in case it couldn't write something
144     */
145    public void writeSequence(Sequence seq, Namespace ns) throws IOException;
146    
147    /**
148     * Retrieve the current line width. Defaults to 80.
149     * @return the line width
150     */
151    public int getLineWidth();
152    
153    /**
154     * Set the line width. When writing, the lines of sequence will never be longer than the line
155     * width. Defaults to 80.
156     * @param width the new line width
157     */
158    public void setLineWidth(int width);
159    
160    /**
161     * Use this method to toggle reading of sequence data.
162     * @param elideSymbols set to true if you <em>don't</em> want the sequence data.
163     */
164    public void setElideSymbols(boolean elideSymbols);
165    
166    /**
167     * Is the format going to emit events when sequence data is read?
168     * @return true if it is <em>not</em> otherwise false (false is default).
169     */
170    public boolean getElideSymbols();
171    
172    /**
173     * Use this method to toggle reading of feature data.
174     * @param elideFeatures set to true if you <em>don't</em> want the feature data.
175     */
176    public void setElideFeatures(boolean elideFeatures);
177    
178    /**
179     * Is the format going to emit events when feature data is read?
180     * @return true if it is <em>not</em> otherwise false (false is default).
181     */
182    public boolean getElideFeatures();
183    
184    /**
185     * Use this method to toggle reading of bibliographic reference data.
186     * @param elideReferences set to true if you <em>don't</em> want the bibliographic reference data.
187     */
188    public void setElideReferences(boolean elideReferences);
189    
190    /**
191     * Is the format going to emit events when bibliographic reference data is read?
192     * @return true if it is <em>not</em> otherwise false (false is default).
193     */
194    public boolean getElideReferences();
195    
196    /**
197     * Use this method to toggle reading of comments data. Will also ignore remarks
198     * lines in bibliographic references.
199     * @param elideComments set to true if you <em>don't</em> want the comments data.
200     */
201    public void setElideComments(boolean elideComments);
202    
203    /**
204     * Is the format going to emit events when comments data or remarks from
205     * bibliographic references are read?
206     * @return true if it is <em>not</em> otherwise false (false is default).
207     */
208    public boolean getElideComments();
209    
210    /**
211     * Provides a basic format with simple things like line-widths precoded.
212     */
213    public abstract class BasicFormat implements RichSequenceFormat  {
214        
215        private int lineWidth = 80;
216        private boolean elideSymbols = false;
217        private boolean elideFeatures = false;
218        private boolean elideComments = false;
219        private boolean elideReferences = false;
220        private PrintStream os;
221        
222        /**
223         * {@inheritDoc}
224         */
225        public boolean canRead(File file) throws IOException {
226            return false;
227        }
228        
229        /**
230         * {@inheritDoc}
231         */
232        public SymbolTokenization guessSymbolTokenization(File file) throws IOException {
233            return RichSequence.IOTools.getDNAParser();
234        }
235        
236        /**
237         * {@inheritDoc}
238         */
239        public int getLineWidth() { return this.lineWidth; }
240        
241        /**
242         * {@inheritDoc}
243         */
244        public void setLineWidth(int width) {
245            if (width<1) throw new IllegalArgumentException("Width cannot be less than 1");
246            this.lineWidth = width;
247        }
248        
249        /**
250         * {@inheritDoc}
251         */
252        public boolean getElideSymbols() { return this.elideSymbols; }
253        
254        /**
255         * {@inheritDoc}
256         */
257        public void setElideSymbols(boolean elideSymbols) { this.elideSymbols = elideSymbols; }
258        
259        /**
260         * {@inheritDoc}
261         */
262        public boolean getElideFeatures() { return this.elideFeatures; }
263        
264        /**
265         * {@inheritDoc}
266         */
267        public void setElideFeatures(boolean elideFeatures) { this.elideFeatures = elideFeatures; }
268        
269        /**
270         * {@inheritDoc}
271         */
272        public boolean getElideReferences() { return this.elideReferences; }
273        
274        /**
275         * {@inheritDoc}
276         */
277        public void setElideReferences(boolean elideReferences) { this.elideReferences = elideReferences; }
278        
279        /**
280         * {@inheritDoc}
281         */
282        public boolean getElideComments() { return this.elideComments; }
283        
284        /**
285         * {@inheritDoc}
286         */
287        public void setElideComments(boolean elideComments) { this.elideComments = elideComments; }
288        
289        /**
290         * {@inheritDoc}
291         */
292        public void setPrintStream(PrintStream os) {
293            if (os==null) throw new IllegalArgumentException("Print stream cannot be null");
294            this.os = os;
295        }
296        
297        /**
298         * {@inheritDoc}
299         */
300        public PrintStream getPrintStream() { return this.os; }
301    }
302    
303    /**
304     * Provides the basic implementation required for simple header/footer-less files such as Genbank.
305     */
306    public abstract class HeaderlessFormat extends BasicFormat {
307        /**
308         * {@inheritDoc}
309         */
310        public void beginWriting() throws IOException {}
311        
312        /**
313         * {@inheritDoc}
314         */
315        public void finishWriting() throws IOException {}
316    }
317}