001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021 022package org.biojavax.bio.seq.io; 023 024import java.io.BufferedInputStream; 025import java.io.BufferedReader; 026import java.io.File; 027import java.io.IOException; 028import java.io.PrintStream; 029 030import org.biojava.bio.BioException; 031import org.biojava.bio.seq.Sequence; 032import org.biojava.bio.seq.io.SequenceFormat; 033import org.biojava.bio.seq.io.SymbolTokenization; 034import org.biojava.bio.symbol.IllegalSymbolException; 035import org.biojavax.Namespace; 036import org.biojavax.bio.seq.RichSequence; 037 038 039/** 040 * Allows a file format to be read/written as RichSequences. 041 * @author Richard Holland 042 * @since 1.5 043 */ 044public interface RichSequenceFormat extends SequenceFormat { 045 /** 046 * Check to see if a given file is in our format. Some formats may be 047 * able to determine this by filename, whilst others may have to open the 048 * file and read it to see what format it is in. 049 * @param file the <code>File</code> to check. 050 * @return true if the file is readable by this format, false if not. 051 * @throws IOException in case the file is inaccessible. 052 */ 053 public boolean canRead(File file) throws IOException; 054 055 /** 056 * On the assumption that the file is readable by this format (not checked), 057 * attempt to guess which symbol tokenization we should use to read it. 058 * For formats that only accept one tokenization, just return it without 059 * checking the file. For formats that accept multiple tokenizations, its 060 * up to you how you do it. 061 * @param file the <code>File</code> object to guess the format of. 062 * @return a <code>SymbolTokenization</code> to read the file with. 063 * @throws IOException if the file is unrecognisable or inaccessible. 064 */ 065 public SymbolTokenization guessSymbolTokenization(File file) throws IOException; 066 067 /** 068 * Check to see if a given stream is in our format. 069 * @param stream the <code>BufferedInputStream</code> to check. 070 * @return true if the stream is readable by this format, false if not. 071 * @throws IOException in case the stream is inaccessible. 072 */ 073 public boolean canRead(BufferedInputStream stream) throws IOException; 074 075 /** 076 * On the assumption that the stream is readable by this format (not checked), 077 * attempt to guess which symbol tokenization we should use to read it. 078 * For formats that only accept one tokenization, just return it without 079 * checking the stream. For formats that accept multiple tokenizations, its 080 * up to you how you do it. 081 * @param stream the <code>BufferedInputStream</code> object to guess the format of. 082 * @return a <code>SymbolTokenization</code> to read the stream with. 083 * @throws IOException if the stream is unrecognisable or inaccessible. 084 */ 085 public SymbolTokenization guessSymbolTokenization(BufferedInputStream stream) throws IOException; 086 087 /** 088 * Sets the stream to write to. 089 * @param os the PrintStream to write to. 090 * @throws IOException if writing fails. 091 */ 092 public void setPrintStream(PrintStream os); 093 094 /** 095 * Gets the print stream currently being written to. 096 * @return the current print stream. 097 */ 098 public PrintStream getPrintStream(); 099 100 /** 101 * Informs the writer that we want to start writing. This will do any initialisation 102 * required, such as writing the opening tags of an XML file that groups sequences together. 103 * @throws IOException if writing fails. 104 */ 105 public void beginWriting() throws IOException; 106 107 /** 108 * Informs the writer that are done writing. This will do any finalisation 109 * required, such as writing the closing tags of an XML file that groups sequences together. 110 * @throws IOException if writing fails. 111 */ 112 public void finishWriting() throws IOException; 113 114 /** 115 * Reads a sequence from the given buffered reader using the given tokenizer to parse 116 * sequence symbols. Events are passed to the listener, and the namespace used 117 * for sequences read is the one given. If the namespace is null, then the default 118 * namespace for the parser is used, which may depend on individual implementations 119 * of this interface. 120 * @param reader the input source 121 * @param symParser the tokenizer which understands the sequence being read 122 * @param listener the listener to send sequence events to 123 * @param ns the namespace to read sequences into. 124 * @return true if there is more to read after this, false otherwise. 125 * @throws BioException in case of parsing errors. 126 * @throws IllegalSymbolException if the tokenizer couldn't understand one of the 127 * sequence symbols in the file. 128 * @throws IOException if there was a read error. 129 */ 130 public boolean readRichSequence(BufferedReader reader, SymbolTokenization symParser, 131 RichSeqIOListener listener, Namespace ns) throws BioException, IllegalSymbolException, IOException; 132 133 /** 134 * Writes a sequence out to the outputstream given by beginWriting() using the default format of the 135 * implementing class. If namespace is given, sequences will be written with that 136 * namespace, otherwise they will be written with the default namespace of the 137 * implementing class (which is usually the namespace of the sequence itself). 138 * If you pass this method a sequence which is not a RichSequence, it will attempt to 139 * convert it using RichSequence.Tools.enrich(). Obviously this is not going to guarantee 140 * a perfect conversion, so it's better if you just use RichSequences to start with! 141 * @param seq the sequence to write 142 * @param ns the namespace to write it with 143 * @throws IOException in case it couldn't write something 144 */ 145 public void writeSequence(Sequence seq, Namespace ns) throws IOException; 146 147 /** 148 * Retrive the current line width. Defaults to 80. 149 * @return the line width 150 */ 151 public int getLineWidth(); 152 153 /** 154 * Set the line width. When writing, the lines of sequence will never be longer than the line 155 * width. Defaults to 80. 156 * @param width the new line width 157 */ 158 public void setLineWidth(int width); 159 160 /** 161 * Use this method to toggle reading of sequence data. 162 * @param elideSymbols set to true if you <em>don't</em> want the sequence data. 163 */ 164 public void setElideSymbols(boolean elideSymbols); 165 166 /** 167 * Is the format going to emit events when sequence data is read? 168 * @return true if it is <em>not</em> otherwise false (false is default) . 169 */ 170 public boolean getElideSymbols(); 171 172 /** 173 * Use this method to toggle reading of feature data. 174 * @param elideFeatures set to true if you <em>don't</em> want the feature data. 175 */ 176 public void setElideFeatures(boolean elideFeatures); 177 178 /** 179 * Is the format going to emit events when feature data is read? 180 * @return true if it is <em>not</em> otherwise false (false is default). 181 */ 182 public boolean getElideFeatures(); 183 184 /** 185 * Use this method to toggle reading of bibliographic reference data. 186 * @param elideReferences set to true if you <em>don't</em> want the bibliographic reference data. 187 */ 188 public void setElideReferences(boolean elideReferences); 189 190 /** 191 * Is the format going to emit events when bibliographic reference data is read? 192 * @return true if it is <em>not</em> otherwise false (false is default) . 193 */ 194 public boolean getElideReferences(); 195 196 /** 197 * Use this method to toggle reading of comments data. Will also ignore remarks 198 * lines in bibliographic references. 199 * @param elideComments set to true if you <em>don't</em> want the comments data. 200 */ 201 public void setElideComments(boolean elideComments); 202 203 /** 204 * Is the format going to emit events when comments data or remarks from 205 * bibliographic references are read? 206 * @return true if it is <em>not</em> otherwise false (false is default). 207 */ 208 public boolean getElideComments(); 209 210 /** 211 * Provides a basic format with simple things like line-widths precoded. 212 */ 213 public abstract class BasicFormat implements RichSequenceFormat { 214 215 private int lineWidth = 80; 216 private boolean elideSymbols = false; 217 private boolean elideFeatures = false; 218 private boolean elideComments = false; 219 private boolean elideReferences = false; 220 private PrintStream os; 221 222 /** 223 * {@inheritDoc} 224 */ 225 public boolean canRead(File file) throws IOException { 226 return false; 227 } 228 229 /** 230 * {@inheritDoc} 231 */ 232 public SymbolTokenization guessSymbolTokenization(File file) throws IOException { 233 return RichSequence.IOTools.getDNAParser(); 234 } 235 236 /** 237 * {@inheritDoc} 238 */ 239 public int getLineWidth() { return this.lineWidth; } 240 241 /** 242 * {@inheritDoc} 243 */ 244 public void setLineWidth(int width) { 245 if (width<1) throw new IllegalArgumentException("Width cannot be less than 1"); 246 this.lineWidth = width; 247 } 248 249 /** 250 * {@inheritDoc} 251 */ 252 public boolean getElideSymbols() { return this.elideSymbols; } 253 254 /** 255 * {@inheritDoc} 256 */ 257 public void setElideSymbols(boolean elideSymbols) { this.elideSymbols = elideSymbols; } 258 259 /** 260 * {@inheritDoc} 261 */ 262 public boolean getElideFeatures() { return this.elideFeatures; } 263 264 /** 265 * {@inheritDoc} 266 */ 267 public void setElideFeatures(boolean elideFeatures) { this.elideFeatures = elideFeatures; } 268 269 /** 270 * {@inheritDoc} 271 */ 272 public boolean getElideReferences() { return this.elideReferences; } 273 274 /** 275 * {@inheritDoc} 276 */ 277 public void setElideReferences(boolean elideReferences) { this.elideReferences = elideReferences; } 278 279 /** 280 * {@inheritDoc} 281 */ 282 public boolean getElideComments() { return this.elideComments; } 283 284 /** 285 * {@inheritDoc} 286 */ 287 public void setElideComments(boolean elideComments) { this.elideComments = elideComments; } 288 289 /** 290 * {@inheritDoc} 291 */ 292 public void setPrintStream(PrintStream os) { 293 if (os==null) throw new IllegalArgumentException("Print stream cannot be null"); 294 this.os = os; 295 } 296 297 /** 298 * {@inheritDoc} 299 */ 300 public PrintStream getPrintStream() { return this.os; } 301 } 302 303 /** 304 * Provides the basic implementation required for simple header/footer-less files such as Genbank. 305 */ 306 public abstract class HeaderlessFormat extends BasicFormat { 307 /** 308 * {@inheritDoc} 309 */ 310 public void beginWriting() throws IOException {} 311 312 /** 313 * {@inheritDoc} 314 */ 315 public void finishWriting() throws IOException {} 316 } 317}