001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021
022package org.biojava.bio.seq.io;
023
024import java.io.BufferedReader;
025import java.io.IOException;
026import java.io.PrintStream;
027import java.io.Serializable;
028import java.util.Vector;
029
030import org.biojava.bio.seq.Sequence;
031import org.biojava.bio.symbol.IllegalSymbolException;
032import org.biojava.utils.ParseErrorEvent;
033import org.biojava.utils.ParseErrorListener;
034
035
036/**
037 * Format reader for GenBank files. Converted from the old style io to
038 * the new by working from <code>EmblLikeFormat</code>.
039 *
040 * @author Thomas Down
041 * @author Thad Welch
042 * Added GenBank header info to the sequence annotation. The ACCESSION header
043 * tag is not included. Stored in sequence.getName().
044 * @author Greg Cox
045 * @author Keith James
046 * @author Matthew Pocock
047 * @author Ron Kuhn
048 * @deprecated Use org.biojavax.bio.seq.io.GenbankFormat
049 */
050public class GenbankFormat
051        implements SequenceFormat,
052        Serializable,
053        org.biojava.utils.ParseErrorListener,
054        org.biojava.utils.ParseErrorSource {
055    public static final String DEFAULT = "GENBANK";
056    
057    protected static final String LOCUS_TAG = "LOCUS";
058    protected static final String SIZE_TAG = "SIZE";
059    protected static final String STRAND_NUMBER_TAG = "STRANDS";
060    protected static final String TYPE_TAG = "TYPE";
061    protected static final String CIRCULAR_TAG = "CIRCULAR";
062    protected static final String DIVISION_TAG = "DIVISION";
063    protected static final String DATE_TAG = "MDAT";
064    
065    protected static final String ACCESSION_TAG = "ACCESSION";
066    protected static final String VERSION_TAG = "VERSION";
067    protected static final String GI_TAG = "GI";
068    protected static final String KEYWORDS_TAG = "KW";
069    protected static final String DEFINITION_TAG = "DEFINITION";
070    protected static final String SOURCE_TAG = "SOURCE";
071    protected static final String ORGANISM_TAG = "ORGANISM";
072    protected static final String REFERENCE_TAG = "REFERENCE";
073    protected static final String COORDINATE_TAG = "COORDINATE";
074    protected static final String REF_ACCESSION_TAG = "";
075    protected static final String AUTHORS_TAG = "AUTHORS";
076    protected static final String TITLE_TAG = "TITLE";
077    protected static final String JOURNAL_TAG = "JOURNAL";
078    protected static final String PUBMED_TAG = "PUBMED";
079    protected static final String MEDLINE_TAG = "MEDLINE";
080    protected static final String COMMENT_TAG = "COMMENT";
081    protected static final String FEATURE_TAG = "FEATURES";
082    protected static final String BASE_COUNT_TAG = "BASE";
083    protected static final String FEATURE_FLAG = "FT";
084    protected static final String START_SEQUENCE_TAG = "ORIGIN";
085    protected static final String END_SEQUENCE_TAG = "//";
086    
087    protected static final String FEATURE_LINE_PREFIX = "     ";
088    
089    private Vector mListeners = new Vector();
090    private boolean elideSymbols = false;
091    
092    /**
093     * Reads a sequence from the specified reader using the Symbol
094     * parser and Sequence Factory provided. The sequence read in must
095     * be in Genbank format.
096     *
097     * @return boolean True if there is another sequence in the file; false
098     * otherwise
099     */
100    public boolean readSequence(BufferedReader reader,
101            SymbolTokenization symParser,
102            SeqIOListener listener)
103            throws IllegalSymbolException, IOException, ParseException {
104        String line;
105        boolean hasAnotherSequence    = true;
106        boolean hasInternalWhitespace = false;
107        
108        GenbankContext ctx = new GenbankContext(symParser, listener);
109        ctx.addParseErrorListener(this);
110        ctx.setElideSymbols(this.getElideSymbols());
111        
112        listener.startSequence();
113        
114        while ((line = reader.readLine()) != null) {
115            if (line.startsWith(END_SEQUENCE_TAG)) {
116                // To close the StreamParser encapsulated in the
117                // GenbankContext object
118                ctx.processLine(line);
119                
120                // Allows us to tolerate trailing whitespace without
121                // thinking that there is another Sequence to follow
122                while (true) {
123                    reader.mark(1);
124                    int c = reader.read();
125                    
126                    if (c == -1) {
127                        hasAnotherSequence = false;
128                        break;
129                    }
130                    
131                    if (Character.isWhitespace((char) c)) {
132                        hasInternalWhitespace = true;
133                        continue;
134                    }
135                    
136                    if (hasInternalWhitespace)
137                        System.err.println("Warning: whitespace found between sequence entries");
138                    
139                    reader.reset();
140                    break;
141                }
142                
143                listener.endSequence();
144                return hasAnotherSequence;
145            }
146            ctx.processLine(line);
147        }
148        
149        throw new IOException("Premature end of stream for GENBANK");
150    }
151    
152    public void writeSequence(Sequence seq, PrintStream os)
153    throws IOException {
154        writeSequence(seq, getDefaultFormat(), os);
155    }
156    
157    /**
158     * <code>writeSequence</code> writes a sequence to the specified
159     * <code>PrintStream</code>, using the specified format.
160     *
161     * @param seq a <code>Sequence</code> to write out.
162     * @param format a <code>String</code> indicating which sub-format
163     * of those available from a particular
164     * <code>SequenceFormat</code> implemention to use when
165     * writing.
166     * @param os a <code>PrintStream</code> object.
167     *
168     * @exception IOException if an error occurs.
169     * @deprecated use writeSequence(Sequence seq, PrintStream os)
170     */
171    public void writeSequence(Sequence seq, String format, PrintStream os) throws IOException {
172        SeqFileFormer former;
173        
174        if (format.equalsIgnoreCase("GENBANK"))
175            former = new GenbankFileFormer();
176        else if (format.equalsIgnoreCase("GENPEPT"))
177            former = new GenpeptFileFormer();
178        else if (format.equalsIgnoreCase("REFSEQ:PROTEIN"))
179            former = new ProteinRefSeqFileFormer();
180        else
181            throw new IllegalArgumentException("Unknown format '"
182                    + format
183                    + "'");
184        former.setPrintStream(os);
185        
186        SeqIOEventEmitter emitter =
187                new SeqIOEventEmitter(GenEmblPropertyComparator.INSTANCE,
188                GenEmblFeatureComparator.INSTANCE);
189        
190        emitter.getSeqIOEvents(seq, former);
191    }
192    
193    /**
194     * <code>getDefaultFormat</code> returns the String identifier for
195     * the default format.
196     *
197     * @return a <code>String</code>.
198     * @deprecated
199     */
200    public String getDefaultFormat() {
201        return DEFAULT;
202    }
203    
204    /**
205     * Adds a parse error listener to the list of listeners if it isn't already
206     * included.
207     *
208     * @param theListener Listener to be added.
209     */
210    public synchronized void addParseErrorListener(ParseErrorListener theListener) {
211        if (mListeners.contains(theListener) == false) {
212            mListeners.addElement(theListener);
213        }
214    }
215    
216    /**
217     * Removes a parse error listener from the list of listeners if it is
218     * included.
219     *
220     * @param theListener Listener to be removed.
221     */
222    public synchronized void removeParseErrorListener(
223            ParseErrorListener theListener) {
224        if (mListeners.contains(theListener) == true) {
225            mListeners.removeElement(theListener);
226        }
227    }
228    
229    /**
230     * This method determines the behaviour when a bad line is processed.
231     * Some options are to log the error, throw an exception, ignore it
232     * completely, or pass the event through.
233     * <P>
234     * This method should be overwritten when different behavior is desired.
235     *
236     * @param theEvent The event that contains the bad line and token.
237     */
238    public void BadLineParsed(org.biojava.utils.ParseErrorEvent theEvent) {
239        notifyParseErrorEvent(theEvent);
240    }
241    
242// Protected methods
243    /**
244     * Passes the event on to all the listeners registered for ParseErrorEvents.
245     *
246     * @param theEvent The event to be handed to the listeners.
247     */
248    protected void notifyParseErrorEvent(ParseErrorEvent theEvent) {
249        Vector listeners;
250        synchronized(this) {
251            listeners = (Vector)mListeners.clone();
252        }
253        
254        int lnrCount = listeners.size();
255        for (int index = 0; index < lnrCount; index++) {
256            ParseErrorListener client = (ParseErrorListener)listeners.elementAt(index);
257            client.BadLineParsed(theEvent);
258        }
259    }
260    
261    public boolean getElideSymbols() {
262        return elideSymbols;
263    }
264    
265    /**
266     * Use this method to toggle reading of sequence data. If you're only
267     * interested in header data set to true.
268     * @param elideSymbols set to true if you don't want the sequence data.
269     */
270    public void setElideSymbols(boolean elideSymbols) {
271        this.elideSymbols = elideSymbols;
272    }
273}
274