001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021 022package org.biojava.bio.seq.io; 023 024import java.io.BufferedReader; 025import java.io.IOException; 026import java.io.PrintStream; 027import java.io.Serializable; 028import java.util.Vector; 029 030import org.biojava.bio.seq.Sequence; 031import org.biojava.bio.symbol.IllegalSymbolException; 032import org.biojava.utils.ParseErrorEvent; 033import org.biojava.utils.ParseErrorListener; 034 035 036/** 037 * Format reader for GenBank files. Converted from the old style io to 038 * the new by working from <code>EmblLikeFormat</code>. 039 * 040 * @author Thomas Down 041 * @author Thad Welch 042 * Added GenBank header info to the sequence annotation. The ACCESSION header 043 * tag is not included. Stored in sequence.getName(). 044 * @author Greg Cox 045 * @author Keith James 046 * @author Matthew Pocock 047 * @author Ron Kuhn 048 * @deprecated Use org.biojavax.bio.seq.io.GenbankFormat 049 */ 050public class GenbankFormat 051 implements SequenceFormat, 052 Serializable, 053 org.biojava.utils.ParseErrorListener, 054 org.biojava.utils.ParseErrorSource { 055 public static final String DEFAULT = "GENBANK"; 056 057 protected static final String LOCUS_TAG = "LOCUS"; 058 protected static final String SIZE_TAG = "SIZE"; 059 protected static final String STRAND_NUMBER_TAG = "STRANDS"; 060 protected static final String TYPE_TAG = "TYPE"; 061 protected static final String CIRCULAR_TAG = "CIRCULAR"; 062 protected static final String DIVISION_TAG = "DIVISION"; 063 protected static final String DATE_TAG = "MDAT"; 064 065 protected static final String ACCESSION_TAG = "ACCESSION"; 066 protected static final String VERSION_TAG = "VERSION"; 067 protected static final String GI_TAG = "GI"; 068 protected static final String KEYWORDS_TAG = "KW"; 069 protected static final String DEFINITION_TAG = "DEFINITION"; 070 protected static final String SOURCE_TAG = "SOURCE"; 071 protected static final String ORGANISM_TAG = "ORGANISM"; 072 protected static final String REFERENCE_TAG = "REFERENCE"; 073 protected static final String COORDINATE_TAG = "COORDINATE"; 074 protected static final String REF_ACCESSION_TAG = ""; 075 protected static final String AUTHORS_TAG = "AUTHORS"; 076 protected static final String TITLE_TAG = "TITLE"; 077 protected static final String JOURNAL_TAG = "JOURNAL"; 078 protected static final String PUBMED_TAG = "PUBMED"; 079 protected static final String MEDLINE_TAG = "MEDLINE"; 080 protected static final String COMMENT_TAG = "COMMENT"; 081 protected static final String FEATURE_TAG = "FEATURES"; 082 protected static final String BASE_COUNT_TAG = "BASE"; 083 protected static final String FEATURE_FLAG = "FT"; 084 protected static final String START_SEQUENCE_TAG = "ORIGIN"; 085 protected static final String END_SEQUENCE_TAG = "//"; 086 087 protected static final String FEATURE_LINE_PREFIX = " "; 088 089 private Vector mListeners = new Vector(); 090 private boolean elideSymbols = false; 091 092 /** 093 * Reads a sequence from the specified reader using the Symbol 094 * parser and Sequence Factory provided. The sequence read in must 095 * be in Genbank format. 096 * 097 * @return boolean True if there is another sequence in the file; false 098 * otherwise 099 */ 100 public boolean readSequence(BufferedReader reader, 101 SymbolTokenization symParser, 102 SeqIOListener listener) 103 throws IllegalSymbolException, IOException, ParseException { 104 String line; 105 boolean hasAnotherSequence = true; 106 boolean hasInternalWhitespace = false; 107 108 GenbankContext ctx = new GenbankContext(symParser, listener); 109 ctx.addParseErrorListener(this); 110 ctx.setElideSymbols(this.getElideSymbols()); 111 112 listener.startSequence(); 113 114 while ((line = reader.readLine()) != null) { 115 if (line.startsWith(END_SEQUENCE_TAG)) { 116 // To close the StreamParser encapsulated in the 117 // GenbankContext object 118 ctx.processLine(line); 119 120 // Allows us to tolerate trailing whitespace without 121 // thinking that there is another Sequence to follow 122 while (true) { 123 reader.mark(1); 124 int c = reader.read(); 125 126 if (c == -1) { 127 hasAnotherSequence = false; 128 break; 129 } 130 131 if (Character.isWhitespace((char) c)) { 132 hasInternalWhitespace = true; 133 continue; 134 } 135 136 if (hasInternalWhitespace) 137 System.err.println("Warning: whitespace found between sequence entries"); 138 139 reader.reset(); 140 break; 141 } 142 143 listener.endSequence(); 144 return hasAnotherSequence; 145 } 146 ctx.processLine(line); 147 } 148 149 throw new IOException("Premature end of stream for GENBANK"); 150 } 151 152 public void writeSequence(Sequence seq, PrintStream os) 153 throws IOException { 154 writeSequence(seq, getDefaultFormat(), os); 155 } 156 157 /** 158 * <code>writeSequence</code> writes a sequence to the specified 159 * <code>PrintStream</code>, using the specified format. 160 * 161 * @param seq a <code>Sequence</code> to write out. 162 * @param format a <code>String</code> indicating which sub-format 163 * of those available from a particular 164 * <code>SequenceFormat</code> implemention to use when 165 * writing. 166 * @param os a <code>PrintStream</code> object. 167 * 168 * @exception IOException if an error occurs. 169 * @deprecated use writeSequence(Sequence seq, PrintStream os) 170 */ 171 public void writeSequence(Sequence seq, String format, PrintStream os) throws IOException { 172 SeqFileFormer former; 173 174 if (format.equalsIgnoreCase("GENBANK")) 175 former = new GenbankFileFormer(); 176 else if (format.equalsIgnoreCase("GENPEPT")) 177 former = new GenpeptFileFormer(); 178 else if (format.equalsIgnoreCase("REFSEQ:PROTEIN")) 179 former = new ProteinRefSeqFileFormer(); 180 else 181 throw new IllegalArgumentException("Unknown format '" 182 + format 183 + "'"); 184 former.setPrintStream(os); 185 186 SeqIOEventEmitter emitter = 187 new SeqIOEventEmitter(GenEmblPropertyComparator.INSTANCE, 188 GenEmblFeatureComparator.INSTANCE); 189 190 emitter.getSeqIOEvents(seq, former); 191 } 192 193 /** 194 * <code>getDefaultFormat</code> returns the String identifier for 195 * the default format. 196 * 197 * @return a <code>String</code>. 198 * @deprecated 199 */ 200 public String getDefaultFormat() { 201 return DEFAULT; 202 } 203 204 /** 205 * Adds a parse error listener to the list of listeners if it isn't already 206 * included. 207 * 208 * @param theListener Listener to be added. 209 */ 210 public synchronized void addParseErrorListener(ParseErrorListener theListener) { 211 if (mListeners.contains(theListener) == false) { 212 mListeners.addElement(theListener); 213 } 214 } 215 216 /** 217 * Removes a parse error listener from the list of listeners if it is 218 * included. 219 * 220 * @param theListener Listener to be removed. 221 */ 222 public synchronized void removeParseErrorListener( 223 ParseErrorListener theListener) { 224 if (mListeners.contains(theListener) == true) { 225 mListeners.removeElement(theListener); 226 } 227 } 228 229 /** 230 * This method determines the behaviour when a bad line is processed. 231 * Some options are to log the error, throw an exception, ignore it 232 * completely, or pass the event through. 233 * <P> 234 * This method should be overwritten when different behavior is desired. 235 * 236 * @param theEvent The event that contains the bad line and token. 237 */ 238 public void BadLineParsed(org.biojava.utils.ParseErrorEvent theEvent) { 239 notifyParseErrorEvent(theEvent); 240 } 241 242// Protected methods 243 /** 244 * Passes the event on to all the listeners registered for ParseErrorEvents. 245 * 246 * @param theEvent The event to be handed to the listeners. 247 */ 248 protected void notifyParseErrorEvent(ParseErrorEvent theEvent) { 249 Vector listeners; 250 synchronized(this) { 251 listeners = (Vector)mListeners.clone(); 252 } 253 254 int lnrCount = listeners.size(); 255 for (int index = 0; index < lnrCount; index++) { 256 ParseErrorListener client = (ParseErrorListener)listeners.elementAt(index); 257 client.BadLineParsed(theEvent); 258 } 259 } 260 261 public boolean getElideSymbols() { 262 return elideSymbols; 263 } 264 265 /** 266 * Use this method to toggle reading of sequence data. If you're only 267 * interested in header data set to true. 268 * @param elideSymbols set to true if you don't want the sequence data. 269 */ 270 public void setElideSymbols(boolean elideSymbols) { 271 this.elideSymbols = elideSymbols; 272 } 273} 274