001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021
022package org.biojava.bio.seq.io;
023
024import java.io.BufferedReader;
025import java.io.IOException;
026import java.io.PrintStream;
027import java.io.Serializable;
028import java.util.ArrayList;
029import java.util.Vector;
030
031import org.biojava.bio.seq.Sequence;
032import org.biojava.bio.symbol.IllegalSymbolException;
033import org.biojava.utils.ChangeVetoException;
034import org.biojava.utils.ParseErrorEvent;
035import org.biojava.utils.ParseErrorListener;
036import org.biojava.utils.ParseErrorSource;
037
038/**
039 * <p>
040 * Format processor for handling EMBL records and similar files.  This
041 * takes a very simple approach: all `normal' attribute lines are
042 * passed to the listener as a tag (first two characters) and a value
043 * (the rest of the line from the 6th character onwards).  Any data
044 * between the special `SQ' line and the "//" entry terminator is
045 * passed as a SymbolReader.
046 * </p>
047 *
048 * <p>
049 * This low-level format processor should normally be used in
050 * conjunction with one or more `filter' objects, such as
051 * EmblProcessor.
052 * </p>
053 *
054 * <p>
055 * Many ideas borrowed from the old EmblFormat processor by Thomas
056 * Down and Thad Welch.
057 * </p>
058 *
059 * @author Thomas Down
060 * @author Greg Cox
061 * @author Keith James
062 * @author Len Trigg
063 * @author Lorna Morris
064 * @since 1.1
065 * @deprecated Use org.biojavax.bio.seq.io.EMBLFormat instead
066 */
067
068public class EmblLikeFormat implements
069    SequenceFormat,
070    Serializable,
071    ParseErrorSource,
072    ParseErrorListener {
073  public static final String DEFAULT = "EMBL";
074
075  protected static final String ID_TAG = "ID";
076  protected static final String SIZE_TAG = "SIZE";
077  protected static final String STRAND_NUMBER_TAG = "STRANDS";
078  protected static final String TYPE_TAG = "TYPE";
079  protected static final String CIRCULAR_TAG = "CIRCULAR";
080  protected static final String DIVISION_TAG = "DIVISION";
081  protected static final String DR_TAG = "DR"; //Lorna: new tag
082
083  protected static final String ACCESSION_TAG = "AC";
084  protected static final String VERSION_TAG = "SV";
085  protected static final String DATE_TAG = "DT";
086  protected static final String DEFINITION_TAG = "DE";
087  protected static final String KEYWORDS_TAG = "KW";
088  protected static final String SOURCE_TAG = "OS";
089  protected static final String ORGANISM_TAG = "OC";
090  protected static final String ORGANISM_XREF_TAG = "OX";
091  protected static final String REFERENCE_TAG = "RN";
092  protected static final String COORDINATE_TAG = "RP";
093  protected static final String REF_ACCESSION_TAG = "RX";
094  protected static final String AUTHORS_TAG = "RA";
095  protected static final String REF_XREF_TAG = "RX";
096  protected static final String TITLE_TAG = "RT";
097  protected static final String JOURNAL_TAG = "RL";
098  protected static final String COMMENT_TAG = "CC";
099  protected static final String FEATURE_TAG = "FH";
100  protected static final String SEPARATOR_TAG = "XX";
101  protected static final String FEATURE_TABLE_TAG = "FT";
102  protected static final String START_SEQUENCE_TAG = "SQ";
103  protected static final String END_SEQUENCE_TAG = "//";
104
105  private boolean elideSymbols = false;
106  private Vector mListeners = new Vector();
107
108  /**
109   * <p>Specifies whether the symbols (SQ) part of the entry should
110   * be ignored. If this property is set to <code>true</code>, the
111   * parser will never call addSymbols on the
112   * <code>SeqIOListener</code>, but parsing will be faster if
113   * you're only interested in header information.</p>
114   *
115   * <p> This property also allows the header to be parsed for files
116   * which have invalid sequence data.</p>
117   */
118  public void setElideSymbols(boolean b) {
119    elideSymbols = b;
120  }
121
122  /**
123   * Return a flag indicating if symbol data will be skipped
124   * when parsing streams.
125   */
126  public boolean getElideSymbols() {
127    return elideSymbols;
128  }
129
130  public boolean readSequence(BufferedReader reader,
131                              SymbolTokenization symParser,
132                              SeqIOListener listener) throws
133      IllegalSymbolException, IOException, ParseException {
134
135    //EmblReferenceProperty reference = null; //lorna
136
137    if (listener instanceof ParseErrorSource) {
138      ( (ParseErrorSource) (listener)).addParseErrorListener(this);
139    }
140
141    String line;
142    StreamParser sparser = null;
143    boolean hasMoreSequence = true;
144    boolean hasInternalWhitespace = false;
145
146    listener.startSequence();
147
148    while ( (line = reader.readLine()) != null) {
149      if (line.startsWith(END_SEQUENCE_TAG)) {
150        if (sparser != null) {
151          // End of symbol data
152          sparser.close();
153          sparser = null;
154        }
155
156        // Allows us to tolerate trailing whitespace without
157        // thinking that there is another Sequence to follow
158        while (true) {
159          reader.mark(1);
160          int c = reader.read();
161
162          if (c == -1) {
163            hasMoreSequence = false;
164            break;
165          }
166
167          if (Character.isWhitespace( (char) c)) {
168            hasInternalWhitespace = true;
169            continue;
170          }
171
172          if (hasInternalWhitespace)
173            System.err.println(
174                "Warning: whitespace found between sequence entries");
175
176          reader.reset();
177          break;
178        }
179
180        listener.endSequence();
181        return hasMoreSequence;
182      }
183      else if (line.startsWith(START_SEQUENCE_TAG)) {
184        // Adding a null property to flush the last feature;
185        // Needed for Swissprot files because there is no gap
186        // between the feature table and the sequence data
187        listener.addSequenceProperty(SEPARATOR_TAG, "");
188
189        sparser = symParser.parseStream(listener);
190      }
191      else {
192        if (sparser == null) {
193          // Normal attribute line
194          String tag = line.substring(0, 2);
195          String rest = null;
196          if (line.length() > 5) {
197            rest = line.substring(5);
198          }
199
200          if (tag.equals(REFERENCE_TAG)) { //only 1 reference_tag!
201
202            try {
203              //lorna added, tags read in order, when a complete set goes through,
204              //spit out a single annotation event
205              ReferenceAnnotation refAnnot = new ReferenceAnnotation();
206
207              refAnnot.setProperty(tag, rest);
208              while (! (tag.equals(SEPARATOR_TAG))) {
209                // Normal attribute line
210
211                line = reader.readLine();
212
213                tag = line.substring(0, 2);
214
215                if (line.length() > 5) {
216                  rest = line.substring(5);
217                }
218                else {
219                  rest = null; //for XX lines
220                }
221
222                if (refAnnot.containsProperty(tag)) {
223
224                  Object property = refAnnot.getProperty(tag);
225                  ArrayList properties;
226
227                  if (property instanceof String) {
228                    properties = new ArrayList();
229                    properties.add(property);
230                    properties.add(rest);
231                    refAnnot.setProperty(tag, properties);
232                  }
233                  if (property instanceof ArrayList) {
234                    ( (ArrayList) property).add(rest);
235                  }
236                }
237                else {
238                  refAnnot.setProperty(tag, rest);
239                }
240                //mark_s: required for parsing swissprot
241                //fixme: it is actually possible to have more than one JOURNAL_TAG
242                //so should really only break after the last one.
243                if(tag.equals(JOURNAL_TAG))
244                  break;
245              }
246              listener.addSequenceProperty(ReferenceAnnotation.class, refAnnot);
247
248            } catch (ChangeVetoException cve) {
249              cve.printStackTrace();
250            }
251
252          }
253          // lorna, end
254          else { //lorna
255            listener.addSequenceProperty(tag, rest);
256          } //lorna
257        }
258        else {
259          // Sequence line
260          if (!elideSymbols)
261            processSequenceLine(line, sparser);
262        }
263      }
264    }
265
266    if (sparser != null)
267      sparser.close();
268
269    throw new IOException(
270        "Premature end of stream or missing end tag '//' for EMBL");
271  }
272
273  /**
274   * Dispatch symbol data from SQ-block line of an EMBL-like file.
275   */
276  protected void processSequenceLine(String line, StreamParser parser) throws
277      IllegalSymbolException, ParseException {
278    char[] cline = line.toCharArray();
279    int parseStart = 0;
280    int parseEnd = 0;
281
282    while (parseStart < cline.length) {
283      while (parseStart < cline.length && cline[parseStart] == ' ')
284        ++
285          parseStart;
286      if (parseStart >= cline.length)
287        break;
288
289      if (Character.isDigit(cline[parseStart]))
290        return;
291
292      parseEnd = parseStart + 1;
293      while (parseEnd < cline.length && cline[parseEnd] != ' ') {
294        if (cline[parseEnd] == '.' || cline[parseEnd] == '~') {
295          cline[parseEnd] = '-';
296        }
297        ++parseEnd;
298      }
299
300      // Got a segment of read sequence data
301      parser.characters(cline, parseStart, parseEnd - parseStart);
302
303      parseStart = parseEnd;
304    }
305  }
306
307  public void writeSequence(Sequence seq, PrintStream os) throws IOException {
308    writeSequence(seq, getDefaultFormat(), os);
309  }
310
311  /**
312   * <code>writeSequence</code> writes a sequence to the specified
313   * <code>PrintStream</code>, using the specified format.
314   *
315   * @param seq a <code>Sequence</code> to write out.
316   * @param format a <code>String</code> indicating which sub-format
317   * of those available from a particular
318   * <code>SequenceFormat</code> implemention to use when
319   * writing.
320   * @param os a <code>PrintStream</code> object.
321   *
322   * @exception IOException if an error occurs.
323   * @deprecated use writeSequence(Sequence seq, PrintStream os)
324   */
325  public void writeSequence(Sequence seq, String format, PrintStream os) throws
326      IOException {
327    SeqFileFormer former;
328
329    if (format.equalsIgnoreCase("EMBL"))
330      former = new EmblFileFormer();
331    else if (format.equalsIgnoreCase("SWISSPROT"))
332      former = new SwissprotFileFormer();
333    else
334      throw new IllegalArgumentException("Unknown format '"
335                                         + format
336                                         + "'");
337    former.setPrintStream(os);
338
339    SeqIOEventEmitter emitter =
340        new SeqIOEventEmitter(GenEmblPropertyComparator.INSTANCE,
341                              GenEmblFeatureComparator.INSTANCE);
342
343    emitter.getSeqIOEvents(seq, former);
344  }
345
346  /**
347   * <code>getDefaultFormat</code> returns the String identifier for
348   * the default format written by a <code>SequenceFormat</code>
349   * implementation.
350   *
351   * @return a <code>String</code>.
352   * @deprecated
353   */
354  public String getDefaultFormat() {
355    return DEFAULT;
356  }
357
358  /**
359   * <p>
360   * This method determines the behaviour when a bad line is processed.
361   * Some options are to log the error, throw an exception, ignore it
362   * completely, or pass the event through.
363   * </p>
364   *
365   * <p>
366   * This method should be overwritten when different behavior is desired.
367   * </p>
368   *
369   * @param theEvent The event that contains the bad line and token.
370   */
371  public void BadLineParsed(ParseErrorEvent theEvent) {
372    notifyParseErrorEvent(theEvent);
373  }
374
375  /**
376   * Adds a parse error listener to the list of listeners if it isn't already
377   * included.
378   *
379   * @param theListener Listener to be added.
380   */
381  public synchronized void addParseErrorListener(ParseErrorListener theListener) {
382    if (mListeners.contains(theListener) == false) {
383      mListeners.addElement(theListener);
384    }
385  }
386
387  /**
388   * Removes a parse error listener from the list of listeners if it is
389   * included.
390   *
391   * @param theListener Listener to be removed.
392   */
393  public synchronized void removeParseErrorListener(ParseErrorListener
394      theListener) {
395    if (mListeners.contains(theListener) == true) {
396      mListeners.removeElement(theListener);
397    }
398  }
399
400  // Protected methods
401  /**
402   * Passes the event on to all the listeners registered for ParseErrorEvents.
403   *
404   * @param theEvent The event to be handed to the listeners.
405   */
406  protected void notifyParseErrorEvent(ParseErrorEvent theEvent) {
407    Vector listeners;
408    synchronized (this) {
409      listeners = (Vector) mListeners.clone();
410    }
411
412    for (int index = 0; index < listeners.size(); index++) {
413      ParseErrorListener client = (ParseErrorListener) listeners.elementAt(
414          index);
415      client.BadLineParsed(theEvent);
416    }
417  }
418}