001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021
022package org.biojavax.bio.seq.io;
023
024import java.io.BufferedInputStream;
025import java.io.BufferedReader;
026import java.io.File;
027import java.io.FileReader;
028import java.io.IOException;
029import java.io.InputStreamReader;
030import java.io.PrintStream;
031import java.util.Map;
032import java.util.regex.Matcher;
033import java.util.regex.Pattern;
034
035import org.biojava.bio.seq.Sequence;
036import org.biojava.bio.seq.io.ParseException;
037import org.biojava.bio.seq.io.SeqIOListener;
038import org.biojava.bio.seq.io.SymbolTokenization;
039import org.biojava.bio.symbol.IllegalSymbolException;
040import org.biojava.bio.symbol.SimpleSymbolList;
041import org.biojava.bio.symbol.Symbol;
042import org.biojava.bio.symbol.SymbolList;
043import org.biojava.utils.ChangeVetoException;
044import org.biojavax.Namespace;
045import org.biojavax.RichObjectFactory;
046import org.biojavax.SimpleNamespace;
047import org.biojavax.bio.seq.RichSequence;
048
049
050/**
051 * Format object representing FASTA files. These files are almost pure
052 * sequence data.
053 * @author Thomas Down
054 * @author Matthew Pocock
055 * @author Greg Cox
056 * @author Lukas Kall
057 * @author Richard Holland
058 * @author Mark Schreiber
059 * @author Carl Masak
060 * @since 1.5
061 */
062
063public class FastaFormat extends RichSequenceFormat.HeaderlessFormat {
064
065        // Register this format with the format auto-guesser.
066        static {
067                RichSequence.IOTools.registerFormat(FastaFormat.class);
068        }
069
070        /**
071         * The name of this format
072         */
073        public static final String FASTA_FORMAT = "FASTA";
074
075        // header line
076        protected static final Pattern hp = Pattern.compile(">\\s*(\\S+)(\\s+(.*))?");
077        // description chunk
078        protected static final Pattern dp = Pattern.compile( "^(gi\\|(\\d+)\\|)?(\\w+)\\|(\\w+?)(\\.(\\d+))?\\|(\\w+)?$");
079
080        protected static final Pattern readableFiles = Pattern.compile(".*(fa|fas)$");
081        protected static final Pattern aminoAcids = Pattern.compile("[FLIPQE]", Pattern.CASE_INSENSITIVE);
082
083        private FastaHeader header = new FastaHeader();
084
085        /**
086         * {@inheritDoc}
087         * A file is in FASTA format if the name ends with fa or fas, or the file starts with ">".
088         */
089        @Override
090        public boolean canRead(File file) throws IOException {
091                if (readableFiles.matcher(file.getName()).matches()) return true;
092                BufferedReader br = new BufferedReader(new FileReader(file));
093                String firstLine = br.readLine();
094                boolean readable = firstLine!=null && firstLine.startsWith(">");
095                br.close();
096                return readable;
097        }
098
099        /**
100         * {@inheritDoc}
101         * Returns an protein parser if the first line of sequence contains any of F/L/I/P/Q/E, 
102         * otherwise returns a DNA tokenizer.
103         */
104        @Override
105        public SymbolTokenization guessSymbolTokenization(File file) throws IOException {
106                BufferedReader br = new BufferedReader(new FileReader(file));
107                br.readLine(); // discard first line
108                boolean aa = aminoAcids.matcher(br.readLine()).find();
109                br.close();
110                if (aa) return RichSequence.IOTools.getProteinParser();
111                else return RichSequence.IOTools.getDNAParser();
112        }
113
114        /**
115         * {@inheritDoc}
116         * A stream is in FASTA format if the stream starts with ">".
117         */
118        public boolean canRead(BufferedInputStream stream) throws IOException {
119                stream.mark(2000); // some streams may not support this
120                BufferedReader br = new BufferedReader(new InputStreamReader(stream));
121                String firstLine = br.readLine();
122                boolean readable = firstLine!=null && firstLine.startsWith(">");
123                // don't close the reader as it'll close the stream too.
124                // br.close();
125                stream.reset();
126                return readable;
127        }
128
129        /**
130         * {@inheritDoc}
131         * Returns an protein parser if the first line of sequence contains any of F/L/I/P/Q/E, 
132         * otherwise returns a DNA tokenizer.
133         */
134        public SymbolTokenization guessSymbolTokenization(BufferedInputStream stream) throws IOException {
135                stream.mark(2000); // some streams may not support this
136                BufferedReader br = new BufferedReader(new InputStreamReader(stream));
137                br.readLine(); // discard first line
138                boolean aa = aminoAcids.matcher(br.readLine()).find();
139                // don't close the reader as it'll close the stream too.
140                // br.close();
141                stream.reset();
142                if (aa) return RichSequence.IOTools.getProteinParser();
143                else return RichSequence.IOTools.getDNAParser();
144        }
145
146        /**
147         * {@inheritDoc}
148         */
149        public boolean readSequence(
150                        BufferedReader reader,
151                        SymbolTokenization symParser,
152                        SeqIOListener listener
153        )       throws
154        IllegalSymbolException,
155        IOException,
156        ParseException {
157                if (!(listener instanceof RichSeqIOListener)) throw new IllegalArgumentException("Only accepting RichSeqIOListeners today");
158                return this.readRichSequence(reader,symParser,(RichSeqIOListener)listener,null);
159        }
160
161        /**
162         * {@inheritDoc}
163         * If namespace is null, then the namespace of the sequence in the fasta is used.
164         * If the namespace is null and so is the namespace of the sequence in the fasta,
165         * then the default namespace is used.
166         */
167        public boolean readRichSequence(
168                        BufferedReader reader,
169                        SymbolTokenization symParser,
170                        RichSeqIOListener rsiol,
171                        Namespace ns
172        )       throws
173        IllegalSymbolException,
174        IOException,
175        ParseException {
176
177                String line = reader.readLine();
178                if (line == null) {
179                        throw new IOException("Premature stream end");
180                }
181                while(line.length() == 0) {
182                        line = reader.readLine();
183                        if (line == null) {
184                                throw new IOException("Premature stream end");
185                        }
186                }
187                if (!line.startsWith(">")) {
188                        throw new IOException("Stream does not appear to contain FASTA formatted data: " + line);
189                }
190
191                rsiol.startSequence();
192
193                processHeader(line,rsiol,ns);
194
195                StringBuffer seq = new StringBuffer();
196                boolean hasMoreSeq = true;
197                while (hasMoreSeq) {
198                        reader.mark(500);
199                        line = reader.readLine();
200                        if (line!=null) {
201                                line = line.trim();
202                                if (line.length() > 0 && line.charAt(0)=='>') {
203                                        reader.reset();
204                                        hasMoreSeq = false;
205                                } else {
206                                        seq.append(line);
207                                }
208                        } else {
209                                hasMoreSeq = false;
210                        }
211                }
212                if (!this.getElideSymbols()) {
213                        try {
214                                SymbolList sl = new SimpleSymbolList(symParser,
215                                                seq.toString().replaceAll("\\s+","").replaceAll("[\\.|~]","-"));
216                                rsiol.addSymbols(symParser.getAlphabet(),
217                                                (Symbol[])(sl.toList().toArray(new Symbol[0])),
218                                                0, sl.length());
219                        } catch (Exception e) {
220                                // do not know name and gi any longer, replace them with empty string.
221                                // why does the rsiol only have setter methods, but not getter???
222                                String message = ParseException.newMessage(this.getClass(), "", "", "problem parsing symbols", seq.toString());
223                                throw new ParseException(e, message);
224                        }
225                }
226
227                rsiol.endSequence();
228
229                return line!=null;
230        }
231
232        /** Parse the Header information from the Fasta Description line
233         * 
234         * @param line
235         * @param rsiol
236         * @param ns
237         * @throws IOException
238         * @throws ParseException
239         */
240        public void processHeader(String line,RichSeqIOListener rsiol,Namespace ns) 
241        throws IOException, ParseException {
242                Matcher m = hp.matcher(line);
243                if (!m.matches()) {
244                        throw new IOException("Stream does not appear to contain FASTA formatted data: " + line);
245                }
246
247                String name = m.group(1);
248                String desc = m.group(3);
249                String gi = null;
250
251                m = dp.matcher(name);
252                if (m.matches()) {
253                        gi = m.group(2);
254                        String namespace = m.group(3);
255                        String accession = m.group(4);
256                        String verString = m.group(6);
257                        int version = verString==null?0:Integer.parseInt(verString);
258                        name = m.group(7);
259                        if (name==null) name=accession;
260
261                        rsiol.setAccession(accession);
262                        rsiol.setVersion(version);
263                        if (gi!=null) rsiol.setIdentifier(gi);
264                        if (ns==null) rsiol.setNamespace((Namespace)RichObjectFactory.getObject(SimpleNamespace.class,new Object[]{namespace}));
265                        else rsiol.setNamespace(ns);
266                } else {
267                        rsiol.setAccession(name);
268                        rsiol.setNamespace((ns==null?RichObjectFactory.getDefaultNamespace():ns));
269                }
270                rsiol.setName(name);
271                if (!this.getElideComments()) rsiol.setDescription(desc);
272
273        }
274
275        /**
276         * {@inheritDoc}
277         */
278        public void     writeSequence(Sequence seq, PrintStream os) throws IOException {
279                if (this.getPrintStream()==null) this.setPrintStream(os);
280                this.writeSequence(seq, RichObjectFactory.getDefaultNamespace());
281        }
282
283        /**
284         * {@inheritDoc}
285         */
286        public void writeSequence(Sequence seq, String format, PrintStream os) throws IOException {
287                if (this.getPrintStream()==null) this.setPrintStream(os);
288                if (!format.equals(this.getDefaultFormat())) throw new IllegalArgumentException("Unknown format: "+format);
289                this.writeSequence(seq, RichObjectFactory.getDefaultNamespace());
290        }
291
292
293        /**
294         * {@inheritDoc}
295         * If namespace is null, then the sequence's own namespace is used.
296         */
297        public void writeSequence(Sequence seq, Namespace ns) throws IOException {
298                RichSequence rs;
299                try {
300                        if (seq instanceof RichSequence) rs = (RichSequence)seq;
301                        else rs = RichSequence.Tools.enrich(seq);
302                } catch (ChangeVetoException e) {
303                        IOException e2 = new IOException("Unable to enrich sequence");
304                        e2.initCause(e);
305                        throw e2;
306                }
307
308                StringBuilder sb = new StringBuilder();
309                sb.append(">");
310
311                String identifier = rs.getIdentifier();
312                if (header.isShowIdentifier() && identifier!=null && !"".equals(identifier)) {
313                        sb.append("gi|");
314                        sb.append(identifier);
315                        sb.append("|");
316                }
317                if(header.isShowNamespace()){
318                        sb.append((ns==null?rs.getNamespace().getName():ns.getName()));
319                        sb.append("|");
320                }
321                if(header.isShowAccession()){
322                        sb.append(rs.getAccession());
323                        if(header.isShowVersion()){
324                                sb.append(".");
325                        }
326                }
327                if(header.isShowVersion()){
328                        sb.append(rs.getVersion());
329                        sb.append("|");
330                }
331                if(header.isShowName()){
332                        sb.append(rs.getName());
333                        sb.append(" ");
334                }else{
335                        sb.append(" "); //in case the show the description there needs to be space
336                }
337                if(header.isShowDescription()){
338                        String desc = rs.getDescription();
339                        if (desc!=null && !"".equals(desc)) sb.append(desc.replaceAll("\\n"," "));
340                }
341                if(sb.charAt(sb.length() -1) == '|'){
342                        sb.deleteCharAt(sb.length() -1);
343                }
344                this.getPrintStream().print(sb.toString());
345                this.getPrintStream().println();
346
347                int length = rs.length();
348
349                for (int pos = 1; pos <= length; pos += this.getLineWidth()) {
350                        int end = Math.min(pos + this.getLineWidth() - 1, length);
351                        this.getPrintStream().println(rs.subStr(pos, end));
352                }
353        }
354
355        /**
356         * {@inheritDoc}
357         */
358        public String getDefaultFormat() {
359                return FASTA_FORMAT;
360        }
361
362        public FastaHeader getHeader() {
363                return header;
364        }
365
366        public void setHeader(FastaHeader header) {
367                this.header = header;
368        }
369}