001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 * Created on 01-21-2010
021 */
022package org.biojava.nbio.core.sequence.io;
023
024import org.biojava.nbio.core.sequence.DNASequence;
025import org.biojava.nbio.core.sequence.ProteinSequence;
026import org.biojava.nbio.core.sequence.RNASequence;
027import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
028import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet;
029import org.biojava.nbio.core.sequence.compound.DNACompoundSet;
030import org.biojava.nbio.core.sequence.compound.NucleotideCompound;
031import org.biojava.nbio.core.sequence.compound.RNACompoundSet;
032
033import java.io.File;
034import java.io.FileInputStream;
035import java.io.IOException;
036import java.io.InputStream;
037import java.util.LinkedHashMap;
038
039/**
040 *
041 * @author Scooter Willis <willishf at gmail dot com>
042 */
043public class FastaReaderHelper {
044
045        /**
046         * Selecting lazySequenceLoad=true will parse the FASTA file and figure out the accessionid and offsets and return sequence objects
047         * that can in the future read the sequence from the disk. This allows the loading of large fasta files where you are only interested
048         * in one sequence based on accession id.
049         * @param file
050         * @param lazySequenceLoad
051         * @return
052         * @throws IOException
053         */
054        public static LinkedHashMap<String, DNASequence> readFastaDNASequence(File file, boolean lazySequenceLoad) throws IOException {
055                if (!lazySequenceLoad) {
056                        return readFastaDNASequence(file);
057                }
058
059                FastaReader<DNASequence, NucleotideCompound> fastaProxyReader =
060                                new FastaReader<DNASequence, NucleotideCompound>(
061                                                file,
062                                                new GenericFastaHeaderParser<DNASequence, NucleotideCompound>(),
063                                                new FileProxyDNASequenceCreator(
064                                                                file,
065                                                                DNACompoundSet.getDNACompoundSet(),
066                                                                new FastaSequenceParser()
067                                                        )
068                                        );
069                return fastaProxyReader.process();
070
071        }
072
073        /**
074         * Selecting lazySequenceLoad=true will parse the FASTA file and figure out the accessionid and offsets and return sequence objects
075         * that can in the future read the sequence from the disk. This allows the loading of large fasta files where you are only interested
076         * in one sequence based on accession id.
077         * @param file
078         * @param lazySequenceLoad
079         * @return
080         * @throws IOException
081         */
082        public static LinkedHashMap<String, RNASequence> readFastaRNASequence(File file, boolean lazySequenceLoad) throws IOException {
083                if (!lazySequenceLoad) {
084                        return readFastaRNASequence(file);
085                }
086
087                FastaReader<RNASequence, NucleotideCompound> fastaProxyReader =
088                                new FastaReader<RNASequence, NucleotideCompound>(
089                                                file,
090                                                new GenericFastaHeaderParser<RNASequence, NucleotideCompound>(),
091                                                new FileProxyRNASequenceCreator(
092                                                                file,
093                                                                RNACompoundSet.getRNACompoundSet(),
094                                                                new FastaSequenceParser()
095                                                        )
096                                        );
097                return fastaProxyReader.process();
098
099        }
100
101        /**
102         * Read a fasta file containing amino acids with setup that would handle most
103         * cases.
104         *
105         * @param file
106         * @return
107         * @throws IOException
108         */
109        public static LinkedHashMap<String, ProteinSequence> readFastaProteinSequence(
110                        File file) throws IOException {
111                FileInputStream inStream = new FileInputStream(file);
112                LinkedHashMap<String, ProteinSequence> proteinSequences = readFastaProteinSequence(inStream);
113                inStream.close();
114                return proteinSequences;
115        }
116
117        /**
118         * Read a fasta file containing amino acids with setup that would handle most
119         * cases. User is responsible for closing InputStream because you opened it
120         *
121         * @param inStream
122         * @return
123         * @throws IOException
124         */
125        public static LinkedHashMap<String, ProteinSequence> readFastaProteinSequence(
126                        InputStream inStream) throws IOException {
127                FastaReader<ProteinSequence, AminoAcidCompound> fastaReader = new FastaReader<ProteinSequence, AminoAcidCompound>(
128                                inStream,
129                                new GenericFastaHeaderParser<ProteinSequence, AminoAcidCompound>(),
130                                new ProteinSequenceCreator(AminoAcidCompoundSet.getAminoAcidCompoundSet()));
131                return fastaReader.process();
132        }
133
134        /**
135         * Read a fasta DNA sequence
136         * @param inStream
137         * @return
138         * @throws IOException
139         */
140        public static LinkedHashMap<String, DNASequence> readFastaDNASequence(
141                        InputStream inStream) throws IOException {
142                FastaReader<DNASequence, NucleotideCompound> fastaReader = new FastaReader<DNASequence, NucleotideCompound>(
143                                inStream,
144                                new GenericFastaHeaderParser<DNASequence, NucleotideCompound>(),
145                                new DNASequenceCreator(DNACompoundSet.getDNACompoundSet()));
146                return fastaReader.process();
147        }
148
149        /**
150         *
151         * @param file
152         * @return
153         * @throws IOException
154         */
155        public static LinkedHashMap<String, DNASequence> readFastaDNASequence(
156                        File file) throws IOException {
157                FileInputStream inStream = new FileInputStream(file);
158                LinkedHashMap<String, DNASequence> dnaSequences = readFastaDNASequence(inStream);
159                inStream.close();
160                return dnaSequences;
161        }
162
163        /**
164         * Read a fasta RNA sequence
165         * @param inStream
166         * @return
167         * @throws IOException
168         */
169        public static LinkedHashMap<String, RNASequence> readFastaRNASequence(
170                        InputStream inStream) throws IOException {
171                FastaReader<RNASequence, NucleotideCompound> fastaReader = new FastaReader<RNASequence, NucleotideCompound>(
172                                inStream,
173                                new GenericFastaHeaderParser<RNASequence, NucleotideCompound>(),
174                                new RNASequenceCreator(RNACompoundSet.getRNACompoundSet()));
175                return fastaReader.process();
176        }
177
178        /**
179         *
180         * @param file
181         * @return
182         * @throws IOException
183         */
184        public static LinkedHashMap<String, RNASequence> readFastaRNASequence(
185                        File file) throws IOException {
186                FileInputStream inStream = new FileInputStream(file);
187                LinkedHashMap<String, RNASequence> rnaSequences = readFastaRNASequence(inStream);
188                inStream.close();
189                return rnaSequences;
190        }
191
192        public static void main(String[] args) throws Exception {
193
194                LinkedHashMap<String, DNASequence> dnaSequences = FastaReaderHelper.readFastaDNASequence(new File("fasta.fna"));
195                for (DNASequence sequence : dnaSequences.values()) {
196                        sequence.getRNASequence().getProteinSequence().getSequenceAsString();
197                }
198        }
199}