001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 * Created on 01-21-2010
021 */
022package org.biojava.nbio.core.sequence.io;
023
024import org.biojava.nbio.core.sequence.DNASequence;
025import org.biojava.nbio.core.sequence.ProteinSequence;
026import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
027import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet;
028import org.biojava.nbio.core.sequence.compound.DNACompoundSet;
029import org.biojava.nbio.core.sequence.compound.NucleotideCompound;
030
031import java.io.File;
032import java.io.FileInputStream;
033import java.io.IOException;
034import java.io.InputStream;
035import java.util.LinkedHashMap;
036
037/**
038 *
039 * @author Scooter Willis <willishf at gmail dot com>
040 */
041public class FastaReaderHelper {
042
043        /**
044         * Selecting lazySequenceLoad=true will parse the FASTA file and figure out the accessionid and offsets and return sequence objects
045         * that can in the future read the sequence from the disk. This allows the loading of large fasta files where you are only interested
046         * in one sequence based on accession id.
047         * @param file
048         * @param lazySequenceLoad
049         * @return
050         * @throws IOException
051         */
052        public static LinkedHashMap<String, DNASequence> readFastaDNASequence(File file, boolean lazySequenceLoad) throws IOException {
053                if (!lazySequenceLoad) {
054                        return readFastaDNASequence(file);
055                }
056
057                FastaReader<DNASequence, NucleotideCompound> fastaProxyReader =
058                                new FastaReader<DNASequence, NucleotideCompound>(
059                                                file,
060                                                new GenericFastaHeaderParser<DNASequence, NucleotideCompound>(),
061                                                new FileProxyDNASequenceCreator(
062                                                                file,
063                                                                DNACompoundSet.getDNACompoundSet(),
064                                                                new FastaSequenceParser()
065                                                        )
066                                        );
067                return fastaProxyReader.process();
068
069        }
070
071        /**
072         * Read a fasta file containing amino acids with setup that would handle most
073         * cases.
074         *
075         * @param file
076         * @return
077         * @throws IOException
078         */
079        public static LinkedHashMap<String, ProteinSequence> readFastaProteinSequence(
080                        File file) throws IOException {
081                FileInputStream inStream = new FileInputStream(file);
082                LinkedHashMap<String, ProteinSequence> proteinSequences = readFastaProteinSequence(inStream);
083                inStream.close();
084                return proteinSequences;
085        }
086
087        /**
088         * Read a fasta file containing amino acids with setup that would handle most
089         * cases. User is responsible for closing InputStream because you opened it
090         *
091         * @param inStream
092         * @return
093         * @throws IOException
094         */
095        public static LinkedHashMap<String, ProteinSequence> readFastaProteinSequence(
096                        InputStream inStream) throws IOException {
097                FastaReader<ProteinSequence, AminoAcidCompound> fastaReader = new FastaReader<ProteinSequence, AminoAcidCompound>(
098                                inStream,
099                                new GenericFastaHeaderParser<ProteinSequence, AminoAcidCompound>(),
100                                new ProteinSequenceCreator(AminoAcidCompoundSet.getAminoAcidCompoundSet()));
101                return fastaReader.process();
102        }
103
104        /**
105         * Read a fasta DNA sequence
106         * @param inStream
107         * @return
108         * @throws IOException
109         */
110        public static LinkedHashMap<String, DNASequence> readFastaDNASequence(
111                        InputStream inStream) throws IOException {
112                FastaReader<DNASequence, NucleotideCompound> fastaReader = new FastaReader<DNASequence, NucleotideCompound>(
113                                inStream,
114                                new GenericFastaHeaderParser<DNASequence, NucleotideCompound>(),
115                                new DNASequenceCreator(DNACompoundSet.getDNACompoundSet()));
116                return fastaReader.process();
117        }
118
119        /**
120         *
121         * @param file
122         * @return
123         * @throws IOException
124         */
125        public static LinkedHashMap<String, DNASequence> readFastaDNASequence(
126                        File file) throws IOException {
127                FileInputStream inStream = new FileInputStream(file);
128                LinkedHashMap<String, DNASequence> dnaSequences = readFastaDNASequence(inStream);
129                inStream.close();
130                return dnaSequences;
131        }
132
133        public static void main(String args[]) throws Exception {
134
135                LinkedHashMap<String, DNASequence> dnaSequences = FastaReaderHelper.readFastaDNASequence(new File("fasta.fna"));
136                for (DNASequence sequence : dnaSequences.values()) {
137                        sequence.getRNASequence().getProteinSequence().getSequenceAsString();
138                }
139        }
140}