001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 * Created on 01-21-2010 021 */ 022package org.biojava.nbio.core.sequence.io; 023 024import org.biojava.nbio.core.sequence.DNASequence; 025import org.biojava.nbio.core.sequence.ProteinSequence; 026import org.biojava.nbio.core.sequence.RNASequence; 027import org.biojava.nbio.core.sequence.compound.AminoAcidCompound; 028import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet; 029import org.biojava.nbio.core.sequence.compound.DNACompoundSet; 030import org.biojava.nbio.core.sequence.compound.NucleotideCompound; 031import org.biojava.nbio.core.sequence.compound.RNACompoundSet; 032 033import java.io.File; 034import java.io.FileInputStream; 035import java.io.IOException; 036import java.io.InputStream; 037import java.util.LinkedHashMap; 038 039/** 040 * 041 * @author Scooter Willis <willishf at gmail dot com> 042 */ 043public class FastaReaderHelper { 044 045 /** 046 * Selecting lazySequenceLoad=true will parse the FASTA file and figure out the accessionid and offsets and return sequence objects 047 * that can in the future read the sequence from the disk. This allows the loading of large fasta files where you are only interested 048 * in one sequence based on accession id. 049 * @param file 050 * @param lazySequenceLoad 051 * @return 052 * @throws IOException 053 */ 054 public static LinkedHashMap<String, DNASequence> readFastaDNASequence(File file, boolean lazySequenceLoad) throws IOException { 055 if (!lazySequenceLoad) { 056 return readFastaDNASequence(file); 057 } 058 059 FastaReader<DNASequence, NucleotideCompound> fastaProxyReader = 060 new FastaReader<DNASequence, NucleotideCompound>( 061 file, 062 new GenericFastaHeaderParser<DNASequence, NucleotideCompound>(), 063 new FileProxyDNASequenceCreator( 064 file, 065 DNACompoundSet.getDNACompoundSet(), 066 new FastaSequenceParser() 067 ) 068 ); 069 return fastaProxyReader.process(); 070 071 } 072 073 /** 074 * Selecting lazySequenceLoad=true will parse the FASTA file and figure out the accessionid and offsets and return sequence objects 075 * that can in the future read the sequence from the disk. This allows the loading of large fasta files where you are only interested 076 * in one sequence based on accession id. 077 * @param file 078 * @param lazySequenceLoad 079 * @return 080 * @throws IOException 081 */ 082 public static LinkedHashMap<String, RNASequence> readFastaRNASequence(File file, boolean lazySequenceLoad) throws IOException { 083 if (!lazySequenceLoad) { 084 return readFastaRNASequence(file); 085 } 086 087 FastaReader<RNASequence, NucleotideCompound> fastaProxyReader = 088 new FastaReader<RNASequence, NucleotideCompound>( 089 file, 090 new GenericFastaHeaderParser<RNASequence, NucleotideCompound>(), 091 new FileProxyRNASequenceCreator( 092 file, 093 RNACompoundSet.getRNACompoundSet(), 094 new FastaSequenceParser() 095 ) 096 ); 097 return fastaProxyReader.process(); 098 099 } 100 101 /** 102 * Read a fasta file containing amino acids with setup that would handle most 103 * cases. 104 * 105 * @param file 106 * @return 107 * @throws IOException 108 */ 109 public static LinkedHashMap<String, ProteinSequence> readFastaProteinSequence( 110 File file) throws IOException { 111 FileInputStream inStream = new FileInputStream(file); 112 LinkedHashMap<String, ProteinSequence> proteinSequences = readFastaProteinSequence(inStream); 113 inStream.close(); 114 return proteinSequences; 115 } 116 117 /** 118 * Read a fasta file containing amino acids with setup that would handle most 119 * cases. User is responsible for closing InputStream because you opened it 120 * 121 * @param inStream 122 * @return 123 * @throws IOException 124 */ 125 public static LinkedHashMap<String, ProteinSequence> readFastaProteinSequence( 126 InputStream inStream) throws IOException { 127 FastaReader<ProteinSequence, AminoAcidCompound> fastaReader = new FastaReader<ProteinSequence, AminoAcidCompound>( 128 inStream, 129 new GenericFastaHeaderParser<ProteinSequence, AminoAcidCompound>(), 130 new ProteinSequenceCreator(AminoAcidCompoundSet.getAminoAcidCompoundSet())); 131 return fastaReader.process(); 132 } 133 134 /** 135 * Read a fasta DNA sequence 136 * @param inStream 137 * @return 138 * @throws IOException 139 */ 140 public static LinkedHashMap<String, DNASequence> readFastaDNASequence( 141 InputStream inStream) throws IOException { 142 FastaReader<DNASequence, NucleotideCompound> fastaReader = new FastaReader<DNASequence, NucleotideCompound>( 143 inStream, 144 new GenericFastaHeaderParser<DNASequence, NucleotideCompound>(), 145 new DNASequenceCreator(DNACompoundSet.getDNACompoundSet())); 146 return fastaReader.process(); 147 } 148 149 /** 150 * 151 * @param file 152 * @return 153 * @throws IOException 154 */ 155 public static LinkedHashMap<String, DNASequence> readFastaDNASequence( 156 File file) throws IOException { 157 FileInputStream inStream = new FileInputStream(file); 158 LinkedHashMap<String, DNASequence> dnaSequences = readFastaDNASequence(inStream); 159 inStream.close(); 160 return dnaSequences; 161 } 162 163 /** 164 * Read a fasta RNA sequence 165 * @param inStream 166 * @return 167 * @throws IOException 168 */ 169 public static LinkedHashMap<String, RNASequence> readFastaRNASequence( 170 InputStream inStream) throws IOException { 171 FastaReader<RNASequence, NucleotideCompound> fastaReader = new FastaReader<RNASequence, NucleotideCompound>( 172 inStream, 173 new GenericFastaHeaderParser<RNASequence, NucleotideCompound>(), 174 new RNASequenceCreator(RNACompoundSet.getRNACompoundSet())); 175 return fastaReader.process(); 176 } 177 178 /** 179 * 180 * @param file 181 * @return 182 * @throws IOException 183 */ 184 public static LinkedHashMap<String, RNASequence> readFastaRNASequence( 185 File file) throws IOException { 186 FileInputStream inStream = new FileInputStream(file); 187 LinkedHashMap<String, RNASequence> rnaSequences = readFastaRNASequence(inStream); 188 inStream.close(); 189 return rnaSequences; 190 } 191 192 public static void main(String[] args) throws Exception { 193 194 LinkedHashMap<String, DNASequence> dnaSequences = FastaReaderHelper.readFastaDNASequence(new File("fasta.fna")); 195 for (DNASequence sequence : dnaSequences.values()) { 196 sequence.getRNASequence().getProteinSequence().getSequenceAsString(); 197 } 198 } 199}