001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 * Created on 01-21-2010 021 */ 022package org.biojava.nbio.core.sequence.io; 023 024import org.biojava.nbio.core.exceptions.CompoundNotFoundException; 025import org.biojava.nbio.core.sequence.ProteinSequence; 026import org.biojava.nbio.core.sequence.compound.AminoAcidCompound; 027import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet; 028import org.biojava.nbio.core.sequence.io.template.SequenceCreatorInterface; 029import org.biojava.nbio.core.sequence.io.template.SequenceHeaderParserInterface; 030import org.biojava.nbio.core.sequence.template.Compound; 031import org.biojava.nbio.core.sequence.template.Sequence; 032import org.slf4j.Logger; 033import org.slf4j.LoggerFactory; 034 035import java.io.*; 036import java.util.HashMap; 037import java.util.LinkedHashMap; 038 039/** 040 * Use FastaReaderHelper as an example of how to use this class where FastaReaderHelper should be the 041 * primary class used to read Fasta files 042 * @author Scooter Willis ;lt;willishf at gmail dot com> 043 */ 044public class FastaReader<S extends Sequence<?>, C extends Compound> { 045 046 private final static Logger logger = LoggerFactory.getLogger(FastaReader.class); 047 048 SequenceCreatorInterface<C> sequenceCreator; 049 SequenceHeaderParserInterface<S,C> headerParser; 050 BufferedReaderBytesRead br; 051 InputStreamReader isr; 052 FileInputStream fi = null; 053 long fileIndex = 0; 054 long sequenceIndex = 0; 055 String line = ""; 056 String header= ""; 057 058 /** 059 * If you are going to use FileProxyProteinSequenceCreator then do not use this constructor because we need details about 060 * local file offsets for quick reads. InputStreams does not give you the name of the stream to access quickly via file seek. A seek in 061 * an inputstream is forced to read all the data so you don't gain anything. 062 * @param is inputStream 063 * @param headerParser 064 * @param sequenceCreator 065 */ 066 public FastaReader(InputStream is, SequenceHeaderParserInterface<S,C> headerParser, 067 SequenceCreatorInterface<C> sequenceCreator) { 068 this.headerParser = headerParser; 069 isr = new InputStreamReader(is); 070 this.br = new BufferedReaderBytesRead(isr); 071 this.sequenceCreator = sequenceCreator; 072 } 073 074 /** 075 * If you are going to use the FileProxyProteinSequenceCreator then you 076 * need to use this constructor because we need details about 077 * the location of the file. 078 * @param file 079 * @param headerParser 080 * @param sequenceCreator 081 * @throws FileNotFoundException if the file does not exist, is a directory 082 * rather than a regular file, or for some other reason cannot be opened 083 * for reading. 084 * @throws SecurityException if a security manager exists and its checkRead 085 * method denies read access to the file. 086 */ 087 public FastaReader(File file, SequenceHeaderParserInterface<S,C> headerParser, 088 SequenceCreatorInterface<C> sequenceCreator) throws FileNotFoundException { 089 this.headerParser = headerParser; 090 fi = new FileInputStream(file); 091 isr = new InputStreamReader(fi); 092 this.br = new BufferedReaderBytesRead(isr); 093 this.sequenceCreator = sequenceCreator; 094 } 095 096 /** 097 * The parsing is done in this method.<br> 098 * This method tries to process all the available fasta records 099 * in the File or InputStream, closes the underlying resource, 100 * and return the results in {@link LinkedHashMap}.<br> 101 * You don't need to call {@link #close()} after calling this method. 102 * @see #process(int) 103 * @return {@link HashMap} containing all the parsed fasta records 104 * present, starting current fileIndex onwards. 105 * @throws IOException if an error occurs reading the input file 106 */ 107 public LinkedHashMap<String,S> process() throws IOException { 108 LinkedHashMap<String,S> sequences = process(-1); 109 close(); 110 111 return sequences; 112 } 113 114 /** 115 * This method tries to parse maximum <code>max</code> records from 116 * the open File or InputStream, and leaves the underlying resource open.<br> 117 * Subsequent calls to the same method continue parsing the rest of the file.<br> 118 * This is particularly useful when dealing with very big data files, 119 * (e.g. NCBI nr database), which can't fit into memory and will take long 120 * time before the first result is available.<br> 121 * <b>N.B.</b> 122 * <ul> 123 * <li>This method can't be called after calling its NO-ARGUMENT twin.</li> 124 * <li>remember to close the underlying resource when you are done.</li> 125 * </ul> 126 * @see #process() 127 * @author Amr AL-Hossary 128 * @since 3.0.6 129 * @param max maximum number of records to return, <code>-1</code> for infinity. 130 * @return {@link HashMap} containing maximum <code>max</code> parsed fasta records 131 * present, starting current fileIndex onwards. 132 * @throws IOException if an error occurs reading the input file 133 */ 134 public LinkedHashMap<String,S> process(int max) throws IOException { 135 136 137 String line = ""; 138 if(this.line != null && this.line.length() > 0){ 139 line=this.line; 140 } 141 String header = ""; 142 if(this.header != null && this.header.length() > 0){ 143 header=this.header; 144 } 145 146 StringBuilder sb = new StringBuilder(); 147 int processedSequences=0; 148 boolean keepGoing = true; 149 150 151 LinkedHashMap<String,S> sequences = new LinkedHashMap<String,S>(); 152 153 do { 154 line = line.trim(); // nice to have but probably not needed 155 if (line.length() != 0) { 156 if (line.startsWith(">")) {//start of new fasta record 157 158 if (sb.length() > 0) { 159 //i.e. if there is already a sequence before 160 //logger.info("Sequence index=" + sequenceIndex); 161 162 try { 163 @SuppressWarnings("unchecked") 164 S sequence = (S)sequenceCreator.getSequence(sb.toString(), sequenceIndex); 165 headerParser.parseHeader(header, sequence); 166 sequences.put(sequence.getAccession().getID(),sequence); 167 processedSequences++; 168 169 } catch (CompoundNotFoundException e) { 170 logger.warn("Sequence with header '{}' has unrecognised compounds ({}), it will be ignored", 171 header, e.getMessage()); 172 } 173 174 sb.setLength(0); //this is faster than allocating new buffers, better memory utilization (same buffer) 175 } 176 header = line.substring(1); 177 } else if (line.startsWith(";")) { 178 } else { 179 //mark the start of the sequence with the fileIndex before the line was read 180 if(sb.length() == 0){ 181 sequenceIndex = fileIndex; 182 } 183 sb.append(line); 184 } 185 } 186 fileIndex = br.getBytesRead(); 187 188 line = br.readLine(); 189 190 if (line == null) { 191 //i.e. EOF 192 if ( sb.length() == 0 && header.length() != 0 ) { 193 logger.warn("Can't parse sequence {}. Got sequence of length 0!", sequenceIndex); 194 logger.warn("header: {}", header); 195 header = null; 196 } else if ( sb.length() > 0 ) { 197 //logger.info("Sequence index=" + sequenceIndex + " " + fileIndex ); 198 try { 199 @SuppressWarnings("unchecked") 200 S sequence = (S)sequenceCreator.getSequence(sb.toString(), sequenceIndex); 201 headerParser.parseHeader(header, sequence); 202 sequences.put(sequence.getAccession().getID(),sequence); 203 processedSequences++; 204 header = null; 205 } catch (CompoundNotFoundException e) { 206 logger.warn("Sequence with header '{}' has unrecognised compounds ({}), it will be ignored", 207 header, e.getMessage()); 208 } 209 } 210 keepGoing = false; 211 } 212 if (max > -1 && processedSequences>=max) { 213 keepGoing=false; 214 } 215 } while (keepGoing); 216 217 this.line = line; 218 this.header= header; 219 220 return max > -1 && sequences.isEmpty() ? null : sequences; 221 } 222 223 public void close() throws IOException { 224 br.close(); 225 isr.close(); 226 //If stream was created from File object then we need to close it 227 if (fi != null) { 228 fi.close(); 229 } 230 this.line=this.header = null; 231 } 232 233 public static void main(String[] args) { 234 try { 235 String inputFile = "/PF00104_small.fasta"; 236 InputStream is = FastaReader.class.getResourceAsStream(inputFile); 237 238 239 if ( is == null) 240 System.err.println("Could not get input file " + inputFile); 241 FastaReader<ProteinSequence, AminoAcidCompound> fastaReader = new FastaReader<ProteinSequence, AminoAcidCompound>(is, new GenericFastaHeaderParser<ProteinSequence,AminoAcidCompound>(), new ProteinSequenceCreator(AminoAcidCompoundSet.getAminoAcidCompoundSet())); 242 LinkedHashMap<String,ProteinSequence> proteinSequences = fastaReader.process(); 243 is.close(); 244 245 246 //logger.info("Protein Sequences: {}", proteinSequences); 247 248 File file = new File(inputFile); 249 FastaReader<ProteinSequence,AminoAcidCompound> fastaProxyReader = 250 new FastaReader<ProteinSequence,AminoAcidCompound>( 251 file, 252 new GenericFastaHeaderParser<ProteinSequence,AminoAcidCompound>(), 253 new FileProxyProteinSequenceCreator( 254 file, 255 AminoAcidCompoundSet.getAminoAcidCompoundSet(), 256 new FastaSequenceParser() 257 ) 258 ); 259 LinkedHashMap<String,ProteinSequence> proteinProxySequences = fastaProxyReader.process(); 260 261 for(String key : proteinProxySequences.keySet()){ 262 ProteinSequence proteinSequence = proteinProxySequences.get(key); 263 logger.info("Protein Proxy Sequence Key: {}", key); 264// if(key.equals("Q98SJ1_CHICK/15-61")){ 265// int dummy = 1; 266// } 267 logger.info("Protein Sequence: {}", proteinSequence.toString()); 268 269 } 270 271 } catch (Exception e) { 272 logger.warn("Exception: ", e); 273 } 274 } 275}