001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021package demo; 022 023 024import java.io.File; 025import java.io.InputStream; 026import java.util.LinkedHashMap; 027import java.util.logging.Level; 028import java.util.logging.Logger; 029import org.biojava.nbio.core.sequence.ProteinSequence; 030import org.biojava.nbio.core.sequence.compound.AminoAcidCompound; 031import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet; 032import org.biojava.nbio.core.sequence.io.FastaReader; 033import org.biojava.nbio.core.sequence.io.GenericFastaHeaderParser; 034import org.biojava.nbio.core.sequence.io.ProteinSequenceCreator; 035import org.biojava.nbio.core.util.InputStreamProvider; 036 037 038/** 039 * Created by andreas on 6/17/15. 040 */ 041public class ParseFastaFileDemo { 042 043 044 public ParseFastaFileDemo(){ 045 046 047 } 048 049 /** e.g. download ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_trembl.fasta.gz 050 * and pass in path to local location of file 051 * 052 * @param args 053 */ 054 public static void main(String[] args) { 055 056 int mb = 1024*1024; 057 058 //Getting the runtime reference from system 059 Runtime runtime = Runtime.getRuntime(); 060 061 System.out.println("##### Heap utilization statistics [MB] #####"); 062 063 //Print used memory 064 System.out.println("Used Memory:" 065 + (runtime.totalMemory() - runtime.freeMemory()) / mb); 066 067 //Print free memory 068 System.out.println("Free Memory:" 069 + runtime.freeMemory() / mb); 070 071 //Print total available memory 072 System.out.println("Total Memory:" + runtime.totalMemory() / mb); 073 074 //Print Maximum available memory 075 System.out.println("Max Memory:" + runtime.maxMemory() / mb); 076 077 078 if ( args.length < 1) { 079 System.err.println("First argument needs to be path to fasta file"); 080 return; 081 } 082 083 File f = new File(args[0]); 084 085 if ( ! f.exists()) { 086 System.err.println("File does not exist " + args[0]); 087 return; 088 } 089 090 long timeS = System.currentTimeMillis(); 091 092 try { 093 094 // automatically uncompress files using InputStreamProvider 095 InputStreamProvider isp = new InputStreamProvider(); 096 097 InputStream inStream = isp.getInputStream(f); 098 099 100 FastaReader<ProteinSequence, AminoAcidCompound> fastaReader = new FastaReader<ProteinSequence, AminoAcidCompound>( 101 inStream, 102 new GenericFastaHeaderParser<ProteinSequence, AminoAcidCompound>(), 103 new ProteinSequenceCreator(AminoAcidCompoundSet.getAminoAcidCompoundSet())); 104 105 LinkedHashMap<String, ProteinSequence> b; 106 107 int nrSeq = 0; 108 109 while ((b = fastaReader.process(100)) != null) { 110 for (String key : b.keySet()) { 111 nrSeq++; 112 //System.out.println(nrSeq + " : " + key + " " + b.get(key)); 113 if ( nrSeq % 100000 == 0) 114 System.out.println(nrSeq ); 115 } 116 117 } 118 long timeE = System.currentTimeMillis(); 119 System.out.println("parsed a total of " + nrSeq + " TREMBL sequences! in " + (timeE - timeS)); 120 } catch (Exception ex) { 121 Logger.getLogger(ParseFastaFileDemo.class.getName()).log(Level.SEVERE, null, ex); 122 } 123 } 124}