/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 */
package demo;


import java.io.File;
import java.io.InputStream;
import java.util.LinkedHashMap;
import java.util.Map;
import org.biojava.nbio.core.sequence.ProteinSequence;
import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet;
import org.biojava.nbio.core.sequence.io.FastaReader;
import org.biojava.nbio.core.sequence.io.GenericFastaHeaderParser;
import org.biojava.nbio.core.sequence.io.ProteinSequenceCreator;
import org.biojava.nbio.core.util.InputStreamProvider;


/**
 * Demo that reads a protein FASTA file in batches, prints every parsed
 * sequence to standard output and reports the number of sequences read
 * together with the elapsed wall-clock time.
 *
 * Created by andreas on 6/17/15.
 */
public class ParseFastaFileDemo {


	public ParseFastaFileDemo(){


	}

	/**
	 * Prints JVM heap statistics, then parses the FASTA file given as the
	 * first argument and prints each sequence as it is read.
	 *
	 * e.g. download ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_trembl.fasta.gz
	 * and pass in path to local location of file
	 *
	 * @param args command line arguments; {@code args[0]} must be the path
	 *             to a local FASTA file. If it is missing or the file does
	 *             not exist, a message is written to standard error and
	 *             the method returns without parsing.
	 * @throws Exception if opening or parsing the file fails
	 */
	public static void main(String[] args) throws Exception {

		int mb = 1024*1024;

		//Getting the runtime reference from system
		Runtime runtime = Runtime.getRuntime();

		System.out.println("##### Heap utilization statistics [MB] #####");

		//Print used memory
		System.out.println("Used Memory:"
				+ (runtime.totalMemory() - runtime.freeMemory()) / mb);

		//Print free memory
		System.out.println("Free Memory:"
				+ runtime.freeMemory() / mb);

		//Print total available memory
		System.out.println("Total Memory:" + runtime.totalMemory() / mb);

		//Print Maximum available memory
		System.out.println("Max Memory:" + runtime.maxMemory() / mb);


		if ( args.length < 1) {
			System.err.println("First argument needs to be path to fasta file");
			return;
		}

		File f = new File(args[0]);

		if ( ! f.exists()) {
			System.err.println("File does not exist " + args[0]);
			return;
		}

		long timeS = System.currentTimeMillis();

		// automatically uncompress files using InputStreamProvider
		InputStreamProvider isp = new InputStreamProvider();

		InputStream inStream = isp.getInputStream(f);

		int nrSeq = 0;

		// try/finally guarantees the stream is released even when parsing fails
		try {
			FastaReader<ProteinSequence, AminoAcidCompound> fastaReader = new FastaReader<ProteinSequence, AminoAcidCompound>(
					inStream,
					new GenericFastaHeaderParser<ProteinSequence, AminoAcidCompound>(),
					new ProteinSequenceCreator(AminoAcidCompoundSet.getAminoAcidCompoundSet()));

			LinkedHashMap<String, ProteinSequence> b;

			// process(100) is called repeatedly until it returns null, so the
			// whole file is never held in memory at once
			while ((b = fastaReader.process(100)) != null) {
				for (Map.Entry<String, ProteinSequence> entry : b.entrySet()) {
					nrSeq++;
					System.out.println(nrSeq + " : " + entry.getKey() + " " + entry.getValue());
					if ( nrSeq % 100000 == 0) {
						System.out.println(nrSeq );
					}
				}

			}
		} finally {
			inStream.close();
		}

		long timeE = System.currentTimeMillis();
		System.out.println("parsed a total of " + nrSeq + " TREMBL sequences! in " + (timeE - timeS));
	}
}