001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021package demo; 022 023 024import java.io.File; 025import java.io.InputStream; 026import java.util.LinkedHashMap; 027import java.util.Map; 028 029import org.biojava.nbio.core.sequence.ProteinSequence; 030import org.biojava.nbio.core.sequence.compound.AminoAcidCompound; 031import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet; 032import org.biojava.nbio.core.sequence.io.FastaReader; 033import org.biojava.nbio.core.sequence.io.GenericFastaHeaderParser; 034import org.biojava.nbio.core.sequence.io.ProteinSequenceCreator; 035import org.biojava.nbio.core.util.InputStreamProvider; 036 037 038/** 039 * Created by andreas on 6/17/15. 040 */ 041public class ParseFastaFileDemo { 042 043 044 public ParseFastaFileDemo(){ 045 046 047 } 048 049 /** 050 * e.g. download ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_trembl.fasta.gz 051 * and pass in path to local location of file 052 * 053 * @param args 054 */ 055 public static void main(String[] args) throws Exception { 056 057 int mb = 1024*1024; 058 059 //Getting the runtime reference from system 060 Runtime runtime = Runtime.getRuntime(); 061 062 System.out.println("##### Heap utilization statistics [MB] #####"); 063 064 //Print used memory 065 System.out.println("Used Memory:" 066 + (runtime.totalMemory() - runtime.freeMemory()) / mb); 067 068 //Print free memory 069 System.out.println("Free Memory:" 070 + runtime.freeMemory() / mb); 071 072 //Print total available memory 073 System.out.println("Total Memory:" + runtime.totalMemory() / mb); 074 075 //Print Maximum available memory 076 System.out.println("Max Memory:" + runtime.maxMemory() / mb); 077 078 079 if ( args.length < 1) { 080 System.err.println("First argument needs to be path to fasta file"); 081 return; 082 } 083 084 File f = new File(args[0]); 085 086 if ( ! f.exists()) { 087 System.err.println("File does not exist " + args[0]); 088 return; 089 } 090 091 long timeS = System.currentTimeMillis(); 092 093 // automatically uncompress files using InputStreamProvider 094 InputStreamProvider isp = new InputStreamProvider(); 095 096 InputStream inStream = isp.getInputStream(f); 097 098 099 FastaReader<ProteinSequence, AminoAcidCompound> fastaReader = new FastaReader<>( 100 inStream, 101 new GenericFastaHeaderParser<ProteinSequence, AminoAcidCompound>(), 102 new ProteinSequenceCreator(AminoAcidCompoundSet.getAminoAcidCompoundSet())); 103 104 Map<String, ProteinSequence> b; 105 106 int nrSeq = 0; 107 108 while ((b = fastaReader.process(100)) != null) { 109 for (String key : b.keySet()) { 110 nrSeq++; 111 System.out.println(nrSeq + " : " + key + " " + b.get(key)); 112 if ( nrSeq % 100000 == 0) 113 System.out.println(nrSeq ); 114 } 115 116 } 117 long timeE = System.currentTimeMillis(); 118 System.out.println("parsed a total of " + nrSeq + " TREMBL sequences! in " + (timeE - timeS)); 119 } 120}