001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021package demo;
022
023
024import java.io.File;
025import java.io.InputStream;
026import java.util.LinkedHashMap;
027import org.biojava.nbio.core.sequence.ProteinSequence;
028import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
029import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet;
030import org.biojava.nbio.core.sequence.io.FastaReader;
031import org.biojava.nbio.core.sequence.io.GenericFastaHeaderParser;
032import org.biojava.nbio.core.sequence.io.ProteinSequenceCreator;
033import org.biojava.nbio.core.util.InputStreamProvider;
034
035
036/**
037 * Created by andreas on 6/17/15.
038 */
039public class ParseFastaFileDemo {
040
041
042        public ParseFastaFileDemo(){
043
044
045        }
046
047        /**
048         * e.g. download ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_trembl.fasta.gz
049         * and pass in path to local location of file
050         *
051         * @param args
052         */
053        public static void main(String[] args) throws Exception {
054
055                int mb = 1024*1024;
056
057                //Getting the runtime reference from system
058                Runtime runtime = Runtime.getRuntime();
059
060                System.out.println("##### Heap utilization statistics [MB] #####");
061
062                //Print used memory
063                System.out.println("Used Memory:"
064                                + (runtime.totalMemory() - runtime.freeMemory()) / mb);
065
066                //Print free memory
067                System.out.println("Free Memory:"
068                                + runtime.freeMemory() / mb);
069
070                //Print total available memory
071                System.out.println("Total Memory:" + runtime.totalMemory() / mb);
072
073                //Print Maximum available memory
074                System.out.println("Max Memory:" + runtime.maxMemory() / mb);
075
076
077                if ( args.length < 1) {
078                        System.err.println("First argument needs to be path to fasta file");
079                        return;
080                }
081
082                File f = new File(args[0]);
083
084                if ( ! f.exists()) {
085                        System.err.println("File does not exist " + args[0]);
086                        return;
087                }
088
089                long timeS = System.currentTimeMillis();
090
091                // automatically uncompress files using InputStreamProvider
092                InputStreamProvider isp = new InputStreamProvider();
093
094                InputStream inStream = isp.getInputStream(f);
095
096
097                FastaReader<ProteinSequence, AminoAcidCompound> fastaReader = new FastaReader<ProteinSequence, AminoAcidCompound>(
098                                inStream,
099                                new GenericFastaHeaderParser<ProteinSequence, AminoAcidCompound>(),
100                                new ProteinSequenceCreator(AminoAcidCompoundSet.getAminoAcidCompoundSet()));
101
102                LinkedHashMap<String, ProteinSequence> b;
103
104                int nrSeq = 0;
105
106                while ((b = fastaReader.process(100)) != null) {
107                        for (String key : b.keySet()) {
108                                nrSeq++;
109                                System.out.println(nrSeq + " : " + key + " " + b.get(key));
110                                if ( nrSeq % 100000 == 0)
111                                        System.out.println(nrSeq );
112                        }
113
114                }
115                long timeE = System.currentTimeMillis();
116                System.out.println("parsed a total of " + nrSeq + " TREMBL sequences! in " + (timeE - timeS));
117        }
118}