001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021package demo;
022
023
024import java.io.File;
025import java.io.InputStream;
026import java.util.LinkedHashMap;
027import java.util.logging.Level;
028import java.util.logging.Logger;
029import org.biojava.nbio.core.sequence.ProteinSequence;
030import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
031import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet;
032import org.biojava.nbio.core.sequence.io.FastaReader;
033import org.biojava.nbio.core.sequence.io.GenericFastaHeaderParser;
034import org.biojava.nbio.core.sequence.io.ProteinSequenceCreator;
035import org.biojava.nbio.core.util.InputStreamProvider;
036
037
038/**
039 * Created by andreas on 6/17/15.
040 */
041public class ParseFastaFileDemo {
042
043
044        public ParseFastaFileDemo(){
045
046
047                }
048
049        /** e.g. download ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_trembl.fasta.gz
050         * and pass in path to local location of file
051         *
052         * @param args
053         */
054                public static void main(String[] args) {
055
056                        int mb = 1024*1024;
057
058                        //Getting the runtime reference from system
059                        Runtime runtime = Runtime.getRuntime();
060
061                        System.out.println("##### Heap utilization statistics [MB] #####");
062
063                        //Print used memory
064                        System.out.println("Used Memory:"
065                                        + (runtime.totalMemory() - runtime.freeMemory()) / mb);
066
067                        //Print free memory
068                        System.out.println("Free Memory:"
069                                        + runtime.freeMemory() / mb);
070
071                        //Print total available memory
072                        System.out.println("Total Memory:" + runtime.totalMemory() / mb);
073
074                        //Print Maximum available memory
075                        System.out.println("Max Memory:" + runtime.maxMemory() / mb);
076
077
078                        if ( args.length < 1) {
079                                System.err.println("First argument needs to be path to fasta file");
080                                return;
081                        }
082
083                        File f = new File(args[0]);
084
085                        if ( ! f.exists()) {
086                                System.err.println("File does not exist " + args[0]);
087                                return;
088                        }
089
090                        long timeS = System.currentTimeMillis();
091
092                        try {
093
094                                // automatically uncompress files using InputStreamProvider
095                                InputStreamProvider isp = new InputStreamProvider();
096
097                                InputStream inStream = isp.getInputStream(f);
098
099
100                                FastaReader<ProteinSequence, AminoAcidCompound> fastaReader = new FastaReader<ProteinSequence, AminoAcidCompound>(
101                                                inStream,
102                                                new GenericFastaHeaderParser<ProteinSequence, AminoAcidCompound>(),
103                                                new ProteinSequenceCreator(AminoAcidCompoundSet.getAminoAcidCompoundSet()));
104
105                                LinkedHashMap<String, ProteinSequence> b;
106
107                                int nrSeq = 0;
108
109                                while ((b = fastaReader.process(100)) != null) {
110                                        for (String key : b.keySet()) {
111                                                nrSeq++;
112                                                //System.out.println(nrSeq + " : " + key + " " + b.get(key));
113                                                if ( nrSeq % 100000 == 0)
114                                                        System.out.println(nrSeq );
115                                        }
116
117                                }
118                                long timeE = System.currentTimeMillis();
119                                System.out.println("parsed a total of " + nrSeq + " TREMBL sequences! in " + (timeE - timeS));
120                        } catch (Exception ex) {
121                                Logger.getLogger(ParseFastaFileDemo.class.getName()).log(Level.SEVERE, null, ex);
122                        }
123                }
124}