001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021package org.biojava.nbio.genome.uniprot;
022
023
024import org.biojava.nbio.core.sequence.AccessionID;
025import org.biojava.nbio.core.sequence.ProteinSequence;
026import org.biojava.nbio.core.sequence.io.FastaWriterHelper;
027import org.slf4j.Logger;
028import org.slf4j.LoggerFactory;
029
030import java.io.BufferedReader;
031import java.io.File;
032import java.io.FileReader;
033import java.util.ArrayList;
034import java.util.HashMap;
035
036/**
037 *
038 * @author Scooter
039 */
040public class UniprotToFasta {
041
042        private static final Logger logger = LoggerFactory.getLogger(UniprotToFasta.class);
043
044        public static void main( String[] args ){
045                try{
046                        String uniprotDatFileName = "uniprot_trembl_fungi.dat";
047                        String fastaFileName = "uniprot__trembel_fungi.faa";
048                        UniprotToFasta uniprotToFasta = new UniprotToFasta();
049                        uniprotToFasta.process(uniprotDatFileName, fastaFileName);
050                }catch(Exception e){
051                        logger.error("Exception: ", e);
052                }
053        }
054
055        /**
056         * Convert a Uniprot sequence file to a fasta file. Allows you to download all sequence data for a species
057         * and convert to fasta to be used in a blast database
058         * @param uniprotDatFileName
059         * @param fastaFileName
060         * @throws Exception
061         */
062
063        public void process( String uniprotDatFileName,String fastaFileName ) throws Exception{
064
065                        FileReader fr = new FileReader(uniprotDatFileName);
066                        BufferedReader br = new BufferedReader(fr);
067                        String line = br.readLine();
068                        String id = "";
069                        StringBuffer sequence = new StringBuffer();
070                        ArrayList<ProteinSequence> seqCodingRegionsList = new ArrayList<>();
071                        int count = 0;
072                        HashMap<String,String> uniqueGenes = new HashMap<>();
073                        HashMap<String,String> uniqueSpecies = new HashMap<>();
074                        while(line != null){
075                                if(line.startsWith("ID")){
076                                        String[] data = line.split(" ");
077                                        id = data[3];
078                                }else if(line.startsWith("SQ")){
079                                        line = br.readLine();
080                                        while(!line.startsWith("//")){
081
082                                                for(int i = 0; i < line.length(); i++){
083                                                        char aa = line.charAt(i);
084                                                        if((aa >= 'A' && aa <= 'Z') || (aa >= 'a' && aa <= 'z' )){
085                                                                sequence.append(aa);
086                                                        }
087                                                }
088                                                line = br.readLine();
089                                        }
090
091                                 //   System.out.println(">" + id);
092                                 //   System.out.println(sequence.toString());
093
094                                        ProteinSequence seq = new ProteinSequence(sequence.toString() );
095                                        seq.setAccession(new AccessionID(id));
096
097                                        seqCodingRegionsList.add(seq);
098                                        sequence = new StringBuffer();
099                                        count++;
100                                        if(count % 100 == 0)
101                                                logger.info("Count: ", count);
102                                        String[] parts = id.split("_");
103                                        uniqueGenes.put(parts[0], "");
104                                        uniqueSpecies.put(parts[1],"");
105                                }
106                                line = br.readLine();
107                        }
108           //     System.out.println("Unique Genes=" + uniqueGenes.size());
109           //     System.out.println("Unique Species=" + uniqueSpecies.size());
110           //     System.out.println("Total sequences=" + seqCodingRegionsList.size());
111                        FastaWriterHelper.writeProteinSequence(new File(fastaFileName), seqCodingRegionsList);
112
113                        br.close();
114                        fr.close();
115
116          //      System.out.println(uniqueGenes.keySet());
117          //      System.out.println("====================");
118          //      System.out.println(uniqueSpecies.keySet());
119
120
121        }
122
123}