/*
 * BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence. This should
 * be distributed with the code. If you do not have a copy,
 * see:
 *
 * http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors. These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 * http://www.biojava.org/
 *
 */
package org.biojava.nbio.genome.uniprot;


import org.biojava.nbio.core.sequence.AccessionID;
import org.biojava.nbio.core.sequence.ProteinSequence;
import org.biojava.nbio.core.sequence.io.FastaWriterHelper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;

/**
 * Converts a UniProt flat-file ({@code .dat}) download into a FASTA file.
 *
 * @author Scooter
 */
public class UniprotToFasta {

    private static final Logger logger = LoggerFactory.getLogger(UniprotToFasta.class);

    /**
     * Command-line entry point.
     *
     * @param args optional: {@code args[0]} is the UniProt flat file to read and
     *             {@code args[1]} is the FASTA file to write. Any argument that is
     *             missing falls back to the built-in default file name.
     */
    public static void main(String[] args) {
        try {
            // The defaults keep the behaviour of running with no arguments unchanged.
            String uniprotDatFileName =
                    (args != null && args.length > 0) ? args[0] : "uniprot_trembl_fungi.dat";
            String fastaFileName =
                    (args != null && args.length > 1) ? args[1] : "uniprot__trembel_fungi.faa";
            UniprotToFasta uniprotToFasta = new UniprotToFasta();
            uniprotToFasta.process(uniprotDatFileName, fastaFileName);
        } catch (Exception e) {
            logger.error("Exception: ", e);
        }
    }

    /**
     * Convert a Uniprot sequence file to a fasta file. Allows you to download all
     * sequence data for a species and convert to fasta to be used in a blast database.
     *
     * <p>Each line starting with {@code ID} supplies the identifier for the entry, and
     * each line starting with {@code SQ} is followed by the residue lines, which are
     * read up to the terminating {@code //} line. All sequences are collected in
     * memory and written in one go after the input has been read and closed.
     *
     * @param uniprotDatFileName path of the UniProt flat file to read
     * @param fastaFileName path of the FASTA file to write
     * @throws IOException if the input cannot be read, an {@code ID} line carries no
     *         identifier, or the file ends in the middle of a sequence block
     * @throws Exception if a sequence cannot be constructed or the FASTA file cannot
     *         be written
     */
    public void process(String uniprotDatFileName, String fastaFileName) throws Exception {
        List<ProteinSequence> proteinSequences = new ArrayList<>();

        // try-with-resources guarantees the input is closed even when parsing fails.
        try (BufferedReader reader = new BufferedReader(new FileReader(uniprotDatFileName))) {
            String id = "";
            int count = 0;
            String line;
            while ((line = reader.readLine()) != null) {
                if (line.startsWith("ID")) {
                    id = parseEntryName(line);
                } else if (line.startsWith("SQ")) {
                    // Consumes the residue lines and the closing "//" line.
                    ProteinSequence seq = new ProteinSequence(readSequence(reader, id));
                    seq.setAccession(new AccessionID(id));
                    proteinSequences.add(seq);

                    count++;
                    if (count % 100 == 0) {
                        logger.info("Count: {}", count);
                    }
                }
            }
        }

        FastaWriterHelper.writeProteinSequence(new File(fastaFileName), proteinSequences);
    }

    /**
     * Extracts the identifier from an {@code ID} line: the second whitespace-separated
     * token, i.e. the first token after the {@code ID} line code.
     *
     * @param idLine a line that starts with {@code ID}
     * @return the identifier token
     * @throws IOException if the line has no token after the line code
     */
    private static String parseEntryName(String idLine) throws IOException {
        // Split on runs of whitespace so the result does not depend on the exact
        // number of spaces between the line code and the identifier.
        String[] fields = idLine.trim().split("\\s+");
        if (fields.length < 2) {
            throw new IOException("Malformed ID line (no entry name): '" + idLine + "'");
        }
        return fields[1];
    }

    /**
     * Reads residue lines up to and including the {@code //} terminator and returns
     * the concatenated residues.
     *
     * @param reader reader positioned on the line after the {@code SQ} header
     * @param id identifier of the current entry, used only in the error message
     * @return the ASCII letters found on the residue lines, in order
     * @throws IOException if the end of the file is reached before a {@code //} line
     */
    private static String readSequence(BufferedReader reader, String id) throws IOException {
        StringBuilder sequence = new StringBuilder();
        String line = reader.readLine();
        while (line != null && !line.startsWith("//")) {
            // Keep only ASCII letters; spaces and position counters are dropped.
            for (int i = 0; i < line.length(); i++) {
                char aa = line.charAt(i);
                if ((aa >= 'A' && aa <= 'Z') || (aa >= 'a' && aa <= 'z')) {
                    sequence.append(aa);
                }
            }
            line = reader.readLine();
        }
        if (line == null) {
            // A truncated file must not silently produce a truncated sequence.
            throw new IOException(
                    "Unexpected end of file while reading sequence for entry '" + id
                            + "': missing '//' terminator");
        }
        return sequence.toString();
    }

}