001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021package org.biojava.nbio.genome.homology;
022
023import org.biojava.nbio.genome.GeneFeatureHelper;
024import org.biojava.nbio.alignment.Alignments;
025import org.biojava.nbio.alignment.Alignments.PairwiseSequenceAlignerType;
026import org.biojava.nbio.alignment.SimpleGapPenalty;
027import org.biojava.nbio.core.alignment.matrices.SimpleSubstitutionMatrix;
028import org.biojava.nbio.core.alignment.template.SequencePair;
029import org.biojava.nbio.core.sequence.*;
030import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
031import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet;
032import org.biojava.nbio.core.sequence.features.DBReferenceInfo;
033import org.biojava.nbio.core.sequence.features.DatabaseReferenceInterface;
034import org.biojava.nbio.core.sequence.features.FeaturesKeyWordInterface;
035import org.biojava.nbio.core.sequence.loader.UniprotProxySequenceReader;
036import org.slf4j.Logger;
037import org.slf4j.LoggerFactory;
038
039import java.io.File;
040import java.io.FileOutputStream;
041import java.io.OutputStream;
042import java.util.ArrayList;
043import java.util.LinkedHashMap;
044
045/**
046 *
047 * @author Scooter Willis <willishf at gmail dot com>
048 * @author Mark Chapman
049 */
050public class GFF3FromUniprotBlastHits {
051
052        private static final Logger logger = LoggerFactory.getLogger(GFF3FromUniprotBlastHits.class);
053
054        public void process(File xmlBlastHits, double ecutoff, LinkedHashMap<String, GeneSequence> geneSequenceHashMap, OutputStream gff3Output) throws Exception {
055                LinkedHashMap<String, ArrayList<String>> hits = BlastHomologyHits.getMatches(xmlBlastHits, ecutoff);
056                process(hits, geneSequenceHashMap, gff3Output);
057        }
058
059        public void process(LinkedHashMap<String, ArrayList<String>> hits, LinkedHashMap<String, GeneSequence> geneSequenceHashMap, OutputStream gff3Output) throws Exception {
060                int size = hits.size();
061                int index = 0;
062//              HashMap<String, String> scaffoldsReferencedHashMap = new HashMap<String, String>();
063                for (String accessionid : hits.keySet()) {
064                        index++;
065                        if (index == 12) {
066                                index = 12;
067                        }
068                        logger.error(accessionid + " " + index + "/" + size);
069                        try {
070
071                                String[] data = accessionid.split(" ");
072                                String id = data[0];
073                                GeneSequence geneSequence = geneSequenceHashMap.get(id);
074                                if (geneSequence == null) {
075                                        logger.error("Not found " + id);
076                                        continue;
077                                }
078                                ArrayList<String> uniprotProteinHits = hits.get(accessionid);
079                                String uniprotBestHit = uniprotProteinHits.get(0);
080                                UniprotProxySequenceReader<AminoAcidCompound> uniprotSequence = new UniprotProxySequenceReader<AminoAcidCompound>(uniprotBestHit, AminoAcidCompoundSet.getAminoAcidCompoundSet());
081
082                                ProteinSequence proteinSequence = new ProteinSequence(uniprotSequence);
083                                String hitSequence = proteinSequence.getSequenceAsString();
084                                for (TranscriptSequence transcriptSequence : geneSequence.getTranscripts().values()) {
085
086
087                                        String predictedProteinSequence = transcriptSequence.getProteinSequence().getSequenceAsString();
088                                        ArrayList<ProteinSequence> cdsProteinList = transcriptSequence.getProteinCDSSequences();
089
090                                        ArrayList<CDSSequence> cdsSequenceList = new ArrayList<CDSSequence>(transcriptSequence.getCDSSequences().values());
091                                        String testSequence = "";
092                                        for (ProteinSequence cdsProteinSequence : cdsProteinList) {
093                                                testSequence = testSequence + cdsProteinSequence.getSequenceAsString();
094                                        }
095                                        if (!testSequence.equals(predictedProteinSequence) && (!predictedProteinSequence.equals(testSequence.substring(0, testSequence.length() - 1)))) {
096                                                DNASequence codingSequence = transcriptSequence.getDNACodingSequence();
097                                                logger.info("Coding Sequence: {}", codingSequence.getSequenceAsString());
098                                                logger.info("Sequence agreement error");
099                                                logger.info("CDS seq={}", testSequence);
100                                                logger.info("PRE seq={}", predictedProteinSequence);
101                                                logger.info("UNI seq={}", hitSequence);
102                                                //  throw new Exception("Protein Sequence compare error " + id);
103                                        }
104
105                                        SequencePair<ProteinSequence, AminoAcidCompound> alignment = Alignments.getPairwiseAlignment(
106                                                        transcriptSequence.getProteinSequence(), proteinSequence,
107                                                        PairwiseSequenceAlignerType.LOCAL, new SimpleGapPenalty(),
108                                                        SimpleSubstitutionMatrix.getBlosum62()
109                                                        );
110                                        // System.out.println();
111                                        //    System.out.println(alignment.getSummary());
112                                        //   System.out.println(new Pair().format(alignment));
113                                        int proteinIndex = 0;
114                                        int gff3Index = 0;
115                                        for (int i = 0; i < cdsProteinList.size(); i++) {
116                                                ProteinSequence peptideSequence = cdsProteinList.get(i);
117                                                String seq = peptideSequence.getSequenceAsString();
118                                                Integer startIndex = null;
119                                                int offsetStartIndex = 0;
120                                                for (int s = 0; s < seq.length(); s++) {
121                                                        startIndex = alignment.getIndexInTargetForQueryAt(proteinIndex + s);
122                                                        if (startIndex != null) {
123                                                                startIndex = startIndex + 1;
124                                                                offsetStartIndex = s;
125                                                                break;
126                                                        }
127                                                }
128                                                Integer endIndex = null;
129
130                                                int offsetEndIndex = 0;
131                                                for (int e = 0; e < seq.length(); e++) {
132                                                        endIndex = alignment.getIndexInTargetForQueryAt(proteinIndex + seq.length() - 1 - e);
133                                                        if (endIndex != null) {
134                                                                endIndex = endIndex + 1;
135                                                                offsetEndIndex = e;
136                                                                break;
137                                                        }
138                                                }
139
140                                                proteinIndex = proteinIndex + seq.length();
141                                                if (startIndex != null && endIndex != null && startIndex != endIndex) {
142                                                        CDSSequence cdsSequence = cdsSequenceList.get(i);
143                                                        String hitLabel = "";
144                                                        if (transcriptSequence.getStrand() == Strand.POSITIVE) {
145                                                                hitLabel = uniprotBestHit + "_" + startIndex + "_" + endIndex;
146                                                        } else {
147                                                                hitLabel = uniprotBestHit + "_" + endIndex + "_" + startIndex;
148                                                        }
149                                                        int dnaBeginIndex = cdsSequence.getBioBegin() + (3 * offsetStartIndex);
150                                                        int dnaEndIndex = cdsSequence.getBioEnd() - (3 * offsetEndIndex);
151                                                        String scaffold = geneSequence.getParentChromosomeSequence().getAccession().getID();
152                                        //        if (scaffoldsReferencedHashMap.containsKey(scaffold) == false) {
153                                        //            String gff3line = scaffold + "\t" + geneSequence.getSource() + "\t" + "size" + "\t" + "1" + "\t" + geneSequence.getParentChromosomeSequence().getBioEnd() + "\t.\t.\t.\tName=" + scaffold + "\r\n";
154                                        //            gff3Output.write(gff3line.getBytes());
155                                        //            scaffoldsReferencedHashMap.put(scaffold, scaffold);
156                                        //        }
157
158                                                        String line = scaffold + "\t" + geneSequence.getSource() + "_" + "UNIPROT\tmatch\t" + dnaBeginIndex + "\t" + dnaEndIndex + "\t.\t" + transcriptSequence.getStrand().getStringRepresentation() + "\t.\t";
159                                                        if (gff3Index == 0) {
160                                                                FeaturesKeyWordInterface featureKeyWords = proteinSequence.getFeaturesKeyWord();
161                                                                String notes = "";
162                                                                if (featureKeyWords != null) {
163                                                                        ArrayList<String> keyWords = featureKeyWords.getKeyWords();
164                                                                        if (keyWords.size() > 0) {
165                                                                                notes = ";Note=";
166                                                                                for (String note : keyWords) {
167                                                                                        if (note.equals("Complete proteome")) {
168                                                                                                continue;
169                                                                                        }
170                                                                                        if (note.equals("Direct protein sequencing")) {
171                                                                                                continue;
172                                                                                        }
173
174                                                                                        notes = notes + " " + note;
175                                                                                        geneSequence.addNote(note); // add note/keyword which can be output in fasta header if needed
176                                                                                }
177                                                                        }
178
179                                                                }
180
181                                                                DatabaseReferenceInterface databaseReferences = proteinSequence.getDatabaseReferences();
182                                                                if (databaseReferences != null) {
183                                                                        LinkedHashMap<String, ArrayList<DBReferenceInfo>> databaseReferenceHashMap = databaseReferences.getDatabaseReferences();
184                                                                        ArrayList<DBReferenceInfo> pfamList = databaseReferenceHashMap.get("Pfam");
185                                                                        ArrayList<DBReferenceInfo> cazyList = databaseReferenceHashMap.get("CAZy");
186                                                                        ArrayList<DBReferenceInfo> goList = databaseReferenceHashMap.get("GO");
187                                                                        ArrayList<DBReferenceInfo> eccList = databaseReferenceHashMap.get("BRENDA");
188                                                                        if (pfamList != null && pfamList.size() > 0) {
189                                                                                if (notes.length() == 0) {
190                                                                                        notes = ";Note=";
191                                                                                }
192                                                                                for (DBReferenceInfo note : pfamList) {
193                                                                                        notes = notes + " " + note.getId();
194                                                                                        geneSequence.addNote(note.getId()); // add note/keyword which can be output in fasta header if needed
195                                                                                }
196                                                                        }
197
198                                                                        if (cazyList != null && cazyList.size() > 0) {
199                                                                                if (notes.length() == 0) {
200                                                                                        notes = ";Note=";
201                                                                                }
202                                                                                for (DBReferenceInfo note : cazyList) {
203                                                                                        notes = notes + " " + note.getId();
204                                                                                        geneSequence.addNote(note.getId()); // add note/keyword which can be output in fasta header if needed
205                                                                                        // System.out.println("CAZy=" + note);
206                                                                                }
207                                                                        }
208
209                                                                        if (eccList != null && eccList.size() > 0) {
210                                                                                if (notes.length() == 0) {
211                                                                                        notes = ";Note=";
212                                                                                }
213                                                                                for (DBReferenceInfo note : eccList) {
214                                                                                        String dbid = note.getId();
215                                                                                        dbid = dbid.replace(".", "_"); //replace . with _ to facilitate searching in gbrowse
216                                                                                        notes = notes + " " + "EC:" + dbid;
217                                                                                        geneSequence.addNote("EC:" + dbid); // add note/keyword which can be output in fasta header if needed
218
219                                                                                }
220                                                                        }
221
222                                                                        if (goList != null && goList.size() > 0) {
223                                                                                if (notes.length() == 0) {
224                                                                                        notes = ";Note=";
225                                                                                }
226                                                                                for (DBReferenceInfo note : goList) {
227                                                                                        notes = notes + " " + note.getId();
228                                                                                        geneSequence.addNote(note.getId()); // add note/keyword which can be output in fasta header if needed
229                                                                                        LinkedHashMap<String, String> properties = note.getProperties();
230                                                                                        for (String propertytype : properties.keySet()) {
231                                                                                                if (propertytype.equals("evidence")) {
232                                                                                                        continue;
233                                                                                                }
234                                                                                                String property = properties.get(propertytype);
235
236                                                                                                if (property.startsWith("C:")) {
237                                                                                                        continue; // skip over the location
238                                                                                                }
239                                                                                                if (property.endsWith("...")) {
240                                                                                                        property = property.substring(0, property.length() - 3);
241                                                                                                }
242                                                                                                notes = notes + " " + property;
243                                                                                                geneSequence.addNote(property);
244                                                                                        }
245                                                                                }
246                                                                        }
247
248                                                                }
249
250
251                                                                line = line + "Name=" + hitLabel + ";Alias=" + uniprotBestHit + notes + "\n";
252                                                        } else {
253                                                                line = line + "Name=" + hitLabel + "\n";
254                                                        }
255                                                        gff3Index++;
256
257                                                        gff3Output.write(line.getBytes());
258                                                }
259                                        }
260                                }
261                        } catch (Exception e) {
262                                logger.info("Accession Id: {}", accessionid, e);
263                        }
264                }
265
266
267
268        }
269
270
271
272
273        public static void main(String[] args) {
274                /*
275                        try {
276                                LinkedHashMap<String, ChromosomeSequence> dnaSequenceList = GeneFeatureHelper.loadFastaAddGeneFeaturesFromGeneMarkGTF(new File("/Users/Scooter/scripps/dyadic/analysis/454Scaffolds/454Scaffolds.fna"), new File("/Users/Scooter/scripps/dyadic/analysis/454Scaffolds/genemark_hmm.gtf"));
277                                LinkedHashMap<String, GeneSequence> geneSequenceList = GeneFeatureHelper.getGeneSequences(dnaSequenceList.values());
278                                FileOutputStream fo = new FileOutputStream("/Users/Scooter/scripps/dyadic/analysis/454Scaffolds/genemark_uniprot_match.gff3");
279
280                                GFF3FromUniprotBlastHits gff3FromUniprotBlastHits = new GFF3FromUniprotBlastHits();
281                                gff3FromUniprotBlastHits.process(new File("/Users/Scooter/scripps/dyadic/analysis/454Scaffolds/c1-454Scaffolds-hits-uniprot_fungi.xml"), 1E-10, geneSequenceList, fo);
282                                fo.close();
283
284
285                        } catch (Exception e) {
286                                logger.error("Exception: ", e);
287
288
289                        }
290                */
291
292                        try {
293                                LinkedHashMap<String, ChromosomeSequence> dnaSequenceHashMap = GeneFeatureHelper.loadFastaAddGeneFeaturesFromGlimmerGFF3(new File("/Users/Scooter/scripps/dyadic/analysis/454Scaffolds/454Scaffolds-16.fna"), new File("/Users/Scooter/scripps/dyadic/GlimmerHMM/c1_glimmerhmm-16.gff"));
294                                LinkedHashMap<String, GeneSequence> geneSequenceList = GeneFeatureHelper.getGeneSequences(dnaSequenceHashMap.values());
295                                FileOutputStream fo = new FileOutputStream("/Users/Scooter/scripps/dyadic/outputGlimmer/genemark_uniprot_match-16.gff3");
296                                LinkedHashMap<String, ArrayList<String>> blasthits = BlastHomologyHits.getMatches(new File("/Users/Scooter/scripps/dyadic/blastresults/c1_glimmer_in_uniprot.xml"), 1E-10);
297                                logger.error("Number of uniprot hits " + blasthits.size());
298
299                                GFF3FromUniprotBlastHits gff3FromUniprotBlastHits = new GFF3FromUniprotBlastHits();
300                                gff3FromUniprotBlastHits.process(blasthits, geneSequenceList, fo);
301                                fo.close();
302                        } catch (Exception e) {
303                                logger.error("Exception: ", e);
304                        }
305        }
306}