001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021package org.biojava.nbio.genome.homology;
022
023import org.biojava.nbio.genome.GeneFeatureHelper;
024import org.biojava.nbio.alignment.Alignments;
025import org.biojava.nbio.alignment.Alignments.PairwiseSequenceAlignerType;
026import org.biojava.nbio.alignment.SimpleGapPenalty;
027import org.biojava.nbio.core.alignment.matrices.SimpleSubstitutionMatrix;
028import org.biojava.nbio.core.alignment.template.SequencePair;
029import org.biojava.nbio.core.sequence.*;
030import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
031import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet;
032import org.biojava.nbio.core.sequence.features.DBReferenceInfo;
033import org.biojava.nbio.core.sequence.features.DatabaseReferenceInterface;
034import org.biojava.nbio.core.sequence.features.FeaturesKeyWordInterface;
035import org.biojava.nbio.core.sequence.loader.UniprotProxySequenceReader;
036import org.slf4j.Logger;
037import org.slf4j.LoggerFactory;
038
039import java.io.File;
040import java.io.FileOutputStream;
041import java.io.OutputStream;
042import java.util.ArrayList;
043import java.util.LinkedHashMap;
044import java.util.List;
045import java.util.Map;
046
047/**
048 *
049 * @author Scooter Willis <willishf at gmail dot com>
050 * @author Mark Chapman
051 */
052public class GFF3FromUniprotBlastHits {
053
054        private static final Logger logger = LoggerFactory.getLogger(GFF3FromUniprotBlastHits.class);
055
056        public void process(File xmlBlastHits, double ecutoff, LinkedHashMap<String, GeneSequence> geneSequenceHashMap, OutputStream gff3Output) throws Exception {
057                LinkedHashMap<String, ArrayList<String>> hits = BlastHomologyHits.getMatches(xmlBlastHits, ecutoff);
058                process(hits, geneSequenceHashMap, gff3Output);
059        }
060
061        public void process(LinkedHashMap<String, ArrayList<String>> hits, LinkedHashMap<String, GeneSequence> geneSequenceHashMap, OutputStream gff3Output) throws Exception {
062                int size = hits.size();
063                int index = 0;
064//              HashMap<String, String> scaffoldsReferencedHashMap = new HashMap<String, String>();
065                for (String accessionid : hits.keySet()) {
066                        index++;
067                        if (index == 12) {
068                                index = 12;
069                        }
070                        logger.error(accessionid + " " + index + "/" + size);
071                        try {
072
073                                String[] data = accessionid.split(" ");
074                                String id = data[0];
075                                GeneSequence geneSequence = geneSequenceHashMap.get(id);
076                                if (geneSequence == null) {
077                                        logger.error("Not found " + id);
078                                        continue;
079                                }
080                                ArrayList<String> uniprotProteinHits = hits.get(accessionid);
081                                String uniprotBestHit = uniprotProteinHits.get(0);
082                                UniprotProxySequenceReader<AminoAcidCompound> uniprotSequence = new UniprotProxySequenceReader<AminoAcidCompound>(uniprotBestHit, AminoAcidCompoundSet.getAminoAcidCompoundSet());
083
084                                ProteinSequence proteinSequence = new ProteinSequence(uniprotSequence);
085                                String hitSequence = proteinSequence.getSequenceAsString();
086                                for (TranscriptSequence transcriptSequence : geneSequence.getTranscripts().values()) {
087
088
089                                        String predictedProteinSequence = transcriptSequence.getProteinSequence().getSequenceAsString();
090                                        ArrayList<ProteinSequence> cdsProteinList = transcriptSequence.getProteinCDSSequences();
091
092                                        ArrayList<CDSSequence> cdsSequenceList = new ArrayList<CDSSequence>(transcriptSequence.getCDSSequences().values());
093                                        String testSequence = "";
094                                        for (ProteinSequence cdsProteinSequence : cdsProteinList) {
095                                                testSequence = testSequence + cdsProteinSequence.getSequenceAsString();
096                                        }
097                                        if (!testSequence.equals(predictedProteinSequence) && (!predictedProteinSequence.equals(testSequence.substring(0, testSequence.length() - 1)))) {
098                                                DNASequence codingSequence = transcriptSequence.getDNACodingSequence();
099                                                logger.info("Coding Sequence: {}", codingSequence.getSequenceAsString());
100                                                logger.info("Sequence agreement error");
101                                                logger.info("CDS seq={}", testSequence);
102                                                logger.info("PRE seq={}", predictedProteinSequence);
103                                                logger.info("UNI seq={}", hitSequence);
104                                                //  throw new Exception("Protein Sequence compare error " + id);
105                                        }
106
107                                        SequencePair<ProteinSequence, AminoAcidCompound> alignment = Alignments.getPairwiseAlignment(
108                                                        transcriptSequence.getProteinSequence(), proteinSequence,
109                                                        PairwiseSequenceAlignerType.LOCAL, new SimpleGapPenalty(),
110                                                        SimpleSubstitutionMatrix.getBlosum62()
111                                                        );
112                                        // System.out.println();
113                                        //    System.out.println(alignment.getSummary());
114                                        //   System.out.println(new Pair().format(alignment));
115                                        int proteinIndex = 0;
116                                        int gff3Index = 0;
117                                        for (int i = 0; i < cdsProteinList.size(); i++) {
118                                                ProteinSequence peptideSequence = cdsProteinList.get(i);
119                                                String seq = peptideSequence.getSequenceAsString();
120                                                Integer startIndex = null;
121                                                int offsetStartIndex = 0;
122                                                for (int s = 0; s < seq.length(); s++) {
123                                                        startIndex = alignment.getIndexInTargetForQueryAt(proteinIndex + s);
124                                                        if (startIndex != null) {
125                                                                startIndex = startIndex + 1;
126                                                                offsetStartIndex = s;
127                                                                break;
128                                                        }
129                                                }
130                                                Integer endIndex = null;
131
132                                                int offsetEndIndex = 0;
133                                                for (int e = 0; e < seq.length(); e++) {
134                                                        endIndex = alignment.getIndexInTargetForQueryAt(proteinIndex + seq.length() - 1 - e);
135                                                        if (endIndex != null) {
136                                                                endIndex = endIndex + 1;
137                                                                offsetEndIndex = e;
138                                                                break;
139                                                        }
140                                                }
141
142                                                proteinIndex = proteinIndex + seq.length();
143                                                if (startIndex != null && endIndex != null && startIndex != endIndex) {
144                                                        CDSSequence cdsSequence = cdsSequenceList.get(i);
145                                                        String hitLabel = "";
146                                                        if (transcriptSequence.getStrand() == Strand.POSITIVE) {
147                                                                hitLabel = uniprotBestHit + "_" + startIndex + "_" + endIndex;
148                                                        } else {
149                                                                hitLabel = uniprotBestHit + "_" + endIndex + "_" + startIndex;
150                                                        }
151                                                        int dnaBeginIndex = cdsSequence.getBioBegin() + (3 * offsetStartIndex);
152                                                        int dnaEndIndex = cdsSequence.getBioEnd() - (3 * offsetEndIndex);
153                                                        String scaffold = geneSequence.getParentChromosomeSequence().getAccession().getID();
154                                        //        if (scaffoldsReferencedHashMap.containsKey(scaffold) == false) {
155                                        //            String gff3line = scaffold + "\t" + geneSequence.getSource() + "\t" + "size" + "\t" + "1" + "\t" + geneSequence.getParentChromosomeSequence().getBioEnd() + "\t.\t.\t.\tName=" + scaffold + "\r\n";
156                                        //            gff3Output.write(gff3line.getBytes());
157                                        //            scaffoldsReferencedHashMap.put(scaffold, scaffold);
158                                        //        }
159
160                                                        String line = scaffold + "\t" + geneSequence.getSource() + "_" + "UNIPROT\tmatch\t" + dnaBeginIndex + "\t" + dnaEndIndex + "\t.\t" + transcriptSequence.getStrand().getStringRepresentation() + "\t.\t";
161                                                        if (gff3Index == 0) {
162                                                                FeaturesKeyWordInterface featureKeyWords = proteinSequence.getFeaturesKeyWord();
163                                                                String notes = "";
164                                                                if (featureKeyWords != null) {
165                                                                        List<String> keyWords = featureKeyWords.getKeyWords();
166                                                                        if (keyWords.size() > 0) {
167                                                                                notes = ";Note=";
168                                                                                for (String note : keyWords) {
169                                                                                        if (note.equals("Complete proteome")) {
170                                                                                                continue;
171                                                                                        }
172                                                                                        if (note.equals("Direct protein sequencing")) {
173                                                                                                continue;
174                                                                                        }
175
176                                                                                        notes = notes + " " + note;
177                                                                                        geneSequence.addNote(note); // add note/keyword which can be output in fasta header if needed
178                                                                                }
179                                                                        }
180
181                                                                }
182
183                                                                DatabaseReferenceInterface databaseReferences = proteinSequence.getDatabaseReferences();
184                                                                if (databaseReferences != null) {
185                                                                        Map<String, List<DBReferenceInfo>> databaseReferenceHashMap = databaseReferences.getDatabaseReferences();
186                                                                        List<DBReferenceInfo> pfamList = databaseReferenceHashMap.get("Pfam");
187                                                                        List<DBReferenceInfo> cazyList = databaseReferenceHashMap.get("CAZy");
188                                                                        List<DBReferenceInfo> goList = databaseReferenceHashMap.get("GO");
189                                                                        List<DBReferenceInfo> eccList = databaseReferenceHashMap.get("BRENDA");
190                                                                        if (pfamList != null && pfamList.size() > 0) {
191                                                                                if (notes.length() == 0) {
192                                                                                        notes = ";Note=";
193                                                                                }
194                                                                                for (DBReferenceInfo note : pfamList) {
195                                                                                        notes = notes + " " + note.getId();
196                                                                                        geneSequence.addNote(note.getId()); // add note/keyword which can be output in fasta header if needed
197                                                                                }
198                                                                        }
199
200                                                                        if (cazyList != null && cazyList.size() > 0) {
201                                                                                if (notes.length() == 0) {
202                                                                                        notes = ";Note=";
203                                                                                }
204                                                                                for (DBReferenceInfo note : cazyList) {
205                                                                                        notes = notes + " " + note.getId();
206                                                                                        geneSequence.addNote(note.getId()); // add note/keyword which can be output in fasta header if needed
207                                                                                        // System.out.println("CAZy=" + note);
208                                                                                }
209                                                                        }
210
211                                                                        if (eccList != null && eccList.size() > 0) {
212                                                                                if (notes.length() == 0) {
213                                                                                        notes = ";Note=";
214                                                                                }
215                                                                                for (DBReferenceInfo note : eccList) {
216                                                                                        String dbid = note.getId();
217                                                                                        dbid = dbid.replace(".", "_"); //replace . with _ to facilitate searching in gbrowse
218                                                                                        notes = notes + " " + "EC:" + dbid;
219                                                                                        geneSequence.addNote("EC:" + dbid); // add note/keyword which can be output in fasta header if needed
220
221                                                                                }
222                                                                        }
223
224                                                                        if (goList != null && goList.size() > 0) {
225                                                                                if (notes.length() == 0) {
226                                                                                        notes = ";Note=";
227                                                                                }
228                                                                                for (DBReferenceInfo note : goList) {
229                                                                                        notes = notes + " " + note.getId();
230                                                                                        geneSequence.addNote(note.getId()); // add note/keyword which can be output in fasta header if needed
231                                                                                        LinkedHashMap<String, String> properties = note.getProperties();
232                                                                                        for (String propertytype : properties.keySet()) {
233                                                                                                if (propertytype.equals("evidence")) {
234                                                                                                        continue;
235                                                                                                }
236                                                                                                String property = properties.get(propertytype);
237
238                                                                                                if (property.startsWith("C:")) {
239                                                                                                        continue; // skip over the location
240                                                                                                }
241                                                                                                if (property.endsWith("...")) {
242                                                                                                        property = property.substring(0, property.length() - 3);
243                                                                                                }
244                                                                                                notes = notes + " " + property;
245                                                                                                geneSequence.addNote(property);
246                                                                                        }
247                                                                                }
248                                                                        }
249
250                                                                }
251
252
253                                                                line = line + "Name=" + hitLabel + ";Alias=" + uniprotBestHit + notes + "\n";
254                                                        } else {
255                                                                line = line + "Name=" + hitLabel + "\n";
256                                                        }
257                                                        gff3Index++;
258
259                                                        gff3Output.write(line.getBytes());
260                                                }
261                                        }
262                                }
263                        } catch (Exception e) {
264                                logger.info("Accession Id: {}", accessionid, e);
265                        }
266                }
267
268
269
270        }
271
272
273
274
275        public static void main(String[] args) {
276                /*
277                        try {
278                                LinkedHashMap<String, ChromosomeSequence> dnaSequenceList = GeneFeatureHelper.loadFastaAddGeneFeaturesFromGeneMarkGTF(new File("/Users/Scooter/scripps/dyadic/analysis/454Scaffolds/454Scaffolds.fna"), new File("/Users/Scooter/scripps/dyadic/analysis/454Scaffolds/genemark_hmm.gtf"));
279                                LinkedHashMap<String, GeneSequence> geneSequenceList = GeneFeatureHelper.getGeneSequences(dnaSequenceList.values());
280                                FileOutputStream fo = new FileOutputStream("/Users/Scooter/scripps/dyadic/analysis/454Scaffolds/genemark_uniprot_match.gff3");
281
282                                GFF3FromUniprotBlastHits gff3FromUniprotBlastHits = new GFF3FromUniprotBlastHits();
283                                gff3FromUniprotBlastHits.process(new File("/Users/Scooter/scripps/dyadic/analysis/454Scaffolds/c1-454Scaffolds-hits-uniprot_fungi.xml"), 1E-10, geneSequenceList, fo);
284                                fo.close();
285
286
287                        } catch (Exception e) {
288                                logger.error("Exception: ", e);
289
290
291                        }
292                */
293
294                        try {
295                                LinkedHashMap<String, ChromosomeSequence> dnaSequenceHashMap = GeneFeatureHelper.loadFastaAddGeneFeaturesFromGlimmerGFF3(new File("/Users/Scooter/scripps/dyadic/analysis/454Scaffolds/454Scaffolds-16.fna"), new File("/Users/Scooter/scripps/dyadic/GlimmerHMM/c1_glimmerhmm-16.gff"));
296                                LinkedHashMap<String, GeneSequence> geneSequenceList = GeneFeatureHelper.getGeneSequences(dnaSequenceHashMap.values());
297                                FileOutputStream fo = new FileOutputStream("/Users/Scooter/scripps/dyadic/outputGlimmer/genemark_uniprot_match-16.gff3");
298                                LinkedHashMap<String, ArrayList<String>> blasthits = BlastHomologyHits.getMatches(new File("/Users/Scooter/scripps/dyadic/blastresults/c1_glimmer_in_uniprot.xml"), 1E-10);
299                                logger.error("Number of uniprot hits " + blasthits.size());
300
301                                GFF3FromUniprotBlastHits gff3FromUniprotBlastHits = new GFF3FromUniprotBlastHits();
302                                gff3FromUniprotBlastHits.process(blasthits, geneSequenceList, fo);
303                                fo.close();
304                        } catch (Exception e) {
305                                logger.error("Exception: ", e);
306                        }
307        }
308}