001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021package org.biojava.nbio.genome.homology; 022 023import org.biojava.nbio.genome.GeneFeatureHelper; 024import org.biojava.nbio.alignment.Alignments; 025import org.biojava.nbio.alignment.Alignments.PairwiseSequenceAlignerType; 026import org.biojava.nbio.alignment.SimpleGapPenalty; 027import org.biojava.nbio.core.alignment.matrices.SimpleSubstitutionMatrix; 028import org.biojava.nbio.core.alignment.template.SequencePair; 029import org.biojava.nbio.core.sequence.*; 030import org.biojava.nbio.core.sequence.compound.AminoAcidCompound; 031import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet; 032import org.biojava.nbio.core.sequence.features.DBReferenceInfo; 033import org.biojava.nbio.core.sequence.features.DatabaseReferenceInterface; 034import org.biojava.nbio.core.sequence.features.FeaturesKeyWordInterface; 035import org.biojava.nbio.core.sequence.loader.UniprotProxySequenceReader; 036import org.slf4j.Logger; 037import org.slf4j.LoggerFactory; 038 039import java.io.File; 040import java.io.FileOutputStream; 041import java.io.OutputStream; 042import java.util.ArrayList; 043import java.util.LinkedHashMap; 044 045/** 046 * 047 * @author Scooter Willis <willishf at gmail dot com> 048 * @author Mark Chapman 049 */ 050public class GFF3FromUniprotBlastHits { 051 052 private static final Logger logger = LoggerFactory.getLogger(GFF3FromUniprotBlastHits.class); 053 054 public void process(File xmlBlastHits, double ecutoff, LinkedHashMap<String, GeneSequence> geneSequenceHashMap, OutputStream gff3Output) throws Exception { 055 LinkedHashMap<String, ArrayList<String>> hits = BlastHomologyHits.getMatches(xmlBlastHits, ecutoff); 056 process(hits, geneSequenceHashMap, gff3Output); 057 } 058 059 public void process(LinkedHashMap<String, ArrayList<String>> hits, LinkedHashMap<String, GeneSequence> geneSequenceHashMap, OutputStream gff3Output) throws Exception { 060 int size = hits.size(); 061 int index = 0; 062// HashMap<String, String> scaffoldsReferencedHashMap = new HashMap<String, String>(); 063 for (String accessionid : hits.keySet()) { 064 index++; 065 if (index == 12) { 066 index = 12; 067 } 068 logger.error(accessionid + " " + index + "/" + size); 069 try { 070 071 String[] data = accessionid.split(" "); 072 String id = data[0]; 073 GeneSequence geneSequence = geneSequenceHashMap.get(id); 074 if (geneSequence == null) { 075 logger.error("Not found " + id); 076 continue; 077 } 078 ArrayList<String> uniprotProteinHits = hits.get(accessionid); 079 String uniprotBestHit = uniprotProteinHits.get(0); 080 UniprotProxySequenceReader<AminoAcidCompound> uniprotSequence = new UniprotProxySequenceReader<AminoAcidCompound>(uniprotBestHit, AminoAcidCompoundSet.getAminoAcidCompoundSet()); 081 082 ProteinSequence proteinSequence = new ProteinSequence(uniprotSequence); 083 String hitSequence = proteinSequence.getSequenceAsString(); 084 for (TranscriptSequence transcriptSequence : geneSequence.getTranscripts().values()) { 085 086 087 String predictedProteinSequence = transcriptSequence.getProteinSequence().getSequenceAsString(); 088 ArrayList<ProteinSequence> cdsProteinList = transcriptSequence.getProteinCDSSequences(); 089 090 ArrayList<CDSSequence> cdsSequenceList = new ArrayList<CDSSequence>(transcriptSequence.getCDSSequences().values()); 091 String testSequence = ""; 092 for (ProteinSequence cdsProteinSequence : cdsProteinList) { 093 testSequence = testSequence + cdsProteinSequence.getSequenceAsString(); 094 } 095 if (!testSequence.equals(predictedProteinSequence) && (!predictedProteinSequence.equals(testSequence.substring(0, testSequence.length() - 1)))) { 096 DNASequence codingSequence = transcriptSequence.getDNACodingSequence(); 097 logger.info("Coding Sequence: {}", codingSequence.getSequenceAsString()); 098 logger.info("Sequence agreement error"); 099 logger.info("CDS seq={}", testSequence); 100 logger.info("PRE seq={}", predictedProteinSequence); 101 logger.info("UNI seq={}", hitSequence); 102 // throw new Exception("Protein Sequence compare error " + id); 103 } 104 105 SequencePair<ProteinSequence, AminoAcidCompound> alignment = Alignments.getPairwiseAlignment( 106 transcriptSequence.getProteinSequence(), proteinSequence, 107 PairwiseSequenceAlignerType.LOCAL, new SimpleGapPenalty(), 108 SimpleSubstitutionMatrix.getBlosum62() 109 ); 110 // System.out.println(); 111 // System.out.println(alignment.getSummary()); 112 // System.out.println(new Pair().format(alignment)); 113 int proteinIndex = 0; 114 int gff3Index = 0; 115 for (int i = 0; i < cdsProteinList.size(); i++) { 116 ProteinSequence peptideSequence = cdsProteinList.get(i); 117 String seq = peptideSequence.getSequenceAsString(); 118 Integer startIndex = null; 119 int offsetStartIndex = 0; 120 for (int s = 0; s < seq.length(); s++) { 121 startIndex = alignment.getIndexInTargetForQueryAt(proteinIndex + s); 122 if (startIndex != null) { 123 startIndex = startIndex + 1; 124 offsetStartIndex = s; 125 break; 126 } 127 } 128 Integer endIndex = null; 129 130 int offsetEndIndex = 0; 131 for (int e = 0; e < seq.length(); e++) { 132 endIndex = alignment.getIndexInTargetForQueryAt(proteinIndex + seq.length() - 1 - e); 133 if (endIndex != null) { 134 endIndex = endIndex + 1; 135 offsetEndIndex = e; 136 break; 137 } 138 } 139 140 proteinIndex = proteinIndex + seq.length(); 141 if (startIndex != null && endIndex != null && startIndex != endIndex) { 142 CDSSequence cdsSequence = cdsSequenceList.get(i); 143 String hitLabel = ""; 144 if (transcriptSequence.getStrand() == Strand.POSITIVE) { 145 hitLabel = uniprotBestHit + "_" + startIndex + "_" + endIndex; 146 } else { 147 hitLabel = uniprotBestHit + "_" + endIndex + "_" + startIndex; 148 } 149 int dnaBeginIndex = cdsSequence.getBioBegin() + (3 * offsetStartIndex); 150 int dnaEndIndex = cdsSequence.getBioEnd() - (3 * offsetEndIndex); 151 String scaffold = geneSequence.getParentChromosomeSequence().getAccession().getID(); 152 // if (scaffoldsReferencedHashMap.containsKey(scaffold) == false) { 153 // String gff3line = scaffold + "\t" + geneSequence.getSource() + "\t" + "size" + "\t" + "1" + "\t" + geneSequence.getParentChromosomeSequence().getBioEnd() + "\t.\t.\t.\tName=" + scaffold + "\r\n"; 154 // gff3Output.write(gff3line.getBytes()); 155 // scaffoldsReferencedHashMap.put(scaffold, scaffold); 156 // } 157 158 String line = scaffold + "\t" + geneSequence.getSource() + "_" + "UNIPROT\tmatch\t" + dnaBeginIndex + "\t" + dnaEndIndex + "\t.\t" + transcriptSequence.getStrand().getStringRepresentation() + "\t.\t"; 159 if (gff3Index == 0) { 160 FeaturesKeyWordInterface featureKeyWords = proteinSequence.getFeaturesKeyWord(); 161 String notes = ""; 162 if (featureKeyWords != null) { 163 ArrayList<String> keyWords = featureKeyWords.getKeyWords(); 164 if (keyWords.size() > 0) { 165 notes = ";Note="; 166 for (String note : keyWords) { 167 if (note.equals("Complete proteome")) { 168 continue; 169 } 170 if (note.equals("Direct protein sequencing")) { 171 continue; 172 } 173 174 notes = notes + " " + note; 175 geneSequence.addNote(note); // add note/keyword which can be output in fasta header if needed 176 } 177 } 178 179 } 180 181 DatabaseReferenceInterface databaseReferences = proteinSequence.getDatabaseReferences(); 182 if (databaseReferences != null) { 183 LinkedHashMap<String, ArrayList<DBReferenceInfo>> databaseReferenceHashMap = databaseReferences.getDatabaseReferences(); 184 ArrayList<DBReferenceInfo> pfamList = databaseReferenceHashMap.get("Pfam"); 185 ArrayList<DBReferenceInfo> cazyList = databaseReferenceHashMap.get("CAZy"); 186 ArrayList<DBReferenceInfo> goList = databaseReferenceHashMap.get("GO"); 187 ArrayList<DBReferenceInfo> eccList = databaseReferenceHashMap.get("BRENDA"); 188 if (pfamList != null && pfamList.size() > 0) { 189 if (notes.length() == 0) { 190 notes = ";Note="; 191 } 192 for (DBReferenceInfo note : pfamList) { 193 notes = notes + " " + note.getId(); 194 geneSequence.addNote(note.getId()); // add note/keyword which can be output in fasta header if needed 195 } 196 } 197 198 if (cazyList != null && cazyList.size() > 0) { 199 if (notes.length() == 0) { 200 notes = ";Note="; 201 } 202 for (DBReferenceInfo note : cazyList) { 203 notes = notes + " " + note.getId(); 204 geneSequence.addNote(note.getId()); // add note/keyword which can be output in fasta header if needed 205 // System.out.println("CAZy=" + note); 206 } 207 } 208 209 if (eccList != null && eccList.size() > 0) { 210 if (notes.length() == 0) { 211 notes = ";Note="; 212 } 213 for (DBReferenceInfo note : eccList) { 214 String dbid = note.getId(); 215 dbid = dbid.replace(".", "_"); //replace . with _ to facilitate searching in gbrowse 216 notes = notes + " " + "EC:" + dbid; 217 geneSequence.addNote("EC:" + dbid); // add note/keyword which can be output in fasta header if needed 218 219 } 220 } 221 222 if (goList != null && goList.size() > 0) { 223 if (notes.length() == 0) { 224 notes = ";Note="; 225 } 226 for (DBReferenceInfo note : goList) { 227 notes = notes + " " + note.getId(); 228 geneSequence.addNote(note.getId()); // add note/keyword which can be output in fasta header if needed 229 LinkedHashMap<String, String> properties = note.getProperties(); 230 for (String propertytype : properties.keySet()) { 231 if (propertytype.equals("evidence")) { 232 continue; 233 } 234 String property = properties.get(propertytype); 235 236 if (property.startsWith("C:")) { 237 continue; // skip over the location 238 } 239 if (property.endsWith("...")) { 240 property = property.substring(0, property.length() - 3); 241 } 242 notes = notes + " " + property; 243 geneSequence.addNote(property); 244 } 245 } 246 } 247 248 } 249 250 251 line = line + "Name=" + hitLabel + ";Alias=" + uniprotBestHit + notes + "\n"; 252 } else { 253 line = line + "Name=" + hitLabel + "\n"; 254 } 255 gff3Index++; 256 257 gff3Output.write(line.getBytes()); 258 } 259 } 260 } 261 } catch (Exception e) { 262 logger.info("Accession Id: {}", accessionid, e); 263 } 264 } 265 266 267 268 } 269 270 271 272 273 public static void main(String[] args) { 274 /* 275 try { 276 LinkedHashMap<String, ChromosomeSequence> dnaSequenceList = GeneFeatureHelper.loadFastaAddGeneFeaturesFromGeneMarkGTF(new File("/Users/Scooter/scripps/dyadic/analysis/454Scaffolds/454Scaffolds.fna"), new File("/Users/Scooter/scripps/dyadic/analysis/454Scaffolds/genemark_hmm.gtf")); 277 LinkedHashMap<String, GeneSequence> geneSequenceList = GeneFeatureHelper.getGeneSequences(dnaSequenceList.values()); 278 FileOutputStream fo = new FileOutputStream("/Users/Scooter/scripps/dyadic/analysis/454Scaffolds/genemark_uniprot_match.gff3"); 279 280 GFF3FromUniprotBlastHits gff3FromUniprotBlastHits = new GFF3FromUniprotBlastHits(); 281 gff3FromUniprotBlastHits.process(new File("/Users/Scooter/scripps/dyadic/analysis/454Scaffolds/c1-454Scaffolds-hits-uniprot_fungi.xml"), 1E-10, geneSequenceList, fo); 282 fo.close(); 283 284 285 } catch (Exception e) { 286 logger.error("Exception: ", e); 287 288 289 } 290 */ 291 292 try { 293 LinkedHashMap<String, ChromosomeSequence> dnaSequenceHashMap = GeneFeatureHelper.loadFastaAddGeneFeaturesFromGlimmerGFF3(new File("/Users/Scooter/scripps/dyadic/analysis/454Scaffolds/454Scaffolds-16.fna"), new File("/Users/Scooter/scripps/dyadic/GlimmerHMM/c1_glimmerhmm-16.gff")); 294 LinkedHashMap<String, GeneSequence> geneSequenceList = GeneFeatureHelper.getGeneSequences(dnaSequenceHashMap.values()); 295 FileOutputStream fo = new FileOutputStream("/Users/Scooter/scripps/dyadic/outputGlimmer/genemark_uniprot_match-16.gff3"); 296 LinkedHashMap<String, ArrayList<String>> blasthits = BlastHomologyHits.getMatches(new File("/Users/Scooter/scripps/dyadic/blastresults/c1_glimmer_in_uniprot.xml"), 1E-10); 297 logger.error("Number of uniprot hits " + blasthits.size()); 298 299 GFF3FromUniprotBlastHits gff3FromUniprotBlastHits = new GFF3FromUniprotBlastHits(); 300 gff3FromUniprotBlastHits.process(blasthits, geneSequenceList, fo); 301 fo.close(); 302 } catch (Exception e) { 303 logger.error("Exception: ", e); 304 } 305 } 306}