001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021package org.biojava.nbio.genome.homology; 022 023import org.biojava.nbio.genome.GeneFeatureHelper; 024import org.biojava.nbio.alignment.Alignments; 025import org.biojava.nbio.alignment.Alignments.PairwiseSequenceAlignerType; 026import org.biojava.nbio.alignment.SimpleGapPenalty; 027import org.biojava.nbio.core.alignment.matrices.SimpleSubstitutionMatrix; 028import org.biojava.nbio.core.alignment.template.SequencePair; 029import org.biojava.nbio.core.sequence.*; 030import org.biojava.nbio.core.sequence.compound.AminoAcidCompound; 031import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet; 032import org.biojava.nbio.core.sequence.features.DBReferenceInfo; 033import org.biojava.nbio.core.sequence.features.DatabaseReferenceInterface; 034import org.biojava.nbio.core.sequence.features.FeaturesKeyWordInterface; 035import org.biojava.nbio.core.sequence.loader.UniprotProxySequenceReader; 036import org.slf4j.Logger; 037import org.slf4j.LoggerFactory; 038 039import java.io.File; 040import java.io.FileOutputStream; 041import java.io.OutputStream; 042import java.util.ArrayList; 043import java.util.LinkedHashMap; 044import java.util.List; 045import java.util.Map; 046 047/** 048 * 049 * @author Scooter Willis <willishf at gmail dot com> 050 * @author Mark Chapman 051 */ 052public class GFF3FromUniprotBlastHits { 053 054 private static final Logger logger = LoggerFactory.getLogger(GFF3FromUniprotBlastHits.class); 055 056 public void process(File xmlBlastHits, double ecutoff, LinkedHashMap<String, GeneSequence> geneSequenceHashMap, OutputStream gff3Output) throws Exception { 057 LinkedHashMap<String, ArrayList<String>> hits = BlastHomologyHits.getMatches(xmlBlastHits, ecutoff); 058 process(hits, geneSequenceHashMap, gff3Output); 059 } 060 061 public void process(LinkedHashMap<String, ArrayList<String>> hits, LinkedHashMap<String, GeneSequence> geneSequenceHashMap, OutputStream gff3Output) throws Exception { 062 int size = hits.size(); 063 int index = 0; 064// HashMap<String, String> scaffoldsReferencedHashMap = new HashMap<String, String>(); 065 for (String accessionid : hits.keySet()) { 066 index++; 067 if (index == 12) { 068 index = 12; 069 } 070 logger.error(accessionid + " " + index + "/" + size); 071 try { 072 073 String[] data = accessionid.split(" "); 074 String id = data[0]; 075 GeneSequence geneSequence = geneSequenceHashMap.get(id); 076 if (geneSequence == null) { 077 logger.error("Not found " + id); 078 continue; 079 } 080 ArrayList<String> uniprotProteinHits = hits.get(accessionid); 081 String uniprotBestHit = uniprotProteinHits.get(0); 082 UniprotProxySequenceReader<AminoAcidCompound> uniprotSequence = new UniprotProxySequenceReader<AminoAcidCompound>(uniprotBestHit, AminoAcidCompoundSet.getAminoAcidCompoundSet()); 083 084 ProteinSequence proteinSequence = new ProteinSequence(uniprotSequence); 085 String hitSequence = proteinSequence.getSequenceAsString(); 086 for (TranscriptSequence transcriptSequence : geneSequence.getTranscripts().values()) { 087 088 089 String predictedProteinSequence = transcriptSequence.getProteinSequence().getSequenceAsString(); 090 ArrayList<ProteinSequence> cdsProteinList = transcriptSequence.getProteinCDSSequences(); 091 092 ArrayList<CDSSequence> cdsSequenceList = new ArrayList<CDSSequence>(transcriptSequence.getCDSSequences().values()); 093 String testSequence = ""; 094 for (ProteinSequence cdsProteinSequence : cdsProteinList) { 095 testSequence = testSequence + cdsProteinSequence.getSequenceAsString(); 096 } 097 if (!testSequence.equals(predictedProteinSequence) && (!predictedProteinSequence.equals(testSequence.substring(0, testSequence.length() - 1)))) { 098 DNASequence codingSequence = transcriptSequence.getDNACodingSequence(); 099 logger.info("Coding Sequence: {}", codingSequence.getSequenceAsString()); 100 logger.info("Sequence agreement error"); 101 logger.info("CDS seq={}", testSequence); 102 logger.info("PRE seq={}", predictedProteinSequence); 103 logger.info("UNI seq={}", hitSequence); 104 // throw new Exception("Protein Sequence compare error " + id); 105 } 106 107 SequencePair<ProteinSequence, AminoAcidCompound> alignment = Alignments.getPairwiseAlignment( 108 transcriptSequence.getProteinSequence(), proteinSequence, 109 PairwiseSequenceAlignerType.LOCAL, new SimpleGapPenalty(), 110 SimpleSubstitutionMatrix.getBlosum62() 111 ); 112 // System.out.println(); 113 // System.out.println(alignment.getSummary()); 114 // System.out.println(new Pair().format(alignment)); 115 int proteinIndex = 0; 116 int gff3Index = 0; 117 for (int i = 0; i < cdsProteinList.size(); i++) { 118 ProteinSequence peptideSequence = cdsProteinList.get(i); 119 String seq = peptideSequence.getSequenceAsString(); 120 Integer startIndex = null; 121 int offsetStartIndex = 0; 122 for (int s = 0; s < seq.length(); s++) { 123 startIndex = alignment.getIndexInTargetForQueryAt(proteinIndex + s); 124 if (startIndex != null) { 125 startIndex = startIndex + 1; 126 offsetStartIndex = s; 127 break; 128 } 129 } 130 Integer endIndex = null; 131 132 int offsetEndIndex = 0; 133 for (int e = 0; e < seq.length(); e++) { 134 endIndex = alignment.getIndexInTargetForQueryAt(proteinIndex + seq.length() - 1 - e); 135 if (endIndex != null) { 136 endIndex = endIndex + 1; 137 offsetEndIndex = e; 138 break; 139 } 140 } 141 142 proteinIndex = proteinIndex + seq.length(); 143 if (startIndex != null && endIndex != null && startIndex != endIndex) { 144 CDSSequence cdsSequence = cdsSequenceList.get(i); 145 String hitLabel = ""; 146 if (transcriptSequence.getStrand() == Strand.POSITIVE) { 147 hitLabel = uniprotBestHit + "_" + startIndex + "_" + endIndex; 148 } else { 149 hitLabel = uniprotBestHit + "_" + endIndex + "_" + startIndex; 150 } 151 int dnaBeginIndex = cdsSequence.getBioBegin() + (3 * offsetStartIndex); 152 int dnaEndIndex = cdsSequence.getBioEnd() - (3 * offsetEndIndex); 153 String scaffold = geneSequence.getParentChromosomeSequence().getAccession().getID(); 154 // if (scaffoldsReferencedHashMap.containsKey(scaffold) == false) { 155 // String gff3line = scaffold + "\t" + geneSequence.getSource() + "\t" + "size" + "\t" + "1" + "\t" + geneSequence.getParentChromosomeSequence().getBioEnd() + "\t.\t.\t.\tName=" + scaffold + "\r\n"; 156 // gff3Output.write(gff3line.getBytes()); 157 // scaffoldsReferencedHashMap.put(scaffold, scaffold); 158 // } 159 160 String line = scaffold + "\t" + geneSequence.getSource() + "_" + "UNIPROT\tmatch\t" + dnaBeginIndex + "\t" + dnaEndIndex + "\t.\t" + transcriptSequence.getStrand().getStringRepresentation() + "\t.\t"; 161 if (gff3Index == 0) { 162 FeaturesKeyWordInterface featureKeyWords = proteinSequence.getFeaturesKeyWord(); 163 String notes = ""; 164 if (featureKeyWords != null) { 165 List<String> keyWords = featureKeyWords.getKeyWords(); 166 if (keyWords.size() > 0) { 167 notes = ";Note="; 168 for (String note : keyWords) { 169 if (note.equals("Complete proteome")) { 170 continue; 171 } 172 if (note.equals("Direct protein sequencing")) { 173 continue; 174 } 175 176 notes = notes + " " + note; 177 geneSequence.addNote(note); // add note/keyword which can be output in fasta header if needed 178 } 179 } 180 181 } 182 183 DatabaseReferenceInterface databaseReferences = proteinSequence.getDatabaseReferences(); 184 if (databaseReferences != null) { 185 Map<String, List<DBReferenceInfo>> databaseReferenceHashMap = databaseReferences.getDatabaseReferences(); 186 List<DBReferenceInfo> pfamList = databaseReferenceHashMap.get("Pfam"); 187 List<DBReferenceInfo> cazyList = databaseReferenceHashMap.get("CAZy"); 188 List<DBReferenceInfo> goList = databaseReferenceHashMap.get("GO"); 189 List<DBReferenceInfo> eccList = databaseReferenceHashMap.get("BRENDA"); 190 if (pfamList != null && pfamList.size() > 0) { 191 if (notes.length() == 0) { 192 notes = ";Note="; 193 } 194 for (DBReferenceInfo note : pfamList) { 195 notes = notes + " " + note.getId(); 196 geneSequence.addNote(note.getId()); // add note/keyword which can be output in fasta header if needed 197 } 198 } 199 200 if (cazyList != null && cazyList.size() > 0) { 201 if (notes.length() == 0) { 202 notes = ";Note="; 203 } 204 for (DBReferenceInfo note : cazyList) { 205 notes = notes + " " + note.getId(); 206 geneSequence.addNote(note.getId()); // add note/keyword which can be output in fasta header if needed 207 // System.out.println("CAZy=" + note); 208 } 209 } 210 211 if (eccList != null && eccList.size() > 0) { 212 if (notes.length() == 0) { 213 notes = ";Note="; 214 } 215 for (DBReferenceInfo note : eccList) { 216 String dbid = note.getId(); 217 dbid = dbid.replace(".", "_"); //replace . with _ to facilitate searching in gbrowse 218 notes = notes + " " + "EC:" + dbid; 219 geneSequence.addNote("EC:" + dbid); // add note/keyword which can be output in fasta header if needed 220 221 } 222 } 223 224 if (goList != null && goList.size() > 0) { 225 if (notes.length() == 0) { 226 notes = ";Note="; 227 } 228 for (DBReferenceInfo note : goList) { 229 notes = notes + " " + note.getId(); 230 geneSequence.addNote(note.getId()); // add note/keyword which can be output in fasta header if needed 231 LinkedHashMap<String, String> properties = note.getProperties(); 232 for (String propertytype : properties.keySet()) { 233 if (propertytype.equals("evidence")) { 234 continue; 235 } 236 String property = properties.get(propertytype); 237 238 if (property.startsWith("C:")) { 239 continue; // skip over the location 240 } 241 if (property.endsWith("...")) { 242 property = property.substring(0, property.length() - 3); 243 } 244 notes = notes + " " + property; 245 geneSequence.addNote(property); 246 } 247 } 248 } 249 250 } 251 252 253 line = line + "Name=" + hitLabel + ";Alias=" + uniprotBestHit + notes + "\n"; 254 } else { 255 line = line + "Name=" + hitLabel + "\n"; 256 } 257 gff3Index++; 258 259 gff3Output.write(line.getBytes()); 260 } 261 } 262 } 263 } catch (Exception e) { 264 logger.info("Accession Id: {}", accessionid, e); 265 } 266 } 267 268 269 270 } 271 272 273 274 275 public static void main(String[] args) { 276 /* 277 try { 278 LinkedHashMap<String, ChromosomeSequence> dnaSequenceList = GeneFeatureHelper.loadFastaAddGeneFeaturesFromGeneMarkGTF(new File("/Users/Scooter/scripps/dyadic/analysis/454Scaffolds/454Scaffolds.fna"), new File("/Users/Scooter/scripps/dyadic/analysis/454Scaffolds/genemark_hmm.gtf")); 279 LinkedHashMap<String, GeneSequence> geneSequenceList = GeneFeatureHelper.getGeneSequences(dnaSequenceList.values()); 280 FileOutputStream fo = new FileOutputStream("/Users/Scooter/scripps/dyadic/analysis/454Scaffolds/genemark_uniprot_match.gff3"); 281 282 GFF3FromUniprotBlastHits gff3FromUniprotBlastHits = new GFF3FromUniprotBlastHits(); 283 gff3FromUniprotBlastHits.process(new File("/Users/Scooter/scripps/dyadic/analysis/454Scaffolds/c1-454Scaffolds-hits-uniprot_fungi.xml"), 1E-10, geneSequenceList, fo); 284 fo.close(); 285 286 287 } catch (Exception e) { 288 logger.error("Exception: ", e); 289 290 291 } 292 */ 293 294 try { 295 LinkedHashMap<String, ChromosomeSequence> dnaSequenceHashMap = GeneFeatureHelper.loadFastaAddGeneFeaturesFromGlimmerGFF3(new File("/Users/Scooter/scripps/dyadic/analysis/454Scaffolds/454Scaffolds-16.fna"), new File("/Users/Scooter/scripps/dyadic/GlimmerHMM/c1_glimmerhmm-16.gff")); 296 LinkedHashMap<String, GeneSequence> geneSequenceList = GeneFeatureHelper.getGeneSequences(dnaSequenceHashMap.values()); 297 FileOutputStream fo = new FileOutputStream("/Users/Scooter/scripps/dyadic/outputGlimmer/genemark_uniprot_match-16.gff3"); 298 LinkedHashMap<String, ArrayList<String>> blasthits = BlastHomologyHits.getMatches(new File("/Users/Scooter/scripps/dyadic/blastresults/c1_glimmer_in_uniprot.xml"), 1E-10); 299 logger.error("Number of uniprot hits " + blasthits.size()); 300 301 GFF3FromUniprotBlastHits gff3FromUniprotBlastHits = new GFF3FromUniprotBlastHits(); 302 gff3FromUniprotBlastHits.process(blasthits, geneSequenceList, fo); 303 fo.close(); 304 } catch (Exception e) { 305 logger.error("Exception: ", e); 306 } 307 } 308}