001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021package org.biojava.nbio.genome; 022 023import org.biojava.nbio.genome.parsers.gff.*; 024import org.biojava.nbio.core.sequence.*; 025import org.biojava.nbio.core.sequence.io.FastaReaderHelper; 026import org.slf4j.Logger; 027import org.slf4j.LoggerFactory; 028 029import java.io.File; 030import java.io.FileWriter; 031import java.util.ArrayList; 032import java.util.Collection; 033import java.util.Collections; 034import java.util.LinkedHashMap; 035 036/** 037 * 038 * @author Scooter Willis <willishf at gmail dot com> 039 */ 040public class GeneFeatureHelper { 041 042 private static final Logger logger = LoggerFactory.getLogger(GeneFeatureHelper.class); 043 044 static public LinkedHashMap<String, ChromosomeSequence> loadFastaAddGeneFeaturesFromUpperCaseExonFastaFile(File fastaSequenceFile, File uppercaseFastaFile, boolean throwExceptionGeneNotFound) throws Exception { 045 LinkedHashMap<String, ChromosomeSequence> chromosomeSequenceList = new LinkedHashMap<String, ChromosomeSequence>(); 046 LinkedHashMap<String, DNASequence> dnaSequenceList = FastaReaderHelper.readFastaDNASequence(fastaSequenceFile); 047 for (String accession : dnaSequenceList.keySet()) { 048 DNASequence contigSequence = dnaSequenceList.get(accession); 049 ChromosomeSequence chromsomeSequence = new ChromosomeSequence(contigSequence.getSequenceAsString()); 050 chromsomeSequence.setAccession(contigSequence.getAccession()); 051 chromosomeSequenceList.put(accession, chromsomeSequence); 052 } 053 054 055 LinkedHashMap<String, DNASequence> geneSequenceList = FastaReaderHelper.readFastaDNASequence(uppercaseFastaFile); 056 for (DNASequence dnaSequence : geneSequenceList.values()) { 057 String geneSequence = dnaSequence.getSequenceAsString(); 058 String lcGeneSequence = geneSequence.toLowerCase(); 059 String reverseGeneSequence = dnaSequence.getReverse().getSequenceAsString(); 060 String lcReverseGeneSequence = reverseGeneSequence.toLowerCase(); 061 Integer bioStart = null; 062 Integer bioEnd = null; 063 Strand strand = Strand.POSITIVE; 064 boolean geneFound = false; 065 String accession = ""; 066 DNASequence contigDNASequence = null; 067 for (String id : dnaSequenceList.keySet()) { 068 accession = id; 069 contigDNASequence = dnaSequenceList.get(id); 070 String contigSequence = contigDNASequence.getSequenceAsString().toLowerCase(); 071 bioStart = contigSequence.indexOf(lcGeneSequence); 072 if (bioStart != -1) { 073 bioStart = bioStart + 1; 074 bioEnd = bioStart + geneSequence.length() - 1; 075 geneFound = true; 076 break; 077 } else { 078 bioStart = contigSequence.indexOf(lcReverseGeneSequence); 079 if (bioStart != -1) { 080 bioStart = bioStart + 1; 081 bioEnd = bioStart - geneSequence.length() - 1; 082 strand = Strand.NEGATIVE; 083 geneFound = true; 084 break; 085 } 086 } 087 } 088 089 if (geneFound) { 090 logger.info("Gene {} found at {} {} {} {}", 091 dnaSequence.getAccession().toString(), contigDNASequence.getAccession().toString(), bioStart, bioEnd, strand); 092 ChromosomeSequence chromosomeSequence = chromosomeSequenceList.get(accession); 093 094 ArrayList<Integer> exonBoundries = new ArrayList<Integer>(); 095 096 //look for transitions from lowercase to upper case 097 for (int i = 0; i < geneSequence.length(); i++) { 098 if (i == 0 && Character.isUpperCase(geneSequence.charAt(i))) { 099 exonBoundries.add(i); 100 } else if (i == geneSequence.length() - 1) { 101 exonBoundries.add(i); 102 } else if (Character.isUpperCase(geneSequence.charAt(i)) && Character.isLowerCase(geneSequence.charAt(i - 1))) { 103 exonBoundries.add(i); 104 } else if (Character.isUpperCase(geneSequence.charAt(i)) && Character.isLowerCase(geneSequence.charAt(i + 1))) { 105 exonBoundries.add(i); 106 } 107 } 108 if (strand == Strand.NEGATIVE) { 109 Collections.reverse(exonBoundries); 110 } 111 112 113 String geneaccession = dnaSequence.getAccession().getID(); 114 String note = geneaccession; 115 String[] values = geneaccession.split(" "); 116 geneaccession = values[0]; 117 118 119 120 GeneSequence geneSeq = chromosomeSequence.addGene(new AccessionID(geneaccession), bioStart, bioEnd, strand); 121 geneSeq.addNote(note); 122 geneSeq.setSource(uppercaseFastaFile.getName()); 123 //String transcriptName = geneaccession + "-transcript"; 124 //TranscriptSequence transcriptSequence = geneSeq.addTranscript(new AccessionID(transcriptName), bioStart, bioEnd); 125 126 int runningFrameLength = 0; 127 for (int i = 0; i < exonBoundries.size() - 1; i = i + 2) { 128 int cdsBioStart = exonBoundries.get(i) + bioStart; 129 int cdsBioEnd = exonBoundries.get(i + 1) + bioStart; 130 runningFrameLength = runningFrameLength + Math.abs(cdsBioEnd - cdsBioStart) + 1; 131 //String cdsName = transcriptName + "-cds-" + cdsBioStart + "-" + cdsBioEnd; 132 133 //AccessionID cdsAccessionID = new AccessionID(cdsName); 134 //ExonSequence exonSequence = geneSeq.addExon(cdsAccessionID, cdsBioStart, cdsBioEnd); 135 int remainder = runningFrameLength % 3; 136 //int frame = 0; 137 if (remainder == 1) { 138 //frame = 2; // borrow 2 from next CDS region 139 } else if (remainder == 2) { 140 //frame = 1; 141 } 142 //CDSSequence cdsSequence = transcriptSequence.addCDS(cdsAccessionID, cdsBioStart, cdsBioEnd, frame); 143 } 144 145 146 } else { 147 if (throwExceptionGeneNotFound) { 148 throw new Exception(dnaSequence.getAccession().toString() + " not found"); 149 } 150 logger.info("Gene not found {}", dnaSequence.getAccession().toString()); 151 } 152 153 } 154 return chromosomeSequenceList; 155 } 156 157 /** 158 * Output a gff3 feature file that will give the length of each scaffold/chromosome in the fasta file. 159 * Used for gbrowse so it knows length. 160 * @param fastaSequenceFile 161 * @param gffFile 162 * @throws Exception 163 */ 164 static public void outputFastaSequenceLengthGFF3(File fastaSequenceFile, File gffFile) throws Exception { 165 LinkedHashMap<String, DNASequence> dnaSequenceList = FastaReaderHelper.readFastaDNASequence(fastaSequenceFile); 166 String fileName = fastaSequenceFile.getName(); 167 FileWriter fw = new FileWriter(gffFile); 168 String newLine = System.getProperty("line.separator"); 169 fw.write("##gff-version 3" + newLine); 170 for (DNASequence dnaSequence : dnaSequenceList.values()) { 171 String gff3line = dnaSequence.getAccession().getID() + "\t" + fileName + "\t" + "contig" + "\t" + "1" + "\t" + dnaSequence.getBioEnd() + "\t.\t.\t.\tName=" + dnaSequence.getAccession().getID() + newLine; 172 fw.write(gff3line); 173 } 174 fw.close(); 175 } 176 177 /** 178 * Loads Fasta file and GFF2 feature file generated from the geneid prediction algorithm 179 * 180 * @param fastaSequenceFile 181 * @param gffFile 182 * @return 183 * @throws Exception 184 */ 185 static public LinkedHashMap<String, ChromosomeSequence> loadFastaAddGeneFeaturesFromGeneIDGFF2(File fastaSequenceFile, File gffFile) throws Exception { 186 LinkedHashMap<String, DNASequence> dnaSequenceList = FastaReaderHelper.readFastaDNASequence(fastaSequenceFile); 187 LinkedHashMap<String, ChromosomeSequence> chromosomeSequenceList = GeneFeatureHelper.getChromosomeSequenceFromDNASequence(dnaSequenceList); 188 FeatureList listGenes = GeneIDGFF2Reader.read(gffFile.getAbsolutePath()); 189 addGeneIDGFF2GeneFeatures(chromosomeSequenceList, listGenes); 190 return chromosomeSequenceList; 191 } 192 193 /** 194 * Load GFF2 feature file generated from the geneid prediction algorithm and map features onto the chromosome sequences 195 * 196 * @param chromosomeSequenceList 197 * @param listGenes 198 * @throws Exception 199 */ 200 static public void addGeneIDGFF2GeneFeatures(LinkedHashMap<String, ChromosomeSequence> chromosomeSequenceList, FeatureList listGenes) throws Exception { 201 Collection<String> geneIds = listGenes.attributeValues("gene_id"); 202 for (String geneid : geneIds) { 203 FeatureList gene = listGenes.selectByAttribute("gene_id", geneid); 204 FeatureI geneFeature = gene.get(0); 205 ChromosomeSequence seq = chromosomeSequenceList.get(geneFeature.seqname()); 206 geneid = geneid.replaceAll("_", ".G"); 207 AccessionID geneAccessionID = new AccessionID(geneid); 208 GeneSequence geneSequence = null; 209 Collection<String> transcriptids = gene.attributeValues("gene_id"); 210 for (String transcriptid : transcriptids) { 211 // get all the individual features (exons, CDS regions, etc.) of this gene 212 FeatureList transcriptFeature = listGenes.selectByAttribute("gene_id", transcriptid); 213 transcriptid = transcriptid.replaceAll("_", ".G"); 214 215 216 217 218 // String seqName = feature.seqname(); 219 //FeatureI startCodon = null; 220 //FeatureI stopCodon = null; 221 Integer startCodonBegin = null; 222 Integer stopCodonEnd = null; 223 //String startCodonName = ""; 224 //String stopCodonName = ""; 225 226 227 // now select only the coding regions of this gene 228 FeatureList firstFeatures = transcriptFeature.selectByType("First"); 229 FeatureList terminalFeatures = transcriptFeature.selectByType("Terminal"); 230 FeatureList internalFeatures = transcriptFeature.selectByType("Internal"); 231 FeatureList singleFeatures = transcriptFeature.selectByType("Single"); 232 FeatureList cdsFeatures = new FeatureList(); 233 cdsFeatures.add(firstFeatures); 234 cdsFeatures.add(terminalFeatures); 235 cdsFeatures.add(internalFeatures); 236 cdsFeatures.add(singleFeatures); 237 // sort them 238 cdsFeatures = cdsFeatures.sortByStart(); 239 Strand strand = Strand.POSITIVE; 240 FeatureI feature = cdsFeatures.get(0); 241 if (feature.location().isNegative()) { 242 strand = Strand.NEGATIVE; 243 } 244 if (startCodonBegin == null) { 245 FeatureI firstFeature = cdsFeatures.get(0); 246 if (strand == Strand.NEGATIVE) { 247 startCodonBegin = firstFeature.location().bioEnd(); 248 } else { 249 startCodonBegin = firstFeature.location().bioStart(); 250 } 251 } 252 253 if (stopCodonEnd == null) { 254 255 FeatureI lastFeature = cdsFeatures.get(cdsFeatures.size() - 1); 256 if (strand == Strand.NEGATIVE) { 257 stopCodonEnd = lastFeature.location().bioStart(); 258 } else { 259 stopCodonEnd = lastFeature.location().bioEnd(); 260 } 261 } 262 //for gtf ordering can be strand based so first is last and last is first 263 if (startCodonBegin > stopCodonEnd) { 264 int temp = startCodonBegin; 265 startCodonBegin = stopCodonEnd; 266 stopCodonEnd = temp; 267 } 268 269 AccessionID transcriptAccessionID = new AccessionID(transcriptid); 270 if (geneSequence == null) { 271 geneSequence = seq.addGene(geneAccessionID, startCodonBegin, stopCodonEnd, strand); 272 geneSequence.setSource(((Feature) feature).source()); 273 } else { 274 //if multiple transcripts for one gene make sure the gene is defined as the min and max start/end 275 276 if (startCodonBegin < geneSequence.getBioBegin()) { 277 geneSequence.setBioBegin(startCodonBegin); 278 } 279 if (stopCodonEnd > geneSequence.getBioBegin()) { 280 geneSequence.setBioEnd(stopCodonEnd); 281 } 282 283 } 284 TranscriptSequence transcriptSequence = geneSequence.addTranscript(transcriptAccessionID, startCodonBegin, stopCodonEnd); 285 /* 286 if (startCodon != null) { 287 if (startCodonName == null || startCodonName.length() == 0) { 288 startCodonName = transcriptid + "-start_codon-" + startCodon.location().bioStart() + "-" + startCodon.location().bioEnd(); 289 } 290 transcriptSequence.addStartCodonSequence(new AccessionID(startCodonName), startCodon.location().bioStart(), startCodon.location().bioEnd()); 291 } 292 if (stopCodon != null) { 293 if (stopCodonName == null || stopCodonName.length() == 0) { 294 stopCodonName = transcriptid + "-stop_codon-" + stopCodon.location().bioStart() + "-" + stopCodon.location().bioEnd(); 295 } 296 transcriptSequence.addStopCodonSequence(new AccessionID(stopCodonName), stopCodon.location().bioStart(), stopCodon.location().bioEnd()); 297 } 298 */ 299 300 for (FeatureI cdsFeature : cdsFeatures) { 301 Feature cds = (Feature) cdsFeature; 302 String cdsName = cds.getAttribute("transcript_name"); 303 if (cdsName == null || cdsName.length() == 0) { 304 cdsName = transcriptid + "-cds-" + cds.location().bioStart() + "-" + cds.location().bioEnd(); 305 } 306 AccessionID cdsAccessionID = new AccessionID(cdsName); 307 //ExonSequence exonSequence = geneSequence.addExon(cdsAccessionID, cdsFeature.location().bioStart(), cdsFeature.location().bioEnd()); 308 CDSSequence cdsSequence = transcriptSequence.addCDS(cdsAccessionID, cdsFeature.location().bioStart(), cdsFeature.location().bioEnd(), cds.frame()); 309 cdsSequence.setSequenceScore(cds.score()); 310 } 311 } 312 } 313 314 } 315 316 static public LinkedHashMap<String, ChromosomeSequence> getChromosomeSequenceFromDNASequence(LinkedHashMap<String, DNASequence> dnaSequenceList) { 317 LinkedHashMap<String, ChromosomeSequence> chromosomeSequenceList = new LinkedHashMap<String, ChromosomeSequence>(); 318 for (String key : dnaSequenceList.keySet()) { 319 DNASequence dnaSequence = dnaSequenceList.get(key); 320 ChromosomeSequence chromosomeSequence = new ChromosomeSequence(dnaSequence.getProxySequenceReader()); //we want the underlying sequence but don't need storage 321 chromosomeSequence.setAccession(dnaSequence.getAccession()); 322 chromosomeSequenceList.put(key, chromosomeSequence); 323 } 324 return chromosomeSequenceList; 325 } 326 327 /** 328 * Lots of variations in the ontology or descriptors that can be used in GFF3 which requires writing a custom parser to handle a GFF3 generated or used 329 * by a specific application. Probably could be abstracted out but for now easier to handle with custom code to deal with gff3 elements that are not 330 * included but can be extracted from other data elements. 331 * @param fastaSequenceFile 332 * @param gffFile 333 * @param lazyloadsequences If set to true then the fasta file will be parsed for accession id but sequences will be read from disk when needed to save memory 334 * @return 335 * @throws Exception 336 */ 337 static public LinkedHashMap<String, ChromosomeSequence> loadFastaAddGeneFeaturesFromGmodGFF3(File fastaSequenceFile, File gffFile,boolean lazyloadsequences) throws Exception { 338 LinkedHashMap<String, DNASequence> dnaSequenceList = FastaReaderHelper.readFastaDNASequence(fastaSequenceFile,lazyloadsequences); 339 LinkedHashMap<String, ChromosomeSequence> chromosomeSequenceList = GeneFeatureHelper.getChromosomeSequenceFromDNASequence(dnaSequenceList); 340 FeatureList listGenes = GFF3Reader.read(gffFile.getAbsolutePath()); 341 addGmodGFF3GeneFeatures(chromosomeSequenceList, listGenes); 342 return chromosomeSequenceList; 343 } 344 345 /** 346 * Load GFF3 file using mRNA as the gene feature as not all GFF3 files are complete 347 * @param chromosomeSequenceList 348 * @param listGenes 349 * @throws Exception 350 */ 351 static public void addGmodGFF3GeneFeatures(LinkedHashMap<String, ChromosomeSequence> chromosomeSequenceList, FeatureList listGenes) throws Exception { 352 353 354 // key off mRNA as being a known feature that may or may not have a parent gene 355 356 357 FeatureList mRNAFeatures = listGenes.selectByType("mRNA"); 358 LinkedHashMap<String,FeatureList> featureIDHashMap = FeatureHelper.buildFeatureAtrributeIndex("ID", listGenes); 359 LinkedHashMap<String,FeatureList> featureParentHashMap = FeatureHelper.buildFeatureAtrributeIndex("Parent", listGenes); 360 361 for (FeatureI f : mRNAFeatures) { 362 String geneID; 363 String geneNote = null; 364 String geneSource = null; 365 String sequenceName = null; 366 ChromosomeSequence seq = null; 367 GeneSequence geneSequence = null; 368 369 Feature mRNAFeature = (Feature) f; 370 String mRNAID = mRNAFeature.getAttribute("ID"); 371 String mRNAsource = mRNAFeature.source(); 372 String mRNANote = mRNAFeature.getAttribute("Note"); 373 String mRNAParent = mRNAFeature.getAttribute("Parent"); 374 if (mRNAParent != null && mRNAParent.length() > 0) { 375 // FeatureList geneFeatureList = listGenes.selectByAttribute("ID", mRNAParent); 376 FeatureList geneFeatureList = featureIDHashMap.get(mRNAParent); 377 Feature geneFeature = (Feature) geneFeatureList.get(0); 378 geneID = geneFeature.getAttribute("ID"); 379 geneNote = geneFeature.getAttribute("Note"); 380 geneSource = geneFeature.source(); 381 sequenceName = geneFeature.seqname(); 382 383 // 384 } else { 385 //deal with cases where no parent gene is given 386 geneID = mRNAID; 387 geneSource = mRNAsource; 388 sequenceName = mRNAFeature.seqname(); 389 } 390 391 seq = chromosomeSequenceList.get(sequenceName); 392 393 AccessionID geneAccessionID = new AccessionID(geneID); 394 395 // FeatureList mRNAChildren = listGenes.selectByAttribute("Parent", mRNAID); 396 FeatureList mRNAChildren = featureParentHashMap.get(mRNAID); 397 FeatureList cdsFeatures = mRNAChildren.selectByType("CDS"); 398 FeatureI feature = cdsFeatures.get(0); 399 Strand strand = Strand.POSITIVE; 400 401 if (feature.location().isNegative()) { 402 strand = Strand.NEGATIVE; 403 } 404 cdsFeatures = cdsFeatures.sortByStart(); 405 406 407 408 409 410 411 412 //String seqName = feature.seqname(); 413 FeatureI startCodon = null; 414 FeatureI stopCodon = null; 415 Integer startCodonBegin = null; 416 Integer stopCodonEnd = null; 417 String startCodonName = ""; 418 String stopCodonName = ""; 419 FeatureList startCodonList = mRNAChildren.selectByType("five_prime_UTR"); 420 if (startCodonList != null && startCodonList.size() > 0) { 421 startCodon = startCodonList.get(0); 422 if (strand == Strand.NEGATIVE) { 423 startCodonBegin = startCodon.location().bioEnd(); 424 } else { 425 startCodonBegin = startCodon.location().bioStart(); 426 } 427 startCodonName = startCodon.getAttribute("ID"); 428 } 429 430 FeatureList stopCodonList = mRNAChildren.selectByType("three_prime_UTR"); 431 432 if (stopCodonList != null && stopCodonList.size() > 0) { 433 stopCodon = stopCodonList.get(0); 434 if (strand == Strand.NEGATIVE) { 435 stopCodonEnd = stopCodon.location().bioStart(); 436 } else { 437 stopCodonEnd = stopCodon.location().bioEnd(); 438 } 439 stopCodonName = stopCodon.getAttribute("ID"); 440 441 } 442 443 444 445 446 if (startCodonBegin == null) { 447 if (strand == Strand.NEGATIVE) { 448 FeatureI firstFeature = cdsFeatures.get(0); 449 450 startCodonBegin = firstFeature.location().bioEnd(); 451 } else { 452 FeatureI firstFeature = cdsFeatures.get(0); 453 454 startCodonBegin = firstFeature.location().bioStart(); 455 } 456 } 457 458 if (stopCodonEnd == null) { 459 if (strand == Strand.NEGATIVE) { 460 FeatureI lastFeature = cdsFeatures.get(cdsFeatures.size() - 1); 461 stopCodonEnd = lastFeature.location().bioStart(); 462 } else { 463 FeatureI lastFeature = cdsFeatures.get(cdsFeatures.size() - 1); 464 stopCodonEnd = lastFeature.location().bioEnd(); 465 } 466 } 467 //for gtf ordering can be strand based so first is last and last is first 468 if (startCodonBegin > stopCodonEnd) { 469 int temp = startCodonBegin; 470 startCodonBegin = stopCodonEnd; 471 stopCodonEnd = temp; 472 } 473 474 475 476 AccessionID transcriptAccessionID = new AccessionID(mRNAID); 477 geneSequence = seq.getGene(geneID); 478 if (geneSequence == null) { 479 geneSequence = seq.addGene(geneAccessionID, startCodonBegin, stopCodonEnd, strand); 480 geneSequence.setSource(geneSource); 481 if (geneNote != null && geneNote.length() > 0) { 482 geneSequence.addNote(geneNote); 483 } 484 } else { 485 486 if (startCodonBegin < geneSequence.getBioBegin()) { 487 geneSequence.setBioBegin(startCodonBegin); 488 } 489 if (stopCodonEnd > geneSequence.getBioBegin()) { 490 geneSequence.setBioEnd(stopCodonEnd); 491 } 492 493 } 494 TranscriptSequence transcriptSequence = geneSequence.addTranscript(transcriptAccessionID, startCodonBegin, stopCodonEnd); 495 transcriptSequence.setSource(mRNAsource); 496 if (mRNANote != null && mRNANote.length() > 0) { 497 transcriptSequence.addNote(mRNANote); 498 499 } 500 if (startCodon != null) { 501 if (startCodonName == null || startCodonName.length() == 0) { 502 startCodonName = mRNAID + "-start_codon-" + startCodon.location().bioStart() + "-" + startCodon.location().bioEnd(); 503 } 504 transcriptSequence.addStartCodonSequence(new AccessionID(startCodonName), startCodon.location().bioStart(), startCodon.location().bioEnd()); 505 } 506 if (stopCodon != null) { 507 if (stopCodonName == null || stopCodonName.length() == 0) { 508 stopCodonName = mRNAID + "-stop_codon-" + stopCodon.location().bioStart() + "-" + stopCodon.location().bioEnd(); 509 } 510 transcriptSequence.addStopCodonSequence(new AccessionID(stopCodonName), stopCodon.location().bioStart(), stopCodon.location().bioEnd()); 511 } 512 513 for (FeatureI cdsFeature : cdsFeatures) { 514 Feature cds = (Feature) cdsFeature; 515 String cdsNote = cdsFeature.getAttribute("Note"); 516 String cdsSource = cds.source(); 517 String cdsName = cds.getAttribute("ID"); 518 if (cdsName == null || cdsName.length() == 0) { 519 cdsName = mRNAID + "-cds-" + cds.location().bioStart() + "-" + cds.location().bioEnd(); 520 } 521 AccessionID cdsAccessionID = new AccessionID(cdsName); 522 ExonSequence exonSequence = geneSequence.addExon(cdsAccessionID, cdsFeature.location().bioStart(), cdsFeature.location().bioEnd()); 523 exonSequence.setSource(cdsSource); 524 if (cdsNote != null && cdsNote.length() > 0) { 525 exonSequence.addNote(cdsNote); 526 } 527 transcriptSequence.addCDS(cdsAccessionID, cdsFeature.location().bioStart(), cdsFeature.location().bioEnd(), cds.frame()); 528 } 529 geneSequence.addIntronsUsingExons(); 530 531 } 532 533 } 534 535 static public LinkedHashMap<String, ChromosomeSequence> loadFastaAddGeneFeaturesFromGlimmerGFF3(File fastaSequenceFile, File gffFile) throws Exception { 536 LinkedHashMap<String, DNASequence> dnaSequenceList = FastaReaderHelper.readFastaDNASequence(fastaSequenceFile); 537 LinkedHashMap<String, ChromosomeSequence> chromosomeSequenceList = GeneFeatureHelper.getChromosomeSequenceFromDNASequence(dnaSequenceList); 538 FeatureList listGenes = GFF3Reader.read(gffFile.getAbsolutePath()); 539 addGlimmerGFF3GeneFeatures(chromosomeSequenceList, listGenes); 540 return chromosomeSequenceList; 541 } 542 543 static public void addGlimmerGFF3GeneFeatures(LinkedHashMap<String, ChromosomeSequence> chromosomeSequenceList, FeatureList listGenes) throws Exception { 544 FeatureList mRNAFeatures = listGenes.selectByType("mRNA"); 545 for (FeatureI f : mRNAFeatures) { 546 Feature mRNAFeature = (Feature) f; 547 String geneid = mRNAFeature.getAttribute("ID"); 548 String source = mRNAFeature.source(); 549 550 FeatureList gene = listGenes.selectByAttribute("Parent", geneid); 551 FeatureI geneFeature = gene.get(0); 552 ChromosomeSequence seq = chromosomeSequenceList.get(geneFeature.seqname()); 553 AccessionID geneAccessionID = new AccessionID(geneid); 554 GeneSequence geneSequence = null; 555 556 FeatureList cdsFeatures = gene.selectByType("CDS"); 557 FeatureI feature = cdsFeatures.get(0); 558 Strand strand = Strand.POSITIVE; 559 560 if (feature.location().isNegative()) { 561 strand = Strand.NEGATIVE; 562 } 563 cdsFeatures = cdsFeatures.sortByStart(); 564 565 566 567 568 569 570 571 //String seqName = feature.seqname(); 572 FeatureI startCodon = null; 573 FeatureI stopCodon = null; 574 Integer startCodonBegin = null; 575 Integer stopCodonEnd = null; 576 String startCodonName = ""; 577 String stopCodonName = ""; 578 FeatureList startCodonList = gene.selectByAttribute("Note", "initial-exon"); 579 if (startCodonList != null && startCodonList.size() > 0) { 580 startCodon = startCodonList.get(0); 581 if (strand == Strand.NEGATIVE) { 582 startCodonBegin = startCodon.location().bioEnd(); 583 } else { 584 startCodonBegin = startCodon.location().bioStart(); 585 } 586 startCodonName = startCodon.getAttribute("ID"); 587 } 588 589 FeatureList stopCodonList = gene.selectByAttribute("Note", "final-exon"); 590 591 if (stopCodonList != null && stopCodonList.size() > 0) { 592 stopCodon = stopCodonList.get(0); 593 if (strand == Strand.NEGATIVE) { 594 stopCodonEnd = stopCodon.location().bioStart(); 595 } else { 596 stopCodonEnd = stopCodon.location().bioEnd(); 597 } 598 stopCodonName = stopCodon.getAttribute("ID"); 599 600 } 601 602 603 604 605 if (startCodonBegin == null) { 606 if (strand == Strand.NEGATIVE) { 607 FeatureI firstFeature = cdsFeatures.get(0); 608 609 startCodonBegin = firstFeature.location().bioEnd(); 610 } else { 611 FeatureI firstFeature = cdsFeatures.get(0); 612 613 startCodonBegin = firstFeature.location().bioStart(); 614 } 615 } 616 617 if (stopCodonEnd == null) { 618 if (strand == Strand.NEGATIVE) { 619 FeatureI lastFeature = cdsFeatures.get(cdsFeatures.size() - 1); 620 stopCodonEnd = lastFeature.location().bioStart(); 621 } else { 622 FeatureI lastFeature = cdsFeatures.get(cdsFeatures.size() - 1); 623 stopCodonEnd = lastFeature.location().bioEnd(); 624 } 625 } 626 //for gtf ordering can be strand based so first is last and last is first 627 if (startCodonBegin > stopCodonEnd) { 628 int temp = startCodonBegin; 629 startCodonBegin = stopCodonEnd; 630 stopCodonEnd = temp; 631 } 632 633 634 635 AccessionID transcriptAccessionID = new AccessionID(geneid); 636 if (geneSequence == null) { 637 geneSequence = seq.addGene(geneAccessionID, startCodonBegin, stopCodonEnd, strand); 638 geneSequence.setSource(source); 639 } 640 /* 641 else { 642 643 if (startCodonBegin < geneSequence.getBioBegin()) { 644 geneSequence.setBioBegin(startCodonBegin); 645 } 646 if (stopCodonEnd > geneSequence.getBioBegin()) { 647 geneSequence.setBioEnd(stopCodonEnd); 648 } 649 } 650 */ 651 TranscriptSequence transcriptSequence = geneSequence.addTranscript(transcriptAccessionID, startCodonBegin, stopCodonEnd); 652 if (startCodon != null) { 653 if (startCodonName == null || startCodonName.length() == 0) { 654 startCodonName = geneid + "-start_codon-" + startCodon.location().bioStart() + "-" + startCodon.location().bioEnd(); 655 } 656 transcriptSequence.addStartCodonSequence(new AccessionID(startCodonName), startCodon.location().bioStart(), startCodon.location().bioEnd()); 657 } 658 if (stopCodon != null) { 659 if (stopCodonName == null || stopCodonName.length() == 0) { 660 stopCodonName = geneid + "-stop_codon-" + stopCodon.location().bioStart() + "-" + stopCodon.location().bioEnd(); 661 } 662 transcriptSequence.addStopCodonSequence(new AccessionID(stopCodonName), stopCodon.location().bioStart(), stopCodon.location().bioEnd()); 663 } 664 665 for (FeatureI cdsFeature : cdsFeatures) { 666 Feature cds = (Feature) cdsFeature; 667 String cdsName = cds.getAttribute("ID"); 668 if (cdsName == null || cdsName.length() == 0) { 669 cdsName = geneid + "-cds-" + cds.location().bioStart() + "-" + cds.location().bioEnd(); 670 } 671 AccessionID cdsAccessionID = new AccessionID(cdsName); 672 //ExonSequence exonSequence = geneSequence.addExon(cdsAccessionID, cdsFeature.location().bioStart(), cdsFeature.location().bioEnd()); 673 transcriptSequence.addCDS(cdsAccessionID, cdsFeature.location().bioStart(), cdsFeature.location().bioEnd(), cds.frame()); 674 } 675 676 } 677 678 } 679 680 static public LinkedHashMap<String, ChromosomeSequence> loadFastaAddGeneFeaturesFromGeneMarkGTF(File fastaSequenceFile, File gffFile) throws Exception { 681 LinkedHashMap<String, DNASequence> dnaSequenceList = FastaReaderHelper.readFastaDNASequence(fastaSequenceFile); 682 LinkedHashMap<String, ChromosomeSequence> chromosomeSequenceList = GeneFeatureHelper.getChromosomeSequenceFromDNASequence(dnaSequenceList); 683 FeatureList listGenes = GeneMarkGTFReader.read(gffFile.getAbsolutePath()); 684 addGeneMarkGTFGeneFeatures(chromosomeSequenceList, listGenes); 685 return chromosomeSequenceList; 686 } 687 688 static public void addGeneMarkGTFGeneFeatures(LinkedHashMap<String, ChromosomeSequence> chromosomeSequenceList, FeatureList listGenes) throws Exception { 689 Collection<String> geneIds = listGenes.attributeValues("gene_id"); 690 for (String geneid : geneIds) { 691 // if(geneid.equals("45_g")){ 692 // int dummy =1; 693 // } 694 FeatureList gene = listGenes.selectByAttribute("gene_id", geneid); 695 FeatureI geneFeature = gene.get(0); 696 ChromosomeSequence seq = chromosomeSequenceList.get(geneFeature.seqname()); 697 AccessionID geneAccessionID = new AccessionID(geneid); 698 GeneSequence geneSequence = null; 699 Collection<String> transcriptids = gene.attributeValues("transcript_id"); 700 for (String transcriptid : transcriptids) { 701 // get all the individual features (exons, CDS regions, etc.) of this gene 702 703 704 FeatureList transcriptFeature = listGenes.selectByAttribute("transcript_id", transcriptid); 705 // now select only the coding regions of this gene 706 FeatureList cdsFeatures = transcriptFeature.selectByType("CDS"); 707 // sort them 708 cdsFeatures = cdsFeatures.sortByStart(); 709 710 FeatureI feature = cdsFeatures.get(0); 711 Strand strand = Strand.POSITIVE; 712 713 if (feature.location().isNegative()) { 714 strand = Strand.NEGATIVE; 715 } 716 717 //String seqName = feature.seqname(); 718 FeatureI startCodon = null; 719 FeatureI stopCodon = null; 720 Integer startCodonBegin = null; 721 Integer stopCodonEnd = null; 722 String startCodonName = ""; 723 String stopCodonName = ""; 724 FeatureList startCodonList = transcriptFeature.selectByType("start_codon"); 725 if (startCodonList != null && startCodonList.size() > 0) { 726 startCodon = startCodonList.get(0); 727 if (strand == Strand.POSITIVE) { 728 startCodonBegin = startCodon.location().bioStart(); 729 } else { 730 startCodonBegin = startCodon.location().bioEnd(); 731 } 732 startCodonName = startCodon.getAttribute("transcript_name"); 733 } 734 735 FeatureList stopCodonList = transcriptFeature.selectByType("stop_codon"); 736 737 if (stopCodonList != null && stopCodonList.size() > 0) { 738 stopCodon = stopCodonList.get(0); 739 if (strand == Strand.POSITIVE) { 740 stopCodonEnd = stopCodon.location().bioEnd(); 741 } else { 742 stopCodonEnd = stopCodon.location().bioStart(); 743 } 744 745 stopCodonName = stopCodon.getAttribute("transcript_name"); 746 747 } 748 749 750 751 752 if (startCodonBegin == null) { 753 if (strand == Strand.NEGATIVE) { 754 FeatureI firstFeature = cdsFeatures.get(0); 755 756 startCodonBegin = firstFeature.location().bioEnd(); 757 } else { 758 FeatureI firstFeature = cdsFeatures.get(0); 759 760 startCodonBegin = firstFeature.location().bioStart(); 761 } 762 } 763 764 if (stopCodonEnd == null) { 765 if (strand == Strand.NEGATIVE) { 766 FeatureI lastFeature = cdsFeatures.get(cdsFeatures.size() - 1); 767 stopCodonEnd = lastFeature.location().bioStart(); 768 } else { 769 FeatureI lastFeature = cdsFeatures.get(cdsFeatures.size() - 1); 770 stopCodonEnd = lastFeature.location().bioEnd(); 771 } 772 } 773 //for gtf ordering can be strand based so first is last and last is first 774 if (startCodonBegin > stopCodonEnd) { 775 int temp = startCodonBegin; 776 startCodonBegin = stopCodonEnd; 777 stopCodonEnd = temp; 778 } 779 780 AccessionID transcriptAccessionID = new AccessionID(transcriptid); 781 if (geneSequence == null) { 782 geneSequence = seq.addGene(geneAccessionID, startCodonBegin, stopCodonEnd, strand); 783 geneSequence.setSource(((Feature) feature).source()); 784 } else { 785 //if multiple transcripts for one gene make sure the gene is defined as the min and max start/end 786 787 if (startCodonBegin < geneSequence.getBioBegin()) { 788 geneSequence.setBioBegin(startCodonBegin); 789 } 790 if (stopCodonEnd > geneSequence.getBioBegin()) { 791 geneSequence.setBioEnd(stopCodonEnd); 792 } 793 794 } 795 TranscriptSequence transcriptSequence = geneSequence.addTranscript(transcriptAccessionID, startCodonBegin, stopCodonEnd); 796 if (startCodon != null) { 797 if (startCodonName == null || startCodonName.length() == 0) { 798 startCodonName = transcriptid + "-start_codon-" + startCodon.location().bioStart() + "-" + startCodon.location().bioEnd(); 799 } 800 transcriptSequence.addStartCodonSequence(new AccessionID(startCodonName), startCodon.location().bioStart(), startCodon.location().bioEnd()); 801 } 802 if (stopCodon != null) { 803 if (stopCodonName == null || stopCodonName.length() == 0) { 804 stopCodonName = transcriptid + "-stop_codon-" + stopCodon.location().bioStart() + "-" + stopCodon.location().bioEnd(); 805 } 806 transcriptSequence.addStopCodonSequence(new AccessionID(stopCodonName), stopCodon.location().bioStart(), stopCodon.location().bioEnd()); 807 } 808 809 for (FeatureI cdsFeature : cdsFeatures) { 810 Feature cds = (Feature) cdsFeature; 811 // for genemark it appears frame of 2 =1 and frame of 1 = 2 812 // doesn't matter when you string cds regions together as one block 813 // but does make a difference when you try to make a protein sequence for each CDS region where 814 // you give up or borrow based on the frame value 815 // compared with gff like files and docs for geneid and glimmer where geneid and glimmer both do it the same 816 // way that appears to match the gff3 docs. 817 int frame = cds.frame(); 818 if (frame == 1) { 819 frame = 2; 820 } else if (frame == 2) { 821 frame = 1; 822 } else { 823 frame = 0; 824 } 825 String cdsName = cds.getAttribute("transcript_name"); 826 if (cdsName == null || cdsName.length() == 0) { 827 cdsName = transcriptid + "-cds-" + cds.location().bioStart() + "-" + cds.location().bioEnd(); 828 } 829 AccessionID cdsAccessionID = new AccessionID(cdsName); 830 //ExonSequence exonSequence = geneSequence.addExon(cdsAccessionID, cdsFeature.location().bioStart(), cdsFeature.location().bioEnd()); 831 transcriptSequence.addCDS(cdsAccessionID, cdsFeature.location().bioStart(), cdsFeature.location().bioEnd(), frame); 832 } 833 } 834 } 835 836 } 837 838 static public LinkedHashMap<String, ProteinSequence> getProteinSequences(Collection<ChromosomeSequence> chromosomeSequences) throws Exception { 839 LinkedHashMap<String, ProteinSequence> proteinSequenceHashMap = new LinkedHashMap<String, ProteinSequence>(); 840 for (ChromosomeSequence dnaSequence : chromosomeSequences) { 841 for (GeneSequence geneSequence : dnaSequence.getGeneSequences().values()) { 842 for (TranscriptSequence transcriptSequence : geneSequence.getTranscripts().values()) { 843 //TODO remove? 844// DNASequence dnaCodingSequence = transcriptSequence.getDNACodingSequence(); 845// logger.info("CDS={}", dnaCodingSequence.getSequenceAsString()); 846 847 try { 848 ProteinSequence proteinSequence = transcriptSequence.getProteinSequence(); 849 850// logger.info("{} {}", proteinSequence.getAccession().getID(), proteinSequence); 851 if (proteinSequenceHashMap.containsKey(proteinSequence.getAccession().getID())) { 852 throw new Exception("Duplicate protein sequence id=" + proteinSequence.getAccession().getID() + " found at Gene id=" + geneSequence.getAccession().getID()); 853 } else { 854 proteinSequenceHashMap.put(proteinSequence.getAccession().getID(), proteinSequence); 855 } 856 } catch (Exception e) { 857 logger.error("Exception: ", e); 858 } 859 860 } 861 862 } 863 } 864 return proteinSequenceHashMap; 865 } 866 867 static public LinkedHashMap<String, GeneSequence> getGeneSequences(Collection<ChromosomeSequence> chromosomeSequences) throws Exception { 868 LinkedHashMap<String, GeneSequence> geneSequenceHashMap = new LinkedHashMap<String, GeneSequence>(); 869 for (ChromosomeSequence chromosomeSequence : chromosomeSequences) { 870 for (GeneSequence geneSequence : chromosomeSequence.getGeneSequences().values()) { 871 geneSequenceHashMap.put(geneSequence.getAccession().getID(), geneSequence); 872 } 873 } 874 875 return geneSequenceHashMap; 876 } 877 878 public static void main(String[] args) throws Exception { 879/* if (false) { 880 LinkedHashMap<String, ChromosomeSequence> chromosomeSequenceList = GeneFeatureHelper.loadFastaAddGeneFeaturesFromGeneMarkGTF(new File("Scaffolds.fna"), new File("genemark_hmm.gtf")); 881 LinkedHashMap<String, ProteinSequence> proteinSequenceList = GeneFeatureHelper.getProteinSequences(chromosomeSequenceList.values()); 882 } 883 884 if (false) { 885 LinkedHashMap<String, ChromosomeSequence> chromosomeSequenceList = GeneFeatureHelper.loadFastaAddGeneFeaturesFromGlimmerGFF3(new File("Scaffolds.fna"), new File("glimmerhmm.gff")); 886 LinkedHashMap<String, ProteinSequence> proteinSequenceList = GeneFeatureHelper.getProteinSequences(chromosomeSequenceList.values()); 887 // for (ProteinSequence proteinSequence : proteinSequenceList.values()) { 888 // logger.info(proteinSequence.getAccession().getID() + " " + proteinSequence); 889 // } 890 FastaWriterHelper.writeProteinSequence(new File("predicted_glimmer.faa"), proteinSequenceList.values()); 891 892 } 893 if (false) { 894 GeneFeatureHelper.outputFastaSequenceLengthGFF3(new File("Scaffolds.fna"), new File("scaffolds.gff3")); 895 } 896 897 */ 898 899 try { 900 901 if (true) { 902 //File fastaSequenceFile = new File("Scaffolds.fna"); 903 //File gff3File = new File("geneid-v6.gff3"); 904 //LinkedHashMap<String, ChromosomeSequence> chromosomeSequenceHashMap = GeneFeatureHelper.loadFastaAddGeneFeaturesFromGmodGFF3(fastaSequenceFile, gff3File,true); 905 } 906 907 } catch (Exception e) { 908 logger.error("Exception: ", e); 909 } 910 911 } 912}