001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021package org.biojava.nbio.genome; 022 023import org.biojava.nbio.genome.parsers.gff.*; 024import org.biojava.nbio.core.sequence.*; 025import org.biojava.nbio.core.sequence.io.FastaReaderHelper; 026import org.slf4j.Logger; 027import org.slf4j.LoggerFactory; 028 029import java.io.File; 030import java.io.FileWriter; 031import java.util.ArrayList; 032import java.util.Collection; 033import java.util.Collections; 034import java.util.LinkedHashMap; 035import java.util.Map; 036 037/** 038 * 039 * @author Scooter Willis 040 */ 041public class GeneFeatureHelper { 042 043 private static final Logger logger = LoggerFactory.getLogger(GeneFeatureHelper.class); 044 045 static public LinkedHashMap<String, ChromosomeSequence> loadFastaAddGeneFeaturesFromUpperCaseExonFastaFile(File fastaSequenceFile, File uppercaseFastaFile, boolean throwExceptionGeneNotFound) throws Exception { 046 LinkedHashMap<String, ChromosomeSequence> chromosomeSequenceList = new LinkedHashMap<>(); 047 Map<String, DNASequence> dnaSequenceList = FastaReaderHelper.readFastaDNASequence(fastaSequenceFile); 048 for (String accession : dnaSequenceList.keySet()) { 049 DNASequence contigSequence = dnaSequenceList.get(accession); 050 ChromosomeSequence chromsomeSequence = new ChromosomeSequence(contigSequence.getSequenceAsString()); 051 chromsomeSequence.setAccession(contigSequence.getAccession()); 052 chromosomeSequenceList.put(accession, chromsomeSequence); 053 } 054 055 056 Map<String, DNASequence> geneSequenceList = FastaReaderHelper.readFastaDNASequence(uppercaseFastaFile); 057 for (DNASequence dnaSequence : geneSequenceList.values()) { 058 String geneSequence = dnaSequence.getSequenceAsString(); 059 String lcGeneSequence = geneSequence.toLowerCase(); 060 String reverseGeneSequence = dnaSequence.getReverse().getSequenceAsString(); 061 String lcReverseGeneSequence = reverseGeneSequence.toLowerCase(); 062 Integer bioStart = null; 063 Integer bioEnd = null; 064 Strand strand = Strand.POSITIVE; 065 boolean geneFound = false; 066 String accession = ""; 067 DNASequence contigDNASequence = null; 068 for (String id : dnaSequenceList.keySet()) { 069 accession = id; 070 contigDNASequence = dnaSequenceList.get(id); 071 String contigSequence = contigDNASequence.getSequenceAsString().toLowerCase(); 072 bioStart = contigSequence.indexOf(lcGeneSequence); 073 if (bioStart != -1) { 074 bioStart = bioStart + 1; 075 bioEnd = bioStart + geneSequence.length() - 1; 076 geneFound = true; 077 break; 078 } else { 079 bioStart = contigSequence.indexOf(lcReverseGeneSequence); 080 if (bioStart != -1) { 081 bioStart = bioStart + 1; 082 bioEnd = bioStart - geneSequence.length() - 1; 083 strand = Strand.NEGATIVE; 084 geneFound = true; 085 break; 086 } 087 } 088 } 089 090 if (geneFound) { 091 logger.info("Gene {} found at {} {} {} {}", 092 dnaSequence.getAccession().toString(), contigDNASequence.getAccession().toString(), bioStart, bioEnd, strand); 093 ChromosomeSequence chromosomeSequence = chromosomeSequenceList.get(accession); 094 095 ArrayList<Integer> exonBoundries = new ArrayList<>(); 096 097 //look for transitions from lowercase to upper case 098 for (int i = 0; i < geneSequence.length(); i++) { 099 if (i == 0 && Character.isUpperCase(geneSequence.charAt(i))) { 100 exonBoundries.add(i); 101 } else if (i == geneSequence.length() - 1) { 102 exonBoundries.add(i); 103 } else if (Character.isUpperCase(geneSequence.charAt(i)) && Character.isLowerCase(geneSequence.charAt(i - 1))) { 104 exonBoundries.add(i); 105 } else if (Character.isUpperCase(geneSequence.charAt(i)) && Character.isLowerCase(geneSequence.charAt(i + 1))) { 106 exonBoundries.add(i); 107 } 108 } 109 if (strand == Strand.NEGATIVE) { 110 Collections.reverse(exonBoundries); 111 } 112 113 114 String geneaccession = dnaSequence.getAccession().getID(); 115 String note = geneaccession; 116 String[] values = geneaccession.split(" "); 117 geneaccession = values[0]; 118 119 120 121 GeneSequence geneSeq = chromosomeSequence.addGene(new AccessionID(geneaccession), bioStart, bioEnd, strand); 122 geneSeq.addNote(note); 123 geneSeq.setSource(uppercaseFastaFile.getName()); 124 //String transcriptName = geneaccession + "-transcript"; 125 //TranscriptSequence transcriptSequence = geneSeq.addTranscript(new AccessionID(transcriptName), bioStart, bioEnd); 126 127 int runningFrameLength = 0; 128 for (int i = 0; i < exonBoundries.size() - 1; i = i + 2) { 129 int cdsBioStart = exonBoundries.get(i) + bioStart; 130 int cdsBioEnd = exonBoundries.get(i + 1) + bioStart; 131 runningFrameLength = runningFrameLength + Math.abs(cdsBioEnd - cdsBioStart) + 1; 132 //String cdsName = transcriptName + "-cds-" + cdsBioStart + "-" + cdsBioEnd; 133 134 //AccessionID cdsAccessionID = new AccessionID(cdsName); 135 //ExonSequence exonSequence = geneSeq.addExon(cdsAccessionID, cdsBioStart, cdsBioEnd); 136 int remainder = runningFrameLength % 3; 137 //int frame = 0; 138 if (remainder == 1) { 139 //frame = 2; // borrow 2 from next CDS region 140 } else if (remainder == 2) { 141 //frame = 1; 142 } 143 //CDSSequence cdsSequence = transcriptSequence.addCDS(cdsAccessionID, cdsBioStart, cdsBioEnd, frame); 144 } 145 146 147 } else { 148 if (throwExceptionGeneNotFound) { 149 throw new Exception(dnaSequence.getAccession().toString() + " not found"); 150 } 151 logger.info("Gene not found {}", dnaSequence.getAccession().toString()); 152 } 153 154 } 155 return chromosomeSequenceList; 156 } 157 158 /** 159 * Output a gff3 feature file that will give the length of each scaffold/chromosome in the fasta file. 160 * Used for gbrowse so it knows length. 161 * @param fastaSequenceFile 162 * @param gffFile 163 * @throws Exception 164 */ 165 static public void outputFastaSequenceLengthGFF3(File fastaSequenceFile, File gffFile) throws Exception { 166 Map<String, DNASequence> dnaSequenceList = FastaReaderHelper.readFastaDNASequence(fastaSequenceFile); 167 String fileName = fastaSequenceFile.getName(); 168 FileWriter fw = new FileWriter(gffFile); 169 String newLine = System.getProperty("line.separator"); 170 fw.write("##gff-version 3" + newLine); 171 for (DNASequence dnaSequence : dnaSequenceList.values()) { 172 String gff3line = dnaSequence.getAccession().getID() + "\t" + fileName + "\t" + "contig" + "\t" + "1" + "\t" + dnaSequence.getBioEnd() + "\t.\t.\t.\tName=" + dnaSequence.getAccession().getID() + newLine; 173 fw.write(gff3line); 174 } 175 fw.close(); 176 } 177 178 /** 179 * Loads Fasta file and GFF2 feature file generated from the geneid prediction algorithm 180 * 181 * @param fastaSequenceFile 182 * @param gffFile 183 * @return 184 * @throws Exception 185 */ 186 static public Map<String, ChromosomeSequence> loadFastaAddGeneFeaturesFromGeneIDGFF2(File fastaSequenceFile, File gffFile) throws Exception { 187 Map<String, DNASequence> dnaSequenceList = FastaReaderHelper.readFastaDNASequence(fastaSequenceFile); 188 Map<String, ChromosomeSequence> chromosomeSequenceList = GeneFeatureHelper.getChromosomeSequenceFromDNASequence(dnaSequenceList); 189 FeatureList listGenes = GeneIDGFF2Reader.read(gffFile.getAbsolutePath()); 190 addGeneIDGFF2GeneFeatures(chromosomeSequenceList, listGenes); 191 return chromosomeSequenceList; 192 } 193 194 /** 195 * Load GFF2 feature file generated from the geneid prediction algorithm and map features onto the chromosome sequences 196 * 197 * @param chromosomeSequenceList 198 * @param listGenes 199 * @throws Exception 200 */ 201 static public void addGeneIDGFF2GeneFeatures(Map<String, ChromosomeSequence> chromosomeSequenceList, FeatureList listGenes) throws Exception { 202 Collection<String> geneIds = listGenes.attributeValues("gene_id"); 203 for (String geneid : geneIds) { 204 FeatureList gene = listGenes.selectByAttribute("gene_id", geneid); 205 FeatureI geneFeature = gene.get(0); 206 ChromosomeSequence seq = chromosomeSequenceList.get(geneFeature.seqname()); 207 geneid = geneid.replaceAll("_", ".G"); 208 AccessionID geneAccessionID = new AccessionID(geneid); 209 GeneSequence geneSequence = null; 210 Collection<String> transcriptids = gene.attributeValues("gene_id"); 211 for (String transcriptid : transcriptids) { 212 // get all the individual features (exons, CDS regions, etc.) of this gene 213 FeatureList transcriptFeature = listGenes.selectByAttribute("gene_id", transcriptid); 214 transcriptid = transcriptid.replaceAll("_", ".G"); 215 216 217 218 219 // String seqName = feature.seqname(); 220 //FeatureI startCodon = null; 221 //FeatureI stopCodon = null; 222 Integer startCodonBegin = null; 223 Integer stopCodonEnd = null; 224 //String startCodonName = ""; 225 //String stopCodonName = ""; 226 227 228 // now select only the coding regions of this gene 229 FeatureList firstFeatures = transcriptFeature.selectByType("First"); 230 FeatureList terminalFeatures = transcriptFeature.selectByType("Terminal"); 231 FeatureList internalFeatures = transcriptFeature.selectByType("Internal"); 232 FeatureList singleFeatures = transcriptFeature.selectByType("Single"); 233 FeatureList cdsFeatures = new FeatureList(); 234 cdsFeatures.add(firstFeatures); 235 cdsFeatures.add(terminalFeatures); 236 cdsFeatures.add(internalFeatures); 237 cdsFeatures.add(singleFeatures); 238 // sort them 239 cdsFeatures = cdsFeatures.sortByStart(); 240 Strand strand = Strand.POSITIVE; 241 FeatureI feature = cdsFeatures.get(0); 242 if (feature.location().isNegative()) { 243 strand = Strand.NEGATIVE; 244 } 245 if (startCodonBegin == null) { 246 FeatureI firstFeature = cdsFeatures.get(0); 247 if (strand == Strand.NEGATIVE) { 248 startCodonBegin = firstFeature.location().bioEnd(); 249 } else { 250 startCodonBegin = firstFeature.location().bioStart(); 251 } 252 } 253 254 if (stopCodonEnd == null) { 255 256 FeatureI lastFeature = cdsFeatures.get(cdsFeatures.size() - 1); 257 if (strand == Strand.NEGATIVE) { 258 stopCodonEnd = lastFeature.location().bioStart(); 259 } else { 260 stopCodonEnd = lastFeature.location().bioEnd(); 261 } 262 } 263 //for gtf ordering can be strand based so first is last and last is first 264 if (startCodonBegin > stopCodonEnd) { 265 int temp = startCodonBegin; 266 startCodonBegin = stopCodonEnd; 267 stopCodonEnd = temp; 268 } 269 270 AccessionID transcriptAccessionID = new AccessionID(transcriptid); 271 if (geneSequence == null) { 272 geneSequence = seq.addGene(geneAccessionID, startCodonBegin, stopCodonEnd, strand); 273 geneSequence.setSource(((Feature) feature).source()); 274 } else { 275 //if multiple transcripts for one gene make sure the gene is defined as the min and max start/end 276 277 if (startCodonBegin < geneSequence.getBioBegin()) { 278 geneSequence.setBioBegin(startCodonBegin); 279 } 280 if (stopCodonEnd > geneSequence.getBioBegin()) { 281 geneSequence.setBioEnd(stopCodonEnd); 282 } 283 284 } 285 TranscriptSequence transcriptSequence = geneSequence.addTranscript(transcriptAccessionID, startCodonBegin, stopCodonEnd); 286 /* 287 if (startCodon != null) { 288 if (startCodonName == null || startCodonName.length() == 0) { 289 startCodonName = transcriptid + "-start_codon-" + startCodon.location().bioStart() + "-" + startCodon.location().bioEnd(); 290 } 291 transcriptSequence.addStartCodonSequence(new AccessionID(startCodonName), startCodon.location().bioStart(), startCodon.location().bioEnd()); 292 } 293 if (stopCodon != null) { 294 if (stopCodonName == null || stopCodonName.length() == 0) { 295 stopCodonName = transcriptid + "-stop_codon-" + stopCodon.location().bioStart() + "-" + stopCodon.location().bioEnd(); 296 } 297 transcriptSequence.addStopCodonSequence(new AccessionID(stopCodonName), stopCodon.location().bioStart(), stopCodon.location().bioEnd()); 298 } 299 */ 300 301 for (FeatureI cdsFeature : cdsFeatures) { 302 Feature cds = (Feature) cdsFeature; 303 String cdsName = cds.getAttribute("transcript_name"); 304 if (cdsName == null || cdsName.length() == 0) { 305 cdsName = transcriptid + "-cds-" + cds.location().bioStart() + "-" + cds.location().bioEnd(); 306 } 307 AccessionID cdsAccessionID = new AccessionID(cdsName); 308 //ExonSequence exonSequence = geneSequence.addExon(cdsAccessionID, cdsFeature.location().bioStart(), cdsFeature.location().bioEnd()); 309 CDSSequence cdsSequence = transcriptSequence.addCDS(cdsAccessionID, cdsFeature.location().bioStart(), cdsFeature.location().bioEnd(), cds.frame()); 310 cdsSequence.setSequenceScore(cds.score()); 311 } 312 } 313 } 314 315 } 316 317 static public Map<String, ChromosomeSequence> getChromosomeSequenceFromDNASequence(Map<String, DNASequence> dnaSequenceList) { 318 LinkedHashMap<String, ChromosomeSequence> chromosomeSequenceList = new LinkedHashMap<>(); 319 for (String key : dnaSequenceList.keySet()) { 320 DNASequence dnaSequence = dnaSequenceList.get(key); 321 ChromosomeSequence chromosomeSequence = new ChromosomeSequence(dnaSequence.getProxySequenceReader()); //we want the underlying sequence but don't need storage 322 chromosomeSequence.setAccession(dnaSequence.getAccession()); 323 chromosomeSequenceList.put(key, chromosomeSequence); 324 } 325 return chromosomeSequenceList; 326 } 327 328 /** 329 * Lots of variations in the ontology or descriptors that can be used in GFF3 which requires writing a custom parser to handle a GFF3 generated or used 330 * by a specific application. Probably could be abstracted out but for now easier to handle with custom code to deal with gff3 elements that are not 331 * included but can be extracted from other data elements. 332 * @param fastaSequenceFile 333 * @param gffFile 334 * @param lazyloadsequences If set to true then the fasta file will be parsed for accession id but sequences will be read from disk when needed to save memory 335 * @return 336 * @throws Exception 337 */ 338 static public Map<String, ChromosomeSequence> loadFastaAddGeneFeaturesFromGmodGFF3(File fastaSequenceFile, File gffFile,boolean lazyloadsequences) throws Exception { 339 Map<String, DNASequence> dnaSequenceList = FastaReaderHelper.readFastaDNASequence(fastaSequenceFile,lazyloadsequences); 340 Map<String, ChromosomeSequence> chromosomeSequenceList = GeneFeatureHelper.getChromosomeSequenceFromDNASequence(dnaSequenceList); 341 FeatureList listGenes = GFF3Reader.read(gffFile.getAbsolutePath()); 342 addGmodGFF3GeneFeatures(chromosomeSequenceList, listGenes); 343 return chromosomeSequenceList; 344 } 345 346 /** 347 * Load GFF3 file using mRNA as the gene feature as not all GFF3 files are complete 348 * @param chromosomeSequenceList 349 * @param listGenes 350 * @throws Exception 351 */ 352 static public void addGmodGFF3GeneFeatures(Map<String, ChromosomeSequence> chromosomeSequenceList, FeatureList listGenes) throws Exception { 353 354 355 // key off mRNA as being a known feature that may or may not have a parent gene 356 357 358 FeatureList mRNAFeatures = listGenes.selectByType("mRNA"); 359 LinkedHashMap<String,FeatureList> featureIDHashMap = FeatureHelper.buildFeatureAtrributeIndex("ID", listGenes); 360 LinkedHashMap<String,FeatureList> featureParentHashMap = FeatureHelper.buildFeatureAtrributeIndex("Parent", listGenes); 361 362 for (FeatureI f : mRNAFeatures) { 363 String geneID; 364 String geneNote = null; 365 String geneSource = null; 366 String sequenceName = null; 367 ChromosomeSequence seq = null; 368 GeneSequence geneSequence = null; 369 370 Feature mRNAFeature = (Feature) f; 371 String mRNAID = mRNAFeature.getAttribute("ID"); 372 String mRNAsource = mRNAFeature.source(); 373 String mRNANote = mRNAFeature.getAttribute("Note"); 374 String mRNAParent = mRNAFeature.getAttribute("Parent"); 375 if (mRNAParent != null && mRNAParent.length() > 0) { 376 // FeatureList geneFeatureList = listGenes.selectByAttribute("ID", mRNAParent); 377 FeatureList geneFeatureList = featureIDHashMap.get(mRNAParent); 378 Feature geneFeature = (Feature) geneFeatureList.get(0); 379 geneID = geneFeature.getAttribute("ID"); 380 geneNote = geneFeature.getAttribute("Note"); 381 geneSource = geneFeature.source(); 382 sequenceName = geneFeature.seqname(); 383 384 // 385 } else { 386 //deal with cases where no parent gene is given 387 geneID = mRNAID; 388 geneSource = mRNAsource; 389 sequenceName = mRNAFeature.seqname(); 390 } 391 392 seq = chromosomeSequenceList.get(sequenceName); 393 394 AccessionID geneAccessionID = new AccessionID(geneID); 395 396 // FeatureList mRNAChildren = listGenes.selectByAttribute("Parent", mRNAID); 397 FeatureList mRNAChildren = featureParentHashMap.get(mRNAID); 398 FeatureList cdsFeatures = mRNAChildren.selectByType("CDS"); 399 FeatureI feature = cdsFeatures.get(0); 400 Strand strand = Strand.POSITIVE; 401 402 if (feature.location().isNegative()) { 403 strand = Strand.NEGATIVE; 404 } 405 cdsFeatures = cdsFeatures.sortByStart(); 406 407 408 409 410 411 412 413 //String seqName = feature.seqname(); 414 FeatureI startCodon = null; 415 FeatureI stopCodon = null; 416 Integer startCodonBegin = null; 417 Integer stopCodonEnd = null; 418 String startCodonName = ""; 419 String stopCodonName = ""; 420 FeatureList startCodonList = mRNAChildren.selectByType("five_prime_UTR"); 421 if (startCodonList != null && startCodonList.size() > 0) { 422 startCodon = startCodonList.get(0); 423 if (strand == Strand.NEGATIVE) { 424 startCodonBegin = startCodon.location().bioEnd(); 425 } else { 426 startCodonBegin = startCodon.location().bioStart(); 427 } 428 startCodonName = startCodon.getAttribute("ID"); 429 } 430 431 FeatureList stopCodonList = mRNAChildren.selectByType("three_prime_UTR"); 432 433 if (stopCodonList != null && stopCodonList.size() > 0) { 434 stopCodon = stopCodonList.get(0); 435 if (strand == Strand.NEGATIVE) { 436 stopCodonEnd = stopCodon.location().bioStart(); 437 } else { 438 stopCodonEnd = stopCodon.location().bioEnd(); 439 } 440 stopCodonName = stopCodon.getAttribute("ID"); 441 442 } 443 444 445 446 447 if (startCodonBegin == null) { 448 if (strand == Strand.NEGATIVE) { 449 FeatureI firstFeature = cdsFeatures.get(0); 450 451 startCodonBegin = firstFeature.location().bioEnd(); 452 } else { 453 FeatureI firstFeature = cdsFeatures.get(0); 454 455 startCodonBegin = firstFeature.location().bioStart(); 456 } 457 } 458 459 if (stopCodonEnd == null) { 460 if (strand == Strand.NEGATIVE) { 461 FeatureI lastFeature = cdsFeatures.get(cdsFeatures.size() - 1); 462 stopCodonEnd = lastFeature.location().bioStart(); 463 } else { 464 FeatureI lastFeature = cdsFeatures.get(cdsFeatures.size() - 1); 465 stopCodonEnd = lastFeature.location().bioEnd(); 466 } 467 } 468 //for gtf ordering can be strand based so first is last and last is first 469 if (startCodonBegin > stopCodonEnd) { 470 int temp = startCodonBegin; 471 startCodonBegin = stopCodonEnd; 472 stopCodonEnd = temp; 473 } 474 475 476 477 AccessionID transcriptAccessionID = new AccessionID(mRNAID); 478 geneSequence = seq.getGene(geneID); 479 if (geneSequence == null) { 480 geneSequence = seq.addGene(geneAccessionID, startCodonBegin, stopCodonEnd, strand); 481 geneSequence.setSource(geneSource); 482 if (geneNote != null && geneNote.length() > 0) { 483 geneSequence.addNote(geneNote); 484 } 485 } else { 486 487 if (startCodonBegin < geneSequence.getBioBegin()) { 488 geneSequence.setBioBegin(startCodonBegin); 489 } 490 if (stopCodonEnd > geneSequence.getBioBegin()) { 491 geneSequence.setBioEnd(stopCodonEnd); 492 } 493 494 } 495 TranscriptSequence transcriptSequence = geneSequence.addTranscript(transcriptAccessionID, startCodonBegin, stopCodonEnd); 496 transcriptSequence.setSource(mRNAsource); 497 if (mRNANote != null && mRNANote.length() > 0) { 498 transcriptSequence.addNote(mRNANote); 499 500 } 501 if (startCodon != null) { 502 if (startCodonName == null || startCodonName.length() == 0) { 503 startCodonName = mRNAID + "-start_codon-" + startCodon.location().bioStart() + "-" + startCodon.location().bioEnd(); 504 } 505 transcriptSequence.addStartCodonSequence(new AccessionID(startCodonName), startCodon.location().bioStart(), startCodon.location().bioEnd()); 506 } 507 if (stopCodon != null) { 508 if (stopCodonName == null || stopCodonName.length() == 0) { 509 stopCodonName = mRNAID + "-stop_codon-" + stopCodon.location().bioStart() + "-" + stopCodon.location().bioEnd(); 510 } 511 transcriptSequence.addStopCodonSequence(new AccessionID(stopCodonName), stopCodon.location().bioStart(), stopCodon.location().bioEnd()); 512 } 513 514 for (FeatureI cdsFeature : cdsFeatures) { 515 Feature cds = (Feature) cdsFeature; 516 String cdsNote = cdsFeature.getAttribute("Note"); 517 String cdsSource = cds.source(); 518 String cdsName = cds.getAttribute("ID"); 519 if (cdsName == null || cdsName.length() == 0) { 520 cdsName = mRNAID + "-cds-" + cds.location().bioStart() + "-" + cds.location().bioEnd(); 521 } 522 AccessionID cdsAccessionID = new AccessionID(cdsName); 523 ExonSequence exonSequence = geneSequence.addExon(cdsAccessionID, cdsFeature.location().bioStart(), cdsFeature.location().bioEnd()); 524 exonSequence.setSource(cdsSource); 525 if (cdsNote != null && cdsNote.length() > 0) { 526 exonSequence.addNote(cdsNote); 527 } 528 transcriptSequence.addCDS(cdsAccessionID, cdsFeature.location().bioStart(), cdsFeature.location().bioEnd(), cds.frame()); 529 } 530 geneSequence.addIntronsUsingExons(); 531 532 } 533 534 } 535 536 static public Map<String, ChromosomeSequence> loadFastaAddGeneFeaturesFromGlimmerGFF3(File fastaSequenceFile, File gffFile) throws Exception { 537 Map<String, DNASequence> dnaSequenceList = FastaReaderHelper.readFastaDNASequence(fastaSequenceFile); 538 Map<String, ChromosomeSequence> chromosomeSequenceList = GeneFeatureHelper.getChromosomeSequenceFromDNASequence(dnaSequenceList); 539 FeatureList listGenes = GFF3Reader.read(gffFile.getAbsolutePath()); 540 addGlimmerGFF3GeneFeatures(chromosomeSequenceList, listGenes); 541 return chromosomeSequenceList; 542 } 543 544 static public void addGlimmerGFF3GeneFeatures(Map<String, ChromosomeSequence> chromosomeSequenceList, FeatureList listGenes) throws Exception { 545 FeatureList mRNAFeatures = listGenes.selectByType("mRNA"); 546 for (FeatureI f : mRNAFeatures) { 547 Feature mRNAFeature = (Feature) f; 548 String geneid = mRNAFeature.getAttribute("ID"); 549 String source = mRNAFeature.source(); 550 551 FeatureList gene = listGenes.selectByAttribute("Parent", geneid); 552 FeatureI geneFeature = gene.get(0); 553 ChromosomeSequence seq = chromosomeSequenceList.get(geneFeature.seqname()); 554 AccessionID geneAccessionID = new AccessionID(geneid); 555 GeneSequence geneSequence = null; 556 557 FeatureList cdsFeatures = gene.selectByType("CDS"); 558 FeatureI feature = cdsFeatures.get(0); 559 Strand strand = Strand.POSITIVE; 560 561 if (feature.location().isNegative()) { 562 strand = Strand.NEGATIVE; 563 } 564 cdsFeatures = cdsFeatures.sortByStart(); 565 566 567 568 569 570 571 572 //String seqName = feature.seqname(); 573 FeatureI startCodon = null; 574 FeatureI stopCodon = null; 575 Integer startCodonBegin = null; 576 Integer stopCodonEnd = null; 577 String startCodonName = ""; 578 String stopCodonName = ""; 579 FeatureList startCodonList = gene.selectByAttribute("Note", "initial-exon"); 580 if (startCodonList != null && startCodonList.size() > 0) { 581 startCodon = startCodonList.get(0); 582 if (strand == Strand.NEGATIVE) { 583 startCodonBegin = startCodon.location().bioEnd(); 584 } else { 585 startCodonBegin = startCodon.location().bioStart(); 586 } 587 startCodonName = startCodon.getAttribute("ID"); 588 } 589 590 FeatureList stopCodonList = gene.selectByAttribute("Note", "final-exon"); 591 592 if (stopCodonList != null && stopCodonList.size() > 0) { 593 stopCodon = stopCodonList.get(0); 594 if (strand == Strand.NEGATIVE) { 595 stopCodonEnd = stopCodon.location().bioStart(); 596 } else { 597 stopCodonEnd = stopCodon.location().bioEnd(); 598 } 599 stopCodonName = stopCodon.getAttribute("ID"); 600 601 } 602 603 604 605 606 if (startCodonBegin == null) { 607 if (strand == Strand.NEGATIVE) { 608 FeatureI firstFeature = cdsFeatures.get(0); 609 610 startCodonBegin = firstFeature.location().bioEnd(); 611 } else { 612 FeatureI firstFeature = cdsFeatures.get(0); 613 614 startCodonBegin = firstFeature.location().bioStart(); 615 } 616 } 617 618 if (stopCodonEnd == null) { 619 if (strand == Strand.NEGATIVE) { 620 FeatureI lastFeature = cdsFeatures.get(cdsFeatures.size() - 1); 621 stopCodonEnd = lastFeature.location().bioStart(); 622 } else { 623 FeatureI lastFeature = cdsFeatures.get(cdsFeatures.size() - 1); 624 stopCodonEnd = lastFeature.location().bioEnd(); 625 } 626 } 627 //for gtf ordering can be strand based so first is last and last is first 628 if (startCodonBegin > stopCodonEnd) { 629 int temp = startCodonBegin; 630 startCodonBegin = stopCodonEnd; 631 stopCodonEnd = temp; 632 } 633 634 635 636 AccessionID transcriptAccessionID = new AccessionID(geneid); 637 if (geneSequence == null) { 638 geneSequence = seq.addGene(geneAccessionID, startCodonBegin, stopCodonEnd, strand); 639 geneSequence.setSource(source); 640 } 641 /* 642 else { 643 644 if (startCodonBegin < geneSequence.getBioBegin()) { 645 geneSequence.setBioBegin(startCodonBegin); 646 } 647 if (stopCodonEnd > geneSequence.getBioBegin()) { 648 geneSequence.setBioEnd(stopCodonEnd); 649 } 650 } 651 */ 652 TranscriptSequence transcriptSequence = geneSequence.addTranscript(transcriptAccessionID, startCodonBegin, stopCodonEnd); 653 if (startCodon != null) { 654 if (startCodonName == null || startCodonName.length() == 0) { 655 startCodonName = geneid + "-start_codon-" + startCodon.location().bioStart() + "-" + startCodon.location().bioEnd(); 656 } 657 transcriptSequence.addStartCodonSequence(new AccessionID(startCodonName), startCodon.location().bioStart(), startCodon.location().bioEnd()); 658 } 659 if (stopCodon != null) { 660 if (stopCodonName == null || stopCodonName.length() == 0) { 661 stopCodonName = geneid + "-stop_codon-" + stopCodon.location().bioStart() + "-" + stopCodon.location().bioEnd(); 662 } 663 transcriptSequence.addStopCodonSequence(new AccessionID(stopCodonName), stopCodon.location().bioStart(), stopCodon.location().bioEnd()); 664 } 665 666 for (FeatureI cdsFeature : cdsFeatures) { 667 Feature cds = (Feature) cdsFeature; 668 String cdsName = cds.getAttribute("ID"); 669 if (cdsName == null || cdsName.length() == 0) { 670 cdsName = geneid + "-cds-" + cds.location().bioStart() + "-" + cds.location().bioEnd(); 671 } 672 AccessionID cdsAccessionID = new AccessionID(cdsName); 673 //ExonSequence exonSequence = geneSequence.addExon(cdsAccessionID, cdsFeature.location().bioStart(), cdsFeature.location().bioEnd()); 674 transcriptSequence.addCDS(cdsAccessionID, cdsFeature.location().bioStart(), cdsFeature.location().bioEnd(), cds.frame()); 675 } 676 677 } 678 679 } 680 681 static public Map<String, ChromosomeSequence> loadFastaAddGeneFeaturesFromGeneMarkGTF(File fastaSequenceFile, File gffFile) throws Exception { 682 Map<String, DNASequence> dnaSequenceList = FastaReaderHelper.readFastaDNASequence(fastaSequenceFile); 683 Map<String, ChromosomeSequence> chromosomeSequenceList = GeneFeatureHelper.getChromosomeSequenceFromDNASequence(dnaSequenceList); 684 FeatureList listGenes = GeneMarkGTFReader.read(gffFile.getAbsolutePath()); 685 addGeneMarkGTFGeneFeatures(chromosomeSequenceList, listGenes); 686 return chromosomeSequenceList; 687 } 688 689 static public void addGeneMarkGTFGeneFeatures(Map<String, ChromosomeSequence> chromosomeSequenceList, FeatureList listGenes) throws Exception { 690 Collection<String> geneIds = listGenes.attributeValues("gene_id"); 691 for (String geneid : geneIds) { 692 // if(geneid.equals("45_g")){ 693 // int dummy =1; 694 // } 695 FeatureList gene = listGenes.selectByAttribute("gene_id", geneid); 696 FeatureI geneFeature = gene.get(0); 697 ChromosomeSequence seq = chromosomeSequenceList.get(geneFeature.seqname()); 698 AccessionID geneAccessionID = new AccessionID(geneid); 699 GeneSequence geneSequence = null; 700 Collection<String> transcriptids = gene.attributeValues("transcript_id"); 701 for (String transcriptid : transcriptids) { 702 // get all the individual features (exons, CDS regions, etc.) of this gene 703 704 705 FeatureList transcriptFeature = listGenes.selectByAttribute("transcript_id", transcriptid); 706 // now select only the coding regions of this gene 707 FeatureList cdsFeatures = transcriptFeature.selectByType("CDS"); 708 // sort them 709 cdsFeatures = cdsFeatures.sortByStart(); 710 711 FeatureI feature = cdsFeatures.get(0); 712 Strand strand = Strand.POSITIVE; 713 714 if (feature.location().isNegative()) { 715 strand = Strand.NEGATIVE; 716 } 717 718 //String seqName = feature.seqname(); 719 FeatureI startCodon = null; 720 FeatureI stopCodon = null; 721 Integer startCodonBegin = null; 722 Integer stopCodonEnd = null; 723 String startCodonName = ""; 724 String stopCodonName = ""; 725 FeatureList startCodonList = transcriptFeature.selectByType("start_codon"); 726 if (startCodonList != null && startCodonList.size() > 0) { 727 startCodon = startCodonList.get(0); 728 if (strand == Strand.POSITIVE) { 729 startCodonBegin = startCodon.location().bioStart(); 730 } else { 731 startCodonBegin = startCodon.location().bioEnd(); 732 } 733 startCodonName = startCodon.getAttribute("transcript_name"); 734 } 735 736 FeatureList stopCodonList = transcriptFeature.selectByType("stop_codon"); 737 738 if (stopCodonList != null && stopCodonList.size() > 0) { 739 stopCodon = stopCodonList.get(0); 740 if (strand == Strand.POSITIVE) { 741 stopCodonEnd = stopCodon.location().bioEnd(); 742 } else { 743 stopCodonEnd = stopCodon.location().bioStart(); 744 } 745 746 stopCodonName = stopCodon.getAttribute("transcript_name"); 747 748 } 749 750 751 752 753 if (startCodonBegin == null) { 754 if (strand == Strand.NEGATIVE) { 755 FeatureI firstFeature = cdsFeatures.get(0); 756 757 startCodonBegin = firstFeature.location().bioEnd(); 758 } else { 759 FeatureI firstFeature = cdsFeatures.get(0); 760 761 startCodonBegin = firstFeature.location().bioStart(); 762 } 763 } 764 765 if (stopCodonEnd == null) { 766 if (strand == Strand.NEGATIVE) { 767 FeatureI lastFeature = cdsFeatures.get(cdsFeatures.size() - 1); 768 stopCodonEnd = lastFeature.location().bioStart(); 769 } else { 770 FeatureI lastFeature = cdsFeatures.get(cdsFeatures.size() - 1); 771 stopCodonEnd = lastFeature.location().bioEnd(); 772 } 773 } 774 //for gtf ordering can be strand based so first is last and last is first 775 if (startCodonBegin > stopCodonEnd) { 776 int temp = startCodonBegin; 777 startCodonBegin = stopCodonEnd; 778 stopCodonEnd = temp; 779 } 780 781 AccessionID transcriptAccessionID = new AccessionID(transcriptid); 782 if (geneSequence == null) { 783 geneSequence = seq.addGene(geneAccessionID, startCodonBegin, stopCodonEnd, strand); 784 geneSequence.setSource(((Feature) feature).source()); 785 } else { 786 //if multiple transcripts for one gene make sure the gene is defined as the min and max start/end 787 788 if (startCodonBegin < geneSequence.getBioBegin()) { 789 geneSequence.setBioBegin(startCodonBegin); 790 } 791 if (stopCodonEnd > geneSequence.getBioBegin()) { 792 geneSequence.setBioEnd(stopCodonEnd); 793 } 794 795 } 796 TranscriptSequence transcriptSequence = geneSequence.addTranscript(transcriptAccessionID, startCodonBegin, stopCodonEnd); 797 if (startCodon != null) { 798 if (startCodonName == null || startCodonName.length() == 0) { 799 startCodonName = transcriptid + "-start_codon-" + startCodon.location().bioStart() + "-" + startCodon.location().bioEnd(); 800 } 801 transcriptSequence.addStartCodonSequence(new AccessionID(startCodonName), startCodon.location().bioStart(), startCodon.location().bioEnd()); 802 } 803 if (stopCodon != null) { 804 if (stopCodonName == null || stopCodonName.length() == 0) { 805 stopCodonName = transcriptid + "-stop_codon-" + stopCodon.location().bioStart() + "-" + stopCodon.location().bioEnd(); 806 } 807 transcriptSequence.addStopCodonSequence(new AccessionID(stopCodonName), stopCodon.location().bioStart(), stopCodon.location().bioEnd()); 808 } 809 810 for (FeatureI cdsFeature : cdsFeatures) { 811 Feature cds = (Feature) cdsFeature; 812 // for genemark it appears frame of 2 =1 and frame of 1 = 2 813 // doesn't matter when you string cds regions together as one block 814 // but does make a difference when you try to make a protein sequence for each CDS region where 815 // you give up or borrow based on the frame value 816 // compared with gff like files and docs for geneid and glimmer where geneid and glimmer both do it the same 817 // way that appears to match the gff3 docs. 818 int frame = cds.frame(); 819 if (frame == 1) { 820 frame = 2; 821 } else if (frame == 2) { 822 frame = 1; 823 } else { 824 frame = 0; 825 } 826 String cdsName = cds.getAttribute("transcript_name"); 827 if (cdsName == null || cdsName.length() == 0) { 828 cdsName = transcriptid + "-cds-" + cds.location().bioStart() + "-" + cds.location().bioEnd(); 829 } 830 AccessionID cdsAccessionID = new AccessionID(cdsName); 831 //ExonSequence exonSequence = geneSequence.addExon(cdsAccessionID, cdsFeature.location().bioStart(), cdsFeature.location().bioEnd()); 832 transcriptSequence.addCDS(cdsAccessionID, cdsFeature.location().bioStart(), cdsFeature.location().bioEnd(), frame); 833 } 834 } 835 } 836 837 } 838 839 static public LinkedHashMap<String, ProteinSequence> getProteinSequences(Collection<ChromosomeSequence> chromosomeSequences) throws Exception { 840 LinkedHashMap<String, ProteinSequence> proteinSequenceHashMap = new LinkedHashMap<>(); 841 for (ChromosomeSequence dnaSequence : chromosomeSequences) { 842 for (GeneSequence geneSequence : dnaSequence.getGeneSequences().values()) { 843 for (TranscriptSequence transcriptSequence : geneSequence.getTranscripts().values()) { 844 //TODO remove? 845// DNASequence dnaCodingSequence = transcriptSequence.getDNACodingSequence(); 846// logger.info("CDS={}", dnaCodingSequence.getSequenceAsString()); 847 848 try { 849 ProteinSequence proteinSequence = transcriptSequence.getProteinSequence(); 850 851// logger.info("{} {}", proteinSequence.getAccession().getID(), proteinSequence); 852 if (proteinSequenceHashMap.containsKey(proteinSequence.getAccession().getID())) { 853 throw new Exception("Duplicate protein sequence id=" + proteinSequence.getAccession().getID() + " found at Gene id=" + geneSequence.getAccession().getID()); 854 } else { 855 proteinSequenceHashMap.put(proteinSequence.getAccession().getID(), proteinSequence); 856 } 857 } catch (Exception e) { 858 logger.error("Exception: ", e); 859 } 860 861 } 862 863 } 864 } 865 return proteinSequenceHashMap; 866 } 867 868 static public LinkedHashMap<String, GeneSequence> getGeneSequences(Collection<ChromosomeSequence> chromosomeSequences) throws Exception { 869 LinkedHashMap<String, GeneSequence> geneSequenceHashMap = new LinkedHashMap<>(); 870 for (ChromosomeSequence chromosomeSequence : chromosomeSequences) { 871 for (GeneSequence geneSequence : chromosomeSequence.getGeneSequences().values()) { 872 geneSequenceHashMap.put(geneSequence.getAccession().getID(), geneSequence); 873 } 874 } 875 876 return geneSequenceHashMap; 877 } 878 879 public static void main(String[] args) throws Exception { 880/* if (false) { 881 LinkedHashMap<String, ChromosomeSequence> chromosomeSequenceList = GeneFeatureHelper.loadFastaAddGeneFeaturesFromGeneMarkGTF(new File("Scaffolds.fna"), new File("genemark_hmm.gtf")); 882 LinkedHashMap<String, ProteinSequence> proteinSequenceList = GeneFeatureHelper.getProteinSequences(chromosomeSequenceList.values()); 883 } 884 885 if (false) { 886 LinkedHashMap<String, ChromosomeSequence> chromosomeSequenceList = GeneFeatureHelper.loadFastaAddGeneFeaturesFromGlimmerGFF3(new File("Scaffolds.fna"), new File("glimmerhmm.gff")); 887 LinkedHashMap<String, ProteinSequence> proteinSequenceList = GeneFeatureHelper.getProteinSequences(chromosomeSequenceList.values()); 888 // for (ProteinSequence proteinSequence : proteinSequenceList.values()) { 889 // logger.info(proteinSequence.getAccession().getID() + " " + proteinSequence); 890 // } 891 FastaWriterHelper.writeProteinSequence(new File("predicted_glimmer.faa"), proteinSequenceList.values()); 892 893 } 894 if (false) { 895 GeneFeatureHelper.outputFastaSequenceLengthGFF3(new File("Scaffolds.fna"), new File("scaffolds.gff3")); 896 } 897 898 */ 899 900 try { 901 902 if (true) { 903 //File fastaSequenceFile = new File("Scaffolds.fna"); 904 //File gff3File = new File("geneid-v6.gff3"); 905 //LinkedHashMap<String, ChromosomeSequence> chromosomeSequenceHashMap = GeneFeatureHelper.loadFastaAddGeneFeaturesFromGmodGFF3(fastaSequenceFile, gff3File,true); 906 } 907 908 } catch (Exception e) { 909 logger.error("Exception: ", e); 910 } 911 912 } 913}