001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021package org.biojava.nbio.genome;
022
023import org.biojava.nbio.genome.parsers.gff.*;
024import org.biojava.nbio.core.sequence.*;
025import org.biojava.nbio.core.sequence.io.FastaReaderHelper;
026import org.slf4j.Logger;
027import org.slf4j.LoggerFactory;
028
029import java.io.File;
030import java.io.FileWriter;
031import java.util.ArrayList;
032import java.util.Collection;
033import java.util.Collections;
034import java.util.LinkedHashMap;
035import java.util.Map;
036
037/**
038 *
039 * @author Scooter Willis 
040 */
041public class GeneFeatureHelper {
042
043        private static final Logger logger = LoggerFactory.getLogger(GeneFeatureHelper.class);
044
045        static public LinkedHashMap<String, ChromosomeSequence> loadFastaAddGeneFeaturesFromUpperCaseExonFastaFile(File fastaSequenceFile, File uppercaseFastaFile, boolean throwExceptionGeneNotFound) throws Exception {
046                LinkedHashMap<String, ChromosomeSequence> chromosomeSequenceList = new LinkedHashMap<>();
047                Map<String, DNASequence> dnaSequenceList = FastaReaderHelper.readFastaDNASequence(fastaSequenceFile);
048                for (String accession : dnaSequenceList.keySet()) {
049                        DNASequence contigSequence = dnaSequenceList.get(accession);
050                        ChromosomeSequence chromsomeSequence = new ChromosomeSequence(contigSequence.getSequenceAsString());
051                        chromsomeSequence.setAccession(contigSequence.getAccession());
052                        chromosomeSequenceList.put(accession, chromsomeSequence);
053                }
054
055
056                Map<String, DNASequence> geneSequenceList = FastaReaderHelper.readFastaDNASequence(uppercaseFastaFile);
057                for (DNASequence dnaSequence : geneSequenceList.values()) {
058                        String geneSequence = dnaSequence.getSequenceAsString();
059                        String lcGeneSequence = geneSequence.toLowerCase();
060                        String reverseGeneSequence = dnaSequence.getReverse().getSequenceAsString();
061                        String lcReverseGeneSequence = reverseGeneSequence.toLowerCase();
062                        Integer bioStart = null;
063                        Integer bioEnd = null;
064                        Strand strand = Strand.POSITIVE;
065                        boolean geneFound = false;
066                        String accession = "";
067                        DNASequence contigDNASequence = null;
068                        for (String id : dnaSequenceList.keySet()) {
069                                accession = id;
070                                contigDNASequence = dnaSequenceList.get(id);
071                                String contigSequence = contigDNASequence.getSequenceAsString().toLowerCase();
072                                bioStart = contigSequence.indexOf(lcGeneSequence);
073                                if (bioStart != -1) {
074                                        bioStart = bioStart + 1;
075                                        bioEnd = bioStart + geneSequence.length() - 1;
076                                        geneFound = true;
077                                        break;
078                                } else {
079                                        bioStart = contigSequence.indexOf(lcReverseGeneSequence);
080                                        if (bioStart != -1) {
081                                                bioStart = bioStart + 1;
082                                                bioEnd = bioStart - geneSequence.length() - 1;
083                                                strand = Strand.NEGATIVE;
084                                                geneFound = true;
085                                                break;
086                                        }
087                                }
088                        }
089
090                        if (geneFound) {
091                                logger.info("Gene {} found at {} {} {} {}",
092                                                dnaSequence.getAccession().toString(), contigDNASequence.getAccession().toString(), bioStart, bioEnd, strand);
093                                ChromosomeSequence chromosomeSequence = chromosomeSequenceList.get(accession);
094
095                                ArrayList<Integer> exonBoundries = new ArrayList<>();
096
097                                //look for transitions from lowercase to upper case
098                                for (int i = 0; i < geneSequence.length(); i++) {
099                                        if (i == 0 && Character.isUpperCase(geneSequence.charAt(i))) {
100                                                exonBoundries.add(i);
101                                        } else if (i == geneSequence.length() - 1) {
102                                                exonBoundries.add(i);
103                                        } else if (Character.isUpperCase(geneSequence.charAt(i)) && Character.isLowerCase(geneSequence.charAt(i - 1))) {
104                                                exonBoundries.add(i);
105                                        } else if (Character.isUpperCase(geneSequence.charAt(i)) && Character.isLowerCase(geneSequence.charAt(i + 1))) {
106                                                exonBoundries.add(i);
107                                        }
108                                }
109                                if (strand == Strand.NEGATIVE) {
110                                        Collections.reverse(exonBoundries);
111                                }
112
113
114                                String geneaccession = dnaSequence.getAccession().getID();
115                                String note = geneaccession;
116                                String[] values = geneaccession.split(" ");
117                                geneaccession = values[0];
118
119
120
121                                GeneSequence geneSeq = chromosomeSequence.addGene(new AccessionID(geneaccession), bioStart, bioEnd, strand);
122                                geneSeq.addNote(note);
123                                geneSeq.setSource(uppercaseFastaFile.getName());
124                                //String transcriptName = geneaccession + "-transcript";
125                                //TranscriptSequence transcriptSequence = geneSeq.addTranscript(new AccessionID(transcriptName), bioStart, bioEnd);
126
127                                int runningFrameLength = 0;
128                                for (int i = 0; i < exonBoundries.size() - 1; i = i + 2) {
129                                        int cdsBioStart = exonBoundries.get(i) + bioStart;
130                                        int cdsBioEnd = exonBoundries.get(i + 1) + bioStart;
131                                        runningFrameLength = runningFrameLength + Math.abs(cdsBioEnd - cdsBioStart) + 1;
132                                        //String cdsName = transcriptName + "-cds-" + cdsBioStart + "-" + cdsBioEnd;
133
134                                        //AccessionID cdsAccessionID = new AccessionID(cdsName);
135                                        //ExonSequence exonSequence = geneSeq.addExon(cdsAccessionID, cdsBioStart, cdsBioEnd);
136                                        int remainder = runningFrameLength % 3;
137                                        //int frame = 0;
138                                        if (remainder == 1) {
139                                                //frame = 2; // borrow 2 from next CDS region
140                                        } else if (remainder == 2) {
141                                                //frame = 1;
142                                        }
143                                        //CDSSequence cdsSequence = transcriptSequence.addCDS(cdsAccessionID, cdsBioStart, cdsBioEnd, frame);
144                                }
145
146
147                        } else {
148                                if (throwExceptionGeneNotFound) {
149                                        throw new Exception(dnaSequence.getAccession().toString() + " not found");
150                                }
151                                logger.info("Gene not found {}", dnaSequence.getAccession().toString());
152                        }
153
154                }
155                return chromosomeSequenceList;
156        }
157
158        /**
159         * Output a gff3 feature file that will give the length of each scaffold/chromosome in the fasta file.
160         * Used for gbrowse so it knows length.
161         * @param fastaSequenceFile
162         * @param gffFile
163         * @throws Exception
164         */
165        static public void outputFastaSequenceLengthGFF3(File fastaSequenceFile, File gffFile) throws Exception {
166                Map<String, DNASequence> dnaSequenceList = FastaReaderHelper.readFastaDNASequence(fastaSequenceFile);
167                String fileName = fastaSequenceFile.getName();
168                FileWriter fw = new FileWriter(gffFile);
169                String newLine = System.getProperty("line.separator");
170                fw.write("##gff-version 3" + newLine);
171                for (DNASequence dnaSequence : dnaSequenceList.values()) {
172                        String gff3line = dnaSequence.getAccession().getID() + "\t" + fileName + "\t" + "contig" + "\t" + "1" + "\t" + dnaSequence.getBioEnd() + "\t.\t.\t.\tName=" + dnaSequence.getAccession().getID() + newLine;
173                        fw.write(gff3line);
174                }
175                fw.close();
176        }
177
178        /**
179         * Loads Fasta file and GFF2 feature file generated from the geneid prediction algorithm
180         *
181         * @param fastaSequenceFile
182         * @param gffFile
183         * @return
184         * @throws Exception
185         */
186        static public Map<String, ChromosomeSequence> loadFastaAddGeneFeaturesFromGeneIDGFF2(File fastaSequenceFile, File gffFile) throws Exception {
187                Map<String, DNASequence> dnaSequenceList = FastaReaderHelper.readFastaDNASequence(fastaSequenceFile);
188                Map<String, ChromosomeSequence> chromosomeSequenceList = GeneFeatureHelper.getChromosomeSequenceFromDNASequence(dnaSequenceList);
189                FeatureList listGenes = GeneIDGFF2Reader.read(gffFile.getAbsolutePath());
190                addGeneIDGFF2GeneFeatures(chromosomeSequenceList, listGenes);
191                return chromosomeSequenceList;
192        }
193
194        /**
195         * Load GFF2 feature file generated from the geneid prediction algorithm and map features onto the chromosome sequences
196         *
197         * @param chromosomeSequenceList
198         * @param listGenes
199         * @throws Exception
200         */
201        static public void addGeneIDGFF2GeneFeatures(Map<String, ChromosomeSequence> chromosomeSequenceList, FeatureList listGenes) throws Exception {
202                Collection<String> geneIds = listGenes.attributeValues("gene_id");
203                for (String geneid : geneIds) {
204                        FeatureList gene = listGenes.selectByAttribute("gene_id", geneid);
205                        FeatureI geneFeature = gene.get(0);
206                        ChromosomeSequence seq = chromosomeSequenceList.get(geneFeature.seqname());
207                        geneid = geneid.replaceAll("_", ".G");
208                        AccessionID geneAccessionID = new AccessionID(geneid);
209                        GeneSequence geneSequence = null;
210                        Collection<String> transcriptids = gene.attributeValues("gene_id");
211                        for (String transcriptid : transcriptids) {
212                                // get all the individual features (exons, CDS regions, etc.) of this gene
213                                FeatureList transcriptFeature = listGenes.selectByAttribute("gene_id", transcriptid);
214                                transcriptid = transcriptid.replaceAll("_", ".G");
215
216
217
218
219                                //      String seqName = feature.seqname();
220                                //FeatureI startCodon = null;
221                                //FeatureI stopCodon = null;
222                                Integer startCodonBegin = null;
223                                Integer stopCodonEnd = null;
224                                //String startCodonName = "";
225                                //String stopCodonName = "";
226
227
228                                // now select only the coding regions of this gene
229                                FeatureList firstFeatures = transcriptFeature.selectByType("First");
230                                FeatureList terminalFeatures = transcriptFeature.selectByType("Terminal");
231                                FeatureList internalFeatures = transcriptFeature.selectByType("Internal");
232                                FeatureList singleFeatures = transcriptFeature.selectByType("Single");
233                                FeatureList cdsFeatures = new FeatureList();
234                                cdsFeatures.add(firstFeatures);
235                                cdsFeatures.add(terminalFeatures);
236                                cdsFeatures.add(internalFeatures);
237                                cdsFeatures.add(singleFeatures);
238                                // sort them
239                                cdsFeatures = cdsFeatures.sortByStart();
240                                Strand strand = Strand.POSITIVE;
241                                FeatureI feature = cdsFeatures.get(0);
242                                if (feature.location().isNegative()) {
243                                        strand = Strand.NEGATIVE;
244                                }
245                                if (startCodonBegin == null) {
246                                        FeatureI firstFeature = cdsFeatures.get(0);
247                                        if (strand == Strand.NEGATIVE) {
248                                                startCodonBegin = firstFeature.location().bioEnd();
249                                        } else {
250                                                startCodonBegin = firstFeature.location().bioStart();
251                                        }
252                                }
253
254                                if (stopCodonEnd == null) {
255
256                                        FeatureI lastFeature = cdsFeatures.get(cdsFeatures.size() - 1);
257                                        if (strand == Strand.NEGATIVE) {
258                                                stopCodonEnd = lastFeature.location().bioStart();
259                                        } else {
260                                                stopCodonEnd = lastFeature.location().bioEnd();
261                                        }
262                                }
263                                //for gtf ordering can be strand based so first is last and last is first
264                                if (startCodonBegin > stopCodonEnd) {
265                                        int temp = startCodonBegin;
266                                        startCodonBegin = stopCodonEnd;
267                                        stopCodonEnd = temp;
268                                }
269
270                                AccessionID transcriptAccessionID = new AccessionID(transcriptid);
271                                if (geneSequence == null) {
272                                        geneSequence = seq.addGene(geneAccessionID, startCodonBegin, stopCodonEnd, strand);
273                                        geneSequence.setSource(((Feature) feature).source());
274                                } else {
275                                        //if multiple transcripts for one gene make sure the gene is defined as the min and max start/end
276
277                                        if (startCodonBegin < geneSequence.getBioBegin()) {
278                                                geneSequence.setBioBegin(startCodonBegin);
279                                        }
280                                        if (stopCodonEnd > geneSequence.getBioBegin()) {
281                                                geneSequence.setBioEnd(stopCodonEnd);
282                                        }
283
284                                }
285                                TranscriptSequence transcriptSequence = geneSequence.addTranscript(transcriptAccessionID, startCodonBegin, stopCodonEnd);
286                                /*
287                                if (startCodon != null) {
288                                        if (startCodonName == null || startCodonName.length() == 0) {
289                                                startCodonName = transcriptid + "-start_codon-" + startCodon.location().bioStart() + "-" + startCodon.location().bioEnd();
290                                        }
291                                        transcriptSequence.addStartCodonSequence(new AccessionID(startCodonName), startCodon.location().bioStart(), startCodon.location().bioEnd());
292                                }
293                                if (stopCodon != null) {
294                                        if (stopCodonName == null || stopCodonName.length() == 0) {
295                                                stopCodonName = transcriptid + "-stop_codon-" + stopCodon.location().bioStart() + "-" + stopCodon.location().bioEnd();
296                                        }
297                                        transcriptSequence.addStopCodonSequence(new AccessionID(stopCodonName), stopCodon.location().bioStart(), stopCodon.location().bioEnd());
298                                }
299                                */
300
301                                for (FeatureI cdsFeature : cdsFeatures) {
302                                        Feature cds = (Feature) cdsFeature;
303                                        String cdsName = cds.getAttribute("transcript_name");
304                                        if (cdsName == null || cdsName.length() == 0) {
305                                                cdsName = transcriptid + "-cds-" + cds.location().bioStart() + "-" + cds.location().bioEnd();
306                                        }
307                                        AccessionID cdsAccessionID = new AccessionID(cdsName);
308                                        //ExonSequence exonSequence = geneSequence.addExon(cdsAccessionID, cdsFeature.location().bioStart(), cdsFeature.location().bioEnd());
309                                        CDSSequence cdsSequence = transcriptSequence.addCDS(cdsAccessionID, cdsFeature.location().bioStart(), cdsFeature.location().bioEnd(), cds.frame());
310                                        cdsSequence.setSequenceScore(cds.score());
311                                }
312                        }
313                }
314
315        }
316
317        static public Map<String, ChromosomeSequence> getChromosomeSequenceFromDNASequence(Map<String, DNASequence> dnaSequenceList) {
318                LinkedHashMap<String, ChromosomeSequence> chromosomeSequenceList = new LinkedHashMap<>();
319                for (String key : dnaSequenceList.keySet()) {
320                        DNASequence dnaSequence = dnaSequenceList.get(key);
321                        ChromosomeSequence chromosomeSequence = new ChromosomeSequence(dnaSequence.getProxySequenceReader()); //we want the underlying sequence but don't need storage
322                        chromosomeSequence.setAccession(dnaSequence.getAccession());
323                        chromosomeSequenceList.put(key, chromosomeSequence);
324                }
325                return chromosomeSequenceList;
326        }
327
328        /**
329         * Lots of variations in the ontology or descriptors that can be used in GFF3 which requires writing a custom parser to handle a GFF3 generated or used
330         * by a specific application. Probably could be abstracted out but for now easier to handle with custom code to deal with gff3 elements that are not
331         * included but can be extracted from other data elements.
332         * @param fastaSequenceFile
333         * @param gffFile
334         * @param lazyloadsequences If set to true then the fasta file will be parsed for accession id but sequences will be read from disk when needed to save memory
335         * @return
336         * @throws Exception
337         */
338        static public Map<String, ChromosomeSequence> loadFastaAddGeneFeaturesFromGmodGFF3(File fastaSequenceFile, File gffFile,boolean lazyloadsequences) throws Exception {
339                Map<String, DNASequence> dnaSequenceList = FastaReaderHelper.readFastaDNASequence(fastaSequenceFile,lazyloadsequences);
340                Map<String, ChromosomeSequence> chromosomeSequenceList = GeneFeatureHelper.getChromosomeSequenceFromDNASequence(dnaSequenceList);
341                FeatureList listGenes = GFF3Reader.read(gffFile.getAbsolutePath());
342                addGmodGFF3GeneFeatures(chromosomeSequenceList, listGenes);
343                return chromosomeSequenceList;
344        }
345
346        /**
347         * Load GFF3 file using mRNA as the gene feature as not all GFF3 files are complete
348         * @param chromosomeSequenceList
349         * @param listGenes
350         * @throws Exception
351         */
352        static public void addGmodGFF3GeneFeatures(Map<String, ChromosomeSequence> chromosomeSequenceList, FeatureList listGenes) throws Exception {
353
354
355                // key off mRNA as being a known feature that may or may not have a parent gene
356
357
358                FeatureList mRNAFeatures = listGenes.selectByType("mRNA");
359                LinkedHashMap<String,FeatureList> featureIDHashMap = FeatureHelper.buildFeatureAtrributeIndex("ID", listGenes);
360                LinkedHashMap<String,FeatureList> featureParentHashMap = FeatureHelper.buildFeatureAtrributeIndex("Parent", listGenes);
361
362                for (FeatureI f : mRNAFeatures) {
363                        String geneID;
364                        String geneNote = null;
365                        String geneSource = null;
366                        String sequenceName = null;
367                        ChromosomeSequence seq = null;
368                        GeneSequence geneSequence = null;
369
370                        Feature mRNAFeature = (Feature) f;
371                        String mRNAID = mRNAFeature.getAttribute("ID");
372                        String mRNAsource = mRNAFeature.source();
373                        String mRNANote = mRNAFeature.getAttribute("Note");
374                        String mRNAParent = mRNAFeature.getAttribute("Parent");
375                        if (mRNAParent != null && mRNAParent.length() > 0) {
376                           // FeatureList geneFeatureList = listGenes.selectByAttribute("ID", mRNAParent);
377                                FeatureList geneFeatureList = featureIDHashMap.get(mRNAParent);
378                                Feature geneFeature = (Feature) geneFeatureList.get(0);
379                                geneID = geneFeature.getAttribute("ID");
380                                geneNote = geneFeature.getAttribute("Note");
381                                geneSource = geneFeature.source();
382                                sequenceName = geneFeature.seqname();
383
384                                //
385                        } else {
386                                //deal with cases where no parent gene is given
387                                geneID = mRNAID;
388                                geneSource = mRNAsource;
389                                sequenceName = mRNAFeature.seqname();
390                        }
391
392                        seq = chromosomeSequenceList.get(sequenceName);
393
394                        AccessionID geneAccessionID = new AccessionID(geneID);
395
396                  //  FeatureList mRNAChildren = listGenes.selectByAttribute("Parent", mRNAID);
397                        FeatureList mRNAChildren = featureParentHashMap.get(mRNAID);
398                        FeatureList cdsFeatures = mRNAChildren.selectByType("CDS");
399                        FeatureI feature = cdsFeatures.get(0);
400                        Strand strand = Strand.POSITIVE;
401
402                        if (feature.location().isNegative()) {
403                                strand = Strand.NEGATIVE;
404                        }
405                        cdsFeatures = cdsFeatures.sortByStart();
406
407
408
409
410
411
412
413                        //String seqName = feature.seqname();
414                        FeatureI startCodon = null;
415                        FeatureI stopCodon = null;
416                        Integer startCodonBegin = null;
417                        Integer stopCodonEnd = null;
418                        String startCodonName = "";
419                        String stopCodonName = "";
420                        FeatureList startCodonList = mRNAChildren.selectByType("five_prime_UTR");
421                        if (startCodonList != null && startCodonList.size() > 0) {
422                                startCodon = startCodonList.get(0);
423                                if (strand == Strand.NEGATIVE) {
424                                        startCodonBegin = startCodon.location().bioEnd();
425                                } else {
426                                        startCodonBegin = startCodon.location().bioStart();
427                                }
428                                startCodonName = startCodon.getAttribute("ID");
429                        }
430
431                        FeatureList stopCodonList = mRNAChildren.selectByType("three_prime_UTR");
432
433                        if (stopCodonList != null && stopCodonList.size() > 0) {
434                                stopCodon = stopCodonList.get(0);
435                                if (strand == Strand.NEGATIVE) {
436                                        stopCodonEnd = stopCodon.location().bioStart();
437                                } else {
438                                        stopCodonEnd = stopCodon.location().bioEnd();
439                                }
440                                stopCodonName = stopCodon.getAttribute("ID");
441
442                        }
443
444
445
446
447                        if (startCodonBegin == null) {
448                                if (strand == Strand.NEGATIVE) {
449                                        FeatureI firstFeature = cdsFeatures.get(0);
450
451                                        startCodonBegin = firstFeature.location().bioEnd();
452                                } else {
453                                        FeatureI firstFeature = cdsFeatures.get(0);
454
455                                        startCodonBegin = firstFeature.location().bioStart();
456                                }
457                        }
458
459                        if (stopCodonEnd == null) {
460                                if (strand == Strand.NEGATIVE) {
461                                        FeatureI lastFeature = cdsFeatures.get(cdsFeatures.size() - 1);
462                                        stopCodonEnd = lastFeature.location().bioStart();
463                                } else {
464                                        FeatureI lastFeature = cdsFeatures.get(cdsFeatures.size() - 1);
465                                        stopCodonEnd = lastFeature.location().bioEnd();
466                                }
467                        }
468                        //for gtf ordering can be strand based so first is last and last is first
469                        if (startCodonBegin > stopCodonEnd) {
470                                int temp = startCodonBegin;
471                                startCodonBegin = stopCodonEnd;
472                                stopCodonEnd = temp;
473                        }
474
475
476
477                        AccessionID transcriptAccessionID = new AccessionID(mRNAID);
478                        geneSequence = seq.getGene(geneID);
479                        if (geneSequence == null) {
480                                geneSequence = seq.addGene(geneAccessionID, startCodonBegin, stopCodonEnd, strand);
481                                geneSequence.setSource(geneSource);
482                                if (geneNote != null && geneNote.length() > 0) {
483                                        geneSequence.addNote(geneNote);
484                                }
485                        } else {
486
487                                if (startCodonBegin < geneSequence.getBioBegin()) {
488                                        geneSequence.setBioBegin(startCodonBegin);
489                                }
490                                if (stopCodonEnd > geneSequence.getBioBegin()) {
491                                        geneSequence.setBioEnd(stopCodonEnd);
492                                }
493
494                        }
495                        TranscriptSequence transcriptSequence = geneSequence.addTranscript(transcriptAccessionID, startCodonBegin, stopCodonEnd);
496                        transcriptSequence.setSource(mRNAsource);
497                        if (mRNANote != null && mRNANote.length() > 0) {
498                                transcriptSequence.addNote(mRNANote);
499
500                        }
501                        if (startCodon != null) {
502                                if (startCodonName == null || startCodonName.length() == 0) {
503                                        startCodonName = mRNAID + "-start_codon-" + startCodon.location().bioStart() + "-" + startCodon.location().bioEnd();
504                                }
505                                transcriptSequence.addStartCodonSequence(new AccessionID(startCodonName), startCodon.location().bioStart(), startCodon.location().bioEnd());
506                        }
507                        if (stopCodon != null) {
508                                if (stopCodonName == null || stopCodonName.length() == 0) {
509                                        stopCodonName = mRNAID + "-stop_codon-" + stopCodon.location().bioStart() + "-" + stopCodon.location().bioEnd();
510                                }
511                                transcriptSequence.addStopCodonSequence(new AccessionID(stopCodonName), stopCodon.location().bioStart(), stopCodon.location().bioEnd());
512                        }
513
514                        for (FeatureI cdsFeature : cdsFeatures) {
515                                Feature cds = (Feature) cdsFeature;
516                                String cdsNote = cdsFeature.getAttribute("Note");
517                                String cdsSource = cds.source();
518                                String cdsName = cds.getAttribute("ID");
519                                if (cdsName == null || cdsName.length() == 0) {
520                                        cdsName = mRNAID + "-cds-" + cds.location().bioStart() + "-" + cds.location().bioEnd();
521                                }
522                                AccessionID cdsAccessionID = new AccessionID(cdsName);
523                                ExonSequence exonSequence = geneSequence.addExon(cdsAccessionID, cdsFeature.location().bioStart(), cdsFeature.location().bioEnd());
524                                exonSequence.setSource(cdsSource);
525                                if (cdsNote != null && cdsNote.length() > 0) {
526                                        exonSequence.addNote(cdsNote);
527                                }
528                                transcriptSequence.addCDS(cdsAccessionID, cdsFeature.location().bioStart(), cdsFeature.location().bioEnd(), cds.frame());
529                        }
530                        geneSequence.addIntronsUsingExons();
531
532                }
533
534        }
535
536        static public Map<String, ChromosomeSequence> loadFastaAddGeneFeaturesFromGlimmerGFF3(File fastaSequenceFile, File gffFile) throws Exception {
537                Map<String, DNASequence> dnaSequenceList = FastaReaderHelper.readFastaDNASequence(fastaSequenceFile);
538                Map<String, ChromosomeSequence> chromosomeSequenceList = GeneFeatureHelper.getChromosomeSequenceFromDNASequence(dnaSequenceList);
539                FeatureList listGenes = GFF3Reader.read(gffFile.getAbsolutePath());
540                addGlimmerGFF3GeneFeatures(chromosomeSequenceList, listGenes);
541                return chromosomeSequenceList;
542        }
543
544        static public void addGlimmerGFF3GeneFeatures(Map<String, ChromosomeSequence> chromosomeSequenceList, FeatureList listGenes) throws Exception {
545                FeatureList mRNAFeatures = listGenes.selectByType("mRNA");
546                for (FeatureI f : mRNAFeatures) {
547                        Feature mRNAFeature = (Feature) f;
548                        String geneid = mRNAFeature.getAttribute("ID");
549                        String source = mRNAFeature.source();
550
551                        FeatureList gene = listGenes.selectByAttribute("Parent", geneid);
552                        FeatureI geneFeature = gene.get(0);
553                        ChromosomeSequence seq = chromosomeSequenceList.get(geneFeature.seqname());
554                        AccessionID geneAccessionID = new AccessionID(geneid);
555                        GeneSequence geneSequence = null;
556
557                        FeatureList cdsFeatures = gene.selectByType("CDS");
558                        FeatureI feature = cdsFeatures.get(0);
559                        Strand strand = Strand.POSITIVE;
560
561                        if (feature.location().isNegative()) {
562                                strand = Strand.NEGATIVE;
563                        }
564                        cdsFeatures = cdsFeatures.sortByStart();
565
566
567
568
569
570
571
572                        //String seqName = feature.seqname();
573                        FeatureI startCodon = null;
574                        FeatureI stopCodon = null;
575                        Integer startCodonBegin = null;
576                        Integer stopCodonEnd = null;
577                        String startCodonName = "";
578                        String stopCodonName = "";
579                        FeatureList startCodonList = gene.selectByAttribute("Note", "initial-exon");
580                        if (startCodonList != null && startCodonList.size() > 0) {
581                                startCodon = startCodonList.get(0);
582                                if (strand == Strand.NEGATIVE) {
583                                        startCodonBegin = startCodon.location().bioEnd();
584                                } else {
585                                        startCodonBegin = startCodon.location().bioStart();
586                                }
587                                startCodonName = startCodon.getAttribute("ID");
588                        }
589
590                        FeatureList stopCodonList = gene.selectByAttribute("Note", "final-exon");
591
592                        if (stopCodonList != null && stopCodonList.size() > 0) {
593                                stopCodon = stopCodonList.get(0);
594                                if (strand == Strand.NEGATIVE) {
595                                        stopCodonEnd = stopCodon.location().bioStart();
596                                } else {
597                                        stopCodonEnd = stopCodon.location().bioEnd();
598                                }
599                                stopCodonName = stopCodon.getAttribute("ID");
600
601                        }
602
603
604
605
606                        if (startCodonBegin == null) {
607                                if (strand == Strand.NEGATIVE) {
608                                        FeatureI firstFeature = cdsFeatures.get(0);
609
610                                        startCodonBegin = firstFeature.location().bioEnd();
611                                } else {
612                                        FeatureI firstFeature = cdsFeatures.get(0);
613
614                                        startCodonBegin = firstFeature.location().bioStart();
615                                }
616                        }
617
618                        if (stopCodonEnd == null) {
619                                if (strand == Strand.NEGATIVE) {
620                                        FeatureI lastFeature = cdsFeatures.get(cdsFeatures.size() - 1);
621                                        stopCodonEnd = lastFeature.location().bioStart();
622                                } else {
623                                        FeatureI lastFeature = cdsFeatures.get(cdsFeatures.size() - 1);
624                                        stopCodonEnd = lastFeature.location().bioEnd();
625                                }
626                        }
627                        //for gtf ordering can be strand based so first is last and last is first
628                        if (startCodonBegin > stopCodonEnd) {
629                                int temp = startCodonBegin;
630                                startCodonBegin = stopCodonEnd;
631                                stopCodonEnd = temp;
632                        }
633
634
635
636                        AccessionID transcriptAccessionID = new AccessionID(geneid);
637                        if (geneSequence == null) {
638                                geneSequence = seq.addGene(geneAccessionID, startCodonBegin, stopCodonEnd, strand);
639                                geneSequence.setSource(source);
640                        }
641                        /*
642                        else {
643
644                                if (startCodonBegin < geneSequence.getBioBegin()) {
645                                        geneSequence.setBioBegin(startCodonBegin);
646                                }
647                                if (stopCodonEnd > geneSequence.getBioBegin()) {
648                                        geneSequence.setBioEnd(stopCodonEnd);
649                                }
650                        }
651                        */
652                        TranscriptSequence transcriptSequence = geneSequence.addTranscript(transcriptAccessionID, startCodonBegin, stopCodonEnd);
653                        if (startCodon != null) {
654                                if (startCodonName == null || startCodonName.length() == 0) {
655                                        startCodonName = geneid + "-start_codon-" + startCodon.location().bioStart() + "-" + startCodon.location().bioEnd();
656                                }
657                                transcriptSequence.addStartCodonSequence(new AccessionID(startCodonName), startCodon.location().bioStart(), startCodon.location().bioEnd());
658                        }
659                        if (stopCodon != null) {
660                                if (stopCodonName == null || stopCodonName.length() == 0) {
661                                        stopCodonName = geneid + "-stop_codon-" + stopCodon.location().bioStart() + "-" + stopCodon.location().bioEnd();
662                                }
663                                transcriptSequence.addStopCodonSequence(new AccessionID(stopCodonName), stopCodon.location().bioStart(), stopCodon.location().bioEnd());
664                        }
665
666                        for (FeatureI cdsFeature : cdsFeatures) {
667                                Feature cds = (Feature) cdsFeature;
668                                String cdsName = cds.getAttribute("ID");
669                                if (cdsName == null || cdsName.length() == 0) {
670                                        cdsName = geneid + "-cds-" + cds.location().bioStart() + "-" + cds.location().bioEnd();
671                                }
672                                AccessionID cdsAccessionID = new AccessionID(cdsName);
673                                //ExonSequence exonSequence = geneSequence.addExon(cdsAccessionID, cdsFeature.location().bioStart(), cdsFeature.location().bioEnd());
674                                transcriptSequence.addCDS(cdsAccessionID, cdsFeature.location().bioStart(), cdsFeature.location().bioEnd(), cds.frame());
675                        }
676
677                }
678
679        }
680
681        static public Map<String, ChromosomeSequence> loadFastaAddGeneFeaturesFromGeneMarkGTF(File fastaSequenceFile, File gffFile) throws Exception {
682                Map<String, DNASequence> dnaSequenceList = FastaReaderHelper.readFastaDNASequence(fastaSequenceFile);
683                Map<String, ChromosomeSequence> chromosomeSequenceList = GeneFeatureHelper.getChromosomeSequenceFromDNASequence(dnaSequenceList);
684                FeatureList listGenes = GeneMarkGTFReader.read(gffFile.getAbsolutePath());
685                addGeneMarkGTFGeneFeatures(chromosomeSequenceList, listGenes);
686                return chromosomeSequenceList;
687        }
688
689        static public void addGeneMarkGTFGeneFeatures(Map<String, ChromosomeSequence> chromosomeSequenceList, FeatureList listGenes) throws Exception {
690                Collection<String> geneIds = listGenes.attributeValues("gene_id");
691                for (String geneid : geneIds) {
692                        //       if(geneid.equals("45_g")){
693                        //           int dummy =1;
694                        //       }
695                        FeatureList gene = listGenes.selectByAttribute("gene_id", geneid);
696                        FeatureI geneFeature = gene.get(0);
697                        ChromosomeSequence seq = chromosomeSequenceList.get(geneFeature.seqname());
698                        AccessionID geneAccessionID = new AccessionID(geneid);
699                        GeneSequence geneSequence = null;
700                        Collection<String> transcriptids = gene.attributeValues("transcript_id");
701                        for (String transcriptid : transcriptids) {
702                                // get all the individual features (exons, CDS regions, etc.) of this gene
703
704
705                                FeatureList transcriptFeature = listGenes.selectByAttribute("transcript_id", transcriptid);
706                                // now select only the coding regions of this gene
707                                FeatureList cdsFeatures = transcriptFeature.selectByType("CDS");
708                                // sort them
709                                cdsFeatures = cdsFeatures.sortByStart();
710
711                                FeatureI feature = cdsFeatures.get(0);
712                                Strand strand = Strand.POSITIVE;
713
714                                if (feature.location().isNegative()) {
715                                        strand = Strand.NEGATIVE;
716                                }
717
718                                //String seqName = feature.seqname();
719                                FeatureI startCodon = null;
720                                FeatureI stopCodon = null;
721                                Integer startCodonBegin = null;
722                                Integer stopCodonEnd = null;
723                                String startCodonName = "";
724                                String stopCodonName = "";
725                                FeatureList startCodonList = transcriptFeature.selectByType("start_codon");
726                                if (startCodonList != null && startCodonList.size() > 0) {
727                                        startCodon = startCodonList.get(0);
728                                        if (strand == Strand.POSITIVE) {
729                                                startCodonBegin = startCodon.location().bioStart();
730                                        } else {
731                                                startCodonBegin = startCodon.location().bioEnd();
732                                        }
733                                        startCodonName = startCodon.getAttribute("transcript_name");
734                                }
735
736                                FeatureList stopCodonList = transcriptFeature.selectByType("stop_codon");
737
738                                if (stopCodonList != null && stopCodonList.size() > 0) {
739                                        stopCodon = stopCodonList.get(0);
740                                        if (strand == Strand.POSITIVE) {
741                                                stopCodonEnd = stopCodon.location().bioEnd();
742                                        } else {
743                                                stopCodonEnd = stopCodon.location().bioStart();
744                                        }
745
746                                        stopCodonName = stopCodon.getAttribute("transcript_name");
747
748                                }
749
750
751
752
753                                if (startCodonBegin == null) {
754                                        if (strand == Strand.NEGATIVE) {
755                                                FeatureI firstFeature = cdsFeatures.get(0);
756
757                                                startCodonBegin = firstFeature.location().bioEnd();
758                                        } else {
759                                                FeatureI firstFeature = cdsFeatures.get(0);
760
761                                                startCodonBegin = firstFeature.location().bioStart();
762                                        }
763                                }
764
765                                if (stopCodonEnd == null) {
766                                        if (strand == Strand.NEGATIVE) {
767                                                FeatureI lastFeature = cdsFeatures.get(cdsFeatures.size() - 1);
768                                                stopCodonEnd = lastFeature.location().bioStart();
769                                        } else {
770                                                FeatureI lastFeature = cdsFeatures.get(cdsFeatures.size() - 1);
771                                                stopCodonEnd = lastFeature.location().bioEnd();
772                                        }
773                                }
774                                //for gtf ordering can be strand based so first is last and last is first
775                                if (startCodonBegin > stopCodonEnd) {
776                                        int temp = startCodonBegin;
777                                        startCodonBegin = stopCodonEnd;
778                                        stopCodonEnd = temp;
779                                }
780
781                                AccessionID transcriptAccessionID = new AccessionID(transcriptid);
782                                if (geneSequence == null) {
783                                        geneSequence = seq.addGene(geneAccessionID, startCodonBegin, stopCodonEnd, strand);
784                                        geneSequence.setSource(((Feature) feature).source());
785                                } else {
786                                        //if multiple transcripts for one gene make sure the gene is defined as the min and max start/end
787
788                                        if (startCodonBegin < geneSequence.getBioBegin()) {
789                                                geneSequence.setBioBegin(startCodonBegin);
790                                        }
791                                        if (stopCodonEnd > geneSequence.getBioBegin()) {
792                                                geneSequence.setBioEnd(stopCodonEnd);
793                                        }
794
795                                }
796                                TranscriptSequence transcriptSequence = geneSequence.addTranscript(transcriptAccessionID, startCodonBegin, stopCodonEnd);
797                                if (startCodon != null) {
798                                        if (startCodonName == null || startCodonName.length() == 0) {
799                                                startCodonName = transcriptid + "-start_codon-" + startCodon.location().bioStart() + "-" + startCodon.location().bioEnd();
800                                        }
801                                        transcriptSequence.addStartCodonSequence(new AccessionID(startCodonName), startCodon.location().bioStart(), startCodon.location().bioEnd());
802                                }
803                                if (stopCodon != null) {
804                                        if (stopCodonName == null || stopCodonName.length() == 0) {
805                                                stopCodonName = transcriptid + "-stop_codon-" + stopCodon.location().bioStart() + "-" + stopCodon.location().bioEnd();
806                                        }
807                                        transcriptSequence.addStopCodonSequence(new AccessionID(stopCodonName), stopCodon.location().bioStart(), stopCodon.location().bioEnd());
808                                }
809
810                                for (FeatureI cdsFeature : cdsFeatures) {
811                                        Feature cds = (Feature) cdsFeature;
812                                        // for genemark it appears frame of 2 =1 and frame of 1 = 2
813                                        // doesn't matter when you string cds regions together as one block
814                                        // but does make a difference when you try to make a protein sequence for each CDS region where
815                                        // you give up or borrow based on the frame value
816                                        // compared with gff like files and docs for geneid and glimmer where geneid and glimmer both do it the same
817                                        // way that appears to match the gff3 docs.
818                                        int frame = cds.frame();
819                                        if (frame == 1) {
820                                                frame = 2;
821                                        } else if (frame == 2) {
822                                                frame = 1;
823                                        } else {
824                                                frame = 0;
825                                        }
826                                        String cdsName = cds.getAttribute("transcript_name");
827                                        if (cdsName == null || cdsName.length() == 0) {
828                                                cdsName = transcriptid + "-cds-" + cds.location().bioStart() + "-" + cds.location().bioEnd();
829                                        }
830                                        AccessionID cdsAccessionID = new AccessionID(cdsName);
831                                        //ExonSequence exonSequence = geneSequence.addExon(cdsAccessionID, cdsFeature.location().bioStart(), cdsFeature.location().bioEnd());
832                                        transcriptSequence.addCDS(cdsAccessionID, cdsFeature.location().bioStart(), cdsFeature.location().bioEnd(), frame);
833                                }
834                        }
835                }
836
837        }
838
839        static public LinkedHashMap<String, ProteinSequence> getProteinSequences(Collection<ChromosomeSequence> chromosomeSequences) throws Exception {
840                LinkedHashMap<String, ProteinSequence> proteinSequenceHashMap = new LinkedHashMap<>();
841                for (ChromosomeSequence dnaSequence : chromosomeSequences) {
842                        for (GeneSequence geneSequence : dnaSequence.getGeneSequences().values()) {
843                                for (TranscriptSequence transcriptSequence : geneSequence.getTranscripts().values()) {
844                                        //TODO remove?
845//                    DNASequence dnaCodingSequence = transcriptSequence.getDNACodingSequence();
846//                    logger.info("CDS={}", dnaCodingSequence.getSequenceAsString());
847
848                                        try {
849                                                ProteinSequence proteinSequence = transcriptSequence.getProteinSequence();
850
851//                        logger.info("{} {}", proteinSequence.getAccession().getID(), proteinSequence);
852                                                if (proteinSequenceHashMap.containsKey(proteinSequence.getAccession().getID())) {
853                                                        throw new Exception("Duplicate protein sequence id=" + proteinSequence.getAccession().getID() + " found at Gene id=" + geneSequence.getAccession().getID());
854                                                } else {
855                                                        proteinSequenceHashMap.put(proteinSequence.getAccession().getID(), proteinSequence);
856                                                }
857                                        } catch (Exception e) {
858                                                logger.error("Exception: ", e);
859                                        }
860
861                                }
862
863                        }
864                }
865                return proteinSequenceHashMap;
866        }
867
868        static public LinkedHashMap<String, GeneSequence> getGeneSequences(Collection<ChromosomeSequence> chromosomeSequences) throws Exception {
869                LinkedHashMap<String, GeneSequence> geneSequenceHashMap = new LinkedHashMap<>();
870                for (ChromosomeSequence chromosomeSequence : chromosomeSequences) {
871                        for (GeneSequence geneSequence : chromosomeSequence.getGeneSequences().values()) {
872                                geneSequenceHashMap.put(geneSequence.getAccession().getID(), geneSequence);
873                        }
874                }
875
876                return geneSequenceHashMap;
877        }
878
879        public static void main(String[] args) throws Exception {
880/*        if (false) {
881                        LinkedHashMap<String, ChromosomeSequence> chromosomeSequenceList = GeneFeatureHelper.loadFastaAddGeneFeaturesFromGeneMarkGTF(new File("Scaffolds.fna"), new File("genemark_hmm.gtf"));
882                        LinkedHashMap<String, ProteinSequence> proteinSequenceList = GeneFeatureHelper.getProteinSequences(chromosomeSequenceList.values());
883                }
884
885                if (false) {
886                        LinkedHashMap<String, ChromosomeSequence> chromosomeSequenceList = GeneFeatureHelper.loadFastaAddGeneFeaturesFromGlimmerGFF3(new File("Scaffolds.fna"), new File("glimmerhmm.gff"));
887                        LinkedHashMap<String, ProteinSequence> proteinSequenceList = GeneFeatureHelper.getProteinSequences(chromosomeSequenceList.values());
888                        //  for (ProteinSequence proteinSequence : proteinSequenceList.values()) {
889                        //      logger.info(proteinSequence.getAccession().getID() + " " + proteinSequence);
890                        //  }
891                        FastaWriterHelper.writeProteinSequence(new File("predicted_glimmer.faa"), proteinSequenceList.values());
892
893                }
894                if (false) {
895                        GeneFeatureHelper.outputFastaSequenceLengthGFF3(new File("Scaffolds.fna"), new File("scaffolds.gff3"));
896                }
897
898 */
899
900                try {
901
902                        if (true) {
903                                //File fastaSequenceFile = new File("Scaffolds.fna");
904                                //File gff3File = new File("geneid-v6.gff3");
905                                //LinkedHashMap<String, ChromosomeSequence> chromosomeSequenceHashMap = GeneFeatureHelper.loadFastaAddGeneFeaturesFromGmodGFF3(fastaSequenceFile, gff3File,true);
906                        }
907
908                } catch (Exception e) {
909                        logger.error("Exception: ", e);
910                }
911
912        }
913}