Source code

001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021package org.biojava.nbio.genome;
022
023import org.biojava.nbio.genome.parsers.gff.*;
024import org.biojava.nbio.core.sequence.*;
025import org.biojava.nbio.core.sequence.io.FastaReaderHelper;
026import org.slf4j.Logger;
027import org.slf4j.LoggerFactory;
028
029import java.io.File;
030import java.io.FileWriter;
031import java.util.ArrayList;
032import java.util.Collection;
033import java.util.Collections;
034import java.util.LinkedHashMap;
035
036/**
037 *
038 * @author Scooter Willis <willishf at gmail dot com>
039 */
040public class GeneFeatureHelper {
041
042        private static final Logger logger = LoggerFactory.getLogger(GeneFeatureHelper.class);
043
044        static public LinkedHashMap<String, ChromosomeSequence> loadFastaAddGeneFeaturesFromUpperCaseExonFastaFile(File fastaSequenceFile, File uppercaseFastaFile, boolean throwExceptionGeneNotFound) throws Exception {
045                LinkedHashMap<String, ChromosomeSequence> chromosomeSequenceList = new LinkedHashMap<String, ChromosomeSequence>();
046                LinkedHashMap<String, DNASequence> dnaSequenceList = FastaReaderHelper.readFastaDNASequence(fastaSequenceFile);
047                for (String accession : dnaSequenceList.keySet()) {
048                        DNASequence contigSequence = dnaSequenceList.get(accession);
049                        ChromosomeSequence chromsomeSequence = new ChromosomeSequence(contigSequence.getSequenceAsString());
050                        chromsomeSequence.setAccession(contigSequence.getAccession());
051                        chromosomeSequenceList.put(accession, chromsomeSequence);
052                }
053
054
055                LinkedHashMap<String, DNASequence> geneSequenceList = FastaReaderHelper.readFastaDNASequence(uppercaseFastaFile);
056                for (DNASequence dnaSequence : geneSequenceList.values()) {
057                        String geneSequence = dnaSequence.getSequenceAsString();
058                        String lcGeneSequence = geneSequence.toLowerCase();
059                        String reverseGeneSequence = dnaSequence.getReverse().getSequenceAsString();
060                        String lcReverseGeneSequence = reverseGeneSequence.toLowerCase();
061                        Integer bioStart = null;
062                        Integer bioEnd = null;
063                        Strand strand = Strand.POSITIVE;
064                        boolean geneFound = false;
065                        String accession = "";
066                        DNASequence contigDNASequence = null;
067                        for (String id : dnaSequenceList.keySet()) {
068                                accession = id;
069                                contigDNASequence = dnaSequenceList.get(id);
070                                String contigSequence = contigDNASequence.getSequenceAsString().toLowerCase();
071                                bioStart = contigSequence.indexOf(lcGeneSequence);
072                                if (bioStart != -1) {
073                                        bioStart = bioStart + 1;
074                                        bioEnd = bioStart + geneSequence.length() - 1;
075                                        geneFound = true;
076                                        break;
077                                } else {
078                                        bioStart = contigSequence.indexOf(lcReverseGeneSequence);
079                                        if (bioStart != -1) {
080                                                bioStart = bioStart + 1;
081                                                bioEnd = bioStart - geneSequence.length() - 1;
082                                                strand = Strand.NEGATIVE;
083                                                geneFound = true;
084                                                break;
085                                        }
086                                }
087                        }
088
089                        if (geneFound) {
090                                logger.info("Gene {} found at {} {} {} {}",
091                                                dnaSequence.getAccession().toString(), contigDNASequence.getAccession().toString(), bioStart, bioEnd, strand);
092                                ChromosomeSequence chromosomeSequence = chromosomeSequenceList.get(accession);
093
094                                ArrayList<Integer> exonBoundries = new ArrayList<Integer>();
095
096                                //look for transitions from lowercase to upper case
097                                for (int i = 0; i < geneSequence.length(); i++) {
098                                        if (i == 0 && Character.isUpperCase(geneSequence.charAt(i))) {
099                                                exonBoundries.add(i);
100                                        } else if (i == geneSequence.length() - 1) {
101                                                exonBoundries.add(i);
102                                        } else if (Character.isUpperCase(geneSequence.charAt(i)) && Character.isLowerCase(geneSequence.charAt(i - 1))) {
103                                                exonBoundries.add(i);
104                                        } else if (Character.isUpperCase(geneSequence.charAt(i)) && Character.isLowerCase(geneSequence.charAt(i + 1))) {
105                                                exonBoundries.add(i);
106                                        }
107                                }
108                                if (strand == Strand.NEGATIVE) {
109                                        Collections.reverse(exonBoundries);
110                                }
111
112
113                                String geneaccession = dnaSequence.getAccession().getID();
114                                String note = geneaccession;
115                                String[] values = geneaccession.split(" ");
116                                geneaccession = values[0];
117
118
119
120                                GeneSequence geneSeq = chromosomeSequence.addGene(new AccessionID(geneaccession), bioStart, bioEnd, strand);
121                                geneSeq.addNote(note);
122                                geneSeq.setSource(uppercaseFastaFile.getName());
123                                //String transcriptName = geneaccession + "-transcript";
124                                //TranscriptSequence transcriptSequence = geneSeq.addTranscript(new AccessionID(transcriptName), bioStart, bioEnd);
125
126                                int runningFrameLength = 0;
127                                for (int i = 0; i < exonBoundries.size() - 1; i = i + 2) {
128                                        int cdsBioStart = exonBoundries.get(i) + bioStart;
129                                        int cdsBioEnd = exonBoundries.get(i + 1) + bioStart;
130                                        runningFrameLength = runningFrameLength + Math.abs(cdsBioEnd - cdsBioStart) + 1;
131                                        //String cdsName = transcriptName + "-cds-" + cdsBioStart + "-" + cdsBioEnd;
132
133                                        //AccessionID cdsAccessionID = new AccessionID(cdsName);
134                                        //ExonSequence exonSequence = geneSeq.addExon(cdsAccessionID, cdsBioStart, cdsBioEnd);
135                                        int remainder = runningFrameLength % 3;
136                                        //int frame = 0;
137                                        if (remainder == 1) {
138                                                //frame = 2; // borrow 2 from next CDS region
139                                        } else if (remainder == 2) {
140                                                //frame = 1;
141                                        }
142                                        //CDSSequence cdsSequence = transcriptSequence.addCDS(cdsAccessionID, cdsBioStart, cdsBioEnd, frame);
143                                }
144
145
146                        } else {
147                                if (throwExceptionGeneNotFound) {
148                                        throw new Exception(dnaSequence.getAccession().toString() + " not found");
149                                }
150                                logger.info("Gene not found {}", dnaSequence.getAccession().toString());
151                        }
152
153                }
154                return chromosomeSequenceList;
155        }
156
157        /**
158         * Output a gff3 feature file that will give the length of each scaffold/chromosome in the fasta file.
159         * Used for gbrowse so it knows length.
160         * @param fastaSequenceFile
161         * @param gffFile
162         * @throws Exception
163         */
164        static public void outputFastaSequenceLengthGFF3(File fastaSequenceFile, File gffFile) throws Exception {
165                LinkedHashMap<String, DNASequence> dnaSequenceList = FastaReaderHelper.readFastaDNASequence(fastaSequenceFile);
166                String fileName = fastaSequenceFile.getName();
167                FileWriter fw = new FileWriter(gffFile);
168                String newLine = System.getProperty("line.separator");
169                fw.write("##gff-version 3" + newLine);
170                for (DNASequence dnaSequence : dnaSequenceList.values()) {
171                        String gff3line = dnaSequence.getAccession().getID() + "\t" + fileName + "\t" + "contig" + "\t" + "1" + "\t" + dnaSequence.getBioEnd() + "\t.\t.\t.\tName=" + dnaSequence.getAccession().getID() + newLine;
172                        fw.write(gff3line);
173                }
174                fw.close();
175        }
176
177        /**
178         * Loads Fasta file and GFF2 feature file generated from the geneid prediction algorithm
179         *
180         * @param fastaSequenceFile
181         * @param gffFile
182         * @return
183         * @throws Exception
184         */
185        static public LinkedHashMap<String, ChromosomeSequence> loadFastaAddGeneFeaturesFromGeneIDGFF2(File fastaSequenceFile, File gffFile) throws Exception {
186                LinkedHashMap<String, DNASequence> dnaSequenceList = FastaReaderHelper.readFastaDNASequence(fastaSequenceFile);
187                LinkedHashMap<String, ChromosomeSequence> chromosomeSequenceList = GeneFeatureHelper.getChromosomeSequenceFromDNASequence(dnaSequenceList);
188                FeatureList listGenes = GeneIDGFF2Reader.read(gffFile.getAbsolutePath());
189                addGeneIDGFF2GeneFeatures(chromosomeSequenceList, listGenes);
190                return chromosomeSequenceList;
191        }
192
193        /**
194         * Load GFF2 feature file generated from the geneid prediction algorithm and map features onto the chromosome sequences
195         *
196         * @param chromosomeSequenceList
197         * @param listGenes
198         * @throws Exception
199         */
200        static public void addGeneIDGFF2GeneFeatures(LinkedHashMap<String, ChromosomeSequence> chromosomeSequenceList, FeatureList listGenes) throws Exception {
201                Collection<String> geneIds = listGenes.attributeValues("gene_id");
202                for (String geneid : geneIds) {
203                        FeatureList gene = listGenes.selectByAttribute("gene_id", geneid);
204                        FeatureI geneFeature = gene.get(0);
205                        ChromosomeSequence seq = chromosomeSequenceList.get(geneFeature.seqname());
206                        geneid = geneid.replaceAll("_", ".G");
207                        AccessionID geneAccessionID = new AccessionID(geneid);
208                        GeneSequence geneSequence = null;
209                        Collection<String> transcriptids = gene.attributeValues("gene_id");
210                        for (String transcriptid : transcriptids) {
211                                // get all the individual features (exons, CDS regions, etc.) of this gene
212                                FeatureList transcriptFeature = listGenes.selectByAttribute("gene_id", transcriptid);
213                                transcriptid = transcriptid.replaceAll("_", ".G");
214
215
216
217
218                                //      String seqName = feature.seqname();
219                                //FeatureI startCodon = null;
220                                //FeatureI stopCodon = null;
221                                Integer startCodonBegin = null;
222                                Integer stopCodonEnd = null;
223                                //String startCodonName = "";
224                                //String stopCodonName = "";
225
226
227                                // now select only the coding regions of this gene
228                                FeatureList firstFeatures = transcriptFeature.selectByType("First");
229                                FeatureList terminalFeatures = transcriptFeature.selectByType("Terminal");
230                                FeatureList internalFeatures = transcriptFeature.selectByType("Internal");
231                                FeatureList singleFeatures = transcriptFeature.selectByType("Single");
232                                FeatureList cdsFeatures = new FeatureList();
233                                cdsFeatures.add(firstFeatures);
234                                cdsFeatures.add(terminalFeatures);
235                                cdsFeatures.add(internalFeatures);
236                                cdsFeatures.add(singleFeatures);
237                                // sort them
238                                cdsFeatures = cdsFeatures.sortByStart();
239                                Strand strand = Strand.POSITIVE;
240                                FeatureI feature = cdsFeatures.get(0);
241                                if (feature.location().isNegative()) {
242                                        strand = Strand.NEGATIVE;
243                                }
244                                if (startCodonBegin == null) {
245                                        FeatureI firstFeature = cdsFeatures.get(0);
246                                        if (strand == Strand.NEGATIVE) {
247                                                startCodonBegin = firstFeature.location().bioEnd();
248                                        } else {
249                                                startCodonBegin = firstFeature.location().bioStart();
250                                        }
251                                }
252
253                                if (stopCodonEnd == null) {
254
255                                        FeatureI lastFeature = cdsFeatures.get(cdsFeatures.size() - 1);
256                                        if (strand == Strand.NEGATIVE) {
257                                                stopCodonEnd = lastFeature.location().bioStart();
258                                        } else {
259                                                stopCodonEnd = lastFeature.location().bioEnd();
260                                        }
261                                }
262                                //for gtf ordering can be strand based so first is last and last is first
263                                if (startCodonBegin > stopCodonEnd) {
264                                        int temp = startCodonBegin;
265                                        startCodonBegin = stopCodonEnd;
266                                        stopCodonEnd = temp;
267                                }
268
269                                AccessionID transcriptAccessionID = new AccessionID(transcriptid);
270                                if (geneSequence == null) {
271                                        geneSequence = seq.addGene(geneAccessionID, startCodonBegin, stopCodonEnd, strand);
272                                        geneSequence.setSource(((Feature) feature).source());
273                                } else {
274                                        //if multiple transcripts for one gene make sure the gene is defined as the min and max start/end
275
276                                        if (startCodonBegin < geneSequence.getBioBegin()) {
277                                                geneSequence.setBioBegin(startCodonBegin);
278                                        }
279                                        if (stopCodonEnd > geneSequence.getBioBegin()) {
280                                                geneSequence.setBioEnd(stopCodonEnd);
281                                        }
282
283                                }
284                                TranscriptSequence transcriptSequence = geneSequence.addTranscript(transcriptAccessionID, startCodonBegin, stopCodonEnd);
285                                /*
286                                if (startCodon != null) {
287                                        if (startCodonName == null || startCodonName.length() == 0) {
288                                                startCodonName = transcriptid + "-start_codon-" + startCodon.location().bioStart() + "-" + startCodon.location().bioEnd();
289                                        }
290                                        transcriptSequence.addStartCodonSequence(new AccessionID(startCodonName), startCodon.location().bioStart(), startCodon.location().bioEnd());
291                                }
292                                if (stopCodon != null) {
293                                        if (stopCodonName == null || stopCodonName.length() == 0) {
294                                                stopCodonName = transcriptid + "-stop_codon-" + stopCodon.location().bioStart() + "-" + stopCodon.location().bioEnd();
295                                        }
296                                        transcriptSequence.addStopCodonSequence(new AccessionID(stopCodonName), stopCodon.location().bioStart(), stopCodon.location().bioEnd());
297                                }
298                                */
299
300                                for (FeatureI cdsFeature : cdsFeatures) {
301                                        Feature cds = (Feature) cdsFeature;
302                                        String cdsName = cds.getAttribute("transcript_name");
303                                        if (cdsName == null || cdsName.length() == 0) {
304                                                cdsName = transcriptid + "-cds-" + cds.location().bioStart() + "-" + cds.location().bioEnd();
305                                        }
306                                        AccessionID cdsAccessionID = new AccessionID(cdsName);
307                                        //ExonSequence exonSequence = geneSequence.addExon(cdsAccessionID, cdsFeature.location().bioStart(), cdsFeature.location().bioEnd());
308                                        CDSSequence cdsSequence = transcriptSequence.addCDS(cdsAccessionID, cdsFeature.location().bioStart(), cdsFeature.location().bioEnd(), cds.frame());
309                                        cdsSequence.setSequenceScore(cds.score());
310                                }
311                        }
312                }
313
314        }
315
316        static public LinkedHashMap<String, ChromosomeSequence> getChromosomeSequenceFromDNASequence(LinkedHashMap<String, DNASequence> dnaSequenceList) {
317                LinkedHashMap<String, ChromosomeSequence> chromosomeSequenceList = new LinkedHashMap<String, ChromosomeSequence>();
318                for (String key : dnaSequenceList.keySet()) {
319                        DNASequence dnaSequence = dnaSequenceList.get(key);
320                        ChromosomeSequence chromosomeSequence = new ChromosomeSequence(dnaSequence.getProxySequenceReader()); //we want the underlying sequence but don't need storage
321                        chromosomeSequence.setAccession(dnaSequence.getAccession());
322                        chromosomeSequenceList.put(key, chromosomeSequence);
323                }
324                return chromosomeSequenceList;
325        }
326
327        /**
328         * Lots of variations in the ontology or descriptors that can be used in GFF3 which requires writing a custom parser to handle a GFF3 generated or used
329         * by a specific application. Probably could be abstracted out but for now easier to handle with custom code to deal with gff3 elements that are not
330         * included but can be extracted from other data elements.
331         * @param fastaSequenceFile
332         * @param gffFile
333         * @param lazyloadsequences If set to true then the fasta file will be parsed for accession id but sequences will be read from disk when needed to save memory
334         * @return
335         * @throws Exception
336         */
337        static public LinkedHashMap<String, ChromosomeSequence> loadFastaAddGeneFeaturesFromGmodGFF3(File fastaSequenceFile, File gffFile,boolean lazyloadsequences) throws Exception {
338                LinkedHashMap<String, DNASequence> dnaSequenceList = FastaReaderHelper.readFastaDNASequence(fastaSequenceFile,lazyloadsequences);
339                LinkedHashMap<String, ChromosomeSequence> chromosomeSequenceList = GeneFeatureHelper.getChromosomeSequenceFromDNASequence(dnaSequenceList);
340                FeatureList listGenes = GFF3Reader.read(gffFile.getAbsolutePath());
341                addGmodGFF3GeneFeatures(chromosomeSequenceList, listGenes);
342                return chromosomeSequenceList;
343        }
344
345        /**
346         * Load GFF3 file using mRNA as the gene feature as not all GFF3 files are complete
347         * @param chromosomeSequenceList
348         * @param listGenes
349         * @throws Exception
350         */
351        static public void addGmodGFF3GeneFeatures(LinkedHashMap<String, ChromosomeSequence> chromosomeSequenceList, FeatureList listGenes) throws Exception {
352
353
354                // key off mRNA as being a known feature that may or may not have a parent gene
355
356
357                FeatureList mRNAFeatures = listGenes.selectByType("mRNA");
358                LinkedHashMap<String,FeatureList> featureIDHashMap = FeatureHelper.buildFeatureAtrributeIndex("ID", listGenes);
359                LinkedHashMap<String,FeatureList> featureParentHashMap = FeatureHelper.buildFeatureAtrributeIndex("Parent", listGenes);
360
361                for (FeatureI f : mRNAFeatures) {
362                        String geneID;
363                        String geneNote = null;
364                        String geneSource = null;
365                        String sequenceName = null;
366                        ChromosomeSequence seq = null;
367                        GeneSequence geneSequence = null;
368
369                        Feature mRNAFeature = (Feature) f;
370                        String mRNAID = mRNAFeature.getAttribute("ID");
371                        String mRNAsource = mRNAFeature.source();
372                        String mRNANote = mRNAFeature.getAttribute("Note");
373                        String mRNAParent = mRNAFeature.getAttribute("Parent");
374                        if (mRNAParent != null && mRNAParent.length() > 0) {
375                           // FeatureList geneFeatureList = listGenes.selectByAttribute("ID", mRNAParent);
376                                FeatureList geneFeatureList = featureIDHashMap.get(mRNAParent);
377                                Feature geneFeature = (Feature) geneFeatureList.get(0);
378                                geneID = geneFeature.getAttribute("ID");
379                                geneNote = geneFeature.getAttribute("Note");
380                                geneSource = geneFeature.source();
381                                sequenceName = geneFeature.seqname();
382
383                                //
384                        } else {
385                                //deal with cases where no parent gene is given
386                                geneID = mRNAID;
387                                geneSource = mRNAsource;
388                                sequenceName = mRNAFeature.seqname();
389                        }
390
391                        seq = chromosomeSequenceList.get(sequenceName);
392
393                        AccessionID geneAccessionID = new AccessionID(geneID);
394
395                  //  FeatureList mRNAChildren = listGenes.selectByAttribute("Parent", mRNAID);
396                        FeatureList mRNAChildren = featureParentHashMap.get(mRNAID);
397                        FeatureList cdsFeatures = mRNAChildren.selectByType("CDS");
398                        FeatureI feature = cdsFeatures.get(0);
399                        Strand strand = Strand.POSITIVE;
400
401                        if (feature.location().isNegative()) {
402                                strand = Strand.NEGATIVE;
403                        }
404                        cdsFeatures = cdsFeatures.sortByStart();
405
406
407
408
409
410
411
412                        //String seqName = feature.seqname();
413                        FeatureI startCodon = null;
414                        FeatureI stopCodon = null;
415                        Integer startCodonBegin = null;
416                        Integer stopCodonEnd = null;
417                        String startCodonName = "";
418                        String stopCodonName = "";
419                        FeatureList startCodonList = mRNAChildren.selectByType("five_prime_UTR");
420                        if (startCodonList != null && startCodonList.size() > 0) {
421                                startCodon = startCodonList.get(0);
422                                if (strand == Strand.NEGATIVE) {
423                                        startCodonBegin = startCodon.location().bioEnd();
424                                } else {
425                                        startCodonBegin = startCodon.location().bioStart();
426                                }
427                                startCodonName = startCodon.getAttribute("ID");
428                        }
429
430                        FeatureList stopCodonList = mRNAChildren.selectByType("three_prime_UTR");
431
432                        if (stopCodonList != null && stopCodonList.size() > 0) {
433                                stopCodon = stopCodonList.get(0);
434                                if (strand == Strand.NEGATIVE) {
435                                        stopCodonEnd = stopCodon.location().bioStart();
436                                } else {
437                                        stopCodonEnd = stopCodon.location().bioEnd();
438                                }
439                                stopCodonName = stopCodon.getAttribute("ID");
440
441                        }
442
443
444
445
446                        if (startCodonBegin == null) {
447                                if (strand == Strand.NEGATIVE) {
448                                        FeatureI firstFeature = cdsFeatures.get(0);
449
450                                        startCodonBegin = firstFeature.location().bioEnd();
451                                } else {
452                                        FeatureI firstFeature = cdsFeatures.get(0);
453
454                                        startCodonBegin = firstFeature.location().bioStart();
455                                }
456                        }
457
458                        if (stopCodonEnd == null) {
459                                if (strand == Strand.NEGATIVE) {
460                                        FeatureI lastFeature = cdsFeatures.get(cdsFeatures.size() - 1);
461                                        stopCodonEnd = lastFeature.location().bioStart();
462                                } else {
463                                        FeatureI lastFeature = cdsFeatures.get(cdsFeatures.size() - 1);
464                                        stopCodonEnd = lastFeature.location().bioEnd();
465                                }
466                        }
467                        //for gtf ordering can be strand based so first is last and last is first
468                        if (startCodonBegin > stopCodonEnd) {
469                                int temp = startCodonBegin;
470                                startCodonBegin = stopCodonEnd;
471                                stopCodonEnd = temp;
472                        }
473
474
475
476                        AccessionID transcriptAccessionID = new AccessionID(mRNAID);
477                        geneSequence = seq.getGene(geneID);
478                        if (geneSequence == null) {
479                                geneSequence = seq.addGene(geneAccessionID, startCodonBegin, stopCodonEnd, strand);
480                                geneSequence.setSource(geneSource);
481                                if (geneNote != null && geneNote.length() > 0) {
482                                        geneSequence.addNote(geneNote);
483                                }
484                        } else {
485
486                                if (startCodonBegin < geneSequence.getBioBegin()) {
487                                        geneSequence.setBioBegin(startCodonBegin);
488                                }
489                                if (stopCodonEnd > geneSequence.getBioBegin()) {
490                                        geneSequence.setBioEnd(stopCodonEnd);
491                                }
492
493                        }
494                        TranscriptSequence transcriptSequence = geneSequence.addTranscript(transcriptAccessionID, startCodonBegin, stopCodonEnd);
495                        transcriptSequence.setSource(mRNAsource);
496                        if (mRNANote != null && mRNANote.length() > 0) {
497                                transcriptSequence.addNote(mRNANote);
498
499                        }
500                        if (startCodon != null) {
501                                if (startCodonName == null || startCodonName.length() == 0) {
502                                        startCodonName = mRNAID + "-start_codon-" + startCodon.location().bioStart() + "-" + startCodon.location().bioEnd();
503                                }
504                                transcriptSequence.addStartCodonSequence(new AccessionID(startCodonName), startCodon.location().bioStart(), startCodon.location().bioEnd());
505                        }
506                        if (stopCodon != null) {
507                                if (stopCodonName == null || stopCodonName.length() == 0) {
508                                        stopCodonName = mRNAID + "-stop_codon-" + stopCodon.location().bioStart() + "-" + stopCodon.location().bioEnd();
509                                }
510                                transcriptSequence.addStopCodonSequence(new AccessionID(stopCodonName), stopCodon.location().bioStart(), stopCodon.location().bioEnd());
511                        }
512
513                        for (FeatureI cdsFeature : cdsFeatures) {
514                                Feature cds = (Feature) cdsFeature;
515                                String cdsNote = cdsFeature.getAttribute("Note");
516                                String cdsSource = cds.source();
517                                String cdsName = cds.getAttribute("ID");
518                                if (cdsName == null || cdsName.length() == 0) {
519                                        cdsName = mRNAID + "-cds-" + cds.location().bioStart() + "-" + cds.location().bioEnd();
520                                }
521                                AccessionID cdsAccessionID = new AccessionID(cdsName);
522                                ExonSequence exonSequence = geneSequence.addExon(cdsAccessionID, cdsFeature.location().bioStart(), cdsFeature.location().bioEnd());
523                                exonSequence.setSource(cdsSource);
524                                if (cdsNote != null && cdsNote.length() > 0) {
525                                        exonSequence.addNote(cdsNote);
526                                }
527                                transcriptSequence.addCDS(cdsAccessionID, cdsFeature.location().bioStart(), cdsFeature.location().bioEnd(), cds.frame());
528                        }
529                        geneSequence.addIntronsUsingExons();
530
531                }
532
533        }
534
535        static public LinkedHashMap<String, ChromosomeSequence> loadFastaAddGeneFeaturesFromGlimmerGFF3(File fastaSequenceFile, File gffFile) throws Exception {
536                LinkedHashMap<String, DNASequence> dnaSequenceList = FastaReaderHelper.readFastaDNASequence(fastaSequenceFile);
537                LinkedHashMap<String, ChromosomeSequence> chromosomeSequenceList = GeneFeatureHelper.getChromosomeSequenceFromDNASequence(dnaSequenceList);
538                FeatureList listGenes = GFF3Reader.read(gffFile.getAbsolutePath());
539                addGlimmerGFF3GeneFeatures(chromosomeSequenceList, listGenes);
540                return chromosomeSequenceList;
541        }
542
543        static public void addGlimmerGFF3GeneFeatures(LinkedHashMap<String, ChromosomeSequence> chromosomeSequenceList, FeatureList listGenes) throws Exception {
544                FeatureList mRNAFeatures = listGenes.selectByType("mRNA");
545                for (FeatureI f : mRNAFeatures) {
546                        Feature mRNAFeature = (Feature) f;
547                        String geneid = mRNAFeature.getAttribute("ID");
548                        String source = mRNAFeature.source();
549
550                        FeatureList gene = listGenes.selectByAttribute("Parent", geneid);
551                        FeatureI geneFeature = gene.get(0);
552                        ChromosomeSequence seq = chromosomeSequenceList.get(geneFeature.seqname());
553                        AccessionID geneAccessionID = new AccessionID(geneid);
554                        GeneSequence geneSequence = null;
555
556                        FeatureList cdsFeatures = gene.selectByType("CDS");
557                        FeatureI feature = cdsFeatures.get(0);
558                        Strand strand = Strand.POSITIVE;
559
560                        if (feature.location().isNegative()) {
561                                strand = Strand.NEGATIVE;
562                        }
563                        cdsFeatures = cdsFeatures.sortByStart();
564
565
566
567
568
569
570
571                        //String seqName = feature.seqname();
572                        FeatureI startCodon = null;
573                        FeatureI stopCodon = null;
574                        Integer startCodonBegin = null;
575                        Integer stopCodonEnd = null;
576                        String startCodonName = "";
577                        String stopCodonName = "";
578                        FeatureList startCodonList = gene.selectByAttribute("Note", "initial-exon");
579                        if (startCodonList != null && startCodonList.size() > 0) {
580                                startCodon = startCodonList.get(0);
581                                if (strand == Strand.NEGATIVE) {
582                                        startCodonBegin = startCodon.location().bioEnd();
583                                } else {
584                                        startCodonBegin = startCodon.location().bioStart();
585                                }
586                                startCodonName = startCodon.getAttribute("ID");
587                        }
588
589                        FeatureList stopCodonList = gene.selectByAttribute("Note", "final-exon");
590
591                        if (stopCodonList != null && stopCodonList.size() > 0) {
592                                stopCodon = stopCodonList.get(0);
593                                if (strand == Strand.NEGATIVE) {
594                                        stopCodonEnd = stopCodon.location().bioStart();
595                                } else {
596                                        stopCodonEnd = stopCodon.location().bioEnd();
597                                }
598                                stopCodonName = stopCodon.getAttribute("ID");
599
600                        }
601
602
603
604
605                        if (startCodonBegin == null) {
606                                if (strand == Strand.NEGATIVE) {
607                                        FeatureI firstFeature = cdsFeatures.get(0);
608
609                                        startCodonBegin = firstFeature.location().bioEnd();
610                                } else {
611                                        FeatureI firstFeature = cdsFeatures.get(0);
612
613                                        startCodonBegin = firstFeature.location().bioStart();
614                                }
615                        }
616
617                        if (stopCodonEnd == null) {
618                                if (strand == Strand.NEGATIVE) {
619                                        FeatureI lastFeature = cdsFeatures.get(cdsFeatures.size() - 1);
620                                        stopCodonEnd = lastFeature.location().bioStart();
621                                } else {
622                                        FeatureI lastFeature = cdsFeatures.get(cdsFeatures.size() - 1);
623                                        stopCodonEnd = lastFeature.location().bioEnd();
624                                }
625                        }
626                        //for gtf ordering can be strand based so first is last and last is first
627                        if (startCodonBegin > stopCodonEnd) {
628                                int temp = startCodonBegin;
629                                startCodonBegin = stopCodonEnd;
630                                stopCodonEnd = temp;
631                        }
632
633
634
635                        AccessionID transcriptAccessionID = new AccessionID(geneid);
636                        if (geneSequence == null) {
637                                geneSequence = seq.addGene(geneAccessionID, startCodonBegin, stopCodonEnd, strand);
638                                geneSequence.setSource(source);
639                        }
640                        /*
641                        else {
642
643                                if (startCodonBegin < geneSequence.getBioBegin()) {
644                                        geneSequence.setBioBegin(startCodonBegin);
645                                }
646                                if (stopCodonEnd > geneSequence.getBioBegin()) {
647                                        geneSequence.setBioEnd(stopCodonEnd);
648                                }
649                        }
650                        */
651                        TranscriptSequence transcriptSequence = geneSequence.addTranscript(transcriptAccessionID, startCodonBegin, stopCodonEnd);
652                        if (startCodon != null) {
653                                if (startCodonName == null || startCodonName.length() == 0) {
654                                        startCodonName = geneid + "-start_codon-" + startCodon.location().bioStart() + "-" + startCodon.location().bioEnd();
655                                }
656                                transcriptSequence.addStartCodonSequence(new AccessionID(startCodonName), startCodon.location().bioStart(), startCodon.location().bioEnd());
657                        }
658                        if (stopCodon != null) {
659                                if (stopCodonName == null || stopCodonName.length() == 0) {
660                                        stopCodonName = geneid + "-stop_codon-" + stopCodon.location().bioStart() + "-" + stopCodon.location().bioEnd();
661                                }
662                                transcriptSequence.addStopCodonSequence(new AccessionID(stopCodonName), stopCodon.location().bioStart(), stopCodon.location().bioEnd());
663                        }
664
665                        for (FeatureI cdsFeature : cdsFeatures) {
666                                Feature cds = (Feature) cdsFeature;
667                                String cdsName = cds.getAttribute("ID");
668                                if (cdsName == null || cdsName.length() == 0) {
669                                        cdsName = geneid + "-cds-" + cds.location().bioStart() + "-" + cds.location().bioEnd();
670                                }
671                                AccessionID cdsAccessionID = new AccessionID(cdsName);
672                                //ExonSequence exonSequence = geneSequence.addExon(cdsAccessionID, cdsFeature.location().bioStart(), cdsFeature.location().bioEnd());
673                                transcriptSequence.addCDS(cdsAccessionID, cdsFeature.location().bioStart(), cdsFeature.location().bioEnd(), cds.frame());
674                        }
675
676                }
677
678        }
679
680        static public LinkedHashMap<String, ChromosomeSequence> loadFastaAddGeneFeaturesFromGeneMarkGTF(File fastaSequenceFile, File gffFile) throws Exception {
681                LinkedHashMap<String, DNASequence> dnaSequenceList = FastaReaderHelper.readFastaDNASequence(fastaSequenceFile);
682                LinkedHashMap<String, ChromosomeSequence> chromosomeSequenceList = GeneFeatureHelper.getChromosomeSequenceFromDNASequence(dnaSequenceList);
683                FeatureList listGenes = GeneMarkGTFReader.read(gffFile.getAbsolutePath());
684                addGeneMarkGTFGeneFeatures(chromosomeSequenceList, listGenes);
685                return chromosomeSequenceList;
686        }
687
688        static public void addGeneMarkGTFGeneFeatures(LinkedHashMap<String, ChromosomeSequence> chromosomeSequenceList, FeatureList listGenes) throws Exception {
689                Collection<String> geneIds = listGenes.attributeValues("gene_id");
690                for (String geneid : geneIds) {
691                        //       if(geneid.equals("45_g")){
692                        //           int dummy =1;
693                        //       }
694                        FeatureList gene = listGenes.selectByAttribute("gene_id", geneid);
695                        FeatureI geneFeature = gene.get(0);
696                        ChromosomeSequence seq = chromosomeSequenceList.get(geneFeature.seqname());
697                        AccessionID geneAccessionID = new AccessionID(geneid);
698                        GeneSequence geneSequence = null;
699                        Collection<String> transcriptids = gene.attributeValues("transcript_id");
700                        for (String transcriptid : transcriptids) {
701                                // get all the individual features (exons, CDS regions, etc.) of this gene
702
703
704                                FeatureList transcriptFeature = listGenes.selectByAttribute("transcript_id", transcriptid);
705                                // now select only the coding regions of this gene
706                                FeatureList cdsFeatures = transcriptFeature.selectByType("CDS");
707                                // sort them
708                                cdsFeatures = cdsFeatures.sortByStart();
709
710                                FeatureI feature = cdsFeatures.get(0);
711                                Strand strand = Strand.POSITIVE;
712
713                                if (feature.location().isNegative()) {
714                                        strand = Strand.NEGATIVE;
715                                }
716
717                                //String seqName = feature.seqname();
718                                FeatureI startCodon = null;
719                                FeatureI stopCodon = null;
720                                Integer startCodonBegin = null;
721                                Integer stopCodonEnd = null;
722                                String startCodonName = "";
723                                String stopCodonName = "";
724                                FeatureList startCodonList = transcriptFeature.selectByType("start_codon");
725                                if (startCodonList != null && startCodonList.size() > 0) {
726                                        startCodon = startCodonList.get(0);
727                                        if (strand == Strand.POSITIVE) {
728                                                startCodonBegin = startCodon.location().bioStart();
729                                        } else {
730                                                startCodonBegin = startCodon.location().bioEnd();
731                                        }
732                                        startCodonName = startCodon.getAttribute("transcript_name");
733                                }
734
735                                FeatureList stopCodonList = transcriptFeature.selectByType("stop_codon");
736
737                                if (stopCodonList != null && stopCodonList.size() > 0) {
738                                        stopCodon = stopCodonList.get(0);
739                                        if (strand == Strand.POSITIVE) {
740                                                stopCodonEnd = stopCodon.location().bioEnd();
741                                        } else {
742                                                stopCodonEnd = stopCodon.location().bioStart();
743                                        }
744
745                                        stopCodonName = stopCodon.getAttribute("transcript_name");
746
747                                }
748
749
750
751
752                                if (startCodonBegin == null) {
753                                        if (strand == Strand.NEGATIVE) {
754                                                FeatureI firstFeature = cdsFeatures.get(0);
755
756                                                startCodonBegin = firstFeature.location().bioEnd();
757                                        } else {
758                                                FeatureI firstFeature = cdsFeatures.get(0);
759
760                                                startCodonBegin = firstFeature.location().bioStart();
761                                        }
762                                }
763
764                                if (stopCodonEnd == null) {
765                                        if (strand == Strand.NEGATIVE) {
766                                                FeatureI lastFeature = cdsFeatures.get(cdsFeatures.size() - 1);
767                                                stopCodonEnd = lastFeature.location().bioStart();
768                                        } else {
769                                                FeatureI lastFeature = cdsFeatures.get(cdsFeatures.size() - 1);
770                                                stopCodonEnd = lastFeature.location().bioEnd();
771                                        }
772                                }
773                                //for gtf ordering can be strand based so first is last and last is first
774                                if (startCodonBegin > stopCodonEnd) {
775                                        int temp = startCodonBegin;
776                                        startCodonBegin = stopCodonEnd;
777                                        stopCodonEnd = temp;
778                                }
779
780                                AccessionID transcriptAccessionID = new AccessionID(transcriptid);
781                                if (geneSequence == null) {
782                                        geneSequence = seq.addGene(geneAccessionID, startCodonBegin, stopCodonEnd, strand);
783                                        geneSequence.setSource(((Feature) feature).source());
784                                } else {
785                                        //if multiple transcripts for one gene make sure the gene is defined as the min and max start/end
786
787                                        if (startCodonBegin < geneSequence.getBioBegin()) {
788                                                geneSequence.setBioBegin(startCodonBegin);
789                                        }
790                                        if (stopCodonEnd > geneSequence.getBioBegin()) {
791                                                geneSequence.setBioEnd(stopCodonEnd);
792                                        }
793
794                                }
795                                TranscriptSequence transcriptSequence = geneSequence.addTranscript(transcriptAccessionID, startCodonBegin, stopCodonEnd);
796                                if (startCodon != null) {
797                                        if (startCodonName == null || startCodonName.length() == 0) {
798                                                startCodonName = transcriptid + "-start_codon-" + startCodon.location().bioStart() + "-" + startCodon.location().bioEnd();
799                                        }
800                                        transcriptSequence.addStartCodonSequence(new AccessionID(startCodonName), startCodon.location().bioStart(), startCodon.location().bioEnd());
801                                }
802                                if (stopCodon != null) {
803                                        if (stopCodonName == null || stopCodonName.length() == 0) {
804                                                stopCodonName = transcriptid + "-stop_codon-" + stopCodon.location().bioStart() + "-" + stopCodon.location().bioEnd();
805                                        }
806                                        transcriptSequence.addStopCodonSequence(new AccessionID(stopCodonName), stopCodon.location().bioStart(), stopCodon.location().bioEnd());
807                                }
808
809                                for (FeatureI cdsFeature : cdsFeatures) {
810                                        Feature cds = (Feature) cdsFeature;
811                                        // for genemark it appears frame of 2 =1 and frame of 1 = 2
812                                        // doesn't matter when you string cds regions together as one block
813                                        // but does make a difference when you try to make a protein sequence for each CDS region where
814                                        // you give up or borrow based on the frame value
815                                        // compared with gff like files and docs for geneid and glimmer where geneid and glimmer both do it the same
816                                        // way that appears to match the gff3 docs.
817                                        int frame = cds.frame();
818                                        if (frame == 1) {
819                                                frame = 2;
820                                        } else if (frame == 2) {
821                                                frame = 1;
822                                        } else {
823                                                frame = 0;
824                                        }
825                                        String cdsName = cds.getAttribute("transcript_name");
826                                        if (cdsName == null || cdsName.length() == 0) {
827                                                cdsName = transcriptid + "-cds-" + cds.location().bioStart() + "-" + cds.location().bioEnd();
828                                        }
829                                        AccessionID cdsAccessionID = new AccessionID(cdsName);
830                                        //ExonSequence exonSequence = geneSequence.addExon(cdsAccessionID, cdsFeature.location().bioStart(), cdsFeature.location().bioEnd());
831                                        transcriptSequence.addCDS(cdsAccessionID, cdsFeature.location().bioStart(), cdsFeature.location().bioEnd(), frame);
832                                }
833                        }
834                }
835
836        }
837
838        static public LinkedHashMap<String, ProteinSequence> getProteinSequences(Collection<ChromosomeSequence> chromosomeSequences) throws Exception {
839                LinkedHashMap<String, ProteinSequence> proteinSequenceHashMap = new LinkedHashMap<String, ProteinSequence>();
840                for (ChromosomeSequence dnaSequence : chromosomeSequences) {
841                        for (GeneSequence geneSequence : dnaSequence.getGeneSequences().values()) {
842                                for (TranscriptSequence transcriptSequence : geneSequence.getTranscripts().values()) {
843                                        //TODO remove?
844//                    DNASequence dnaCodingSequence = transcriptSequence.getDNACodingSequence();
845//                    logger.info("CDS={}", dnaCodingSequence.getSequenceAsString());
846
847                                        try {
848                                                ProteinSequence proteinSequence = transcriptSequence.getProteinSequence();
849
850//                        logger.info("{} {}", proteinSequence.getAccession().getID(), proteinSequence);
851                                                if (proteinSequenceHashMap.containsKey(proteinSequence.getAccession().getID())) {
852                                                        throw new Exception("Duplicate protein sequence id=" + proteinSequence.getAccession().getID() + " found at Gene id=" + geneSequence.getAccession().getID());
853                                                } else {
854                                                        proteinSequenceHashMap.put(proteinSequence.getAccession().getID(), proteinSequence);
855                                                }
856                                        } catch (Exception e) {
857                                                logger.error("Exception: ", e);
858                                        }
859
860                                }
861
862                        }
863                }
864                return proteinSequenceHashMap;
865        }
866
867        static public LinkedHashMap<String, GeneSequence> getGeneSequences(Collection<ChromosomeSequence> chromosomeSequences) throws Exception {
868                LinkedHashMap<String, GeneSequence> geneSequenceHashMap = new LinkedHashMap<String, GeneSequence>();
869                for (ChromosomeSequence chromosomeSequence : chromosomeSequences) {
870                        for (GeneSequence geneSequence : chromosomeSequence.getGeneSequences().values()) {
871                                geneSequenceHashMap.put(geneSequence.getAccession().getID(), geneSequence);
872                        }
873                }
874
875                return geneSequenceHashMap;
876        }
877
878        public static void main(String[] args) throws Exception {
879/*        if (false) {
880                        LinkedHashMap<String, ChromosomeSequence> chromosomeSequenceList = GeneFeatureHelper.loadFastaAddGeneFeaturesFromGeneMarkGTF(new File("Scaffolds.fna"), new File("genemark_hmm.gtf"));
881                        LinkedHashMap<String, ProteinSequence> proteinSequenceList = GeneFeatureHelper.getProteinSequences(chromosomeSequenceList.values());
882                }
883
884                if (false) {
885                        LinkedHashMap<String, ChromosomeSequence> chromosomeSequenceList = GeneFeatureHelper.loadFastaAddGeneFeaturesFromGlimmerGFF3(new File("Scaffolds.fna"), new File("glimmerhmm.gff"));
886                        LinkedHashMap<String, ProteinSequence> proteinSequenceList = GeneFeatureHelper.getProteinSequences(chromosomeSequenceList.values());
887                        //  for (ProteinSequence proteinSequence : proteinSequenceList.values()) {
888                        //      logger.info(proteinSequence.getAccession().getID() + " " + proteinSequence);
889                        //  }
890                        FastaWriterHelper.writeProteinSequence(new File("predicted_glimmer.faa"), proteinSequenceList.values());
891
892                }
893                if (false) {
894                        GeneFeatureHelper.outputFastaSequenceLengthGFF3(new File("Scaffolds.fna"), new File("scaffolds.gff3"));
895                }
896
897 */
898
899                try {
900
901                        if (true) {
902                                //File fastaSequenceFile = new File("Scaffolds.fna");
903                                //File gff3File = new File("geneid-v6.gff3");
904                                //LinkedHashMap<String, ChromosomeSequence> chromosomeSequenceHashMap = GeneFeatureHelper.loadFastaAddGeneFeaturesFromGmodGFF3(fastaSequenceFile, gff3File,true);
905                        }
906
907                } catch (Exception e) {
908                        logger.error("Exception: ", e);
909                }
910
911        }
912}