001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 * Created on DATE
021 *
022 */
023package org.biojava.nbio.core.sequence;
024
025import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
026import org.biojava.nbio.core.sequence.compound.DNACompoundSet;
027import org.biojava.nbio.core.sequence.compound.NucleotideCompound;
028import org.biojava.nbio.core.sequence.template.CompoundSet;
029import org.slf4j.Logger;
030import org.slf4j.LoggerFactory;
031
032import java.util.ArrayList;
033import java.util.Collections;
034import java.util.LinkedHashMap;
035
036/**
037 *
038 * @author Scooter Willis
039 */
040public class GeneSequence extends DNASequence {
041
042        private final static Logger logger = LoggerFactory.getLogger(GeneSequence.class);
043
044        private final LinkedHashMap<String, TranscriptSequence> transcriptSequenceHashMap = new LinkedHashMap<String, TranscriptSequence>();
045        private final LinkedHashMap<String, IntronSequence> intronSequenceHashMap = new LinkedHashMap<String, IntronSequence>();
046        private final LinkedHashMap<String, ExonSequence> exonSequenceHashMap = new LinkedHashMap<String, ExonSequence>();
047        private final ArrayList<IntronSequence> intronSequenceList = new ArrayList<IntronSequence>();
048        private final ArrayList<ExonSequence> exonSequenceList = new ArrayList<ExonSequence>();
049        boolean intronAdded = false; // need to deal with the problem that typically introns are not added when validating the list and adding in introns as the regions not included in exons
050        private Strand strand = Strand.UNDEFINED;
051        private ChromosomeSequence chromosomeSequence;
052
053        /**
054         * A class that keeps track of the details of a GeneSequence which is difficult to properly model. Two important concepts that is difficult
055         * to make everything flexible but still work. You can have GFF features that only describe Exons or Exons/Introns or CDS regions and one
056         * or more Transcriptions. You can have exon sequences but that does not imply transcription to the actual protein.
057         *
058         * The GeneSequence will keep track of Exons and Introns but to get a Protein sequence you need to start with a
059         * TranscriptSequence and then add CDS sequences.
060         *
061         * This is also a key class in the biojava-3-genome module for reading and writing GFF3 files
062         *
063         * @param parentDNASequence
064         * @param begin
065         * @param end inclusive of end
066         * @param strand force a gene to have strand and transcription sequence will inherit
067         */
068        public GeneSequence(ChromosomeSequence parentSequence, int begin, int end, Strand strand) {
069                chromosomeSequence = parentSequence;
070                setParentSequence(parentSequence);
071                setBioBegin(begin);
072                setBioEnd(end);
073                setStrand(strand);
074                this.setCompoundSet(DNACompoundSet.getDNACompoundSet());
075        }
076
077        /**
078         * The parent ChromosomeSequence which contains the actual DNA sequence data
079         * @return Chromosome sequence
080         */
081        public ChromosomeSequence getParentChromosomeSequence() {
082                return chromosomeSequence;
083        }
084
085        @Override
086        public int getLength() {
087                return Math.abs(this.getBioEnd() - this.getBioBegin()) + 1;
088        }
089
090        /**
091         * Once everything has been added to the gene sequence where you might have added exon sequences only then you
092         * can infer the intron sequences and add them. You may also have the case where you only added one or more
093         * TranscriptSequences and from that you can infer the exon sequences and intron sequences.
094         * Currently not implement
095         */
096        public void addIntronsUsingExons() throws Exception {
097                if (intronAdded) { //going to assume introns are correct
098                        return;
099                }
100                if (exonSequenceList.size() == 0) {
101                        return;
102                }
103                ExonComparator exonComparator = new ExonComparator();
104                //sort based on start position and sense;
105                Collections.sort(exonSequenceList, exonComparator);
106                int shift = -1;
107                if (getStrand() == Strand.NEGATIVE) {
108                        shift = 1;
109                }
110                //ExonSequence firstExonSequence = exonSequenceList.get(0);
111                int intronIndex = 1;
112//       if (firstExonSequence.getBioBegin().intValue() != getBioBegin().intValue()) {
113//           this.addIntron(new AccessionID(this.getAccession().getID() + "-" + "intron" + intronIndex), getBioBegin(), firstExonSequence.getBioBegin() + shift);
114//           intronIndex++;
115//       }
116                for (int i = 0; i < exonSequenceList.size() - 1; i++) {
117                        ExonSequence exon1 = exonSequenceList.get(i);
118                        ExonSequence exon2 = exonSequenceList.get(i + 1);
119                        this.addIntron(new AccessionID(this.getAccession().getID() + "-" + "intron" + intronIndex), exon1.getBioEnd() - shift, exon2.getBioBegin() + shift);
120                        intronIndex++;
121                }
122
123//       ExonSequence lastExonSequence = exonSequenceList.get(exonSequenceList.size() - 1);
124//       if (lastExonSequence.getBioEnd().intValue() != getBioEnd().intValue()) {
125//           this.addIntron(new AccessionID(this.getAccession().getID() + "-" + "intron" + intronIndex), lastExonSequence.getBioEnd() - shift, getBioEnd());
126//           intronIndex++;
127//       }
128
129                //    log.severe("Add in support for building introns based on added exons");
130
131        }
132
133        /**
134         * A gene should have Strand
135         * @return the strand
136         */
137        public Strand getStrand() {
138                return strand;
139        }
140
141        /**
142         * @param strand the strand to set
143         */
144        public void setStrand(Strand strand) {
145                this.strand = strand;
146        }
147
148        /**
149         * Get the transcript sequence by accession
150         * @param accession
151         * @return the transcript
152         */
153        public TranscriptSequence getTranscript(String accession) {
154                return transcriptSequenceHashMap.get(accession);
155        }
156
157        /**
158         * Get the collection of transcription sequences assigned to this gene
159         * @return transcripts
160         */
161        public LinkedHashMap<String, TranscriptSequence> getTranscripts() {
162                return transcriptSequenceHashMap;
163        }
164
165        /**
166         * Remove the transcript sequence from the gene
167         * @param accession
168         * @return transcriptsequence
169         */
170        public TranscriptSequence removeTranscript(String accession) {
171
172
173                return transcriptSequenceHashMap.remove(accession);
174        }
175
176        /**
177         * Add a transcription sequence to a gene which describes a ProteinSequence
178         * @param accession
179         * @param begin
180         * @param end
181         * @return transcript sequence
182         * @throws Exception If the accession id is already used
183         */
184        public TranscriptSequence addTranscript(AccessionID accession, int begin, int end) throws Exception {
185                if (transcriptSequenceHashMap.containsKey(accession.getID())) {
186                        throw new Exception("Duplicate accesion id " + accession.getID());
187                }
188                TranscriptSequence transcriptSequence = new TranscriptSequence(this, begin, end);
189                transcriptSequence.setAccession(accession);
190                transcriptSequenceHashMap.put(accession.getID(), transcriptSequence);
191                return transcriptSequence;
192        }
193
194        /**
195         * Remove the intron by accession
196         * @param accession
197         * @return intron sequence
198         */
199        public IntronSequence removeIntron(String accession) {
200                for (IntronSequence intronSequence : intronSequenceList) {
201                        if (intronSequence.getAccession().getID().equals(accession)) {
202                                intronSequenceList.remove(intronSequence);
203                                intronSequenceHashMap.remove(accession);
204                                return intronSequence;
205                        }
206                }
207                return null;
208        }
209
210        /**
211         * Add an Intron Currently used to mark an IntronSequence as a feature
212         * @param accession
213         * @param begin
214         * @param end
215         * @return intron sequence
216         */
217        public IntronSequence addIntron(AccessionID accession, int begin, int end) throws Exception {
218                if (intronSequenceHashMap.containsKey(accession.getID())) {
219                        throw new Exception("Duplicate accesion id " + accession.getID());
220                }
221                intronAdded = true;
222                IntronSequence intronSequence = new IntronSequence(this, begin, end); // working off the assumption that intron frame is always 0 or doesn't matter and same sense as parent
223                intronSequence.setAccession(accession);
224                intronSequenceList.add(intronSequence);
225                intronSequenceHashMap.put(accession.getID(), intronSequence);
226                return intronSequence;
227        }
228
229        /**
230         * Remove the exon sequence
231         * @param accession
232         * @return exon sequence
233         */
234        public ExonSequence removeExon(String accession) {
235                for (ExonSequence exonSequence : exonSequenceList) {
236                        if (exonSequence.getAccession().getID().equals(accession)) {
237                                exonSequenceList.remove(exonSequence);
238                                exonSequenceHashMap.remove(accession);
239                                // we now have a new gap which creates an intron
240                                intronSequenceList.clear();
241                                intronSequenceHashMap.clear();
242                                intronAdded = false;
243                                try{
244                                        addIntronsUsingExons();
245                                } catch(Exception e){
246                                        logger.error("Remove Exon validate() error " + e.getMessage());
247                                }
248                                return exonSequence;
249                        }
250                }
251                return null;
252        }
253
254        /**
255         * Add an ExonSequence mainly used to mark as a feature
256         * @param accession
257         * @param begin
258         * @param end
259         * @return exon sequence
260         */
261        public ExonSequence addExon(AccessionID accession, int begin, int end) throws Exception {
262                if (exonSequenceHashMap.containsKey(accession.getID())) {
263                        throw new Exception("Duplicate accesion id " + accession.getID());
264                }
265
266                ExonSequence exonSequence = new ExonSequence(this, begin, end); //sense should be the same as parent
267                exonSequence.setAccession(accession);
268                exonSequenceList.add(exonSequence);
269                exonSequenceHashMap.put(accession.getID(), exonSequence);
270                return exonSequence;
271        }
272
273        /**
274         * Get the exons as an ArrayList
275         * @return exons
276         */
277        public ArrayList<ExonSequence> getExonSequences() {
278                return exonSequenceList;
279        }
280
281        /**
282         * Get the introns as an ArrayList
283         * @return introns
284         */
285        public ArrayList<IntronSequence> getIntronSequences() {
286                return intronSequenceList;
287        }
288
289        /**
290         * Try to give method clarity where you want a DNASequence coding in the 5' to 3' direction
291         * Returns the DNASequence representative of the 5' and 3' reading based on strand
292         * @return dna sequence
293         */
294        public DNASequence getSequence5PrimeTo3Prime() {
295                String sequence = getSequenceAsString(this.getBioBegin(), this.getBioEnd(), this.getStrand());
296                if (getStrand() == Strand.NEGATIVE) {
297                        //need to take complement of sequence because it is negative and we are returning the gene sequence from the opposite strand
298                        StringBuilder b = new StringBuilder(getLength());
299                        CompoundSet<NucleotideCompound> compoundSet = this.getCompoundSet();
300                        for (int i = 0; i < sequence.length(); i++) {
301                                String nucleotide = String.valueOf(sequence.charAt(i));
302                                NucleotideCompound nucleotideCompound = compoundSet.getCompoundForString(nucleotide);
303                                b.append(nucleotideCompound.getComplement().getShortName());
304                        }
305                        sequence = b.toString();
306                }
307                DNASequence dnaSequence = null;
308                try {
309                        dnaSequence = new DNASequence(sequence.toUpperCase());
310                } catch (CompoundNotFoundException e) {
311                        // this should not happen, the sequence is DNA originally, if it does, there's a bug somewhere
312                        logger.error("Could not create new DNA sequence in getSequence5PrimeTo3Prime(). Error: {}",e.getMessage());
313                }
314                dnaSequence.setAccession(new AccessionID(this.getAccession().getID()));
315                return dnaSequence;
316        }
317}