001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 * Created on DATE
021 *
022 */
023package org.biojava.nbio.core.sequence;
024
025import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
026import org.biojava.nbio.core.sequence.compound.DNACompoundSet;
027import org.biojava.nbio.core.sequence.compound.NucleotideCompound;
028import org.biojava.nbio.core.sequence.template.CompoundSet;
029import org.slf4j.Logger;
030import org.slf4j.LoggerFactory;
031
032import java.util.ArrayList;
033import java.util.Collections;
034import java.util.LinkedHashMap;
035
036/**
037 *
038 * @author Scooter Willis
039 */
040public class GeneSequence extends DNASequence {
041
042        private final static Logger logger = LoggerFactory.getLogger(GeneSequence.class);
043
044        private final LinkedHashMap<String, TranscriptSequence> transcriptSequenceHashMap = new LinkedHashMap<String, TranscriptSequence>();
045        private final LinkedHashMap<String, IntronSequence> intronSequenceHashMap = new LinkedHashMap<String, IntronSequence>();
046        private final LinkedHashMap<String, ExonSequence> exonSequenceHashMap = new LinkedHashMap<String, ExonSequence>();
047        private final ArrayList<IntronSequence> intronSequenceList = new ArrayList<IntronSequence>();
048        private final ArrayList<ExonSequence> exonSequenceList = new ArrayList<ExonSequence>();
049        boolean intronAdded = false; // need to deal with the problem that typically introns are not added when validating the list and adding in introns as the regions not included in exons
050        private Strand strand = Strand.UNDEFINED;
051        private ChromosomeSequence chromosomeSequence;
052
053        /**
054         * Use GeneSequence(ChromosomeSequence parentSequence, AccessionID accessionId,  int begin, int end, Strand strand)
055         * which mandates an accessionID.
056         * @param parentSequence
057         * @param begin
058         * @param end inclusive of end
059         * @param strand force a gene to have strand and transcription sequence will inherit
060         * @deprecated
061         */
062        public GeneSequence(ChromosomeSequence parentSequence,  int begin, int end, Strand strand) {
063                setCompoundSet(DNACompoundSet.getDNACompoundSet());
064                try {
065                        initSequenceStorage(parentSequence.getSequenceAsString());
066                } catch (CompoundNotFoundException e) {
067                        throw new IllegalArgumentException(e);
068                }
069                chromosomeSequence = parentSequence;
070                setParentSequence(parentSequence);
071                setBioBegin(begin);
072                setBioEnd(end);
073                setStrand(strand);
074        }
075
076        /**
077         * A class that keeps track of the details of a GeneSequence which is difficult to properly model. Two important concepts that is difficult
078         * to make everything flexible but still work. You can have GFF features that only describe Exons or Exons/Introns or CDS regions and one
079         * or more Transcriptions. You can have exon sequences but that does not imply transcription to the actual protein.
080         *
081         * The GeneSequence will keep track of Exons and Introns but to get a Protein sequence you need to start with a
082         * TranscriptSequence and then add CDS sequences.
083         *
084         * This is also a key class in the biojava-3-genome module for reading and writing GFF3 files
085         *
086         * @param parentSequence
087         * @param accessionId An identifier for the gene.
088         * @param begin
089         * @param end
090         * @param strand force a gene to have strand and transcription sequence will inherit
091         */
092        public GeneSequence(ChromosomeSequence parentSequence, AccessionID accessionId,  int begin, int end, Strand strand) {
093                this(parentSequence,begin,end,strand);
094                setAccession(accessionId);
095        }
096
097        /**
098         * The parent ChromosomeSequence which contains the actual DNA sequence data
099         * @return Chromosome sequence
100         */
101        public ChromosomeSequence getParentChromosomeSequence() {
102                return chromosomeSequence;
103        }
104
105        @Override
106        public int getLength() {
107                return Math.abs(this.getBioEnd() - this.getBioBegin()) + 1;
108        }
109
110        /**
111         * Once everything has been added to the gene sequence where you might have added exon sequences only then you
112         * can infer the intron sequences and add them. You may also have the case where you only added one or more
113         * TranscriptSequences and from that you can infer the exon sequences and intron sequences.
114         * Currently not implement
115         */
116        public void addIntronsUsingExons() throws Exception {
117                if (intronAdded) { //going to assume introns are correct
118                        return;
119                }
120                if (exonSequenceList.size() == 0) {
121                        return;
122                }
123                ExonComparator exonComparator = new ExonComparator();
124                //sort based on start position and sense;
125                Collections.sort(exonSequenceList, exonComparator);
126                int shift = -1;
127                if (getStrand() == Strand.NEGATIVE) {
128                        shift = 1;
129                }
130                //ExonSequence firstExonSequence = exonSequenceList.get(0);
131                int intronIndex = 1;
132//       if (firstExonSequence.getBioBegin().intValue() != getBioBegin().intValue()) {
133//           this.addIntron(new AccessionID(this.getAccession().getID() + "-" + "intron" + intronIndex), getBioBegin(), firstExonSequence.getBioBegin() + shift);
134//           intronIndex++;
135//       }
136                for (int i = 0; i < exonSequenceList.size() - 1; i++) {
137                        ExonSequence exon1 = exonSequenceList.get(i);
138                        ExonSequence exon2 = exonSequenceList.get(i + 1);
139                        AccessionID intronId= new AccessionID(this.getAccession().getID() + "-" + "intron" + intronIndex);
140                        this.addIntron(intronId, exon1.getBioEnd() - shift, exon2.getBioBegin() + shift);
141                        intronIndex++;
142                }
143
144//       ExonSequence lastExonSequence = exonSequenceList.get(exonSequenceList.size() - 1);
145//       if (lastExonSequence.getBioEnd().intValue() != getBioEnd().intValue()) {
146//           this.addIntron(new AccessionID(this.getAccession().getID() + "-" + "intron" + intronIndex), lastExonSequence.getBioEnd() - shift, getBioEnd());
147//           intronIndex++;
148//       }
149
150                //    log.severe("Add in support for building introns based on added exons");
151
152        }
153
154        /**
155         * A gene should have Strand
156         * @return the strand
157         */
158        public Strand getStrand() {
159                return strand;
160        }
161
162        /**
163         * @param strand the strand to set
164         */
165        public void setStrand(Strand strand) {
166                this.strand = strand;
167        }
168
169        /**
170         * Get the transcript sequence by accession
171         * @param accession
172         * @return the transcript
173         */
174        public TranscriptSequence getTranscript(String accession) {
175                return transcriptSequenceHashMap.get(accession);
176        }
177
178        /**
179         * Get the collection of transcription sequences assigned to this gene
180         * @return transcripts
181         */
182        public LinkedHashMap<String, TranscriptSequence> getTranscripts() {
183                return transcriptSequenceHashMap;
184        }
185
186        /**
187         * Remove the transcript sequence from the gene
188         * @param accession
189         * @return transcriptsequence
190         */
191        public TranscriptSequence removeTranscript(String accession) {
192                return transcriptSequenceHashMap.remove(accession);
193        }
194
195        /**
196         * Add a transcription sequence to a gene which describes a ProteinSequence
197         * @param accession
198         * @param begin
199         * @param end
200         * @return transcript sequence
201         * @throws Exception If the accession id is already used
202         */
203        public TranscriptSequence addTranscript(AccessionID accession, int begin, int end) throws Exception {
204                if (transcriptSequenceHashMap.containsKey(accession.getID())) {
205                        throw new Exception("Duplicate accesion id " + accession.getID());
206                }
207                TranscriptSequence transcriptSequence = new TranscriptSequence(this, begin, end);
208                transcriptSequence.setAccession(accession);
209                transcriptSequenceHashMap.put(accession.getID(), transcriptSequence);
210                return transcriptSequence;
211        }
212
213        /**
214         * Remove the intron by accession
215         * @param accession
216         * @return the removed intron sequence, or null if no intron with that accession exists.
217         */
218        public IntronSequence removeIntron(String accession) {
219                for (IntronSequence intronSequence : intronSequenceList) {
220                        if (intronSequence.getAccession().getID().equals(accession)) {
221                                intronSequenceList.remove(intronSequence);
222                                intronSequenceHashMap.remove(accession);
223                                return intronSequence;
224                        }
225                }
226                return null;
227        }
228
229        /**
230         * Add an Intron Currently used to mark an IntronSequence as a feature
231         * @param accession
232         * @param begin
233         * @param end
234         * @return intron sequence
235         */
236        public IntronSequence addIntron(AccessionID accession, int begin, int end) throws Exception {
237                if (intronSequenceHashMap.containsKey(accession.getID())) {
238                        throw new Exception("Duplicate accesion id " + accession.getID());
239                }
240                intronAdded = true;
241                IntronSequence intronSequence = new IntronSequence(this, begin, end); // working off the assumption that intron frame is always 0 or doesn't matter and same sense as parent
242                intronSequence.setAccession(accession);
243                intronSequenceList.add(intronSequence);
244                intronSequenceHashMap.put(accession.getID(), intronSequence);
245                return intronSequence;
246        }
247
248        /**
249         * Remove the exon sequence
250         * @param accession
251         * @return exon sequence
252         */
253        public ExonSequence removeExon(String accession) {
254                for (ExonSequence exonSequence : exonSequenceList) {
255                        if (exonSequence.getAccession().getID().equals(accession)) {
256                                exonSequenceList.remove(exonSequence);
257                                exonSequenceHashMap.remove(accession);
258                                // we now have a new gap which creates an intron
259                                intronSequenceList.clear();
260                                intronSequenceHashMap.clear();
261                                intronAdded = false;
262                                try{
263                                        addIntronsUsingExons();
264                                } catch(Exception e){
265                                        logger.error("Remove Exon validate() error " + e.getMessage());
266                                }
267                                return exonSequence;
268                        }
269                }
270                return null;
271        }
272
273        /**
274         * Add an ExonSequence mainly used to mark as a feature
275         * @param accession
276         * @param begin
277         * @param end
278         * @return exon sequence
279         * @throws IllegalArgumentException if accessionID is already added.
280         */
281        public ExonSequence addExon(AccessionID accession, int begin, int end) {
282                if (exonSequenceHashMap.containsKey(accession.getID())) {
283                        throw new IllegalArgumentException("Duplicate accession id: " + accession.getID());
284                }
285
286                ExonSequence exonSequence = new ExonSequence(this, begin, end); //sense should be the same as parent
287                exonSequence.setAccession(accession);
288                exonSequenceList.add(exonSequence);
289                exonSequenceHashMap.put(accession.getID(), exonSequence);
290                return exonSequence;
291        }
292
293        /**
294         * Get the exons as an ArrayList. Modifying this list will not modify the underlying collection
295         * @return exons
296         */
297        public ArrayList<ExonSequence> getExonSequences() {
298                return new ArrayList<>(exonSequenceList);
299        }
300
301        /**
302         * Get the introns as an ArrayList. Modifying this list will not modify the underlying collection
303         * @return introns
304         */
305        public ArrayList<IntronSequence> getIntronSequences() {
306                return  new ArrayList<>(intronSequenceList);
307        }
308
309        /**
310         * Try to give method clarity where you want a DNASequence coding in the 5' to 3' direction
311         * Returns the DNASequence representative of the 5' and 3' reading based on strand
312         * @return dna sequence or null if sequence could not be generated.
313         */
314        public DNASequence getSequence5PrimeTo3Prime() {
315                String sequence = getSequenceAsString(this.getBioBegin(), this.getBioEnd(), this.getStrand());
316                if (getStrand() == Strand.NEGATIVE) {
317                        //need to take complement of sequence because it is negative and we are returning the gene sequence from the opposite strand
318                        StringBuilder b = new StringBuilder(getLength());
319                        CompoundSet<NucleotideCompound> compoundSet = this.getCompoundSet();
320                        for (int i = 0; i < sequence.length(); i++) {
321                                String nucleotide = String.valueOf(sequence.charAt(i));
322                                NucleotideCompound nucleotideCompound = compoundSet.getCompoundForString(nucleotide);
323                                b.append(nucleotideCompound.getComplement().getShortName());
324                        }
325                        sequence = b.toString();
326                }
327                DNASequence dnaSequence = null;
328                try {
329                        dnaSequence = new DNASequence(sequence.toUpperCase());
330                    dnaSequence.setAccession(new AccessionID(this.getAccession().getID()));
331                } catch (CompoundNotFoundException e) {
332                        // this should not happen, the sequence is DNA originally, if it does, there's a bug somewhere
333                        logger.error("Could not create new DNA sequence in getSequence5PrimeTo3Prime(). Error: {}",e.getMessage());
334                }
335                return dnaSequence;
336        }
337}