Source code

001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 * Created on DATE
021 *
022 */
023package org.biojava.nbio.core.sequence;
024
025import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
026import org.biojava.nbio.core.sequence.compound.DNACompoundSet;
027import org.biojava.nbio.core.sequence.compound.NucleotideCompound;
028import org.biojava.nbio.core.sequence.template.CompoundSet;
029import org.slf4j.Logger;
030import org.slf4j.LoggerFactory;
031
032import java.util.ArrayList;
033import java.util.Collections;
034import java.util.LinkedHashMap;
035import java.util.Map;
036import java.util.List;
037
038/**
039 *
040 * @author Scooter Willis
041 */
042public class GeneSequence extends DNASequence {
043
044        private final static Logger logger = LoggerFactory.getLogger(GeneSequence.class);
045
046        private final Map<String, TranscriptSequence> transcriptSequenceHashMap = new LinkedHashMap<>();
047        private final Map<String, IntronSequence> intronSequenceHashMap = new LinkedHashMap<>();
048        private final Map<String, ExonSequence> exonSequenceHashMap = new LinkedHashMap<>();
049        private final List<IntronSequence> intronSequenceList = new ArrayList<>();
050        private final List<ExonSequence> exonSequenceList = new ArrayList<>();
051        boolean intronAdded = false; // need to deal with the problem that typically introns are not added when validating the list and adding in introns as the regions not included in exons
052        private Strand strand = Strand.UNDEFINED;
053        private ChromosomeSequence chromosomeSequence;
054
055        /**
056         * Use GeneSequence(ChromosomeSequence parentSequence, AccessionID accessionId,  int begin, int end, Strand strand)
057         * which mandates an accessionID.
058         * @param parentSequence
059         * @param begin
060         * @param end inclusive of end
061         * @param strand force a gene to have strand and transcription sequence will inherit
062         * @deprecated
063         */
064        public GeneSequence(ChromosomeSequence parentSequence,  int begin, int end, Strand strand) {
065                setCompoundSet(DNACompoundSet.getDNACompoundSet());
066                try {
067                        initSequenceStorage(parentSequence.getSequenceAsString());
068                } catch (CompoundNotFoundException e) {
069                        throw new IllegalArgumentException(e);
070                }
071                chromosomeSequence = parentSequence;
072                setParentSequence(parentSequence);
073                setBioBegin(begin);
074                setBioEnd(end);
075                setStrand(strand);
076        }
077
078        /**
079         * A class that keeps track of the details of a GeneSequence which is difficult to properly model. Two important concepts that is difficult
080         * to make everything flexible but still work. You can have GFF features that only describe Exons or Exons/Introns or CDS regions and one
081         * or more Transcriptions. You can have exon sequences but that does not imply transcription to the actual protein.
082         *
083         * The GeneSequence will keep track of Exons and Introns but to get a Protein sequence you need to start with a
084         * TranscriptSequence and then add CDS sequences.
085         *
086         * This is also a key class in the biojava-3-genome module for reading and writing GFF3 files
087         *
088         * @param parentSequence
089         * @param accessionId An identifier for the gene.
090         * @param begin
091         * @param end
092         * @param strand force a gene to have strand and transcription sequence will inherit
093         */
094        public GeneSequence(ChromosomeSequence parentSequence, AccessionID accessionId,  int begin, int end, Strand strand) {
095                this(parentSequence,begin,end,strand);
096                setAccession(accessionId);
097        }
098
099        /**
100         * The parent ChromosomeSequence which contains the actual DNA sequence data
101         * @return Chromosome sequence
102         */
103        public ChromosomeSequence getParentChromosomeSequence() {
104                return chromosomeSequence;
105        }
106
107        @Override
108        public int getLength() {
109                return Math.abs(this.getBioEnd() - this.getBioBegin()) + 1;
110        }
111
112        /**
113         * Once everything has been added to the gene sequence where you might have added exon sequences only then you
114         * can infer the intron sequences and add them. You may also have the case where you only added one or more
115         * TranscriptSequences and from that you can infer the exon sequences and intron sequences.
116         * Currently not implement
117         */
118        public void addIntronsUsingExons() throws Exception {
119                if (intronAdded) { //going to assume introns are correct
120                        return;
121                }
122                if (exonSequenceList.size() == 0) {
123                        return;
124                }
125                ExonComparator exonComparator = new ExonComparator();
126                //sort based on start position and sense;
127                Collections.sort(exonSequenceList, exonComparator);
128                int shift = -1;
129                if (getStrand() == Strand.NEGATIVE) {
130                        shift = 1;
131                }
132                //ExonSequence firstExonSequence = exonSequenceList.get(0);
133                int intronIndex = 1;
134//       if (firstExonSequence.getBioBegin().intValue() != getBioBegin().intValue()) {
135//           this.addIntron(new AccessionID(this.getAccession().getID() + "-" + "intron" + intronIndex), getBioBegin(), firstExonSequence.getBioBegin() + shift);
136//           intronIndex++;
137//       }
138                for (int i = 0; i < exonSequenceList.size() - 1; i++) {
139                        ExonSequence exon1 = exonSequenceList.get(i);
140                        ExonSequence exon2 = exonSequenceList.get(i + 1);
141                        AccessionID intronId= new AccessionID(this.getAccession().getID() + "-" + "intron" + intronIndex);
142                        this.addIntron(intronId, exon1.getBioEnd() - shift, exon2.getBioBegin() + shift);
143                        intronIndex++;
144                }
145
146//       ExonSequence lastExonSequence = exonSequenceList.get(exonSequenceList.size() - 1);
147//       if (lastExonSequence.getBioEnd().intValue() != getBioEnd().intValue()) {
148//           this.addIntron(new AccessionID(this.getAccession().getID() + "-" + "intron" + intronIndex), lastExonSequence.getBioEnd() - shift, getBioEnd());
149//           intronIndex++;
150//       }
151
152                //    log.severe("Add in support for building introns based on added exons");
153
154        }
155
156        /**
157         * A gene should have Strand
158         * @return the strand
159         */
160        public Strand getStrand() {
161                return strand;
162        }
163
164        /**
165         * @param strand the strand to set
166         */
167        public void setStrand(Strand strand) {
168                this.strand = strand;
169        }
170
171        /**
172         * Get the transcript sequence by accession
173         * @param accession
174         * @return the transcript
175         */
176        public TranscriptSequence getTranscript(String accession) {
177                return transcriptSequenceHashMap.get(accession);
178        }
179
180        /**
181         * Get the collection of transcription sequences assigned to this gene
182         * @return transcripts
183         */
184        public Map<String, TranscriptSequence> getTranscripts() {
185                return transcriptSequenceHashMap;
186        }
187
188        /**
189         * Remove the transcript sequence from the gene
190         * @param accession
191         * @return transcriptsequence
192         */
193        public TranscriptSequence removeTranscript(String accession) {
194                return transcriptSequenceHashMap.remove(accession);
195        }
196
197        /**
198         * Add a transcription sequence to a gene which describes a ProteinSequence
199         * @param accession
200         * @param begin
201         * @param end
202         * @return transcript sequence
203         * @throws Exception If the accession id is already used
204         */
205        public TranscriptSequence addTranscript(AccessionID accession, int begin, int end) throws Exception {
206                if (transcriptSequenceHashMap.containsKey(accession.getID())) {
207                        throw new Exception("Duplicate accesion id " + accession.getID());
208                }
209                TranscriptSequence transcriptSequence = new TranscriptSequence(this, begin, end);
210                transcriptSequence.setAccession(accession);
211                transcriptSequenceHashMap.put(accession.getID(), transcriptSequence);
212                return transcriptSequence;
213        }
214
215        /**
216         * Remove the intron by accession
217         * @param accession
218         * @return the removed intron sequence, or null if no intron with that accession exists.
219         */
220        public IntronSequence removeIntron(String accession) {
221                for (IntronSequence intronSequence : intronSequenceList) {
222                        if (intronSequence.getAccession().getID().equals(accession)) {
223                                intronSequenceList.remove(intronSequence);
224                                intronSequenceHashMap.remove(accession);
225                                return intronSequence;
226                        }
227                }
228                return null;
229        }
230
231        /**
232         * Add an Intron Currently used to mark an IntronSequence as a feature
233         * @param accession
234         * @param begin
235         * @param end
236         * @return intron sequence
237         */
238        public IntronSequence addIntron(AccessionID accession, int begin, int end) throws Exception {
239                if (intronSequenceHashMap.containsKey(accession.getID())) {
240                        throw new Exception("Duplicate accesion id " + accession.getID());
241                }
242                intronAdded = true;
243                IntronSequence intronSequence = new IntronSequence(this, begin, end); // working off the assumption that intron frame is always 0 or doesn't matter and same sense as parent
244                intronSequence.setAccession(accession);
245                intronSequenceList.add(intronSequence);
246                intronSequenceHashMap.put(accession.getID(), intronSequence);
247                return intronSequence;
248        }
249
250        /**
251         * Remove the exon sequence
252         * @param accession
253         * @return exon sequence
254         */
255        public ExonSequence removeExon(String accession) {
256                for (ExonSequence exonSequence : exonSequenceList) {
257                        if (exonSequence.getAccession().getID().equals(accession)) {
258                                exonSequenceList.remove(exonSequence);
259                                exonSequenceHashMap.remove(accession);
260                                // we now have a new gap which creates an intron
261                                intronSequenceList.clear();
262                                intronSequenceHashMap.clear();
263                                intronAdded = false;
264                                try{
265                                        addIntronsUsingExons();
266                                } catch(Exception e){
267                                        logger.error("Remove Exon validate() error " + e.getMessage());
268                                }
269                                return exonSequence;
270                        }
271                }
272                return null;
273        }
274
275        /**
276         * Add an ExonSequence mainly used to mark as a feature
277         * @param accession
278         * @param begin
279         * @param end
280         * @return exon sequence
281         * @throws IllegalArgumentException if accessionID is already added.
282         */
283        public ExonSequence addExon(AccessionID accession, int begin, int end) {
284                if (exonSequenceHashMap.containsKey(accession.getID())) {
285                        throw new IllegalArgumentException("Duplicate accession id: " + accession.getID());
286                }
287
288                ExonSequence exonSequence = new ExonSequence(this, begin, end); //sense should be the same as parent
289                exonSequence.setAccession(accession);
290                exonSequenceList.add(exonSequence);
291                exonSequenceHashMap.put(accession.getID(), exonSequence);
292                return exonSequence;
293        }
294
295        /**
296         * Get the exons as an ArrayList. Modifying this list will not modify the underlying collection
297         * @return exons
298         */
299        public List<ExonSequence> getExonSequences() {
300                return new ArrayList<>(exonSequenceList);
301        }
302
303        /**
304         * Get the introns as an ArrayList. Modifying this list will not modify the underlying collection
305         * @return introns
306         */
307        public List<IntronSequence> getIntronSequences() {
308                return  new ArrayList<>(intronSequenceList);
309        }
310
311        /**
312         * Try to give method clarity where you want a DNASequence coding in the 5' to 3' direction
313         * Returns the DNASequence representative of the 5' and 3' reading based on strand
314         * @return dna sequence or null if sequence could not be generated.
315         */
316        public DNASequence getSequence5PrimeTo3Prime() {
317                String sequence = getSequenceAsString(this.getBioBegin(), this.getBioEnd(), this.getStrand());
318                if (getStrand() == Strand.NEGATIVE) {
319                        //need to take complement of sequence because it is negative and we are returning the gene sequence from the opposite strand
320                        StringBuilder b = new StringBuilder(getLength());
321                        CompoundSet<NucleotideCompound> compoundSet = this.getCompoundSet();
322                        for (int i = 0; i < sequence.length(); i++) {
323                                String nucleotide = String.valueOf(sequence.charAt(i));
324                                NucleotideCompound nucleotideCompound = compoundSet.getCompoundForString(nucleotide);
325                                b.append(nucleotideCompound.getComplement().getShortName());
326                        }
327                        sequence = b.toString();
328                }
329                DNASequence dnaSequence = null;
330                try {
331                        dnaSequence = new DNASequence(sequence.toUpperCase());
332                    dnaSequence.setAccession(new AccessionID(this.getAccession().getID()));
333                } catch (CompoundNotFoundException e) {
334                        // this should not happen, the sequence is DNA originally, if it does, there's a bug somewhere
335                        logger.error("Could not create new DNA sequence in getSequence5PrimeTo3Prime(). Error: {}",e.getMessage());
336                }
337                return dnaSequence;
338        }
339}