001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 * Created on DATE
021 *
022 */
023package org.biojava.nbio.core.sequence;
024
025import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
026import org.biojava.nbio.core.sequence.compound.DNACompoundSet;
027import org.biojava.nbio.core.sequence.transcription.TranscriptionEngine;
028import org.slf4j.Logger;
029import org.slf4j.LoggerFactory;
030
031import java.util.ArrayList;
032import java.util.Collections;
033import java.util.LinkedHashMap;
034
035/**
036 * This is the sequence if you want to go from a gene sequence to a protein sequence. Need to start with a
037 * ChromosomeSequence then getting a GeneSequence and then a TranscriptSequence
038 * @author Scooter Willis
039 */
040public class TranscriptSequence extends DNASequence {
041
042        private final static Logger logger = LoggerFactory.getLogger(TranscriptSequence.class);
043
044        private final ArrayList<CDSSequence> cdsSequenceList = new ArrayList<CDSSequence>();
045        private final LinkedHashMap<String, CDSSequence> cdsSequenceHashMap = new LinkedHashMap<String, CDSSequence>();
046        private StartCodonSequence startCodonSequence = null;
047        private StopCodonSequence stopCodonSequence = null;
048        private GeneSequence parentGeneSequence = null;
049
050        /**
051         * Use {@code}public TranscriptSequence(GeneSequence parentDNASequence, AccessionID accessionID, int begin, int end){@code}
052         * that requires an explicit accessionID
053         * @deprecated
054         */
055        public TranscriptSequence(GeneSequence parentDNASequence, int begin, int end) {
056                setCompoundSet(DNACompoundSet.getDNACompoundSet());
057                try {
058                        initSequenceStorage(parentDNASequence.getSequenceAsString());
059                } catch (CompoundNotFoundException e) {
060                        throw new IllegalArgumentException(e);
061                }
062                setParentSequence(parentDNASequence);
063                this.parentGeneSequence = parentDNASequence;
064                setBioBegin(begin);
065                setBioEnd(end);
066        }
067
068        /**
069         *
070         * @param parentDNASequence
071         * @param accessionID
072         * @param begin
073         * @param end inclusive of end
074         * @throws  IllegalArgumentException if the parentDNASequence is incompatible with DNACompoundSet
075         */
076        public TranscriptSequence(GeneSequence parentDNASequence, AccessionID accessionID, int begin, int end) {
077                this(parentDNASequence, begin, end);
078                setAccession(accessionID);
079        }
080
081                @Override
082        public int getLength() {
083                return Math.abs(this.getBioEnd() - this.getBioBegin()) + 1;
084        }
085
086        /**
087         * @return the strand
088         */
089        public Strand getStrand() {
090                return parentGeneSequence.getStrand();
091        }
092
093        /**
094         * Remove a CDS or coding sequence from the transcript sequence
095         * @param accession
096         * @return
097         */
098        public CDSSequence removeCDS(String accession) {
099                for (CDSSequence cdsSequence : cdsSequenceList) {
100                        if (cdsSequence.getAccession().getID().equals(accession)) {
101                                cdsSequenceList.remove(cdsSequence);
102                                cdsSequenceHashMap.remove(accession);
103                                return cdsSequence;
104                        }
105                }
106                return null;
107        }
108
109        /**
110         * Get the CDS sequences that have been added to the TranscriptSequences
111         * @return
112         */
113        public LinkedHashMap<String, CDSSequence> getCDSSequences() {
114                return cdsSequenceHashMap;
115        }
116
117        /**
118         * Add a Coding Sequence region with phase to the transcript sequence
119         * @param accession
120         * @param begin
121         * @param end
122         * @param phase 0,1,2
123         * @return
124         */
125        public CDSSequence addCDS(AccessionID accession, int begin, int end, int phase) throws Exception {
126                if (cdsSequenceHashMap.containsKey(accession.getID())) {
127                        throw new Exception("Duplicate accession id " + accession.getID());
128                }
129                CDSSequence cdsSequence = new CDSSequence(this, begin, end, phase); //sense should be the same as parent
130                cdsSequence.setAccession(accession);
131                cdsSequenceList.add(cdsSequence);
132                Collections.sort(cdsSequenceList, new CDSComparator());
133                cdsSequenceHashMap.put(accession.getID(), cdsSequence);
134                return cdsSequence;
135        }
136
137        /**
138         * http://www.sequenceontology.org/gff3.shtml
139         * http://biowiki.org/~yam/bioe131/GFF.ppt
140         * @return
141         */
142        /**
143         * Return a list of protein sequences based on each CDS sequence
144         * where the phase shift between two CDS sequences is assigned to the
145         * CDS sequence that starts the triplet. This can be used to map
146         * a CDS/exon region of a protein sequence back to the DNA sequence
147         * If you have a protein sequence and a predicted gene you can take the
148         * predict CDS protein sequences and align back to the protein sequence.
149         * If you have errors in mapping the predicted protein CDS regions to
150         * an the known protein sequence then you can identify possible errors
151         * in the prediction
152         *
153         * @return
154         */
155        public ArrayList<ProteinSequence> getProteinCDSSequences() {
156                ArrayList<ProteinSequence> proteinSequenceList = new ArrayList<ProteinSequence>();
157                for (int i = 0; i < cdsSequenceList.size(); i++) {
158                        CDSSequence cdsSequence = cdsSequenceList.get(i);
159                        String codingSequence = cdsSequence.getCodingSequence();
160                        //          logger.debug("CDS {} {} = {}", getStrand(), cdsSequence.getPhase(), codingSequence);
161                        if (this.getStrand() == Strand.NEGATIVE) {
162                                if (cdsSequence.phase == 1) {
163                                        codingSequence = codingSequence.substring(1, codingSequence.length());
164                                } else if (cdsSequence.phase == 2) {
165                                        codingSequence = codingSequence.substring(2, codingSequence.length());
166                                }
167                                if (i < cdsSequenceList.size() - 1) {
168                                        CDSSequence nextCDSSequence = cdsSequenceList.get(i + 1);
169                                        if (nextCDSSequence.phase == 1) {
170                                                String nextCodingSequence = nextCDSSequence.getCodingSequence();
171                                                codingSequence = codingSequence + nextCodingSequence.substring(0, 1);
172                                        } else if (nextCDSSequence.phase == 2) {
173                                                String nextCodingSequence = nextCDSSequence.getCodingSequence();
174                                                codingSequence = codingSequence + nextCodingSequence.substring(0, 2);
175                                        }
176                                }
177                        } else {
178                                if (cdsSequence.phase == 1) {
179                                        codingSequence = codingSequence.substring(1, codingSequence.length());
180                                } else if (cdsSequence.phase == 2) {
181                                        codingSequence = codingSequence.substring(2, codingSequence.length());
182                                }
183                                if (i < cdsSequenceList.size() - 1) {
184                                        CDSSequence nextCDSSequence = cdsSequenceList.get(i + 1);
185                                        if (nextCDSSequence.phase == 1) {
186                                                String nextCodingSequence = nextCDSSequence.getCodingSequence();
187                                                codingSequence = codingSequence + nextCodingSequence.substring(0, 1);
188                                        } else if (nextCDSSequence.phase == 2) {
189                                                String nextCodingSequence = nextCDSSequence.getCodingSequence();
190                                                codingSequence = codingSequence + nextCodingSequence.substring(0, 2);
191                                        }
192                                }
193                        }
194
195
196                        //    logger.debug("Coding Sequence: {}", codingSequence);
197
198                        DNASequence dnaCodingSequence = null;
199                        try {
200                                dnaCodingSequence = new DNASequence(codingSequence.toUpperCase());
201                        } catch (CompoundNotFoundException e) {
202                                // if I understand this should not happen, please correct if I'm wrong - JD 2014-10-24
203                                logger.error("Could not create DNA coding sequence, {}. This is most likely a bug.", e.getMessage());
204                        }
205                        RNASequence rnaCodingSequence = dnaCodingSequence.getRNASequence(TranscriptionEngine.getDefault());
206                        ProteinSequence proteinSequence = rnaCodingSequence.getProteinSequence(TranscriptionEngine.getDefault());
207                        proteinSequence.setAccession(new AccessionID(cdsSequence.getAccession().getID()));
208                        proteinSequence.setParentDNASequence(cdsSequence, 1, cdsSequence.getLength());
209                        proteinSequenceList.add(proteinSequence);
210                }
211                return proteinSequenceList;
212        }
213
214        /**
215         * Get the stitched together CDS sequences then maps to the cDNA
216         * @return
217         */
218        public DNASequence getDNACodingSequence() {
219                StringBuilder sb = new StringBuilder();
220                for (CDSSequence cdsSequence : cdsSequenceList) {
221                        sb.append(cdsSequence.getCodingSequence());
222                }
223
224                DNASequence dnaSequence = null;
225                try {
226                        dnaSequence = new DNASequence(sb.toString().toUpperCase());
227                } catch (CompoundNotFoundException e) {
228                        // if I understand this should not happen, please correct if I'm wrong - JD 2014-10-24
229                        logger.error("Could not create DNA coding sequence, {}. This is most likely a bug.", e.getMessage());
230                }
231                dnaSequence.setAccession(new AccessionID(this.getAccession().getID()));
232                return dnaSequence;
233        }
234
235        /**
236         * Get the protein sequence
237         * @return
238         */
239        public ProteinSequence getProteinSequence() {
240                return getProteinSequence(TranscriptionEngine.getDefault());
241        }
242
243        /**
244         * Get the protein sequence with user defined TranscriptEngine
245         * @param engine
246         * @return
247         */
248        public ProteinSequence getProteinSequence(TranscriptionEngine engine) {
249                DNASequence dnaCodingSequence = getDNACodingSequence();
250                RNASequence rnaCodingSequence = dnaCodingSequence.getRNASequence(engine);
251                ProteinSequence proteinSequence = rnaCodingSequence.getProteinSequence(engine);
252                proteinSequence.setAccession(new AccessionID(this.getAccession().getID()));
253
254                return proteinSequence;
255        }
256
257        /**
258         * @return the startCodonSequence
259         */
260        public StartCodonSequence getStartCodonSequence() {
261                return startCodonSequence;
262        }
263
264        /**
265         * Sets the start codon sequence at given begin /  end location. Note that calling this method multiple times
266         * will replace any existing value.
267         * @param accession
268         * @param begin
269         * @param end
270         */
271        public void addStartCodonSequence(AccessionID accession, int begin, int end) {
272                this.startCodonSequence = new StartCodonSequence(this, begin, end);
273                startCodonSequence.setAccession(accession);
274        }
275
276        /**
277         * @return the stopCodonSequence
278         */
279        public StopCodonSequence getStopCodonSequence() {
280                return stopCodonSequence;
281        }
282
283        /**
284         * Sets the stop codon sequence at given begin /  end location. Note that calling this method multiple times
285         * will replace any existing value.
286         * @param accession
287         * @param begin
288         * @param end
289         */
290        public void addStopCodonSequence(AccessionID accession, int begin, int end) {
291                this.stopCodonSequence = new StopCodonSequence(this, begin, end);
292                stopCodonSequence.setAccession(accession);
293        }
294}