001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 * Created on DATE
021 *
022 */
023package org.biojava.nbio.core.sequence;
024
025import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
026import org.biojava.nbio.core.sequence.compound.DNACompoundSet;
027import org.biojava.nbio.core.sequence.transcription.TranscriptionEngine;
028import org.slf4j.Logger;
029import org.slf4j.LoggerFactory;
030
031import java.util.ArrayList;
032import java.util.Collections;
033import java.util.LinkedHashMap;
034import java.util.List;
035import java.util.Map;
036
037/**
038 * This is the sequence if you want to go from a gene sequence to a protein sequence. Need to start with a
039 * ChromosomeSequence then getting a GeneSequence and then a TranscriptSequence
040 * @author Scooter Willis
041 */
042public class TranscriptSequence extends DNASequence {
043
044        private final static Logger logger = LoggerFactory.getLogger(TranscriptSequence.class);
045
046        private final List<CDSSequence> cdsSequenceList = new ArrayList<>();
047        private final Map<String, CDSSequence> cdsSequenceHashMap = new LinkedHashMap<>();
048        private StartCodonSequence startCodonSequence = null;
049        private StopCodonSequence stopCodonSequence = null;
050        private GeneSequence parentGeneSequence = null;
051
052        /**
053         * Use {@code}public TranscriptSequence(GeneSequence parentDNASequence, AccessionID accessionID, int begin, int end){@code}
054         * that requires an explicit accessionID
055         * @deprecated
056         */
057        public TranscriptSequence(GeneSequence parentDNASequence, int begin, int end) {
058                setCompoundSet(DNACompoundSet.getDNACompoundSet());
059                try {
060                        initSequenceStorage(parentDNASequence.getSequenceAsString());
061                } catch (CompoundNotFoundException e) {
062                        throw new IllegalArgumentException(e);
063                }
064                setParentSequence(parentDNASequence);
065                this.parentGeneSequence = parentDNASequence;
066                setBioBegin(begin);
067                setBioEnd(end);
068        }
069
070        /**
071         *
072         * @param parentDNASequence
073         * @param accessionID
074         * @param begin
075         * @param end inclusive of end
076         * @throws  IllegalArgumentException if the parentDNASequence is incompatible with DNACompoundSet
077         */
078        public TranscriptSequence(GeneSequence parentDNASequence, AccessionID accessionID, int begin, int end) {
079                this(parentDNASequence, begin, end);
080                setAccession(accessionID);
081        }
082
083                @Override
084        public int getLength() {
085                return Math.abs(this.getBioEnd() - this.getBioBegin()) + 1;
086        }
087
088        /**
089         * @return the strand
090         */
091        public Strand getStrand() {
092                return parentGeneSequence.getStrand();
093        }
094
095        /**
096         * Remove a CDS or coding sequence from the transcript sequence
097         * @param accession
098         * @return
099         */
100        public CDSSequence removeCDS(String accession) {
101                for (CDSSequence cdsSequence : cdsSequenceList) {
102                        if (cdsSequence.getAccession().getID().equals(accession)) {
103                                cdsSequenceList.remove(cdsSequence);
104                                cdsSequenceHashMap.remove(accession);
105                                return cdsSequence;
106                        }
107                }
108                return null;
109        }
110
111        /**
112         * Get the CDS sequences that have been added to the TranscriptSequences
113         * @return
114         */
115        public Map<String, CDSSequence> getCDSSequences() {
116                return cdsSequenceHashMap;
117        }
118
119        /**
120         * Add a Coding Sequence region with phase to the transcript sequence
121         * @param accession
122         * @param begin
123         * @param end
124         * @param phase 0,1,2
125         * @return
126         */
127        public CDSSequence addCDS(AccessionID accession, int begin, int end, int phase) throws Exception {
128                if (cdsSequenceHashMap.containsKey(accession.getID())) {
129                        throw new Exception("Duplicate accession id " + accession.getID());
130                }
131                CDSSequence cdsSequence = new CDSSequence(this, begin, end, phase); //sense should be the same as parent
132                cdsSequence.setAccession(accession);
133                cdsSequenceList.add(cdsSequence);
134                Collections.sort(cdsSequenceList, new CDSComparator());
135                cdsSequenceHashMap.put(accession.getID(), cdsSequence);
136                return cdsSequence;
137        }
138
139        /**
140         * http://www.sequenceontology.org/gff3.shtml
141         * http://biowiki.org/~yam/bioe131/GFF.ppt
142         * @return
143         */
144        /**
145         * Return a list of protein sequences based on each CDS sequence
146         * where the phase shift between two CDS sequences is assigned to the
147         * CDS sequence that starts the triplet. This can be used to map
148         * a CDS/exon region of a protein sequence back to the DNA sequence
149         * If you have a protein sequence and a predicted gene you can take the
150         * predict CDS protein sequences and align back to the protein sequence.
151         * If you have errors in mapping the predicted protein CDS regions to
152         * an the known protein sequence then you can identify possible errors
153         * in the prediction
154         *
155         * @return
156         */
157        public List<ProteinSequence> getProteinCDSSequences() {
158                List<ProteinSequence> proteinSequenceList = new ArrayList<>();
159                for (int i = 0; i < cdsSequenceList.size(); i++) {
160                        CDSSequence cdsSequence = cdsSequenceList.get(i);
161                        String codingSequence = cdsSequence.getCodingSequence();
162                        //          logger.debug("CDS {} {} = {}", getStrand(), cdsSequence.getPhase(), codingSequence);
163                        if (this.getStrand() == Strand.NEGATIVE) {
164                                if (cdsSequence.phase == 1) {
165                                        codingSequence = codingSequence.substring(1, codingSequence.length());
166                                } else if (cdsSequence.phase == 2) {
167                                        codingSequence = codingSequence.substring(2, codingSequence.length());
168                                }
169                                if (i < cdsSequenceList.size() - 1) {
170                                        CDSSequence nextCDSSequence = cdsSequenceList.get(i + 1);
171                                        if (nextCDSSequence.phase == 1) {
172                                                String nextCodingSequence = nextCDSSequence.getCodingSequence();
173                                                codingSequence = codingSequence + nextCodingSequence.substring(0, 1);
174                                        } else if (nextCDSSequence.phase == 2) {
175                                                String nextCodingSequence = nextCDSSequence.getCodingSequence();
176                                                codingSequence = codingSequence + nextCodingSequence.substring(0, 2);
177                                        }
178                                }
179                        } else {
180                                if (cdsSequence.phase == 1) {
181                                        codingSequence = codingSequence.substring(1, codingSequence.length());
182                                } else if (cdsSequence.phase == 2) {
183                                        codingSequence = codingSequence.substring(2, codingSequence.length());
184                                }
185                                if (i < cdsSequenceList.size() - 1) {
186                                        CDSSequence nextCDSSequence = cdsSequenceList.get(i + 1);
187                                        if (nextCDSSequence.phase == 1) {
188                                                String nextCodingSequence = nextCDSSequence.getCodingSequence();
189                                                codingSequence = codingSequence + nextCodingSequence.substring(0, 1);
190                                        } else if (nextCDSSequence.phase == 2) {
191                                                String nextCodingSequence = nextCDSSequence.getCodingSequence();
192                                                codingSequence = codingSequence + nextCodingSequence.substring(0, 2);
193                                        }
194                                }
195                        }
196
197
198                        //    logger.debug("Coding Sequence: {}", codingSequence);
199
200                        DNASequence dnaCodingSequence = null;
201                        try {
202                                dnaCodingSequence = new DNASequence(codingSequence.toUpperCase());
203                        } catch (CompoundNotFoundException e) {
204                                // if I understand this should not happen, please correct if I'm wrong - JD 2014-10-24
205                                logger.error("Could not create DNA coding sequence, {}. This is most likely a bug.", e.getMessage());
206                        }
207                        RNASequence rnaCodingSequence = dnaCodingSequence.getRNASequence(TranscriptionEngine.getDefault());
208                        ProteinSequence proteinSequence = rnaCodingSequence.getProteinSequence(TranscriptionEngine.getDefault());
209                        proteinSequence.setAccession(new AccessionID(cdsSequence.getAccession().getID()));
210                        proteinSequence.setParentDNASequence(cdsSequence, 1, cdsSequence.getLength());
211                        proteinSequenceList.add(proteinSequence);
212                }
213                return proteinSequenceList;
214        }
215
216        /**
217         * Get the stitched together CDS sequences then maps to the cDNA
218         * @return
219         */
220        public DNASequence getDNACodingSequence() {
221                StringBuilder sb = new StringBuilder();
222                for (CDSSequence cdsSequence : cdsSequenceList) {
223                        sb.append(cdsSequence.getCodingSequence());
224                }
225
226                DNASequence dnaSequence = null;
227                try {
228                        dnaSequence = new DNASequence(sb.toString().toUpperCase());
229                } catch (CompoundNotFoundException e) {
230                        // if I understand this should not happen, please correct if I'm wrong - JD 2014-10-24
231                        logger.error("Could not create DNA coding sequence, {}. This is most likely a bug.", e.getMessage());
232                }
233                dnaSequence.setAccession(new AccessionID(this.getAccession().getID()));
234                return dnaSequence;
235        }
236
237        /**
238         * Get the protein sequence
239         * @return
240         */
241        public ProteinSequence getProteinSequence() {
242                return getProteinSequence(TranscriptionEngine.getDefault());
243        }
244
245        /**
246         * Get the protein sequence with user defined TranscriptEngine
247         * @param engine
248         * @return
249         */
250        public ProteinSequence getProteinSequence(TranscriptionEngine engine) {
251                DNASequence dnaCodingSequence = getDNACodingSequence();
252                RNASequence rnaCodingSequence = dnaCodingSequence.getRNASequence(engine);
253                ProteinSequence proteinSequence = rnaCodingSequence.getProteinSequence(engine);
254                proteinSequence.setAccession(new AccessionID(this.getAccession().getID()));
255
256                return proteinSequence;
257        }
258
259        /**
260         * @return the startCodonSequence
261         */
262        public StartCodonSequence getStartCodonSequence() {
263                return startCodonSequence;
264        }
265
266        /**
267         * Sets the start codon sequence at given begin /  end location. Note that calling this method multiple times
268         * will replace any existing value.
269         * @param accession
270         * @param begin
271         * @param end
272         */
273        public void addStartCodonSequence(AccessionID accession, int begin, int end) {
274                this.startCodonSequence = new StartCodonSequence(this, begin, end);
275                startCodonSequence.setAccession(accession);
276        }
277
278        /**
279         * @return the stopCodonSequence
280         */
281        public StopCodonSequence getStopCodonSequence() {
282                return stopCodonSequence;
283        }
284
285        /**
286         * Sets the stop codon sequence at given begin /  end location. Note that calling this method multiple times
287         * will replace any existing value.
288         * @param accession
289         * @param begin
290         * @param end
291         */
292        public void addStopCodonSequence(AccessionID accession, int begin, int end) {
293                this.stopCodonSequence = new StopCodonSequence(this, begin, end);
294                stopCodonSequence.setAccession(accession);
295        }
296}