001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 * Created on DATE
021 *
022 */
023package org.biojava.nbio.core.sequence;
024
025import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
026import org.biojava.nbio.core.sequence.compound.DNACompoundSet;
027import org.biojava.nbio.core.sequence.transcription.TranscriptionEngine;
028import org.slf4j.Logger;
029import org.slf4j.LoggerFactory;
030
031import java.util.ArrayList;
032import java.util.Collections;
033import java.util.LinkedHashMap;
034
035/**
036 * This is the sequence if you want to go from a gene sequence to a protein sequence. Need to start with a
037 * ChromosomeSequence then getting a GeneSequence and then a TranscriptSequence
038 * @author Scooter Willis
039 */
040public class TranscriptSequence extends DNASequence {
041
042        private final static Logger logger = LoggerFactory.getLogger(TranscriptSequence.class);
043
044        private final ArrayList<CDSSequence> cdsSequenceList = new ArrayList<CDSSequence>();
045        private final LinkedHashMap<String, CDSSequence> cdsSequenceHashMap = new LinkedHashMap<String, CDSSequence>();
046        private StartCodonSequence startCodonSequence = null;
047        private StopCodonSequence stopCodonSequence = null;
048        private GeneSequence parentGeneSequence = null;
049
050        /**
051         *
052         * @param parentDNASequence
053         * @param begin
054         * @param end inclusive of end
055         */
056        public TranscriptSequence(GeneSequence parentDNASequence, int begin, int end) {
057                setParentSequence(parentDNASequence);
058                this.parentGeneSequence = parentDNASequence;
059                setBioBegin(begin);
060                setBioEnd(end);
061                this.setCompoundSet(DNACompoundSet.getDNACompoundSet());
062
063        }
064
065                @Override
066        public int getLength() {
067                return Math.abs(this.getBioEnd() - this.getBioBegin()) + 1;
068        }
069
070        /**
071         * @return the strand
072         */
073        public Strand getStrand() {
074                return parentGeneSequence.getStrand();
075        }
076
077        /**
078         * Remove a CDS or coding sequence from the transcript sequence
079         * @param accession
080         * @return
081         */
082        public CDSSequence removeCDS(String accession) {
083                for (CDSSequence cdsSequence : cdsSequenceList) {
084                        if (cdsSequence.getAccession().getID().equals(accession)) {
085                                cdsSequenceList.remove(cdsSequence);
086                                cdsSequenceHashMap.remove(accession);
087                                return cdsSequence;
088                        }
089                }
090                return null;
091        }
092
093        /**
094         * Get the CDS sequences that have been added to the TranscriptSequences
095         * @return
096         */
097        public LinkedHashMap<String, CDSSequence> getCDSSequences() {
098                return cdsSequenceHashMap;
099        }
100
101        /**
102         * Add a Coding Sequence region with phase to the transcript sequence
103         * @param accession
104         * @param begin
105         * @param end
106         * @param phase 0,1,2
107         * @return
108         */
109        public CDSSequence addCDS(AccessionID accession, int begin, int end, int phase) throws Exception {
110                if (cdsSequenceHashMap.containsKey(accession.getID())) {
111                        throw new Exception("Duplicate accesion id " + accession.getID());
112                }
113                CDSSequence cdsSequence = new CDSSequence(this, begin, end, phase); //sense should be the same as parent
114                cdsSequence.setAccession(accession);
115                cdsSequenceList.add(cdsSequence);
116                Collections.sort(cdsSequenceList, new CDSComparator());
117                cdsSequenceHashMap.put(accession.getID(), cdsSequence);
118                return cdsSequence;
119        }
120
121        /**
122         * http://www.sequenceontology.org/gff3.shtml
123         * http://biowiki.org/~yam/bioe131/GFF.ppt
124         * @return
125         */
126        /**
127         * Return a list of protein sequences based on each CDS sequence
128         * where the phase shift between two CDS sequences is assigned to the
129         * CDS sequence that starts the triplet. This can be used to map
130         * a CDS/exon region of a protein sequence back to the DNA sequence
131         * If you have a protein sequence and a predicted gene you can take the
132         * predict CDS protein sequences and align back to the protein sequence.
133         * If you have errors in mapping the predicted protein CDS regions to
134         * an the known protein sequence then you can identify possible errors
135         * in the prediction
136         *
137         * @return
138         */
139        public ArrayList<ProteinSequence> getProteinCDSSequences() {
140                ArrayList<ProteinSequence> proteinSequenceList = new ArrayList<ProteinSequence>();
141                for (int i = 0; i < cdsSequenceList.size(); i++) {
142                        CDSSequence cdsSequence = cdsSequenceList.get(i);
143                        String codingSequence = cdsSequence.getCodingSequence();
144                        //          logger.debug("CDS {} {} = {}", getStrand(), cdsSequence.getPhase(), codingSequence);
145                        if (this.getStrand() == Strand.NEGATIVE) {
146                                if (cdsSequence.phase == 1) {
147                                        codingSequence = codingSequence.substring(1, codingSequence.length());
148                                } else if (cdsSequence.phase == 2) {
149                                        codingSequence = codingSequence.substring(2, codingSequence.length());
150                                }
151                                if (i < cdsSequenceList.size() - 1) {
152                                        CDSSequence nextCDSSequence = cdsSequenceList.get(i + 1);
153                                        if (nextCDSSequence.phase == 1) {
154                                                String nextCodingSequence = nextCDSSequence.getCodingSequence();
155                                                codingSequence = codingSequence + nextCodingSequence.substring(0, 1);
156                                        } else if (nextCDSSequence.phase == 2) {
157                                                String nextCodingSequence = nextCDSSequence.getCodingSequence();
158                                                codingSequence = codingSequence + nextCodingSequence.substring(0, 2);
159                                        }
160                                }
161                        } else {
162                                if (cdsSequence.phase == 1) {
163                                        codingSequence = codingSequence.substring(1, codingSequence.length());
164                                } else if (cdsSequence.phase == 2) {
165                                        codingSequence = codingSequence.substring(2, codingSequence.length());
166                                }
167                                if (i < cdsSequenceList.size() - 1) {
168                                        CDSSequence nextCDSSequence = cdsSequenceList.get(i + 1);
169                                        if (nextCDSSequence.phase == 1) {
170                                                String nextCodingSequence = nextCDSSequence.getCodingSequence();
171                                                codingSequence = codingSequence + nextCodingSequence.substring(0, 1);
172                                        } else if (nextCDSSequence.phase == 2) {
173                                                String nextCodingSequence = nextCDSSequence.getCodingSequence();
174                                                codingSequence = codingSequence + nextCodingSequence.substring(0, 2);
175                                        }
176                                }
177                        }
178
179
180                        //    logger.debug("Coding Sequence: {}", codingSequence);
181
182                        DNASequence dnaCodingSequence = null;
183                        try {
184                                dnaCodingSequence = new DNASequence(codingSequence.toUpperCase());
185                        } catch (CompoundNotFoundException e) {
186                                // if I understand this should not happen, please correct if I'm wrong - JD 2014-10-24
187                                logger.error("Could not create DNA coding sequence, {}. This is most likely a bug.", e.getMessage());
188                        }
189                        RNASequence rnaCodingSequence = dnaCodingSequence.getRNASequence(TranscriptionEngine.getDefault());
190                        ProteinSequence proteinSequence = rnaCodingSequence.getProteinSequence(TranscriptionEngine.getDefault());
191                        proteinSequence.setAccession(new AccessionID(cdsSequence.getAccession().getID()));
192                        proteinSequence.setParentDNASequence(cdsSequence, 1, cdsSequence.getLength());
193                        proteinSequenceList.add(proteinSequence);
194                }
195                return proteinSequenceList;
196        }
197
198        /**
199         * Get the stitched together CDS sequences then maps to the cDNA
200         * @return
201         */
202        public DNASequence getDNACodingSequence() {
203                StringBuilder sb = new StringBuilder();
204                for (CDSSequence cdsSequence : cdsSequenceList) {
205                        sb.append(cdsSequence.getCodingSequence());
206                }
207
208                DNASequence dnaSequence = null;
209                try {
210                        dnaSequence = new DNASequence(sb.toString().toUpperCase());
211                } catch (CompoundNotFoundException e) {
212                        // if I understand this should not happen, please correct if I'm wrong - JD 2014-10-24
213                        logger.error("Could not create DNA coding sequence, {}. This is most likely a bug.", e.getMessage());
214                }
215                dnaSequence.setAccession(new AccessionID(this.getAccession().getID()));
216                return dnaSequence;
217        }
218
219        /**
220         * Get the protein sequence
221         * @return
222         */
223        public ProteinSequence getProteinSequence() {
224                return getProteinSequence(TranscriptionEngine.getDefault());
225        }
226
227        /**
228         * Get the protein sequence with user defined TranscriptEngine
229         * @param engine
230         * @return
231         */
232        public ProteinSequence getProteinSequence(TranscriptionEngine engine) {
233                DNASequence dnaCodingSequence = getDNACodingSequence();
234                RNASequence rnaCodingSequence = dnaCodingSequence.getRNASequence(engine);
235                ProteinSequence proteinSequence = rnaCodingSequence.getProteinSequence(engine);
236                proteinSequence.setAccession(new AccessionID(this.getAccession().getID()));
237
238                return proteinSequence;
239        }
240
241        /**
242         * @return the startCodonSequence
243         */
244        public StartCodonSequence getStartCodonSequence() {
245                return startCodonSequence;
246        }
247
248        /**
249         * @param startCodonSequence the startCodonSequence to set
250         */
251        public void addStartCodonSequence(AccessionID accession, int begin, int end) {
252                this.startCodonSequence = new StartCodonSequence(this, begin, end);
253                startCodonSequence.setAccession(accession);
254        }
255
256        /**
257         * @return the stopCodonSequence
258         */
259        public StopCodonSequence getStopCodonSequence() {
260                return stopCodonSequence;
261        }
262
263        /**
264         * @param stopCodonSequence the stopCodonSequence to set
265         */
266        public void addStopCodonSequence(AccessionID accession, int begin, int end) {
267                this.stopCodonSequence = new StopCodonSequence(this, begin, end);
268                stopCodonSequence.setAccession(accession);
269        }
270}