001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 * Created on DATE 021 * 022 */ 023package org.biojava.nbio.core.sequence; 024 025import org.biojava.nbio.core.exceptions.CompoundNotFoundException; 026import org.biojava.nbio.core.sequence.compound.DNACompoundSet; 027import org.biojava.nbio.core.sequence.transcription.TranscriptionEngine; 028import org.slf4j.Logger; 029import org.slf4j.LoggerFactory; 030 031import java.util.ArrayList; 032import java.util.Collections; 033import java.util.LinkedHashMap; 034 035/** 036 * This is the sequence if you want to go from a gene sequence to a protein sequence. Need to start with a 037 * ChromosomeSequence then getting a GeneSequence and then a TranscriptSequence 038 * @author Scooter Willis 039 */ 040public class TranscriptSequence extends DNASequence { 041 042 private final static Logger logger = LoggerFactory.getLogger(TranscriptSequence.class); 043 044 private final ArrayList<CDSSequence> cdsSequenceList = new ArrayList<CDSSequence>(); 045 private final LinkedHashMap<String, CDSSequence> cdsSequenceHashMap = new LinkedHashMap<String, CDSSequence>(); 046 private StartCodonSequence startCodonSequence = null; 047 private StopCodonSequence stopCodonSequence = null; 048 private GeneSequence parentGeneSequence = null; 049 050 /** 051 * Use {@code}public TranscriptSequence(GeneSequence parentDNASequence, AccessionID accessionID, int begin, int end){@code} 052 * that requires an explicit accessionID 053 * @deprecated 054 */ 055 public TranscriptSequence(GeneSequence parentDNASequence, int begin, int end) { 056 setCompoundSet(DNACompoundSet.getDNACompoundSet()); 057 try { 058 initSequenceStorage(parentDNASequence.getSequenceAsString()); 059 } catch (CompoundNotFoundException e) { 060 throw new IllegalArgumentException(e); 061 } 062 setParentSequence(parentDNASequence); 063 this.parentGeneSequence = parentDNASequence; 064 setBioBegin(begin); 065 setBioEnd(end); 066 } 067 068 /** 069 * 070 * @param parentDNASequence 071 * @param accessionID 072 * @param begin 073 * @param end inclusive of end 074 * @throws IllegalArgumentException if the parentDNASequence is incompatible with DNACompoundSet 075 */ 076 public TranscriptSequence(GeneSequence parentDNASequence, AccessionID accessionID, int begin, int end) { 077 this(parentDNASequence, begin, end); 078 setAccession(accessionID); 079 } 080 081 @Override 082 public int getLength() { 083 return Math.abs(this.getBioEnd() - this.getBioBegin()) + 1; 084 } 085 086 /** 087 * @return the strand 088 */ 089 public Strand getStrand() { 090 return parentGeneSequence.getStrand(); 091 } 092 093 /** 094 * Remove a CDS or coding sequence from the transcript sequence 095 * @param accession 096 * @return 097 */ 098 public CDSSequence removeCDS(String accession) { 099 for (CDSSequence cdsSequence : cdsSequenceList) { 100 if (cdsSequence.getAccession().getID().equals(accession)) { 101 cdsSequenceList.remove(cdsSequence); 102 cdsSequenceHashMap.remove(accession); 103 return cdsSequence; 104 } 105 } 106 return null; 107 } 108 109 /** 110 * Get the CDS sequences that have been added to the TranscriptSequences 111 * @return 112 */ 113 public LinkedHashMap<String, CDSSequence> getCDSSequences() { 114 return cdsSequenceHashMap; 115 } 116 117 /** 118 * Add a Coding Sequence region with phase to the transcript sequence 119 * @param accession 120 * @param begin 121 * @param end 122 * @param phase 0,1,2 123 * @return 124 */ 125 public CDSSequence addCDS(AccessionID accession, int begin, int end, int phase) throws Exception { 126 if (cdsSequenceHashMap.containsKey(accession.getID())) { 127 throw new Exception("Duplicate accession id " + accession.getID()); 128 } 129 CDSSequence cdsSequence = new CDSSequence(this, begin, end, phase); //sense should be the same as parent 130 cdsSequence.setAccession(accession); 131 cdsSequenceList.add(cdsSequence); 132 Collections.sort(cdsSequenceList, new CDSComparator()); 133 cdsSequenceHashMap.put(accession.getID(), cdsSequence); 134 return cdsSequence; 135 } 136 137 /** 138 * http://www.sequenceontology.org/gff3.shtml 139 * http://biowiki.org/~yam/bioe131/GFF.ppt 140 * @return 141 */ 142 /** 143 * Return a list of protein sequences based on each CDS sequence 144 * where the phase shift between two CDS sequences is assigned to the 145 * CDS sequence that starts the triplet. This can be used to map 146 * a CDS/exon region of a protein sequence back to the DNA sequence 147 * If you have a protein sequence and a predicted gene you can take the 148 * predict CDS protein sequences and align back to the protein sequence. 149 * If you have errors in mapping the predicted protein CDS regions to 150 * an the known protein sequence then you can identify possible errors 151 * in the prediction 152 * 153 * @return 154 */ 155 public ArrayList<ProteinSequence> getProteinCDSSequences() { 156 ArrayList<ProteinSequence> proteinSequenceList = new ArrayList<ProteinSequence>(); 157 for (int i = 0; i < cdsSequenceList.size(); i++) { 158 CDSSequence cdsSequence = cdsSequenceList.get(i); 159 String codingSequence = cdsSequence.getCodingSequence(); 160 // logger.debug("CDS {} {} = {}", getStrand(), cdsSequence.getPhase(), codingSequence); 161 if (this.getStrand() == Strand.NEGATIVE) { 162 if (cdsSequence.phase == 1) { 163 codingSequence = codingSequence.substring(1, codingSequence.length()); 164 } else if (cdsSequence.phase == 2) { 165 codingSequence = codingSequence.substring(2, codingSequence.length()); 166 } 167 if (i < cdsSequenceList.size() - 1) { 168 CDSSequence nextCDSSequence = cdsSequenceList.get(i + 1); 169 if (nextCDSSequence.phase == 1) { 170 String nextCodingSequence = nextCDSSequence.getCodingSequence(); 171 codingSequence = codingSequence + nextCodingSequence.substring(0, 1); 172 } else if (nextCDSSequence.phase == 2) { 173 String nextCodingSequence = nextCDSSequence.getCodingSequence(); 174 codingSequence = codingSequence + nextCodingSequence.substring(0, 2); 175 } 176 } 177 } else { 178 if (cdsSequence.phase == 1) { 179 codingSequence = codingSequence.substring(1, codingSequence.length()); 180 } else if (cdsSequence.phase == 2) { 181 codingSequence = codingSequence.substring(2, codingSequence.length()); 182 } 183 if (i < cdsSequenceList.size() - 1) { 184 CDSSequence nextCDSSequence = cdsSequenceList.get(i + 1); 185 if (nextCDSSequence.phase == 1) { 186 String nextCodingSequence = nextCDSSequence.getCodingSequence(); 187 codingSequence = codingSequence + nextCodingSequence.substring(0, 1); 188 } else if (nextCDSSequence.phase == 2) { 189 String nextCodingSequence = nextCDSSequence.getCodingSequence(); 190 codingSequence = codingSequence + nextCodingSequence.substring(0, 2); 191 } 192 } 193 } 194 195 196 // logger.debug("Coding Sequence: {}", codingSequence); 197 198 DNASequence dnaCodingSequence = null; 199 try { 200 dnaCodingSequence = new DNASequence(codingSequence.toUpperCase()); 201 } catch (CompoundNotFoundException e) { 202 // if I understand this should not happen, please correct if I'm wrong - JD 2014-10-24 203 logger.error("Could not create DNA coding sequence, {}. This is most likely a bug.", e.getMessage()); 204 } 205 RNASequence rnaCodingSequence = dnaCodingSequence.getRNASequence(TranscriptionEngine.getDefault()); 206 ProteinSequence proteinSequence = rnaCodingSequence.getProteinSequence(TranscriptionEngine.getDefault()); 207 proteinSequence.setAccession(new AccessionID(cdsSequence.getAccession().getID())); 208 proteinSequence.setParentDNASequence(cdsSequence, 1, cdsSequence.getLength()); 209 proteinSequenceList.add(proteinSequence); 210 } 211 return proteinSequenceList; 212 } 213 214 /** 215 * Get the stitched together CDS sequences then maps to the cDNA 216 * @return 217 */ 218 public DNASequence getDNACodingSequence() { 219 StringBuilder sb = new StringBuilder(); 220 for (CDSSequence cdsSequence : cdsSequenceList) { 221 sb.append(cdsSequence.getCodingSequence()); 222 } 223 224 DNASequence dnaSequence = null; 225 try { 226 dnaSequence = new DNASequence(sb.toString().toUpperCase()); 227 } catch (CompoundNotFoundException e) { 228 // if I understand this should not happen, please correct if I'm wrong - JD 2014-10-24 229 logger.error("Could not create DNA coding sequence, {}. This is most likely a bug.", e.getMessage()); 230 } 231 dnaSequence.setAccession(new AccessionID(this.getAccession().getID())); 232 return dnaSequence; 233 } 234 235 /** 236 * Get the protein sequence 237 * @return 238 */ 239 public ProteinSequence getProteinSequence() { 240 return getProteinSequence(TranscriptionEngine.getDefault()); 241 } 242 243 /** 244 * Get the protein sequence with user defined TranscriptEngine 245 * @param engine 246 * @return 247 */ 248 public ProteinSequence getProteinSequence(TranscriptionEngine engine) { 249 DNASequence dnaCodingSequence = getDNACodingSequence(); 250 RNASequence rnaCodingSequence = dnaCodingSequence.getRNASequence(engine); 251 ProteinSequence proteinSequence = rnaCodingSequence.getProteinSequence(engine); 252 proteinSequence.setAccession(new AccessionID(this.getAccession().getID())); 253 254 return proteinSequence; 255 } 256 257 /** 258 * @return the startCodonSequence 259 */ 260 public StartCodonSequence getStartCodonSequence() { 261 return startCodonSequence; 262 } 263 264 /** 265 * Sets the start codon sequence at given begin / end location. Note that calling this method multiple times 266 * will replace any existing value. 267 * @param accession 268 * @param begin 269 * @param end 270 */ 271 public void addStartCodonSequence(AccessionID accession, int begin, int end) { 272 this.startCodonSequence = new StartCodonSequence(this, begin, end); 273 startCodonSequence.setAccession(accession); 274 } 275 276 /** 277 * @return the stopCodonSequence 278 */ 279 public StopCodonSequence getStopCodonSequence() { 280 return stopCodonSequence; 281 } 282 283 /** 284 * Sets the stop codon sequence at given begin / end location. Note that calling this method multiple times 285 * will replace any existing value. 286 * @param accession 287 * @param begin 288 * @param end 289 */ 290 public void addStopCodonSequence(AccessionID accession, int begin, int end) { 291 this.stopCodonSequence = new StopCodonSequence(this, begin, end); 292 stopCodonSequence.setAccession(accession); 293 } 294}