001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 * Created on DATE 021 * 022 */ 023package org.biojava.nbio.core.sequence; 024 025import org.biojava.nbio.core.exceptions.CompoundNotFoundException; 026import org.biojava.nbio.core.sequence.compound.DNACompoundSet; 027import org.biojava.nbio.core.sequence.transcription.TranscriptionEngine; 028import org.slf4j.Logger; 029import org.slf4j.LoggerFactory; 030 031import java.util.ArrayList; 032import java.util.Collections; 033import java.util.LinkedHashMap; 034import java.util.List; 035import java.util.Map; 036 037/** 038 * This is the sequence if you want to go from a gene sequence to a protein sequence. Need to start with a 039 * ChromosomeSequence then getting a GeneSequence and then a TranscriptSequence 040 * @author Scooter Willis 041 */ 042public class TranscriptSequence extends DNASequence { 043 044 private final static Logger logger = LoggerFactory.getLogger(TranscriptSequence.class); 045 046 private final List<CDSSequence> cdsSequenceList = new ArrayList<>(); 047 private final Map<String, CDSSequence> cdsSequenceHashMap = new LinkedHashMap<>(); 048 private StartCodonSequence startCodonSequence = null; 049 private StopCodonSequence stopCodonSequence = null; 050 private GeneSequence parentGeneSequence = null; 051 052 /** 053 * Use {@code}public TranscriptSequence(GeneSequence parentDNASequence, AccessionID accessionID, int begin, int end){@code} 054 * that requires an explicit accessionID 055 * @deprecated 056 */ 057 public TranscriptSequence(GeneSequence parentDNASequence, int begin, int end) { 058 setCompoundSet(DNACompoundSet.getDNACompoundSet()); 059 try { 060 initSequenceStorage(parentDNASequence.getSequenceAsString()); 061 } catch (CompoundNotFoundException e) { 062 throw new IllegalArgumentException(e); 063 } 064 setParentSequence(parentDNASequence); 065 this.parentGeneSequence = parentDNASequence; 066 setBioBegin(begin); 067 setBioEnd(end); 068 } 069 070 /** 071 * 072 * @param parentDNASequence 073 * @param accessionID 074 * @param begin 075 * @param end inclusive of end 076 * @throws IllegalArgumentException if the parentDNASequence is incompatible with DNACompoundSet 077 */ 078 public TranscriptSequence(GeneSequence parentDNASequence, AccessionID accessionID, int begin, int end) { 079 this(parentDNASequence, begin, end); 080 setAccession(accessionID); 081 } 082 083 @Override 084 public int getLength() { 085 return Math.abs(this.getBioEnd() - this.getBioBegin()) + 1; 086 } 087 088 /** 089 * @return the strand 090 */ 091 public Strand getStrand() { 092 return parentGeneSequence.getStrand(); 093 } 094 095 /** 096 * Remove a CDS or coding sequence from the transcript sequence 097 * @param accession 098 * @return 099 */ 100 public CDSSequence removeCDS(String accession) { 101 for (CDSSequence cdsSequence : cdsSequenceList) { 102 if (cdsSequence.getAccession().getID().equals(accession)) { 103 cdsSequenceList.remove(cdsSequence); 104 cdsSequenceHashMap.remove(accession); 105 return cdsSequence; 106 } 107 } 108 return null; 109 } 110 111 /** 112 * Get the CDS sequences that have been added to the TranscriptSequences 113 * @return 114 */ 115 public Map<String, CDSSequence> getCDSSequences() { 116 return cdsSequenceHashMap; 117 } 118 119 /** 120 * Add a Coding Sequence region with phase to the transcript sequence 121 * @param accession 122 * @param begin 123 * @param end 124 * @param phase 0,1,2 125 * @return 126 */ 127 public CDSSequence addCDS(AccessionID accession, int begin, int end, int phase) throws Exception { 128 if (cdsSequenceHashMap.containsKey(accession.getID())) { 129 throw new Exception("Duplicate accession id " + accession.getID()); 130 } 131 CDSSequence cdsSequence = new CDSSequence(this, begin, end, phase); //sense should be the same as parent 132 cdsSequence.setAccession(accession); 133 cdsSequenceList.add(cdsSequence); 134 Collections.sort(cdsSequenceList, new CDSComparator()); 135 cdsSequenceHashMap.put(accession.getID(), cdsSequence); 136 return cdsSequence; 137 } 138 139 /** 140 * http://www.sequenceontology.org/gff3.shtml 141 * http://biowiki.org/~yam/bioe131/GFF.ppt 142 * @return 143 */ 144 /** 145 * Return a list of protein sequences based on each CDS sequence 146 * where the phase shift between two CDS sequences is assigned to the 147 * CDS sequence that starts the triplet. This can be used to map 148 * a CDS/exon region of a protein sequence back to the DNA sequence 149 * If you have a protein sequence and a predicted gene you can take the 150 * predict CDS protein sequences and align back to the protein sequence. 151 * If you have errors in mapping the predicted protein CDS regions to 152 * an the known protein sequence then you can identify possible errors 153 * in the prediction 154 * 155 * @return 156 */ 157 public List<ProteinSequence> getProteinCDSSequences() { 158 List<ProteinSequence> proteinSequenceList = new ArrayList<>(); 159 for (int i = 0; i < cdsSequenceList.size(); i++) { 160 CDSSequence cdsSequence = cdsSequenceList.get(i); 161 String codingSequence = cdsSequence.getCodingSequence(); 162 // logger.debug("CDS {} {} = {}", getStrand(), cdsSequence.getPhase(), codingSequence); 163 if (this.getStrand() == Strand.NEGATIVE) { 164 if (cdsSequence.phase == 1) { 165 codingSequence = codingSequence.substring(1, codingSequence.length()); 166 } else if (cdsSequence.phase == 2) { 167 codingSequence = codingSequence.substring(2, codingSequence.length()); 168 } 169 if (i < cdsSequenceList.size() - 1) { 170 CDSSequence nextCDSSequence = cdsSequenceList.get(i + 1); 171 if (nextCDSSequence.phase == 1) { 172 String nextCodingSequence = nextCDSSequence.getCodingSequence(); 173 codingSequence = codingSequence + nextCodingSequence.substring(0, 1); 174 } else if (nextCDSSequence.phase == 2) { 175 String nextCodingSequence = nextCDSSequence.getCodingSequence(); 176 codingSequence = codingSequence + nextCodingSequence.substring(0, 2); 177 } 178 } 179 } else { 180 if (cdsSequence.phase == 1) { 181 codingSequence = codingSequence.substring(1, codingSequence.length()); 182 } else if (cdsSequence.phase == 2) { 183 codingSequence = codingSequence.substring(2, codingSequence.length()); 184 } 185 if (i < cdsSequenceList.size() - 1) { 186 CDSSequence nextCDSSequence = cdsSequenceList.get(i + 1); 187 if (nextCDSSequence.phase == 1) { 188 String nextCodingSequence = nextCDSSequence.getCodingSequence(); 189 codingSequence = codingSequence + nextCodingSequence.substring(0, 1); 190 } else if (nextCDSSequence.phase == 2) { 191 String nextCodingSequence = nextCDSSequence.getCodingSequence(); 192 codingSequence = codingSequence + nextCodingSequence.substring(0, 2); 193 } 194 } 195 } 196 197 198 // logger.debug("Coding Sequence: {}", codingSequence); 199 200 DNASequence dnaCodingSequence = null; 201 try { 202 dnaCodingSequence = new DNASequence(codingSequence.toUpperCase()); 203 } catch (CompoundNotFoundException e) { 204 // if I understand this should not happen, please correct if I'm wrong - JD 2014-10-24 205 logger.error("Could not create DNA coding sequence, {}. This is most likely a bug.", e.getMessage()); 206 } 207 RNASequence rnaCodingSequence = dnaCodingSequence.getRNASequence(TranscriptionEngine.getDefault()); 208 ProteinSequence proteinSequence = rnaCodingSequence.getProteinSequence(TranscriptionEngine.getDefault()); 209 proteinSequence.setAccession(new AccessionID(cdsSequence.getAccession().getID())); 210 proteinSequence.setParentDNASequence(cdsSequence, 1, cdsSequence.getLength()); 211 proteinSequenceList.add(proteinSequence); 212 } 213 return proteinSequenceList; 214 } 215 216 /** 217 * Get the stitched together CDS sequences then maps to the cDNA 218 * @return 219 */ 220 public DNASequence getDNACodingSequence() { 221 StringBuilder sb = new StringBuilder(); 222 for (CDSSequence cdsSequence : cdsSequenceList) { 223 sb.append(cdsSequence.getCodingSequence()); 224 } 225 226 DNASequence dnaSequence = null; 227 try { 228 dnaSequence = new DNASequence(sb.toString().toUpperCase()); 229 } catch (CompoundNotFoundException e) { 230 // if I understand this should not happen, please correct if I'm wrong - JD 2014-10-24 231 logger.error("Could not create DNA coding sequence, {}. This is most likely a bug.", e.getMessage()); 232 } 233 dnaSequence.setAccession(new AccessionID(this.getAccession().getID())); 234 return dnaSequence; 235 } 236 237 /** 238 * Get the protein sequence 239 * @return 240 */ 241 public ProteinSequence getProteinSequence() { 242 return getProteinSequence(TranscriptionEngine.getDefault()); 243 } 244 245 /** 246 * Get the protein sequence with user defined TranscriptEngine 247 * @param engine 248 * @return 249 */ 250 public ProteinSequence getProteinSequence(TranscriptionEngine engine) { 251 DNASequence dnaCodingSequence = getDNACodingSequence(); 252 RNASequence rnaCodingSequence = dnaCodingSequence.getRNASequence(engine); 253 ProteinSequence proteinSequence = rnaCodingSequence.getProteinSequence(engine); 254 proteinSequence.setAccession(new AccessionID(this.getAccession().getID())); 255 256 return proteinSequence; 257 } 258 259 /** 260 * @return the startCodonSequence 261 */ 262 public StartCodonSequence getStartCodonSequence() { 263 return startCodonSequence; 264 } 265 266 /** 267 * Sets the start codon sequence at given begin / end location. Note that calling this method multiple times 268 * will replace any existing value. 269 * @param accession 270 * @param begin 271 * @param end 272 */ 273 public void addStartCodonSequence(AccessionID accession, int begin, int end) { 274 this.startCodonSequence = new StartCodonSequence(this, begin, end); 275 startCodonSequence.setAccession(accession); 276 } 277 278 /** 279 * @return the stopCodonSequence 280 */ 281 public StopCodonSequence getStopCodonSequence() { 282 return stopCodonSequence; 283 } 284 285 /** 286 * Sets the stop codon sequence at given begin / end location. Note that calling this method multiple times 287 * will replace any existing value. 288 * @param accession 289 * @param begin 290 * @param end 291 */ 292 public void addStopCodonSequence(AccessionID accession, int begin, int end) { 293 this.stopCodonSequence = new StopCodonSequence(this, begin, end); 294 stopCodonSequence.setAccession(accession); 295 } 296}