001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 * Created on DATE 021 * 022 */ 023package org.biojava.nbio.core.sequence; 024 025import org.biojava.nbio.core.exceptions.CompoundNotFoundException; 026import org.biojava.nbio.core.sequence.compound.DNACompoundSet; 027import org.biojava.nbio.core.sequence.transcription.TranscriptionEngine; 028import org.slf4j.Logger; 029import org.slf4j.LoggerFactory; 030 031import java.util.ArrayList; 032import java.util.Collections; 033import java.util.LinkedHashMap; 034 035/** 036 * This is the sequence if you want to go from a gene sequence to a protein sequence. Need to start with a 037 * ChromosomeSequence then getting a GeneSequence and then a TranscriptSequence 038 * @author Scooter Willis 039 */ 040public class TranscriptSequence extends DNASequence { 041 042 private final static Logger logger = LoggerFactory.getLogger(TranscriptSequence.class); 043 044 private final ArrayList<CDSSequence> cdsSequenceList = new ArrayList<CDSSequence>(); 045 private final LinkedHashMap<String, CDSSequence> cdsSequenceHashMap = new LinkedHashMap<String, CDSSequence>(); 046 private StartCodonSequence startCodonSequence = null; 047 private StopCodonSequence stopCodonSequence = null; 048 private GeneSequence parentGeneSequence = null; 049 050 /** 051 * 052 * @param parentDNASequence 053 * @param begin 054 * @param end inclusive of end 055 */ 056 public TranscriptSequence(GeneSequence parentDNASequence, int begin, int end) { 057 setParentSequence(parentDNASequence); 058 this.parentGeneSequence = parentDNASequence; 059 setBioBegin(begin); 060 setBioEnd(end); 061 this.setCompoundSet(DNACompoundSet.getDNACompoundSet()); 062 063 } 064 065 @Override 066 public int getLength() { 067 return Math.abs(this.getBioEnd() - this.getBioBegin()) + 1; 068 } 069 070 /** 071 * @return the strand 072 */ 073 public Strand getStrand() { 074 return parentGeneSequence.getStrand(); 075 } 076 077 /** 078 * Remove a CDS or coding sequence from the transcript sequence 079 * @param accession 080 * @return 081 */ 082 public CDSSequence removeCDS(String accession) { 083 for (CDSSequence cdsSequence : cdsSequenceList) { 084 if (cdsSequence.getAccession().getID().equals(accession)) { 085 cdsSequenceList.remove(cdsSequence); 086 cdsSequenceHashMap.remove(accession); 087 return cdsSequence; 088 } 089 } 090 return null; 091 } 092 093 /** 094 * Get the CDS sequences that have been added to the TranscriptSequences 095 * @return 096 */ 097 public LinkedHashMap<String, CDSSequence> getCDSSequences() { 098 return cdsSequenceHashMap; 099 } 100 101 /** 102 * Add a Coding Sequence region with phase to the transcript sequence 103 * @param accession 104 * @param begin 105 * @param end 106 * @param phase 0,1,2 107 * @return 108 */ 109 public CDSSequence addCDS(AccessionID accession, int begin, int end, int phase) throws Exception { 110 if (cdsSequenceHashMap.containsKey(accession.getID())) { 111 throw new Exception("Duplicate accesion id " + accession.getID()); 112 } 113 CDSSequence cdsSequence = new CDSSequence(this, begin, end, phase); //sense should be the same as parent 114 cdsSequence.setAccession(accession); 115 cdsSequenceList.add(cdsSequence); 116 Collections.sort(cdsSequenceList, new CDSComparator()); 117 cdsSequenceHashMap.put(accession.getID(), cdsSequence); 118 return cdsSequence; 119 } 120 121 /** 122 * http://www.sequenceontology.org/gff3.shtml 123 * http://biowiki.org/~yam/bioe131/GFF.ppt 124 * @return 125 */ 126 /** 127 * Return a list of protein sequences based on each CDS sequence 128 * where the phase shift between two CDS sequences is assigned to the 129 * CDS sequence that starts the triplet. This can be used to map 130 * a CDS/exon region of a protein sequence back to the DNA sequence 131 * If you have a protein sequence and a predicted gene you can take the 132 * predict CDS protein sequences and align back to the protein sequence. 133 * If you have errors in mapping the predicted protein CDS regions to 134 * an the known protein sequence then you can identify possible errors 135 * in the prediction 136 * 137 * @return 138 */ 139 public ArrayList<ProteinSequence> getProteinCDSSequences() { 140 ArrayList<ProteinSequence> proteinSequenceList = new ArrayList<ProteinSequence>(); 141 for (int i = 0; i < cdsSequenceList.size(); i++) { 142 CDSSequence cdsSequence = cdsSequenceList.get(i); 143 String codingSequence = cdsSequence.getCodingSequence(); 144 // logger.debug("CDS {} {} = {}", getStrand(), cdsSequence.getPhase(), codingSequence); 145 if (this.getStrand() == Strand.NEGATIVE) { 146 if (cdsSequence.phase == 1) { 147 codingSequence = codingSequence.substring(1, codingSequence.length()); 148 } else if (cdsSequence.phase == 2) { 149 codingSequence = codingSequence.substring(2, codingSequence.length()); 150 } 151 if (i < cdsSequenceList.size() - 1) { 152 CDSSequence nextCDSSequence = cdsSequenceList.get(i + 1); 153 if (nextCDSSequence.phase == 1) { 154 String nextCodingSequence = nextCDSSequence.getCodingSequence(); 155 codingSequence = codingSequence + nextCodingSequence.substring(0, 1); 156 } else if (nextCDSSequence.phase == 2) { 157 String nextCodingSequence = nextCDSSequence.getCodingSequence(); 158 codingSequence = codingSequence + nextCodingSequence.substring(0, 2); 159 } 160 } 161 } else { 162 if (cdsSequence.phase == 1) { 163 codingSequence = codingSequence.substring(1, codingSequence.length()); 164 } else if (cdsSequence.phase == 2) { 165 codingSequence = codingSequence.substring(2, codingSequence.length()); 166 } 167 if (i < cdsSequenceList.size() - 1) { 168 CDSSequence nextCDSSequence = cdsSequenceList.get(i + 1); 169 if (nextCDSSequence.phase == 1) { 170 String nextCodingSequence = nextCDSSequence.getCodingSequence(); 171 codingSequence = codingSequence + nextCodingSequence.substring(0, 1); 172 } else if (nextCDSSequence.phase == 2) { 173 String nextCodingSequence = nextCDSSequence.getCodingSequence(); 174 codingSequence = codingSequence + nextCodingSequence.substring(0, 2); 175 } 176 } 177 } 178 179 180 // logger.debug("Coding Sequence: {}", codingSequence); 181 182 DNASequence dnaCodingSequence = null; 183 try { 184 dnaCodingSequence = new DNASequence(codingSequence.toUpperCase()); 185 } catch (CompoundNotFoundException e) { 186 // if I understand this should not happen, please correct if I'm wrong - JD 2014-10-24 187 logger.error("Could not create DNA coding sequence, {}. This is most likely a bug.", e.getMessage()); 188 } 189 RNASequence rnaCodingSequence = dnaCodingSequence.getRNASequence(TranscriptionEngine.getDefault()); 190 ProteinSequence proteinSequence = rnaCodingSequence.getProteinSequence(TranscriptionEngine.getDefault()); 191 proteinSequence.setAccession(new AccessionID(cdsSequence.getAccession().getID())); 192 proteinSequence.setParentDNASequence(cdsSequence, 1, cdsSequence.getLength()); 193 proteinSequenceList.add(proteinSequence); 194 } 195 return proteinSequenceList; 196 } 197 198 /** 199 * Get the stitched together CDS sequences then maps to the cDNA 200 * @return 201 */ 202 public DNASequence getDNACodingSequence() { 203 StringBuilder sb = new StringBuilder(); 204 for (CDSSequence cdsSequence : cdsSequenceList) { 205 sb.append(cdsSequence.getCodingSequence()); 206 } 207 208 DNASequence dnaSequence = null; 209 try { 210 dnaSequence = new DNASequence(sb.toString().toUpperCase()); 211 } catch (CompoundNotFoundException e) { 212 // if I understand this should not happen, please correct if I'm wrong - JD 2014-10-24 213 logger.error("Could not create DNA coding sequence, {}. This is most likely a bug.", e.getMessage()); 214 } 215 dnaSequence.setAccession(new AccessionID(this.getAccession().getID())); 216 return dnaSequence; 217 } 218 219 /** 220 * Get the protein sequence 221 * @return 222 */ 223 public ProteinSequence getProteinSequence() { 224 return getProteinSequence(TranscriptionEngine.getDefault()); 225 } 226 227 /** 228 * Get the protein sequence with user defined TranscriptEngine 229 * @param engine 230 * @return 231 */ 232 public ProteinSequence getProteinSequence(TranscriptionEngine engine) { 233 DNASequence dnaCodingSequence = getDNACodingSequence(); 234 RNASequence rnaCodingSequence = dnaCodingSequence.getRNASequence(engine); 235 ProteinSequence proteinSequence = rnaCodingSequence.getProteinSequence(engine); 236 proteinSequence.setAccession(new AccessionID(this.getAccession().getID())); 237 238 return proteinSequence; 239 } 240 241 /** 242 * @return the startCodonSequence 243 */ 244 public StartCodonSequence getStartCodonSequence() { 245 return startCodonSequence; 246 } 247 248 /** 249 * @param startCodonSequence the startCodonSequence to set 250 */ 251 public void addStartCodonSequence(AccessionID accession, int begin, int end) { 252 this.startCodonSequence = new StartCodonSequence(this, begin, end); 253 startCodonSequence.setAccession(accession); 254 } 255 256 /** 257 * @return the stopCodonSequence 258 */ 259 public StopCodonSequence getStopCodonSequence() { 260 return stopCodonSequence; 261 } 262 263 /** 264 * @param stopCodonSequence the stopCodonSequence to set 265 */ 266 public void addStopCodonSequence(AccessionID accession, int begin, int end) { 267 this.stopCodonSequence = new StopCodonSequence(this, begin, end); 268 stopCodonSequence.setAccession(accession); 269 } 270}