/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 * Created on DATE
 *
 */
package org.biojava.nbio.core.sequence;

import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
import org.biojava.nbio.core.sequence.compound.DNACompoundSet;
import org.biojava.nbio.core.sequence.compound.NucleotideCompound;
import org.biojava.nbio.core.sequence.template.CompoundSet;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.ArrayList;
import java.util.Collections;
import java.util.LinkedHashMap;

/**
 * A class that keeps track of the details of a gene, which is difficult to model properly: it is hard to make
 * everything flexible and still work. You can have GFF features that only describe Exons, or Exons/Introns, or
 * CDS regions and one or more Transcriptions. You can have exon sequences, but that does not imply transcription
 * to the actual protein.
 * <p>
 * The GeneSequence will keep track of Exons and Introns, but to get a Protein sequence you need to start with a
 * TranscriptSequence and then add CDS sequences.
 * <p>
 * This is also a key class in the biojava-3-genome module for reading and writing GFF3 files.
 *
 * @author Scooter Willis
 */
public class GeneSequence extends DNASequence {

    private static final Logger logger = LoggerFactory.getLogger(GeneSequence.class);

    // Transcripts, introns and exons indexed by the accession ID they were added with (insertion ordered).
    private final LinkedHashMap<String, TranscriptSequence> transcriptSequenceHashMap = new LinkedHashMap<String, TranscriptSequence>();
    private final LinkedHashMap<String, IntronSequence> intronSequenceHashMap = new LinkedHashMap<String, IntronSequence>();
    private final LinkedHashMap<String, ExonSequence> exonSequenceHashMap = new LinkedHashMap<String, ExonSequence>();
    private final ArrayList<IntronSequence> intronSequenceList = new ArrayList<IntronSequence>();
    private final ArrayList<ExonSequence> exonSequenceList = new ArrayList<ExonSequence>();
    // Need to deal with the problem that typically introns are not added when validating the list and
    // adding in introns as the regions not included in exons. Set by addIntron(); while true,
    // addIntronsUsingExons() is a no-op.
    boolean intronAdded = false;
    private Strand strand = Strand.UNDEFINED;
    private ChromosomeSequence chromosomeSequence;

    /**
     * Creates a gene located on the given chromosome sequence.
     *
     * @param parentSequence the chromosome sequence this gene is located on
     * @param begin the bio begin position of the gene
     * @param end the bio end position of the gene, inclusive of end
     * @param strand force a gene to have strand and transcription sequence will inherit
     */
    public GeneSequence(ChromosomeSequence parentSequence, int begin, int end, Strand strand) {
        chromosomeSequence = parentSequence;
        setParentSequence(parentSequence);
        setBioBegin(begin);
        setBioEnd(end);
        setStrand(strand);
        this.setCompoundSet(DNACompoundSet.getDNACompoundSet());
    }

    /**
     * The parent ChromosomeSequence which contains the actual DNA sequence data
     * @return Chromosome sequence
     */
    public ChromosomeSequence getParentChromosomeSequence() {
        return chromosomeSequence;
    }

    /**
     * The length of the gene computed from its bio begin and bio end positions (both inclusive).
     * @return the absolute difference between bio end and bio begin, plus one
     */
    @Override
    public int getLength() {
        return Math.abs(this.getBioEnd() - this.getBioBegin()) + 1;
    }

    /**
     * Once everything has been added to the gene sequence, where you might have added exon sequences only, you
     * can infer the intron sequences and add them.
     * <p>
     * Does nothing if an intron has already been added (existing introns are assumed to be correct) or if
     * no exons are present. Otherwise the exon list is sorted in place with {@link ExonComparator} and one
     * intron is added for every pair of adjacent exons, named
     * {@code <gene accession>-intron<N>} with {@code N} starting at 1.
     *
     * @throws Exception if an intron with a generated accession id already exists
     */
    public void addIntronsUsingExons() throws Exception {
        if (intronAdded) { // going to assume introns are correct
            return;
        }
        if (exonSequenceList.isEmpty()) {
            return;
        }
        // sort based on start position and sense
        Collections.sort(exonSequenceList, new ExonComparator());

        // The intron bounds are derived from the neighbouring exon bounds:
        // begin = exon1.end - shift, end = exon2.begin + shift.
        int shift = (getStrand() == Strand.NEGATIVE) ? 1 : -1;

        // Only gaps between adjacent exons become introns; the regions between the gene bounds and the
        // first/last exon are not turned into introns.
        int intronIndex = 1;
        for (int i = 0; i < exonSequenceList.size() - 1; i++) {
            ExonSequence exon1 = exonSequenceList.get(i);
            ExonSequence exon2 = exonSequenceList.get(i + 1);
            AccessionID intronAccession = new AccessionID(this.getAccession().getID() + "-" + "intron" + intronIndex);
            this.addIntron(intronAccession, exon1.getBioEnd() - shift, exon2.getBioBegin() + shift);
            intronIndex++;
        }
    }

    /**
     * A gene should have Strand
     * @return the strand
     */
    public Strand getStrand() {
        return strand;
    }

    /**
     * @param strand the strand to set
     */
    public void setStrand(Strand strand) {
        this.strand = strand;
    }

    /**
     * Get the transcript sequence by accession
     * @param accession the accession id the transcript was added with
     * @return the transcript, or null if no transcript is stored under that accession
     */
    public TranscriptSequence getTranscript(String accession) {
        return transcriptSequenceHashMap.get(accession);
    }

    /**
     * Get the collection of transcription sequences assigned to this gene
     * @return transcripts; this is the internal map, not a copy
     */
    public LinkedHashMap<String, TranscriptSequence> getTranscripts() {
        return transcriptSequenceHashMap;
    }

    /**
     * Remove the transcript sequence from the gene
     * @param accession the accession id the transcript was added with
     * @return the removed transcript sequence, or null if none was stored under that accession
     */
    public TranscriptSequence removeTranscript(String accession) {
        return transcriptSequenceHashMap.remove(accession);
    }

    /**
     * Add a transcription sequence to a gene which describes a ProteinSequence
     * @param accession the accession of the new transcript
     * @param begin the begin position passed to the new transcript
     * @param end the end position passed to the new transcript
     * @return transcript sequence
     * @throws Exception If the accession id is already used
     */
    public TranscriptSequence addTranscript(AccessionID accession, int begin, int end) throws Exception {
        if (transcriptSequenceHashMap.containsKey(accession.getID())) {
            throw new Exception("Duplicate accesion id " + accession.getID());
        }
        TranscriptSequence transcriptSequence = new TranscriptSequence(this, begin, end);
        transcriptSequence.setAccession(accession);
        transcriptSequenceHashMap.put(accession.getID(), transcriptSequence);
        return transcriptSequence;
    }

    /**
     * Remove the intron by accession
     * @param accession the accession id of the intron to remove
     * @return the removed intron sequence, or null if no intron in the list has that accession id
     */
    public IntronSequence removeIntron(String accession) {
        for (int i = 0; i < intronSequenceList.size(); i++) {
            IntronSequence intronSequence = intronSequenceList.get(i);
            if (intronSequence.getAccession().getID().equals(accession)) {
                // Remove by index so exactly the matched element is removed, without mutating the list
                // from inside a for-each loop or rescanning it with equals().
                intronSequenceList.remove(i);
                intronSequenceHashMap.remove(accession);
                return intronSequence;
            }
        }
        return null;
    }

    /**
     * Add an Intron. Currently used to mark an IntronSequence as a feature
     * @param accession the accession of the new intron
     * @param begin the begin position passed to the new intron
     * @param end the end position passed to the new intron
     * @return intron sequence
     * @throws Exception If the accession id is already used
     */
    public IntronSequence addIntron(AccessionID accession, int begin, int end) throws Exception {
        if (intronSequenceHashMap.containsKey(accession.getID())) {
            throw new Exception("Duplicate accesion id " + accession.getID());
        }
        intronAdded = true;
        // working off the assumption that intron frame is always 0 or doesn't matter and same sense as parent
        IntronSequence intronSequence = new IntronSequence(this, begin, end);
        intronSequence.setAccession(accession);
        intronSequenceList.add(intronSequence);
        intronSequenceHashMap.put(accession.getID(), intronSequence);
        return intronSequence;
    }

    /**
     * Remove the exon sequence. All introns are discarded and then rebuilt from the remaining exons,
     * because removing an exon creates a new gap. A failure while rebuilding the introns is logged and
     * does not prevent the removed exon from being returned.
     * @param accession the accession id of the exon to remove
     * @return the removed exon sequence, or null if no exon in the list has that accession id
     */
    public ExonSequence removeExon(String accession) {
        for (int i = 0; i < exonSequenceList.size(); i++) {
            ExonSequence exonSequence = exonSequenceList.get(i);
            if (exonSequence.getAccession().getID().equals(accession)) {
                // Remove by index so exactly the matched element is removed; the list is re-sorted by
                // addIntronsUsingExons() below, which must not happen inside a for-each over the list.
                exonSequenceList.remove(i);
                exonSequenceHashMap.remove(accession);
                // we now have a new gap which creates an intron
                intronSequenceList.clear();
                intronSequenceHashMap.clear();
                intronAdded = false;
                try {
                    addIntronsUsingExons();
                } catch (Exception e) {
                    // Best effort: keep the removal, but record the full cause instead of only the message.
                    logger.error("Remove Exon validate() error {}", e.getMessage(), e);
                }
                return exonSequence;
            }
        }
        return null;
    }

    /**
     * Add an ExonSequence, mainly used to mark as a feature
     * @param accession the accession of the new exon
     * @param begin the begin position passed to the new exon
     * @param end the end position passed to the new exon
     * @return exon sequence
     * @throws Exception If the accession id is already used
     */
    public ExonSequence addExon(AccessionID accession, int begin, int end) throws Exception {
        if (exonSequenceHashMap.containsKey(accession.getID())) {
            throw new Exception("Duplicate accesion id " + accession.getID());
        }

        ExonSequence exonSequence = new ExonSequence(this, begin, end); // sense should be the same as parent
        exonSequence.setAccession(accession);
        exonSequenceList.add(exonSequence);
        exonSequenceHashMap.put(accession.getID(), exonSequence);
        return exonSequence;
    }

    /**
     * Get the exons as an ArrayList
     * @return exons; this is the internal list, not a copy
     */
    public ArrayList<ExonSequence> getExonSequences() {
        return exonSequenceList;
    }

    /**
     * Get the introns as an ArrayList
     * @return introns; this is the internal list, not a copy
     */
    public ArrayList<IntronSequence> getIntronSequences() {
        return intronSequenceList;
    }

    /**
     * Try to give method clarity where you want a DNASequence coding in the 5' to 3' direction.
     * Returns the DNASequence representative of the 5' and 3' reading based on strand.
     * The returned sequence is upper-cased and carries a new AccessionID with this gene's accession id.
     *
     * @return dna sequence
     * @throws IllegalStateException if the gene's sequence string cannot be turned into a DNASequence;
     *         the underlying CompoundNotFoundException is attached as the cause
     */
    public DNASequence getSequence5PrimeTo3Prime() {
        String sequence = getSequenceAsString(this.getBioBegin(), this.getBioEnd(), this.getStrand());
        if (getStrand() == Strand.NEGATIVE) {
            // need to take complement of sequence because it is negative and we are returning the gene
            // sequence from the opposite strand
            StringBuilder b = new StringBuilder(getLength());
            CompoundSet<NucleotideCompound> compoundSet = this.getCompoundSet();
            for (int i = 0; i < sequence.length(); i++) {
                String nucleotide = String.valueOf(sequence.charAt(i));
                NucleotideCompound nucleotideCompound = compoundSet.getCompoundForString(nucleotide);
                b.append(nucleotideCompound.getComplement().getShortName());
            }
            sequence = b.toString();
        }
        DNASequence dnaSequence;
        try {
            dnaSequence = new DNASequence(sequence.toUpperCase());
        } catch (CompoundNotFoundException e) {
            // this should not happen, the sequence is DNA originally, if it does, there's a bug somewhere.
            // Fail with the real cause instead of falling through to a NullPointerException below.
            logger.error("Could not create new DNA sequence in getSequence5PrimeTo3Prime(). Error: {}", e.getMessage(), e);
            throw new IllegalStateException("Could not create new DNA sequence in getSequence5PrimeTo3Prime()", e);
        }
        dnaSequence.setAccession(new AccessionID(this.getAccession().getID()));
        return dnaSequence;
    }
}