001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 * Created on DATE 021 * 022 */ 023package org.biojava.nbio.core.sequence; 024 025import org.biojava.nbio.core.exceptions.CompoundNotFoundException; 026import org.biojava.nbio.core.sequence.compound.DNACompoundSet; 027import org.biojava.nbio.core.sequence.compound.NucleotideCompound; 028import org.biojava.nbio.core.sequence.template.CompoundSet; 029import org.slf4j.Logger; 030import org.slf4j.LoggerFactory; 031 032import java.util.ArrayList; 033import java.util.Collections; 034import java.util.LinkedHashMap; 035import java.util.Map; 036import java.util.List; 037 038/** 039 * 040 * @author Scooter Willis 041 */ 042public class GeneSequence extends DNASequence { 043 044 private final static Logger logger = LoggerFactory.getLogger(GeneSequence.class); 045 046 private final Map<String, TranscriptSequence> transcriptSequenceHashMap = new LinkedHashMap<>(); 047 private final Map<String, IntronSequence> intronSequenceHashMap = new LinkedHashMap<>(); 048 private final Map<String, ExonSequence> exonSequenceHashMap = new LinkedHashMap<>(); 049 private final List<IntronSequence> intronSequenceList = new ArrayList<>(); 050 private final List<ExonSequence> exonSequenceList = new ArrayList<>(); 051 boolean intronAdded = false; // need to deal with the problem that typically introns are not added when validating the list and adding in introns as the regions not included in exons 052 private Strand strand = Strand.UNDEFINED; 053 private ChromosomeSequence chromosomeSequence; 054 055 /** 056 * Use GeneSequence(ChromosomeSequence parentSequence, AccessionID accessionId, int begin, int end, Strand strand) 057 * which mandates an accessionID. 058 * @param parentSequence 059 * @param begin 060 * @param end inclusive of end 061 * @param strand force a gene to have strand and transcription sequence will inherit 062 * @deprecated 063 */ 064 public GeneSequence(ChromosomeSequence parentSequence, int begin, int end, Strand strand) { 065 setCompoundSet(DNACompoundSet.getDNACompoundSet()); 066 try { 067 initSequenceStorage(parentSequence.getSequenceAsString()); 068 } catch (CompoundNotFoundException e) { 069 throw new IllegalArgumentException(e); 070 } 071 chromosomeSequence = parentSequence; 072 setParentSequence(parentSequence); 073 setBioBegin(begin); 074 setBioEnd(end); 075 setStrand(strand); 076 } 077 078 /** 079 * A class that keeps track of the details of a GeneSequence which is difficult to properly model. Two important concepts that is difficult 080 * to make everything flexible but still work. You can have GFF features that only describe Exons or Exons/Introns or CDS regions and one 081 * or more Transcriptions. You can have exon sequences but that does not imply transcription to the actual protein. 082 * 083 * The GeneSequence will keep track of Exons and Introns but to get a Protein sequence you need to start with a 084 * TranscriptSequence and then add CDS sequences. 085 * 086 * This is also a key class in the biojava-3-genome module for reading and writing GFF3 files 087 * 088 * @param parentSequence 089 * @param accessionId An identifier for the gene. 090 * @param begin 091 * @param end 092 * @param strand force a gene to have strand and transcription sequence will inherit 093 */ 094 public GeneSequence(ChromosomeSequence parentSequence, AccessionID accessionId, int begin, int end, Strand strand) { 095 this(parentSequence,begin,end,strand); 096 setAccession(accessionId); 097 } 098 099 /** 100 * The parent ChromosomeSequence which contains the actual DNA sequence data 101 * @return Chromosome sequence 102 */ 103 public ChromosomeSequence getParentChromosomeSequence() { 104 return chromosomeSequence; 105 } 106 107 @Override 108 public int getLength() { 109 return Math.abs(this.getBioEnd() - this.getBioBegin()) + 1; 110 } 111 112 /** 113 * Once everything has been added to the gene sequence where you might have added exon sequences only then you 114 * can infer the intron sequences and add them. You may also have the case where you only added one or more 115 * TranscriptSequences and from that you can infer the exon sequences and intron sequences. 116 * Currently not implement 117 */ 118 public void addIntronsUsingExons() throws Exception { 119 if (intronAdded) { //going to assume introns are correct 120 return; 121 } 122 if (exonSequenceList.size() == 0) { 123 return; 124 } 125 ExonComparator exonComparator = new ExonComparator(); 126 //sort based on start position and sense; 127 Collections.sort(exonSequenceList, exonComparator); 128 int shift = -1; 129 if (getStrand() == Strand.NEGATIVE) { 130 shift = 1; 131 } 132 //ExonSequence firstExonSequence = exonSequenceList.get(0); 133 int intronIndex = 1; 134// if (firstExonSequence.getBioBegin().intValue() != getBioBegin().intValue()) { 135// this.addIntron(new AccessionID(this.getAccession().getID() + "-" + "intron" + intronIndex), getBioBegin(), firstExonSequence.getBioBegin() + shift); 136// intronIndex++; 137// } 138 for (int i = 0; i < exonSequenceList.size() - 1; i++) { 139 ExonSequence exon1 = exonSequenceList.get(i); 140 ExonSequence exon2 = exonSequenceList.get(i + 1); 141 AccessionID intronId= new AccessionID(this.getAccession().getID() + "-" + "intron" + intronIndex); 142 this.addIntron(intronId, exon1.getBioEnd() - shift, exon2.getBioBegin() + shift); 143 intronIndex++; 144 } 145 146// ExonSequence lastExonSequence = exonSequenceList.get(exonSequenceList.size() - 1); 147// if (lastExonSequence.getBioEnd().intValue() != getBioEnd().intValue()) { 148// this.addIntron(new AccessionID(this.getAccession().getID() + "-" + "intron" + intronIndex), lastExonSequence.getBioEnd() - shift, getBioEnd()); 149// intronIndex++; 150// } 151 152 // log.severe("Add in support for building introns based on added exons"); 153 154 } 155 156 /** 157 * A gene should have Strand 158 * @return the strand 159 */ 160 public Strand getStrand() { 161 return strand; 162 } 163 164 /** 165 * @param strand the strand to set 166 */ 167 public void setStrand(Strand strand) { 168 this.strand = strand; 169 } 170 171 /** 172 * Get the transcript sequence by accession 173 * @param accession 174 * @return the transcript 175 */ 176 public TranscriptSequence getTranscript(String accession) { 177 return transcriptSequenceHashMap.get(accession); 178 } 179 180 /** 181 * Get the collection of transcription sequences assigned to this gene 182 * @return transcripts 183 */ 184 public Map<String, TranscriptSequence> getTranscripts() { 185 return transcriptSequenceHashMap; 186 } 187 188 /** 189 * Remove the transcript sequence from the gene 190 * @param accession 191 * @return transcriptsequence 192 */ 193 public TranscriptSequence removeTranscript(String accession) { 194 return transcriptSequenceHashMap.remove(accession); 195 } 196 197 /** 198 * Add a transcription sequence to a gene which describes a ProteinSequence 199 * @param accession 200 * @param begin 201 * @param end 202 * @return transcript sequence 203 * @throws Exception If the accession id is already used 204 */ 205 public TranscriptSequence addTranscript(AccessionID accession, int begin, int end) throws Exception { 206 if (transcriptSequenceHashMap.containsKey(accession.getID())) { 207 throw new Exception("Duplicate accesion id " + accession.getID()); 208 } 209 TranscriptSequence transcriptSequence = new TranscriptSequence(this, begin, end); 210 transcriptSequence.setAccession(accession); 211 transcriptSequenceHashMap.put(accession.getID(), transcriptSequence); 212 return transcriptSequence; 213 } 214 215 /** 216 * Remove the intron by accession 217 * @param accession 218 * @return the removed intron sequence, or null if no intron with that accession exists. 219 */ 220 public IntronSequence removeIntron(String accession) { 221 for (IntronSequence intronSequence : intronSequenceList) { 222 if (intronSequence.getAccession().getID().equals(accession)) { 223 intronSequenceList.remove(intronSequence); 224 intronSequenceHashMap.remove(accession); 225 return intronSequence; 226 } 227 } 228 return null; 229 } 230 231 /** 232 * Add an Intron Currently used to mark an IntronSequence as a feature 233 * @param accession 234 * @param begin 235 * @param end 236 * @return intron sequence 237 */ 238 public IntronSequence addIntron(AccessionID accession, int begin, int end) throws Exception { 239 if (intronSequenceHashMap.containsKey(accession.getID())) { 240 throw new Exception("Duplicate accesion id " + accession.getID()); 241 } 242 intronAdded = true; 243 IntronSequence intronSequence = new IntronSequence(this, begin, end); // working off the assumption that intron frame is always 0 or doesn't matter and same sense as parent 244 intronSequence.setAccession(accession); 245 intronSequenceList.add(intronSequence); 246 intronSequenceHashMap.put(accession.getID(), intronSequence); 247 return intronSequence; 248 } 249 250 /** 251 * Remove the exon sequence 252 * @param accession 253 * @return exon sequence 254 */ 255 public ExonSequence removeExon(String accession) { 256 for (ExonSequence exonSequence : exonSequenceList) { 257 if (exonSequence.getAccession().getID().equals(accession)) { 258 exonSequenceList.remove(exonSequence); 259 exonSequenceHashMap.remove(accession); 260 // we now have a new gap which creates an intron 261 intronSequenceList.clear(); 262 intronSequenceHashMap.clear(); 263 intronAdded = false; 264 try{ 265 addIntronsUsingExons(); 266 } catch(Exception e){ 267 logger.error("Remove Exon validate() error " + e.getMessage()); 268 } 269 return exonSequence; 270 } 271 } 272 return null; 273 } 274 275 /** 276 * Add an ExonSequence mainly used to mark as a feature 277 * @param accession 278 * @param begin 279 * @param end 280 * @return exon sequence 281 * @throws IllegalArgumentException if accessionID is already added. 282 */ 283 public ExonSequence addExon(AccessionID accession, int begin, int end) { 284 if (exonSequenceHashMap.containsKey(accession.getID())) { 285 throw new IllegalArgumentException("Duplicate accession id: " + accession.getID()); 286 } 287 288 ExonSequence exonSequence = new ExonSequence(this, begin, end); //sense should be the same as parent 289 exonSequence.setAccession(accession); 290 exonSequenceList.add(exonSequence); 291 exonSequenceHashMap.put(accession.getID(), exonSequence); 292 return exonSequence; 293 } 294 295 /** 296 * Get the exons as an ArrayList. Modifying this list will not modify the underlying collection 297 * @return exons 298 */ 299 public List<ExonSequence> getExonSequences() { 300 return new ArrayList<>(exonSequenceList); 301 } 302 303 /** 304 * Get the introns as an ArrayList. Modifying this list will not modify the underlying collection 305 * @return introns 306 */ 307 public List<IntronSequence> getIntronSequences() { 308 return new ArrayList<>(intronSequenceList); 309 } 310 311 /** 312 * Try to give method clarity where you want a DNASequence coding in the 5' to 3' direction 313 * Returns the DNASequence representative of the 5' and 3' reading based on strand 314 * @return dna sequence or null if sequence could not be generated. 315 */ 316 public DNASequence getSequence5PrimeTo3Prime() { 317 String sequence = getSequenceAsString(this.getBioBegin(), this.getBioEnd(), this.getStrand()); 318 if (getStrand() == Strand.NEGATIVE) { 319 //need to take complement of sequence because it is negative and we are returning the gene sequence from the opposite strand 320 StringBuilder b = new StringBuilder(getLength()); 321 CompoundSet<NucleotideCompound> compoundSet = this.getCompoundSet(); 322 for (int i = 0; i < sequence.length(); i++) { 323 String nucleotide = String.valueOf(sequence.charAt(i)); 324 NucleotideCompound nucleotideCompound = compoundSet.getCompoundForString(nucleotide); 325 b.append(nucleotideCompound.getComplement().getShortName()); 326 } 327 sequence = b.toString(); 328 } 329 DNASequence dnaSequence = null; 330 try { 331 dnaSequence = new DNASequence(sequence.toUpperCase()); 332 dnaSequence.setAccession(new AccessionID(this.getAccession().getID())); 333 } catch (CompoundNotFoundException e) { 334 // this should not happen, the sequence is DNA originally, if it does, there's a bug somewhere 335 logger.error("Could not create new DNA sequence in getSequence5PrimeTo3Prime(). Error: {}",e.getMessage()); 336 } 337 return dnaSequence; 338 } 339}