001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 * Created on DATE 021 * 022 */ 023package org.biojava.nbio.core.sequence; 024 025import org.biojava.nbio.core.exceptions.CompoundNotFoundException; 026import org.biojava.nbio.core.sequence.compound.DNACompoundSet; 027import org.biojava.nbio.core.sequence.compound.NucleotideCompound; 028import org.biojava.nbio.core.sequence.template.CompoundSet; 029import org.slf4j.Logger; 030import org.slf4j.LoggerFactory; 031 032import java.util.ArrayList; 033import java.util.Collections; 034import java.util.LinkedHashMap; 035 036/** 037 * 038 * @author Scooter Willis 039 */ 040public class GeneSequence extends DNASequence { 041 042 private final static Logger logger = LoggerFactory.getLogger(GeneSequence.class); 043 044 private final LinkedHashMap<String, TranscriptSequence> transcriptSequenceHashMap = new LinkedHashMap<String, TranscriptSequence>(); 045 private final LinkedHashMap<String, IntronSequence> intronSequenceHashMap = new LinkedHashMap<String, IntronSequence>(); 046 private final LinkedHashMap<String, ExonSequence> exonSequenceHashMap = new LinkedHashMap<String, ExonSequence>(); 047 private final ArrayList<IntronSequence> intronSequenceList = new ArrayList<IntronSequence>(); 048 private final ArrayList<ExonSequence> exonSequenceList = new ArrayList<ExonSequence>(); 049 boolean intronAdded = false; // need to deal with the problem that typically introns are not added when validating the list and adding in introns as the regions not included in exons 050 private Strand strand = Strand.UNDEFINED; 051 private ChromosomeSequence chromosomeSequence; 052 053 /** 054 * Use GeneSequence(ChromosomeSequence parentSequence, AccessionID accessionId, int begin, int end, Strand strand) 055 * which mandates an accessionID. 056 * @param parentSequence 057 * @param begin 058 * @param end inclusive of end 059 * @param strand force a gene to have strand and transcription sequence will inherit 060 * @deprecated 061 */ 062 public GeneSequence(ChromosomeSequence parentSequence, int begin, int end, Strand strand) { 063 setCompoundSet(DNACompoundSet.getDNACompoundSet()); 064 try { 065 initSequenceStorage(parentSequence.getSequenceAsString()); 066 } catch (CompoundNotFoundException e) { 067 throw new IllegalArgumentException(e); 068 } 069 chromosomeSequence = parentSequence; 070 setParentSequence(parentSequence); 071 setBioBegin(begin); 072 setBioEnd(end); 073 setStrand(strand); 074 } 075 076 /** 077 * A class that keeps track of the details of a GeneSequence which is difficult to properly model. Two important concepts that is difficult 078 * to make everything flexible but still work. You can have GFF features that only describe Exons or Exons/Introns or CDS regions and one 079 * or more Transcriptions. You can have exon sequences but that does not imply transcription to the actual protein. 080 * 081 * The GeneSequence will keep track of Exons and Introns but to get a Protein sequence you need to start with a 082 * TranscriptSequence and then add CDS sequences. 083 * 084 * This is also a key class in the biojava-3-genome module for reading and writing GFF3 files 085 * 086 * @param parentSequence 087 * @param accessionId An identifier for the gene. 088 * @param begin 089 * @param end 090 * @param strand force a gene to have strand and transcription sequence will inherit 091 */ 092 public GeneSequence(ChromosomeSequence parentSequence, AccessionID accessionId, int begin, int end, Strand strand) { 093 this(parentSequence,begin,end,strand); 094 setAccession(accessionId); 095 } 096 097 /** 098 * The parent ChromosomeSequence which contains the actual DNA sequence data 099 * @return Chromosome sequence 100 */ 101 public ChromosomeSequence getParentChromosomeSequence() { 102 return chromosomeSequence; 103 } 104 105 @Override 106 public int getLength() { 107 return Math.abs(this.getBioEnd() - this.getBioBegin()) + 1; 108 } 109 110 /** 111 * Once everything has been added to the gene sequence where you might have added exon sequences only then you 112 * can infer the intron sequences and add them. You may also have the case where you only added one or more 113 * TranscriptSequences and from that you can infer the exon sequences and intron sequences. 114 * Currently not implement 115 */ 116 public void addIntronsUsingExons() throws Exception { 117 if (intronAdded) { //going to assume introns are correct 118 return; 119 } 120 if (exonSequenceList.size() == 0) { 121 return; 122 } 123 ExonComparator exonComparator = new ExonComparator(); 124 //sort based on start position and sense; 125 Collections.sort(exonSequenceList, exonComparator); 126 int shift = -1; 127 if (getStrand() == Strand.NEGATIVE) { 128 shift = 1; 129 } 130 //ExonSequence firstExonSequence = exonSequenceList.get(0); 131 int intronIndex = 1; 132// if (firstExonSequence.getBioBegin().intValue() != getBioBegin().intValue()) { 133// this.addIntron(new AccessionID(this.getAccession().getID() + "-" + "intron" + intronIndex), getBioBegin(), firstExonSequence.getBioBegin() + shift); 134// intronIndex++; 135// } 136 for (int i = 0; i < exonSequenceList.size() - 1; i++) { 137 ExonSequence exon1 = exonSequenceList.get(i); 138 ExonSequence exon2 = exonSequenceList.get(i + 1); 139 AccessionID intronId= new AccessionID(this.getAccession().getID() + "-" + "intron" + intronIndex); 140 this.addIntron(intronId, exon1.getBioEnd() - shift, exon2.getBioBegin() + shift); 141 intronIndex++; 142 } 143 144// ExonSequence lastExonSequence = exonSequenceList.get(exonSequenceList.size() - 1); 145// if (lastExonSequence.getBioEnd().intValue() != getBioEnd().intValue()) { 146// this.addIntron(new AccessionID(this.getAccession().getID() + "-" + "intron" + intronIndex), lastExonSequence.getBioEnd() - shift, getBioEnd()); 147// intronIndex++; 148// } 149 150 // log.severe("Add in support for building introns based on added exons"); 151 152 } 153 154 /** 155 * A gene should have Strand 156 * @return the strand 157 */ 158 public Strand getStrand() { 159 return strand; 160 } 161 162 /** 163 * @param strand the strand to set 164 */ 165 public void setStrand(Strand strand) { 166 this.strand = strand; 167 } 168 169 /** 170 * Get the transcript sequence by accession 171 * @param accession 172 * @return the transcript 173 */ 174 public TranscriptSequence getTranscript(String accession) { 175 return transcriptSequenceHashMap.get(accession); 176 } 177 178 /** 179 * Get the collection of transcription sequences assigned to this gene 180 * @return transcripts 181 */ 182 public LinkedHashMap<String, TranscriptSequence> getTranscripts() { 183 return transcriptSequenceHashMap; 184 } 185 186 /** 187 * Remove the transcript sequence from the gene 188 * @param accession 189 * @return transcriptsequence 190 */ 191 public TranscriptSequence removeTranscript(String accession) { 192 return transcriptSequenceHashMap.remove(accession); 193 } 194 195 /** 196 * Add a transcription sequence to a gene which describes a ProteinSequence 197 * @param accession 198 * @param begin 199 * @param end 200 * @return transcript sequence 201 * @throws Exception If the accession id is already used 202 */ 203 public TranscriptSequence addTranscript(AccessionID accession, int begin, int end) throws Exception { 204 if (transcriptSequenceHashMap.containsKey(accession.getID())) { 205 throw new Exception("Duplicate accesion id " + accession.getID()); 206 } 207 TranscriptSequence transcriptSequence = new TranscriptSequence(this, begin, end); 208 transcriptSequence.setAccession(accession); 209 transcriptSequenceHashMap.put(accession.getID(), transcriptSequence); 210 return transcriptSequence; 211 } 212 213 /** 214 * Remove the intron by accession 215 * @param accession 216 * @return the removed intron sequence, or null if no intron with that accession exists. 217 */ 218 public IntronSequence removeIntron(String accession) { 219 for (IntronSequence intronSequence : intronSequenceList) { 220 if (intronSequence.getAccession().getID().equals(accession)) { 221 intronSequenceList.remove(intronSequence); 222 intronSequenceHashMap.remove(accession); 223 return intronSequence; 224 } 225 } 226 return null; 227 } 228 229 /** 230 * Add an Intron Currently used to mark an IntronSequence as a feature 231 * @param accession 232 * @param begin 233 * @param end 234 * @return intron sequence 235 */ 236 public IntronSequence addIntron(AccessionID accession, int begin, int end) throws Exception { 237 if (intronSequenceHashMap.containsKey(accession.getID())) { 238 throw new Exception("Duplicate accesion id " + accession.getID()); 239 } 240 intronAdded = true; 241 IntronSequence intronSequence = new IntronSequence(this, begin, end); // working off the assumption that intron frame is always 0 or doesn't matter and same sense as parent 242 intronSequence.setAccession(accession); 243 intronSequenceList.add(intronSequence); 244 intronSequenceHashMap.put(accession.getID(), intronSequence); 245 return intronSequence; 246 } 247 248 /** 249 * Remove the exon sequence 250 * @param accession 251 * @return exon sequence 252 */ 253 public ExonSequence removeExon(String accession) { 254 for (ExonSequence exonSequence : exonSequenceList) { 255 if (exonSequence.getAccession().getID().equals(accession)) { 256 exonSequenceList.remove(exonSequence); 257 exonSequenceHashMap.remove(accession); 258 // we now have a new gap which creates an intron 259 intronSequenceList.clear(); 260 intronSequenceHashMap.clear(); 261 intronAdded = false; 262 try{ 263 addIntronsUsingExons(); 264 } catch(Exception e){ 265 logger.error("Remove Exon validate() error " + e.getMessage()); 266 } 267 return exonSequence; 268 } 269 } 270 return null; 271 } 272 273 /** 274 * Add an ExonSequence mainly used to mark as a feature 275 * @param accession 276 * @param begin 277 * @param end 278 * @return exon sequence 279 * @throws IllegalArgumentException if accessionID is already added. 280 */ 281 public ExonSequence addExon(AccessionID accession, int begin, int end) { 282 if (exonSequenceHashMap.containsKey(accession.getID())) { 283 throw new IllegalArgumentException("Duplicate accession id: " + accession.getID()); 284 } 285 286 ExonSequence exonSequence = new ExonSequence(this, begin, end); //sense should be the same as parent 287 exonSequence.setAccession(accession); 288 exonSequenceList.add(exonSequence); 289 exonSequenceHashMap.put(accession.getID(), exonSequence); 290 return exonSequence; 291 } 292 293 /** 294 * Get the exons as an ArrayList. Modifying this list will not modify the underlying collection 295 * @return exons 296 */ 297 public ArrayList<ExonSequence> getExonSequences() { 298 return new ArrayList<>(exonSequenceList); 299 } 300 301 /** 302 * Get the introns as an ArrayList. Modifying this list will not modify the underlying collection 303 * @return introns 304 */ 305 public ArrayList<IntronSequence> getIntronSequences() { 306 return new ArrayList<>(intronSequenceList); 307 } 308 309 /** 310 * Try to give method clarity where you want a DNASequence coding in the 5' to 3' direction 311 * Returns the DNASequence representative of the 5' and 3' reading based on strand 312 * @return dna sequence or null if sequence could not be generated. 313 */ 314 public DNASequence getSequence5PrimeTo3Prime() { 315 String sequence = getSequenceAsString(this.getBioBegin(), this.getBioEnd(), this.getStrand()); 316 if (getStrand() == Strand.NEGATIVE) { 317 //need to take complement of sequence because it is negative and we are returning the gene sequence from the opposite strand 318 StringBuilder b = new StringBuilder(getLength()); 319 CompoundSet<NucleotideCompound> compoundSet = this.getCompoundSet(); 320 for (int i = 0; i < sequence.length(); i++) { 321 String nucleotide = String.valueOf(sequence.charAt(i)); 322 NucleotideCompound nucleotideCompound = compoundSet.getCompoundForString(nucleotide); 323 b.append(nucleotideCompound.getComplement().getShortName()); 324 } 325 sequence = b.toString(); 326 } 327 DNASequence dnaSequence = null; 328 try { 329 dnaSequence = new DNASequence(sequence.toUpperCase()); 330 dnaSequence.setAccession(new AccessionID(this.getAccession().getID())); 331 } catch (CompoundNotFoundException e) { 332 // this should not happen, the sequence is DNA originally, if it does, there's a bug somewhere 333 logger.error("Could not create new DNA sequence in getSequence5PrimeTo3Prime(). Error: {}",e.getMessage()); 334 } 335 return dnaSequence; 336 } 337}