/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 */
package org.biojava.nbio.core.sequence.edits;

import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
import org.biojava.nbio.core.sequence.BasicSequence;
import org.biojava.nbio.core.sequence.storage.JoiningSequenceReader;
import org.biojava.nbio.core.sequence.template.Compound;
import org.biojava.nbio.core.sequence.template.Sequence;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.ArrayList;
import java.util.List;

/**
 * Interface for carrying out edit operations on a Sequence. The three major
 * kinds of editing are supported:
 *
 * <ul>
 * <li>Insertion</li>
 * <li>Deletion</li>
 * <li>Substitution</li>
 * </ul>
 *
 * The interface is provided so end users can either use the implementations
 * supplied here, which build a <em>view</em> of the Sequence in its edited
 * form rather than a fully realised edited Sequence, or supply their own.
 *
 * @author ayates
 * @param <C> The type of compound to edit
 */
public interface Edit<C extends Compound> {

    /**
     * Applies this edit to the given Sequence.
     *
     * @param sequence the Sequence to edit
     * @return the edited Sequence
     */
    Sequence<C> edit(Sequence<C> sequence);

    /**
     * Abstract class which defines all edit operations as a call to discover
     * what 5' and 3' ends of an editing Sequence should be joined together
     * with a target Sequence. These ends can be of 0 length but conceptually
     * they still exist.
     *
     * @param <C> The type of compound to edit
     */
    public static abstract class AbstractEdit<C extends Compound> implements Edit<C> {

        private static final Logger logger = LoggerFactory.getLogger(AbstractEdit.class);

        /** Start position of the edit as supplied by the caller. */
        private final int start;
        /** End position of the edit; -1 when no end was supplied. */
        private final int end;
        /** Textual form of the target, parsed lazily by {@link #getTargetSequence(Sequence)}. */
        private String stringSequence;
        /** The target Sequence, either supplied directly or lazily parsed. */
        private Sequence<C> sequence;

        /**
         * Creates an edit with a start position only; {@link #getEnd()} will
         * report -1.
         *
         * @param start the start position of the edit
         */
        public AbstractEdit(int start) {
            this(start, -1);
        }

        /**
         * Creates an edit with explicit start and end positions.
         *
         * @param start the start position of the edit
         * @param end the end position of the edit
         */
        public AbstractEdit(int start, int end) {
            this.start = start;
            this.end = end;
        }

        /**
         * Should return the 5-prime end of the given Sequence according to
         * the edit. An empty Sequence is valid.
         *
         * @param editingSequence the Sequence being edited
         * @return the portion to place before the target Sequence
         */
        protected abstract Sequence<C> getFivePrime(Sequence<C> editingSequence);

        /**
         * Should return the 3-prime end of the given Sequence according to
         * the edit. An empty Sequence is valid.
         *
         * @param editingSequence the Sequence being edited
         * @return the portion to place after the target Sequence
         */
        protected abstract Sequence<C> getThreePrime(Sequence<C> editingSequence);

        /**
         * Builds the edited Sequence as a {@link JoiningSequenceReader} over
         * the 5-prime end, the target Sequence and the 3-prime end, in that
         * order.
         *
         * @param editingSequence the Sequence being edited
         * @return a joined view of the three parts
         */
        @Override
        public Sequence<C> edit(Sequence<C> editingSequence) {
            // The target is resolved first; this may lazily parse the String form.
            Sequence<C> targetSequence = getTargetSequence(editingSequence);

            List<Sequence<C>> sequences = new ArrayList<>(3);
            sequences.add(getFivePrime(editingSequence));
            sequences.add(targetSequence);
            sequences.add(getThreePrime(editingSequence));

            return new JoiningSequenceReader<>(sequences);
        }

        /**
         * Sets the textual form of the target; it is converted to a Sequence
         * on demand by {@link #getTargetSequence(Sequence)}.
         *
         * @param stringSequence the target as a String
         */
        protected void setStringSequence(String stringSequence) {
            this.stringSequence = stringSequence;
        }

        /**
         * Sets the target Sequence directly.
         *
         * @param sequence the target Sequence
         */
        protected void setSequence(Sequence<C> sequence) {
            this.sequence = sequence;
        }

        /**
         * Returns the Sequence which is our edit. If only a String form was
         * supplied it is parsed on first use and the result is retained.
         *
         * @param editingSequence Asked for in case we need to do String to
         * Sequence conversion, for which we need a CompoundSet; this is given
         * by the Sequence we are editing
         * @return The {@link Sequence} object we wish to insert; {@code null}
         * if neither a Sequence nor a String was set, or if the String could
         * not be parsed (the failure is logged)
         */
        public Sequence<C> getTargetSequence(Sequence<C> editingSequence) {
            if (sequence == null && stringSequence != null) {
                try {
                    sequence = new BasicSequence<>(
                            stringSequence, editingSequence.getCompoundSet());
                } catch (CompoundNotFoundException e) {
                    // TODO is there a better way to handle this exception?
                    // The throwable is passed as the final argument so the stack trace is kept.
                    logger.error("Problem setting sequence, some unrecognised compounds: {}", e.getMessage(), e);
                }
            }
            return sequence;
        }

        /**
         * Returns an empty sequence with the given compound set of the editing
         * sequence.
         *
         * @param editingSequence the Sequence whose CompoundSet is used
         * @return an empty Sequence, or {@code null} if construction failed
         * (the failure is logged)
         */
        protected Sequence<C> getEmptySequence(Sequence<C> editingSequence) {
            Sequence<C> s = null;
            try {
                s = new BasicSequence<>("", editingSequence.getCompoundSet());
            } catch (CompoundNotFoundException e) {
                // should not happen
                logger.error("Could not construct empty sequence. {}. This is most likely a bug.", e.getMessage(), e);
            }
            return s;
        }

        /**
         * @return the start position given at construction
         */
        public int getStart() {
            return start;
        }

        /**
         * @return the end position given at construction, or -1 if none was given
         */
        public int getEnd() {
            return end;
        }
    }

    /**
     * Implementation which allows for the deletion of bases from a Sequence.
     *
     * @param <C> The type of compound to edit
     */
    public static class Delete<C extends Compound> extends AbstractEdit<C> {

        /**
         * Deletes the single base at the given position.
         *
         * @param position the position to delete
         */
        public Delete(int position) {
            this(position, position);
        }

        /**
         * Deletes the bases from start to end.
         *
         * @param start the first position to delete
         * @param end the last position to delete
         */
        public Delete(int start, int end) {
            super(start, end);
            // A deletion joins the two flanks around an empty target.
            setStringSequence("");
        }

        /**
         * @return the position immediately before the deleted region
         */
        protected int getRealStart() {
            return getStart() - 1;
        }

        /**
         * @return the position immediately after the deleted region
         */
        protected int getRealEnd() {
            return getEnd() + 1;
        }

        /**
         * Returns everything before the deleted region, or an empty Sequence
         * when the deletion begins at position 1.
         */
        @Override
        protected Sequence<C> getFivePrime(Sequence<C> editingSequence) {
            int start = getRealStart();
            if (start == 0) {
                return getEmptySequence(editingSequence);
            }
            return editingSequence.getSubSequence(1, start);
        }

        /**
         * Returns everything after the deleted region, or an empty Sequence
         * when the deletion runs to the end of the Sequence.
         */
        @Override
        protected Sequence<C> getThreePrime(Sequence<C> editingSequence) {
            int end = getRealEnd();
            int length = editingSequence.getLength();
            if (end > length) {
                return getEmptySequence(editingSequence);
            }
            return editingSequence.getSubSequence(end, length);
        }
    }

    /**
     * Edit implementation which allows us to insert a base at any position
     * in a Sequence. Specifying 1 base is used to insert at the start and
     * end of a Sequence. If you wish to carry out an in-sequence insertion
     * then you specify the flanking base positions e.g.
     *
     * <pre>
     *   ACTG insert TT at position 1   : TTACTG
     *   ACTG insert TT at position 2,3 : ACTTTG
     *   ACTG insert A  at position 4   : ACTGA
     * </pre>
     *
     * The code will raise an {@link IllegalStateException} if you attempt a
     * single position insertion anywhere other than the start or end of the
     * Sequence. Position 1 is checked first, so it always means "prepend".
     *
     * @param <C> The type of compound to edit
     */
    public static class Insert<C extends Compound> extends AbstractEdit<C> {

        /** True when the edit was built from a single position rather than two flanks. */
        private final boolean singlePosition;

        /**
         * Inserts the given String at the start or end of a Sequence.
         *
         * @param sequence the compounds to insert, as a String
         * @param position 1 to prepend, or the Sequence length to append
         */
        public Insert(String sequence, int position) {
            super(position, position);
            this.singlePosition = true;
            setStringSequence(sequence);
        }

        /**
         * Inserts the given Sequence at the start or end of a Sequence.
         *
         * @param sequence the Sequence to insert
         * @param position 1 to prepend, or the Sequence length to append
         */
        public Insert(Sequence<C> sequence, int position) {
            super(position, position);
            this.singlePosition = true;
            setSequence(sequence);
        }

        /**
         * Inserts the given String between two flanking positions.
         *
         * @param sequence the compounds to insert, as a String
         * @param start the last position of the 5-prime flank
         * @param stop the first position of the 3-prime flank
         */
        public Insert(String sequence, int start, int stop) {
            super(start, stop);
            this.singlePosition = false;
            setStringSequence(sequence);
        }

        /**
         * Inserts the given Sequence between two flanking positions.
         *
         * @param sequence the Sequence to insert
         * @param start the last position of the 5-prime flank
         * @param stop the first position of the 3-prime flank
         */
        public Insert(Sequence<C> sequence, int start, int stop) {
            super(start, stop);
            this.singlePosition = false;
            setSequence(sequence);
        }

        /**
         * Returns the portion preceding the insertion: empty when prepending,
         * the whole Sequence when appending, otherwise positions 1 to start.
         *
         * @throws IllegalStateException if a single position was given that is
         * neither the start nor the end of the Sequence
         */
        @Override
        protected Sequence<C> getFivePrime(Sequence<C> editingSequence) {
            if (!singlePosition) {
                return editingSequence.getSubSequence(1, getStart());
            }
            if (getStart() == 1) {
                return getEmptySequence(editingSequence);
            }
            if (getEnd() == editingSequence.getLength()) {
                return editingSequence;
            }
            throw unsupportedSinglePosition();
        }

        /**
         * Returns the portion following the insertion: the whole Sequence when
         * prepending, empty when appending, otherwise positions end to length.
         *
         * @throws IllegalStateException if a single position was given that is
         * neither the start nor the end of the Sequence
         */
        @Override
        protected Sequence<C> getThreePrime(Sequence<C> editingSequence) {
            if (!singlePosition) {
                return editingSequence.getSubSequence(getEnd(), editingSequence.getLength());
            }
            if (getStart() == 1) {
                return editingSequence;
            }
            if (getEnd() == editingSequence.getLength()) {
                return getEmptySequence(editingSequence);
            }
            throw unsupportedSinglePosition();
        }

        /** Builds the exception raised for a single position that is not a Sequence boundary. */
        private IllegalStateException unsupportedSinglePosition() {
            return new IllegalStateException("Given one position to "
                    + "insert at but this is not the start or end "
                    + "of the Sequence; cannot support this");
        }
    }

    /**
     * Allows for the substitution of bases into an existing Sequence. This
     * allows us to do edits like:
     *
     * <pre>
     *    Sub TT at position 2
     *    AAAA -&gt; ATTA
     * </pre>
     *
     * A substitution which would run past the end of the edited Sequence is
     * rejected with an {@link IndexOutOfBoundsException}.
     *
     * Edits do not require the length of the insertion but do rely on the
     * presence of a CompoundSet to parse a String (if given) which means
     * the eventual length of a Sequence is a lazy operation.
     *
     * @param <C> The type of compound to edit
     */
    public static class Substitute<C extends Compound> extends AbstractEdit<C> {

        /**
         * Substitutes the given String starting at the given position.
         *
         * @param sequence the replacement compounds, as a String
         * @param position the first position to overwrite
         */
        public Substitute(String sequence, int position) {
            super(position);
            setStringSequence(sequence);
        }

        /**
         * Substitutes the given Sequence starting at the given position.
         *
         * @param sequence the replacement Sequence
         * @param position the first position to overwrite
         */
        public Substitute(Sequence<C> sequence, int position) {
            super(position);
            setSequence(sequence);
        }

        /**
         * Must use this rather than the no-args getEnd as this can return
         * -1 and the length of a sub is dependent on the length of the
         * Sequence; we cannot assume 1:1 mapping between characters in a
         * String and the number of compounds we will have to insert.
         *
         * @param sequence the Sequence being edited; supplies the CompoundSet
         * needed to resolve the target Sequence
         * @return the last position overwritten by the substitution
         */
        public int getEnd(Sequence<C> sequence) {
            int explicitEnd = getEnd();
            if (explicitEnd != -1) {
                return explicitEnd;
            }
            int length = getTargetSequence(sequence).getLength();
            return (getStart() + length) - 1;
        }

        /**
         * Returns everything before the substituted region, or an empty
         * Sequence when the substitution begins at position 1.
         */
        @Override
        protected Sequence<C> getFivePrime(Sequence<C> editingSequence) {
            int start = getStart();
            if (start == 1) {
                return getEmptySequence(editingSequence);
            }
            return editingSequence.getSubSequence(1, start - 1);
        }

        /**
         * Returns everything after the substituted region, or an empty
         * Sequence when the substitution ends exactly at the end of the
         * Sequence.
         *
         * @throws IndexOutOfBoundsException if the substitution would extend
         * beyond the end of the Sequence being edited
         */
        @Override
        protected Sequence<C> getThreePrime(Sequence<C> editingSequence) {
            int end = getEnd(editingSequence);
            int length = editingSequence.getLength();
            if (end > length) {
                throw new IndexOutOfBoundsException(end +
                        " is greater than the max index of " +
                        "the editing sequence (" +
                        length + ")");
            }
            if (end == length) {
                return getEmptySequence(editingSequence);
            }
            return editingSequence.getSubSequence(end + 1, length);
        }
    }
}