001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021package org.biojava.nbio.core.sequence.edits;
022
023import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
024import org.biojava.nbio.core.sequence.BasicSequence;
025import org.biojava.nbio.core.sequence.storage.JoiningSequenceReader;
026import org.biojava.nbio.core.sequence.template.Compound;
027import org.biojava.nbio.core.sequence.template.Sequence;
028import org.slf4j.Logger;
029import org.slf4j.LoggerFactory;
030
031import java.util.ArrayList;
032import java.util.List;
033
034/**
035 * Interface for carrying out edit operations on a Sequence. The 3 major
036 * methods of Editing are supported
037 *
038 * <ul>
039 * <li>Insertion</li>
040 * <li>Deletion</li>
041 * <li>Substitution</li>
042 * </ul>
043 *
 * The interface is provided so end users can use either our implementations,
 * which attempt to create views of Sequences in an edited form rather than
 * fully-realised edited Sequences, or their own.
047 *
048 * @author ayates
049 * @param <C> The type of compound to edit
050 */
051public interface Edit<C extends Compound> {
052
        /**
         * Carries out this edit against the given Sequence.
         *
         * @param sequence the Sequence to be edited
         * @return the edited Sequence. The {@code AbstractEdit} based
         * implementations in this file return a joined view made of the
         * retained 5' portion, the edit's own target Sequence and the retained
         * 3' portion of the input.
         */
        Sequence<C> edit(Sequence<C> sequence);
054
055        /**
056         * Abstract class which defines all edit operations as a call to discover
057         * what 5' and 3' ends of an editing Sequence should be joined together
         * with a target Sequence. These ends can be of 0 length but conceptually
059         * they can still exist.
060         */
061        public static abstract class AbstractEdit<C extends Compound> implements Edit<C> {
062
063                private final static Logger logger = LoggerFactory.getLogger(AbstractEdit.class);
064
065                /**
066                 * Should return the 5-prime end of the given Sequence according to
067                 * the edit. An empty Sequence is valid.
068                 */
069                protected abstract Sequence<C> getFivePrime(Sequence<C> editingSequence);
070
071                /**
072                 * Should return the 3-prime end of the given Sequence according to
073                 * the edit. An empty Sequence is valid.
074                 */
075                protected abstract Sequence<C> getThreePrime(Sequence<C> editingSequence);
076
077
078                @Override
079                public Sequence<C> edit(Sequence<C> editingSequence) {
080                        Sequence<C> targetSequence = getTargetSequence(editingSequence);
081                        List<Sequence<C>> sequences = new ArrayList<>();
082
083                        sequences.add(getFivePrime(editingSequence));
084                        sequences.add(targetSequence);
085                        sequences.add(getThreePrime(editingSequence));
086
087                        return new JoiningSequenceReader<>(sequences);
088                }
089                private int start = -1;
090                private int end = -1;
091                private String stringSequence;
092                private Sequence<C> sequence;
093
094                public AbstractEdit(int start) {
095                        this.start = start;
096                }
097
098                public AbstractEdit(int start, int end) {
099                        this.start = start;
100                        this.end = end;
101                }
102
103                protected void setStringSequence(String stringSequence) {
104                        this.stringSequence = stringSequence;
105                }
106
107                protected void setSequence(Sequence<C> sequence) {
108                        this.sequence = sequence;
109                }
110
111                /**
112                 * Returns the Sequence which is our edit.
113                 *
114                 * @param editingSequence Asked for in-case we need to do String to
115                 * Sequence conversion so we need a CompoundSet which is given
116                 * by the Sequence we are editing
117                 * @return The {@link Sequence<C>} object we wish to insert
118                 */
119                public Sequence<C> getTargetSequence(Sequence<C> editingSequence) {
120                        if (sequence == null && stringSequence != null) {
121                                try {
122                                        sequence = new BasicSequence<>(
123                                                                stringSequence, editingSequence.getCompoundSet());
124                                } catch (CompoundNotFoundException e) {
125                                        // TODO is there a better way to handle this exception?
126                                        logger.error("Problem setting sequence, some unrecognised compounds: {}", e.getMessage());
127                                }
128                        }
129                        return sequence;
130                }
131
132                /**
133                 * Returns an empty sequence with the given compound set of the editing
134                 * sequence
135                 */
136                protected Sequence<C> getEmptySequence(Sequence<C> editingSequence) {
137                        Sequence<C> s = null;
138                        try {
139                                s = new BasicSequence<>("", editingSequence.getCompoundSet());
140                        } catch (CompoundNotFoundException e) {
141                                // should not happen
142                                logger.error("Could not construct empty sequence. {}. This is most likely a bug.", e.getMessage());
143                        }
144                        return s;
145                }
146
147                public int getStart() {
148                        return start;
149                }
150
151                public int getEnd() {
152                        return end;
153                }
154        }
155
156        /**
157         * Implementation which allows for the deletion of bases from a Sequence
158         */
159        public static class Delete<C extends Compound> extends AbstractEdit<C> {
160
161                public Delete(int position) {
162                        this(position, position);
163                }
164
165                public Delete(int start, int end) {
166                        super(start, end);
167                        setStringSequence("");
168                }
169
170                protected int getRealStart() {
171                        return getStart() - 1;
172                }
173
174                protected int getRealEnd() {
175                        return getEnd() + 1;
176                }
177
178                @Override
179                protected Sequence<C> getFivePrime(Sequence<C> editingSequence) {
180                        int start = getRealStart();
181                        if (start == 0) {
182                                return getEmptySequence(editingSequence);
183                        }
184                        return editingSequence.getSubSequence(1, start);
185                }
186
187                @Override
188                protected Sequence<C> getThreePrime(Sequence<C> editingSequence) {
189                        int end = getRealEnd();
190                        if (end > editingSequence.getLength()) {
191                                return getEmptySequence(editingSequence);
192                        }
193                        return editingSequence.getSubSequence(end, editingSequence.getLength());
194                }
195        }
196
        /**
         * Edit implementation which allows us to insert a base at any position
         * in a Sequence. Specifying 1 base is used to insert at the start and
         * end of a Sequence. If you wish to carry out an in-sequence insertion
         * then you specify the flanking base positions e.g.
         *
         * <pre>
         *   ACTG insert TT @ position 1   : TTACTG
         *   ACTG insert TT @ position 2,3 : ACTTTG
         *   ACTG insert A  @ position 4   : ACTGA
         * </pre>
         *
         * The code will raise an exception if you give a single position which
         * is neither the first nor the last position of the Sequence.
         */
212        public static class Insert<C extends Compound> extends AbstractEdit<C> {
213
214                private final boolean singlePosition;
215
216                public Insert(String sequence, int position) {
217                        super(position, position);
218                        this.singlePosition = true;
219                        setStringSequence(sequence);
220                }
221
222                public Insert(Sequence<C> sequence, int position) {
223                        super(position, position);
224                        this.singlePosition = true;
225                        setSequence(sequence);
226                }
227
228                public Insert(String sequence, int start, int stop) {
229                        super(start, stop);
230                        this.singlePosition = false;
231                        setStringSequence(sequence);
232                }
233
234                public Insert(Sequence<C> sequence, int start, int stop) {
235                        super(start, stop);
236                        this.singlePosition = false;
237                        setSequence(sequence);
238                }
239
240                @Override
241                protected Sequence<C> getFivePrime(Sequence<C> editingSequence) {
242                        if (singlePosition) {
243                                if (getStart() == 1) {
244                                        return getEmptySequence(editingSequence);
245                                } else if (getEnd() == editingSequence.getLength()) {
246                                        return editingSequence;
247                                } else {
248                                        throw new IllegalStateException("Given one position to "
249                                                        + "insert at but this is not the start or end "
250                                                        + "of the Sequence; cannot support this");
251                                }
252                        }
253                        return editingSequence.getSubSequence(1, getStart());
254                }
255
256                @Override
257                protected Sequence<C> getThreePrime(Sequence<C> editingSequence) {
258                        if (singlePosition) {
259                                if (getStart() == 1) {
260                                        return editingSequence;
261                                } else if (getEnd() == editingSequence.getLength()) {
262                                        return getEmptySequence(editingSequence);
263                                } else {
264                                        throw new IllegalStateException("Given one position to "
265                                                        + "insert at but this is not the start or end "
266                                                        + "of the Sequence; cannot support this");
267                                }
268                        }
269                        return editingSequence.getSubSequence(getEnd(), editingSequence.getLength());
270                }
271        }
272
        /**
         * Allows for the substitution of bases into an existing Sequence. This
         * allows us to do edits like:
         *
         * <pre>
         *    Sub TT @ position 2
         *    AAAA -> ATTA
         * </pre>
         *
         * We do not support substitutions which run past the end of the Sequence
         * being edited; these raise an IndexOutOfBoundsException.
         *
         * Edits do not require the length of the insertion but do rely on the
         * presence of a CompoundSet to parse a String (if given) which means
         * the eventual length of a Sequence is a lazy operation.
         */
288        public static class Substitute<C extends Compound> extends AbstractEdit<C> {
289
290                public Substitute(String sequence, int position) {
291                        super(position);
292                        setStringSequence(sequence);
293                }
294
295                public Substitute(Sequence<C> sequence, int position) {
296                        super(position);
297                        setSequence(sequence);
298                }
299
300                /**
301                 * Must use this rather than the no-args getEnd as this can return
302                 * -1 and the length of a sub is dependent on the length of the
303                 * Sequence; we cannot assume 1:1 mapping between characters in a
304                 * String and the number of compounds we will have to insert.
305                 */
306                public int getEnd(Sequence<C> sequence) {
307                        if (getEnd() == -1) {
308                                int start = getStart();
309                                int length = getTargetSequence(sequence).getLength();
310                                return (start + length) - 1;
311                        }
312                        return getEnd();
313                }
314
315                @Override
316                protected Sequence<C> getFivePrime(Sequence<C> editingSequence) {
317                        int start = getStart();
318                        if (start == 1) {
319                                return getEmptySequence(editingSequence);
320                        }
321                        return editingSequence.getSubSequence(1, start - 1);
322                }
323
324                @Override
325                protected Sequence<C> getThreePrime(Sequence<C> editingSequence) {
326                        int end = getEnd(editingSequence);
327                        if (end > editingSequence.getLength()) {
328                                throw new IndexOutOfBoundsException(end +
329                                                " is greater than the max index of " +
330                                                "the editing sequence (" +
331                                                editingSequence.getLength());
332                        } else if (end == editingSequence.getLength()) {
333                                return getEmptySequence(editingSequence);
334                        }
335                        return editingSequence.getSubSequence(end + 1, editingSequence.getLength());
336                }
337        }
338}