/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 */
021package org.biojava.nbio.core.sequence.storage;
022
023import org.biojava.nbio.core.sequence.AccessionID;
024import org.biojava.nbio.core.sequence.compound.NucleotideCompound;
025import org.biojava.nbio.core.sequence.template.CompoundSet;
026import org.biojava.nbio.core.sequence.template.Sequence;
027
028import java.util.ArrayList;
029import java.util.HashMap;
030import java.util.List;
031import java.util.Map;
032
033/**
034 * Implementation of the 2bit encoding. This will default to the following
035 * encodings:
036 *
037 * <ul>
038 * <li>0 - T</li>
039 * <li>1 - C</li>
040 * <li>2 - A</li>
041 * <li>3 - G</li>
042 * </ul>
043 *
044 * We also do not support case sensitive encodings therefore if you pass a
045 * lowercased a this will be treated as if it is an uppercase A and we will
046 * erase that information.
047 *
048 * @author ayates
049 */
050public class TwoBitSequenceReader<C extends NucleotideCompound> extends BitSequenceReader<C> {
051
052        public TwoBitSequenceReader(Sequence<C> sequence) {
053                super(new TwoBitArrayWorker<C>(sequence), sequence.getAccession());
054        }
055
056        public TwoBitSequenceReader(String sequence, CompoundSet<C> compoundSet) {
057                this(sequence, compoundSet, new AccessionID("Unknown"));
058        }
059
060        public TwoBitSequenceReader(String sequence, CompoundSet<C> compoundSet, AccessionID accession) {
061                super(new TwoBitArrayWorker<C>(sequence, compoundSet), accession);
062        }
063
064        public TwoBitSequenceReader(TwoBitArrayWorker<C> worker) {
065                super(worker, new AccessionID("unknown"));
066        }
067
068        public TwoBitSequenceReader(TwoBitArrayWorker<C> worker, AccessionID accession) {
069                super(worker, accession);
070        }
071
072        /**
073         * Extension of the BitArrayWorker which provides the 2bit implementation
074         * code. This is intended to work with the 4 basic nucelotide types. If you
075         * require a different version of the encoding used here then extend
076         * and override as required.
077         *
078         * @param <C> Must extend NucleotideCompound
079         */
080        public static class TwoBitArrayWorker<C extends NucleotideCompound> extends BitArrayWorker<C> {
081
082                public TwoBitArrayWorker(CompoundSet<C> compoundSet, int length) {
083                        super(compoundSet, length);
084                }
085
086                public TwoBitArrayWorker(CompoundSet<C> compoundSet, int[] sequence) {
087                        super(compoundSet, sequence);
088                }
089
090                public TwoBitArrayWorker(Sequence<C> sequence) {
091                        super(sequence);
092                }
093
094                public TwoBitArrayWorker(String sequence, CompoundSet<C> compoundSet) {
095                        super(sequence, compoundSet);
096                }
097
098                /**
099                 * Masking value used for extracting the right most 2 bits from a byte
100                 */
101                private final static byte MASK = (byte) ((int) Math.pow(2, 0) | (int) Math.pow(2, 1));
102
103                @Override
104                protected byte bitMask() {
105                        return MASK;
106                }
107
108                @Override
109                protected int compoundsPerDatatype() {
110                        return 16;
111                }
112
113                /**
114                 * Returns a Map which encodes TCAG into positions 0,1,2,3.
115                 */
116                @Override
117                @SuppressWarnings("serial")
118                protected Map<C, Integer> generateCompoundsToIndex() {
119                        final CompoundSet<C> cs = getCompoundSet();
120                        return new HashMap<C, Integer>() {
121
122                                {
123                                        put(cs.getCompoundForString("T"), 0);
124                                        put(cs.getCompoundForString("C"), 1);
125                                        put(cs.getCompoundForString("A"), 2);
126                                        put(cs.getCompoundForString("G"), 3);
127                                        put(cs.getCompoundForString("t"), 0);
128                                        put(cs.getCompoundForString("c"), 1);
129                                        put(cs.getCompoundForString("a"), 2);
130                                        put(cs.getCompoundForString("g"), 3);
131                                }
132                        };
133                }
134
135                /**
136                 * Returns a List which encodes TCAG into positions 0,1,2,3.
137                 */
138                @Override
139                protected List<C> generateIndexToCompounds() {
140                        CompoundSet<C> cs = getCompoundSet();
141                        List<C> result = new ArrayList<C>();
142                        result.add( cs.getCompoundForString("T"));
143
144
145                        result.add( cs.getCompoundForString("C"));
146                        result.add( cs.getCompoundForString("A"));
147                        result.add( cs.getCompoundForString("G"));
148                        return result;
149                }
150        }
151
152}