001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021package org.biojava.nbio.core.sequence.storage;
022
023import org.biojava.nbio.core.sequence.AccessionID;
024import org.biojava.nbio.core.sequence.template.Compound;
025import org.biojava.nbio.core.sequence.template.CompoundSet;
026import org.biojava.nbio.core.sequence.template.Sequence;
027
028import java.util.*;
029
/**
 *
 * Four bit encoding of a sequence. This can support up to 16 compounds
 * from a compound set. To allow us to support the redundant set of Nucleotide
 * compounds this class will use case-insensitive encoding. The values assigned
 * to these compounds are also calculated at runtime; if you want a predictable
 * ordering then override and use your own encodings. However all
 * encodings are calculated using lexicographical ordering of the compounds
 * so if a CompoundSet does not change then this encoding should not cause
 * a problem.
 *
 * @author ayates
 */
043public class FourBitSequenceReader<C extends Compound> extends BitSequenceReader<C> {
044
045        public FourBitSequenceReader(Sequence<C> sequence) {
046                super(new FourBitArrayWorker<C>(sequence), sequence.getAccession());
047        }
048
049        public FourBitSequenceReader(String sequence, CompoundSet<C> compoundSet) {
050                this(sequence, compoundSet, new AccessionID("Unknown"));
051        }
052
053        public FourBitSequenceReader(String sequence, CompoundSet<C> compoundSet, AccessionID accession) {
054                super(new FourBitArrayWorker<C>(sequence, compoundSet), accession);
055        }
056
057        public FourBitSequenceReader(FourBitArrayWorker<C> worker) {
058                super(worker, new AccessionID("unknown"));
059        }
060
061        public FourBitSequenceReader(FourBitArrayWorker<C> worker, AccessionID accession) {
062                super(worker, accession);
063        }
064
065        /**
066         * A four bit per compound implementation of the bit array worker code. This
067         * version can handle upto 16 compounds but this does mean that its ability
068         * to compress a normal sequence is halved (compared to the 1/4 performance
069         * seen with the 2bit workers).
070         *
071         * @param <C> Must extend NucleotideCompound
072         */
073        public static class FourBitArrayWorker<C extends Compound> extends BitArrayWorker<C> {
074
075                public FourBitArrayWorker(CompoundSet<C> compoundSet, int length) {
076                        super(compoundSet, length);
077                }
078
079                public FourBitArrayWorker(CompoundSet<C> compoundSet, int[] sequence) {
080                        super(compoundSet, sequence);
081                }
082
083                public FourBitArrayWorker(Sequence<C> sequence) {
084                        super(sequence);
085                }
086
087                public FourBitArrayWorker(String sequence, CompoundSet<C> compoundSet) {
088                        super(sequence, compoundSet);
089                }
090                /**
091                 * Masking value used for extracting the right most 2 bits from a byte
092                 */
093                private final static byte MASK = (byte) ((int) Math.pow(2, 0) | (int) Math.pow(2, 1) | (int) Math.pow(2, 2) | (int) Math.pow(2, 3));
094
095
096                @Override
097                protected byte bitMask() {
098                        return MASK;
099                }
100
101
102                @Override
103                protected int compoundsPerDatatype() {
104                        return 8;
105                }
106
107                /**
108                 * Returns a Map which encodes the contents of CompoundSet. This
109                 * version is case-insensitive i.e. C and c both encode for the same
110                 * position. We sort lexigraphically so if the compound set has
111                 * not changed then neither will this.
112                 */
113
114                @Override
115                protected Map<C, Integer> generateCompoundsToIndex() {
116                        final CompoundSet<C> cs = getCompoundSet();
117                        Map<C, Integer> map = new HashMap<C, Integer>();
118                        int index = 0;
119                        for (C currentCompound : sortedCompounds(cs)) {
120                                C upperCasedCompound = getOptionalUpperCasedCompound(currentCompound, cs);
121
122                                //if it has the uppercased compound then set this
123                                //compounds' value to that one
124                                if (map.containsKey(upperCasedCompound)) {
125                                        map.put(currentCompound, map.get(upperCasedCompound));
126                                } else {
127                                        map.put(currentCompound, index++);
128                                }
129                        }
130
131                        return map;
132                }
133
134                private C getOptionalUpperCasedCompound(C currentCompound, CompoundSet<C> cs) {
135                        C upperCasedCompound = null;
136                        String upperCasedString = cs.getStringForCompound(currentCompound).toUpperCase();
137                        if (cs.getCompoundForString(upperCasedString) == null) {
138                                upperCasedCompound = currentCompound;
139                        } else {
140                                upperCasedCompound = cs.getCompoundForString(upperCasedString);
141                        }
142                        return upperCasedCompound;
143                }
144
145                private List<C> sortedCompounds(final CompoundSet<C> cs) {
146                        List<C> compounds = new ArrayList<C>(cs.getAllCompounds());
147                        Collections.sort(compounds, new Comparator<C>() {
148
149
150                                @Override
151                                public int compare(C o1, C o2) {
152                                        String s1 = cs.getStringForCompound(o1);
153                                        String s2 = cs.getStringForCompound(o2);
154                                        return s1.compareTo(s2);
155                                }
156                        });
157                        return compounds;
158                }
159
160                /**
161                 * Returns a List which reverse encodes the Compound, Integer map
162                 */
163
164                @Override
165                protected List<C> generateIndexToCompounds() {
166                        CompoundSet<C> cs = getCompoundSet();
167                        Map<C, Integer> lookup = getCompoundsToIndexLookup();
168                        Map<Integer, C> tempMap = new HashMap<Integer, C>();
169                        //First get the reverse lookup working
170                        for (C compound : lookup.keySet()) {
171                                C upperCasedCompound = getOptionalUpperCasedCompound(compound, cs);
172                                Integer pos = lookup.get(upperCasedCompound);
173                                tempMap.put(pos, upperCasedCompound);
174                        }
175
176                        //Then populate the results by going back through the sorted integer keys
177                        List<C> compounds = new ArrayList<C>();
178                        List<Integer> keys = new ArrayList<Integer>(tempMap.keySet());
179                        Collections.sort(keys);
180                        for (Integer key : keys) {
181                                compounds.add(tempMap.get(key));
182                        }
183
184                        return compounds;
185                }
186        }
187}