/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 */
package org.biojava.nbio.core.sequence.storage;

import org.biojava.nbio.core.sequence.AccessionID;
import org.biojava.nbio.core.sequence.template.Compound;
import org.biojava.nbio.core.sequence.template.CompoundSet;
import org.biojava.nbio.core.sequence.template.Sequence;

import java.util.*;

/**
 * Four bit encoding of a sequence. This can support up to 16 compounds
 * from a compound set. To allow us to support the redundant set of Nucleotide
 * compounds this class uses case-insensitive encoding. The values assigned
 * to the compounds are computed at runtime; if you want a predictable
 * ordering then override and use your own encodings. However, all
 * encodings are calculated using lexicographical ordering of the compounds,
 * so if a CompoundSet does not change then this encoding should not cause
 * a problem.
 *
 * @author ayates
 */
public class FourBitSequenceReader<C extends Compound> extends BitSequenceReader<C> {

    /**
     * Builds a reader from an existing sequence, reusing that sequence's accession.
     *
     * @param sequence the sequence to encode
     */
    public FourBitSequenceReader(Sequence<C> sequence) {
        super(new FourBitArrayWorker<C>(sequence), sequence.getAccession());
    }

    /**
     * Builds a reader from a sequence string; the accession is set to {@code "Unknown"}.
     *
     * @param sequence    the sequence string to encode
     * @param compoundSet the compound set the string is interpreted against
     */
    public FourBitSequenceReader(String sequence, CompoundSet<C> compoundSet) {
        this(sequence, compoundSet, new AccessionID("Unknown"));
    }

    /**
     * Builds a reader from a sequence string with an explicit accession.
     *
     * @param sequence    the sequence string to encode
     * @param compoundSet the compound set the string is interpreted against
     * @param accession   the accession to attach to this reader
     */
    public FourBitSequenceReader(String sequence, CompoundSet<C> compoundSet, AccessionID accession) {
        super(new FourBitArrayWorker<C>(sequence, compoundSet), accession);
    }

    /**
     * Builds a reader around an existing worker; the accession is set to
     * {@code "unknown"}. The lower-case spelling differs from the one used by
     * {@link #FourBitSequenceReader(String, CompoundSet)} and is kept as-is so
     * that existing callers see the same value.
     *
     * @param worker the worker holding the encoded sequence
     */
    public FourBitSequenceReader(FourBitArrayWorker<C> worker) {
        super(worker, new AccessionID("unknown"));
    }

    /**
     * Builds a reader around an existing worker with an explicit accession.
     *
     * @param worker    the worker holding the encoded sequence
     * @param accession the accession to attach to this reader
     */
    public FourBitSequenceReader(FourBitArrayWorker<C> worker, AccessionID accession) {
        super(worker, accession);
    }

    /**
     * A four bit per compound implementation of the bit array worker code. This
     * version can handle up to 16 compounds but this does mean that its ability
     * to compress a normal sequence is halved (compared to the 1/4 performance
     * seen with the 2bit workers).
     *
     * @param <C> the compound type; any {@link Compound} is accepted
     */
    public static class FourBitArrayWorker<C extends Compound> extends BitArrayWorker<C> {

        /**
         * Masking value used for extracting the right most 4 bits
         * (binary {@code 1111}) from a byte.
         */
        private static final byte MASK = 0x0F;

        /**
         * Delegates to the matching {@link BitArrayWorker} constructor.
         *
         * @param compoundSet the compound set to encode against
         * @param length      passed through to the superclass
         */
        public FourBitArrayWorker(CompoundSet<C> compoundSet, int length) {
            super(compoundSet, length);
        }

        /**
         * Delegates to the matching {@link BitArrayWorker} constructor.
         *
         * @param compoundSet the compound set to encode against
         * @param sequence    passed through to the superclass
         */
        public FourBitArrayWorker(CompoundSet<C> compoundSet, int[] sequence) {
            super(compoundSet, sequence);
        }

        /**
         * Delegates to the matching {@link BitArrayWorker} constructor.
         *
         * @param sequence the sequence to encode
         */
        public FourBitArrayWorker(Sequence<C> sequence) {
            super(sequence);
        }

        /**
         * Delegates to the matching {@link BitArrayWorker} constructor.
         *
         * @param sequence    the sequence string to encode
         * @param compoundSet the compound set the string is interpreted against
         */
        public FourBitArrayWorker(String sequence, CompoundSet<C> compoundSet) {
            super(sequence, compoundSet);
        }

        /**
         * @return the four-bit mask ({@code 0x0F})
         */
        @Override
        protected byte bitMask() {
            return MASK;
        }

        /**
         * @return 8; presumably the backing datatype is a 32-bit int
         *         (8 x 4 bits) -- confirm against {@link BitArrayWorker}
         */
        @Override
        protected int compoundsPerDatatype() {
            return 8;
        }

        /**
         * Returns a Map which encodes the contents of CompoundSet. This
         * version is case-insensitive i.e. C and c both encode for the same
         * position. We sort lexicographically so if the compound set has
         * not changed then neither will this.
         * <p>
         * NOTE(review): nothing here checks that the number of distinct indices
         * stays at or below 16; a larger index would not fit into four bits.
         * Confirm that callers only supply suitably small compound sets.
         *
         * @return a map from every compound of the set to its index
         */
        @Override
        protected Map<C, Integer> generateCompoundsToIndex() {
            final CompoundSet<C> cs = getCompoundSet();
            Map<C, Integer> map = new HashMap<C, Integer>();
            int index = 0;
            for (C compound : sortedCompounds(cs)) {
                C canonical = getOptionalUpperCasedCompound(compound, cs);

                // For ASCII letters the upper-case string sorts first, so the
                // upper-cased compound has already been given an index by the
                // time its lower-case form is reached; share that index.
                Integer sharedIndex = map.get(canonical);
                if (sharedIndex != null) {
                    map.put(compound, sharedIndex);
                } else {
                    map.put(compound, index++);
                }
            }

            return map;
        }

        /**
         * Finds the compound whose string form is the upper-cased string form
         * of the given compound.
         * <p>
         * Upper-casing uses {@link Locale#ROOT} so that the result does not
         * depend on the JVM default locale (e.g. under a Turkish locale
         * {@code "i"} would otherwise become a dotted capital I and the lookup
         * would miss).
         *
         * @param currentCompound the compound to look up
         * @param cs              the compound set to search
         * @return the upper-cased compound, or {@code currentCompound} itself
         *         when the set has no compound for the upper-cased string
         */
        private C getOptionalUpperCasedCompound(C currentCompound, CompoundSet<C> cs) {
            String upperCasedString = cs.getStringForCompound(currentCompound).toUpperCase(Locale.ROOT);
            C upperCasedCompound = cs.getCompoundForString(upperCasedString);
            return (upperCasedCompound != null) ? upperCasedCompound : currentCompound;
        }

        /**
         * Copies all compounds of the set into a new list ordered by the
         * natural ordering of their string forms.
         *
         * @param cs the compound set to read
         * @return a new, sorted list of the set's compounds
         */
        private List<C> sortedCompounds(final CompoundSet<C> cs) {
            List<C> compounds = new ArrayList<C>(cs.getAllCompounds());
            Collections.sort(compounds, new Comparator<C>() {

                @Override
                public int compare(C o1, C o2) {
                    String s1 = cs.getStringForCompound(o1);
                    String s2 = cs.getStringForCompound(o2);
                    return s1.compareTo(s2);
                }
            });
            return compounds;
        }

        /**
         * Returns a List which reverse encodes the Compound, Integer map.
         * Each index is represented by its upper-cased compound, and the list
         * is ordered by ascending index.
         *
         * @return the compounds ordered by their index
         */
        @Override
        protected List<C> generateIndexToCompounds() {
            CompoundSet<C> cs = getCompoundSet();
            Map<C, Integer> lookup = getCompoundsToIndexLookup();

            // A sorted map keeps the entries in ascending index order, so its
            // values can be copied straight into the result list.
            Map<Integer, C> byIndex = new TreeMap<Integer, C>();
            for (C compound : lookup.keySet()) {
                C upperCasedCompound = getOptionalUpperCasedCompound(compound, cs);
                Integer pos = lookup.get(upperCasedCompound);
                byIndex.put(pos, upperCasedCompound);
            }

            return new ArrayList<C>(byIndex.values());
        }
    }
}