001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021package org.biojava.nbio.core.sequence.storage; 022 023import org.biojava.nbio.core.exceptions.CompoundNotFoundException; 024import org.biojava.nbio.core.sequence.AccessionID; 025import org.biojava.nbio.core.sequence.template.*; 026 027import java.util.*; 028 029/** 030 * This reader actually proxies onto multiple types of sequence in order 031 * to allow a number of sequence objects to act as if they are one sequence. 032 * The code takes in any number of sequences, records the minimum and maximum 033 * bounds each sequence covers with respect to 1 position indexing and then 034 * binary searches these when a position is requested. Because of this 035 * 0 length Sequences are excluded during construction. 036 * 037 * Performance is not as good as if you are using a flat sequence however the 038 * speed of lookup is more than adaquate for most situations. Using the iterator 039 * gives the best performance as this does not rely on the binary search 040 * mechanism instead iterating through each sequence in turn. 041 * 042 * @author ayates 043 * @param <C> Tyoe of compound to hold 044 */ 045public class JoiningSequenceReader<C extends Compound> implements ProxySequenceReader<C> { 046 047 /** 048 * Internal implementation flag and controls how we look for the right 049 * sub-sequence 050 */ 051 private static final boolean BINARY_SEARCH = true; 052 private final List<Sequence<C>> sequences; 053 private final CompoundSet<C> compoundSet; 054 private int[] maxSequenceIndex; 055 private int[] minSequenceIndex; 056 057 /** 058 * Allows creation of the store from Vargs Sequence<C> objects. CompoundSet 059 * defaults to the first element of the array (assuming there are elements 060 * available during construction otherwise we will throw an illegal 061 * state exception). 062 */ 063 public JoiningSequenceReader(Sequence<C>... sequences) { 064 this(Arrays.asList(sequences)); 065 } 066 067 /** 068 * Allows creation of the store from List<Sequence<C>>. CompoundSet 069 * defaults to the first element of the List (assuming there are elements 070 * available during construction otherwise we will throw an illegal 071 * state exception). 072 */ 073 public JoiningSequenceReader(List<Sequence<C>> sequences) { 074 this.sequences = grepSequences(sequences); 075 this.compoundSet = grepCompoundSet(); 076 } 077 078 public JoiningSequenceReader(CompoundSet<C> compoundSet, Sequence<C>... sequences) { 079 this(compoundSet, Arrays.asList(sequences)); 080 } 081 082 public JoiningSequenceReader(CompoundSet<C> compoundSet, List<Sequence<C>> sequences) { 083 this.sequences = grepSequences(sequences); 084 this.compoundSet = compoundSet; 085 } 086 087 private List<Sequence<C>> grepSequences(List<Sequence<C>> sequences) { 088 List<Sequence<C>> seqs = new ArrayList<Sequence<C>>(); 089 for (Sequence<C> s : sequences) { 090 if (s.getLength() != 0) { 091 seqs.add(s); 092 } 093 } 094 return seqs; 095 } 096 097 private CompoundSet<C> grepCompoundSet() { 098 if (sequences.isEmpty()) { 099 throw new IllegalStateException("Cannot get a CompoundSet because we have no sequences. Set during construction"); 100 } 101 return sequences.get(0).getCompoundSet(); 102 } 103 104 105 @Override 106 public C getCompoundAt(int position) { 107 int sequenceIndex = getSequenceIndex(position); 108 Sequence<C> sequence = sequences.get(sequenceIndex); 109 int indexInSequence = (position - getMinSequenceIndex()[sequenceIndex]) + 1; 110 return sequence.getCompoundAt(indexInSequence); 111 } 112 113 114 @Override 115 public CompoundSet<C> getCompoundSet() { 116 return compoundSet; 117 } 118 119 120 @Override 121 public int getLength() { 122 int[] maxSeqIndex = getMaxSequenceIndex(); 123 if (maxSeqIndex.length == 0) { 124 return 0; 125 } 126 return maxSeqIndex[maxSeqIndex.length - 1]; 127 } 128 129 /** 130 * Returns which Sequence holds the position queried for 131 */ 132 private int getSequenceIndex(int position) { 133 if (BINARY_SEARCH) { 134 return binarySearch(position); 135 } else { 136 return linearSearch(position); 137 } 138 } 139 140 private int[] getMinSequenceIndex() { 141 if (minSequenceIndex == null) { 142 initSeqIndexes(); 143 } 144 return minSequenceIndex; 145 } 146 147 private int[] getMaxSequenceIndex() { 148 if (maxSequenceIndex == null) { 149 initSeqIndexes(); 150 } 151 return maxSequenceIndex; 152 } 153 154 private void initSeqIndexes() { 155 minSequenceIndex = new int[sequences.size()]; 156 maxSequenceIndex = new int[sequences.size()]; 157 int currentMaxIndex = 0; 158 int currentMinIndex = 1; 159 int lastLength = 0; 160 for (int i = 0; i < sequences.size(); i++) { 161 currentMinIndex += lastLength; 162 currentMaxIndex += sequences.get(i).getLength(); 163 minSequenceIndex[i] = currentMinIndex; 164 maxSequenceIndex[i] = currentMaxIndex; 165 lastLength = sequences.get(i).getLength(); 166 } 167 } 168 169 /** 170 * Scans through the sequence index arrays in linear time. Not the best 171 * performance but easier to code 172 */ 173 private int linearSearch(int position) { 174 int[] minSeqIndex = getMinSequenceIndex(); 175 int[] maxSeqIndex = getMaxSequenceIndex(); 176 int length = minSeqIndex.length; 177 for (int i = 0; i < length; i++) { 178 if (position >= minSeqIndex[i] && position <= maxSeqIndex[i]) { 179 return i; 180 } 181 } 182 throw new IndexOutOfBoundsException("Given position " + position + " does not map into this Sequence"); 183 } 184 185 /** 186 * Scans through the sequence index arrays using binary search 187 */ 188 private int binarySearch(int position) { 189 int[] minSeqIndex = getMinSequenceIndex(); 190 int[] maxSeqIndex = getMaxSequenceIndex(); 191 192 int low = 0; 193 int high = minSeqIndex.length - 1; 194 while (low <= high) { 195 //Go to the mid point in the array 196 int mid = (low + high) >>> 1; 197 198 //Get the max position represented by this Sequence 199 int midMinPosition = minSeqIndex[mid]; 200 int midMaxPosition = maxSeqIndex[mid]; 201 202 //if current position is greater than the current bounds then 203 //increase search space 204 if (midMinPosition < position && midMaxPosition < position) { 205 low = mid + 1; 206 } //if current position is less than current bounds then decrease 207 //search space 208 else if (midMinPosition > position && midMaxPosition > position) { 209 high = mid - 1; 210 } else { 211 return mid; 212 } 213 } 214 throw new IndexOutOfBoundsException("Given position " + position + " does not map into this Sequence"); 215 } 216 217 /** 218 * Iterator implementation which attempts to move through the 2D structure 219 * attempting to skip onto the next sequence as & when it is asked to 220 */ 221 222 @Override 223 public Iterator<C> iterator() { 224 final List<Sequence<C>> localSequences = sequences; 225 return new Iterator<C>() { 226 227 private Iterator<C> currentSequenceIterator = null; 228 private int currentPosition = 0; 229 230 231 @Override 232 public boolean hasNext() { 233 //If the current iterator is null then see if the Sequences object has anything 234 if (currentSequenceIterator == null) { 235 return !localSequences.isEmpty(); 236 } 237 238 //See if we had any compounds 239 boolean hasNext = currentSequenceIterator.hasNext(); 240 if (!hasNext) { 241 hasNext = currentPosition < sequences.size(); 242 } 243 return hasNext; 244 } 245 246 247 @Override 248 public C next() { 249 if (currentSequenceIterator == null) { 250 if (localSequences.isEmpty()) { 251 throw new NoSuchElementException("No sequences to iterate over; make sure you call hasNext() before next()"); 252 } 253 currentSequenceIterator = localSequences.get(currentPosition).iterator(); 254 currentPosition++; 255 } 256 if (!currentSequenceIterator.hasNext()) { 257 currentSequenceIterator = localSequences.get(currentPosition).iterator(); 258 currentPosition++; 259 } 260 return currentSequenceIterator.next(); 261 } 262 263 264 @Override 265 public void remove() throws UnsupportedOperationException { 266 throw new UnsupportedOperationException("Cannot remove from this Sequence"); 267 } 268 }; 269 } 270 271 272 @Override 273 public void setCompoundSet(CompoundSet<C> compoundSet) { 274 throw new UnsupportedOperationException(); 275 } 276 277 278 @Override 279 public void setContents(String sequence) throws CompoundNotFoundException { 280 throw new UnsupportedOperationException(); 281 } 282 283 284 @Override 285 public int countCompounds(C... compounds) { 286 return SequenceMixin.countCompounds(this, compounds); 287 } 288 289 290 @Override 291 public AccessionID getAccession() throws UnsupportedOperationException { 292 throw new UnsupportedOperationException(); 293 } 294 295 296 @Override 297 public List<C> getAsList() { 298 return SequenceMixin.toList(this); 299 } 300 301 302 @Override 303 public int getIndexOf(C compound) { 304 return SequenceMixin.indexOf(this, compound); 305 } 306 307 308 @Override 309 public int getLastIndexOf(C compound) { 310 return SequenceMixin.lastIndexOf(this, compound); 311 } 312 313 314 @Override 315 public String getSequenceAsString() { 316 return SequenceMixin.toStringBuilder(this).toString(); 317 } 318 319 320 @Override 321 public SequenceView<C> getSubSequence(Integer start, Integer end) { 322 return SequenceMixin.createSubSequence(this, start, end); 323 } 324 325 @Override 326 public SequenceView<C> getInverse() { 327 return SequenceMixin.inverse(this); 328 } 329}