001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 * Created on 01-21-2010 021 * 022 * @author Richard Holland 023 * @auther Scooter Willis 024 * 025 */ 026package org.biojava.nbio.core.sequence.loader; 027 028import org.biojava.nbio.core.exceptions.CompoundNotFoundException; 029import org.biojava.nbio.core.sequence.AccessionID; 030import org.biojava.nbio.core.sequence.Strand; 031import org.biojava.nbio.core.sequence.io.template.SequenceParserInterface; 032import org.biojava.nbio.core.sequence.storage.SequenceAsStringHelper; 033import org.biojava.nbio.core.sequence.template.*; 034 035import java.io.BufferedReader; 036import java.io.File; 037import java.io.FileReader; 038import java.io.IOException; 039import java.util.ArrayList; 040import java.util.Iterator; 041import java.util.List; 042 043/** 044 * This class represents the storage container of a sequence stored in a fasta file where 045 * the initial parsing of the file we store the offset and length of the sequence. When a call 046 * is made to any method that needs sequence data then the file will be opened and the sequence 047 * loaded. This class could be improved by using the hints or a some algorithm that indicates 048 * the sequence data once loaded should stay loaded. Could keep track of the last time sequence 049 * data was loaded and then after X amount of time clear the contents to free up memory. 050 * 051 * 052 * @author Scooter Willis <willishf at gmail dot com> 053 * @param <C> 054 */ 055public class SequenceFileProxyLoader<C extends Compound> implements ProxySequenceReader<C> { 056 057 SequenceParserInterface sequenceParser; 058 private CompoundSet<C> compoundSet; 059 private List<C> parsedCompounds = new ArrayList<C>(); 060 File file; 061 long sequenceStartIndex = -1; 062 int sequenceLength = -1; 063 //private boolean initialized = false; 064 065 /** 066 * 067 * @param file The file where the sequence will be found 068 * @param sequenceParser The parser to use to load the sequence 069 * @param sequenceStartIndex The file offset to the start of the sequence 070 * @param sequenceLength The length of the sequence 071 * @param compoundSet 072 * @throws IOException if problems occur while reading the file 073 * @throws CompoundNotFoundException if a compound in the sequence can't be found in the given compoundSet 074 */ 075 public SequenceFileProxyLoader(File file, SequenceParserInterface sequenceParser, long sequenceStartIndex, int sequenceLength, CompoundSet<C> compoundSet) 076 throws IOException, CompoundNotFoundException { 077 this.sequenceParser = sequenceParser; 078 this.file = file; 079 this.sequenceStartIndex = sequenceStartIndex; 080 this.sequenceLength = sequenceLength; 081 setCompoundSet(compoundSet); 082 083 init(); 084 } 085 086 /** 087 * 088 * @param compoundSet 089 */ 090 @Override 091 public void setCompoundSet(CompoundSet<C> compoundSet) { 092 this.compoundSet = compoundSet; 093 } 094 095 /** 096 * Load the sequence 097 * @return 098 */ 099 private boolean init() throws IOException, CompoundNotFoundException { 100 101 BufferedReader br = new BufferedReader(new FileReader(file)); 102 br.skip(sequenceStartIndex); 103 String sequence = sequenceParser.getSequence(br, sequenceLength); 104 setContents(sequence); 105 br.close(); // close file to prevent too many being open 106 107 return true; 108 } 109 110 /** 111 * 112 * @param sequence 113 */ 114 @Override 115 public void setContents(String sequence) throws CompoundNotFoundException { 116 // Horrendously inefficient - pretty much the way the old BJ did things. 117 // TODO Should be optimised. 118 this.parsedCompounds.clear(); 119 for (int i = 0; i < sequence.length();) { 120 String compoundStr = null; 121 C compound = null; 122 for (int compoundStrLength = 1; compound == null && compoundStrLength <= compoundSet.getMaxSingleCompoundStringLength(); compoundStrLength++) { 123 compoundStr = sequence.substring(i, i + compoundStrLength); 124 compound = compoundSet.getCompoundForString(compoundStr); 125 } 126 if (compound == null) { 127 throw new CompoundNotFoundException("Compound "+compoundStr+" not found"); 128 } else { 129 i += compoundStr.length(); 130 } 131 this.parsedCompounds.add(compound); 132 } 133 134 } 135 136 /** 137 * 138 * @return 139 */ 140 @Override 141 public int getLength() { 142 return sequenceLength; 143 } 144 145 /** 146 * 147 * @param position 148 * @return 149 */ 150 @Override 151 public C getCompoundAt(int position) { 152 153 return this.parsedCompounds.get(position - 1); 154 } 155 156 /** 157 * 158 * @param compound 159 * @return 160 */ 161 @Override 162 public int getIndexOf(C compound) { 163 164 return this.parsedCompounds.indexOf(compound) + 1; 165 } 166 167 /** 168 * 169 * @param compound 170 * @return 171 */ 172 @Override 173 public int getLastIndexOf(C compound) { 174 175 return this.parsedCompounds.lastIndexOf(compound) + 1; 176 } 177 178 /** 179 * 180 * @return 181 */ 182 @Override 183 public String toString() { 184 185 return getSequenceAsString(); 186 } 187 188 /** 189 * 190 * @return 191 */ 192 @Override 193 public String getSequenceAsString() { 194 return getSequenceAsString(1, getLength(), Strand.POSITIVE); 195 } 196 197 /** 198 * 199 * @param bioBegin 200 * @param bioEnd 201 * @param strand 202 * @return 203 */ 204 public String getSequenceAsString(Integer bioBegin, Integer bioEnd, Strand strand) { 205 206 SequenceAsStringHelper<C> sequenceAsStringHelper = new SequenceAsStringHelper<C>(); 207 return sequenceAsStringHelper.getSequenceAsString(this.parsedCompounds, compoundSet, bioBegin, bioEnd, strand); 208 } 209 210 /** 211 * 212 * @return 213 */ 214 @Override 215 public List<C> getAsList() { 216 217 return this.parsedCompounds; 218 219 } 220 221 /** 222 * 223 * @param bioBegin 224 * @param bioEnd 225 * @return 226 */ 227 @Override 228 public SequenceView<C> getSubSequence(final Integer bioBegin, final Integer bioEnd) { 229 230 return new SequenceProxyView<C>(SequenceFileProxyLoader.this, bioBegin, bioEnd); 231 } 232 233 /** 234 * 235 * @return 236 */ 237 @Override 238 public Iterator<C> iterator() { 239 240 return this.parsedCompounds.iterator(); 241 } 242 243 /** 244 * 245 * @return 246 */ 247 @Override 248 public CompoundSet<C> getCompoundSet() { 249 return compoundSet; 250 } 251 252 /** 253 * 254 * @return 255 */ 256 @Override 257 public AccessionID getAccession() { 258 throw new UnsupportedOperationException("Not supported yet."); 259 } 260 261 /** 262 * 263 * @param compounds 264 * @return 265 */ 266 @Override 267 public int countCompounds(C... compounds) { 268 return SequenceMixin.countCompounds(this, compounds); 269 } 270 271 /** 272 * 273 * @return 274 */ 275 @Override 276 public SequenceView<C> getInverse() { 277 return SequenceMixin.inverse(this); 278 } 279}