/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 * Created on 01-21-2010
 *
 * @author Richard Holland
 * @author Scooter Willis
 *
 */
package org.biojava.nbio.core.sequence.loader;

import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
import org.biojava.nbio.core.sequence.AccessionID;
import org.biojava.nbio.core.sequence.Strand;
import org.biojava.nbio.core.sequence.io.template.SequenceParserInterface;
import org.biojava.nbio.core.sequence.storage.SequenceAsStringHelper;
import org.biojava.nbio.core.sequence.template.*;
import org.biojava.nbio.core.util.Equals;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

/**
 * Storage container for a sequence that lives in a file (for example a FASTA file) and is
 * located by a start offset and a length recorded when the file was first parsed.
 * <p>
 * In the current implementation the constructor opens the file, skips to the recorded
 * offset, asks the supplied parser for the sequence text and converts it into compounds,
 * so the data is held in memory for the lifetime of this object. The class could be
 * improved by loading on demand, by using hints or some algorithm that indicates whether
 * the sequence data should stay loaded once read, or by tracking the last access time and
 * clearing the contents after a period of inactivity to free memory.
 *
 * @author Scooter Willis
 * @param <C> the compound type
 */
public class SequenceFileProxyLoader<C extends Compound> implements ProxySequenceReader<C> {

    SequenceParserInterface sequenceParser;
    private CompoundSet<C> compoundSet;
    private final List<C> parsedCompounds = new ArrayList<>();
    File file;
    long sequenceStartIndex = -1;
    int sequenceLength = -1;

    /**
     * Creates the loader and immediately reads the sequence from the file.
     *
     * @param file The file where the sequence will be found
     * @param sequenceParser The parser to use to load the sequence
     * @param sequenceStartIndex The file offset to the start of the sequence
     * @param sequenceLength The length of the sequence, passed on to the parser
     * @param compoundSet The compound set used to translate the sequence text into compounds
     * @throws IOException if problems occur while reading the file, including the file ending
     *         before {@code sequenceStartIndex} is reached
     * @throws CompoundNotFoundException if a compound in the sequence can't be found in the given compoundSet
     */
    public SequenceFileProxyLoader(File file, SequenceParserInterface sequenceParser, long sequenceStartIndex, int sequenceLength, CompoundSet<C> compoundSet)
            throws IOException, CompoundNotFoundException {
        this.sequenceParser = sequenceParser;
        this.file = file;
        this.sequenceStartIndex = sequenceStartIndex;
        this.sequenceLength = sequenceLength;
        setCompoundSet(compoundSet);

        init();
    }

    /**
     * Sets the compound set used by {@link #setContents(String)} and the string conversions.
     * Compounds that were already parsed are not re-parsed.
     *
     * @param compoundSet the compound set to use from now on
     */
    @Override
    public void setCompoundSet(CompoundSet<C> compoundSet) {
        this.compoundSet = compoundSet;
    }

    /**
     * Loads the sequence: opens the file, skips to {@code sequenceStartIndex}, lets the
     * parser read the sequence text and stores the resulting compounds.
     *
     * @return always {@code true}; failures are reported through exceptions
     * @throws IOException if the file cannot be read or ends before the start offset
     * @throws CompoundNotFoundException if the sequence text contains an unknown compound
     */
    private boolean init() throws IOException, CompoundNotFoundException {

        // NOTE(review): FileReader decodes with the platform default charset on older JDKs and
        // skip() counts characters, not bytes - presumably the offset was recorded in the same
        // unit; confirm against the code that produces sequenceStartIndex.
        try (BufferedReader br = new BufferedReader(new FileReader(file))) {
            // Reader.skip may skip fewer characters than requested, so verify that the whole
            // offset was consumed instead of silently parsing from the wrong position.
            long remaining = sequenceStartIndex - br.skip(sequenceStartIndex);
            while (remaining > 0) {
                long skipped = br.skip(remaining);
                if (skipped <= 0) {
                    throw new IOException("Reached end of file " + file
                            + " before sequence start offset " + sequenceStartIndex);
                }
                remaining -= skipped;
            }
            String sequence = sequenceParser.getSequence(br, sequenceLength);
            setContents(sequence);
        }

        return true;
    }

    /**
     * Replaces the stored compounds with the compounds parsed from the given text.
     * <p>
     * At each position the shortest prefix (from one character up to
     * {@code compoundSet.getMaxSingleCompoundStringLength()} characters, and never beyond the
     * end of the text) that the compound set recognises is taken as the next compound. If
     * parsing fails, the previously stored compounds are left untouched.
     *
     * @param sequence the sequence text to parse
     * @throws CompoundNotFoundException if no compound matches at some position of the text
     */
    @Override
    public void setContents(String sequence) throws CompoundNotFoundException {
        // Horrendously inefficient - pretty much the way the old BJ did things.
        // TODO Should be optimised.
        final int maxCompoundLength = compoundSet.getMaxSingleCompoundStringLength();
        final int sequenceEnd = sequence.length();
        // Parse into a scratch list so that a failure does not leave partial contents behind.
        final List<C> parsed = new ArrayList<>(sequenceEnd);
        for (int i = 0; i < sequenceEnd;) {
            String compoundStr = null;
            C compound = null;
            // The "i + length <= sequenceEnd" bound keeps substring() inside the text, so an
            // unknown trailing character yields CompoundNotFoundException rather than
            // StringIndexOutOfBoundsException.
            for (int length = 1;
                    compound == null && length <= maxCompoundLength && i + length <= sequenceEnd;
                    length++) {
                compoundStr = sequence.substring(i, i + length);
                compound = compoundSet.getCompoundForString(compoundStr);
            }
            if (compound == null) {
                throw new CompoundNotFoundException("Compound "+compoundStr+" not found");
            }
            i += compoundStr.length();
            parsed.add(compound);
        }
        // Keep the same list instance; only its contents are replaced.
        this.parsedCompounds.clear();
        this.parsedCompounds.addAll(parsed);
    }

    /**
     * Returns the number of compounds currently held. This is the size of the parsed compound
     * list, so it stays consistent with {@link #getCompoundAt(int)} and {@link #getAsList()}
     * even after {@link #setContents(String)} has replaced the data.
     *
     * @return the number of compounds in this sequence
     */
    @Override
    public int getLength() {
        return this.parsedCompounds.size();
    }

    /**
     * Returns the compound at the given 1-based position.
     *
     * @param position position in the sequence, where 1 is the first compound
     * @return the compound at that position
     */
    @Override
    public C getCompoundAt(int position) {

        return this.parsedCompounds.get(position - 1);
    }

    /**
     * Returns the 1-based position of the first occurrence of the compound.
     *
     * @param compound the compound to look for
     * @return the 1-based position of the first match, or 0 if the compound does not occur
     */
    @Override
    public int getIndexOf(C compound) {

        return this.parsedCompounds.indexOf(compound) + 1;
    }

    /**
     * Returns the 1-based position of the last occurrence of the compound.
     *
     * @param compound the compound to look for
     * @return the 1-based position of the last match, or 0 if the compound does not occur
     */
    @Override
    public int getLastIndexOf(C compound) {

        return this.parsedCompounds.lastIndexOf(compound) + 1;
    }

    /**
     * Returns the same text as {@link #getSequenceAsString()}.
     *
     * @return the sequence as a string
     */
    @Override
    public String toString() {

        return getSequenceAsString();
    }

    /**
     * Returns the whole sequence, from position 1 to {@link #getLength()}, on the positive
     * strand.
     *
     * @return the sequence as a string
     */
    @Override
    public String getSequenceAsString() {
        return getSequenceAsString(1, getLength(), Strand.POSITIVE);
    }

    /**
     * Returns part of the sequence as a string. The conversion is delegated to
     * {@link SequenceAsStringHelper}.
     *
     * @param bioBegin first position to include
     * @param bioEnd last position to include
     * @param strand the strand to report
     * @return the requested part of the sequence as a string
     */
    public String getSequenceAsString(Integer bioBegin, Integer bioEnd, Strand strand) {

        SequenceAsStringHelper<C> sequenceAsStringHelper = new SequenceAsStringHelper<>();
        return sequenceAsStringHelper.getSequenceAsString(this.parsedCompounds, compoundSet, bioBegin, bioEnd, strand);
    }

    /**
     * Returns the list that backs this loader. The list is not copied, so changes made to it
     * are visible through this object.
     *
     * @return the parsed compounds
     */
    @Override
    public List<C> getAsList() {

        return this.parsedCompounds;

    }

    /**
     * Two loaders are equal when {@link Equals#classEqual(Object, Object)} accepts the pair,
     * both report the same compound set instance, and their compounds match position by
     * position under {@code equalsIgnoreCase}.
     *
     * @param o the object to compare with
     * @return {@code true} if the objects are considered equal
     */
    @Override
    public boolean equals(Object o) {

        if (!Equals.classEqual(this, o)) {
            return false;
        }

        // The type argument cannot be checked at run time; elements are only read as Compound.
        @SuppressWarnings("unchecked")
        Sequence<C> other = (Sequence<C>) o;
        if (other.getCompoundSet() != getCompoundSet()) {
            return false;
        }

        List<C> rawCompounds = getAsList();
        List<C> otherCompounds = other.getAsList();

        if (rawCompounds.size() != otherCompounds.size()) {
            return false;
        }

        for (int i = 0; i < rawCompounds.size(); i++) {
            Compound myCompound = rawCompounds.get(i);
            Compound otherCompound = otherCompounds.get(i);
            if (!myCompound.equalsIgnoreCase(otherCompound)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Returns the hash code of the sequence string.
     *
     * @return the hash code of {@link #getSequenceAsString()}
     */
    @Override
    public int hashCode() {
        String s = getSequenceAsString();
        return s.hashCode();
    }

    /**
     * Returns a view onto part of this sequence.
     *
     * @param bioBegin first position of the view
     * @param bioEnd last position of the view
     * @return a {@link SequenceProxyView} over this loader for the given range
     */
    @Override
    public SequenceView<C> getSubSequence(final Integer bioBegin, final Integer bioEnd) {

        return new SequenceProxyView<>(SequenceFileProxyLoader.this, bioBegin, bioEnd);
    }

    /**
     * Returns an iterator over the parsed compounds, in sequence order.
     *
     * @return an iterator over the backing list
     */
    @Override
    public Iterator<C> iterator() {

        return this.parsedCompounds.iterator();
    }

    /**
     * Returns the compound set currently in use.
     *
     * @return the compound set
     */
    @Override
    public CompoundSet<C> getCompoundSet() {
        return compoundSet;
    }

    /**
     * Not supported by this loader.
     *
     * @return never returns normally
     * @throws UnsupportedOperationException always
     */
    @Override
    public AccessionID getAccession() {
        throw new UnsupportedOperationException("Not supported yet.");
    }

    /**
     * Counts the given compounds in this sequence by delegating to
     * {@link SequenceMixin#countCompounds}.
     *
     * @param compounds the compounds to count
     * @return the count reported by {@code SequenceMixin}
     */
    @Override
    public int countCompounds(C... compounds) {
        return SequenceMixin.countCompounds(this, compounds);
    }

    /**
     * Returns the inverse of this sequence by delegating to {@link SequenceMixin#inverse}.
     *
     * @return the inverse view
     */
    @Override
    public SequenceView<C> getInverse() {
        return SequenceMixin.inverse(this);
    }
}