/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 * Created on 01-21-2010
 */
package org.biojava.nbio.core.sequence.io;

import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
import org.biojava.nbio.core.sequence.ProteinSequence;
import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
import org.biojava.nbio.core.sequence.io.template.SequenceCreatorInterface;
import org.biojava.nbio.core.sequence.io.template.SequenceParserInterface;
import org.biojava.nbio.core.sequence.loader.SequenceFileProxyLoader;
import org.biojava.nbio.core.sequence.template.AbstractSequence;
import org.biojava.nbio.core.sequence.template.CompoundSet;
import org.biojava.nbio.core.sequence.template.ProxySequenceReader;

import java.io.File;
import java.io.IOException;
import java.util.List;

/**
 * Example implementation of {@link SequenceCreatorInterface} that produces protein sequences
 * whose residues are loaded lazily from a file.
 * <p>
 * While a stream is being parsed, the parsed sequence and its offset index are handed to this
 * creator. Only the offset and the sequence length are passed on to a
 * {@link SequenceFileProxyLoader}; the intent is that very large fasta files can be indexed by
 * accession id while the actual sequence data is read later, on demand, to save memory. The index
 * is the file stream offset, so that when the sequence data of a {@link ProteinSequence} is
 * requested the proxy loader can open the file, seek to that offset and retrieve the sequence.
 * <p>
 * The same approach can be used for genome sequence data stored in a local fasta file, in a
 * database, or behind an http interface to a remote server.
 *
 * @author Scooter Willis &lt;willishf at gmail dot com&gt;
 */
public class FileProxyProteinSequenceCreator implements SequenceCreatorInterface<AminoAcidCompound> {

    /** Message used by the overloads this creator does not implement. */
    private static final String NOT_SUPPORTED_MESSAGE = "Not supported yet.";

    CompoundSet<AminoAcidCompound> compoundSet;
    File file;
    SequenceParserInterface sequenceParser;

    /**
     * Creates a sequence creator bound to a single file.
     * <p>
     * The {@link File} is kept so that it can be handed to the {@link SequenceFileProxyLoader}
     * of every sequence created here, which uses it for random file access.
     *
     * @param file           the file the created sequences will be read from
     * @param compoundSet    the amino acid compound set given to each created sequence
     * @param sequenceParser the parser handed to each proxy loader
     */
    public FileProxyProteinSequenceCreator(File file, CompoundSet<AminoAcidCompound> compoundSet,
            SequenceParserInterface sequenceParser) {
        this.file = file;
        this.sequenceParser = sequenceParser;
        this.compoundSet = compoundSet;
    }

    /**
     * Builds a {@link ProteinSequence} backed by a file proxy loader.
     * <p>
     * Although the full sequence text is passed in, only its length is used here, together with
     * the offset index into the file.
     *
     * @param sequence the parsed sequence text; only {@code sequence.length()} is used
     * @param index    the offset of the sequence within the file
     * @return a protein sequence that wraps a {@link SequenceFileProxyLoader} for this file
     * @throws CompoundNotFoundException declared by the underlying loader/sequence construction
     * @throws IOException               declared by the underlying loader/sequence construction
     */
    @Override
    public AbstractSequence<AminoAcidCompound> getSequence(String sequence, long index)
            throws CompoundNotFoundException, IOException {
        final int sequenceLength = sequence.length();
        final SequenceFileProxyLoader<AminoAcidCompound> lazyLoader =
                new SequenceFileProxyLoader<AminoAcidCompound>(
                        file, sequenceParser, index, sequenceLength, compoundSet);
        return new ProteinSequence(lazyLoader, compoundSet);
    }

    /**
     * Not supported yet. The same concept should be extensible to a remote URL call or a
     * database connection.
     *
     * @param proxyLoader ignored
     * @param index       ignored
     * @return never returns normally
     * @throws UnsupportedOperationException always
     */
    @Override
    public AbstractSequence<AminoAcidCompound> getSequence(
            ProxySequenceReader<AminoAcidCompound> proxyLoader, long index) {
        throw new UnsupportedOperationException(NOT_SUPPORTED_MESSAGE);
    }

    /**
     * Not supported; the use case for this overload is unclear.
     *
     * @param list ignored
     * @return never returns normally
     * @throws UnsupportedOperationException always
     */
    @Override
    public AbstractSequence<AminoAcidCompound> getSequence(List<AminoAcidCompound> list) {
        throw new UnsupportedOperationException(NOT_SUPPORTED_MESSAGE);
    }
}