001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 * Created on 01-21-2010
021 */
022package org.biojava.nbio.core.sequence.io;
023
024import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
025import org.biojava.nbio.core.sequence.ProteinSequence;
026import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
027import org.biojava.nbio.core.sequence.io.template.SequenceCreatorInterface;
028import org.biojava.nbio.core.sequence.io.template.SequenceParserInterface;
029import org.biojava.nbio.core.sequence.loader.SequenceFileProxyLoader;
030import org.biojava.nbio.core.sequence.template.AbstractSequence;
031import org.biojava.nbio.core.sequence.template.CompoundSet;
032import org.biojava.nbio.core.sequence.template.ProxySequenceReader;
033
034import java.io.File;
035import java.io.IOException;
036import java.util.List;
037
038/**
039 * This class is a good example of using the SequenceCreatorInterface where during parsing of the stream
040 * the sequence and the offset index are passed to create a Protein sequence that will be loaded in lazily.
041 * This way you can load very large fasta files and store accession id and delay loading the sequence to save
042 * memory. The index is the file stream offset so when a ProteinSequence has a call to getSequence() the
043 * SequenceFileProxyLoader will open the file and offset to the index and retrieve the sequence.
044 *
045 * Same approach can be used for genome sequence data stored in a local fasta file, in a database or via http
046 * interface to a remote server
047 *
048 * @author Scooter Willis <willishf at gmail dot com>
049 */
050public class FileProxyProteinSequenceCreator implements SequenceCreatorInterface<AminoAcidCompound> {
051
052        CompoundSet<AminoAcidCompound> compoundSet;
053        File file;
054        SequenceParserInterface sequenceParser;
055
056        /**
057         * Need File so that we can store full path name in SequenceFileProxyLoader for Random File access as a quick read
058         * @param fastaFile
059         * @param compoundSet
060         */
061        public FileProxyProteinSequenceCreator(File file, CompoundSet<AminoAcidCompound> compoundSet, SequenceParserInterface sequenceParser ) {
062                this.compoundSet = compoundSet;
063                this.file = file;
064                this.sequenceParser = sequenceParser;
065        }
066
067        /**
068         * Even though we are passing in the sequence we really only care about the length of the sequence and the offset
069         * index in the fasta file.
070         * @param sequence
071         * @param index
072         * @return
073         * @throws CompoundNotFoundException
074         * @throws IOException
075         */
076        @Override
077        public AbstractSequence<AminoAcidCompound> getSequence(String sequence, long index) throws CompoundNotFoundException, IOException {
078                SequenceFileProxyLoader<AminoAcidCompound> sequenceFileProxyLoader =
079                                new SequenceFileProxyLoader<AminoAcidCompound>(
080                                                file,
081                                                sequenceParser,
082                                                index,
083                                                sequence.length(),
084                                                compoundSet
085                                                );
086                return new ProteinSequence(sequenceFileProxyLoader, compoundSet);
087        }
088
089        /**
090         * Should be able to extend the same concept to a remote URL call or database connection. Not supported yet
091         * @param proxyLoader
092         * @param index
093         * @return
094         */
095        @Override
096        public AbstractSequence<AminoAcidCompound> getSequence(
097                        ProxySequenceReader<AminoAcidCompound> proxyLoader, long index) {
098                throw new UnsupportedOperationException("Not supported yet.");
099        }
100
101        /**
102         * Not sure of use case and currently not supported
103         * @param list
104         * @return
105         */
106        @Override
107        public AbstractSequence<AminoAcidCompound> getSequence(
108                        List<AminoAcidCompound> list) {
109                throw new UnsupportedOperationException("Not supported yet.");
110        }
111}