001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021package org.biojava.nbio.structure.io;
022
023import org.biojava.nbio.structure.ResidueNumber;
024import org.biojava.nbio.structure.Structure;
025import org.biojava.nbio.structure.StructureException;
026import org.biojava.nbio.structure.align.util.AtomCache;
027import org.biojava.nbio.core.sequence.ProteinSequence;
028import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
029import org.biojava.nbio.core.sequence.io.FastaReader;
030import org.biojava.nbio.core.sequence.io.template.SequenceCreatorInterface;
031import org.biojava.nbio.core.sequence.io.template.SequenceHeaderParserInterface;
032
033import java.io.File;
034import java.io.FileNotFoundException;
035import java.io.IOException;
036import java.io.InputStream;
037import java.util.LinkedHashMap;
038
039
040/**
041 * Reads a protein sequence from a fasta file and attempts to match it to a
042 * 3D structure. Any gaps ('-') in the fasta file are preserved as null atoms in
043 * the output, allowing structural alignments to be read from fasta files.
044 *
045 * <p>Structures are loaded from an AtomCache. For this to work, the accession
046 * for each protein should be parsed from the fasta header line into a form
047 * understood by {@link AtomCache#getStructure(String)}.
048 *
049 * <p>Lowercase letters are sometimes used to specify unaligned residues.
050 * This information can be preserved by using a CasePreservingSequenceCreator,
051 * which allows the case of residues to be accessed through the
052 * {@link ProteinSequence#getUserCollection()} method.
053 *
054 * @author Spencer Bliven
055 *
056 */
057public class FastaStructureParser {
058
059        // inputs
060        private FastaReader<ProteinSequence, AminoAcidCompound> reader;
061        private AtomCache cache;
062
063        // cache processed data
064        private String[] accessions;
065        private ProteinSequence[] sequences;
066        private Structure[] structures;
067        private ResidueNumber[][] residues;
068
069        public FastaStructureParser(InputStream is,
070                        SequenceHeaderParserInterface<ProteinSequence, AminoAcidCompound> headerParser,
071                        SequenceCreatorInterface<AminoAcidCompound> sequenceCreator,
072                        AtomCache cache)
073        {
074                this(new FastaReader<ProteinSequence, AminoAcidCompound>(
075                                is, headerParser, sequenceCreator),cache);
076        }
077
078        public FastaStructureParser(File file,
079                        SequenceHeaderParserInterface<ProteinSequence, AminoAcidCompound> headerParser,
080                        SequenceCreatorInterface<AminoAcidCompound> sequenceCreator,
081                        AtomCache cache) throws FileNotFoundException
082        {
083                this(new FastaReader<ProteinSequence, AminoAcidCompound>(
084                                file, headerParser, sequenceCreator), cache);
085        }
086
087        public FastaStructureParser(FastaReader<ProteinSequence, AminoAcidCompound> reader,
088                        AtomCache cache) {
089                this.reader = reader;
090                this.cache = cache;
091                this.accessions = null;
092                this.sequences = null;
093                this.structures = null;
094                this.residues = null;
095        }
096
097
098        /**
099         * Parses the fasta file and loads it into memory.
100         *
101         * Information can be subsequently accessed through
102         * {@link #getSequences()},
103         * {@link #getStructures()},
104         * {@link #getResidues()}, and
105         * {@link #getAccessions()}.
106         *
107         * @throws IOException
108         * @throws StructureException
109         */
110        public void process() throws IOException, StructureException {
111                if(sequences == null) { // only process once, then return cached values
112                        LinkedHashMap<String, ProteinSequence> sequenceMap = reader.process();
113
114                        sequences = sequenceMap.values().toArray(new ProteinSequence[0]);
115                        accessions = new String[sequences.length];
116                        structures = new Structure[sequences.length];
117                        residues = new ResidueNumber[sequences.length][];
118
119                        // Match each sequence  to a series of PDB Residue numbers
120                        for(int i=0;i<sequences.length;i++) {
121                                accessions[i] = sequences[i].getAccession().getID();
122
123                                //System.out.println("Fetching "+accession);
124                                structures[i] = cache.getStructure(accessions[i]);
125
126                                residues[i] = StructureSequenceMatcher.matchSequenceToStructure(sequences[i], structures[i]);
127
128                                assert( residues[i].length == sequences[i].getLength());
129                        }
130                }
131        }
132
133
134        /**
135         * Gets the protein sequences read from the Fasta file.
136         * Returns null if {@link #process()} has not been called.
137         * @return An array ProteinSequences from
138         *  parsing the fasta file, or null if process() hasn't been called.
139         */
140        public ProteinSequence[] getSequences() {
141                return sequences;
142        }
143
144        /**
145         * Gets the protein structures mapped from the Fasta file.
146         * Returns null if {@link #process()} has not been called.
147         * @return An array of Structures for each protein
148         *  in the fasta file, or null if process() hasn't been called.
149         */
150        public Structure[] getStructures() {
151                return structures;
152        }
153
154        /**
155         * For each residue in the fasta file, return the ResidueNumber in the
156         * corresponding structure. If the residue cannot be found in the structure,
157         * that entry will be null. This can happen if that residue was not included
158         * in the PDB file (eg disordered residues), if the fasta sequence does not
159         * match the PDB sequence, or if errors occur during the matching process.
160         * @return A 2D array of ResidueNumbers, or null if process() hasn't been called.
161         * @see StructureSequenceMatcher#matchSequenceToStructure(ProteinSequence, Structure)
162         */
163        public ResidueNumber[][] getResidues() {
164                return residues;
165        }
166
167        /**
168         * Gets the protein accessions mapped from the Fasta file.
169         * Returns null if {@link #process()} has not been called.
170         * @return An array of Structures for each protein
171         *  in the fasta file, or null if process() hasn't been called.
172         */
173        public String[] getAccessions() {
174                return accessions;
175        }
176}