001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021package org.biojava.nbio.structure.io;
022
023import org.biojava.nbio.structure.ResidueNumber;
024import org.biojava.nbio.structure.Structure;
025import org.biojava.nbio.structure.StructureException;
026import org.biojava.nbio.structure.align.util.AtomCache;
027import org.biojava.nbio.core.sequence.ProteinSequence;
028import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
029import org.biojava.nbio.core.sequence.io.FastaReader;
030import org.biojava.nbio.core.sequence.io.template.SequenceCreatorInterface;
031import org.biojava.nbio.core.sequence.io.template.SequenceHeaderParserInterface;
032
033import java.io.File;
034import java.io.FileNotFoundException;
035import java.io.IOException;
036import java.io.InputStream;
037import java.util.LinkedHashMap;
038import java.util.Map;
039
040
041/**
042 * Reads a protein sequence from a fasta file and attempts to match it to a
043 * 3D structure. Any gaps ('-') in the fasta file are preserved as null atoms in
044 * the output, allowing structural alignments to be read from fasta files.
045 *
046 * <p>Structures are loaded from an AtomCache. For this to work, the accession
047 * for each protein should be parsed from the fasta header line into a form
048 * understood by {@link AtomCache#getStructure(String)}.
049 *
050 * <p>Lowercase letters are sometimes used to specify unaligned residues.
051 * This information can be preserved by using a CasePreservingSequenceCreator,
052 * which allows the case of residues to be accessed through the
053 * {@link ProteinSequence#getUserCollection()} method.
054 *
055 * @author Spencer Bliven
056 *
057 */
058public class FastaStructureParser {
059
060        // inputs
061        private FastaReader<ProteinSequence, AminoAcidCompound> reader;
062        private AtomCache cache;
063
064        // cache processed data
065        private String[] accessions;
066        private ProteinSequence[] sequences;
067        private Structure[] structures;
068        private ResidueNumber[][] residues;
069
070        public FastaStructureParser(InputStream is,
071                        SequenceHeaderParserInterface<ProteinSequence, AminoAcidCompound> headerParser,
072                        SequenceCreatorInterface<AminoAcidCompound> sequenceCreator,
073                        AtomCache cache)
074        {
075                this(new FastaReader<ProteinSequence, AminoAcidCompound>(
076                                is, headerParser, sequenceCreator),cache);
077        }
078
079        public FastaStructureParser(File file,
080                        SequenceHeaderParserInterface<ProteinSequence, AminoAcidCompound> headerParser,
081                        SequenceCreatorInterface<AminoAcidCompound> sequenceCreator,
082                        AtomCache cache) throws FileNotFoundException
083        {
084                this(new FastaReader<ProteinSequence, AminoAcidCompound>(
085                                file, headerParser, sequenceCreator), cache);
086        }
087
088        public FastaStructureParser(FastaReader<ProteinSequence, AminoAcidCompound> reader,
089                        AtomCache cache) {
090                this.reader = reader;
091                this.cache = cache;
092                this.accessions = null;
093                this.sequences = null;
094                this.structures = null;
095                this.residues = null;
096        }
097
098
099        /**
100         * Parses the fasta file and loads it into memory.
101         *
102         * Information can be subsequently accessed through
103         * {@link #getSequences()},
104         * {@link #getStructures()},
105         * {@link #getResidues()}, and
106         * {@link #getAccessions()}.
107         *
108         * @throws IOException
109         * @throws StructureException
110         */
111        public void process() throws IOException, StructureException {
112                if(sequences == null) { // only process once, then return cached values
113                        Map<String, ProteinSequence> sequenceMap = reader.process();
114
115                        sequences = sequenceMap.values().toArray(new ProteinSequence[0]);
116                        accessions = new String[sequences.length];
117                        structures = new Structure[sequences.length];
118                        residues = new ResidueNumber[sequences.length][];
119
120                        // Match each sequence  to a series of PDB Residue numbers
121                        for(int i=0;i<sequences.length;i++) {
122                                accessions[i] = sequences[i].getAccession().getID();
123
124                                //System.out.println("Fetching "+accession);
125                                structures[i] = cache.getStructure(accessions[i]);
126
127                                residues[i] = StructureSequenceMatcher.matchSequenceToStructure(sequences[i], structures[i]);
128
129                                assert( residues[i].length == sequences[i].getLength());
130                        }
131                }
132        }
133
134
135        /**
136         * Gets the protein sequences read from the Fasta file.
137         * Returns null if {@link #process()} has not been called.
138         * @return An array ProteinSequences from
139         *  parsing the fasta file, or null if process() hasn't been called.
140         */
141        public ProteinSequence[] getSequences() {
142                return sequences;
143        }
144
145        /**
146         * Gets the protein structures mapped from the Fasta file.
147         * Returns null if {@link #process()} has not been called.
148         * @return An array of Structures for each protein
149         *  in the fasta file, or null if process() hasn't been called.
150         */
151        public Structure[] getStructures() {
152                return structures;
153        }
154
155        /**
156         * For each residue in the fasta file, return the ResidueNumber in the
157         * corresponding structure. If the residue cannot be found in the structure,
158         * that entry will be null. This can happen if that residue was not included
159         * in the PDB file (eg disordered residues), if the fasta sequence does not
160         * match the PDB sequence, or if errors occur during the matching process.
161         * @return A 2D array of ResidueNumbers, or null if process() hasn't been called.
162         * @see StructureSequenceMatcher#matchSequenceToStructure(ProteinSequence, Structure)
163         */
164        public ResidueNumber[][] getResidues() {
165                return residues;
166        }
167
168        /**
169         * Gets the protein accessions mapped from the Fasta file.
170         * Returns null if {@link #process()} has not been called.
171         * @return An array of Structures for each protein
172         *  in the fasta file, or null if process() hasn't been called.
173         */
174        public String[] getAccessions() {
175                return accessions;
176        }
177}