001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021package org.biojava.nbio.structure.io; 022 023import org.biojava.nbio.structure.ResidueNumber; 024import org.biojava.nbio.structure.Structure; 025import org.biojava.nbio.structure.StructureException; 026import org.biojava.nbio.structure.align.util.AtomCache; 027import org.biojava.nbio.core.sequence.ProteinSequence; 028import org.biojava.nbio.core.sequence.compound.AminoAcidCompound; 029import org.biojava.nbio.core.sequence.io.FastaReader; 030import org.biojava.nbio.core.sequence.io.template.SequenceCreatorInterface; 031import org.biojava.nbio.core.sequence.io.template.SequenceHeaderParserInterface; 032 033import java.io.File; 034import java.io.FileNotFoundException; 035import java.io.IOException; 036import java.io.InputStream; 037import java.util.LinkedHashMap; 038 039 040/** 041 * Reads a protein sequence from a fasta file and attempts to match it to a 042 * 3D structure. Any gaps ('-') in the fasta file are preserved as null atoms in 043 * the output, allowing structural alignments to be read from fasta files. 044 * 045 * <p>Structures are loaded from an AtomCache. For this to work, the accession 046 * for each protein should be parsed from the fasta header line into a form 047 * understood by {@link AtomCache#getStructure(String)}. 048 * 049 * <p>Lowercase letters are sometimes used to specify unaligned residues. 050 * This information can be preserved by using a CasePreservingSequenceCreator, 051 * which allows the case of residues to be accessed through the 052 * {@link ProteinSequence#getUserCollection()} method. 053 * 054 * @author Spencer Bliven 055 * 056 */ 057public class FastaStructureParser { 058 059 // inputs 060 private FastaReader<ProteinSequence, AminoAcidCompound> reader; 061 private AtomCache cache; 062 063 // cache processed data 064 private String[] accessions; 065 private ProteinSequence[] sequences; 066 private Structure[] structures; 067 private ResidueNumber[][] residues; 068 069 public FastaStructureParser(InputStream is, 070 SequenceHeaderParserInterface<ProteinSequence, AminoAcidCompound> headerParser, 071 SequenceCreatorInterface<AminoAcidCompound> sequenceCreator, 072 AtomCache cache) 073 { 074 this(new FastaReader<ProteinSequence, AminoAcidCompound>( 075 is, headerParser, sequenceCreator),cache); 076 } 077 078 public FastaStructureParser(File file, 079 SequenceHeaderParserInterface<ProteinSequence, AminoAcidCompound> headerParser, 080 SequenceCreatorInterface<AminoAcidCompound> sequenceCreator, 081 AtomCache cache) throws FileNotFoundException 082 { 083 this(new FastaReader<ProteinSequence, AminoAcidCompound>( 084 file, headerParser, sequenceCreator), cache); 085 } 086 087 public FastaStructureParser(FastaReader<ProteinSequence, AminoAcidCompound> reader, 088 AtomCache cache) { 089 this.reader = reader; 090 this.cache = cache; 091 this.accessions = null; 092 this.sequences = null; 093 this.structures = null; 094 this.residues = null; 095 } 096 097 098 /** 099 * Parses the fasta file and loads it into memory. 100 * 101 * Information can be subsequently accessed through 102 * {@link #getSequences()}, 103 * {@link #getStructures()}, 104 * {@link #getResidues()}, and 105 * {@link #getAccessions()}. 106 * 107 * @throws IOException 108 * @throws StructureException 109 */ 110 public void process() throws IOException, StructureException { 111 if(sequences == null) { // only process once, then return cached values 112 LinkedHashMap<String, ProteinSequence> sequenceMap = reader.process(); 113 114 sequences = sequenceMap.values().toArray(new ProteinSequence[0]); 115 accessions = new String[sequences.length]; 116 structures = new Structure[sequences.length]; 117 residues = new ResidueNumber[sequences.length][]; 118 119 // Match each sequence to a series of PDB Residue numbers 120 for(int i=0;i<sequences.length;i++) { 121 accessions[i] = sequences[i].getAccession().getID(); 122 123 //System.out.println("Fetching "+accession); 124 structures[i] = cache.getStructure(accessions[i]); 125 126 residues[i] = StructureSequenceMatcher.matchSequenceToStructure(sequences[i], structures[i]); 127 128 assert( residues[i].length == sequences[i].getLength()); 129 } 130 } 131 } 132 133 134 /** 135 * Gets the protein sequences read from the Fasta file. 136 * Returns null if {@link #process()} has not been called. 137 * @return An array ProteinSequences from 138 * parsing the fasta file, or null if process() hasn't been called. 139 */ 140 public ProteinSequence[] getSequences() { 141 return sequences; 142 } 143 144 /** 145 * Gets the protein structures mapped from the Fasta file. 146 * Returns null if {@link #process()} has not been called. 147 * @return An array of Structures for each protein 148 * in the fasta file, or null if process() hasn't been called. 149 */ 150 public Structure[] getStructures() { 151 return structures; 152 } 153 154 /** 155 * For each residue in the fasta file, return the ResidueNumber in the 156 * corresponding structure. If the residue cannot be found in the structure, 157 * that entry will be null. This can happen if that residue was not included 158 * in the PDB file (eg disordered residues), if the fasta sequence does not 159 * match the PDB sequence, or if errors occur during the matching process. 160 * @return A 2D array of ResidueNumbers, or null if process() hasn't been called. 161 * @see StructureSequenceMatcher#matchSequenceToStructure(ProteinSequence, Structure) 162 */ 163 public ResidueNumber[][] getResidues() { 164 return residues; 165 } 166 167 /** 168 * Gets the protein accessions mapped from the Fasta file. 169 * Returns null if {@link #process()} has not been called. 170 * @return An array of Structures for each protein 171 * in the fasta file, or null if process() hasn't been called. 172 */ 173 public String[] getAccessions() { 174 return accessions; 175 } 176}