/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 */
package org.biojava.nbio.structure.io;

import org.biojava.nbio.structure.ResidueNumber;
import org.biojava.nbio.structure.Structure;
import org.biojava.nbio.structure.StructureException;
import org.biojava.nbio.structure.align.util.AtomCache;
import org.biojava.nbio.core.sequence.ProteinSequence;
import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
import org.biojava.nbio.core.sequence.io.FastaReader;
import org.biojava.nbio.core.sequence.io.template.SequenceCreatorInterface;
import org.biojava.nbio.core.sequence.io.template.SequenceHeaderParserInterface;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.util.LinkedHashMap;
import java.util.Map;


/**
 * Reads a protein sequence from a fasta file and attempts to match it to a
 * 3D structure. Any gaps ('-') in the fasta file are preserved as null atoms in
 * the output, allowing structural alignments to be read from fasta files.
 *
 * <p>Structures are loaded from an AtomCache. For this to work, the accession
 * for each protein should be parsed from the fasta header line into a form
 * understood by {@link AtomCache#getStructure(String)}.
 *
 * <p>Lowercase letters are sometimes used to specify unaligned residues.
 * This information can be preserved by using a CasePreservingSequenceCreator,
 * which allows the case of residues to be accessed through the
 * {@link ProteinSequence#getUserCollection()} method.
 *
 * <p>Typical use: construct the parser, call {@link #process()} once, then
 * read the results through the getters. All four result arrays are parallel:
 * index {@code i} refers to the same fasta entry in each of them.
 *
 * @author Spencer Bliven
 *
 */
public class FastaStructureParser {

    // inputs
    private final FastaReader<ProteinSequence, AminoAcidCompound> reader;
    private final AtomCache cache;

    // cache processed data; all remain null until process() has completed
    private String[] accessions;
    private ProteinSequence[] sequences;
    private Structure[] structures;
    private ResidueNumber[][] residues;

    /**
     * Creates a parser that reads fasta entries from a stream.
     *
     * @param is stream containing the fasta data
     * @param headerParser parser used for each fasta header line
     * @param sequenceCreator factory used to build each sequence
     * @param cache source of structures, queried by accession
     */
    public FastaStructureParser(InputStream is,
            SequenceHeaderParserInterface<ProteinSequence, AminoAcidCompound> headerParser,
            SequenceCreatorInterface<AminoAcidCompound> sequenceCreator,
            AtomCache cache)
    {
        this(new FastaReader<ProteinSequence, AminoAcidCompound>(
                is, headerParser, sequenceCreator), cache);
    }

    /**
     * Creates a parser that reads fasta entries from a file.
     *
     * @param file file containing the fasta data
     * @param headerParser parser used for each fasta header line
     * @param sequenceCreator factory used to build each sequence
     * @param cache source of structures, queried by accession
     * @throws FileNotFoundException if the underlying reader cannot open the file
     */
    public FastaStructureParser(File file,
            SequenceHeaderParserInterface<ProteinSequence, AminoAcidCompound> headerParser,
            SequenceCreatorInterface<AminoAcidCompound> sequenceCreator,
            AtomCache cache) throws FileNotFoundException
    {
        this(new FastaReader<ProteinSequence, AminoAcidCompound>(
                file, headerParser, sequenceCreator), cache);
    }

    /**
     * Creates a parser on top of an already configured fasta reader.
     *
     * @param reader reader that supplies the protein sequences
     * @param cache source of structures, queried by accession
     */
    public FastaStructureParser(FastaReader<ProteinSequence, AminoAcidCompound> reader,
            AtomCache cache) {
        this.reader = reader;
        this.cache = cache;
        this.accessions = null;
        this.sequences = null;
        this.structures = null;
        this.residues = null;
    }


    /**
     * Parses the fasta file and loads it into memory.
     *
     * Information can be subsequently accessed through
     * {@link #getSequences()},
     * {@link #getStructures()},
     * {@link #getResidues()}, and
     * {@link #getAccessions()}.
     *
     * <p>Processing happens at most once; after a successful call, further
     * calls return immediately and the cached results are kept.
     *
     * <p>Results are published only after every entry has been handled. If an
     * exception is thrown part-way through, none of the result fields are
     * set, so the getters keep returning null instead of partially filled
     * arrays. A later call will then try again from the reader.
     * NOTE(review): whether the underlying reader can be read a second time
     * is not visible from this class; confirm before relying on a retry.
     *
     * @throws IOException if reading the fasta data or loading a structure fails
     * @throws StructureException propagated from the structure lookup or
     *         the sequence-to-structure matching
     */
    public void process() throws IOException, StructureException {
        if (sequences != null) {
            // only process once, then return cached values
            return;
        }

        Map<String, ProteinSequence> sequenceMap = reader.process();

        // Work on locals so that a failure below cannot leave this object
        // half-initialised while still looking "already processed".
        ProteinSequence[] parsedSequences = sequenceMap.values().toArray(new ProteinSequence[0]);
        int count = parsedSequences.length;
        String[] parsedAccessions = new String[count];
        Structure[] loadedStructures = new Structure[count];
        ResidueNumber[][] matchedResidues = new ResidueNumber[count][];

        // Match each sequence to a series of PDB Residue numbers
        for (int i = 0; i < count; i++) {
            parsedAccessions[i] = parsedSequences[i].getAccession().getID();

            loadedStructures[i] = cache.getStructure(parsedAccessions[i]);

            matchedResidues[i] = StructureSequenceMatcher.matchSequenceToStructure(
                    parsedSequences[i], loadedStructures[i]);

            assert( matchedResidues[i].length == parsedSequences[i].getLength());
        }

        // Publish everything together; 'sequences' goes last because it is
        // the field the "already processed" guard checks.
        this.accessions = parsedAccessions;
        this.structures = loadedStructures;
        this.residues = matchedResidues;
        this.sequences = parsedSequences;
    }


    /**
     * Gets the protein sequences read from the Fasta file.
     * Returns null if {@link #process()} has not completed successfully.
     * @return An array of ProteinSequences from
     *  parsing the fasta file, or null if process() hasn't completed.
     */
    public ProteinSequence[] getSequences() {
        return sequences;
    }

    /**
     * Gets the protein structures mapped from the Fasta file.
     * Returns null if {@link #process()} has not completed successfully.
     * @return An array of Structures for each protein
     *  in the fasta file, or null if process() hasn't completed.
     */
    public Structure[] getStructures() {
        return structures;
    }

    /**
     * For each residue in the fasta file, return the ResidueNumber in the
     * corresponding structure. If the residue cannot be found in the structure,
     * that entry will be null. This can happen if that residue was not included
     * in the PDB file (eg disordered residues), if the fasta sequence does not
     * match the PDB sequence, or if errors occur during the matching process.
     * @return A 2D array of ResidueNumbers, or null if process() hasn't completed.
     * @see StructureSequenceMatcher#matchSequenceToStructure(ProteinSequence, Structure)
     */
    public ResidueNumber[][] getResidues() {
        return residues;
    }

    /**
     * Gets the protein accessions mapped from the Fasta file.
     * Returns null if {@link #process()} has not completed successfully.
     * @return An array of accession IDs, one for each protein
     *  in the fasta file, or null if process() hasn't completed.
     */
    public String[] getAccessions() {
        return accessions;
    }
}