001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 * Created on 01-21-2010
021 */
022package org.biojava.nbio.core.sequence.io;
023
024import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
025import org.biojava.nbio.core.sequence.AccessionID;
026import org.biojava.nbio.core.sequence.DataSource;
027import org.biojava.nbio.core.sequence.ProteinSequence;
028import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
029import org.biojava.nbio.core.sequence.io.template.SequenceHeaderParserInterface;
030import org.biojava.nbio.core.sequence.template.AbstractSequence;
031import org.biojava.nbio.core.sequence.template.AbstractSequence.AnnotationType;
032import org.biojava.nbio.core.sequence.template.Compound;
033import org.slf4j.Logger;
034import org.slf4j.LoggerFactory;
035
036import java.util.ArrayList;
037
038/**
039 * The default fasta header parser where some headers are well defined based on the source
040 * database which allows us to set the source of the protein sequence and the identifier
041 * that can be used in future implementations to load features from external sources
042 *
043 * If the user has a custom header with local data then they can create their own implementation
044 * of a FastaHeaderParserInterface
045 *<pre>
046 * GenBank                           gi|gi-number|gb|accession|locus
047 * ENA Data Library                  gi|gi-number|emb|accession|locus
048 * DDBJ, DNA Database of Japan       gi|gi-number|dbj|accession|locus
049 * NBRF PIR                          pir||entry
050 * Protein Research Foundation       prf||name
051 * SWISS-PROT                        sp|accession|name
052 * Brookhaven Protein Data Bank (1)  pdb|entry|chain
053 * Brookhaven Protein Data Bank (2)  entry:chain|PDBID|CHAIN|SEQUENCE
054 * PDB EBI                           PDB:1ECY_A mol:protein length:142  ECOTIN
055 * Patents                           pat|country|number
056 * GenInfo Backbone Id               bbs|number
057 * General database identifier       gnl|database|identifier
058 * NCBI Reference Sequence           ref|accession|locus
059 * Local Sequence identifier         lcl|identifier
060 *</pre>
061 * @author Scooter Willis <willishf at gmail dot com>
062 */
063public class GenericFastaHeaderParser<S extends AbstractSequence<C>, C extends Compound> implements SequenceHeaderParserInterface<S,C> {
064
065        private final static Logger logger = LoggerFactory.getLogger(GenericFastaHeaderParser.class);
066
067        /**
068         * Parse out the components where some have a | and others do not
069         * @param header
070         * @return
071         */
072        private String[] getHeaderValues(String header) {
073                String[] data = new String[0];
074                ArrayList<String> values = new ArrayList<String>();
075                StringBuffer sb = new StringBuffer();
076                //commented out 1/11/2012 to resolve an issue where headers do contain a length= at the end that are not recognized
077                //if(header.indexOf("length=") != -1){
078                //    data = new String[1];
079                //    int index = header.indexOf("length=");
080                //    data[0] = header.substring(0, index).trim();
081        //        logger.debug("accession=" + data[0]);
082                //    return data;
083                //} else
084                 if (!header.startsWith("PDB:")) {
085                        for (int i = 0; i < header.length(); i++) {
086                                if (header.charAt(i) == '|') {
087                                        values.add(sb.toString());
088                                        sb.setLength(0);//faster than  = new StringBuffer();
089                                } else if (i == header.length() - 1) {
090                                        sb.append(header.charAt(i));
091                                        values.add(sb.toString());
092                                } else {
093                                        sb.append(header.charAt(i));
094                                }
095
096                                data = new String[values.size()];
097                                values.toArray(data);
098                        }
099                } else {
100                        data = header.split(" ");
101                }
102                return data;
103        }
104
105        /**
106         * Parse the header and set the values in the sequence
107         * @param header
108         * @param sequence
109         */
110        @Override
111        public void parseHeader(String header, S sequence) {
112                //uniptrot
113                // tr|Q0TET7|Q0TET7_ECOL5 Putative uncharacterized protein OS=Escherichia coli O6:K15:H31 (strain 536 / UPEC) GN=ECP_2553 PE=4 SV=1
114                sequence.setOriginalHeader(header);
115                String[] data = getHeaderValues(header);
116
117                if (data.length == 1) {
118                        sequence.setAccession(new AccessionID(data[0]));
119                } else  if (data[0].equalsIgnoreCase("sp") || data[0].equalsIgnoreCase("tr")) {
120                        if (data[0].equalsIgnoreCase("sp")) {
121                                sequence.setAnnotationType(AnnotationType.CURATED);
122                        } else {
123                                sequence.setAnnotationType(AnnotationType.PREDICTED);
124                        }
125
126                        sequence.setAccession(new AccessionID(data[1], DataSource.UNIPROT));
127                        if (data.length > 2) {
128                                sequence.setDescription(data[2]);
129                        }
130
131                } else if (data[0].equalsIgnoreCase("gi")) {
132                        DataSource giSource = DataSource.UNKNOWN;
133                        if (data.length >= 3) {
134                                if (data[2].equalsIgnoreCase("gb")) {
135                                        giSource = DataSource.GENBANK;
136                                } else if (data[2].equalsIgnoreCase("emb")) {
137                                        giSource = DataSource.ENA;
138                                } else if (data[2].equalsIgnoreCase("dbj")) {
139                                        giSource = DataSource.DDBJ;
140                                }
141                                sequence.setAccession(new AccessionID(data[3], giSource));
142                        } else {
143                                sequence.setAccession(new AccessionID(header, giSource));
144                        }
145                } else if (data[0].equalsIgnoreCase("pir")) {
146                        sequence.setAccession(new AccessionID(data[2], DataSource.NBRF));
147                } else if (data[0].equalsIgnoreCase("prf")) {
148                        sequence.setAccession(new AccessionID(data[2], DataSource.PRF));
149                } else if (data[0].equalsIgnoreCase("pdb")) {
150                        sequence.setAccession(new AccessionID(data[1] + ":" + data[2], DataSource.PDB1));
151                } else if (data[0].startsWith("PDB")) {
152                        String[] pdbe = data[0].split(" ");
153                        String[] pdbaccession = pdbe[0].split(":");
154                        sequence.setAccession(new AccessionID(pdbaccession[1], DataSource.PDBe));
155                } else if (data[0].indexOf(":") != -1 && data.length > 1 && data[1].equals("PDBID")) {
156                        sequence.setAccession(new AccessionID(data[0], DataSource.PDB2));
157                } else if (data[0].equalsIgnoreCase("pat")) {
158                        sequence.setAccession(new AccessionID(data[2], DataSource.PATENTS));
159                } else if (data[0].equalsIgnoreCase("bbs")) {
160                        sequence.setAccession(new AccessionID(data[1], DataSource.GENINFO));
161                } else if (data[0].equalsIgnoreCase("gnl")) {
162                        sequence.setAccession(new AccessionID(data[2], DataSource.GENERAL));
163                } else if (data[0].equalsIgnoreCase("ref")) {
164                        sequence.setAccession(new AccessionID(data[1], DataSource.NCBI));
165                } else if (data[0].equalsIgnoreCase("lcl")) {
166                        sequence.setAccession(new AccessionID(data[1], DataSource.LOCAL));
167                } else {
168                        sequence.setAccession(new AccessionID(data[0])); // avoid the common problem of picking up all the comments original header in getOriginalHeader
169                }
170
171
172        }
173
174        /**
175         *
176         * @param args
177         */
178        public static void main(String[] args) {
179
180                logger.info("parseHeader");
181                String header = "";
182                ProteinSequence sequence = null;
183                try {
184                        sequence = new ProteinSequence("");
185                } catch (CompoundNotFoundException e) {
186                        // this should not happen, in case it does we log error
187                        logger.error("Could not create empty protein sequence. Error: {}. This is most likely a bug.",e.getMessage());
188                }
189                GenericFastaHeaderParser<ProteinSequence,AminoAcidCompound> instance =
190                  new GenericFastaHeaderParser<ProteinSequence,AminoAcidCompound>();
191
192                header = "gi|gi-number|gb|accession|locus";
193                instance.parseHeader(header, sequence);
194                logger.info("accession = {}", sequence.getAccession());
195                logger.info("Data source: {} = {}", sequence.getAccession().getDataSource(), DataSource.GENBANK);
196
197                header = "gi|gi-number|emb|accession|locus";
198                instance.parseHeader(header, sequence);
199                logger.info("accession = {}", sequence.getAccession());
200                logger.info("Data source: {} = {}", sequence.getAccession().getDataSource(), DataSource.ENA);
201
202                header = "gi|gi-number|dbj|accession|locus";
203                instance.parseHeader(header, sequence);
204                logger.info("accession = {}", sequence.getAccession());
205                logger.info("Data source: {} = {}", sequence.getAccession().getDataSource(), DataSource.DDBJ);
206
207                header = "pir||entry";
208                instance.parseHeader(header, sequence);
209                logger.info("entry = {}", sequence.getAccession());
210                logger.info("Data source: {} = {}", sequence.getAccession().getDataSource(), DataSource.NBRF);
211
212                header = "prf||name";
213                instance.parseHeader(header, sequence);
214                logger.info("name = {}", sequence.getAccession());
215                logger.info("Data source: {}", sequence.getAccession().getDataSource(), DataSource.PRF);
216
217                header = "sp|accession|name";
218                instance.parseHeader(header, sequence);
219                logger.info("accession = ", sequence.getAccession());
220                logger.info("Data source: {} = {}", sequence.getAccession().getDataSource(), DataSource.UNIPROT);
221
222                header = "pdb|entry|chain";
223                instance.parseHeader(header, sequence);
224                logger.info("entry:chain = ", sequence.getAccession());
225                logger.info("Data source: {} = {}", sequence.getAccession().getDataSource(), DataSource.PDB1);
226
227                header = "entry:chain|PDBID|CHAIN|SEQUENCE";
228                instance.parseHeader(header, sequence);
229                logger.info("entry:chain = {}", sequence.getAccession());
230                logger.info("Data source: {} = {}", sequence.getAccession().getDataSource(), DataSource.PDB2);
231
232                header = "PDB:1ECY_A mol:protein length:142  ECOTIN";
233                instance.parseHeader(header, sequence);
234                logger.info("1ECY_A = {}", sequence.getAccession());
235                logger.info("Data source: {} = {}", sequence.getAccession().getDataSource(), DataSource.PDBe);
236
237                header = "pat|country|number";
238                instance.parseHeader(header, sequence);
239                logger.info("number = {}", sequence.getAccession());
240                logger.info("Data source: {}", sequence.getAccession().getDataSource(), DataSource.PATENTS);
241
242                header = "bbs|number";
243                instance.parseHeader(header, sequence);
244                logger.info("number = {}", sequence.getAccession());
245                logger.info("Data source: {} = {}", sequence.getAccession().getDataSource(), DataSource.GENINFO);
246
247                header = "gnl|database|identifier";
248                instance.parseHeader(header, sequence);
249                logger.info("identifier = {}", sequence.getAccession());
250                logger.info("Data source: {} = {}", sequence.getAccession().getDataSource(), DataSource.GENERAL);
251
252                header = "ref|accession|locus";
253                instance.parseHeader(header, sequence);
254                logger.info("accession = {}", sequence.getAccession());
255                logger.info("Data source: {} = {}", sequence.getAccession().getDataSource(), DataSource.NCBI);
256
257                header = "lcl|identifier";
258                instance.parseHeader(header, sequence);
259                logger.info("identifier = {}", sequence.getAccession());
260                logger.info("Data source: {} = {}", sequence.getAccession().getDataSource(), DataSource.LOCAL);
261        }
262}