001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 * Created on 01-21-2010
021 */
022package org.biojava.nbio.core.sequence.io;
023
024import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
025import org.biojava.nbio.core.sequence.AccessionID;
026import org.biojava.nbio.core.sequence.DataSource;
027import org.biojava.nbio.core.sequence.ProteinSequence;
028import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
029import org.biojava.nbio.core.sequence.io.template.SequenceHeaderParserInterface;
030import org.biojava.nbio.core.sequence.template.AbstractSequence;
031import org.biojava.nbio.core.sequence.template.AbstractSequence.AnnotationType;
032import org.biojava.nbio.core.sequence.template.Compound;
033import org.slf4j.Logger;
034import org.slf4j.LoggerFactory;
035
036import java.util.ArrayList;
037
038/**
039 * The default fasta header parser where some headers are well defined based on the source
040 * database which allows us to set the source of the protein sequence and the identifier
041 * that can be used in future implementations to load features from external sources
042 *
043 * If the user has a custom header with local data then they can create their own implementation
044 * of a SequenceHeaderParserInterface.
045 *<pre>
046 * GenBank                           gi|gi-number|gb|accession|locus
047 * ENA Data Library                  gi|gi-number|emb|accession|locus
048 * DDBJ, DNA Database of Japan       gi|gi-number|dbj|accession|locus
049 * NBRF PIR                          pir||entry
050 * Protein Research Foundation       prf||name
051 * SWISS-PROT                        sp|accession|name
052 * Brookhaven Protein Data Bank (1)  pdb|entry|chain
053 * Brookhaven Protein Data Bank (2)  entry:chain|PDBID|CHAIN|SEQUENCE
054 * PDB EBI                           PDB:1ECY_A mol:protein length:142  ECOTIN
055 * Patents                           pat|country|number
056 * GenInfo Backbone Id               bbs|number
057 * General database identifier       gnl|database|identifier
058 * NCBI Reference Sequence           ref|accession|locus
059 * Local Sequence identifier         lcl|identifier
060 *</pre>
061 * @author Scooter Willis <willishf at gmail dot com>
062 */
063public class GenericFastaHeaderParser<S extends AbstractSequence<C>, C extends Compound> implements SequenceHeaderParserInterface<S,C> {
064
065        private final static Logger logger = LoggerFactory.getLogger(GenericFastaHeaderParser.class);
066
067        /**
068         * Parse out the components where some have a | and others do not
069         * @param header
070         * @return
071         */
072        private String[] getHeaderValues(String header) {
073                String[] data = new String[0];
074                ArrayList<String> values = new ArrayList<String>();
075                StringBuffer sb = new StringBuffer();
076                //commented out 1/11/2012 to resolve an issue where headers do contain a length= at the end that are not recognized
077                //if(header.indexOf("length=") != -1){
078                //    data = new String[1];
079                //    int index = header.indexOf("length=");
080                //    data[0] = header.substring(0, index).trim();
081        //        logger.debug("accession=" + data[0]);
082                //    return data;
083                //} else
084                 if (!header.startsWith("PDB:")) {
085                        for (int i = 0; i < header.length(); i++) {
086                                if (header.charAt(i) == '|') {
087                                        values.add(sb.toString());
088                                        sb.setLength(0);//faster than  = new StringBuffer();
089                                } else if (i == header.length() - 1) {
090                                        sb.append(header.charAt(i));
091                                        values.add(sb.toString());
092                                } else {
093                                        sb.append(header.charAt(i));
094                                }
095
096                                data = new String[values.size()];
097                                values.toArray(data);
098                        }
099                } else {
100                        data = header.split(" ");
101                }
102                return data;
103        }
104
105        /**
106         * Parse the header and set the values in the sequence
107         * @param header
108         * @param sequence
109         */
110        @Override
111        public void parseHeader(String header, S sequence) {
112                //uniptrot
113                // tr|Q0TET7|Q0TET7_ECOL5 Putative uncharacterized protein OS=Escherichia coli O6:K15:H31 (strain 536 / UPEC) GN=ECP_2553 PE=4 SV=1
114                sequence.setOriginalHeader(header);
115                String[] data = getHeaderValues(header);
116
117                if (data.length == 1) {
118                        sequence.setAccession(new AccessionID(data[0]));
119                } else  if (data[0].equalsIgnoreCase("sp") || data[0].equalsIgnoreCase("tr")) {
120                        if (data[0].equalsIgnoreCase("sp")) {
121                                sequence.setAnnotationType(AnnotationType.CURATED);
122                        } else {
123                                sequence.setAnnotationType(AnnotationType.PREDICTED);
124                        }
125
126                        sequence.setAccession(new AccessionID(data[1], DataSource.UNIPROT));
127                        if (data.length > 2) {
128                                sequence.setDescription(data[2]);
129                        }
130
131                } else if (data[0].equalsIgnoreCase("gi")) {
132                        DataSource giSource = DataSource.UNKNOWN;
133                        if (data.length >= 3) {
134                                if (data[2].equalsIgnoreCase("gb")) {
135                                        giSource = DataSource.GENBANK;
136                                } else if (data[2].equalsIgnoreCase("emb")) {
137                                        giSource = DataSource.ENA;
138                                } else if (data[2].equalsIgnoreCase("dbj")) {
139                                        giSource = DataSource.DDBJ;
140                                }
141                                sequence.setAccession(new AccessionID(data[3], giSource));
142                        } else {
143                                sequence.setAccession(new AccessionID(header, giSource));
144                        }
145                } else if (data[0].equalsIgnoreCase("pir")) {
146                        sequence.setAccession(new AccessionID(data[2], DataSource.NBRF));
147                } else if (data[0].equalsIgnoreCase("prf")) {
148                        sequence.setAccession(new AccessionID(data[2], DataSource.PRF));
149                } else if (data[0].equalsIgnoreCase("pdb")) {
150                        sequence.setAccession(new AccessionID(data[1] + ":" + data[2], DataSource.PDB1));
151                } else if (data[0].startsWith("PDB")) {
152                        String[] pdbe = data[0].split(" ");
153                        String[] pdbaccession = pdbe[0].split(":");
154                        sequence.setAccession(new AccessionID(pdbaccession[1], DataSource.PDBe));
155                } else if (data[0].indexOf(":") != -1 && data.length > 1 && data[1].equals("PDBID")) {
156                        sequence.setAccession(new AccessionID(data[0], DataSource.PDB2));
157                } else if (data[0].equalsIgnoreCase("pat")) {
158                        sequence.setAccession(new AccessionID(data[2], DataSource.PATENTS));
159                } else if (data[0].equalsIgnoreCase("bbs")) {
160                        sequence.setAccession(new AccessionID(data[1], DataSource.GENINFO));
161                } else if (data[0].equalsIgnoreCase("gnl")) {
162                        sequence.setAccession(new AccessionID(data[2], DataSource.GENERAL));
163                } else if (data[0].equalsIgnoreCase("ref")) {
164                        sequence.setAccession(new AccessionID(data[1], DataSource.NCBI));
165                } else if (data[0].equalsIgnoreCase("lcl")) {
166                        sequence.setAccession(new AccessionID(data[1], DataSource.LOCAL));
167                } else {
168                        sequence.setAccession(new AccessionID(data[0])); // avoid the common problem of picking up all the comments original header in getOriginalHeader
169                }
170
171
172        }
173
174        
175}