001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 * Created on 01-21-2010
021 */
022package org.biojava.nbio.core.sequence.io;
023
024import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
025import org.biojava.nbio.core.sequence.AccessionID;
026import org.biojava.nbio.core.sequence.DataSource;
027import org.biojava.nbio.core.sequence.ProteinSequence;
028import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
029import org.biojava.nbio.core.sequence.io.template.SequenceHeaderParserInterface;
030import org.biojava.nbio.core.sequence.template.AbstractSequence;
031import org.biojava.nbio.core.sequence.template.AbstractSequence.AnnotationType;
032import org.biojava.nbio.core.sequence.template.Compound;
033import org.slf4j.Logger;
034import org.slf4j.LoggerFactory;
035
036import java.util.ArrayList;
037import java.util.List;
038
039/**
040 * The default fasta header parser where some headers are well defined based on the source
041 * database which allows us to set the source of the protein sequence and the identifier
042 * that can be used in future implementations to load features from external sources
043 *
044 * If the user has a custom header with local data then they can create their own implementation
045 * of a FastaHeaderParserInterface
046 *<pre>
047 * GenBank                           gi|gi-number|gb|accession|locus
048 * ENA Data Library                  gi|gi-number|emb|accession|locus
049 * DDBJ, DNA Database of Japan       gi|gi-number|dbj|accession|locus
050 * NBRF PIR                          pir||entry
051 * Protein Research Foundation       prf||name
052 * SWISS-PROT                        sp|accession|name
053 * Brookhaven Protein Data Bank (1)  pdb|entry|chain
054 * Brookhaven Protein Data Bank (2)  entry:chain|PDBID|CHAIN|SEQUENCE
055 * PDB EBI                           PDB:1ECY_A mol:protein length:142  ECOTIN
056 * Patents                           pat|country|number
057 * GenInfo Backbone Id               bbs|number
058 * General database identifier       gnl|database|identifier
059 * NCBI Reference Sequence           ref|accession|locus
060 * Local Sequence identifier         lcl|identifier
061 *</pre>
062 * @author Scooter Willis 
063 */
064public class GenericFastaHeaderParser<S extends AbstractSequence<C>, C extends Compound> implements SequenceHeaderParserInterface<S,C> {
065
066        private final static Logger logger = LoggerFactory.getLogger(GenericFastaHeaderParser.class);
067
068        /**
069         * Parse out the components where some have a | and others do not
070         * @param header
071         * @return
072         */
073        private String[] getHeaderValues(String header) {
074                String[] data = new String[0];
075                List<String> values = new ArrayList<>();
076                StringBuffer sb = new StringBuffer();
077                //commented out 1/11/2012 to resolve an issue where headers do contain a length= at the end that are not recognized
078                //if(header.indexOf("length=") != -1){
079                //    data = new String[1];
080                //    int index = header.indexOf("length=");
081                //    data[0] = header.substring(0, index).trim();
082        //        logger.debug("accession=" + data[0]);
083                //    return data;
084                //} else
085                 if (!header.startsWith("PDB:")) {
086                        for (int i = 0; i < header.length(); i++) {
087                                if (header.charAt(i) == '|') {
088                                        values.add(sb.toString());
089                                        sb.setLength(0);//faster than  = new StringBuffer();
090                                } else if (i == header.length() - 1) {
091                                        sb.append(header.charAt(i));
092                                        values.add(sb.toString());
093                                } else {
094                                        sb.append(header.charAt(i));
095                                }
096
097                        }
098                        data = new String[values.size()];
099                        values.toArray(data);
100                } else {
101                        data = header.split(" ");
102                }
103                return data;
104        }
105
106        /**
107         * Parse the header and set the values in the sequence
108         * @param header
109         * @param sequence
110         */
111        @Override
112        public void parseHeader(String header, S sequence) {
113                //uniptrot
114                // tr|Q0TET7|Q0TET7_ECOL5 Putative uncharacterized protein OS=Escherichia coli O6:K15:H31 (strain 536 / UPEC) GN=ECP_2553 PE=4 SV=1
115                sequence.setOriginalHeader(header);
116                String[] data = getHeaderValues(header);
117
118                if (data.length == 1) {
119                        sequence.setAccession(new AccessionID(data[0]));
120                } else  if ("sp".equalsIgnoreCase(data[0]) || "tr".equalsIgnoreCase(data[0])) {
121                        if ("sp".equalsIgnoreCase(data[0])) {
122                                sequence.setAnnotationType(AnnotationType.CURATED);
123                        } else {
124                                sequence.setAnnotationType(AnnotationType.PREDICTED);
125                        }
126
127                        sequence.setAccession(new AccessionID(data[1], DataSource.UNIPROT));
128                        if (data.length > 2) {
129                                sequence.setDescription(data[2]);
130                        }
131
132                } else if ("gi".equalsIgnoreCase(data[0])) {
133                        DataSource giSource = DataSource.UNKNOWN;
134                        if (data.length >= 3) {
135                                if ("gb".equalsIgnoreCase(data[2])) {
136                                        giSource = DataSource.GENBANK;
137                                } else if ("emb".equalsIgnoreCase(data[2])) {
138                                        giSource = DataSource.ENA;
139                                } else if ("dbj".equalsIgnoreCase(data[2])) {
140                                        giSource = DataSource.DDBJ;
141                                }
142                                sequence.setAccession(new AccessionID(data[3], giSource));
143                        } else {
144                                sequence.setAccession(new AccessionID(header, giSource));
145                        }
146                } else if ("pir".equalsIgnoreCase(data[0])) {
147                        sequence.setAccession(new AccessionID(data[2], DataSource.NBRF));
148                } else if ("prf".equalsIgnoreCase(data[0])) {
149                        sequence.setAccession(new AccessionID(data[2], DataSource.PRF));
150                } else if ("pdb".equalsIgnoreCase(data[0])) {
151                        sequence.setAccession(new AccessionID(data[1] + ":" + data[2], DataSource.PDB1));
152                } else if (data[0].startsWith("PDB")) {
153                        String[] pdbe = data[0].split(" ");
154                        String[] pdbaccession = pdbe[0].split(":");
155                        sequence.setAccession(new AccessionID(pdbaccession[1], DataSource.PDBe));
156                } else if (data[0].indexOf(":") != -1 && data.length > 1 && "PDBID".equals(data[1])) {
157                        sequence.setAccession(new AccessionID(data[0], DataSource.PDB2));
158                } else if ("pat".equalsIgnoreCase(data[0])) {
159                        sequence.setAccession(new AccessionID(data[2], DataSource.PATENTS));
160                } else if ("bbs".equalsIgnoreCase(data[0])) {
161                        sequence.setAccession(new AccessionID(data[1], DataSource.GENINFO));
162                } else if ("gnl".equalsIgnoreCase(data[0])) {
163                        sequence.setAccession(new AccessionID(data[2], DataSource.GENERAL));
164                } else if ("ref".equalsIgnoreCase(data[0])) {
165                        sequence.setAccession(new AccessionID(data[1], DataSource.NCBI));
166                } else if ("lcl".equalsIgnoreCase(data[0])) {
167                        sequence.setAccession(new AccessionID(data[1], DataSource.LOCAL));
168                } else {
169                        sequence.setAccession(new AccessionID(data[0])); // avoid the common problem of picking up all the comments original header in getOriginalHeader
170                }
171
172
173        }
174
175        
176}