/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 * Created on 01-21-2010
 */
package org.biojava.nbio.core.sequence.io;

import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
import org.biojava.nbio.core.sequence.AccessionID;
import org.biojava.nbio.core.sequence.DataSource;
import org.biojava.nbio.core.sequence.ProteinSequence;
import org.biojava.nbio.core.sequence.compound.AminoAcidCompound;
import org.biojava.nbio.core.sequence.io.template.SequenceHeaderParserInterface;
import org.biojava.nbio.core.sequence.template.AbstractSequence;
import org.biojava.nbio.core.sequence.template.AbstractSequence.AnnotationType;
import org.biojava.nbio.core.sequence.template.Compound;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.ArrayList;
import java.util.List;

/**
 * The default fasta header parser where some headers are well defined based on the source
 * database which allows us to set the source of the protein sequence and the identifier
 * that can be used in future implementations to load features from external sources.
 *
 * If the user has a custom header with local data then they can create their own implementation
 * of a {@link SequenceHeaderParserInterface}.
 *<pre>
 * GenBank                           gi|gi-number|gb|accession|locus
 * ENA Data Library                  gi|gi-number|emb|accession|locus
 * DDBJ, DNA Database of Japan       gi|gi-number|dbj|accession|locus
 * NBRF PIR                          pir||entry
 * Protein Research Foundation       prf||name
 * SWISS-PROT                        sp|accession|name
 * Brookhaven Protein Data Bank (1)  pdb|entry|chain
 * Brookhaven Protein Data Bank (2)  entry:chain|PDBID|CHAIN|SEQUENCE
 * PDB EBI                           PDB:1ECY_A mol:protein length:142 ECOTIN
 * Patents                           pat|country|number
 * GenInfo Backbone Id               bbs|number
 * General database identifier       gnl|database|identifier
 * NCBI Reference Sequence           ref|accession|locus
 * Local Sequence identifier         lcl|identifier
 *</pre>
 * @author Scooter Willis <willishf at gmail dot com>
 */
public class GenericFastaHeaderParser<S extends AbstractSequence<C>, C extends Compound> implements SequenceHeaderParserInterface<S,C> {

    private static final Logger logger = LoggerFactory.getLogger(GenericFastaHeaderParser.class);

    /**
     * Split a header into its fields.
     * <p>
     * Headers starting with {@code "PDB:"} are split on single spaces. Every other
     * header is split on {@code '|'}: empty fields between two bars are kept, while
     * an empty field after a trailing bar is dropped. An empty header yields an
     * empty array.
     *
     * @param header the raw header line, must not be null
     * @return the fields of the header, never null
     */
    private String[] getHeaderValues(String header) {
        // Historical note: a special case that truncated the header at "length=" was
        // removed on 1/11/2012 because headers ending in "length=" were not recognized.
        if (header.startsWith("PDB:")) {
            return header.split(" ");
        }

        List<String> values = new ArrayList<String>();
        int start = 0;
        int bar = header.indexOf('|', start);
        while (bar != -1) {
            values.add(header.substring(start, bar));
            start = bar + 1;
            bar = header.indexOf('|', start);
        }
        // Text after the last bar is a field only when it is non-empty.
        if (start < header.length()) {
            values.add(header.substring(start));
        }
        // Build the result once, after the scan, rather than on every character.
        return values.toArray(new String[values.size()]);
    }

    /**
     * Return the requested field, or the complete header when the header has too
     * few fields for the format announced by its first field.
     *
     * @param data   the fields of the header
     * @param index  index of the wanted field
     * @param header the complete header, used as fallback
     * @return {@code data[index]} if present, otherwise {@code header}
     */
    private static String fieldOrHeader(String[] data, int index, String header) {
        return index < data.length ? data[index] : header;
    }

    /**
     * Parse the header and set the values in the sequence.
     * <p>
     * The original header is always stored on the sequence. The accession (and,
     * for the known formats, its data source) is derived from the first field of
     * the header, see the class description for the recognized formats. A header
     * that is too short for the format it announces falls back to using the
     * complete header as accession instead of failing.
     *
     * @param header the raw header line, must not be null
     * @param sequence the sequence that receives the parsed values
     */
    @Override
    public void parseHeader(String header, S sequence) {
        // UniProt example:
        // tr|Q0TET7|Q0TET7_ECOL5 Putative uncharacterized protein OS=Escherichia coli O6:K15:H31 (strain 536 / UPEC) GN=ECP_2553 PE=4 SV=1
        sequence.setOriginalHeader(header);
        String[] data = getHeaderValues(header);

        if (data.length == 0) {
            // Empty header: there is no field to inspect.
            sequence.setAccession(new AccessionID(header));
            return;
        }
        if (data.length == 1) {
            sequence.setAccession(new AccessionID(data[0]));
            return;
        }

        // From here on the header has at least two fields, so data[1] is always present.
        String tag = data[0];
        if (tag.equalsIgnoreCase("sp") || tag.equalsIgnoreCase("tr")) {
            if (tag.equalsIgnoreCase("sp")) {
                sequence.setAnnotationType(AnnotationType.CURATED);
            } else {
                sequence.setAnnotationType(AnnotationType.PREDICTED);
            }

            sequence.setAccession(new AccessionID(data[1], DataSource.UNIPROT));
            if (data.length > 2) {
                sequence.setDescription(data[2]);
            }

        } else if (tag.equalsIgnoreCase("gi")) {
            DataSource giSource = DataSource.UNKNOWN;
            if (data.length >= 3) {
                if (data[2].equalsIgnoreCase("gb")) {
                    giSource = DataSource.GENBANK;
                } else if (data[2].equalsIgnoreCase("emb")) {
                    giSource = DataSource.ENA;
                } else if (data[2].equalsIgnoreCase("dbj")) {
                    giSource = DataSource.DDBJ;
                }
            }
            // The accession is the fourth field; it is absent in e.g. "gi|123|gb".
            sequence.setAccession(new AccessionID(fieldOrHeader(data, 3, header), giSource));
        } else if (tag.equalsIgnoreCase("pir")) {
            sequence.setAccession(new AccessionID(fieldOrHeader(data, 2, header), DataSource.NBRF));
        } else if (tag.equalsIgnoreCase("prf")) {
            sequence.setAccession(new AccessionID(fieldOrHeader(data, 2, header), DataSource.PRF));
        } else if (tag.equalsIgnoreCase("pdb")) {
            String id = data.length > 2 ? data[1] + ":" + data[2] : header;
            sequence.setAccession(new AccessionID(id, DataSource.PDB1));
        } else if (tag.startsWith("PDB")) {
            String[] pdbe = tag.split(" ");
            String[] pdbaccession = pdbe[0].split(":");
            // Without a "PDB:<id>" pair there is no id to extract.
            String id = pdbaccession.length > 1 ? pdbaccession[1] : header;
            sequence.setAccession(new AccessionID(id, DataSource.PDBe));
        } else if (tag.indexOf(":") != -1 && data[1].equals("PDBID")) {
            sequence.setAccession(new AccessionID(tag, DataSource.PDB2));
        } else if (tag.equalsIgnoreCase("pat")) {
            sequence.setAccession(new AccessionID(fieldOrHeader(data, 2, header), DataSource.PATENTS));
        } else if (tag.equalsIgnoreCase("bbs")) {
            sequence.setAccession(new AccessionID(data[1], DataSource.GENINFO));
        } else if (tag.equalsIgnoreCase("gnl")) {
            sequence.setAccession(new AccessionID(fieldOrHeader(data, 2, header), DataSource.GENERAL));
        } else if (tag.equalsIgnoreCase("ref")) {
            sequence.setAccession(new AccessionID(data[1], DataSource.NCBI));
        } else if (tag.equalsIgnoreCase("lcl")) {
            sequence.setAccession(new AccessionID(data[1], DataSource.LOCAL));
        } else {
            // Use only the first field: avoids the common problem of picking up all the
            // comments of the original header, which stays available via getOriginalHeader.
            sequence.setAccession(new AccessionID(tag));
        }
    }

    /**
     * Parse one sample header and log the resulting accession and data source
     * next to the expected data source.
     *
     * @param parser   the parser under demonstration
     * @param sequence the sequence that receives the parsed values
     * @param header   the sample header
     * @param label    the text expected as accession, logged in front of the parsed accession
     * @param expected the data source expected for this header format
     */
    private static void logParsedHeader(GenericFastaHeaderParser<ProteinSequence,AminoAcidCompound> parser,
            ProteinSequence sequence, String header, String label, DataSource expected) {
        parser.parseHeader(header, sequence);
        logger.info("{} = {}", label, sequence.getAccession());
        logger.info("Data source: {} = {}", sequence.getAccession().getDataSource(), expected);
    }

    /**
     * Small demonstration: parses one sample header per supported format and logs
     * the parsed accession and data source.
     *
     * @param args not used
     */
    public static void main(String[] args) {

        logger.info("parseHeader");
        ProteinSequence sequence;
        try {
            sequence = new ProteinSequence("");
        } catch (CompoundNotFoundException e) {
            // this should not happen, in case it does we log error and stop:
            // without a sequence there is nothing to parse into
            logger.error("Could not create empty protein sequence. Error: {}. This is most likely a bug.", e.getMessage(), e);
            return;
        }
        GenericFastaHeaderParser<ProteinSequence,AminoAcidCompound> instance =
                new GenericFastaHeaderParser<ProteinSequence,AminoAcidCompound>();

        logParsedHeader(instance, sequence, "gi|gi-number|gb|accession|locus", "accession", DataSource.GENBANK);
        logParsedHeader(instance, sequence, "gi|gi-number|emb|accession|locus", "accession", DataSource.ENA);
        logParsedHeader(instance, sequence, "gi|gi-number|dbj|accession|locus", "accession", DataSource.DDBJ);
        logParsedHeader(instance, sequence, "pir||entry", "entry", DataSource.NBRF);
        logParsedHeader(instance, sequence, "prf||name", "name", DataSource.PRF);
        logParsedHeader(instance, sequence, "sp|accession|name", "accession", DataSource.UNIPROT);
        logParsedHeader(instance, sequence, "pdb|entry|chain", "entry:chain", DataSource.PDB1);
        logParsedHeader(instance, sequence, "entry:chain|PDBID|CHAIN|SEQUENCE", "entry:chain", DataSource.PDB2);
        logParsedHeader(instance, sequence, "PDB:1ECY_A mol:protein length:142 ECOTIN", "1ECY_A", DataSource.PDBe);
        logParsedHeader(instance, sequence, "pat|country|number", "number", DataSource.PATENTS);
        logParsedHeader(instance, sequence, "bbs|number", "number", DataSource.GENINFO);
        logParsedHeader(instance, sequence, "gnl|database|identifier", "identifier", DataSource.GENERAL);
        logParsedHeader(instance, sequence, "ref|accession|locus", "accession", DataSource.NCBI);
        logParsedHeader(instance, sequence, "lcl|identifier", "identifier", DataSource.LOCAL);
    }
}