001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 * Created on 01-21-2010 021 */ 022package org.biojava.nbio.core.sequence.io; 023 024import org.biojava.nbio.core.exceptions.CompoundNotFoundException; 025import org.biojava.nbio.core.sequence.AccessionID; 026import org.biojava.nbio.core.sequence.DataSource; 027import org.biojava.nbio.core.sequence.ProteinSequence; 028import org.biojava.nbio.core.sequence.compound.AminoAcidCompound; 029import org.biojava.nbio.core.sequence.io.template.SequenceHeaderParserInterface; 030import org.biojava.nbio.core.sequence.template.AbstractSequence; 031import org.biojava.nbio.core.sequence.template.AbstractSequence.AnnotationType; 032import org.biojava.nbio.core.sequence.template.Compound; 033import org.slf4j.Logger; 034import org.slf4j.LoggerFactory; 035 036import java.util.ArrayList; 037 038/** 039 * The default fasta header parser where some headers are well defined based on the source 040 * database which allows us to set the source of the protein sequence and the identifier 041 * that can be used in future implementations to load features from external sources 042 * 043 * If the user has a custom header with local data then they can create their own implementation 044 * of a FastaHeaderParserInterface 045 *<pre> 046 * GenBank gi|gi-number|gb|accession|locus 047 * ENA Data Library gi|gi-number|emb|accession|locus 048 * DDBJ, DNA Database of Japan gi|gi-number|dbj|accession|locus 049 * NBRF PIR pir||entry 050 * Protein Research Foundation prf||name 051 * SWISS-PROT sp|accession|name 052 * Brookhaven Protein Data Bank (1) pdb|entry|chain 053 * Brookhaven Protein Data Bank (2) entry:chain|PDBID|CHAIN|SEQUENCE 054 * PDB EBI PDB:1ECY_A mol:protein length:142 ECOTIN 055 * Patents pat|country|number 056 * GenInfo Backbone Id bbs|number 057 * General database identifier gnl|database|identifier 058 * NCBI Reference Sequence ref|accession|locus 059 * Local Sequence identifier lcl|identifier 060 *</pre> 061 * @author Scooter Willis <willishf at gmail dot com> 062 */ 063public class GenericFastaHeaderParser<S extends AbstractSequence<C>, C extends Compound> implements SequenceHeaderParserInterface<S,C> { 064 065 private final static Logger logger = LoggerFactory.getLogger(GenericFastaHeaderParser.class); 066 067 /** 068 * Parse out the components where some have a | and others do not 069 * @param header 070 * @return 071 */ 072 private String[] getHeaderValues(String header) { 073 String[] data = new String[0]; 074 ArrayList<String> values = new ArrayList<String>(); 075 StringBuffer sb = new StringBuffer(); 076 //commented out 1/11/2012 to resolve an issue where headers do contain a length= at the end that are not recognized 077 //if(header.indexOf("length=") != -1){ 078 // data = new String[1]; 079 // int index = header.indexOf("length="); 080 // data[0] = header.substring(0, index).trim(); 081 // logger.debug("accession=" + data[0]); 082 // return data; 083 //} else 084 if (!header.startsWith("PDB:")) { 085 for (int i = 0; i < header.length(); i++) { 086 if (header.charAt(i) == '|') { 087 values.add(sb.toString()); 088 sb.setLength(0);//faster than = new StringBuffer(); 089 } else if (i == header.length() - 1) { 090 sb.append(header.charAt(i)); 091 values.add(sb.toString()); 092 } else { 093 sb.append(header.charAt(i)); 094 } 095 096 data = new String[values.size()]; 097 values.toArray(data); 098 } 099 } else { 100 data = header.split(" "); 101 } 102 return data; 103 } 104 105 /** 106 * Parse the header and set the values in the sequence 107 * @param header 108 * @param sequence 109 */ 110 @Override 111 public void parseHeader(String header, S sequence) { 112 //uniptrot 113 // tr|Q0TET7|Q0TET7_ECOL5 Putative uncharacterized protein OS=Escherichia coli O6:K15:H31 (strain 536 / UPEC) GN=ECP_2553 PE=4 SV=1 114 sequence.setOriginalHeader(header); 115 String[] data = getHeaderValues(header); 116 117 if (data.length == 1) { 118 sequence.setAccession(new AccessionID(data[0])); 119 } else if (data[0].equalsIgnoreCase("sp") || data[0].equalsIgnoreCase("tr")) { 120 if (data[0].equalsIgnoreCase("sp")) { 121 sequence.setAnnotationType(AnnotationType.CURATED); 122 } else { 123 sequence.setAnnotationType(AnnotationType.PREDICTED); 124 } 125 126 sequence.setAccession(new AccessionID(data[1], DataSource.UNIPROT)); 127 if (data.length > 2) { 128 sequence.setDescription(data[2]); 129 } 130 131 } else if (data[0].equalsIgnoreCase("gi")) { 132 DataSource giSource = DataSource.UNKNOWN; 133 if (data.length >= 3) { 134 if (data[2].equalsIgnoreCase("gb")) { 135 giSource = DataSource.GENBANK; 136 } else if (data[2].equalsIgnoreCase("emb")) { 137 giSource = DataSource.ENA; 138 } else if (data[2].equalsIgnoreCase("dbj")) { 139 giSource = DataSource.DDBJ; 140 } 141 sequence.setAccession(new AccessionID(data[3], giSource)); 142 } else { 143 sequence.setAccession(new AccessionID(header, giSource)); 144 } 145 } else if (data[0].equalsIgnoreCase("pir")) { 146 sequence.setAccession(new AccessionID(data[2], DataSource.NBRF)); 147 } else if (data[0].equalsIgnoreCase("prf")) { 148 sequence.setAccession(new AccessionID(data[2], DataSource.PRF)); 149 } else if (data[0].equalsIgnoreCase("pdb")) { 150 sequence.setAccession(new AccessionID(data[1] + ":" + data[2], DataSource.PDB1)); 151 } else if (data[0].startsWith("PDB")) { 152 String[] pdbe = data[0].split(" "); 153 String[] pdbaccession = pdbe[0].split(":"); 154 sequence.setAccession(new AccessionID(pdbaccession[1], DataSource.PDBe)); 155 } else if (data[0].indexOf(":") != -1 && data.length > 1 && data[1].equals("PDBID")) { 156 sequence.setAccession(new AccessionID(data[0], DataSource.PDB2)); 157 } else if (data[0].equalsIgnoreCase("pat")) { 158 sequence.setAccession(new AccessionID(data[2], DataSource.PATENTS)); 159 } else if (data[0].equalsIgnoreCase("bbs")) { 160 sequence.setAccession(new AccessionID(data[1], DataSource.GENINFO)); 161 } else if (data[0].equalsIgnoreCase("gnl")) { 162 sequence.setAccession(new AccessionID(data[2], DataSource.GENERAL)); 163 } else if (data[0].equalsIgnoreCase("ref")) { 164 sequence.setAccession(new AccessionID(data[1], DataSource.NCBI)); 165 } else if (data[0].equalsIgnoreCase("lcl")) { 166 sequence.setAccession(new AccessionID(data[1], DataSource.LOCAL)); 167 } else { 168 sequence.setAccession(new AccessionID(data[0])); // avoid the common problem of picking up all the comments original header in getOriginalHeader 169 } 170 171 172 } 173 174 175}