001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 * Created on 01-21-2010 021 */ 022package org.biojava.nbio.core.sequence.io; 023 024import org.biojava.nbio.core.exceptions.CompoundNotFoundException; 025import org.biojava.nbio.core.sequence.AccessionID; 026import org.biojava.nbio.core.sequence.DataSource; 027import org.biojava.nbio.core.sequence.ProteinSequence; 028import org.biojava.nbio.core.sequence.compound.AminoAcidCompound; 029import org.biojava.nbio.core.sequence.io.template.SequenceHeaderParserInterface; 030import org.biojava.nbio.core.sequence.template.AbstractSequence; 031import org.biojava.nbio.core.sequence.template.AbstractSequence.AnnotationType; 032import org.biojava.nbio.core.sequence.template.Compound; 033import org.slf4j.Logger; 034import org.slf4j.LoggerFactory; 035 036import java.util.ArrayList; 037import java.util.List; 038 039/** 040 * The default fasta header parser where some headers are well defined based on the source 041 * database which allows us to set the source of the protein sequence and the identifier 042 * that can be used in future implementations to load features from external sources 043 * 044 * If the user has a custom header with local data then they can create their own implementation 045 * of a FastaHeaderParserInterface 046 *<pre> 047 * GenBank gi|gi-number|gb|accession|locus 048 * ENA Data Library gi|gi-number|emb|accession|locus 049 * DDBJ, DNA Database of Japan gi|gi-number|dbj|accession|locus 050 * NBRF PIR pir||entry 051 * Protein Research Foundation prf||name 052 * SWISS-PROT sp|accession|name 053 * Brookhaven Protein Data Bank (1) pdb|entry|chain 054 * Brookhaven Protein Data Bank (2) entry:chain|PDBID|CHAIN|SEQUENCE 055 * PDB EBI PDB:1ECY_A mol:protein length:142 ECOTIN 056 * Patents pat|country|number 057 * GenInfo Backbone Id bbs|number 058 * General database identifier gnl|database|identifier 059 * NCBI Reference Sequence ref|accession|locus 060 * Local Sequence identifier lcl|identifier 061 *</pre> 062 * @author Scooter Willis 063 */ 064public class GenericFastaHeaderParser<S extends AbstractSequence<C>, C extends Compound> implements SequenceHeaderParserInterface<S,C> { 065 066 private final static Logger logger = LoggerFactory.getLogger(GenericFastaHeaderParser.class); 067 068 /** 069 * Parse out the components where some have a | and others do not 070 * @param header 071 * @return 072 */ 073 private String[] getHeaderValues(String header) { 074 String[] data = new String[0]; 075 List<String> values = new ArrayList<>(); 076 StringBuffer sb = new StringBuffer(); 077 //commented out 1/11/2012 to resolve an issue where headers do contain a length= at the end that are not recognized 078 //if(header.indexOf("length=") != -1){ 079 // data = new String[1]; 080 // int index = header.indexOf("length="); 081 // data[0] = header.substring(0, index).trim(); 082 // logger.debug("accession=" + data[0]); 083 // return data; 084 //} else 085 if (!header.startsWith("PDB:")) { 086 for (int i = 0; i < header.length(); i++) { 087 if (header.charAt(i) == '|') { 088 values.add(sb.toString()); 089 sb.setLength(0);//faster than = new StringBuffer(); 090 } else if (i == header.length() - 1) { 091 sb.append(header.charAt(i)); 092 values.add(sb.toString()); 093 } else { 094 sb.append(header.charAt(i)); 095 } 096 097 } 098 data = new String[values.size()]; 099 values.toArray(data); 100 } else { 101 data = header.split(" "); 102 } 103 return data; 104 } 105 106 /** 107 * Parse the header and set the values in the sequence 108 * @param header 109 * @param sequence 110 */ 111 @Override 112 public void parseHeader(String header, S sequence) { 113 //uniptrot 114 // tr|Q0TET7|Q0TET7_ECOL5 Putative uncharacterized protein OS=Escherichia coli O6:K15:H31 (strain 536 / UPEC) GN=ECP_2553 PE=4 SV=1 115 sequence.setOriginalHeader(header); 116 String[] data = getHeaderValues(header); 117 118 if (data.length == 1) { 119 sequence.setAccession(new AccessionID(data[0])); 120 } else if ("sp".equalsIgnoreCase(data[0]) || "tr".equalsIgnoreCase(data[0])) { 121 if ("sp".equalsIgnoreCase(data[0])) { 122 sequence.setAnnotationType(AnnotationType.CURATED); 123 } else { 124 sequence.setAnnotationType(AnnotationType.PREDICTED); 125 } 126 127 sequence.setAccession(new AccessionID(data[1], DataSource.UNIPROT)); 128 if (data.length > 2) { 129 sequence.setDescription(data[2]); 130 } 131 132 } else if ("gi".equalsIgnoreCase(data[0])) { 133 DataSource giSource = DataSource.UNKNOWN; 134 if (data.length >= 3) { 135 if ("gb".equalsIgnoreCase(data[2])) { 136 giSource = DataSource.GENBANK; 137 } else if ("emb".equalsIgnoreCase(data[2])) { 138 giSource = DataSource.ENA; 139 } else if ("dbj".equalsIgnoreCase(data[2])) { 140 giSource = DataSource.DDBJ; 141 } 142 sequence.setAccession(new AccessionID(data[3], giSource)); 143 } else { 144 sequence.setAccession(new AccessionID(header, giSource)); 145 } 146 } else if ("pir".equalsIgnoreCase(data[0])) { 147 sequence.setAccession(new AccessionID(data[2], DataSource.NBRF)); 148 } else if ("prf".equalsIgnoreCase(data[0])) { 149 sequence.setAccession(new AccessionID(data[2], DataSource.PRF)); 150 } else if ("pdb".equalsIgnoreCase(data[0])) { 151 sequence.setAccession(new AccessionID(data[1] + ":" + data[2], DataSource.PDB1)); 152 } else if (data[0].startsWith("PDB")) { 153 String[] pdbe = data[0].split(" "); 154 String[] pdbaccession = pdbe[0].split(":"); 155 sequence.setAccession(new AccessionID(pdbaccession[1], DataSource.PDBe)); 156 } else if (data[0].indexOf(":") != -1 && data.length > 1 && "PDBID".equals(data[1])) { 157 sequence.setAccession(new AccessionID(data[0], DataSource.PDB2)); 158 } else if ("pat".equalsIgnoreCase(data[0])) { 159 sequence.setAccession(new AccessionID(data[2], DataSource.PATENTS)); 160 } else if ("bbs".equalsIgnoreCase(data[0])) { 161 sequence.setAccession(new AccessionID(data[1], DataSource.GENINFO)); 162 } else if ("gnl".equalsIgnoreCase(data[0])) { 163 sequence.setAccession(new AccessionID(data[2], DataSource.GENERAL)); 164 } else if ("ref".equalsIgnoreCase(data[0])) { 165 sequence.setAccession(new AccessionID(data[1], DataSource.NCBI)); 166 } else if ("lcl".equalsIgnoreCase(data[0])) { 167 sequence.setAccession(new AccessionID(data[1], DataSource.LOCAL)); 168 } else { 169 sequence.setAccession(new AccessionID(data[0])); // avoid the common problem of picking up all the comments original header in getOriginalHeader 170 } 171 172 173 } 174 175 176}