/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * @author Richard Holland
 * @author Mark Schreiber
 * @author David Scott
 * @author Bubba Puryear
 * @author George Waldon
 * @author Deepak Sheoran
 * @author Karl Nicholas <github:karlnicholas>
 * @author Jacek Grzebyta
 * @author Paolo Pavan
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 * Created on 01-21-2010
 */
package org.biojava.nbio.core.sequence.io;

import org.biojava.nbio.core.exceptions.Messages;
import org.biojava.nbio.core.exceptions.ParserException;
import org.biojava.nbio.core.sequence.DataSource;
import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet;
import org.biojava.nbio.core.sequence.compound.DNACompoundSet;
import org.biojava.nbio.core.sequence.compound.RNACompoundSet;
import org.biojava.nbio.core.sequence.features.AbstractFeature;
import org.biojava.nbio.core.sequence.features.DBReferenceInfo;
import org.biojava.nbio.core.sequence.features.Qualifier;
import org.biojava.nbio.core.sequence.features.TextFeature;
import org.biojava.nbio.core.sequence.io.template.SequenceParserInterface;
import org.biojava.nbio.core.sequence.location.InsdcParser;
import org.biojava.nbio.core.sequence.location.template.AbstractLocation;
import org.biojava.nbio.core.sequence.location.template.Location;
import org.biojava.nbio.core.sequence.template.AbstractSequence;
import org.biojava.nbio.core.sequence.template.Compound;
import org.biojava.nbio.core.sequence.template.CompoundSet;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.BufferedReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Reads one GenBank flat-file record at a time from a {@link BufferedReader}.
 * <p>
 * Each call to {@link #getSequence(BufferedReader, int)} consumes sections up
 * to and including the record terminator ({@code //}). The sequence text is
 * returned; the LOCUS line, header fields, features and {@code db_xref}
 * qualifiers collected on the way are exposed through the getters of this
 * class until the next call replaces them.
 * <p>
 * Instances keep per-record state in fields, so one instance must not be used
 * to parse two readers at the same time.
 */
public class GenbankSequenceParser<S extends AbstractSequence<C>, C extends Compound> implements SequenceParserInterface{

    private String seqData = null;
    private GenericGenbankHeaderParser<S, C> headerParser;
    private String header;
    private String accession;
    public LinkedHashMap<String, ArrayList<DBReferenceInfo>> mapDB;
    /**
     * this data structure collects list of features extracted from the
     * FEATURE_TAG section They are organized by list of the same type (i.e.
     * same genbank Feature) and are provided with location
     */
    private HashMap<String, ArrayList<AbstractFeature>> featureCollection;

    private final Logger log = LoggerFactory.getLogger(getClass());

    // this is a compoundset parsed from header.
    private CompoundSet<?> compoundType;

    /**
     * The name of this format
     */
    public static final String GENBANK_FORMAT = "GENBANK";

    protected static final String LOCUS_TAG = "LOCUS";
    protected static final String DEFINITION_TAG = "DEFINITION";
    protected static final String ACCESSION_TAG = "ACCESSION";
    protected static final String VERSION_TAG = "VERSION";
    protected static final String KEYWORDS_TAG = "KEYWORDS";
    //                                                  "SEGMENT"
    protected static final String SOURCE_TAG = "SOURCE";
    protected static final String ORGANISM_TAG = "ORGANISM";
    protected static final String REFERENCE_TAG = "REFERENCE";
    protected static final String AUTHORS_TAG = "AUTHORS";
    protected static final String CONSORTIUM_TAG = "CONSRTM";
    protected static final String TITLE_TAG = "TITLE";
    protected static final String JOURNAL_TAG = "JOURNAL";
    protected static final String PUBMED_TAG = "PUBMED";
    protected static final String MEDLINE_TAG = "MEDLINE"; //deprecated
    protected static final String REMARK_TAG = "REMARK";
    protected static final String COMMENT_TAG = "COMMENT";
    protected static final String FEATURE_TAG = "FEATURES";
    protected static final String BASE_COUNT_TAG_FULL = "BASE COUNT"; //deprecated
    protected static final String BASE_COUNT_TAG = "BASE";
    //                                                  "CONTIG"
    protected static final String START_SEQUENCE_TAG = "ORIGIN";
    protected static final String END_SEQUENCE_TAG = "//";
    // locus line
    protected static final Pattern lp = Pattern.compile("^(\\S+)\\s+\\d+\\s+(bp|aa)\\s{1,4}(([dms]s-)?(\\S+))?\\s+(circular|linear)?\\s*(\\S+)?\\s*(\\S+)?$");
    // version line
    protected static final Pattern vp = Pattern.compile("^(\\S*?)(\\.(\\d+))?(\\s+GI:(\\S+))?$");
    // reference line
    protected static final Pattern refRange = Pattern.compile("^\\s*(\\d+)\\s+to\\s+(\\d+)$");
    protected static final Pattern refp = Pattern.compile("^(\\d+)\\s*(?:(\\((?:bases|residues)\\s+(\\d+\\s+to\\s+\\d+(\\s*;\\s*\\d+\\s+to\\s+\\d+)*)\\))|\\(sites\\))?");
    // dbxref line
    protected static final Pattern dbxp = Pattern.compile("^([^:]+):(\\S+)$");

    protected static final InsdcParser locationParser = new InsdcParser(DataSource.GENBANK);
    //sections start at a line and continue till the first line afterwards with a
    //non-whitespace first character
    //we want to match any of the following as a new section within a section
    //  \s{0,8} word \s{0,7} value
    //  \s{21} /word = value
    //  \s{21} /word
    protected static final Pattern sectp = Pattern.compile("^(\\s{0,8}(\\S+)\\s{0,7}(.*)|\\s{21}(/\\S+?)=(.*)|\\s{21}(/\\S+))$");

    protected static final Pattern readableFiles = Pattern.compile(".*(g[bp]k*$|\\u002eg[bp].*)");
    protected static final Pattern headerLine = Pattern.compile("^LOCUS.*");
    private static final String DBSOURCE = "DBSOURCE";
    private static final String PRIMARY = "PRIMARY";
    private static final String DBLINK = "DBLINK";

    // matches a line made only of whitespace; compiled once because it is
    // tested against every line that is read
    private static final Pattern blankLine = Pattern.compile("\\p{Space}*");

//  private NCBITaxon tax = null;

    /**
     * Reads sections from the reader until the end-of-record marker
     * ({@code //}) has been consumed, updating the per-record fields of this
     * parser along the way.
     *
     * @param bufferedReader the source of GenBank text
     * @return the sequence text collected from the ORIGIN section, or whatever
     *         {@code seqData} held if the record has no ORIGIN section
     * @throws ParserException with {@code Messages.ENDOFFILE} when the input is
     *         exhausted before any section is read, with
     *         {@code Messages.SECTIONKEYNULL} when text is found without a
     *         section key, or when a LOCUS, VERSION or db_xref line is malformed
     */
    private String parse(BufferedReader bufferedReader) {
        String sectionKey;
        // Get an ordered list of key->value pairs in array-tuples
        do {
            List<String[]> section = this.readSection(bufferedReader);
            String[] firstTuple = section.get(0);
            sectionKey = firstTuple[0];
            if (sectionKey == null) {
                //if we reach the end of the file, section contains empty strings
                String firstValue = firstTuple[1];
                if (firstValue == null || firstValue.length() == 0) {
                    throw new ParserException(Messages.ENDOFFILE);
                }
                throw new ParserException(Messages.SECTIONKEYNULL);
            }
            // process section-by-section
            if (sectionKey.equals(LOCUS_TAG)) {
                readLocus(firstTuple[1]);
            } else if (sectionKey.equals(DEFINITION_TAG)) {
                headerParser.setDescription(firstTuple[1]);
            } else if (sectionKey.equals(ACCESSION_TAG)) {
                // if multiple accessions, store only first as accession,
                // and store rest in annotation
                String[] accs = firstTuple[1].split("\\s+");
                accession = accs[0].trim();
                headerParser.setAccession(accession);
            } else if (sectionKey.equals(VERSION_TAG)) {
                readVersion(firstTuple[1]);
            } else if (sectionKey.equals(KEYWORDS_TAG)) {
                // ignored
            } else if (sectionKey.equals(SOURCE_TAG)) {
                // ignore - can get all this from the first feature
            } else if (sectionKey.equals(REFERENCE_TAG)) {
                // ignored
            } else if (sectionKey.equals(COMMENT_TAG)) {
                // Set up some comments
                headerParser.setComment(firstTuple[1]);
            } else if (sectionKey.equals(FEATURE_TAG)) {
                readFeatureTable(section);
            } else if (sectionKey.equals(BASE_COUNT_TAG)) {
                // ignore - can calculate from sequence content later if needed
            } else if (sectionKey.equals(START_SEQUENCE_TAG)) {
                readOrigin(section);
            } else if (sectionKey.equals(DBSOURCE)) {
                //TODO
            } else if (sectionKey.equals(PRIMARY)) {
                //TODO
            } else if (sectionKey.equals(DBLINK)) {
                //TODO
            } else {
                if (!sectionKey.equals(END_SEQUENCE_TAG)) {
                    log.info("found unknown section key: {}", sectionKey);
                }
            }
        } while (!sectionKey.equals(END_SEQUENCE_TAG));
        return seqData;
    }

    /**
     * Handles the LOCUS line: remembers it as the header, sets name and
     * default accession on the header parser, and picks the compound set from
     * the length unit and molecule type.
     */
    private void readLocus(String loc) {
        header = loc;
        Matcher m = lp.matcher(loc);
        if (!m.matches()) {
            throw new ParserException("Bad locus line");
        }
        headerParser.setName(m.group(1));
        headerParser.setAccession(m.group(1)); // default if no accession found

        String lengthUnits = m.group(2);
        String type = m.group(5);

        if (lengthUnits.equals("aa")) {
            compoundType = AminoAcidCompoundSet.getAminoAcidCompoundSet();
        } else if (lengthUnits.equals("bp")) {
            // anything that is not explicitly RNA is treated as DNA
            if (type != null && type.contains("RNA")) {
                compoundType = RNACompoundSet.getRNACompoundSet();
            } else {
                compoundType = DNACompoundSet.getDNACompoundSet();
            }
        }

        log.debug("compound type: {}", compoundType.getClass().getSimpleName());
    }

    /**
     * Handles the VERSION line: the accession on this line overrides the one
     * recorded so far, and the version number and GI identifier are passed to
     * the header parser when present.
     */
    private void readVersion(String ver) {
        Matcher m = vp.matcher(ver);
        if (!m.matches()) {
            throw new ParserException("Bad version line");
        }
        String verAcc = m.group(1);
        // compare from verAcc: accession is still null when VERSION comes
        // before (or without) an ACCESSION line
        if (!verAcc.equals(accession)) {
            // the version refers to a different accession!
            // believe the version line, and store the original
            // accession away in the additional accession set
            accession = verAcc;
        }
        if (m.group(3) != null) {
            headerParser.setVersion(Integer.parseInt(m.group(3)));
        }
        if (m.group(5) != null) {
            headerParser.setIdentifier(m.group(5));
        }
    }

    /**
     * Handles the FEATURES section. Starting from the second tuple, a key that
     * does not start with {@code /} opens a new feature; a key that does is a
     * qualifier of the feature opened most recently.
     */
    private void readFeatureTable(List<String[]> section) {
        AbstractFeature gbFeature = null;
        for (int i = 1; i < section.size(); i++) {
            String key = section.get(i)[0];
            String val = section.get(i)[1];

            if (!key.startsWith("/")) {
                // new feature!
                gbFeature = new TextFeature(key, val, key, key);
                Location l = locationParser.parse(val);
                gbFeature.setLocation((AbstractLocation) l);

                ArrayList<AbstractFeature> sameType = featureCollection.get(key);
                if (sameType == null) {
                    sameType = new ArrayList<AbstractFeature>();
                    featureCollection.put(key, sameType);
                }
                sameType.add(gbFeature);
                continue;
            }

            if (gbFeature == null) {
                throw new ParserException("Malformed GenBank file: found a qualifier without feature.");
            }
            key = key.substring(1); // strip leading slash
            val = val.replaceAll("\\s*[\\n\\r]+\\s*", " ").trim();
            // strip quotes, but only when the value really is enclosed in a
            // pair of them; a lone quote character is left untouched
            if (val.length() >= 2 && val.startsWith("\"") && val.endsWith("\"")) {
                val = val.substring(1, val.length() - 1);
            }

            // parameter on old feature
            if (key.equals("db_xref")) {
                Matcher m = dbxp.matcher(val);
                if (!m.matches()) {
                    throw new ParserException("Bad dbxref");
                }
                DBReferenceInfo xref = new DBReferenceInfo(m.group(1), m.group(2));
                gbFeature.addQualifier(key, xref);

                // keep every cross-reference of the record, not only the last
                ArrayList<DBReferenceInfo> listDBEntry = mapDB.get(key);
                if (listDBEntry == null) {
                    listDBEntry = new ArrayList<DBReferenceInfo>();
                    mapDB.put(key, listDBEntry);
                }
                listDBEntry.add(xref);
            } else if (key.equalsIgnoreCase("organism")) {
                Qualifier q = new Qualifier(key, val.replace('\n', ' '));
                gbFeature.addQualifier(key, q);
            } else {
                if (key.equalsIgnoreCase("translation")) {
                    // strip spaces from sequence
                    val = val.replaceAll("\\s+", "");
                }
                Qualifier q = new Qualifier(key, val);
                gbFeature.addQualifier(key, q);
            }
        }
    }

    /**
     * Handles the ORIGIN section. The first tuple is the ORIGIN tag itself and
     * is skipped; every following tuple has the position number as [0] and a
     * chunk of sequence as [1]. The chunks are concatenated, whitespace is
     * removed, the characters matched by {@code [\.|~]} become {@code -}, and
     * the result is upper-cased.
     */
    private void readOrigin(List<String[]> section) {
        StringBuilder seq = new StringBuilder();
        for (int i = 1; i < section.size(); i++) {
            seq.append(section.get(i)[1]);
        }
        seqData = seq.toString().replaceAll("\\s+", "").replaceAll("[\\.|~]", "-").toUpperCase();
    }

    /**
     * Reads an indented section, combining split lines and creating a list of
     * key-&gt;value tuples.
     * <p>
     * Reading stops at end of input or at the first non-blank line after the
     * section's first line that does not start with a space; that line is
     * pushed back with {@code reset()} so the next call sees it. Inside an
     * ORIGIN section only a line starting with {@code //} ends the section.
     *
     * @param bufferedReader the source of GenBank text; must support mark/reset
     * @return the tuples of the section; the last tuple is {@code {null, ""}}
     *         when nothing was read
     * @throws ParserException wrapping the message of any I/O or runtime
     *         failure raised while reading
     */
    private List<String[]> readSection(BufferedReader bufferedReader) {
        List<String[]> section = new ArrayList<String[]>();
        String line = "";

        String currKey = null;
        StringBuilder currVal = new StringBuilder();
        boolean done = false;
        int linecount = 0;

        try {
            while (!done) {
                // NOTE(review): reset() is only guaranteed for lines within the
                // 320-character read-ahead limit - confirm longer lines cannot occur
                bufferedReader.mark(320);
                line = bufferedReader.readLine();
                String firstSecKey = section.isEmpty() ? ""
                        : section.get(0)[0];
                if (line != null && blankLine.matcher(line).matches()) {
                    // skip lines having only white space characters
                    continue;
                }
                if (line == null
                        || (!line.startsWith(" ") && linecount++ > 0 && (!firstSecKey
                        .equals(START_SEQUENCE_TAG) || line
                        .startsWith(END_SEQUENCE_TAG)))) {
                    // dump out last part of section
                    section.add(new String[]{currKey, currVal.toString()});
                    bufferedReader.reset();
                    done = true;
                } else {
                    Matcher m = sectp.matcher(line);
                    if (m.matches()) {
                        // new key
                        if (currKey != null) {
                            section.add(new String[]{currKey,
                                    currVal.toString()});
                        }
                        // key = group(2) or group(4) or group(6) - whichever is
                        // not null
                        // val = group(3) if group(2) not null, group(5) if
                        // group(4) not null, "" otherwise, trimmed
                        String newVal;
                        if (m.group(2) != null) {
                            currKey = m.group(2);
                            newVal = m.group(3);
                        } else if (m.group(4) != null) {
                            currKey = m.group(4);
                            newVal = m.group(5);
                        } else {
                            currKey = m.group(6);
                            newVal = "";
                        }
                        currVal = new StringBuilder();
                        currVal.append(newVal.trim());
                    } else {
                        // concatted line or SEQ START/END line?
                        if (line.startsWith(START_SEQUENCE_TAG)
                                || line.startsWith(END_SEQUENCE_TAG)) {
                            currKey = line;
                        } else {
                            // newline in between lines - can be removed later
                            currVal.append("\n");
                            // qualifier continuations are indented by 21
                            // columns, everything else by 12
                            currVal.append(currKey.charAt(0) == '/' ? line
                                    .substring(21) : line.substring(12));
                        }
                    }
                }
            }
        } catch (IOException e) {
            throw new ParserException(e.getMessage());
        } catch (RuntimeException e) {
            throw new ParserException(e.getMessage());
        }
        return section;
    }

    /**
     * Parses the next GenBank record from the reader.
     * <p>
     * The feature collection, database references, header parser, sequence
     * data and accession are reset first, so nothing from a previous record
     * leaks into this one.
     *
     * @param bufferedReader the source of GenBank text
     * @param sequenceLength not used by this implementation
     * @return the sequence text of the record, or {@code null} when the end of
     *         the input was reached before a record started
     * @throws ParserException when the record is malformed
     */
    @Override
    public String getSequence(BufferedReader bufferedReader, int sequenceLength) throws IOException {
        featureCollection = new HashMap<String, ArrayList<AbstractFeature>>();
        mapDB = new LinkedHashMap<String, ArrayList<DBReferenceInfo>>();
        headerParser = new GenericGenbankHeaderParser<S, C>();
        seqData = null;
        accession = null;
        try {
            parse(bufferedReader);
        } catch (ParserException e) {
            // constant first: the message is null when readSection wrapped an
            // exception that carried no message
            if (Messages.ENDOFFILE.equalsIgnoreCase(e.getMessage())) {
                return null;
            }
            throw e;
        }

        return seqData;
    }

    /**
     * @return the text of the LOCUS line of the record parsed last
     */
    public String getHeader() {
        return header;
    }

    /**
     * @return the header parser filled while reading the record parsed last
     */
    public GenericGenbankHeaderParser<S, C> getSequenceHeaderParser() {
        return headerParser;
    }

    /**
     * @return the {@code db_xref} qualifiers of the record parsed last, keyed
     *         by qualifier name
     */
    public LinkedHashMap<String, ArrayList<DBReferenceInfo>> getDatabaseReferences() {
        return mapDB;
    }

    /**
     * @return a new list holding the feature keys found in the record parsed last
     */
    public ArrayList<String> getKeyWords() {
        return new ArrayList<String>(featureCollection.keySet());
    }

    /**
     * @param keyword a feature key
     * @return the features recorded under that key, or {@code null} if there
     *         are none
     */
    public ArrayList<AbstractFeature> getFeatures(String keyword) {
        return featureCollection.get(keyword);
    }

    /**
     * @return all features of the record parsed last, grouped by feature key
     */
    public HashMap<String, ArrayList<AbstractFeature>> getFeatures() {
        return featureCollection;
    }

    /**
     * Adds every collected feature to the given sequence.
     *
     * @param sequence the sequence that receives the features
     */
    public void parseFeatures(AbstractSequence<C> sequence) {
        for (String k : featureCollection.keySet()) {
            for (AbstractFeature f : featureCollection.get(k)) {
                sequence.addFeature(f);
            }
        }
    }

    /**
     * @return the compound set chosen from the LOCUS line of the record parsed
     *         last
     */
    public CompoundSet<?> getCompoundType() {
        return compoundType;
    }
}