/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * @author Richard Holland
 * @author Mark Schreiber
 * @author David Scott
 * @author Bubba Puryear
 * @author George Waldon
 * @author Deepak Sheoran
 * @author Karl Nicholas <github:karlnicholas>
 * @author Jacek Grzebyta
 * @author Paolo Pavan
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 * Created on 01-21-2010
 */
package org.biojava.nbio.core.sequence.io;

import org.biojava.nbio.core.exceptions.Messages;
import org.biojava.nbio.core.exceptions.ParserException;
import org.biojava.nbio.core.sequence.DataSource;
import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet;
import org.biojava.nbio.core.sequence.compound.DNACompoundSet;
import org.biojava.nbio.core.sequence.compound.RNACompoundSet;
import org.biojava.nbio.core.sequence.features.AbstractFeature;
import org.biojava.nbio.core.sequence.features.DBReferenceInfo;
import org.biojava.nbio.core.sequence.features.Qualifier;
import org.biojava.nbio.core.sequence.features.TextFeature;
import org.biojava.nbio.core.sequence.io.template.SequenceParserInterface;
import org.biojava.nbio.core.sequence.location.InsdcParser;
import org.biojava.nbio.core.sequence.location.template.AbstractLocation;
import org.biojava.nbio.core.sequence.location.template.Location;
import org.biojava.nbio.core.sequence.reference.GenbankReference;
import org.biojava.nbio.core.sequence.template.AbstractSequence;
import org.biojava.nbio.core.sequence.template.Compound;
import org.biojava.nbio.core.sequence.template.CompoundSet;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.BufferedReader;
import java.io.IOException;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Parses one GenBank flat-file record at a time from a {@link BufferedReader}.
 * <p>
 * A record is consumed section by section (LOCUS, DEFINITION, ACCESSION,
 * VERSION, REFERENCE, COMMENT, FEATURES, ORIGIN, ...) until the end-of-record
 * marker {@code //} is reached. Header information is pushed into a
 * {@link GenericGenbankHeaderParser}, features are collected per feature key,
 * and the sequence text of the ORIGIN section is returned by
 * {@link #getSequence(BufferedReader, int)}.
 * <p>
 * Instances hold per-record state in fields, and the location parser is a
 * shared static that is reconfigured on every LOCUS line, so concurrent use
 * should not be assumed to be safe.
 *
 * @param <S> sequence type
 * @param <C> compound type of the sequence
 */
public class GenbankSequenceParser<S extends AbstractSequence<C>, C extends Compound> implements SequenceParserInterface{

    /** Sequence text of the most recently parsed ORIGIN section. */
    private String seqData = null;
    private GenericGenbankHeaderParser<S, C> headerParser;
    /** Raw value of the most recently parsed LOCUS line. */
    private String header;
    private String accession;
    private boolean isCircularSequence;
    private Map<String, List<DBReferenceInfo>> mapDB;
    /**
     * this data structure collects list of features extracted from the
     * FEATURE_TAG section They are organized by list of the same type (i.e.
     * same genbank Feature) and are provided with location
     */
    private Map<String, List<AbstractFeature<AbstractSequence<C>, C>>> featureCollection;

    private final Logger log = LoggerFactory.getLogger(getClass());

    // this is a compoundset parsed from header.
    private CompoundSet<?> compoundType;

    /**
     * The name of this format
     */
    public static final String GENBANK_FORMAT = "GENBANK";

    protected static final String LOCUS_TAG = "LOCUS";
    protected static final String DEFINITION_TAG = "DEFINITION";
    protected static final String ACCESSION_TAG = "ACCESSION";
    protected static final String VERSION_TAG = "VERSION";
    protected static final String KEYWORDS_TAG = "KEYWORDS";
    //                                                  "SEGMENT"
    protected static final String SOURCE_TAG = "SOURCE";
    protected static final String ORGANISM_TAG = "ORGANISM";
    protected static final String REFERENCE_TAG = "REFERENCE";
    protected static final String AUTHORS_TAG = "AUTHORS";
    protected static final String CONSORTIUM_TAG = "CONSRTM";
    protected static final String TITLE_TAG = "TITLE";
    protected static final String JOURNAL_TAG = "JOURNAL";
    protected static final String PUBMED_TAG = "PUBMED";
    protected static final String MEDLINE_TAG = "MEDLINE"; //deprecated
    protected static final String REMARK_TAG = "REMARK";
    protected static final String COMMENT_TAG = "COMMENT";
    protected static final String FEATURE_TAG = "FEATURES";
    protected static final String BASE_COUNT_TAG_FULL = "BASE COUNT"; //deprecated
    protected static final String BASE_COUNT_TAG = "BASE";
    //                                                  "CONTIG"
    protected static final String START_SEQUENCE_TAG = "ORIGIN";
    protected static final String DBSOURCE = "DBSOURCE";
    protected static final String PRIMARY = "PRIMARY";
    protected static final String DBLINK = "DBLINK";
    protected static final String END_SEQUENCE_TAG = "//";
    // locus line with name that may contain spaces but must start and end with non whitespace character
    protected static final Pattern lp = Pattern.compile("^(\\S+[\\S ]*\\S*)\\s+(\\d+)\\s+(bp|BP|aa|AA)\\s{0,4}(([dmsDMS][sS]-)?(\\S+))?\\s*(circular|CIRCULAR|linear|LINEAR)?\\s*(\\S+)?\\s*(\\S+)?$");
    // locus line with no name
    protected static final Pattern lp2 = Pattern.compile("^(\\d+)\\s+(bp|BP|aa|AA)\\s{0,4}(([dmsDMS][sS]-)?(\\S+))?\\s*(circular|CIRCULAR|linear|LINEAR)?\\s*(\\S+)?\\s*(\\S+)?$");
    // version line
    protected static final Pattern vp = Pattern.compile("^(\\S*?)(\\.(\\d+))?(\\s+GI:(\\S+))?$");
    // reference line
    protected static final Pattern refRange = Pattern.compile("^\\s*(\\d+)\\s+to\\s+(\\d+)$");
    protected static final Pattern refp = Pattern.compile("^(\\d+)\\s*(?:(\\((?:bases|residues)\\s+(\\d+\\s+to\\s+\\d+(\\s*;\\s*\\d+\\s+to\\s+\\d+)*)\\))|\\(sites\\))?");
    // dbxref line
    protected static final Pattern dbxp = Pattern.compile("^([^:]+):(\\S+)$");

    // NOTE(review): shared by all parser instances and reconfigured on every
    // LOCUS line; parsing from several threads at once looks unsafe - confirm.
    protected static final InsdcParser locationParser = new InsdcParser(DataSource.GENBANK);
    /**
     * sections start at a line and continue till the first line afterwards with a
     * non-whitespace first character
     * we want to match any of the following as a new section within a section
     *   \s{0,8} word \s{0,7} value
     *   \s{21} /word = value
     *   \s{21} /word
     */
    protected static final Pattern sectp = Pattern.compile("^(\\s{0,8}(\\S+)\\s{0,7}(.*)|\\s{21}(/\\S+?)=(.*)|\\s{21}(/\\S+))$");

    protected static final Pattern readableFiles = Pattern.compile(".*(g[bp]k*$|\\u002eg[bp].*)");
    protected static final Pattern headerLine = Pattern.compile("^LOCUS.*");

    /** Matches lines that are empty or contain only whitespace; compiled once instead of per input line. */
    private static final Pattern BLANK_LINE = Pattern.compile("\\p{Space}*");

    /**
     * Reads sections from the reader and dispatches each to its handler until
     * the end-of-record marker {@code //} has been consumed.
     *
     * @param bufferedReader source positioned at the start of a record
     * @return the sequence text collected from the ORIGIN section (the current
     *         value of {@code seqData})
     * @throws ParserException with message {@link Messages#ENDOFFILE} when the
     *         input is exhausted before a section key is found, or with
     *         {@link Messages#SECTIONKEYNULL} when a section has content but no key
     */
    private String parse(BufferedReader bufferedReader) {
        String sectionKey;
        // Get an ordered list of key->value pairs in array-tuples
        do {
            List<String[]> section = readSection(bufferedReader);
            sectionKey = section.get(0)[0];
            if (sectionKey == null) {
                // if we reach the end of the file, section contains empty strings
                String firstValue = section.get(0)[1];
                if (firstValue == null || firstValue.isEmpty()) {
                    throw new ParserException(Messages.ENDOFFILE);
                }
                throw new ParserException(Messages.SECTIONKEYNULL);
            }
            // process section-by-section
            switch (sectionKey) {
                case LOCUS_TAG:
                    parseLocusTag(section);
                    break;
                case DEFINITION_TAG:
                    parseDefinitionTag(section);
                    break;
                case ACCESSION_TAG:
                    parseAccessionTag(section);
                    break;
                case VERSION_TAG:
                    parseVersionTag(section);
                    break;
                case REFERENCE_TAG:
                    parseReferenceTag(section);
                    break;
                case COMMENT_TAG:
                    parseCommentTag(section);
                    break;
                case FEATURE_TAG:
                    parseFeatureTag(section);
                    break;
                case START_SEQUENCE_TAG:
                    parseStartSequenceTag(section);
                    break;
                case KEYWORDS_TAG:   // not implemented yet
                case SOURCE_TAG:     // ignore - can get all this from the first feature
                case BASE_COUNT_TAG: // ignore - can calculate from sequence content later if needed
                case DBSOURCE:       // not implemented yet
                case PRIMARY:        // not implemented yet
                case DBLINK:         // not implemented yet
                    break;
                default:
                    if (!sectionKey.equals(END_SEQUENCE_TAG)) {
                        // SLF4J substitutes "{}" placeholders, not "%"
                        log.info("found unknown section key: {}", sectionKey);
                    }
                    break;
            }
        } while (!sectionKey.equals(END_SEQUENCE_TAG));
        return seqData;
    }

    /**
     * Builds {@code seqData} from the ORIGIN section.
     *
     * @param section tuples of the ORIGIN section; entry 0 is the ORIGIN line
     *        itself, later entries carry the sequence text in element [1]
     */
    private void parseStartSequenceTag(List<String[]> section) {
        // our first line is ignorable as it is the ORIGIN tag
        // the second line onwards conveniently have the number as
        // the [0] tuple, and sequence string as [1] so all we have
        // to do is concat the [1] parts and then strip out spaces,
        // and replace '.' and '~' with '-' for our parser.
        StringBuilder seq = new StringBuilder();
        for (int i = 1; i < section.size(); i++) {
            seq.append(section.get(i)[1]);
        }
        // NOTE(review): inside a character class '|' is a literal, so this also
        // turns '|' into '-'. Left unchanged to preserve existing behaviour.
        seqData = seq.toString().replaceAll("\\s+", "").replaceAll("[\\.|~]", "-").toUpperCase();
    }

    /**
     * Turns the FEATURES section into {@link TextFeature}s with qualifiers and
     * stores them in {@code featureCollection}, grouped by feature key.
     * {@code db_xref} qualifiers are additionally recorded in {@code mapDB}.
     *
     * @param section tuples of the FEATURES section; entry 0 is the FEATURES
     *        line itself
     * @throws ParserException if a qualifier appears before any feature, or a
     *         {@code db_xref} value is not of the form {@code db:accession}
     */
    @SuppressWarnings({"rawtypes", "unchecked"}) // TextFeature is created as a raw type, as in the original code
    private void parseFeatureTag(List<String[]> section) {
        // starting from second line of input, start a new feature whenever we come across
        // a key that does not start with /
        AbstractFeature gbFeature = null;
        for (int i = 1; i < section.size(); i++) {
            String key = section.get(i)[0];
            String val = section.get(i)[1];

            if (!key.startsWith("/")) {
                // new feature!
                gbFeature = new TextFeature(key, val, key, key);
                Location l = locationParser.parse(val);
                gbFeature.setLocation((AbstractLocation) l);
                featureCollection.computeIfAbsent(key, k -> new ArrayList<>()).add(gbFeature);
                continue;
            }

            // qualifier on the current feature
            if (gbFeature == null) {
                throw new ParserException("Malformed GenBank file: found a qualifier without feature.");
            }
            boolean needsQuotes = false;
            key = key.substring(1); // strip leading slash
            val = val.replaceAll("\\s*[\\n\\r]+\\s*", " ").trim();
            // Only strip when the value is really wrapped in a pair of quotes:
            // a lone '"' would otherwise raise StringIndexOutOfBoundsException and
            // a value lacking the opening quote would lose its first character.
            if (val.length() >= 2 && val.startsWith("\"") && val.endsWith("\"")) {
                val = val.substring(1, val.length() - 1); // strip quotes
                needsQuotes = true; // as the value has quotes then set that it needs quotes when written back out
            }

            if (key.equals("db_xref")) {
                Matcher m = dbxp.matcher(val);
                if (!m.matches()) {
                    throw new ParserException("Bad dbxref");
                }
                String dbname = m.group(1);
                String raccession = m.group(2);
                DBReferenceInfo xref = new DBReferenceInfo(dbname, raccession);
                xref.setNeedsQuotes(needsQuotes);
                gbFeature.addQualifier(key, xref);

                // accumulate every cross-reference instead of replacing the
                // list (and thereby dropping all earlier ones) on each db_xref
                mapDB.computeIfAbsent(key, k -> new ArrayList<>()).add(xref);
            } else {
                if (key.equalsIgnoreCase("organism")) {
                    val = val.replace('\n', ' ');
                } else if (key.equalsIgnoreCase("translation") || key.equals("anticodon")
                        || key.equals("transl_except")) {
                    // strip spaces from sequence
                    val = val.replaceAll("\\s+", "");
                }
                gbFeature.addQualifier(key, new Qualifier(key, val, needsQuotes));
            }
        }
    }

    /**
     * Stores the value of the COMMENT line in the header parser.
     *
     * @param section tuples of the COMMENT section
     */
    private void parseCommentTag(List<String[]> section) {
        headerParser.setComment(section.get(0)[1]);
    }

    /**
     * Builds a {@link GenbankReference} from the AUTHORS, TITLE and JOURNAL
     * entries of a REFERENCE section and adds it to the header parser. Other
     * entries of the section are ignored.
     *
     * @param section tuples of the REFERENCE section
     */
    private void parseReferenceTag(List<String[]> section) {
        GenbankReference genbankReference = new GenbankReference();
        for (String[] ref : section) {
            switch (ref[0]) {
                case AUTHORS_TAG:
                    genbankReference.setAuthors(ref[1]);
                    break;
                case TITLE_TAG:
                    genbankReference.setTitle(ref[1]);
                    break;
                case JOURNAL_TAG:
                    genbankReference.setJournal(ref[1]);
                    break;
                default:
                    break;
            }
        }
        headerParser.addReference(genbankReference);
    }

    /**
     * Parses the VERSION line: updates the accession field when the line names
     * a different accession, and passes the version number and GI identifier
     * (when present) to the header parser.
     *
     * @param section tuples of the VERSION section
     * @throws ParserException if the line does not match the version pattern
     */
    private void parseVersionTag(List<String[]> section) {
        String ver = section.get(0)[1];
        Matcher m = vp.matcher(ver);
        if (!m.matches()) {
            throw new ParserException("Bad version line");
        }
        String verAcc = m.group(1);
        // verAcc is never null once the pattern matched; comparing in this
        // direction avoids a NullPointerException when no ACCESSION line has
        // been seen yet.
        if (!verAcc.equals(accession)) {
            // the version refers to a different accession!
            // believe the version line, and store the original
            // accession away in the additional accession set
            accession = verAcc;
        }
        if (m.group(3) != null) {
            headerParser.setVersion(Integer.parseInt(m.group(3)));
        }
        if (m.group(5) != null) {
            headerParser.setIdentifier(m.group(5));
        }
    }

    /**
     * Parses the ACCESSION line and records its first whitespace-separated
     * token as the accession; further tokens are currently dropped.
     *
     * @param section tuples of the ACCESSION section
     */
    private void parseAccessionTag(List<String[]> section) {
        // if multiple accessions, store only first as accession,
        // and store rest in annotation
        String[] accs = section.get(0)[1].split("\\s+");
        accession = accs[0].trim();
        headerParser.setAccession(accession);
    }

    /**
     * Stores the value of the DEFINITION line as the description.
     *
     * @param section tuples of the DEFINITION section
     */
    private void parseDefinitionTag(List<String[]> section) {
        headerParser.setDescription(section.get(0)[1]);
    }

    /**
     * Parses the LOCUS line: name, sequence length, length units, molecule
     * type and topology. Two layouts are accepted, with a locus name
     * ({@link #lp}) and without one ({@link #lp2}).
     *
     * @param section tuples of the LOCUS section
     * @throws ParserException if the line matches neither layout
     */
    private void parseLocusTag(List<String[]> section) {
        String loc = section.get(0)[1];
        header = loc;
        Matcher named = lp.matcher(loc);
        if (named.matches()) {
            // remove any preceding or trailing whitespace from the locus name
            String name = named.group(1).trim().replace(' ', '_');
            headerParser.setName(name);
            headerParser.setAccession(name); // default if no accession found
            applyLocusDetails(named.group(2), named.group(3), named.group(6), named.group(7));
            return;
        }
        Matcher unnamed = lp2.matcher(loc);
        if (unnamed.matches()) {
            // Locus Name Missing - use different Locus regex
            headerParser.setName("");
            headerParser.setAccession(""); // default if no accession found
            applyLocusDetails(unnamed.group(1), unnamed.group(2), unnamed.group(5), unnamed.group(6));
            return;
        }
        throw new ParserException("Bad locus line");
    }

    /**
     * Applies the fields shared by both LOCUS layouts: selects the compound
     * set, records the topology and configures the location parser.
     *
     * @param lengthText  sequence length as decimal digits
     * @param lengthUnits {@code aa} or {@code bp}, in any case
     * @param type        molecule type token, or {@code null} if absent
     * @param topology    {@code circular} or {@code linear} in any case, or
     *                    {@code null} if absent (the previous value is then kept)
     */
    private void applyLocusDetails(String lengthText, String lengthUnits, String type, String topology) {
        long sequenceLength = Long.parseLong(lengthText);

        if (lengthUnits.equalsIgnoreCase("aa")) {
            compoundType = AminoAcidCompoundSet.getAminoAcidCompoundSet();
        } else if (lengthUnits.equalsIgnoreCase("bp")) {
            if (type != null && type.contains("RNA")) {
                compoundType = RNACompoundSet.getRNACompoundSet();
            } else {
                compoundType = DNACompoundSet.getDNACompoundSet();
            }
        }

        if (topology != null) {
            isCircularSequence = topology.equalsIgnoreCase("circular");
        }

        // configure location parser with needed information
        locationParser.setSequenceLength(sequenceLength);
        locationParser.setSequenceCircular(isCircularSequence);

        log.debug("compound type: {}", compoundType.getClass().getSimpleName());
    }

    /**
     * Reads an indented section, combining split lines and creating a list of
     * key-&gt;value tuples.
     * <p>
     * A section ends at end of input, or at the next non-blank line that does
     * not start with a space (once at least one such line has been consumed).
     * Inside an ORIGIN section only a line starting with {@code //} ends it.
     * The terminating line is pushed back with {@code reset()} so that the next
     * call starts with it. Blank lines are skipped.
     *
     * @param bufferedReader source to read from; {@code mark}/{@code reset} are used on it
     * @return list of {@code {key, value}} pairs with at least one entry; at
     *         end of input the single entry has a {@code null} key and an empty value
     * @throws ParserException wrapping the message of any {@link IOException}
     *         or {@link RuntimeException} raised while reading
     */
    private List<String[]> readSection(BufferedReader bufferedReader) {
        List<String[]> section = new ArrayList<>();
        String line;

        String currKey = null;
        StringBuilder currVal = new StringBuilder();
        boolean done = false;
        int linecount = 0;

        try {
            while (!done) {
                // remember the start of the line so a section terminator can be pushed back;
                // per the BufferedReader contract reset() may fail once more than the
                // read-ahead limit has been consumed.
                bufferedReader.mark(320);
                line = bufferedReader.readLine();
                String firstSecKey = section.isEmpty() ? "" : section.get(0)[0];
                if (line != null && BLANK_LINE.matcher(line).matches()) {
                    // line has only white space characters
                    continue;
                }
                // NOTE: linecount is only incremented for lines that do not start
                // with a space (short-circuit evaluation) - order matters here.
                if (line == null
                        || (!line.startsWith(" ") && linecount++ > 0
                                && (!firstSecKey.equals(START_SEQUENCE_TAG)
                                        || line.startsWith(END_SEQUENCE_TAG)))) {
                    // dump out last part of section
                    section.add(new String[]{currKey, currVal.toString()});
                    bufferedReader.reset();
                    done = true;
                } else {
                    Matcher m = sectp.matcher(line);
                    if (m.matches()) {
                        // new key
                        if (currKey != null) {
                            section.add(new String[]{currKey, currVal.toString()});
                        }
                        // key = group(2) or group(4) or group(6) - whichever is not null;
                        // val = group(3) if group(2) not null, group(5) if
                        // group(4) not null, "" otherwise, trimmed
                        String value;
                        if (m.group(2) != null) {
                            currKey = m.group(2);
                            value = m.group(3);
                        } else if (m.group(4) != null) {
                            currKey = m.group(4);
                            value = m.group(5);
                        } else {
                            currKey = m.group(6);
                            value = "";
                        }
                        currVal = new StringBuilder(value.trim());
                    } else {
                        // concatted line or SEQ START/END line?
                        if (line.startsWith(START_SEQUENCE_TAG)
                                || line.startsWith(END_SEQUENCE_TAG)) {
                            currKey = line;
                        } else {
                            currVal.append("\n"); // newline in between lines - can be removed later
                            // qualifier continuations are indented 21 columns, others 12
                            currVal.append(currKey.charAt(0) == '/'
                                    ? line.substring(21)
                                    : line.substring(12));
                        }
                    }
                }
            }
        } catch (IOException | RuntimeException e) {
            // NOTE(review): only the message survives here; if ParserException offers a
            // (message, cause) constructor, passing e along would keep the stack trace.
            throw new ParserException(e.getMessage());
        }
        return section;
    }

    /**
     * Parses the next GenBank record from the reader. Header parser, feature
     * collection and database-reference map are re-created on every call.
     *
     * @param bufferedReader source positioned at the start of a record
     * @param sequenceLength not used by this implementation
     * @return the sequence text of the record, or {@code null} when the end of
     *         the input was reached
     * @throws ParserException if the record cannot be parsed
     */
    @Override
    public String getSequence(BufferedReader bufferedReader, int sequenceLength) {
        featureCollection = new HashMap<>();
        mapDB = new LinkedHashMap<>();
        headerParser = new GenericGenbankHeaderParser<>();
        try {
            parse(bufferedReader);
        } catch (ParserException e) {
            // the message can be null when a message-less runtime exception was wrapped
            String message = e.getMessage();
            if (message != null && message.equalsIgnoreCase(Messages.ENDOFFILE)) {
                return null;
            }
            // rethrow as-is so the original stack trace is kept
            throw e;
        }

        return seqData;
    }

    /**
     * @return the raw value of the most recently parsed LOCUS line
     */
    public String getHeader() {
        return header;
    }

    /**
     * @return the header parser populated by the last call to
     *         {@link #getSequence(BufferedReader, int)}
     */
    public GenericGenbankHeaderParser<S, C> getSequenceHeaderParser() {
        return headerParser;
    }

    /**
     * @return the database cross-references collected from {@code db_xref}
     *         qualifiers, keyed by qualifier name
     */
    public Map<String, List<DBReferenceInfo>> getDatabaseReferences() {
        return mapDB;
    }

    /**
     * @return a new list holding the feature keys found in the last record
     */
    public List<String> getKeyWords() {
        return new ArrayList<>(featureCollection.keySet());
    }

    /**
     * @param keyword feature key, e.g. the first column of a FEATURES entry
     * @return the features collected under that key, or {@code null} if none
     */
    public List<AbstractFeature<AbstractSequence<C>, C>> getFeatures(String keyword) {
        return featureCollection.get(keyword);
    }

    /**
     * @return all collected features, grouped by feature key
     */
    public Map<String, List<AbstractFeature<AbstractSequence<C>, C>>> getFeatures() {
        return featureCollection;
    }

    /**
     * Adds every collected feature to the given sequence.
     *
     * @param sequence sequence that receives the features
     */
    public void parseFeatures(AbstractSequence<C> sequence) {
        for (List<AbstractFeature<AbstractSequence<C>, C>> features : featureCollection.values()) {
            for (AbstractFeature<AbstractSequence<C>, C> f : features) {
                sequence.addFeature(f);
            }
        }
    }

    /**
     * @return the compound set selected from the LOCUS line, or {@code null}
     *         if no LOCUS line has been parsed yet
     */
    public CompoundSet<?> getCompoundType() {
        return compoundType;
    }
}