001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * @author Richard Holland 015 * @author Mark Schreiber 016 * @author David Scott 017 * @author Bubba Puryear 018 * @author George Waldon 019 * @author Deepak Sheoran 020 * @author Karl Nicholas <github:karlnicholas> 021 * @author Jacek Grzebyta 022 * @author Paolo Pavan 023 * 024 * For more information on the BioJava project and its aims, 025 * or to join the biojava-l mailing list, visit the home page 026 * at: 027 * 028 * http://www.biojava.org/ 029 * 030 * Created on 01-21-2010 031 */ 032package org.biojava.nbio.core.sequence.io; 033 034import org.biojava.nbio.core.exceptions.Messages; 035import org.biojava.nbio.core.exceptions.ParserException; 036import org.biojava.nbio.core.sequence.DataSource; 037import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet; 038import org.biojava.nbio.core.sequence.compound.DNACompoundSet; 039import org.biojava.nbio.core.sequence.compound.RNACompoundSet; 040import org.biojava.nbio.core.sequence.features.AbstractFeature; 041import org.biojava.nbio.core.sequence.features.DBReferenceInfo; 042import org.biojava.nbio.core.sequence.features.Qualifier; 043import org.biojava.nbio.core.sequence.features.TextFeature; 044import org.biojava.nbio.core.sequence.io.template.SequenceParserInterface; 045import org.biojava.nbio.core.sequence.location.InsdcParser; 046import org.biojava.nbio.core.sequence.location.template.AbstractLocation; 047import org.biojava.nbio.core.sequence.location.template.Location; 048import org.biojava.nbio.core.sequence.reference.GenbankReference; 049import org.biojava.nbio.core.sequence.template.AbstractSequence; 050import org.biojava.nbio.core.sequence.template.Compound; 051import org.biojava.nbio.core.sequence.template.CompoundSet; 052import org.slf4j.Logger; 053import org.slf4j.LoggerFactory; 054 055import java.io.BufferedReader; 056import java.io.IOException; 057import java.util.*; 058import java.util.regex.Matcher; 059import java.util.regex.Pattern; 060 061public class GenbankSequenceParser<S extends AbstractSequence<C>, C extends Compound> implements SequenceParserInterface{ 062 063 private String seqData = null; 064 private GenericGenbankHeaderParser<S, C> headerParser; 065 private String header; 066 private String accession; 067 private boolean isCircularSequence; 068 private Map<String, List<DBReferenceInfo>> mapDB; 069 /** 070 * this data structure collects list of features extracted from the 071 * FEATURE_TAG section They are organized by list of the same type (i.e. 072 * same genbank Feature) and are provided with location 073 */ 074 private Map<String, List<AbstractFeature<AbstractSequence<C>, C>>> featureCollection; 075 076 private final Logger log = LoggerFactory.getLogger(getClass()); 077 078 // this is a compoundset parsed from header. 079 private CompoundSet<?> compoundType; 080 081 /** 082 * The name of this format 083 */ 084 public static final String GENBANK_FORMAT = "GENBANK"; 085 086 protected static final String LOCUS_TAG = "LOCUS"; 087 protected static final String DEFINITION_TAG = "DEFINITION"; 088 protected static final String ACCESSION_TAG = "ACCESSION"; 089 protected static final String VERSION_TAG = "VERSION"; 090 protected static final String KEYWORDS_TAG = "KEYWORDS"; 091 // "SEGMENT" 092 protected static final String SOURCE_TAG = "SOURCE"; 093 protected static final String ORGANISM_TAG = "ORGANISM"; 094 protected static final String REFERENCE_TAG = "REFERENCE"; 095 protected static final String AUTHORS_TAG = "AUTHORS"; 096 protected static final String CONSORTIUM_TAG = "CONSRTM"; 097 protected static final String TITLE_TAG = "TITLE"; 098 protected static final String JOURNAL_TAG = "JOURNAL"; 099 protected static final String PUBMED_TAG = "PUBMED"; 100 protected static final String MEDLINE_TAG = "MEDLINE"; //deprecated 101 protected static final String REMARK_TAG = "REMARK"; 102 protected static final String COMMENT_TAG = "COMMENT"; 103 protected static final String FEATURE_TAG = "FEATURES"; 104 protected static final String BASE_COUNT_TAG_FULL = "BASE COUNT"; //deprecated 105 protected static final String BASE_COUNT_TAG = "BASE"; 106 // "CONTIG" 107 protected static final String START_SEQUENCE_TAG = "ORIGIN"; 108 protected static final String DBSOURCE = "DBSOURCE"; 109 protected static final String PRIMARY = "PRIMARY"; 110 protected static final String DBLINK = "DBLINK"; 111 protected static final String END_SEQUENCE_TAG = "//"; 112 // locus line 113 protected static final Pattern lp = Pattern.compile("^(\\S+)\\s+(\\d+)\\s+(bp|BP|aa|AA)\\s{0,4}(([dmsDMS][sS]-)?(\\S+))?\\s*(circular|CIRCULAR|linear|LINEAR)?\\s*(\\S+)?\\s*(\\S+)?$"); 114 // version line 115 protected static final Pattern vp = Pattern.compile("^(\\S*?)(\\.(\\d+))?(\\s+GI:(\\S+))?$"); 116 // reference line 117 protected static final Pattern refRange = Pattern.compile("^\\s*(\\d+)\\s+to\\s+(\\d+)$"); 118 protected static final Pattern refp = Pattern.compile("^(\\d+)\\s*(?:(\\((?:bases|residues)\\s+(\\d+\\s+to\\s+\\d+(\\s*;\\s*\\d+\\s+to\\s+\\d+)*)\\))|\\(sites\\))?"); 119 // dbxref line 120 protected static final Pattern dbxp = Pattern.compile("^([^:]+):(\\S+)$"); 121 122 protected static final InsdcParser locationParser = new InsdcParser(DataSource.GENBANK); 123 /** 124 * sections start at a line and continue till the first line afterwards with a 125 * non-whitespace first character 126 * we want to match any of the following as a new section within a section 127 * \s{0,8} word \s{0,7} value 128 * \s{21} /word = value 129 * \s{21} /word 130 */ 131 protected static final Pattern sectp = Pattern.compile("^(\\s{0,8}(\\S+)\\s{0,7}(.*)|\\s{21}(/\\S+?)=(.*)|\\s{21}(/\\S+))$"); 132 133 protected static final Pattern readableFiles = Pattern.compile(".*(g[bp]k*$|\\u002eg[bp].*)"); 134 protected static final Pattern headerLine = Pattern.compile("^LOCUS.*"); 135 136 137 private String parse(BufferedReader bufferedReader) { 138 String sectionKey; 139 List<String[]> section; 140 // Get an ordered list of key->value pairs in array-tuples 141 do { 142 section = this.readSection(bufferedReader); 143 sectionKey = section.get(0)[0]; 144 if (sectionKey == null) { 145 //if we reach the end of the file, section contains empty strings 146 if(section.get(0)[1]==null || section.get(0)[1].equals("") || 147 section.get(0)[1].length()==0) { 148 throw new ParserException(Messages.ENDOFFILE); 149 } 150 throw new ParserException(Messages.SECTIONKEYNULL); 151 } 152 // process section-by-section 153 switch (sectionKey) { 154 case LOCUS_TAG: parseLocusTag(section); break; 155 case DEFINITION_TAG: parseDefinitionTag(section); break; 156 case ACCESSION_TAG: parseAccessionTag(section); break; 157 case VERSION_TAG: parseVersionTag(section); break; 158 case KEYWORDS_TAG: break; // not implemented yet 159 case SOURCE_TAG: break; // ignore - can get all this from the first feature 160 case REFERENCE_TAG: parseReferenceTag(section); break; 161 case COMMENT_TAG: parseCommentTag(section); break; 162 case FEATURE_TAG: parseFeatureTag(section); break; 163 case BASE_COUNT_TAG: break; // ignore - can calculate from sequence content later if needed 164 case START_SEQUENCE_TAG: parseStartSequenceTag(section); break; 165 case DBSOURCE: break; // not implemented yet 166 case PRIMARY: break; // not implemented yet 167 case DBLINK: break; // not implemented yet 168 default: 169 if(!sectionKey.equals(END_SEQUENCE_TAG)) { 170 log.info("found unknown section key: %", sectionKey); 171 } 172 } 173 } while (!sectionKey.equals(END_SEQUENCE_TAG)); 174 return seqData; 175 } 176 177 private void parseStartSequenceTag(List<String[]> section) { 178 // our first line is ignorable as it is the ORIGIN tag 179 // the second line onwards conveniently have the number as 180 // the [0] tuple, and sequence string as [1] so all we have 181 // to do is concat the [1] parts and then strip out spaces, 182 // and replace '.' and '~' with '-' for our parser. 183 StringBuilder seq = new StringBuilder(); 184 for (int i = 1; i < section.size(); i++) { 185 seq.append(section.get(i)[1]); 186 } 187 seqData = seq.toString().replaceAll("\\s+", "").replaceAll("[\\.|~]", "-").toUpperCase(); 188 } 189 190 private void parseFeatureTag(List<String[]> section) { 191 // starting from second line of input, start a new feature whenever we come across 192 // a key that does not start with / 193 AbstractFeature gbFeature = null; 194 for (int i = 1; i < section.size(); i++) { 195 String key = section.get(i)[0]; 196 String val = section.get(i)[1]; 197 if (key.startsWith("/")) { 198 if (gbFeature == null) { 199 throw new ParserException("Malformed GenBank file: found a qualifier without feature."); 200 } 201 key = key.substring(1); // strip leading slash 202 val = val.replaceAll("\\s*[\\n\\r]+\\s*", " ").trim(); 203 if (val.endsWith("\"")) { 204 val = val.substring(1, val.length() - 1); // strip quotes 205 } 206 // parameter on old feature 207 if (key.equals("db_xref")) { 208 Matcher m = dbxp.matcher(val); 209 if (m.matches()) { 210 String dbname = m.group(1); 211 String raccession = m.group(2); 212 DBReferenceInfo xref = new DBReferenceInfo(dbname, raccession); 213 gbFeature.addQualifier(key, xref); 214 215 ArrayList<DBReferenceInfo> listDBEntry = new ArrayList<>(); 216 listDBEntry.add(xref); 217 mapDB.put(key, listDBEntry); 218 } else { 219 throw new ParserException("Bad dbxref"); 220 } 221 } else if (key.equalsIgnoreCase("organism")) { 222 Qualifier q = new Qualifier(key, val.replace('\n', ' ')); 223 gbFeature.addQualifier(key, q); 224 } else { 225 if (key.equalsIgnoreCase("translation") || key.equals("anticodon") 226 || key.equals("transl_except")) { 227 // strip spaces from sequence 228 val = val.replaceAll("\\s+", ""); 229 Qualifier q = new Qualifier(key, val); 230 gbFeature.addQualifier(key, q); 231 } else { 232 Qualifier q = new Qualifier(key, val); 233 gbFeature.addQualifier(key, q); 234 } 235 } 236 } else { 237 // new feature! 238 gbFeature = new TextFeature(key, val, key, key); 239 Location l = 240 locationParser.parse(val); 241 gbFeature.setLocation((AbstractLocation)l); 242 243 if (!featureCollection.containsKey(key)) { 244 featureCollection.put(key, new ArrayList<>()); 245 } 246 featureCollection.get(key).add(gbFeature); 247 } 248 } 249 } 250 251 private void parseCommentTag(List<String[]> section) { 252 headerParser.setComment(section.get(0)[1]); 253 } 254 255 private void parseReferenceTag(List<String[]> section) { 256 GenbankReference genbankReference = new GenbankReference(); 257 for (String[] ref : section) { 258 if (ref[0].equals(AUTHORS_TAG)) { 259 genbankReference.setAuthors(ref[1]); 260 } else if (ref[0].equals(TITLE_TAG)) { 261 genbankReference.setTitle(ref[1]); 262 } else if (ref[0].equals(JOURNAL_TAG)) { 263 genbankReference.setJournal(ref[1]); 264 } 265 } 266 headerParser.addReference(genbankReference); 267 } 268 269 private void parseVersionTag(List<String[]> section) { 270 String ver = section.get(0)[1]; 271 Matcher m = vp.matcher(ver); 272 if (m.matches()) { 273 String verAcc = m.group(1); 274 if (!accession.equals(verAcc)) { 275 // the version refers to a different accession! 276 // believe the version line, and store the original 277 // accession away in the additional accession set 278 accession = verAcc; 279 } 280 if (m.group(3) != null) { 281 headerParser.setVersion(Integer.parseInt(m.group(3))); 282 } 283 if (m.group(5) != null) { 284 headerParser.setIdentifier(m.group(5)); 285 } 286 } else { 287 throw new ParserException("Bad version line"); 288 } 289 } 290 291 private void parseAccessionTag(List<String[]> section) { 292 // if multiple accessions, store only first as accession, 293 // and store rest in annotation 294 String[] accs = section.get(0)[1].split("\\s+"); 295 accession = accs[0].trim(); 296 headerParser.setAccession(accession); 297 } 298 299 private void parseDefinitionTag(List<String[]> section) { 300 headerParser.setDescription(section.get(0)[1]); 301 } 302 303 private void parseLocusTag(List<String[]> section) { 304 String loc = section.get(0)[1]; 305 header = loc; 306 Matcher m = lp.matcher(loc); 307 if (m.matches()) { 308 headerParser.setName(m.group(1)); 309 headerParser.setAccession(m.group(1)); // default if no accession found 310 long sequenceLength = Long.valueOf(m.group(2)); 311 String lengthUnits = m.group(3); 312 String type = m.group(6); 313 314 if (lengthUnits.equalsIgnoreCase("aa")) { 315 compoundType = AminoAcidCompoundSet.getAminoAcidCompoundSet(); 316 } else if (lengthUnits.equalsIgnoreCase("bp")) { 317 if (type != null) { 318 if (type.contains("RNA")) { 319 compoundType = RNACompoundSet.getRNACompoundSet(); 320 } else { 321 compoundType = DNACompoundSet.getDNACompoundSet(); 322 } 323 } else { 324 compoundType = DNACompoundSet.getDNACompoundSet(); 325 } 326 } 327 328 if (m.group(7) != null) isCircularSequence = m.group(7).equalsIgnoreCase("circular"); 329 330 // configure location parser with needed information 331 locationParser.setSequenceLength(sequenceLength); 332 locationParser.setSequenceCircular(isCircularSequence); 333 334 log.debug("compound type: {}", compoundType.getClass().getSimpleName()); 335 336 } else { 337 throw new ParserException("Bad locus line"); 338 } 339 } 340 341 342 // reads an indented section, combining split lines and creating a list of 343 // key->value tuples 344 // reads an indented section, combining split lines and creating a list of 345 // key->value tuples 346 // reads an indented section, combining split lines and creating a list of 347 // key->value tuples 348 private List<String[]> readSection(BufferedReader bufferedReader) { 349 List<String[]> section = new ArrayList<>(); 350 String line; 351 352 String currKey = null; 353 StringBuilder currVal = new StringBuilder(); 354 boolean done = false; 355 int linecount = 0; 356 357 try { 358 while (!done) { 359 bufferedReader.mark(320); 360 line = bufferedReader.readLine(); 361 String firstSecKey = section.isEmpty() ? "" 362 : section.get(0)[0]; 363 if (line != null && line.matches("\\p{Space}*")) { 364 // regular expression \p{Space}* will match line 365 // having only white space characters 366 continue; 367 } 368 if (line == null 369 || (!line.startsWith(" ") && linecount++ > 0 && (!firstSecKey 370 .equals(START_SEQUENCE_TAG) || line 371 .startsWith(END_SEQUENCE_TAG)))) { 372 // dump out last part of section 373 section.add(new String[]{currKey, currVal.toString()}); 374 bufferedReader.reset(); 375 done = true; 376 } else { 377 Matcher m = sectp.matcher(line); 378 if (m.matches()) { 379 // new key 380 if (currKey != null) { 381 section.add(new String[]{currKey, 382 currVal.toString()}); 383 } 384 // key = group(2) or group(4) or group(6) - whichever is 385 // not null 386 currKey = m.group(2) == null ? (m.group(4) == null ? m 387 .group(6) : m.group(4)) : m.group(2); 388 currVal = new StringBuilder(); 389 // val = group(3) if group(2) not null, group(5) if 390 // group(4) not null, "" otherwise, trimmed 391 currVal.append((m.group(2) == null ? (m.group(4) == null ? "" 392 : m.group(5)) 393 : m.group(3)).trim()); 394 } else { 395 // concatted line or SEQ START/END line? 396 if (line.startsWith(START_SEQUENCE_TAG) 397 || line.startsWith(END_SEQUENCE_TAG)) { 398 currKey = line; 399 } else { 400 currVal.append("\n"); // newline in between lines - 401 // can be removed later 402 currVal.append(currKey.charAt(0) == '/' ? line 403 .substring(21) : line.substring(12)); 404 } 405 } 406 } 407 } 408 } catch (IOException | RuntimeException e) { 409 throw new ParserException(e.getMessage()); 410 } 411 return section; 412 } 413 414 @Override 415 public String getSequence(BufferedReader bufferedReader, int sequenceLength) { 416 featureCollection = new HashMap<>(); 417 mapDB = new LinkedHashMap<>(); 418 headerParser = new GenericGenbankHeaderParser<>(); 419 try { 420 parse(bufferedReader); 421 } catch (ParserException e) { 422 if(e.getMessage().equalsIgnoreCase(Messages.ENDOFFILE)) return null; 423 else throw new ParserException(e.getMessage()); 424 } 425 426 return seqData; 427 } 428 429 public String getHeader() { 430 return header; 431 } 432 433 public GenericGenbankHeaderParser<S, C> getSequenceHeaderParser() { 434 return headerParser; 435 } 436 437 public Map<String, List<DBReferenceInfo>> getDatabaseReferences() { 438 return mapDB; 439 } 440 441 public List<String> getKeyWords() { 442 return new ArrayList<>(featureCollection.keySet()); 443 } 444 445 public List<AbstractFeature<AbstractSequence<C>, C>> getFeatures(String keyword) { 446 return featureCollection.get(keyword); 447 } 448 public Map<String, List<AbstractFeature<AbstractSequence<C>, C>>> getFeatures() { 449 return featureCollection; 450 } 451 452 public void parseFeatures(AbstractSequence<C> sequence) { 453 for (String k: featureCollection.keySet()) 454 for (AbstractFeature<AbstractSequence<C>, C> f: featureCollection.get(k)) 455 sequence.addFeature(f); 456 } 457 458 public CompoundSet<?> getCompoundType() { 459 return compoundType; 460 } 461}