/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * @author Richard Holland
 * @author Mark Schreiber
 * @author David Scott
 * @author Bubba Puryear
 * @author George Waldon
 * @author Deepak Sheoran
 * @author Karl Nicholas <github:karlnicholas>
 * @author Jacek Grzebyta
 * @author Paolo Pavan
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 * Created on 01-21-2010
 */
package org.biojava.nbio.core.sequence.io;

import org.biojava.nbio.core.exceptions.Messages;
import org.biojava.nbio.core.exceptions.ParserException;
import org.biojava.nbio.core.sequence.DataSource;
import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet;
import org.biojava.nbio.core.sequence.compound.DNACompoundSet;
import org.biojava.nbio.core.sequence.compound.RNACompoundSet;
import org.biojava.nbio.core.sequence.features.AbstractFeature;
import org.biojava.nbio.core.sequence.features.DBReferenceInfo;
import org.biojava.nbio.core.sequence.features.Qualifier;
import org.biojava.nbio.core.sequence.features.TextFeature;
import org.biojava.nbio.core.sequence.io.template.SequenceParserInterface;
import org.biojava.nbio.core.sequence.location.InsdcParser;
import org.biojava.nbio.core.sequence.location.template.AbstractLocation;
import org.biojava.nbio.core.sequence.location.template.Location;
import org.biojava.nbio.core.sequence.reference.GenbankReference;
import org.biojava.nbio.core.sequence.template.AbstractSequence;
import org.biojava.nbio.core.sequence.template.Compound;
import org.biojava.nbio.core.sequence.template.CompoundSet;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.BufferedReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Reads one GenBank flat-file record per {@link #getSequence(BufferedReader, int)}
 * call and exposes what was collected from it: the LOCUS line, a header parser
 * filled from the header sections, the features of the FEATURES table, the
 * {@code db_xref} qualifiers and the sequence string of the ORIGIN section.
 * <p>
 * All of this state is replaced by the next call to {@code getSequence}, so the
 * getters always describe the most recently requested record.
 *
 * @param <S> the sequence type
 * @param <C> the compound type of the sequence
 */
public class GenbankSequenceParser<S extends AbstractSequence<C>, C extends Compound> implements SequenceParserInterface {

	private String seqData = null;
	private GenericGenbankHeaderParser<S, C> headerParser;
	private String header;
	private String accession;
	public LinkedHashMap<String, ArrayList<DBReferenceInfo>> mapDB;
	/**
	 * this data structure collects list of features extracted from the
	 * FEATURE_TAG section They are organized by list of the same type (i.e.
	 * same genbank Feature) and are provided with location
	 */
	private HashMap<String, ArrayList<AbstractFeature>> featureCollection;

	private final Logger log = LoggerFactory.getLogger(getClass());

	// this is a compoundset parsed from header.
	private CompoundSet<?> compoundType;

	/**
	 * The name of this format
	 */
	public static final String GENBANK_FORMAT = "GENBANK";

	protected static final String LOCUS_TAG = "LOCUS";
	protected static final String DEFINITION_TAG = "DEFINITION";
	protected static final String ACCESSION_TAG = "ACCESSION";
	protected static final String VERSION_TAG = "VERSION";
	protected static final String KEYWORDS_TAG = "KEYWORDS";
	//                                                  "SEGMENT"
	protected static final String SOURCE_TAG = "SOURCE";
	protected static final String ORGANISM_TAG = "ORGANISM";
	protected static final String REFERENCE_TAG = "REFERENCE";
	protected static final String AUTHORS_TAG = "AUTHORS";
	protected static final String CONSORTIUM_TAG = "CONSRTM";
	protected static final String TITLE_TAG = "TITLE";
	protected static final String JOURNAL_TAG = "JOURNAL";
	protected static final String PUBMED_TAG = "PUBMED";
	protected static final String MEDLINE_TAG = "MEDLINE"; //deprecated
	protected static final String REMARK_TAG = "REMARK";
	protected static final String COMMENT_TAG = "COMMENT";
	protected static final String FEATURE_TAG = "FEATURES";
	protected static final String BASE_COUNT_TAG_FULL = "BASE COUNT"; //deprecated
	protected static final String BASE_COUNT_TAG = "BASE";
	//                                                  "CONTIG"
	protected static final String START_SEQUENCE_TAG = "ORIGIN";
	protected static final String END_SEQUENCE_TAG = "//";
	// locus line
	protected static final Pattern lp = Pattern.compile("^(\\S+)\\s+\\d+\\s+(bp|aa)\\s{1,4}(([dms]s-)?(\\S+))?\\s+(circular|linear)?\\s*(\\S+)?\\s*(\\S+)?$");
	// version line
	protected static final Pattern vp = Pattern.compile("^(\\S*?)(\\.(\\d+))?(\\s+GI:(\\S+))?$");
	// reference line
	protected static final Pattern refRange = Pattern.compile("^\\s*(\\d+)\\s+to\\s+(\\d+)$");
	protected static final Pattern refp = Pattern.compile("^(\\d+)\\s*(?:(\\((?:bases|residues)\\s+(\\d+\\s+to\\s+\\d+(\\s*;\\s*\\d+\\s+to\\s+\\d+)*)\\))|\\(sites\\))?");
	// dbxref line
	protected static final Pattern dbxp = Pattern.compile("^([^:]+):(\\S+)$");

	protected static final InsdcParser locationParser = new InsdcParser(DataSource.GENBANK);
	//sections start at a line and continue till the first line afterwards with a
	//non-whitespace first character
	//we want to match any of the following as a new section within a section
	//  \s{0,8} word \s{0,7} value
	//  \s{21} /word = value
	//  \s{21} /word
	protected static final Pattern sectp = Pattern.compile("^(\\s{0,8}(\\S+)\\s{0,7}(.*)|\\s{21}(/\\S+?)=(.*)|\\s{21}(/\\S+))$");

	protected static final Pattern readableFiles = Pattern.compile(".*(g[bp]k*$|\\u002eg[bp].*)");
	protected static final Pattern headerLine = Pattern.compile("^LOCUS.*");
	private static final String DBSOURCE = "DBSOURCE";
	private static final String PRIMARY = "PRIMARY";
	private static final String DBLINK = "DBLINK";

	// matches lines that are empty or hold only whitespace; compiled once
	// instead of once per input line
	private static final Pattern BLANK_LINE = Pattern.compile("\\p{Space}*");

	/**
	 * Reads sections from the reader and dispatches each one on its key until
	 * the {@code //} terminator section has been consumed.
	 *
	 * @param bufferedReader the reader positioned at the start of a record
	 * @return the sequence string stored by the ORIGIN section handler
	 * @throws ParserException with message {@link Messages#ENDOFFILE} when the
	 *         reader has no more sections, or with another message when a
	 *         section cannot be interpreted
	 */
	private String parse(BufferedReader bufferedReader) {
		String sectionKey;
		do {
			// ordered list of key->value pairs in array-tuples
			List<String[]> section = this.readSection(bufferedReader);
			sectionKey = section.get(0)[0];
			if (sectionKey == null) {
				// at the end of the input readSection returns a single tuple
				// with a null key and an empty value
				String firstValue = section.get(0)[1];
				if (firstValue == null || firstValue.length() == 0) {
					throw new ParserException(Messages.ENDOFFILE);
				}
				throw new ParserException(Messages.SECTIONKEYNULL);
			}

			// process section-by-section
			if (LOCUS_TAG.equals(sectionKey)) {
				parseLocusTag(section);
			} else if (DEFINITION_TAG.equals(sectionKey)) {
				headerParser.setDescription(section.get(0)[1]);
			} else if (ACCESSION_TAG.equals(sectionKey)) {
				// if multiple accessions are listed only the first one is used
				String[] accs = section.get(0)[1].split("\\s+");
				accession = accs[0].trim();
				headerParser.setAccession(accession);
			} else if (VERSION_TAG.equals(sectionKey)) {
				parseVersionTag(section);
			} else if (REFERENCE_TAG.equals(sectionKey)) {
				parseReferenceTag(section);
			} else if (COMMENT_TAG.equals(sectionKey)) {
				headerParser.setComment(section.get(0)[1]);
			} else if (FEATURE_TAG.equals(sectionKey)) {
				parseFeatureTag(section);
			} else if (START_SEQUENCE_TAG.equals(sectionKey)) {
				parseSequenceTag(section);
			} else if (KEYWORDS_TAG.equals(sectionKey)
					|| SOURCE_TAG.equals(sectionKey)
					|| BASE_COUNT_TAG.equals(sectionKey)
					|| DBSOURCE.equals(sectionKey)
					|| PRIMARY.equals(sectionKey)
					|| DBLINK.equals(sectionKey)) {
				// recognised but not interpreted:
				// SOURCE - can get all this from the first feature
				// BASE   - can calculate from sequence content later if needed
				// KEYWORDS, DBSOURCE, PRIMARY, DBLINK - TODO
			} else if (!END_SEQUENCE_TAG.equals(sectionKey)) {
				log.info("found unknown section key: {}", sectionKey);
			}
		} while (!END_SEQUENCE_TAG.equals(sectionKey));
		return seqData;
	}

	/**
	 * Handles the LOCUS section: remembers the line as the record header, uses
	 * the locus name as name and provisional accession, and derives the
	 * compound set from the length unit and molecule type columns.
	 */
	private void parseLocusTag(List<String[]> section) {
		String loc = section.get(0)[1];
		header = loc;
		Matcher m = lp.matcher(loc);
		if (!m.matches()) {
			throw new ParserException("Bad locus line");
		}
		headerParser.setName(m.group(1));
		headerParser.setAccession(m.group(1)); // default if no accession found

		// the locus pattern only admits "aa" or "bp" as length unit, so
		// everything that is not "aa" is a nucleotide record
		String lengthUnits = m.group(2);
		String type = m.group(5);
		if (lengthUnits.equals("aa")) {
			compoundType = AminoAcidCompoundSet.getAminoAcidCompoundSet();
		} else if (type != null && type.contains("RNA")) {
			compoundType = RNACompoundSet.getRNACompoundSet();
		} else {
			compoundType = DNACompoundSet.getDNACompoundSet();
		}

		log.debug("compound type: {}", compoundType.getClass().getSimpleName());
	}

	/**
	 * Handles the VERSION section: stores the version number and the GI
	 * identifier on the header parser when the line carries them.
	 */
	private void parseVersionTag(List<String[]> section) {
		Matcher m = vp.matcher(section.get(0)[1]);
		if (!m.matches()) {
			throw new ParserException("Bad version line");
		}
		// believe the version line if it names a different accession. A plain
		// assignment also covers records without a preceding ACCESSION
		// section, where comparing against the unset field used to fail.
		accession = m.group(1);
		if (m.group(3) != null) {
			headerParser.setVersion(Integer.parseInt(m.group(3)));
		}
		if (m.group(5) != null) {
			headerParser.setIdentifier(m.group(5));
		}
	}

	/**
	 * Handles one REFERENCE section: copies its AUTHORS, TITLE and JOURNAL
	 * entries into a new reference and adds it to the header parser.
	 */
	private void parseReferenceTag(List<String[]> section) {
		GenbankReference genbankReference = new GenbankReference();
		for (String[] ref : section) {
			if (AUTHORS_TAG.equals(ref[0])) {
				genbankReference.setAuthors(ref[1]);
			} else if (TITLE_TAG.equals(ref[0])) {
				genbankReference.setTitle(ref[1]);
			} else if (JOURNAL_TAG.equals(ref[0])) {
				genbankReference.setJournal(ref[1]);
			}
		}
		headerParser.addReference(genbankReference);
	}

	/**
	 * Handles the FEATURES section. Starting from the second tuple, a key that
	 * does not start with {@code /} opens a new feature whose value is its
	 * location; a key that starts with {@code /} is a qualifier of the feature
	 * opened last.
	 */
	private void parseFeatureTag(List<String[]> section) {
		AbstractFeature gbFeature = null;
		for (int i = 1; i < section.size(); i++) {
			String key = section.get(i)[0];
			String val = section.get(i)[1];
			if (key.startsWith("/")) {
				if (gbFeature == null) {
					throw new ParserException("Malformed GenBank file: found a qualifier without feature.");
				}
				addFeatureQualifier(gbFeature, key.substring(1), val); // strip leading slash
			} else {
				// new feature!
				gbFeature = new TextFeature(key, val, key, key);
				Location l = locationParser.parse(val);
				gbFeature.setLocation((AbstractLocation) l);

				ArrayList<AbstractFeature> sameType = featureCollection.get(key);
				if (sameType == null) {
					sameType = new ArrayList<AbstractFeature>();
					featureCollection.put(key, sameType);
				}
				sameType.add(gbFeature);
			}
		}
	}

	/**
	 * Normalises one qualifier value and attaches it to the feature. Line
	 * breaks are collapsed to single spaces and a pair of enclosing double
	 * quotes is removed. {@code db_xref} qualifiers are additionally collected
	 * in {@link #mapDB}.
	 */
	private void addFeatureQualifier(AbstractFeature gbFeature, String key, String rawValue) {
		String val = rawValue.replaceAll("\\s*[\\n\\r]+\\s*", " ").trim();
		// only strip when the value really is enclosed in quotes; a lone
		// quote or a value that merely ends in one is left untouched
		if (val.length() >= 2 && val.startsWith("\"") && val.endsWith("\"")) {
			val = val.substring(1, val.length() - 1);
		}

		if (key.equals("db_xref")) {
			Matcher m = dbxp.matcher(val);
			if (!m.matches()) {
				throw new ParserException("Bad dbxref");
			}
			DBReferenceInfo xref = new DBReferenceInfo(m.group(1), m.group(2));
			gbFeature.addQualifier(key, xref);

			// append to the list of the record rather than replacing it, so
			// that every db_xref of the record is kept
			ArrayList<DBReferenceInfo> listDBEntry = mapDB.get(key);
			if (listDBEntry == null) {
				listDBEntry = new ArrayList<DBReferenceInfo>();
				mapDB.put(key, listDBEntry);
			}
			listDBEntry.add(xref);
			return;
		}

		if (key.equalsIgnoreCase("translation")) {
			// strip spaces from sequence
			val = val.replaceAll("\\s+", "");
		}
		gbFeature.addQualifier(key, new Qualifier(key, val));
	}

	/**
	 * Handles the ORIGIN section and stores the result as sequence data.
	 */
	private void parseSequenceTag(List<String[]> section) {
		// our first line is ignorable as it is the ORIGIN tag
		// the second line onwards conveniently have the number as
		// the [0] tuple, and sequence string as [1] so all we have
		// to do is concat the [1] parts and then strip out spaces,
		// and replace '.' and '~' with '-' for our parser.
		StringBuilder seq = new StringBuilder();
		for (int i = 1; i < section.size(); i++) {
			seq.append(section.get(i)[1]);
		}
		// Locale.ROOT keeps the upper-casing independent of the default locale
		seqData = seq.toString().replaceAll("\\s+", "").replaceAll("[\\.|~]", "-").toUpperCase(Locale.ROOT);
	}

	/**
	 * Reads an indented section, combining split lines and creating a list of
	 * key->value tuples. Blank lines are skipped. The line that terminates the
	 * section is pushed back by resetting the reader to the mark set before it
	 * was read, so the next call starts with that line.
	 *
	 * @param bufferedReader the reader to take lines from
	 * @return the tuples of the section; at the end of the input a single
	 *         tuple with a {@code null} key and an empty value
	 * @throws ParserException if reading fails or a line cannot be handled
	 */
	private List<String[]> readSection(BufferedReader bufferedReader) {
		List<String[]> section = new ArrayList<String[]>();

		String currKey = null;
		StringBuilder currVal = new StringBuilder();
		boolean done = false;
		int linecount = 0;

		try {
			while (!done) {
				// NOTE(review): per the BufferedReader contract reset() may
				// fail once more than 320 characters were read past the mark
				bufferedReader.mark(320);
				String line = bufferedReader.readLine();
				if (line != null && BLANK_LINE.matcher(line).matches()) {
					continue;
				}

				boolean endOfSection;
				if (line == null) {
					endOfSection = true;
				} else if (line.startsWith(" ")) {
					endOfSection = false;
				} else {
					// The first unindented line opens the section, a later
					// one closes it. Inside ORIGIN, unindented lines are
					// still data unless they start with the end tag.
					boolean laterLine = linecount++ > 0;
					String firstSecKey = section.isEmpty() ? "" : section.get(0)[0];
					endOfSection = laterLine
							&& (!firstSecKey.equals(START_SEQUENCE_TAG) || line.startsWith(END_SEQUENCE_TAG));
				}

				if (endOfSection) {
					// dump out last part of section
					section.add(new String[]{currKey, currVal.toString()});
					bufferedReader.reset();
					done = true;
					continue;
				}

				Matcher m = sectp.matcher(line);
				if (m.matches()) {
					// new key
					if (currKey != null) {
						section.add(new String[]{currKey, currVal.toString()});
					}
					// key/value come from group (2,3), (4,5) or (6, "")
					// - whichever key group is not null
					String value;
					if (m.group(2) != null) {
						currKey = m.group(2);
						value = m.group(3);
					} else if (m.group(4) != null) {
						currKey = m.group(4);
						value = m.group(5);
					} else {
						currKey = m.group(6);
						value = "";
					}
					currVal = new StringBuilder(value.trim());
				} else if (line.startsWith(START_SEQUENCE_TAG) || line.startsWith(END_SEQUENCE_TAG)) {
					// SEQ START/END line
					currKey = line;
				} else {
					// concatted line
					if (currKey == null) {
						throw new ParserException("Malformed GenBank file: found a continuation line before any section key.");
					}
					currVal.append("\n"); // newline in between lines - can be removed later
					// qualifier text is indented by 21 columns, other text by 12
					currVal.append(currKey.charAt(0) == '/' ? line.substring(21) : line.substring(12));
				}
			}
		} catch (IOException e) {
			throw new ParserException(e.getMessage());
		} catch (RuntimeException e) {
			throw new ParserException(e.getMessage());
		}
		return section;
	}

	/**
	 * Parses the next record from the reader. The header parser, the feature
	 * collection and the database reference map are replaced with fresh
	 * instances before parsing starts.
	 *
	 * @param bufferedReader the reader positioned at the start of a record
	 * @param sequenceLength not used by this implementation
	 * @return the sequence data held by this parser after parsing, or
	 *         {@code null} when the reader has no more records
	 * @throws ParserException if the record cannot be parsed
	 */
	@Override
	public String getSequence(BufferedReader bufferedReader, int sequenceLength) throws IOException {
		featureCollection = new HashMap<String, ArrayList<AbstractFeature>>();
		mapDB = new LinkedHashMap<String, ArrayList<DBReferenceInfo>>();
		headerParser = new GenericGenbankHeaderParser<S, C>();
		// do not carry the accession of a previous record over into this one
		accession = null;
		try {
			parse(bufferedReader);
		} catch (ParserException e) {
			// constant first: the message of the exception may be null
			if (Messages.ENDOFFILE.equalsIgnoreCase(e.getMessage())) {
				return null;
			}
			throw e;
		}

		return seqData;
	}

	/**
	 * @return the value of the LOCUS line read most recently
	 */
	public String getHeader() {
		return header;
	}

	/**
	 * @return the header parser created by the most recent
	 *         {@link #getSequence(BufferedReader, int)} call
	 */
	public GenericGenbankHeaderParser<S, C> getSequenceHeaderParser() {
		return headerParser;
	}

	/**
	 * @return the database references collected from {@code db_xref}
	 *         qualifiers, keyed by qualifier name
	 */
	public LinkedHashMap<String, ArrayList<DBReferenceInfo>> getDatabaseReferences() {
		return mapDB;
	}

	/**
	 * @return a new list holding the feature keys found in the FEATURES table
	 */
	public ArrayList<String> getKeyWords() {
		return new ArrayList<String>(featureCollection.keySet());
	}

	/**
	 * @param keyword a feature key
	 * @return the features collected under that key, or {@code null} if the
	 *         key was not found
	 */
	public ArrayList<AbstractFeature> getFeatures(String keyword) {
		return featureCollection.get(keyword);
	}

	/**
	 * @return the collected features, grouped by feature key
	 */
	public HashMap<String, ArrayList<AbstractFeature>> getFeatures() {
		return featureCollection;
	}

	/**
	 * Adds every collected feature to the given sequence.
	 *
	 * @param sequence the sequence receiving the features
	 */
	public void parseFeatures(AbstractSequence<C> sequence) {
		for (String k : featureCollection.keySet()) {
			for (AbstractFeature f : featureCollection.get(k)) {
				sequence.addFeature(f);
			}
		}
	}

	/**
	 * @return the compound set derived from the LOCUS line read most recently
	 */
	public CompoundSet<?> getCompoundType() {
		return compoundType;
	}
}