Source code

001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * @author Richard Holland
015 * @author Mark Schreiber
016 * @author David Scott
017 * @author Bubba Puryear
018 * @author George Waldon
019 * @author Deepak Sheoran
020 * @author Karl Nicholas <github:karlnicholas>
021 * @author Jacek Grzebyta
022 * @author Paolo Pavan
023 *
024 * For more information on the BioJava project and its aims,
025 * or to join the biojava-l mailing list, visit the home page
026 * at:
027 *
028 *      http://www.biojava.org/
029 *
030 * Created on 01-21-2010
031 */
032package org.biojava.nbio.core.sequence.io;
033
034import org.biojava.nbio.core.exceptions.Messages;
035import org.biojava.nbio.core.exceptions.ParserException;
036import org.biojava.nbio.core.sequence.DataSource;
037import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet;
038import org.biojava.nbio.core.sequence.compound.DNACompoundSet;
039import org.biojava.nbio.core.sequence.compound.RNACompoundSet;
040import org.biojava.nbio.core.sequence.features.AbstractFeature;
041import org.biojava.nbio.core.sequence.features.DBReferenceInfo;
042import org.biojava.nbio.core.sequence.features.Qualifier;
043import org.biojava.nbio.core.sequence.features.TextFeature;
044import org.biojava.nbio.core.sequence.io.template.SequenceParserInterface;
045import org.biojava.nbio.core.sequence.location.InsdcParser;
046import org.biojava.nbio.core.sequence.location.template.AbstractLocation;
047import org.biojava.nbio.core.sequence.location.template.Location;
048import org.biojava.nbio.core.sequence.reference.GenbankReference;
049import org.biojava.nbio.core.sequence.template.AbstractSequence;
050import org.biojava.nbio.core.sequence.template.Compound;
051import org.biojava.nbio.core.sequence.template.CompoundSet;
052import org.slf4j.Logger;
053import org.slf4j.LoggerFactory;
054
055import java.io.BufferedReader;
056import java.io.IOException;
057import java.util.*;
058import java.util.regex.Matcher;
059import java.util.regex.Pattern;
060
061public class GenbankSequenceParser<S extends AbstractSequence<C>, C extends Compound> implements SequenceParserInterface{
062
063        private String seqData = null;
064        private GenericGenbankHeaderParser<S, C> headerParser;
065        private String header;
066        private String accession;
067        private boolean isCircularSequence;
068        private Map<String, List<DBReferenceInfo>> mapDB;
069        /**
070         * this data structure collects list of features extracted from the
071         * FEATURE_TAG section They are organized by list of the same type (i.e.
072         * same genbank Feature) and are provided with location
073         */
074        private Map<String, List<AbstractFeature<AbstractSequence<C>, C>>> featureCollection;
075
076        private final Logger log = LoggerFactory.getLogger(getClass());
077
078        // this is a compoundset parsed from header.
079        private CompoundSet<?> compoundType;
080
081        /**
082         * The name of this format
083         */
084        public static final String GENBANK_FORMAT = "GENBANK";
085
086        protected static final String LOCUS_TAG = "LOCUS";
087        protected static final String DEFINITION_TAG = "DEFINITION";
088        protected static final String ACCESSION_TAG = "ACCESSION";
089        protected static final String VERSION_TAG = "VERSION";
090        protected static final String KEYWORDS_TAG = "KEYWORDS";
091        //                                                  "SEGMENT"
092        protected static final String SOURCE_TAG = "SOURCE";
093        protected static final String ORGANISM_TAG = "ORGANISM";
094        protected static final String REFERENCE_TAG = "REFERENCE";
095        protected static final String AUTHORS_TAG = "AUTHORS";
096        protected static final String CONSORTIUM_TAG = "CONSRTM";
097        protected static final String TITLE_TAG = "TITLE";
098        protected static final String JOURNAL_TAG = "JOURNAL";
099        protected static final String PUBMED_TAG = "PUBMED";
100        protected static final String MEDLINE_TAG = "MEDLINE"; //deprecated
101        protected static final String REMARK_TAG = "REMARK";
102        protected static final String COMMENT_TAG = "COMMENT";
103        protected static final String FEATURE_TAG = "FEATURES";
104        protected static final String BASE_COUNT_TAG_FULL = "BASE COUNT"; //deprecated
105        protected static final String BASE_COUNT_TAG = "BASE";
106        //                                                  "CONTIG"
107        protected static final String START_SEQUENCE_TAG = "ORIGIN";
108        protected static final String DBSOURCE = "DBSOURCE";
109        protected static final String PRIMARY = "PRIMARY";
110        protected static final String DBLINK = "DBLINK";
111        protected static final String END_SEQUENCE_TAG = "//";
112        // locus line with name that may contain spaces but must start and end with non whitespace character
113        protected static final Pattern lp = Pattern.compile("^(\\S+[\\S ]*\\S*)\\s+(\\d+)\\s+(bp|BP|aa|AA)\\s{0,4}(([dmsDMS][sS]-)?(\\S+))?\\s*(circular|CIRCULAR|linear|LINEAR)?\\s*(\\S+)?\\s*(\\S+)?$");
114        // locus line with no name
115        protected static final Pattern lp2 = Pattern.compile("^(\\d+)\\s+(bp|BP|aa|AA)\\s{0,4}(([dmsDMS][sS]-)?(\\S+))?\\s*(circular|CIRCULAR|linear|LINEAR)?\\s*(\\S+)?\\s*(\\S+)?$"); 
116        // version line
117        protected static final Pattern vp = Pattern.compile("^(\\S*?)(\\.(\\d+))?(\\s+GI:(\\S+))?$");
118        // reference line
119        protected static final Pattern refRange = Pattern.compile("^\\s*(\\d+)\\s+to\\s+(\\d+)$");
120        protected static final Pattern refp = Pattern.compile("^(\\d+)\\s*(?:(\\((?:bases|residues)\\s+(\\d+\\s+to\\s+\\d+(\\s*;\\s*\\d+\\s+to\\s+\\d+)*)\\))|\\(sites\\))?");
121        // dbxref line
122        protected static final Pattern dbxp = Pattern.compile("^([^:]+):(\\S+)$");
123
124        protected static final InsdcParser locationParser = new InsdcParser(DataSource.GENBANK);
125        /**
126         * sections start at a line and continue till the first line afterwards with a
127         *      non-whitespace first character
128         *      we want to match any of the following as a new section within a section
129         *        \s{0,8} word \s{0,7} value
130         *        \s{21} /word = value
131         *        \s{21} /word
132         */
133        protected static final Pattern sectp = Pattern.compile("^(\\s{0,8}(\\S+)\\s{0,7}(.*)|\\s{21}(/\\S+?)=(.*)|\\s{21}(/\\S+))$");
134
135        protected static final Pattern readableFiles = Pattern.compile(".*(g[bp]k*$|\\u002eg[bp].*)");
136        protected static final Pattern headerLine = Pattern.compile("^LOCUS.*");
137
138
139        private String parse(BufferedReader bufferedReader) {
140                String sectionKey;
141                List<String[]> section;
142                // Get an ordered list of key->value pairs in array-tuples
143                do {
144                        section = this.readSection(bufferedReader);
145                        sectionKey = section.get(0)[0];
146                        if (sectionKey == null) {
147                                //if we reach the end of the file, section contains empty strings
148                                if(section.get(0)[1]==null || section.get(0)[1].equals("") ||
149                                                section.get(0)[1].length()==0) {
150                                        throw new ParserException(Messages.ENDOFFILE);
151                                }
152                                throw new ParserException(Messages.SECTIONKEYNULL);
153                        }
154                        // process section-by-section
155                        switch (sectionKey) {
156                                case LOCUS_TAG: parseLocusTag(section); break;
157                                case DEFINITION_TAG: parseDefinitionTag(section); break;
158                                case ACCESSION_TAG: parseAccessionTag(section); break;
159                                case VERSION_TAG: parseVersionTag(section); break;
160                                case KEYWORDS_TAG: break;       // not implemented yet
161                                case SOURCE_TAG: break;         // ignore - can get all this from the first feature
162                                case REFERENCE_TAG: parseReferenceTag(section); break;
163                                case COMMENT_TAG: parseCommentTag(section); break;
164                                case FEATURE_TAG: parseFeatureTag(section); break;
165                                case BASE_COUNT_TAG: break;     // ignore - can calculate from sequence content later if needed
166                                case START_SEQUENCE_TAG: parseStartSequenceTag(section); break;
167                                case DBSOURCE: break;           // not implemented yet
168                                case PRIMARY: break;            // not implemented yet
169                                case DBLINK: break;                     // not implemented yet
170                                default:
171                                        if(!sectionKey.equals(END_SEQUENCE_TAG)) {
172                                                log.info("found unknown section key: %", sectionKey);
173                                        }
174                        }
175                } while (!sectionKey.equals(END_SEQUENCE_TAG));
176                return seqData;
177        }
178
179        private void parseStartSequenceTag(List<String[]> section) {
180                // our first line is ignorable as it is the ORIGIN tag
181                // the second line onwards conveniently have the number as
182                // the [0] tuple, and sequence string as [1] so all we have
183                // to do is concat the [1] parts and then strip out spaces,
184                // and replace '.' and '~' with '-' for our parser.
185                StringBuilder seq = new StringBuilder();
186                for (int i = 1; i < section.size(); i++) {
187                        seq.append(section.get(i)[1]);
188                }
189                seqData = seq.toString().replaceAll("\\s+", "").replaceAll("[\\.|~]", "-").toUpperCase();
190        }
191
192        private void parseFeatureTag(List<String[]> section) {
193                // starting from second line of input, start a new feature whenever we come across
194                // a key that does not start with /
195                AbstractFeature gbFeature = null;
196                for (int i = 1; i < section.size(); i++) {
197                        String key = section.get(i)[0];
198                        String val = section.get(i)[1];
199                        if (key.startsWith("/")) {
200                                if (gbFeature == null) {
201                                        throw new ParserException("Malformed GenBank file: found a qualifier without feature.");
202                                }
203                                Boolean needsQuotes = false;
204                                key = key.substring(1); // strip leading slash
205                                val = val.replaceAll("\\s*[\\n\\r]+\\s*", " ").trim();                          
206                                if (val.endsWith("\"")) {
207                                        val = val.substring(1, val.length() - 1); // strip quotes
208                                        needsQuotes = true; // as the value has quotes then set that it needs quotes when written back out
209                                }
210                                // parameter on old feature
211                                if (key.equals("db_xref")) {
212                                        Matcher m = dbxp.matcher(val);
213                                        if (m.matches()) {
214                                                String dbname = m.group(1);
215                                                String raccession = m.group(2);
216                                                DBReferenceInfo xref = new DBReferenceInfo(dbname, raccession);
217                                                xref.setNeedsQuotes(needsQuotes);
218                                                gbFeature.addQualifier(key, xref);
219
220                                                ArrayList<DBReferenceInfo> listDBEntry = new ArrayList<>();
221                                                listDBEntry.add(xref);
222                                                mapDB.put(key, listDBEntry);
223                                        } else {
224                                                throw new ParserException("Bad dbxref");
225                                        }
226                                } else if (key.equalsIgnoreCase("organism")) {
227                                        Qualifier q = new Qualifier(key, val.replace('\n', ' '), needsQuotes);
228                                        gbFeature.addQualifier(key, q);
229                                } else {
230                                        if (key.equalsIgnoreCase("translation") || key.equals("anticodon")
231                                                        || key.equals("transl_except")) {
232                                                // strip spaces from sequence
233                                                val = val.replaceAll("\\s+", "");
234                                                Qualifier q = new Qualifier(key, val, needsQuotes);
235                                                gbFeature.addQualifier(key, q);
236                                        } else {
237                                                Qualifier q = new Qualifier(key, val, needsQuotes);
238                                                gbFeature.addQualifier(key, q);
239                                        }
240                                }
241                        } else {
242                                // new feature!
243                                gbFeature = new TextFeature(key, val, key, key);
244                                Location l =
245                                                locationParser.parse(val);
246                                gbFeature.setLocation((AbstractLocation)l);
247
248                                if (!featureCollection.containsKey(key)) {
249                                        featureCollection.put(key, new ArrayList<>());
250                                }
251                                featureCollection.get(key).add(gbFeature);
252                        }
253                }
254        }
255
256        private void parseCommentTag(List<String[]> section) {
257                headerParser.setComment(section.get(0)[1]);
258        }
259
260        private void parseReferenceTag(List<String[]> section) {
261                GenbankReference genbankReference = new GenbankReference();
262                for (String[] ref : section) {
263                        if (ref[0].equals(AUTHORS_TAG)) {
264                                genbankReference.setAuthors(ref[1]);
265                        } else if (ref[0].equals(TITLE_TAG)) {
266                                genbankReference.setTitle(ref[1]);
267                        } else if (ref[0].equals(JOURNAL_TAG)) {
268                                genbankReference.setJournal(ref[1]);
269                        }
270                }
271                headerParser.addReference(genbankReference);
272        }
273
274        private void parseVersionTag(List<String[]> section) {
275                String ver = section.get(0)[1];
276                Matcher m = vp.matcher(ver);
277                if (m.matches()) {
278                        String verAcc = m.group(1);
279                        if (!accession.equals(verAcc)) {
280                                // the version refers to a different accession!
281                                // believe the version line, and store the original
282                                // accession away in the additional accession set
283                                accession = verAcc;
284                        }
285                        if (m.group(3) != null) {
286                                headerParser.setVersion(Integer.parseInt(m.group(3)));
287                        }
288                        if (m.group(5) != null) {
289                                headerParser.setIdentifier(m.group(5));
290                        }
291                } else {
292                        throw new ParserException("Bad version line");
293                }
294        }
295
296        private void parseAccessionTag(List<String[]> section) {
297                // if multiple accessions, store only first as accession,
298                // and store rest in annotation
299                String[] accs = section.get(0)[1].split("\\s+");
300                accession = accs[0].trim();
301                headerParser.setAccession(accession);
302        }
303
304        private void parseDefinitionTag(List<String[]> section) {
305                headerParser.setDescription(section.get(0)[1]);
306        }
307
308        private void parseLocusTag(List<String[]> section) {
309                String loc = section.get(0)[1];
310                header = loc;
311                Matcher m = lp.matcher(loc);
312                Matcher m2 = lp2.matcher(loc);          
313                if (m.matches()) {
314                        //remove any preceding or trailing whitespace from the locus name
315                        String name = m.group(1).trim().replaceAll(" ","_");            
316                        headerParser.setName(name);
317                        headerParser.setAccession(name); // default if no accession found                       
318                        long sequenceLength = Long.valueOf(m.group(2));
319                        String lengthUnits = m.group(3);
320                        String type = m.group(6);
321
322                        if (lengthUnits.equalsIgnoreCase("aa")) {
323                                compoundType = AminoAcidCompoundSet.getAminoAcidCompoundSet();
324                        } else if (lengthUnits.equalsIgnoreCase("bp")) {
325                                if (type != null) {
326                                        if (type.contains("RNA")) {
327                                                compoundType = RNACompoundSet.getRNACompoundSet();
328                                        } else {
329                                                compoundType = DNACompoundSet.getDNACompoundSet();
330                                        }
331                                } else {
332                                        compoundType = DNACompoundSet.getDNACompoundSet();
333                                }
334                        }
335
336                        if (m.group(7) != null) isCircularSequence = m.group(7).equalsIgnoreCase("circular");
337
338                        // configure location parser with needed information
339                        locationParser.setSequenceLength(sequenceLength);
340                        locationParser.setSequenceCircular(isCircularSequence);
341
342                        log.debug("compound type: {}", compoundType.getClass().getSimpleName());
343
344                } else if (m2.matches()) {
345                        // Locus Name Missing - use different Locus regex
346                        headerParser.setName("");
347                        headerParser.setAccession(""); // default if no accession found                 
348                        long sequenceLength = Long.valueOf(m2.group(1));
349                        String lengthUnits = m2.group(2);
350                        String type = m2.group(5);
351
352                        if (lengthUnits.equalsIgnoreCase("aa")) {
353                                compoundType = AminoAcidCompoundSet.getAminoAcidCompoundSet();
354                        } else if (lengthUnits.equalsIgnoreCase("bp")) {
355                                if (type != null) {
356                                        if (type.contains("RNA")) {
357                                                compoundType = RNACompoundSet.getRNACompoundSet();
358                                        } else {
359                                                compoundType = DNACompoundSet.getDNACompoundSet();
360                                        }
361                                } else {
362                                        compoundType = DNACompoundSet.getDNACompoundSet();
363                                }
364                        }
365
366                        if (m2.group(6) != null) isCircularSequence = m2.group(6).equalsIgnoreCase("circular");
367
368                        // configure location parser with needed information
369                        locationParser.setSequenceLength(sequenceLength);
370                        locationParser.setSequenceCircular(isCircularSequence);
371
372                        log.debug("compound type: {}", compoundType.getClass().getSimpleName());
373                        
374                        
375                } else {
376                        throw new ParserException("Bad locus line");
377                }
378        }
379
380
381        // reads an indented section, combining split lines and creating a list of
382        // key->value tuples
383        // reads an indented section, combining split lines and creating a list of
384        // key->value tuples
385        // reads an indented section, combining split lines and creating a list of
386        // key->value tuples
387        private List<String[]> readSection(BufferedReader bufferedReader) {
388                List<String[]> section = new ArrayList<>();
389                String line;
390
391                String currKey = null;
392                StringBuilder currVal = new StringBuilder();
393                boolean done = false;
394                int linecount = 0;
395
396                try {
397                        while (!done) {
398                                bufferedReader.mark(320);
399                                line = bufferedReader.readLine();
400                                String firstSecKey = section.isEmpty() ? ""
401                                                : section.get(0)[0];
402                                if (line != null && line.matches("\\p{Space}*")) {
403                                        // regular expression \p{Space}* will match line
404                                        // having only white space characters
405                                        continue;
406                                }
407                                if (line == null
408                                                || (!line.startsWith(" ") && linecount++ > 0 && (!firstSecKey
409                                                .equals(START_SEQUENCE_TAG) || line
410                                                .startsWith(END_SEQUENCE_TAG)))) {
411                                        // dump out last part of section
412                                        section.add(new String[]{currKey, currVal.toString()});
413                                        bufferedReader.reset();
414                                        done = true;
415                                } else {
416                                        Matcher m = sectp.matcher(line);
417                                        if (m.matches()) {
418                                                // new key
419                                                if (currKey != null) {
420                                                        section.add(new String[]{currKey,
421                                                                currVal.toString()});
422                                                }
423                                                // key = group(2) or group(4) or group(6) - whichever is
424                                                // not null
425                                                currKey = m.group(2) == null ? (m.group(4) == null ? m
426                                                                .group(6) : m.group(4)) : m.group(2);
427                                                currVal = new StringBuilder();
428                                                // val = group(3) if group(2) not null, group(5) if
429                                                // group(4) not null, "" otherwise, trimmed
430                                                currVal.append((m.group(2) == null ? (m.group(4) == null ? ""
431                                                                : m.group(5))
432                                                                : m.group(3)).trim());
433                                        } else {
434                                                // concatted line or SEQ START/END line?
435                                                if (line.startsWith(START_SEQUENCE_TAG)
436                                                                || line.startsWith(END_SEQUENCE_TAG)) {
437                                                        currKey = line;
438                                                } else {
439                                                        currVal.append("\n"); // newline in between lines -
440                                                        // can be removed later
441                                                        currVal.append(currKey.charAt(0) == '/' ? line
442                                                                        .substring(21) : line.substring(12));
443                                                }
444                                        }
445                                }
446                        }
447                } catch (IOException | RuntimeException e) {
448                        throw new ParserException(e.getMessage());
449                }
450                return section;
451        }
452
453        @Override
454        public String getSequence(BufferedReader bufferedReader, int sequenceLength) {
455                featureCollection = new HashMap<>();
456                mapDB = new LinkedHashMap<>();
457                headerParser = new GenericGenbankHeaderParser<>();
458                try {
459                        parse(bufferedReader);
460                } catch (ParserException e) {
461                        if(e.getMessage().equalsIgnoreCase(Messages.ENDOFFILE)) return null;
462                        else throw new ParserException(e.getMessage());
463                }
464
465                return seqData;
466        }
467
468        public String getHeader() {
469                return header;
470        }
471
472        public GenericGenbankHeaderParser<S, C> getSequenceHeaderParser() {
473                return headerParser;
474        }
475
476        public Map<String, List<DBReferenceInfo>> getDatabaseReferences() {
477                return mapDB;
478        }
479
480        public List<String> getKeyWords() {
481                return new ArrayList<>(featureCollection.keySet());
482        }
483
484        public List<AbstractFeature<AbstractSequence<C>, C>> getFeatures(String keyword) {
485                return featureCollection.get(keyword);
486        }
487        public Map<String, List<AbstractFeature<AbstractSequence<C>, C>>> getFeatures() {
488                return featureCollection;
489        }
490
491        public void parseFeatures(AbstractSequence<C> sequence) {
492                for (String k: featureCollection.keySet())
493                        for (AbstractFeature<AbstractSequence<C>, C> f: featureCollection.get(k))
494                                sequence.addFeature(f);
495        }
496
497        public CompoundSet<?> getCompoundType() {
498                return compoundType;
499        }
500}