Source code

001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * @author Richard Holland
015 * @author Mark Schreiber
016 * @author David Scott
017 * @author Bubba Puryear
018 * @author George Waldon
019 * @author Deepak Sheoran
020 * @author Karl Nicholas <github:karlnicholas>
021 * @author Jacek Grzebyta
022 * @author Paolo Pavan
023 *
024 * For more information on the BioJava project and its aims,
025 * or to join the biojava-l mailing list, visit the home page
026 * at:
027 *
028 *      http://www.biojava.org/
029 *
030 * Created on 01-21-2010
031 */
032package org.biojava.nbio.core.sequence.io;
033
034import org.biojava.nbio.core.exceptions.Messages;
035import org.biojava.nbio.core.exceptions.ParserException;
036import org.biojava.nbio.core.sequence.DataSource;
037import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet;
038import org.biojava.nbio.core.sequence.compound.DNACompoundSet;
039import org.biojava.nbio.core.sequence.compound.RNACompoundSet;
040import org.biojava.nbio.core.sequence.features.AbstractFeature;
041import org.biojava.nbio.core.sequence.features.DBReferenceInfo;
042import org.biojava.nbio.core.sequence.features.Qualifier;
043import org.biojava.nbio.core.sequence.features.TextFeature;
044import org.biojava.nbio.core.sequence.io.template.SequenceParserInterface;
045import org.biojava.nbio.core.sequence.location.InsdcParser;
046import org.biojava.nbio.core.sequence.location.template.AbstractLocation;
047import org.biojava.nbio.core.sequence.location.template.Location;
048import org.biojava.nbio.core.sequence.reference.GenbankReference;
049import org.biojava.nbio.core.sequence.template.AbstractSequence;
050import org.biojava.nbio.core.sequence.template.Compound;
051import org.biojava.nbio.core.sequence.template.CompoundSet;
052import org.slf4j.Logger;
053import org.slf4j.LoggerFactory;
054
055import java.io.BufferedReader;
056import java.io.IOException;
057import java.util.*;
058import java.util.regex.Matcher;
059import java.util.regex.Pattern;
060
061public class GenbankSequenceParser<S extends AbstractSequence<C>, C extends Compound> implements SequenceParserInterface{
062
063        private String seqData = null;
064        private GenericGenbankHeaderParser<S, C> headerParser;
065        private String header;
066        private String accession;
067        private boolean isCircularSequence;
068        private Map<String, List<DBReferenceInfo>> mapDB;
069        /**
070         * this data structure collects list of features extracted from the
071         * FEATURE_TAG section They are organized by list of the same type (i.e.
072         * same genbank Feature) and are provided with location
073         */
074        private Map<String, List<AbstractFeature<AbstractSequence<C>, C>>> featureCollection;
075
076        private final Logger log = LoggerFactory.getLogger(getClass());
077
078        // this is a compoundset parsed from header.
079        private CompoundSet<?> compoundType;
080
081        /**
082         * The name of this format
083         */
084        public static final String GENBANK_FORMAT = "GENBANK";
085
086        protected static final String LOCUS_TAG = "LOCUS";
087        protected static final String DEFINITION_TAG = "DEFINITION";
088        protected static final String ACCESSION_TAG = "ACCESSION";
089        protected static final String VERSION_TAG = "VERSION";
090        protected static final String KEYWORDS_TAG = "KEYWORDS";
091        //                                                  "SEGMENT"
092        protected static final String SOURCE_TAG = "SOURCE";
093        protected static final String ORGANISM_TAG = "ORGANISM";
094        protected static final String REFERENCE_TAG = "REFERENCE";
095        protected static final String AUTHORS_TAG = "AUTHORS";
096        protected static final String CONSORTIUM_TAG = "CONSRTM";
097        protected static final String TITLE_TAG = "TITLE";
098        protected static final String JOURNAL_TAG = "JOURNAL";
099        protected static final String PUBMED_TAG = "PUBMED";
100        protected static final String MEDLINE_TAG = "MEDLINE"; //deprecated
101        protected static final String REMARK_TAG = "REMARK";
102        protected static final String COMMENT_TAG = "COMMENT";
103        protected static final String FEATURE_TAG = "FEATURES";
104        protected static final String BASE_COUNT_TAG_FULL = "BASE COUNT"; //deprecated
105        protected static final String BASE_COUNT_TAG = "BASE";
106        //                                                  "CONTIG"
107        protected static final String START_SEQUENCE_TAG = "ORIGIN";
108        protected static final String DBSOURCE = "DBSOURCE";
109        protected static final String PRIMARY = "PRIMARY";
110        protected static final String DBLINK = "DBLINK";
111        protected static final String END_SEQUENCE_TAG = "//";
112        // locus line
113        protected static final Pattern lp = Pattern.compile("^(\\S+)\\s+(\\d+)\\s+(bp|BP|aa|AA)\\s{0,4}(([dmsDMS][sS]-)?(\\S+))?\\s*(circular|CIRCULAR|linear|LINEAR)?\\s*(\\S+)?\\s*(\\S+)?$");
114        // version line
115        protected static final Pattern vp = Pattern.compile("^(\\S*?)(\\.(\\d+))?(\\s+GI:(\\S+))?$");
116        // reference line
117        protected static final Pattern refRange = Pattern.compile("^\\s*(\\d+)\\s+to\\s+(\\d+)$");
118        protected static final Pattern refp = Pattern.compile("^(\\d+)\\s*(?:(\\((?:bases|residues)\\s+(\\d+\\s+to\\s+\\d+(\\s*;\\s*\\d+\\s+to\\s+\\d+)*)\\))|\\(sites\\))?");
119        // dbxref line
120        protected static final Pattern dbxp = Pattern.compile("^([^:]+):(\\S+)$");
121
122        protected static final InsdcParser locationParser = new InsdcParser(DataSource.GENBANK);
123        /**
124         * sections start at a line and continue till the first line afterwards with a
125         *      non-whitespace first character
126         *      we want to match any of the following as a new section within a section
127         *        \s{0,8} word \s{0,7} value
128         *        \s{21} /word = value
129         *        \s{21} /word
130         */
131        protected static final Pattern sectp = Pattern.compile("^(\\s{0,8}(\\S+)\\s{0,7}(.*)|\\s{21}(/\\S+?)=(.*)|\\s{21}(/\\S+))$");
132
133        protected static final Pattern readableFiles = Pattern.compile(".*(g[bp]k*$|\\u002eg[bp].*)");
134        protected static final Pattern headerLine = Pattern.compile("^LOCUS.*");
135
136
137        private String parse(BufferedReader bufferedReader) {
138                String sectionKey;
139                List<String[]> section;
140                // Get an ordered list of key->value pairs in array-tuples
141                do {
142                        section = this.readSection(bufferedReader);
143                        sectionKey = section.get(0)[0];
144                        if (sectionKey == null) {
145                                //if we reach the end of the file, section contains empty strings
146                                if(section.get(0)[1]==null || section.get(0)[1].equals("") ||
147                                                section.get(0)[1].length()==0) {
148                                        throw new ParserException(Messages.ENDOFFILE);
149                                }
150                                throw new ParserException(Messages.SECTIONKEYNULL);
151                        }
152                        // process section-by-section
153                        switch (sectionKey) {
154                                case LOCUS_TAG: parseLocusTag(section); break;
155                                case DEFINITION_TAG: parseDefinitionTag(section); break;
156                                case ACCESSION_TAG: parseAccessionTag(section); break;
157                                case VERSION_TAG: parseVersionTag(section); break;
158                                case KEYWORDS_TAG: break;       // not implemented yet
159                                case SOURCE_TAG: break;         // ignore - can get all this from the first feature
160                                case REFERENCE_TAG: parseReferenceTag(section); break;
161                                case COMMENT_TAG: parseCommentTag(section); break;
162                                case FEATURE_TAG: parseFeatureTag(section); break;
163                                case BASE_COUNT_TAG: break;     // ignore - can calculate from sequence content later if needed
164                                case START_SEQUENCE_TAG: parseStartSequenceTag(section); break;
165                                case DBSOURCE: break;           // not implemented yet
166                                case PRIMARY: break;            // not implemented yet
167                                case DBLINK: break;                     // not implemented yet
168                                default:
169                                        if(!sectionKey.equals(END_SEQUENCE_TAG)) {
170                                                log.info("found unknown section key: %", sectionKey);
171                                        }
172                        }
173                } while (!sectionKey.equals(END_SEQUENCE_TAG));
174                return seqData;
175        }
176
177        private void parseStartSequenceTag(List<String[]> section) {
178                // our first line is ignorable as it is the ORIGIN tag
179                // the second line onwards conveniently have the number as
180                // the [0] tuple, and sequence string as [1] so all we have
181                // to do is concat the [1] parts and then strip out spaces,
182                // and replace '.' and '~' with '-' for our parser.
183                StringBuilder seq = new StringBuilder();
184                for (int i = 1; i < section.size(); i++) {
185                        seq.append(section.get(i)[1]);
186                }
187                seqData = seq.toString().replaceAll("\\s+", "").replaceAll("[\\.|~]", "-").toUpperCase();
188        }
189
190        private void parseFeatureTag(List<String[]> section) {
191                // starting from second line of input, start a new feature whenever we come across
192                // a key that does not start with /
193                AbstractFeature gbFeature = null;
194                for (int i = 1; i < section.size(); i++) {
195                        String key = section.get(i)[0];
196                        String val = section.get(i)[1];
197                        if (key.startsWith("/")) {
198                                if (gbFeature == null) {
199                                        throw new ParserException("Malformed GenBank file: found a qualifier without feature.");
200                                }
201                                key = key.substring(1); // strip leading slash
202                                val = val.replaceAll("\\s*[\\n\\r]+\\s*", " ").trim();
203                                if (val.endsWith("\"")) {
204                                        val = val.substring(1, val.length() - 1); // strip quotes
205                                }
206                                // parameter on old feature
207                                if (key.equals("db_xref")) {
208                                        Matcher m = dbxp.matcher(val);
209                                        if (m.matches()) {
210                                                String dbname = m.group(1);
211                                                String raccession = m.group(2);
212                                                DBReferenceInfo xref = new DBReferenceInfo(dbname, raccession);
213                                                gbFeature.addQualifier(key, xref);
214
215                                                ArrayList<DBReferenceInfo> listDBEntry = new ArrayList<>();
216                                                listDBEntry.add(xref);
217                                                mapDB.put(key, listDBEntry);
218                                        } else {
219                                                throw new ParserException("Bad dbxref");
220                                        }
221                                } else if (key.equalsIgnoreCase("organism")) {
222                                        Qualifier q = new Qualifier(key, val.replace('\n', ' '));
223                                        gbFeature.addQualifier(key, q);
224                                } else {
225                                        if (key.equalsIgnoreCase("translation") || key.equals("anticodon")
226                                                        || key.equals("transl_except")) {
227                                                // strip spaces from sequence
228                                                val = val.replaceAll("\\s+", "");
229                                                Qualifier q = new Qualifier(key, val);
230                                                gbFeature.addQualifier(key, q);
231                                        } else {
232                                                Qualifier q = new Qualifier(key, val);
233                                                gbFeature.addQualifier(key, q);
234                                        }
235                                }
236                        } else {
237                                // new feature!
238                                gbFeature = new TextFeature(key, val, key, key);
239                                Location l =
240                                                locationParser.parse(val);
241                                gbFeature.setLocation((AbstractLocation)l);
242
243                                if (!featureCollection.containsKey(key)) {
244                                        featureCollection.put(key, new ArrayList<>());
245                                }
246                                featureCollection.get(key).add(gbFeature);
247                        }
248                }
249        }
250
251        private void parseCommentTag(List<String[]> section) {
252                headerParser.setComment(section.get(0)[1]);
253        }
254
255        private void parseReferenceTag(List<String[]> section) {
256                GenbankReference genbankReference = new GenbankReference();
257                for (String[] ref : section) {
258                        if (ref[0].equals(AUTHORS_TAG)) {
259                                genbankReference.setAuthors(ref[1]);
260                        } else if (ref[0].equals(TITLE_TAG)) {
261                                genbankReference.setTitle(ref[1]);
262                        } else if (ref[0].equals(JOURNAL_TAG)) {
263                                genbankReference.setJournal(ref[1]);
264                        }
265                }
266                headerParser.addReference(genbankReference);
267        }
268
269        private void parseVersionTag(List<String[]> section) {
270                String ver = section.get(0)[1];
271                Matcher m = vp.matcher(ver);
272                if (m.matches()) {
273                        String verAcc = m.group(1);
274                        if (!accession.equals(verAcc)) {
275                                // the version refers to a different accession!
276                                // believe the version line, and store the original
277                                // accession away in the additional accession set
278                                accession = verAcc;
279                        }
280                        if (m.group(3) != null) {
281                                headerParser.setVersion(Integer.parseInt(m.group(3)));
282                        }
283                        if (m.group(5) != null) {
284                                headerParser.setIdentifier(m.group(5));
285                        }
286                } else {
287                        throw new ParserException("Bad version line");
288                }
289        }
290
291        private void parseAccessionTag(List<String[]> section) {
292                // if multiple accessions, store only first as accession,
293                // and store rest in annotation
294                String[] accs = section.get(0)[1].split("\\s+");
295                accession = accs[0].trim();
296                headerParser.setAccession(accession);
297        }
298
299        private void parseDefinitionTag(List<String[]> section) {
300                headerParser.setDescription(section.get(0)[1]);
301        }
302
303        private void parseLocusTag(List<String[]> section) {
304                String loc = section.get(0)[1];
305                header = loc;
306                Matcher m = lp.matcher(loc);
307                if (m.matches()) {
308                        headerParser.setName(m.group(1));
309                        headerParser.setAccession(m.group(1)); // default if no accession found
310                        long sequenceLength = Long.valueOf(m.group(2));
311                        String lengthUnits = m.group(3);
312                        String type = m.group(6);
313
314                        if (lengthUnits.equalsIgnoreCase("aa")) {
315                                compoundType = AminoAcidCompoundSet.getAminoAcidCompoundSet();
316                        } else if (lengthUnits.equalsIgnoreCase("bp")) {
317                                if (type != null) {
318                                        if (type.contains("RNA")) {
319                                                compoundType = RNACompoundSet.getRNACompoundSet();
320                                        } else {
321                                                compoundType = DNACompoundSet.getDNACompoundSet();
322                                        }
323                                } else {
324                                        compoundType = DNACompoundSet.getDNACompoundSet();
325                                }
326                        }
327
328                        if (m.group(7) != null) isCircularSequence = m.group(7).equalsIgnoreCase("circular");
329
330                        // configure location parser with needed information
331                        locationParser.setSequenceLength(sequenceLength);
332                        locationParser.setSequenceCircular(isCircularSequence);
333
334                        log.debug("compound type: {}", compoundType.getClass().getSimpleName());
335
336                } else {
337                        throw new ParserException("Bad locus line");
338                }
339        }
340
341
342        // reads an indented section, combining split lines and creating a list of
343        // key->value tuples
344        // reads an indented section, combining split lines and creating a list of
345        // key->value tuples
346        // reads an indented section, combining split lines and creating a list of
347        // key->value tuples
348        private List<String[]> readSection(BufferedReader bufferedReader) {
349                List<String[]> section = new ArrayList<>();
350                String line;
351
352                String currKey = null;
353                StringBuilder currVal = new StringBuilder();
354                boolean done = false;
355                int linecount = 0;
356
357                try {
358                        while (!done) {
359                                bufferedReader.mark(320);
360                                line = bufferedReader.readLine();
361                                String firstSecKey = section.isEmpty() ? ""
362                                                : section.get(0)[0];
363                                if (line != null && line.matches("\\p{Space}*")) {
364                                        // regular expression \p{Space}* will match line
365                                        // having only white space characters
366                                        continue;
367                                }
368                                if (line == null
369                                                || (!line.startsWith(" ") && linecount++ > 0 && (!firstSecKey
370                                                .equals(START_SEQUENCE_TAG) || line
371                                                .startsWith(END_SEQUENCE_TAG)))) {
372                                        // dump out last part of section
373                                        section.add(new String[]{currKey, currVal.toString()});
374                                        bufferedReader.reset();
375                                        done = true;
376                                } else {
377                                        Matcher m = sectp.matcher(line);
378                                        if (m.matches()) {
379                                                // new key
380                                                if (currKey != null) {
381                                                        section.add(new String[]{currKey,
382                                                                currVal.toString()});
383                                                }
384                                                // key = group(2) or group(4) or group(6) - whichever is
385                                                // not null
386                                                currKey = m.group(2) == null ? (m.group(4) == null ? m
387                                                                .group(6) : m.group(4)) : m.group(2);
388                                                currVal = new StringBuilder();
389                                                // val = group(3) if group(2) not null, group(5) if
390                                                // group(4) not null, "" otherwise, trimmed
391                                                currVal.append((m.group(2) == null ? (m.group(4) == null ? ""
392                                                                : m.group(5))
393                                                                : m.group(3)).trim());
394                                        } else {
395                                                // concatted line or SEQ START/END line?
396                                                if (line.startsWith(START_SEQUENCE_TAG)
397                                                                || line.startsWith(END_SEQUENCE_TAG)) {
398                                                        currKey = line;
399                                                } else {
400                                                        currVal.append("\n"); // newline in between lines -
401                                                        // can be removed later
402                                                        currVal.append(currKey.charAt(0) == '/' ? line
403                                                                        .substring(21) : line.substring(12));
404                                                }
405                                        }
406                                }
407                        }
408                } catch (IOException | RuntimeException e) {
409                        throw new ParserException(e.getMessage());
410                }
411                return section;
412        }
413
414        @Override
415        public String getSequence(BufferedReader bufferedReader, int sequenceLength) {
416                featureCollection = new HashMap<>();
417                mapDB = new LinkedHashMap<>();
418                headerParser = new GenericGenbankHeaderParser<>();
419                try {
420                        parse(bufferedReader);
421                } catch (ParserException e) {
422                        if(e.getMessage().equalsIgnoreCase(Messages.ENDOFFILE)) return null;
423                        else throw new ParserException(e.getMessage());
424                }
425
426                return seqData;
427        }
428
429        public String getHeader() {
430                return header;
431        }
432
433        public GenericGenbankHeaderParser<S, C> getSequenceHeaderParser() {
434                return headerParser;
435        }
436
437        public Map<String, List<DBReferenceInfo>> getDatabaseReferences() {
438                return mapDB;
439        }
440
441        public List<String> getKeyWords() {
442                return new ArrayList<>(featureCollection.keySet());
443        }
444
445        public List<AbstractFeature<AbstractSequence<C>, C>> getFeatures(String keyword) {
446                return featureCollection.get(keyword);
447        }
448        public Map<String, List<AbstractFeature<AbstractSequence<C>, C>>> getFeatures() {
449                return featureCollection;
450        }
451
452        public void parseFeatures(AbstractSequence<C> sequence) {
453                for (String k: featureCollection.keySet())
454                        for (AbstractFeature<AbstractSequence<C>, C> f: featureCollection.get(k))
455                                sequence.addFeature(f);
456        }
457
458        public CompoundSet<?> getCompoundType() {
459                return compoundType;
460        }
461}