Source code

001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * @author Richard Holland
015 * @author Mark Schreiber
016 * @author David Scott
017 * @author Bubba Puryear
018 * @author George Waldon
019 * @author Deepak Sheoran
020 * @author Karl Nicholas <github:karlnicholas>
021 * @author Jacek Grzebyta
022 * @author Paolo Pavan
023 *
024 * For more information on the BioJava project and its aims,
025 * or to join the biojava-l mailing list, visit the home page
026 * at:
027 *
028 *      http://www.biojava.org/
029 *
030 * Created on 01-21-2010
031 */
032package org.biojava.nbio.core.sequence.io;
033
034import org.biojava.nbio.core.exceptions.Messages;
035import org.biojava.nbio.core.exceptions.ParserException;
036import org.biojava.nbio.core.sequence.DataSource;
037import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet;
038import org.biojava.nbio.core.sequence.compound.DNACompoundSet;
039import org.biojava.nbio.core.sequence.compound.RNACompoundSet;
040import org.biojava.nbio.core.sequence.features.AbstractFeature;
041import org.biojava.nbio.core.sequence.features.DBReferenceInfo;
042import org.biojava.nbio.core.sequence.features.Qualifier;
043import org.biojava.nbio.core.sequence.features.TextFeature;
044import org.biojava.nbio.core.sequence.io.template.SequenceParserInterface;
045import org.biojava.nbio.core.sequence.location.InsdcParser;
046import org.biojava.nbio.core.sequence.location.template.AbstractLocation;
047import org.biojava.nbio.core.sequence.location.template.Location;
048import org.biojava.nbio.core.sequence.template.AbstractSequence;
049import org.biojava.nbio.core.sequence.template.Compound;
050import org.biojava.nbio.core.sequence.template.CompoundSet;
051import org.slf4j.Logger;
052import org.slf4j.LoggerFactory;
053
054import java.io.BufferedReader;
055import java.io.IOException;
056import java.util.ArrayList;
057import java.util.HashMap;
058import java.util.LinkedHashMap;
059import java.util.List;
060import java.util.regex.Matcher;
061import java.util.regex.Pattern;
062
063public class GenbankSequenceParser<S extends AbstractSequence<C>, C extends Compound> implements SequenceParserInterface{
064
065        private String seqData = null;
066        private GenericGenbankHeaderParser<S, C> headerParser;
067        private String header;
068        private String accession;
069        public LinkedHashMap<String, ArrayList<DBReferenceInfo>> mapDB;
070        /**
071         * this data structure collects list of features extracted from the
072         * FEATURE_TAG section They are organized by list of the same type (i.e.
073         * same genbank Feature) and are provided with location
074         */
075        private HashMap<String, ArrayList<AbstractFeature>> featureCollection;
076
077        private Logger log = LoggerFactory.getLogger(getClass());
078
079        // this is a compoundset parsed from header.
080        private CompoundSet<?> compoundType;
081
082        /**
083         * The name of this format
084         */
085        public static final String GENBANK_FORMAT = "GENBANK";
086
087        protected static final String LOCUS_TAG = "LOCUS";
088        protected static final String DEFINITION_TAG = "DEFINITION";
089        protected static final String ACCESSION_TAG = "ACCESSION";
090        protected static final String VERSION_TAG = "VERSION";
091        protected static final String KEYWORDS_TAG = "KEYWORDS";
092        //                                                  "SEGMENT"
093        protected static final String SOURCE_TAG = "SOURCE";
094        protected static final String ORGANISM_TAG = "ORGANISM";
095        protected static final String REFERENCE_TAG = "REFERENCE";
096        protected static final String AUTHORS_TAG = "AUTHORS";
097        protected static final String CONSORTIUM_TAG = "CONSRTM";
098        protected static final String TITLE_TAG = "TITLE";
099        protected static final String JOURNAL_TAG = "JOURNAL";
100        protected static final String PUBMED_TAG = "PUBMED";
101        protected static final String MEDLINE_TAG = "MEDLINE"; //deprecated
102        protected static final String REMARK_TAG = "REMARK";
103        protected static final String COMMENT_TAG = "COMMENT";
104        protected static final String FEATURE_TAG = "FEATURES";
105        protected static final String BASE_COUNT_TAG_FULL = "BASE COUNT"; //deprecated
106        protected static final String BASE_COUNT_TAG = "BASE";
107        //                                                  "CONTIG"
108        protected static final String START_SEQUENCE_TAG = "ORIGIN";
109        protected static final String END_SEQUENCE_TAG = "//";
110        // locus line
111        protected static final Pattern lp = Pattern.compile("^(\\S+)\\s+\\d+\\s+(bp|aa)\\s{1,4}(([dms]s-)?(\\S+))?\\s+(circular|linear)?\\s*(\\S+)?\\s*(\\S+)?$");
112        // version line
113        protected static final Pattern vp = Pattern.compile("^(\\S*?)(\\.(\\d+))?(\\s+GI:(\\S+))?$");
114        // reference line
115        protected static final Pattern refRange = Pattern.compile("^\\s*(\\d+)\\s+to\\s+(\\d+)$");
116        protected static final Pattern refp = Pattern.compile("^(\\d+)\\s*(?:(\\((?:bases|residues)\\s+(\\d+\\s+to\\s+\\d+(\\s*;\\s*\\d+\\s+to\\s+\\d+)*)\\))|\\(sites\\))?");
117        // dbxref line
118        protected static final Pattern dbxp = Pattern.compile("^([^:]+):(\\S+)$");
119
120        protected static final InsdcParser locationParser = new InsdcParser(DataSource.GENBANK);
121        //sections start at a line and continue till the first line afterwards with a
122        //non-whitespace first character
123        //we want to match any of the following as a new section within a section
124        //  \s{0,8} word \s{0,7} value
125        //  \s{21} /word = value
126        //  \s{21} /word
127        protected static final Pattern sectp = Pattern.compile("^(\\s{0,8}(\\S+)\\s{0,7}(.*)|\\s{21}(/\\S+?)=(.*)|\\s{21}(/\\S+))$");
128
129        protected static final Pattern readableFiles = Pattern.compile(".*(g[bp]k*$|\\u002eg[bp].*)");
130        protected static final Pattern headerLine = Pattern.compile("^LOCUS.*");
131        private static final String DBSOURCE = "DBSOURCE";
132        private static final String PRIMARY = "PRIMARY";
133        private static final String DBLINK = "DBLINK";
134
135//  private NCBITaxon tax = null;
136
137
138
139        private String parse(BufferedReader bufferedReader) {
140                String sectionKey = null;
141                List<String[]> section;
142                // Get an ordered list of key->value pairs in array-tuples
143                do {
144                        section = this.readSection(bufferedReader);
145                        sectionKey = section.get(0)[0];
146                        if (sectionKey == null) {
147                                //if we reach the end of the file, section contains empty strings
148                                if(section.get(0)[1]==null || section.get(0)[1]=="" ||
149                                                section.get(0)[1].length()==0) {
150                                        throw new ParserException(Messages.ENDOFFILE);
151                                }
152                                throw new ParserException(Messages.SECTIONKEYNULL);
153                        }
154                        // process section-by-section
155                        if (sectionKey.equals(LOCUS_TAG)) {
156                                String loc = section.get(0)[1];
157                                header = loc;
158                                Matcher m = lp.matcher(loc);
159                                if (m.matches()) {
160                                        headerParser.setName(m.group(1));
161                                        headerParser.setAccession(m.group(1)); // default if no accession found
162
163                                        String lengthUnits = m.group(2);
164                                        String type = m.group(5);
165
166                                        if (lengthUnits.equals("aa")) {
167                                                compoundType = AminoAcidCompoundSet.getAminoAcidCompoundSet();
168                                        } else if (lengthUnits.equals("bp")) {
169                                                if (type != null) {
170                                                        if (type.contains("RNA")) {
171                                                                compoundType = RNACompoundSet.getRNACompoundSet();
172                                                        } else {
173                                                                compoundType = DNACompoundSet.getDNACompoundSet();
174                                                        }
175                                                } else {
176                                                        compoundType = DNACompoundSet.getDNACompoundSet();
177                                                }
178                                        }
179
180                                        log.debug("compound type: {}", compoundType.getClass().getSimpleName());
181
182                                } else {
183                                        throw new ParserException("Bad locus line");
184                                }
185                        } else if (sectionKey.equals(DEFINITION_TAG)) {
186                                headerParser.setDescription(section.get(0)[1]);
187                        } else if (sectionKey.equals(ACCESSION_TAG)) {
188                                // if multiple accessions, store only first as accession,
189                                // and store rest in annotation
190                                String[] accs = section.get(0)[1].split("\\s+");
191                                accession = accs[0].trim();
192                                headerParser.setAccession(accession);
193                        } else if (sectionKey.equals(VERSION_TAG)) {
194                                String ver = section.get(0)[1];
195                                Matcher m = vp.matcher(ver);
196                                if (m.matches()) {
197                                        String verAcc = m.group(1);
198                                        if (!accession.equals(verAcc)) {
199                                                // the version refers to a different accession!
200                                                // believe the version line, and store the original
201                                                // accession away in the additional accession set
202                                                accession = verAcc;
203                                        }
204                                        if (m.group(3) != null) {
205                                                headerParser.setVersion(Integer.parseInt(m.group(3)));
206                                        }
207                                        if (m.group(5) != null) {
208                                                headerParser.setIdentifier(m.group(5));
209                                        }
210                                } else {
211                                        throw new ParserException("Bad version line");
212                                }
213                        } else if (sectionKey.equals(KEYWORDS_TAG)) {
214                        } else if (sectionKey.equals(SOURCE_TAG)) {
215                                // ignore - can get all this from the first feature
216                        } else if (sectionKey.equals(REFERENCE_TAG)) {
217                        } else if (sectionKey.equals(COMMENT_TAG)) {
218                                // Set up some comments
219                                headerParser.setComment(section.get(0)[1]);
220                        } else if (sectionKey.equals(FEATURE_TAG)) {
221                                // starting from second line of input, start a new feature whenever we come across
222                                // a key that does not start with /
223                                AbstractFeature gbFeature = null;
224                                for (int i = 1; i < section.size(); i++) {
225                                        String key = section.get(i)[0];
226                                        String val = section.get(i)[1];
227                                        if (key.startsWith("/")) {
228                                                if (gbFeature == null) {
229                                                        throw new ParserException("Malformed GenBank file: found a qualifier without feature.");
230                                                }
231                                                key = key.substring(1); // strip leading slash
232                                                val = val.replaceAll("\\s*[\\n\\r]+\\s*", " ").trim();
233                                                if (val.endsWith("\"")) {
234                                                        val = val.substring(1, val.length() - 1); // strip quotes
235                                                }
236                                                // parameter on old feature
237                                                if (key.equals("db_xref")) {
238                                                        Matcher m = dbxp.matcher(val);
239                                                        if (m.matches()) {
240                                                                String dbname = m.group(1);
241                                                                String raccession = m.group(2);
242                                                                Qualifier xref = new DBReferenceInfo(dbname, raccession);
243                                                                gbFeature.addQualifier(key, xref);
244
245                                                                ArrayList<DBReferenceInfo> listDBEntry = new ArrayList<DBReferenceInfo>();
246                                                                listDBEntry.add((DBReferenceInfo) xref);
247                                                                mapDB.put(key, listDBEntry);
248                                                        } else {
249                                                                throw new ParserException("Bad dbxref");
250                                                        }
251                                                } else if (key.equalsIgnoreCase("organism")) {
252                                                        Qualifier q = new Qualifier(key, val.replace('\n', ' '));
253                                                        gbFeature.addQualifier(key, q);
254                                                } else {
255                                                        if (key.equalsIgnoreCase("translation")) {
256                                                                // strip spaces from sequence
257                                                                val = val.replaceAll("\\s+", "");
258                                                                Qualifier q = new Qualifier(key, val);
259                                                                gbFeature.addQualifier(key, q);
260                                                        } else {
261                                                                Qualifier q = new Qualifier(key, val);
262                                                                gbFeature.addQualifier(key, q);
263                                                        }
264                                                }
265                                        } else {
266                                                // new feature!
267                                                gbFeature = new TextFeature(key, val, key, key);
268                                                Location l =
269                                                                locationParser.parse(val);
270                                                gbFeature.setLocation((AbstractLocation)l);
271
272                                                if (!featureCollection.containsKey(key)) {
273                                                        featureCollection.put(key, new ArrayList());
274                                                }
275                                                featureCollection.get(key).add(gbFeature);
276                                        }
277                                }
278                        } else if (sectionKey.equals(BASE_COUNT_TAG)) {
279                                // ignore - can calculate from sequence content later if needed
280                        } else if (sectionKey.equals(START_SEQUENCE_TAG)) {
281                                // our first line is ignorable as it is the ORIGIN tag
282                                // the second line onwards conveniently have the number as
283                                // the [0] tuple, and sequence string as [1] so all we have
284                                // to do is concat the [1] parts and then strip out spaces,
285                                // and replace '.' and '~' with '-' for our parser.
286                                StringBuffer seq = new StringBuffer();
287                                for (int i = 1; i < section.size(); i++) {
288                                        seq.append(section.get(i)[1]);
289                                }
290                                seqData = seq.toString().replaceAll("\\s+", "").replaceAll("[\\.|~]", "-").toUpperCase();
291                        } else if(sectionKey.equals(DBSOURCE)) {
292                                //TODO
293                        } else if(sectionKey.equals(PRIMARY)) {
294                                //TODO
295                        } else if(sectionKey.equals(DBLINK)) {
296                                //TODO
297                        } else {
298                                if(!sectionKey.equals(END_SEQUENCE_TAG)) {
299                                        log.info("found unknown section key: "+sectionKey);
300                                }
301                        }
302                } while (!sectionKey.equals(END_SEQUENCE_TAG));
303                return seqData;
304        }
305
306
307
308        // reads an indented section, combining split lines and creating a list of
309        // key->value tuples
310        // reads an indented section, combining split lines and creating a list of
311        // key->value tuples
312        // reads an indented section, combining split lines and creating a list of
313        // key->value tuples
314        private List<String[]> readSection(BufferedReader bufferedReader) {
315                List<String[]> section = new ArrayList<String[]>();
316                String line = "";
317
318                String currKey = null;
319                StringBuffer currVal = new StringBuffer();
320                boolean done = false;
321                int linecount = 0;
322
323                try {
324                        while (!done) {
325                                bufferedReader.mark(320);
326                                line = bufferedReader.readLine();
327                                String firstSecKey = section.isEmpty() ? ""
328                                                : section.get(0)[0];
329                                if (line != null && line.matches("\\p{Space}*")) {
330                                        // regular expression \p{Space}* will match line
331                                        // having only white space characters
332                                        continue;
333                                }
334                                if (line == null
335                                                || (!line.startsWith(" ") && linecount++ > 0 && (!firstSecKey
336                                                .equals(START_SEQUENCE_TAG) || line
337                                                .startsWith(END_SEQUENCE_TAG)))) {
338                                        // dump out last part of section
339                                        section.add(new String[]{currKey, currVal.toString()});
340                                        bufferedReader.reset();
341                                        done = true;
342                                } else {
343                                        Matcher m = sectp.matcher(line);
344                                        if (m.matches()) {
345                                                // new key
346                                                if (currKey != null) {
347                                                        section.add(new String[]{currKey,
348                                                                currVal.toString()});
349                                                }
350                                                // key = group(2) or group(4) or group(6) - whichever is
351                                                // not null
352                                                currKey = m.group(2) == null ? (m.group(4) == null ? m
353                                                                .group(6) : m.group(4)) : m.group(2);
354                                                currVal = new StringBuffer();
355                        // val = group(3) if group(2) not null, group(5) if
356                                                // group(4) not null, "" otherwise, trimmed
357                                                currVal.append((m.group(2) == null ? (m.group(4) == null ? ""
358                                                                : m.group(5))
359                                                                : m.group(3)).trim());
360                                        } else {
361                                                // concatted line or SEQ START/END line?
362                                                if (line.startsWith(START_SEQUENCE_TAG)
363                                                                || line.startsWith(END_SEQUENCE_TAG)) {
364                                                        currKey = line;
365                                                } else {
366                                                        currVal.append("\n"); // newline in between lines -
367                                                        // can be removed later
368                                                        currVal.append(currKey.charAt(0) == '/' ? line
369                                                                        .substring(21) : line.substring(12));
370                                                }
371                                        }
372                                }
373                        }
374                } catch (IOException e) {
375                        throw new ParserException(e.getMessage());
376                } catch (RuntimeException e) {
377                        throw new ParserException(e.getMessage());
378                }
379                return section;
380        }
381
382        @Override
383        public String getSequence(BufferedReader bufferedReader, int sequenceLength) throws IOException {
384                featureCollection = new HashMap<String, ArrayList<AbstractFeature>>();
385                mapDB = new LinkedHashMap<String, ArrayList<DBReferenceInfo>>();
386                headerParser = new GenericGenbankHeaderParser<S, C>();
387                try {
388                        parse(bufferedReader);
389                } catch (ParserException e) {
390                        if(e.getMessage().equalsIgnoreCase(Messages.ENDOFFILE)) return null;
391                        else throw new ParserException(e.getMessage());
392                }
393
394                return seqData;
395        }
396
397        public String getHeader() {
398                return header;
399        }
400
401        public GenericGenbankHeaderParser<S, C> getSequenceHeaderParser() {
402                return headerParser;
403        }
404
405        public LinkedHashMap<String, ArrayList<DBReferenceInfo>> getDatabaseReferences() {
406                return mapDB;
407        }
408
409        public ArrayList<String> getKeyWords() {
410                return new ArrayList<String>(featureCollection.keySet());
411        }
412
413        public ArrayList<AbstractFeature> getFeatures(String keyword) {
414                return featureCollection.get(keyword);
415        }
416        public HashMap<String, ArrayList<AbstractFeature>> getFeatures() {
417                return featureCollection;
418        }
419
420        public void parseFeatures(AbstractSequence<C> sequence) {
421                for (String k: featureCollection.keySet())
422                        for (AbstractFeature f: featureCollection.get(k))
423                                sequence.addFeature(f);
424        }
425
426        public CompoundSet<?> getCompoundType() {
427                return compoundType;
428        }
429}