/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * @author Richard Holland
 * @author Mark Schreiber
 * @author David Scott
 * @author Bubba Puryear
 * @author George Waldon
 * @author Deepak Sheoran
 * @author Karl Nicholas <github:karlnicholas>
 * @author Jacek Grzebyta
 * @author Paolo Pavan
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 * Created on 01-21-2010
 */
package org.biojava.nbio.core.sequence.io;

import org.biojava.nbio.core.exceptions.Messages;
import org.biojava.nbio.core.exceptions.ParserException;
import org.biojava.nbio.core.sequence.DataSource;
import org.biojava.nbio.core.sequence.compound.AminoAcidCompoundSet;
import org.biojava.nbio.core.sequence.compound.DNACompoundSet;
import org.biojava.nbio.core.sequence.compound.RNACompoundSet;
import org.biojava.nbio.core.sequence.features.AbstractFeature;
import org.biojava.nbio.core.sequence.features.DBReferenceInfo;
import org.biojava.nbio.core.sequence.features.Qualifier;
import org.biojava.nbio.core.sequence.features.TextFeature;
import org.biojava.nbio.core.sequence.io.template.SequenceParserInterface;
import org.biojava.nbio.core.sequence.location.InsdcParser;
import org.biojava.nbio.core.sequence.location.template.AbstractLocation;
import org.biojava.nbio.core.sequence.location.template.Location;
import org.biojava.nbio.core.sequence.reference.GenbankReference;
import org.biojava.nbio.core.sequence.template.AbstractSequence;
import org.biojava.nbio.core.sequence.template.Compound;
import org.biojava.nbio.core.sequence.template.CompoundSet;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.BufferedReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
063
064public class GenbankSequenceParser<S extends AbstractSequence<C>, C extends Compound> implements SequenceParserInterface{
065
066        private String seqData = null;
067        private GenericGenbankHeaderParser<S, C> headerParser;
068        private String header;
069        private String accession;
070        public LinkedHashMap<String, ArrayList<DBReferenceInfo>> mapDB;
071        /**
072         * this data structure collects list of features extracted from the
073         * FEATURE_TAG section They are organized by list of the same type (i.e.
074         * same genbank Feature) and are provided with location
075         */
076        private HashMap<String, ArrayList<AbstractFeature>> featureCollection;
077
078        private Logger log = LoggerFactory.getLogger(getClass());
079
080        // this is a compoundset parsed from header.
081        private CompoundSet<?> compoundType;
082
083        /**
084         * The name of this format
085         */
086        public static final String GENBANK_FORMAT = "GENBANK";
087
088        protected static final String LOCUS_TAG = "LOCUS";
089        protected static final String DEFINITION_TAG = "DEFINITION";
090        protected static final String ACCESSION_TAG = "ACCESSION";
091        protected static final String VERSION_TAG = "VERSION";
092        protected static final String KEYWORDS_TAG = "KEYWORDS";
093        //                                                  "SEGMENT"
094        protected static final String SOURCE_TAG = "SOURCE";
095        protected static final String ORGANISM_TAG = "ORGANISM";
096        protected static final String REFERENCE_TAG = "REFERENCE";
097        protected static final String AUTHORS_TAG = "AUTHORS";
098        protected static final String CONSORTIUM_TAG = "CONSRTM";
099        protected static final String TITLE_TAG = "TITLE";
100        protected static final String JOURNAL_TAG = "JOURNAL";
101        protected static final String PUBMED_TAG = "PUBMED";
102        protected static final String MEDLINE_TAG = "MEDLINE"; //deprecated
103        protected static final String REMARK_TAG = "REMARK";
104        protected static final String COMMENT_TAG = "COMMENT";
105        protected static final String FEATURE_TAG = "FEATURES";
106        protected static final String BASE_COUNT_TAG_FULL = "BASE COUNT"; //deprecated
107        protected static final String BASE_COUNT_TAG = "BASE";
108        //                                                  "CONTIG"
109        protected static final String START_SEQUENCE_TAG = "ORIGIN";
110        protected static final String END_SEQUENCE_TAG = "//";
111        // locus line
112        protected static final Pattern lp = Pattern.compile("^(\\S+)\\s+\\d+\\s+(bp|BP|aa|AA)\\s{0,4}(([dmsDMS][sS]-)?(\\S+))?\\s*(circular|CIRCULAR|linear|LINEAR)?\\s*(\\S+)?\\s*(\\S+)?$");
113        // version line
114        protected static final Pattern vp = Pattern.compile("^(\\S*?)(\\.(\\d+))?(\\s+GI:(\\S+))?$");
115        // reference line
116        protected static final Pattern refRange = Pattern.compile("^\\s*(\\d+)\\s+to\\s+(\\d+)$");
117        protected static final Pattern refp = Pattern.compile("^(\\d+)\\s*(?:(\\((?:bases|residues)\\s+(\\d+\\s+to\\s+\\d+(\\s*;\\s*\\d+\\s+to\\s+\\d+)*)\\))|\\(sites\\))?");
118        // dbxref line
119        protected static final Pattern dbxp = Pattern.compile("^([^:]+):(\\S+)$");
120
121        protected static final InsdcParser locationParser = new InsdcParser(DataSource.GENBANK);
122        //sections start at a line and continue till the first line afterwards with a
123        //non-whitespace first character
124        //we want to match any of the following as a new section within a section
125        //  \s{0,8} word \s{0,7} value
126        //  \s{21} /word = value
127        //  \s{21} /word
128        protected static final Pattern sectp = Pattern.compile("^(\\s{0,8}(\\S+)\\s{0,7}(.*)|\\s{21}(/\\S+?)=(.*)|\\s{21}(/\\S+))$");
129
130        protected static final Pattern readableFiles = Pattern.compile(".*(g[bp]k*$|\\u002eg[bp].*)");
131        protected static final Pattern headerLine = Pattern.compile("^LOCUS.*");
132        private static final String DBSOURCE = "DBSOURCE";
133        private static final String PRIMARY = "PRIMARY";
134        private static final String DBLINK = "DBLINK";
135
136//  private NCBITaxon tax = null;
137
138
139
140        private String parse(BufferedReader bufferedReader) {
141                String sectionKey = null;
142                List<String[]> section;
143                // Get an ordered list of key->value pairs in array-tuples
144                do {
145                        section = this.readSection(bufferedReader);
146                        sectionKey = section.get(0)[0];
147                        if (sectionKey == null) {
148                                //if we reach the end of the file, section contains empty strings
149                                if(section.get(0)[1]==null || section.get(0)[1]=="" ||
150                                                section.get(0)[1].length()==0) {
151                                        throw new ParserException(Messages.ENDOFFILE);
152                                }
153                                throw new ParserException(Messages.SECTIONKEYNULL);
154                        }
155                        // process section-by-section
156                        if (sectionKey.equals(LOCUS_TAG)) {
157                                String loc = section.get(0)[1];
158                                header = loc;
159                                Matcher m = lp.matcher(loc);
160                                if (m.matches()) {
161                                        headerParser.setName(m.group(1));
162                                        headerParser.setAccession(m.group(1)); // default if no accession found
163
164                                        String lengthUnits = m.group(2);
165                                        String type = m.group(5);
166
167                                        if (lengthUnits.equalsIgnoreCase("aa")) {
168                                                compoundType = AminoAcidCompoundSet.getAminoAcidCompoundSet();
169                                        } else if (lengthUnits.equalsIgnoreCase("bp")) {
170                                                if (type != null) {
171                                                        if (type.contains("RNA")) {
172                                                                compoundType = RNACompoundSet.getRNACompoundSet();
173                                                        } else {
174                                                                compoundType = DNACompoundSet.getDNACompoundSet();
175                                                        }
176                                                } else {
177                                                        compoundType = DNACompoundSet.getDNACompoundSet();
178                                                }
179                                        }
180
181                                        log.debug("compound type: {}", compoundType.getClass().getSimpleName());
182
183                                } else {
184                                        throw new ParserException("Bad locus line");
185                                }
186                        } else if (sectionKey.equals(DEFINITION_TAG)) {
187                                headerParser.setDescription(section.get(0)[1]);
188                        } else if (sectionKey.equals(ACCESSION_TAG)) {
189                                // if multiple accessions, store only first as accession,
190                                // and store rest in annotation
191                                String[] accs = section.get(0)[1].split("\\s+");
192                                accession = accs[0].trim();
193                                headerParser.setAccession(accession);
194                        } else if (sectionKey.equals(VERSION_TAG)) {
195                                String ver = section.get(0)[1];
196                                Matcher m = vp.matcher(ver);
197                                if (m.matches()) {
198                                        String verAcc = m.group(1);
199                                        if (!accession.equals(verAcc)) {
200                                                // the version refers to a different accession!
201                                                // believe the version line, and store the original
202                                                // accession away in the additional accession set
203                                                accession = verAcc;
204                                        }
205                                        if (m.group(3) != null) {
206                                                headerParser.setVersion(Integer.parseInt(m.group(3)));
207                                        }
208                                        if (m.group(5) != null) {
209                                                headerParser.setIdentifier(m.group(5));
210                                        }
211                                } else {
212                                        throw new ParserException("Bad version line");
213                                }
214                        } else if (sectionKey.equals(KEYWORDS_TAG)) {
215                        } else if (sectionKey.equals(SOURCE_TAG)) {
216                                // ignore - can get all this from the first feature
217                        } else if (sectionKey.equals(REFERENCE_TAG)) {
218                                if (!section.isEmpty()) {
219                                        GenbankReference genbankReference = new GenbankReference();
220                                        for (String[] ref : section) {
221                                                if (ref[0].equals(AUTHORS_TAG)) {
222                                                        genbankReference.setAuthors(ref[1]);
223                                                } else if (ref[0].equals(TITLE_TAG)) {
224                                                        genbankReference.setTitle(ref[1]);
225                                                } else if (ref[0].equals(JOURNAL_TAG)) {
226                                                        genbankReference.setJournal(ref[1]);
227                                                }
228                                        }
229                                        headerParser.addReference(genbankReference);
230                                }
231                        } else if (sectionKey.equals(COMMENT_TAG)) {
232                                // Set up some comments
233                                headerParser.setComment(section.get(0)[1]);
234                        } else if (sectionKey.equals(FEATURE_TAG)) {
235                                // starting from second line of input, start a new feature whenever we come across
236                                // a key that does not start with /
237                                AbstractFeature gbFeature = null;
238                                for (int i = 1; i < section.size(); i++) {
239                                        String key = section.get(i)[0];
240                                        String val = section.get(i)[1];
241                                        if (key.startsWith("/")) {
242                                                if (gbFeature == null) {
243                                                        throw new ParserException("Malformed GenBank file: found a qualifier without feature.");
244                                                }
245                                                key = key.substring(1); // strip leading slash
246                                                val = val.replaceAll("\\s*[\\n\\r]+\\s*", " ").trim();
247                                                if (val.endsWith("\"")) {
248                                                        val = val.substring(1, val.length() - 1); // strip quotes
249                                                }
250                                                // parameter on old feature
251                                                if (key.equals("db_xref")) {
252                                                        Matcher m = dbxp.matcher(val);
253                                                        if (m.matches()) {
254                                                                String dbname = m.group(1);
255                                                                String raccession = m.group(2);
256                                                                Qualifier xref = new DBReferenceInfo(dbname, raccession);
257                                                                gbFeature.addQualifier(key, xref);
258
259                                                                ArrayList<DBReferenceInfo> listDBEntry = new ArrayList<DBReferenceInfo>();
260                                                                listDBEntry.add((DBReferenceInfo) xref);
261                                                                mapDB.put(key, listDBEntry);
262                                                        } else {
263                                                                throw new ParserException("Bad dbxref");
264                                                        }
265                                                } else if (key.equalsIgnoreCase("organism")) {
266                                                        Qualifier q = new Qualifier(key, val.replace('\n', ' '));
267                                                        gbFeature.addQualifier(key, q);
268                                                } else {
269                                                        if (key.equalsIgnoreCase("translation")) {
270                                                                // strip spaces from sequence
271                                                                val = val.replaceAll("\\s+", "");
272                                                                Qualifier q = new Qualifier(key, val);
273                                                                gbFeature.addQualifier(key, q);
274                                                        } else {
275                                                                Qualifier q = new Qualifier(key, val);
276                                                                gbFeature.addQualifier(key, q);
277                                                        }
278                                                }
279                                        } else {
280                                                // new feature!
281                                                gbFeature = new TextFeature(key, val, key, key);
282                                                Location l =
283                                                                locationParser.parse(val);
284                                                gbFeature.setLocation((AbstractLocation)l);
285
286                                                if (!featureCollection.containsKey(key)) {
287                                                        featureCollection.put(key, new ArrayList());
288                                                }
289                                                featureCollection.get(key).add(gbFeature);
290                                        }
291                                }
292                        } else if (sectionKey.equals(BASE_COUNT_TAG)) {
293                                // ignore - can calculate from sequence content later if needed
294                        } else if (sectionKey.equals(START_SEQUENCE_TAG)) {
295                                // our first line is ignorable as it is the ORIGIN tag
296                                // the second line onwards conveniently have the number as
297                                // the [0] tuple, and sequence string as [1] so all we have
298                                // to do is concat the [1] parts and then strip out spaces,
299                                // and replace '.' and '~' with '-' for our parser.
300                                StringBuffer seq = new StringBuffer();
301                                for (int i = 1; i < section.size(); i++) {
302                                        seq.append(section.get(i)[1]);
303                                }
304                                seqData = seq.toString().replaceAll("\\s+", "").replaceAll("[\\.|~]", "-").toUpperCase();
305                        } else if(sectionKey.equals(DBSOURCE)) {
306                                //TODO
307                        } else if(sectionKey.equals(PRIMARY)) {
308                                //TODO
309                        } else if(sectionKey.equals(DBLINK)) {
310                                //TODO
311                        } else {
312                                if(!sectionKey.equals(END_SEQUENCE_TAG)) {
313                                        log.info("found unknown section key: "+sectionKey);
314                                }
315                        }
316                } while (!sectionKey.equals(END_SEQUENCE_TAG));
317                return seqData;
318        }
319
320
321
322        // reads an indented section, combining split lines and creating a list of
323        // key->value tuples
324        // reads an indented section, combining split lines and creating a list of
325        // key->value tuples
326        // reads an indented section, combining split lines and creating a list of
327        // key->value tuples
328        private List<String[]> readSection(BufferedReader bufferedReader) {
329                List<String[]> section = new ArrayList<String[]>();
330                String line = "";
331
332                String currKey = null;
333                StringBuffer currVal = new StringBuffer();
334                boolean done = false;
335                int linecount = 0;
336
337                try {
338                        while (!done) {
339                                bufferedReader.mark(320);
340                                line = bufferedReader.readLine();
341                                String firstSecKey = section.isEmpty() ? ""
342                                                : section.get(0)[0];
343                                if (line != null && line.matches("\\p{Space}*")) {
344                                        // regular expression \p{Space}* will match line
345                                        // having only white space characters
346                                        continue;
347                                }
348                                if (line == null
349                                                || (!line.startsWith(" ") && linecount++ > 0 && (!firstSecKey
350                                                .equals(START_SEQUENCE_TAG) || line
351                                                .startsWith(END_SEQUENCE_TAG)))) {
352                                        // dump out last part of section
353                                        section.add(new String[]{currKey, currVal.toString()});
354                                        bufferedReader.reset();
355                                        done = true;
356                                } else {
357                                        Matcher m = sectp.matcher(line);
358                                        if (m.matches()) {
359                                                // new key
360                                                if (currKey != null) {
361                                                        section.add(new String[]{currKey,
362                                                                currVal.toString()});
363                                                }
364                                                // key = group(2) or group(4) or group(6) - whichever is
365                                                // not null
366                                                currKey = m.group(2) == null ? (m.group(4) == null ? m
367                                                                .group(6) : m.group(4)) : m.group(2);
368                                                currVal = new StringBuffer();
369                        // val = group(3) if group(2) not null, group(5) if
370                                                // group(4) not null, "" otherwise, trimmed
371                                                currVal.append((m.group(2) == null ? (m.group(4) == null ? ""
372                                                                : m.group(5))
373                                                                : m.group(3)).trim());
374                                        } else {
375                                                // concatted line or SEQ START/END line?
376                                                if (line.startsWith(START_SEQUENCE_TAG)
377                                                                || line.startsWith(END_SEQUENCE_TAG)) {
378                                                        currKey = line;
379                                                } else {
380                                                        currVal.append("\n"); // newline in between lines -
381                                                        // can be removed later
382                                                        currVal.append(currKey.charAt(0) == '/' ? line
383                                                                        .substring(21) : line.substring(12));
384                                                }
385                                        }
386                                }
387                        }
388                } catch (IOException e) {
389                        throw new ParserException(e.getMessage());
390                } catch (RuntimeException e) {
391                        throw new ParserException(e.getMessage());
392                }
393                return section;
394        }
395
396        @Override
397        public String getSequence(BufferedReader bufferedReader, int sequenceLength) throws IOException {
398                featureCollection = new HashMap<String, ArrayList<AbstractFeature>>();
399                mapDB = new LinkedHashMap<String, ArrayList<DBReferenceInfo>>();
400                headerParser = new GenericGenbankHeaderParser<S, C>();
401                try {
402                        parse(bufferedReader);
403                } catch (ParserException e) {
404                        if(e.getMessage().equalsIgnoreCase(Messages.ENDOFFILE)) return null;
405                        else throw new ParserException(e.getMessage());
406                }
407
408                return seqData;
409        }
410
411        public String getHeader() {
412                return header;
413        }
414
415        public GenericGenbankHeaderParser<S, C> getSequenceHeaderParser() {
416                return headerParser;
417        }
418
419        public LinkedHashMap<String, ArrayList<DBReferenceInfo>> getDatabaseReferences() {
420                return mapDB;
421        }
422
423        public ArrayList<String> getKeyWords() {
424                return new ArrayList<String>(featureCollection.keySet());
425        }
426
427        public ArrayList<AbstractFeature> getFeatures(String keyword) {
428                return featureCollection.get(keyword);
429        }
430        public HashMap<String, ArrayList<AbstractFeature>> getFeatures() {
431                return featureCollection;
432        }
433
434        public void parseFeatures(AbstractSequence<C> sequence) {
435                for (String k: featureCollection.keySet())
436                        for (AbstractFeature f: featureCollection.get(k))
437                                sequence.addFeature(f);
438        }
439
440        public CompoundSet<?> getCompoundType() {
441                return compoundType;
442        }
443}