001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 * Created on 01-21-2010
021 */
022package org.biojava.nbio.core.sequence.location;
023
024import org.biojava.nbio.core.exceptions.ParserException;
025import org.biojava.nbio.core.sequence.AccessionID;
026import org.biojava.nbio.core.sequence.DNASequence;
027import org.biojava.nbio.core.sequence.DataSource;
028import org.biojava.nbio.core.sequence.Strand;
029import org.biojava.nbio.core.sequence.location.template.AbstractLocation;
030import org.biojava.nbio.core.sequence.location.template.Location;
031import org.biojava.nbio.core.sequence.location.template.Point;
032import org.biojava.nbio.core.sequence.template.AbstractSequence;
033import org.biojava.nbio.core.sequence.template.Compound;
034
035import java.io.IOException;
036import java.io.Reader;
037import java.util.ArrayList;
038import java.util.List;
039import java.util.regex.Matcher;
040import java.util.regex.Pattern;
041
042/**
043 * Parser for working with INSDC style locations. This class supports the
044 * full range of location types generated by Genbank, INSDC and ENA.
045 *
046 * @author ayates
047 * @author jgrzebyta
048 * @author Paolo Pavan
049 */
050public class InsdcParser <S extends AbstractSequence<C>, C extends Compound>{
051
052        private final DataSource dataSource;
053
054                /**
055         * parse a location. if group(1) is null than the feature is on the positive
056         * strand, group(2) start position, group(3) end position.
057         */
058        // why in the location the first character was ignored?
059        //protected static final Pattern singleLocationPattern = Pattern.compile("(?:[A-Z]([A-Za-z\\.0-9_]*?):)?(<?)(\\d+)(\\.{2}|\\^)?(>?)(\\d+)?(>?)?");
060
061        // fixed issue #254
062        protected static final Pattern singleLocationPattern = Pattern.compile("(?:([A-Za-z\\.0-9_]*?):)?(<?)(\\d+)(\\.{2}|\\^)?(>?)(\\d+)?(>?)?");
063        /**
064         * Decodes a split pattern. Split patterns are a composition of multiple
065         * locationsString qualified by actions: join(location,location, ...
066         * location): The indicated elements should be joined (placed end-to-end) to
067         * form one contiguous sequence. order(location,location, ... location): The
068         * elements can be found in the specified order (5' to 3' direction),
069         * nothing is implied about their reasonableness
070         * bond(location,location...location): Found in protein files. These
071         * generally are used to describe disulfide bonds.
072         * complement(location,location...location): consider locations in their
073         * complement versus
074         *
075         * takes in input a comma splitted location string. The split must be done
076         * for outer level commas group(1) is the qualifier group(2) is the location
077         * string to getFeatures. In case of complex splits it will contain the
078         * nested expression
079         *
080         * Not really sure that they are not declared obsolete but they are still in
081         * several files.
082         */
083        //protected static final Pattern genbankSplitPattern = Pattern.compile("^\\s?(join|order|bond|complement|)\\(?([^\\)]+)\\)?");
084        protected static final Pattern genbankSplitPattern = Pattern.compile("^\\s?(join|order|bond|complement|)\\(?(.+)\\)?");
085        /**
086         * designed to recursively split a location string in tokens. Valid tokens
087         * are those divided by coma that are not inside a bracket. I. e. split on
088         * the comma only if that comma has zero, or an even number of quotes ahead
089         * of it.
090         */
091        protected static final String locationSplitPattern = ",(?=([^\\(|\\)]+\\([^\\(|\\)]+\\))[^\\(|\\)]+)";
092        /**
093         * these variables are used to compute the global start and end of complex
094         * features
095         */
096        protected Integer featureGlobalStart, featureGlobalEnd;
097
098        //private S referenceSequence = new org.biojava.nbio.core.sequence.DNASequence();
099        private AbstractSequence referenceSequence = new DNASequence();
100
101        enum complexFeaturesAppendEnum {
102
103                FLATTEN, HIERARCHICAL;
104        }
105        /**
106         * define the mode in which complex features should be appended in FLATTEN
107         * mode their will be appended to a single feature in HIERARCHICAL mode, the
108         * single mother feature will have a tree of features that will reflect the
109         * construction in genbank file
110         */
111        private complexFeaturesAppendEnum complexFeaturesAppendMode = complexFeaturesAppendEnum.HIERARCHICAL;
112
113        public void setComplexFeaturesAppendMode(complexFeaturesAppendEnum complexFeaturesAppendMode) {
114                this.complexFeaturesAppendMode = complexFeaturesAppendMode;
115        }
116
117        public InsdcParser() {
118                this(DataSource.ENA);
119        }
120
121        public InsdcParser(DataSource dataSource) {
122                this.dataSource = dataSource;
123        }
124
125        public DataSource getDataSource() {
126                return dataSource;
127        }
128
129
130
131        /**
132         * Main method for parsing a location from a String instance
133         *
134         * @param locationString Represents a logical location
135         * @return The parsed location
136         * @throws ParserException thrown in the event of any error during parsing
137         */
138        public Location parse(String locationString) throws ParserException {
139                featureGlobalStart = Integer.MAX_VALUE;
140                featureGlobalEnd = 1;
141
142                Location l;
143                List<Location> ll = parseLocationString(locationString, 1);
144
145                if (ll.size() == 1) {
146                        l = ll.get(0);
147                } else {
148                        l = new SimpleLocation(
149                                        featureGlobalStart,
150                                        featureGlobalEnd,
151                                        Strand.UNDEFINED,
152                                        ll);
153                }
154                return l;
155        }
156
157        /**
158         * Reader based version of the parse methods.
159         *
160         * @param reader The source of the data; assumes that end of the reader
161         * stream is the end of the location string to parse
162         * @return The parsed location
163         * @throws IOException Thrown with any reader error
164         * @throws ParserException Thrown with any error with parsing locations
165         */
166        public List<AbstractLocation> parse(Reader reader) throws IOException, ParserException {
167                // use parse(String s) instead!
168                return null;
169        }
170
171        private List<Location> parseLocationString(String string, int versus) throws ParserException {
172                Matcher m;
173                List<Location> boundedLocationsCollection = new ArrayList<Location>();
174
175                //String[] tokens = string.split(locationSplitPattern);
176                List<String> tokens = splitString(string);
177                for (String t : tokens) {
178                        m = genbankSplitPattern.matcher(t);
179                        if (!m.find()) {
180                                throw new ParserException("Cannot interpret split pattern " + t
181                                                + "\nin location string:" + string);
182                        }
183                        String splitQualifier = m.group(1);
184                        String splitString = m.group(2);
185
186                        if (!splitQualifier.isEmpty()) {
187                                //recursive case
188                                int localVersus = splitQualifier.equalsIgnoreCase("complement") ? -1 : 1;
189                                List<Location> subLocations = parseLocationString(splitString, versus * localVersus);
190
191                                switch (complexFeaturesAppendMode) {
192                                        case FLATTEN:
193                                                boundedLocationsCollection.addAll(subLocations);
194                                                break;
195                                        case HIERARCHICAL:
196                                                if (subLocations.size() == 1) {
197                                                        boundedLocationsCollection.addAll(subLocations);
198                                                } else {
199                                                        Point min = Location.Tools.getMin(subLocations).getStart();
200                                                        Point max = Location.Tools.getMax(subLocations).getEnd();
201                                                        AbstractLocation motherLocation
202                                                                        = new SimpleLocation(
203                                                                                        min,
204                                                                                        max
205                                                                        );
206
207                                                        if (splitQualifier.equalsIgnoreCase("join")) {
208                                                                motherLocation = new InsdcLocations.GroupLocation(subLocations);
209                                                        }
210                                                        if (splitQualifier.equalsIgnoreCase("order")) {
211                                                                motherLocation = new InsdcLocations.OrderLocation(subLocations);
212                                                        }
213                                                        if (splitQualifier.equalsIgnoreCase("bond")) {
214                                                                motherLocation = new InsdcLocations.BondLocation(subLocations);
215                                                        }
216                                                        motherLocation.setStrand(getGroupLocationStrand(subLocations));
217                                                        boundedLocationsCollection.add(motherLocation);
218                                                }
219                                        break;
220                                }
221                        } else {
222                                //base case
223                                m = singleLocationPattern.matcher(splitString);
224                                if (!m.find()) {
225                                        throw new ParserException("Cannot interpret location pattern " + splitString
226                                                        + "\nin location string:" + string);
227                                }
228
229                                String accession = m.group(1);
230                                Strand s = versus == 1 ? Strand.POSITIVE : Strand.NEGATIVE;
231                                int start = new Integer(m.group(3));
232                                int end = m.group(6) == null ? start : new Integer(m.group(6));
233
234                                if (featureGlobalStart > start) {
235                                        featureGlobalStart = start;
236                                }
237                                if (featureGlobalEnd < end) {
238                                        featureGlobalEnd = end;
239                                }
240
241                                AbstractLocation l = new SimpleLocation(
242                                                start,
243                                                end,
244                                                s
245                                );
246
247                                if(m.group(4) != null && m.group(4).equals("^")) l.setBetweenCompounds(true);
248
249                                if (m.group(2).equals("<")) {
250                                        l.setPartialOn5prime(true);
251                                }
252                                if (m.group(5) != null && (m.group(5).equals(">") || m.group(7).equals(">"))) {
253                                        l.setPartialOn3prime(true);
254                                }
255
256                                if (!(accession == null || "".equals(accession))) l.setAccession(new AccessionID(accession));
257
258                                boundedLocationsCollection.add(l);
259
260                        }
261                }
262
263                return boundedLocationsCollection;
264        }
265
266
267        private List<String> splitString(String input) {
268                List<String> result = new ArrayList<String>();
269                int start = 0;
270                int openedParenthesis = 0;
271                for (int current = 0; current < input.length(); current++) {
272                        if (input.charAt(current) == '(') {
273                                openedParenthesis++;
274                        }
275                        if (input.charAt(current) == ')') {
276                                openedParenthesis--;
277                        }
278                        boolean atLastChar = (current == input.length() - 1);
279                        if (atLastChar) {
280                                result.add(input.substring(start));
281                        } else if (input.charAt(current) == ',' && openedParenthesis == 0) {
282                                result.add(input.substring(start, current));
283                                start = current + 1;
284                        }
285                }
286                return result;
287        }
288
289        private Strand getGroupLocationStrand(List<Location> ll){
290                Strand returnStrand = null;
291
292                for (Location l: ll) {
293                        if (returnStrand == null) returnStrand = l.getStrand();
294                        if (returnStrand != l.getStrand()) return Strand.UNDEFINED;
295                }
296                return returnStrand;
297        }
298
299        public static void main(String args[]){
300                String[] testStrings = {
301                        "J00194.1:100..202",
302                        "A00001.5:34..45",
303                        "43..129",
304                        "bond(55,110)",
305                        "bond(34,35),join(56..80),complement(45,73)",
306                        "order(complement(30,40),70..80),bond(34,35),join(56,80),complement(45..56)",
307                        "join(join(complement(30,40),complement(70..80)),bond(34,35),join(56,80),complement(45..56))",
308                        "complement(join(complement(2000..4000),complement(70..80)),bond(34,35),join(56,80),complement(45..56))",
309
310                };
311                InsdcParser p = new InsdcParser();
312                p.setComplexFeaturesAppendMode(complexFeaturesAppendEnum.HIERARCHICAL);
313
314                for (String s: testStrings){
315                        Location l = p.parse(s);
316                        System.out.println(l.toString());
317                }
318
319        }
320
321}