001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 * Created on 01-21-2010
021 */
022package org.biojava.nbio.core.sequence.location;
023
024import org.biojava.nbio.core.exceptions.ParserException;
025import org.biojava.nbio.core.sequence.AccessionID;
026import org.biojava.nbio.core.sequence.DataSource;
027import org.biojava.nbio.core.sequence.Strand;
028import org.biojava.nbio.core.sequence.location.template.AbstractLocation;
029import org.biojava.nbio.core.sequence.location.template.Location;
030import org.biojava.nbio.core.sequence.location.template.Point;
031
032import java.util.ArrayList;
033import java.util.List;
034import java.util.regex.Matcher;
035import java.util.regex.Pattern;
036
037/**
038 * Parser for working with INSDC style locations. This class supports the
039 * full range of location types generated by Genbank, INSDC and ENA.
040 *
041 * @author ayates
042 * @author jgrzebyta
043 * @author Paolo Pavan
044 */
045public class InsdcParser {
046
047        private boolean isSequenceCircular;
048        private long sequenceLength;
049
050        private final DataSource dataSource;
051
052                /**
053         * parse a location. if group(1) is null than the feature is on the positive
054         * strand, group(2) start position, group(3) end position.
055         */
056        protected static final Pattern singleLocationPattern = Pattern.compile("(?:([A-Za-z\\.0-9_]*?):)?(<?)(\\d+)(\\.{2}|\\^)?(>?)(\\d+)?(>?)?");
057        /**
058         * Decodes a split pattern. Split patterns are a composition of multiple
059         * locationsString qualified by actions: join(location,location, ...
060         * location): The indicated elements should be joined (placed end-to-end) to
061         * form one contiguous sequence. order(location,location, ... location): The
062         * elements can be found in the specified order (5' to 3' direction),
063         * nothing is implied about their reasonableness
064         * bond(location,location...location): Found in protein files. These
065         * generally are used to describe disulfide bonds.
066         * complement(location,location...location): consider locations in their
067         * complement versus
068         *
069         * takes in input a comma split location string. The split must be done
070         * for outer level commas group(1) is the qualifier group(2) is the location
071         * string to getFeatures. In case of complex splits it will contain the
072         * nested expression
073         *
074         * Not really sure that they are not declared obsolete but they are still in
075         * several files.
076         */
077        protected static final Pattern genbankSplitPattern = Pattern.compile("^\\s?(join|order|bond|complement|)\\(?([\\s\\S]+)\\)?");
078        /**
079         * designed to recursively split a location string in tokens. Valid tokens
080         * are those divided by coma that are not inside a bracket. I. e. split on
081         * the comma only if that comma has zero, or an even number of quotes ahead
082         * of it.
083         */
084        protected static final String locationSplitPattern = ",(?=([^\\(|\\)]+\\([^\\(|\\)]+\\))[^\\(|\\)]+)";
085        /**
086         * these variables are used to compute the global start and end of complex
087         * features
088         */
089        protected Integer featureGlobalStart, featureGlobalEnd;
090
091        enum complexFeaturesAppendEnum {
092
093                FLATTEN, HIERARCHICAL;
094        }
095        /**
096         * define the mode in which complex features should be appended in FLATTEN
097         * mode their will be appended to a single feature in HIERARCHICAL mode, the
098         * single mother feature will have a tree of features that will reflect the
099         * construction in genbank file
100         */
101        private complexFeaturesAppendEnum complexFeaturesAppendMode = complexFeaturesAppendEnum.HIERARCHICAL;
102
103        public void setComplexFeaturesAppendMode(complexFeaturesAppendEnum complexFeaturesAppendMode) {
104                this.complexFeaturesAppendMode = complexFeaturesAppendMode;
105        }
106
107        public InsdcParser() {
108                this(DataSource.ENA);
109        }
110
111        public InsdcParser(DataSource dataSource) {
112                this.dataSource = dataSource;
113        }
114
115        public DataSource getDataSource() {
116                return dataSource;
117        }
118
119        public void setSequenceCircular(boolean sequenceCircular) {
120                isSequenceCircular = sequenceCircular;
121        }
122
123        public void setSequenceLength(long sequenceLength) {
124                this.sequenceLength = sequenceLength;
125        }
126
127        /**
128         * Main method for parsing a location from a String instance
129         *
130         * @param locationString Represents a logical location
131         * @return The parsed location
132         * @throws ParserException thrown in the event of any error during parsing
133         */
134        public Location parse(String locationString) {
135                featureGlobalStart = Integer.MAX_VALUE;
136                featureGlobalEnd = 1;
137
138                Location l;
139                List<Location> ll = parseLocationString(locationString, 1);
140
141                if (ll.size() == 1) {
142                        l = ll.get(0);
143                } else {
144                        l = new SimpleLocation(
145                                        new SimplePoint(featureGlobalStart),
146                                        new SimplePoint(featureGlobalEnd),
147                                        Strand.UNDEFINED,
148                                        isSequenceCircular,
149                                        ll);
150                }
151                return l;
152        }
153
154        private List<Location> parseLocationString(String string, int versus) {
155                Matcher m;
156                List<Location> boundedLocationsCollection = new ArrayList<>();
157
158                List<String> tokens = splitString(string);
159                for (String t : tokens) {
160                        m = genbankSplitPattern.matcher(t);
161                        if (!m.find()) {
162                                throw new ParserException("Cannot interpret split pattern " + t
163                                                + "\nin location string:" + string);
164                        }
165                        String splitQualifier = m.group(1);
166                        String splitString = m.group(2);
167
168                        if (!splitQualifier.isEmpty()) {
169                                //recursive case
170                                int localVersus = "complement".equalsIgnoreCase(splitQualifier) ? -1 : 1;
171                                List<Location> subLocations = parseLocationString(
172                                                splitString, versus * localVersus);
173
174                                switch (complexFeaturesAppendMode) {
175                                        case FLATTEN:
176                                                boundedLocationsCollection.addAll(subLocations);
177                                                break;
178                                        case HIERARCHICAL:
179                                                if (subLocations.size() == 1) {
180                                                        boundedLocationsCollection.addAll(subLocations);
181                                                } else {
182                                                        Point min = Location.Tools.getMin(subLocations).getStart();
183                                                        Point max = Location.Tools.getMax(subLocations).getEnd();
184                                                        AbstractLocation motherLocation
185                                                                        = new SimpleLocation(
186                                                                                        min,
187                                                                                        max
188                                                                        );
189
190                                                        if ("join".equalsIgnoreCase(splitQualifier)) {
191                                                                motherLocation = new InsdcLocations.GroupLocation(subLocations);
192                                                        }
193                                                        if ("order".equalsIgnoreCase(splitQualifier)) {
194                                                                motherLocation = new InsdcLocations.OrderLocation(subLocations);
195                                                        }
196                                                        if ("bond".equalsIgnoreCase(splitQualifier)) {
197                                                                motherLocation = new InsdcLocations.BondLocation(subLocations);
198                                                        }
199                                                        motherLocation.setStrand(getGroupLocationStrand(subLocations));
200                                                        boundedLocationsCollection.add(motherLocation);
201                                                }
202                                        break;
203                                }
204                        } else {
205                                //base case
206                                m = singleLocationPattern.matcher(splitString);
207                                if (!m.find()) {
208                                        throw new ParserException("Cannot interpret location pattern " + splitString
209                                                        + "\nin location string:" + string);
210                                }
211
212                                String accession = m.group(1);
213                                Strand s = versus == 1 ? Strand.POSITIVE : Strand.NEGATIVE;
214                                int start = Integer.valueOf(m.group(3));
215                                int end = m.group(6) == null ? start : Integer.valueOf(m.group(6));
216
217                                if (featureGlobalStart > start) {
218                                        featureGlobalStart = start;
219                                }
220                                if (featureGlobalEnd < end) {
221                                        featureGlobalEnd = end;
222                                }
223
224                                AbstractLocation l;
225                                if (start <= end) {
226                                        l = new SimpleLocation(
227                                                        start,
228                                                        end,
229                                                        s
230                                        );
231                                } else {
232                                        // in case of location spanning the end point, Location contract wants sublocations
233                                        AbstractLocation l5prime = new SimpleLocation(
234                                                        1,
235                                                        end,
236                                                        Strand.UNDEFINED
237                                                        );
238                                        AbstractLocation l3prime = new SimpleLocation(
239                                                        start,
240                                                        (int) sequenceLength,
241                                                        Strand.UNDEFINED
242                                                        );
243
244                                        l = new InsdcLocations.GroupLocation(
245                                                        new SimplePoint(start),
246                                                        new SimplePoint(end),
247                                                        s,
248                                                        isSequenceCircular,
249                                                        l5prime, l3prime
250                                        );
251
252                                }
253
254                                if("^".equals(m.group(4))) l.setBetweenCompounds(true);
255
256                                if ("<".equals(m.group(2))) {
257                                        l.setPartialOn5prime(true);
258                                }
259                                if (m.group(5) != null && (">".equals(m.group(5)) || ">".equals(m.group(7)))) {
260                                        l.setPartialOn3prime(true);
261                                }
262
263                                if (!(accession == null || "".equals(accession))) l.setAccession(new AccessionID(accession));
264
265                                boundedLocationsCollection.add(l);
266
267                        }
268                }
269
270                return boundedLocationsCollection;
271        }
272
273
274        private List<String> splitString(String input) {
275                List<String> result = new ArrayList<>();
276                int start = 0;
277                int openedParenthesis = 0;
278                for (int current = 0; current < input.length(); current++) {
279                        if (input.charAt(current) == '(') {
280                                openedParenthesis++;
281                        }
282                        if (input.charAt(current) == ')') {
283                                openedParenthesis--;
284                        }
285                        boolean atLastChar = (current == input.length() - 1);
286                        if (atLastChar) {
287                                result.add(input.substring(start));
288                        } else if (input.charAt(current) == ',' && openedParenthesis == 0) {
289                                result.add(input.substring(start, current));
290                                start = current + 1;
291                        }
292                }
293                return result;
294        }
295
296        private Strand getGroupLocationStrand(List<Location> ll){
297                Strand returnStrand = null;
298
299                for (Location l: ll) {
300                        if (returnStrand == null) returnStrand = l.getStrand();
301                        if (returnStrand != l.getStrand()) return Strand.UNDEFINED;
302                }
303                return returnStrand;
304        }
305
306}