001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021
022package org.biojava.bio.seq.io;
023
024import java.util.ArrayList;
025import java.util.List;
026import java.util.Map;
027
028import org.biojava.bio.BioError;
029import org.biojava.bio.BioException;
030import org.biojava.bio.seq.Feature;
031import org.biojava.bio.seq.RemoteFeature;
032import org.biojava.bio.seq.StrandedFeature;
033import org.biojava.bio.symbol.BetweenLocation;
034import org.biojava.bio.symbol.FuzzyLocation;
035import org.biojava.bio.symbol.FuzzyPointLocation;
036import org.biojava.bio.symbol.Location;
037import org.biojava.bio.symbol.LocationTools;
038import org.biojava.bio.symbol.PointLocation;
039import org.biojava.bio.symbol.RangeLocation;
040import org.biojava.utils.ChangeVetoException;
041import org.biojava.utils.SmallMap;
042
043/**
044 * <code>EmblLikeLocationParser</code> parses EMBL/Genbank style
045 * locations. Supported location forms:
046 *
047 * <pre>
048 *   123
049 *  <123 or >123
050 *  (123.567)
051 *  (123.567)..789
052 *   123..(567.789)
053 *  (123.345)..(567.789)
054 *   123..456
055 *  <123..567 or 123..>567 or <123..>567
056 *   123^567
057 *   AL123465:(123..567)
058 * </pre>
059 *
060 * @author Keith James
061 * @author Greg Cox
062 * @since 1.2
063 * @deprecated Use org.biojavax.bio.seq.io framework instead
064 */
065public class EmblLikeLocationParser
066{
067    // For the LocationLexer inner classs
068    private String        location;
069    private LocationLexer lexer;
070    private int           nextCharIndex;
071    private Object        thisToken;
072
073    // seq ID of the sequence we are parsing features for
074    private String        parentSeqID;
075
076    // Stores join/order/complement instructions
077    private List instructStack = new ArrayList();
078        // joinType is a hack to store if a compound location is a
079        // join(...) or an order(...) location.  If this isn't sufficient
080        // for your needs, feel free to fix it.  If you do, please make
081        // sure the AbstractGenEmblFileFormer class is still able to
082        // format join and order correctly. The joinType is stored in the
083        // Feature Annotation under the internal data key
084        // Feature.PROPERTY_DATA_KEY which means that it won't get printed
085        // in flatfile dumps.
086        private String joinType = null;
087
088    // List of sublocations.  Used for compound locations on the current
089    // sequence
090    private List  subLocations = new ArrayList();
091        // List of subRegions.  Used to store remote regions
092        private List  subRegions = new ArrayList();
093
094    // These hold working data for each (sub)location and are cleared
095    // by calling the processCoords() function
096    private String mRegionSeqID;
097    private List   startCoords = new ArrayList();
098    private List     endCoords = new ArrayList();
099    private boolean isPointLoc = true;
100    private boolean fuzzyCoord = false;
101    private boolean unboundMin = false;
102    private boolean unboundMax = false;
103    private boolean isBetweenLocation = false;
104
105    // Currently set per Feature; this is a deficiency in the current
106    // parser. Features are assumed to be on the positive strand until
107    // complemented.
108    // No features have a strand type of UNKNOWN
109    private StrandedFeature.Strand mStrandType = StrandedFeature.POSITIVE;
110
111    EmblLikeLocationParser(String parentSeqID)
112    {
113        this.lexer = new LocationLexer();
114        this.parentSeqID = parentSeqID;
115    }
116
117    /**
118     * <code>parseLocation</code> creates a <code>Location</code> from
119     * the String and returns a stranded location.
120     *
121     * @param location a location <code>String</code>.
122     * @param theTemplate the template to be filled with the parsed out location
123     * information.
124     *
125     * @exception BioException if an error occurs.
126     */
127    public Feature.Template parseLocation(String location, Feature.Template theTemplate)
128                throws BioException
129    {
130        this.location = location;
131
132        //fixme: mrp: removed this check - it may be killing performance
133        //if ((countChar(location, '(')) != (countChar(location, ')')))
134        //    throw new BioException("Unbalanced parentheses in location: "
135        //                           + location);
136
137        nextCharIndex = 0;
138
139        instructStack.clear();
140        subLocations.clear();
141        subRegions.clear();
142
143        // 'join' vs. 'order'
144        joinType = null;
145
146        thisToken = lexer.getNextToken();
147        while (thisToken != null)
148        {
149            if (String.class.isInstance(thisToken))
150            {
151                String toke = (String) thisToken;
152                if (toke.equals(".."))
153                {
154                                // This token indicates that this isn't a point
155                    isPointLoc = false;
156                }
157                else
158                {
159                    instructStack.add(thisToken);
160                }
161            }
162            else if (Integer.class.isInstance(thisToken))
163            {
164                if (isPointLoc)
165                    startCoords.add(thisToken);
166                else
167                    endCoords.add(thisToken);
168            }
169            else if (Character.class.isInstance(thisToken))
170            {
171                char toke = ((Character) thisToken).charValue();
172
173                switch (toke)
174                {
175                    case '(':
176                        break;
177
178                                        case ':':
179                                                processInstructs();
180                                                break;
181
182                    case '^':
183                        isBetweenLocation = true;
184                        break;
185
186                    case '<':
187                        unboundMin = true;
188                        break;
189
190                    case '>':
191                        unboundMax = true;
192                        break;
193
194                    case '.':
195                        // Catch range: (123.567)
196                        fuzzyCoord = true;
197                        break;
198
199                    case ',':
200                        processCoords();
201                        break;
202
203                    case ')':
204                        // Catch the end of range: (123.567)
205                        if (fuzzyCoord)
206                        {
207                            fuzzyCoord = false;
208                        }
209                        else
210                        {
211                            processCoords();
212                            processInstructs();
213                        }
214                        break;
215
216                    default:
217                        throw new BioException("Unknown character '"
218                                               + toke
219                                               + "' within location: "
220                                               + location);
221                }
222            }
223            thisToken = lexer.getNextToken();
224        }
225        processCoords();
226
227                // The location has been processed, and now the template gets filled
228                if (subLocations.size() == 1)
229                {
230                        theTemplate.location = (Location)subLocations.get(0);
231                }
232                else
233                {
234                        // EMBL ordering is in reverse on the complementary strand
235                        // but LocationTools sorts them anyway
236                        theTemplate.location = LocationTools.union(subLocations);
237                }
238
239                if (theTemplate instanceof StrandedFeature.Template)
240                {
241                        ((StrandedFeature.Template) theTemplate).strand = mStrandType;
242                }
243
244                if (subRegions.size() > subLocations.size())
245                {
246                        // This is a remote feature, so a new template has to be made
247                        RemoteFeature.Template newTemplate = new RemoteFeature.Template(theTemplate);
248                        newTemplate.regions = new ArrayList(subRegions);
249// FIXME:
250// I don't know how to create an appropriate resolver, so I'm leaving it
251// blank.  No doubt this will break things.
252// -- Gcox
253                        newTemplate.resolver = null;
254
255                        theTemplate = newTemplate;
256                }
257
258                if (joinType != null)
259                {
260                        try
261                        {
262                // Feature.PROPERTY_DATA_KEY signifies internal (not
263                // for flatfile) data
264                Map dat = new SmallMap();
265                dat.put("JoinType", joinType);
266                theTemplate.annotation.setProperty(Feature.PROPERTY_DATA_KEY,
267                                                   dat);
268                        }
269                        catch (ChangeVetoException cve)
270                        {
271                                throw new BioError(cve);
272                        }
273                }
274                return theTemplate;
275    }
276
277    /**
278     * <code>processCoords</code> uses the coordinate data in the
279     * start/endCoords Lists to create a Location and add to the
280     * subLocations List. As this code will require further
281     * modification to support fuzzy point locations, please keep any
282     * changes well-commented.
283     *
284     * @exception BioException if an error occurs.
285     */
286    private void processCoords() throws BioException
287    {
288        int outerMin, innerMin, innerMax, outerMax;
289        Location createdLocation = null;
290
291        // This is expected where two calls to processCoords() are
292        // made sequentially e.g. where two levels of parens are
293        // closed. The second call will have no data to process.
294        if (startCoords.isEmpty() && endCoords.isEmpty())
295            return;
296
297        // Range of form 5^6 or 5^7
298        if (isBetweenLocation)
299        {
300            // Create a ranged location, and wrap it in a between location
301            int minCoord = ((Integer) startCoords.get(0)).intValue();
302            int maxCoord = ((Integer) startCoords.get(1)).intValue();
303            createdLocation = new BetweenLocation(new RangeLocation(minCoord, maxCoord));
304        }
305        // Range of form: 123
306        else if (startCoords.size() == 1 && endCoords.isEmpty())
307        {
308            innerMin = outerMin = ((Integer) startCoords.get(0)).intValue();
309            innerMax = outerMax = innerMin;
310
311            // This looks like a point, but is actually a range which
312            // lies entirely outside the current entry
313            if (unboundMin || unboundMax)
314            {
315                createdLocation = new FuzzyPointLocation(unboundMin ? Integer.MIN_VALUE : innerMin,
316                                                         unboundMax ? Integer.MAX_VALUE : innerMax,
317                                                         FuzzyPointLocation.RESOLVE_AVERAGE);
318            }
319            else if (isPointLoc)
320            {
321                createdLocation = new PointLocation(innerMin);
322            }
323            else
324            {
325                throw new BioException("Internal error in location parsing; parser became confused: "
326                                       + location);
327            }
328        }
329        // Range of form: (123.567)
330        else if (startCoords.size() == 2 && endCoords.isEmpty())
331        {
332            innerMin = outerMin = ((Integer) startCoords.get(0)).intValue();
333            innerMax = outerMax = ((Integer) startCoords.get(1)).intValue();
334
335            createdLocation = new FuzzyPointLocation(innerMin,
336                                                     innerMax,
337                                                     FuzzyPointLocation.RESOLVE_AVERAGE);
338        }
339        // Range of form: 123..567 or <123..567 or 123..>567 or <123..>567
340        else if (startCoords.size() == 1 && endCoords.size() == 1)
341        {
342            innerMin = outerMin = ((Integer) startCoords.get(0)).intValue();
343            innerMax = outerMax = ((Integer) endCoords.get(0)).intValue();
344
345            if (unboundMin || unboundMax)
346            {
347                createdLocation = new FuzzyLocation(unboundMin ? Integer.MIN_VALUE : outerMin,
348                                                    unboundMax ? Integer.MAX_VALUE : outerMax,
349                                                    innerMin,
350                                                    innerMax,
351                                                    FuzzyLocation.RESOLVE_INNER);
352            }
353            else
354            {
355                try
356                {
357                    createdLocation = new RangeLocation(outerMin, outerMax);
358                }
359                catch (IndexOutOfBoundsException ioe)
360                {
361                    throw new BioException(ioe);
362                }
363            }
364        }
365        // Range of form: (123.567)..789
366        else if (startCoords.size() == 2 && endCoords.size() == 1)
367        {
368            outerMin = ((Integer) startCoords.get(0)).intValue();
369            innerMin = ((Integer) startCoords.get(1)).intValue();
370            innerMax = outerMax = ((Integer) endCoords.get(0)).intValue();
371
372            createdLocation = new FuzzyLocation(outerMin,
373                                                outerMax,
374                                                innerMin,
375                                                innerMax,
376                                                FuzzyLocation.RESOLVE_INNER);
377        }
378        // Range of form: 123..(567.789)
379        else if (startCoords.size() == 1 && endCoords.size() == 2)
380        {
381            outerMin = innerMin = ((Integer) startCoords.get(0)).intValue();
382            innerMax = ((Integer) endCoords.get(0)).intValue();
383            outerMax = ((Integer) endCoords.get(1)).intValue();
384
385            createdLocation = new FuzzyLocation(outerMin,
386                                                outerMax,
387                                                innerMin,
388                                                innerMax,
389                                                FuzzyLocation.RESOLVE_INNER);
390        }
391        // Range of form: (123.345)..(567.789)
392        else if (startCoords.size() == 2 && endCoords.size() == 2)
393        {
394            outerMin = ((Integer) startCoords.get(0)).intValue();
395            innerMin = ((Integer) startCoords.get(1)).intValue();
396            innerMax = ((Integer) endCoords.get(0)).intValue();
397            outerMax = ((Integer) endCoords.get(1)).intValue();
398
399            createdLocation = new FuzzyLocation(outerMin,
400                                                outerMax,
401                                                innerMin,
402                                                innerMax,
403                                                FuzzyLocation.RESOLVE_INNER);
404        }
405        else
406        {
407            throw new BioException("Internal error in location parsing; parser became confused; "
408                                   + location);
409        }
410
411        startCoords.clear();
412        endCoords.clear();
413
414        if (mRegionSeqID == null)
415        {
416            subLocations.add(createdLocation);
417            subRegions.add(new RemoteFeature.Region(createdLocation, parentSeqID, false));
418        }
419        else
420        {
421            subRegions.add(new RemoteFeature.Region(createdLocation, mRegionSeqID, true));
422        }
423
424        mRegionSeqID = null;
425        isPointLoc   = true;
426        unboundMin   = false;
427        unboundMax   = false;
428        fuzzyCoord   = false;
429        isBetweenLocation = false;
430        mStrandType  = StrandedFeature.POSITIVE;
431    }
432    
433    /**
434     * <code>processInstructs</code> pops an instruction off the stack
435     * and applies it to the sub(locations).
436     *
437     * @exception BioException if an unsupported instruction is found.
438     */
439    private void processInstructs() throws BioException
440    {
441        String instruct = (String) instructStack.remove(instructStack.size() - 1);
442        if (instruct.equals("join") || instruct.equals("order"))
443        {
444            joinType = instruct;
445        }
446        else if (instruct.equals("complement"))
447        {
448            // This should only set the strand for a single range
449            // within a feature. However, BioJava Locations have no
450            // concept of strand and therefore are unable to support
451            // construction of Features where some ranges are on
452            // different strands. As a result the mStrandType
453            // flag currently sets the strand for the whole feature.
454            mStrandType = StrandedFeature.NEGATIVE;
455        }
456        else
457        {
458            // This is a primary accession number
459            // e.g. J00194:(100..202)
460            mRegionSeqID = instruct;
461        }
462    }
463
464
465    /**
466     * <code>LocationLexer</code> is based on the
467     * <code>LocationLexer</code> class in the Artemis source code by
468     * Kim Rutherford.
469     *
470     * @author Kim Rutherford
471     * @author Keith James
472     * @author Greg Cox
473     * @since 1.2
474     */
475    private class LocationLexer
476    {
477        private StringBuffer  intString = new StringBuffer();
478        private StringBuffer textString = new StringBuffer();
479
480        /**
481         * <code>getNextToken</code> returns the next token. A null
482         * indicates no more tokens.
483         *
484         * @return an <code>Object</code> value.
485         */
486        Object getNextToken()
487        {
488            while (true)
489            {
490                if (nextCharIndex == location.length())
491                    return null;
492
493                char thisChar = location.charAt(nextCharIndex);
494
495                switch (thisChar)
496                {
497                    case ' ' : case '\t' :
498                        nextCharIndex++;
499                        continue;
500
501                    case ':' : case '^' : case ',' :
502                    case '(' : case ')' : case '<' :
503                    case '>' :
504                        nextCharIndex++;
505                        return new Character(thisChar);
506
507                    case '.' :
508                        if (location.charAt(nextCharIndex + 1) == '.')
509                        {
510                            nextCharIndex += 2;
511                            return "..";
512                        }
513                        else
514                        {
515                            nextCharIndex++;
516                            return new Character('.');
517                        }
518
519                    case '0' : case '1' : case '2' : case '3' : case '4' :
520                    case '5' : case '6' : case '7' : case '8' : case '9' :
521                        return followInteger();
522
523                    default :
524                        String text = followText();
525                        if (text.equals(""))
526                        {
527                            nextCharIndex++;
528                            return new String("" + thisChar);
529                        }
530                        else
531                            return text;
532                }
533            }
534        }
535
536        /**
537         * <code>followInteger</code> returns single sequence
538         * coordinate.
539         *
540         * @return an <code>Integer</code> value.
541         */
542        private Integer followInteger()
543        {
544            intString.setLength(0);
545            char thisChar = location.charAt(nextCharIndex);
546
547            while (Character.isDigit(thisChar))
548            {
549                intString.append(thisChar);
550                nextCharIndex++;
551
552                if (nextCharIndex >= location.length())
553                    break;
554
555                thisChar = location.charAt(nextCharIndex);
556            }
557
558            return new Integer(intString.substring(0));
559        }
560
561        /**
562         * <code>followText</code> returns a single text string.
563         *
564         * @return a <code>String</code> value.
565         */
566        private String followText()
567        {
568            textString.setLength(0);
569            char thisChar = location.charAt(nextCharIndex);
570
571            // First character must be a letter
572            if (! Character.isLetter(thisChar))
573                return "";
574
575            while (Character.isLetterOrDigit(thisChar) ||
576                   thisChar == '.')
577            {
578                textString.append(thisChar);
579                nextCharIndex++;
580
581                if (nextCharIndex >= location.length())
582                    break;
583
584                thisChar = location.charAt(nextCharIndex);
585            }
586
587            return textString.substring(0);
588        }
589    }
590}