001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021
022package org.biojavax.bio.seq.io;
023
024import java.util.ArrayList;
025import java.util.Collections;
026import java.util.Iterator;
027import java.util.List;
028import java.util.regex.Matcher;
029import java.util.regex.Pattern;
030
031import org.biojava.bio.seq.io.ParseException;
032import org.biojava.utils.ChangeVetoException;
033import org.biojavax.CrossRef;
034import org.biojavax.Namespace;
035import org.biojavax.RichObjectFactory;
036import org.biojavax.SimpleCrossRef;
037import org.biojavax.bio.seq.CompoundRichLocation;
038import org.biojavax.bio.seq.Position;
039import org.biojavax.bio.seq.RichLocation;
040import org.biojavax.bio.seq.SimplePosition;
041import org.biojavax.bio.seq.SimpleRichLocation;
042import org.biojavax.bio.seq.RichLocation.Strand;
043import org.biojavax.ontology.ComparableTerm;
044
045
046/**
047 * Parses Genbank location strings into RichLocation objects.
048 * @author Richard Holland
049 * @authour Deepak Sheoran
050 * @since 1.5
051 */
052public class GenbankLocationParser {
053    
054    // No instances please
055    private GenbankLocationParser() {}
056
057    /**
058     * Parses a location.
059     * @param featureNS the namespace of the feature this location lives on.
060     * @param featureAccession the accession of the sequence of the feature this location lives on.
061     * @param locationString the GenBank location string.
062     * @return RichLocation the equivalent RichLocation object.
063     * @throws ParseException if the parsing failed.
064     */
065    public static RichLocation parseLocation(Namespace featureNS, String featureAccession, String locationString) throws ParseException {
066        /*
067         
068          FROM GENBANK FEATURE TABLE DOCS
069         
0703.5.3 Location examples
071The following is a list of common location descriptors with their meanings:
072Location                  Description
073         
074467                       Points to a single base in the presented sequence
075         
076340..565                  Points to a continuous range of bases bounded by and
077                          including the starting and ending bases
078         
079<345..500                 Indicates that the exact lower boundary point of a
080                          feature is unknown.  The location begins at some
081                          base previous to the first base specified (which need
082                          not be contained in the presented sequence) and con-
083                          tinues to and includes the ending base
084         
085<1..888                   The feature starts before the first sequenced base
086                          and continues to and includes base 888
087         
088(102.110)                 Indicates that the exact location is unknown but that
089                          it is one of the bases between bases 102 and 110, in-
090                          clusive
091         
092(23.45)..600              Specifies that the starting point is one of the bases
093                          between bases 23 and 45, inclusive, and the end point
094                          is base 600
095         
096(122.133)..(204.221)      The feature starts at a base between 122 and 133,
097                          inclusive, and ends at a base between 204 and 221,
098                          inclusive
099         
100123^124                   Points to a site between bases 123 and 124
101         
102145^177                   Points to a site between two adjacent bases anywhere
103                          between bases 145 and 177
104         
105join(12..78,134..202)     Regions 12 to 78 and 134 to 202 should be joined to
106                          form one contiguous sequence
107         
108complement(1..23)         Complements region 1 to 23
109         
110complement(join(2691..4571,4918..5163)
111                          Joins regions 2691 to 4571 and 4918 to 5163, then
112                          complements the joined segments (the feature is
113                          on the strand complementary to the presented strand)
114         
115join(complement(4918..5163),complement(2691..4571))
116                          Complements regions 4918 to 5163 and 2691 to 4571,
117                          then joins the complemented segments (the feature is
118                          on the strand complementary to the presented strand)
119         
120complement(34..(122.126)) Start at one of the bases complementary to those
121                          between 122 and 126 on the presented strand and finish
122                          at the base complementary to base 34 (the feature is
123                          on the strand complementary to the presented strand)
124         
125J00194:100..202           Points to bases 100 to 202, inclusive, in the entry
126                          (in this database) with primary accession number
127                          'J00194'
128         
129         */
130        
131        rank = 0;
132        return parseLocString(featureNS, featureAccession, null, Strand.POSITIVE_STRAND, locationString);
133    }
134    
135    // O beautiful regex, we worship you.
136    // this matches grouped locations
137    private static Pattern gp = Pattern.compile("^([^\\(\\):]*?:)?(complement|join|order)?\\({0,1}(.*?\\)*{0,1})$");
138    // this matches range locations
139    private static Pattern rp = Pattern.compile("^\\(*(.*?)\\)*(\\.\\.\\(*(.*)\\)*)?$");
140    // this matches accession/version pairs
141    private static Pattern xp = Pattern.compile("^(.*?)(\\.(\\d+))?:$");
142    // this matches a single base position 
143    private static Pattern pp = Pattern.compile("^\\(*(<|>)?(\\d+)(([\\.\\^])(\\d+))?(<|>)?\\)*$");
144    // used to assign an ascending rank to each location found
145    private static int rank;
146    
147    // this function allows us to recursively parse bracketed groups
148    private static RichLocation parseLocString(Namespace featureNS, String featureAccession, CrossRef parentXref, Strand parentStrand, String locStr) throws ParseException {
149        // First attempt to find the group enclosing everything we've been passed        
150        Matcher gm = gp.matcher(locStr);
151        if (!gm.matches()) {
152            String message = ParseException.newMessage(GenbankLocationParser.class, featureAccession, "unknown", "Bad location string found", locStr);
153            throw new ParseException(message);
154        }
155        String xrefName = gm.group(1);
156        String groupType = gm.group(2);
157        String subLocStr = gm.group(3);
158
159        // The group may have a crossref. If it doesn't, use the parent crossref instead.
160        CrossRef crossRef = parentXref;
161        if (xrefName!=null) {
162            
163            // Try and find an accession (crossref)
164            Matcher xm = xp.matcher(xrefName);
165            if (!xm.matches()) {
166                String message = ParseException.newMessage(GenbankLocationParser.class, featureAccession, "unknown", "Bad location xref found", locStr);
167                throw new ParseException(message);
168            }
169            String xrefAccession = xm.group(1);
170            String xrefVersion = xm.group(3);
171            if (xrefAccession.equals(featureAccession)) crossRef = null;
172            else {
173                // Found an accession, but does it have a version?
174                if (xrefVersion!=null) {
175                    crossRef = (SimpleCrossRef)RichObjectFactory.getObject(SimpleCrossRef.class,new Object[]{
176                        featureNS.getName(),xrefAccession,Integer.valueOf(xrefVersion)
177                    });
178                } else {
179                    crossRef = (SimpleCrossRef)RichObjectFactory.getObject(SimpleCrossRef.class,new Object[]{
180                        featureNS.getName(),xrefAccession,new Integer(0)
181                    });
182                }
183            }
184        }
185        
186        // The group may actually be a strand. If it isn't, or if it is the same strand as the parent, 
187        // use the parent strand instead. 
188        Strand strand = parentStrand;
189        if (groupType!=null) {
190            
191            // Is it a strand group?
192            if (groupType.equalsIgnoreCase("complement")) {
193                // It's a complement location
194                if (parentStrand==Strand.NEGATIVE_STRAND) strand = Strand.POSITIVE_STRAND;
195                else strand = Strand.NEGATIVE_STRAND;
196                
197                // Get the parsed contents of the complement 'group'
198                List resultBlocks = new ArrayList(RichLocation.Tools.flatten(parseLocString(featureNS,featureAccession,crossRef,strand,subLocStr)));
199                // Work out the rank of the first block.
200                int firstRank = ((RichLocation)resultBlocks.get(0)).getRank();
201                // Reverse the order of its members c(j(x,y)) = j(cy,cx)
202                Collections.reverse(resultBlocks);
203                // Reset ranks so that they persist and retrieve to BioSQL in the correct order.
204                for (Iterator i = resultBlocks.iterator(); i.hasNext(); ) {
205                    RichLocation rl = (RichLocation)i.next();
206                    try {
207                        rl.setRank(firstRank++);
208                    } catch (ChangeVetoException e) {
209                        throw new ParseException(e);
210                    }
211                }
212                return RichLocation.Tools.construct(resultBlocks);
213            } 
214            
215            // Otherwise, it's a compound location
216            else {
217                ComparableTerm groupTypeTerm = null;
218                if (groupType.equalsIgnoreCase("order")) groupTypeTerm = CompoundRichLocation.getOrderTerm();
219                else if (groupType.equalsIgnoreCase("join")) groupTypeTerm = CompoundRichLocation.getJoinTerm();
220                else {
221                    String message = ParseException.newMessage(GenbankLocationParser.class, featureAccession, "unknown", "Unknown group type found", locStr);
222                    throw new ParseException(message);
223                }
224                
225                // recurse on each block and return the compounded result
226                List members = new ArrayList();                
227                StringBuffer sb = new StringBuffer();
228                char[] chars = subLocStr.toCharArray();
229                int bracketCount = 0;
230                try{
231                for (int i = 0; i < chars.length; i++) {
232                    char c = chars[i];
233                    if (c=='(') bracketCount++;
234                    else if (c==')') bracketCount--;
235                    if (c==',' && bracketCount==0) {
236                        String subStr = sb.toString();
237                        members.add(parseLocString(featureNS,featureAccession,crossRef,parentStrand,subStr));
238                        sb.setLength(0); // reset buffer
239                    } else sb.append(c);
240                }
241                }catch(RuntimeException e){
242                    String message = ParseException.newMessage(GenbankLocationParser.class, featureAccession, "unknown", "Problem with location format", locStr);
243                    throw new ParseException(message);
244                }
245                if (sb.length()>0) members.add(parseLocString(featureNS,featureAccession,crossRef,parentStrand,sb.toString()));
246                
247                // Merge the members of the group
248                RichLocation result = RichLocation.Tools.construct(RichLocation.Tools.merge(members));
249                
250                // Set the group term if the result was a group.
251                try {
252                    if (result instanceof CompoundRichLocation) result.setTerm(groupTypeTerm);
253                } catch (ChangeVetoException e) {
254                    ParseException e2 = new ParseException("Unable to set group term");
255                    e2.initCause(e);
256                    throw e2;
257                }
258                
259                return result;
260            }
261        }
262        
263        // Process a simple location.
264        Matcher rm = rp.matcher(subLocStr);
265        if (!rm.matches()) {
266            String message = ParseException.newMessage(GenbankLocationParser.class, featureAccession, "unknown", "Bad location description found", subLocStr);
267            throw new ParseException(message);
268        }
269        String start = rm.group(1);
270        String end = rm.group(3);
271        
272        Position startPos = parsePosition(start);
273        if (end==null) {
274            // A point location
275            return new SimpleRichLocation(startPos,startPos,++rank,strand,crossRef);
276        } else {
277            // A range location
278            Position endPos = parsePosition(end);
279            return new SimpleRichLocation(startPos,endPos,++rank,strand,crossRef);
280        }
281    }
282    
283    // this function parses a single position - usually just half of one location
284    private static Position parsePosition(String position) throws ParseException {
285        Matcher pm = pp.matcher(position);
286        if (!pm.matches()) throw new ParseException("Could not understand position: "+position);
287        String startfuzz = pm.group(1);
288        String endfuzz = pm.group(6);
289        boolean endStartsFuzzy = ((startfuzz!=null && startfuzz.equals("<")) || (endfuzz!=null && endfuzz.equals("<")));
290        boolean endEndsFuzzy = ((endfuzz!=null && endfuzz.equals(">")) || (startfuzz!=null && startfuzz.equals(">")));
291        String endStart = pm.group(2);
292        String endRangeType = pm.group(4);
293        String endEnd = pm.group(5);
294        if (endRangeType!=null) {
295            // fuzziest
296            return new SimplePosition(endStartsFuzzy,endEndsFuzzy,Integer.parseInt(endStart),Integer.parseInt(endEnd),endRangeType);
297        } else {
298            // less fuzzy
299            return new SimplePosition(endStartsFuzzy,endEndsFuzzy,Integer.parseInt(endStart));
300        }
301    }
302
303    /**
304     * Writes a location in Genbank format.
305     * @param l the location to write
306     * @return the formatted string representing the location.
307     */
308    public static String writeLocation(RichLocation l) {
309        //write out location text
310        //use crossrefs to calculate remote location positions
311        //one big join (or order?) with complemented parts
312        if (l instanceof CompoundRichLocation) {
313//            return _writeGroupLocation(l.blockIterator(),l.getTerm());
314            return writeCompoundLocation((CompoundRichLocation) l);
315        } else {
316            return _writeSingleLocation(l);
317        }
318    }
319    
320    // writes out a single position
321    private static String _writePosition(Position p) {
322        StringBuffer sb = new StringBuffer();
323        int s = p.getStart();
324        int e = p.getEnd();
325        String t = p.getType();
326        boolean fs = p.getFuzzyStart();
327        boolean fe = p.getFuzzyEnd();
328        boolean useParenthesis=isUsingParenthesis(p);
329//      System.out.println("GenbankLocationParser._writePosition-p: ["+p+"], s:"+s+", e:"+e+", t: ["+t+"], fs? "+fs+", fe? "+fe);
330        if (s!=e) {
331            // a range - put in brackets
332            if (useParenthesis) sb.append("(");
333            if (fs) sb.append("<");
334            sb.append(s);
335            sb.append(t);
336            if (fe) sb.append(">");
337            sb.append(e);
338            if (useParenthesis) sb.append(")");
339        } else {
340            // not a range - no brackets
341            if (fs) sb.append("<");
342            if (fe) sb.append(">");
343            sb.append(s);
344        }
345        return sb.toString();
346    }
347    
348    // write out a single location
349    private static String _writeSingleLocation(RichLocation l) {
350        StringBuffer loc = new StringBuffer();
351        if (l.getCrossRef()!=null) {
352            loc.append(l.getCrossRef().getAccession());
353            final int version = l.getCrossRef().getVersion();
354            if (version!=0) {
355                loc.append(".");
356                loc.append(version);
357            }
358            loc.append(":");
359        }
360        loc.append(_writePosition(l.getMinPosition()));
361        if (!l.getMinPosition().equals(l.getMaxPosition())) {
362            loc.append("..");
363            loc.append(_writePosition(l.getMaxPosition()));
364        }
365        if (l.getStrand()==Strand.NEGATIVE_STRAND) {
366            loc.insert(0,"complement(");
367            loc.append(")");
368        }
369//        if (l.getCrossRef()!=null) {
370//            loc.insert(0,":");
371//            int version = l.getCrossRef().getVersion();
372//            if (version!=0) {
373//                loc.insert(0,version);
374//                loc.insert(0,".");
375//            }
376//            loc.insert(0,l.getCrossRef().getAccession());
377//        }
378        return loc.toString();
379    }
380    
381    // write out a group location
382    private static String _writeGroupLocation(Iterator i, ComparableTerm parentTerm) {
383        StringBuffer sb = new StringBuffer();
384        sb.append(parentTerm.getName());
385        sb.append("(");
386        while (i.hasNext()) {
387            RichLocation l = (RichLocation)i.next();
388            if (l instanceof CompoundRichLocation) {
389                sb.append(_writeGroupLocation(l.blockIterator(),l.getTerm()));
390            } else {
391                sb.append(_writeSingleLocation(l));
392            }
393            if (i.hasNext()) sb.append(",");
394        }
395        sb.append(")");
396        return sb.toString();
397    }
398    
399    private final static String writeCompoundLocation(final CompoundRichLocation theLocation) {
400        if (isAnyLocationComplemented(theLocation)) {
401                if (isAnyLocationOnDifferentStrand(theLocation)) {
402                        return _writeGroupLocation(theLocation.blockIterator(), theLocation.getTerm());
403                } else {
404                        return writeComplementLocation(theLocation.blockIterator());
405                }
406        } else {
407                return _writeGroupLocation(theLocation.blockIterator(), theLocation.getTerm());
408        }
409        
410    }
411    
412    private final static String writeComplementLocation(Iterator i) {
413        StringBuffer sb = new StringBuffer();
414        while (i.hasNext()) {
415            RichLocation l = (RichLocation)i.next();
416            if (l instanceof CompoundRichLocation) {
417                sb.insert(0, writeCompoundLocation((CompoundRichLocation) l));
418            } else {
419                sb.insert(0, writeSingleLocation(l));
420            }
421            if (i.hasNext()) sb.insert(0, ",");
422        }
423        sb.insert(0, "complement(join(");
424        sb.append("))");
425        return sb.toString();
426    }
427
428    private final static String writeSingleLocation(final RichLocation theRichLocation) {
429        StringBuffer loc = new StringBuffer();
430        if (theRichLocation.getCrossRef()!=null) {
431            loc.append(theRichLocation.getCrossRef().getAccession());
432            final int version = theRichLocation.getCrossRef().getVersion();
433            if (version!=0) {
434                loc.append(".");
435                loc.append(version);
436            }
437            loc.append(":");
438        }
439        loc.append(_writePosition(theRichLocation.getMinPosition()));
440        if (!theRichLocation.getMinPosition().equals(theRichLocation.getMaxPosition())) {
441            loc.append("..");
442            loc.append(_writePosition(theRichLocation.getMaxPosition()));
443        }
444        return loc.toString();
445    }
446    
447    private final static boolean isAnyLocationComplemented(final CompoundRichLocation theLocation) {
448        final Iterator c = theLocation.blockIterator();
449        while (c.hasNext()) {
450                final RichLocation location = (RichLocation) c.next();
451                if (location.getStrand()==Strand.NEGATIVE_STRAND) return true;
452        }
453        return false;
454    }
455    
456    private final static boolean isAnyLocationOnDifferentStrand(final CompoundRichLocation theLocation) {
457        final Iterator c = theLocation.blockIterator();
458        final Strand firstStrand = ((RichLocation) c.next()).getStrand();
459        while (c.hasNext()) {
460                final RichLocation location = (RichLocation) c.next();
461                if (location.getStrand() != firstStrand) return true;
462        }
463        return false;
464    }
465    
466    private final static boolean isUsingParenthesis(final Position thePosition) {
467        return !isBetweenBases(thePosition);
468    }
469    
470    private final static boolean isBetweenBases(final Position thePosition) {
471        return thePosition.getType() != null && thePosition.getType().equals(Position.BETWEEN_BASES);
472    }
473}