001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021 022package org.biojavax.bio.seq.io; 023 024import java.util.ArrayList; 025import java.util.Collections; 026import java.util.Iterator; 027import java.util.List; 028import java.util.regex.Matcher; 029import java.util.regex.Pattern; 030 031import org.biojava.bio.seq.io.ParseException; 032import org.biojava.utils.ChangeVetoException; 033import org.biojavax.CrossRef; 034import org.biojavax.Namespace; 035import org.biojavax.RichObjectFactory; 036import org.biojavax.SimpleCrossRef; 037import org.biojavax.bio.seq.CompoundRichLocation; 038import org.biojavax.bio.seq.Position; 039import org.biojavax.bio.seq.RichLocation; 040import org.biojavax.bio.seq.SimplePosition; 041import org.biojavax.bio.seq.SimpleRichLocation; 042import org.biojavax.bio.seq.RichLocation.Strand; 043import org.biojavax.ontology.ComparableTerm; 044 045 046/** 047 * Parses Genbank location strings into RichLocation objects. 048 * @author Richard Holland 049 * @authour Deepak Sheoran 050 * @since 1.5 051 */ 052public class GenbankLocationParser { 053 054 // No instances please 055 private GenbankLocationParser() {} 056 057 /** 058 * Parses a location. 059 * @param featureNS the namespace of the feature this location lives on. 060 * @param featureAccession the accession of the sequence of the feature this location lives on. 061 * @param locationString the GenBank location string. 062 * @return RichLocation the equivalent RichLocation object. 063 * @throws ParseException if the parsing failed. 064 */ 065 public static RichLocation parseLocation(Namespace featureNS, String featureAccession, String locationString) throws ParseException { 066 /* 067 068 FROM GENBANK FEATURE TABLE DOCS 069 0703.5.3 Location examples 071The following is a list of common location descriptors with their meanings: 072Location Description 073 074467 Points to a single base in the presented sequence 075 076340..565 Points to a continuous range of bases bounded by and 077 including the starting and ending bases 078 079<345..500 Indicates that the exact lower boundary point of a 080 feature is unknown. The location begins at some 081 base previous to the first base specified (which need 082 not be contained in the presented sequence) and con- 083 tinues to and includes the ending base 084 085<1..888 The feature starts before the first sequenced base 086 and continues to and includes base 888 087 088(102.110) Indicates that the exact location is unknown but that 089 it is one of the bases between bases 102 and 110, in- 090 clusive 091 092(23.45)..600 Specifies that the starting point is one of the bases 093 between bases 23 and 45, inclusive, and the end point 094 is base 600 095 096(122.133)..(204.221) The feature starts at a base between 122 and 133, 097 inclusive, and ends at a base between 204 and 221, 098 inclusive 099 100123^124 Points to a site between bases 123 and 124 101 102145^177 Points to a site between two adjacent bases anywhere 103 between bases 145 and 177 104 105join(12..78,134..202) Regions 12 to 78 and 134 to 202 should be joined to 106 form one contiguous sequence 107 108complement(1..23) Complements region 1 to 23 109 110complement(join(2691..4571,4918..5163) 111 Joins regions 2691 to 4571 and 4918 to 5163, then 112 complements the joined segments (the feature is 113 on the strand complementary to the presented strand) 114 115join(complement(4918..5163),complement(2691..4571)) 116 Complements regions 4918 to 5163 and 2691 to 4571, 117 then joins the complemented segments (the feature is 118 on the strand complementary to the presented strand) 119 120complement(34..(122.126)) Start at one of the bases complementary to those 121 between 122 and 126 on the presented strand and finish 122 at the base complementary to base 34 (the feature is 123 on the strand complementary to the presented strand) 124 125J00194:100..202 Points to bases 100 to 202, inclusive, in the entry 126 (in this database) with primary accession number 127 'J00194' 128 129 */ 130 131 rank = 0; 132 return parseLocString(featureNS, featureAccession, null, Strand.POSITIVE_STRAND, locationString); 133 } 134 135 // O beautiful regex, we worship you. 136 // this matches grouped locations 137 private static Pattern gp = Pattern.compile("^([^\\(\\):]*?:)?(complement|join|order)?\\({0,1}(.*?\\)*{0,1})$"); 138 // this matches range locations 139 private static Pattern rp = Pattern.compile("^\\(*(.*?)\\)*(\\.\\.\\(*(.*)\\)*)?$"); 140 // this matches accession/version pairs 141 private static Pattern xp = Pattern.compile("^(.*?)(\\.(\\d+))?:$"); 142 // this matches a single base position 143 private static Pattern pp = Pattern.compile("^\\(*(<|>)?(\\d+)(([\\.\\^])(\\d+))?(<|>)?\\)*$"); 144 // used to assign an ascending rank to each location found 145 private static int rank; 146 147 // this function allows us to recursively parse bracketed groups 148 private static RichLocation parseLocString(Namespace featureNS, String featureAccession, CrossRef parentXref, Strand parentStrand, String locStr) throws ParseException { 149 // First attempt to find the group enclosing everything we've been passed 150 Matcher gm = gp.matcher(locStr); 151 if (!gm.matches()) { 152 String message = ParseException.newMessage(GenbankLocationParser.class, featureAccession, "unknown", "Bad location string found", locStr); 153 throw new ParseException(message); 154 } 155 String xrefName = gm.group(1); 156 String groupType = gm.group(2); 157 String subLocStr = gm.group(3); 158 159 // The group may have a crossref. If it doesn't, use the parent crossref instead. 160 CrossRef crossRef = parentXref; 161 if (xrefName!=null) { 162 163 // Try and find an accession (crossref) 164 Matcher xm = xp.matcher(xrefName); 165 if (!xm.matches()) { 166 String message = ParseException.newMessage(GenbankLocationParser.class, featureAccession, "unknown", "Bad location xref found", locStr); 167 throw new ParseException(message); 168 } 169 String xrefAccession = xm.group(1); 170 String xrefVersion = xm.group(3); 171 if (xrefAccession.equals(featureAccession)) crossRef = null; 172 else { 173 // Found an accession, but does it have a version? 174 if (xrefVersion!=null) { 175 crossRef = (SimpleCrossRef)RichObjectFactory.getObject(SimpleCrossRef.class,new Object[]{ 176 featureNS.getName(),xrefAccession,Integer.valueOf(xrefVersion) 177 }); 178 } else { 179 crossRef = (SimpleCrossRef)RichObjectFactory.getObject(SimpleCrossRef.class,new Object[]{ 180 featureNS.getName(),xrefAccession,new Integer(0) 181 }); 182 } 183 } 184 } 185 186 // The group may actually be a strand. If it isn't, or if it is the same strand as the parent, 187 // use the parent strand instead. 188 Strand strand = parentStrand; 189 if (groupType!=null) { 190 191 // Is it a strand group? 192 if (groupType.equalsIgnoreCase("complement")) { 193 // It's a complement location 194 if (parentStrand==Strand.NEGATIVE_STRAND) strand = Strand.POSITIVE_STRAND; 195 else strand = Strand.NEGATIVE_STRAND; 196 197 // Get the parsed contents of the complement 'group' 198 List resultBlocks = new ArrayList(RichLocation.Tools.flatten(parseLocString(featureNS,featureAccession,crossRef,strand,subLocStr))); 199 // Work out the rank of the first block. 200 int firstRank = ((RichLocation)resultBlocks.get(0)).getRank(); 201 // Reverse the order of its members c(j(x,y)) = j(cy,cx) 202 Collections.reverse(resultBlocks); 203 // Reset ranks so that they persist and retrieve to BioSQL in the correct order. 204 for (Iterator i = resultBlocks.iterator(); i.hasNext(); ) { 205 RichLocation rl = (RichLocation)i.next(); 206 try { 207 rl.setRank(firstRank++); 208 } catch (ChangeVetoException e) { 209 throw new ParseException(e); 210 } 211 } 212 return RichLocation.Tools.construct(resultBlocks); 213 } 214 215 // Otherwise, it's a compound location 216 else { 217 ComparableTerm groupTypeTerm = null; 218 if (groupType.equalsIgnoreCase("order")) groupTypeTerm = CompoundRichLocation.getOrderTerm(); 219 else if (groupType.equalsIgnoreCase("join")) groupTypeTerm = CompoundRichLocation.getJoinTerm(); 220 else { 221 String message = ParseException.newMessage(GenbankLocationParser.class, featureAccession, "unknown", "Unknown group type found", locStr); 222 throw new ParseException(message); 223 } 224 225 // recurse on each block and return the compounded result 226 List members = new ArrayList(); 227 StringBuffer sb = new StringBuffer(); 228 char[] chars = subLocStr.toCharArray(); 229 int bracketCount = 0; 230 try{ 231 for (int i = 0; i < chars.length; i++) { 232 char c = chars[i]; 233 if (c=='(') bracketCount++; 234 else if (c==')') bracketCount--; 235 if (c==',' && bracketCount==0) { 236 String subStr = sb.toString(); 237 members.add(parseLocString(featureNS,featureAccession,crossRef,parentStrand,subStr)); 238 sb.setLength(0); // reset buffer 239 } else sb.append(c); 240 } 241 }catch(RuntimeException e){ 242 String message = ParseException.newMessage(GenbankLocationParser.class, featureAccession, "unknown", "Problem with location format", locStr); 243 throw new ParseException(message); 244 } 245 if (sb.length()>0) members.add(parseLocString(featureNS,featureAccession,crossRef,parentStrand,sb.toString())); 246 247 // Merge the members of the group 248 RichLocation result = RichLocation.Tools.construct(RichLocation.Tools.merge(members)); 249 250 // Set the group term if the result was a group. 251 try { 252 if (result instanceof CompoundRichLocation) result.setTerm(groupTypeTerm); 253 } catch (ChangeVetoException e) { 254 ParseException e2 = new ParseException("Unable to set group term"); 255 e2.initCause(e); 256 throw e2; 257 } 258 259 return result; 260 } 261 } 262 263 // Process a simple location. 264 Matcher rm = rp.matcher(subLocStr); 265 if (!rm.matches()) { 266 String message = ParseException.newMessage(GenbankLocationParser.class, featureAccession, "unknown", "Bad location description found", subLocStr); 267 throw new ParseException(message); 268 } 269 String start = rm.group(1); 270 String end = rm.group(3); 271 272 Position startPos = parsePosition(start); 273 if (end==null) { 274 // A point location 275 return new SimpleRichLocation(startPos,startPos,++rank,strand,crossRef); 276 } else { 277 // A range location 278 Position endPos = parsePosition(end); 279 return new SimpleRichLocation(startPos,endPos,++rank,strand,crossRef); 280 } 281 } 282 283 // this function parses a single position - usually just half of one location 284 private static Position parsePosition(String position) throws ParseException { 285 Matcher pm = pp.matcher(position); 286 if (!pm.matches()) throw new ParseException("Could not understand position: "+position); 287 String startfuzz = pm.group(1); 288 String endfuzz = pm.group(6); 289 boolean endStartsFuzzy = ((startfuzz!=null && startfuzz.equals("<")) || (endfuzz!=null && endfuzz.equals("<"))); 290 boolean endEndsFuzzy = ((endfuzz!=null && endfuzz.equals(">")) || (startfuzz!=null && startfuzz.equals(">"))); 291 String endStart = pm.group(2); 292 String endRangeType = pm.group(4); 293 String endEnd = pm.group(5); 294 if (endRangeType!=null) { 295 // fuzziest 296 return new SimplePosition(endStartsFuzzy,endEndsFuzzy,Integer.parseInt(endStart),Integer.parseInt(endEnd),endRangeType); 297 } else { 298 // less fuzzy 299 return new SimplePosition(endStartsFuzzy,endEndsFuzzy,Integer.parseInt(endStart)); 300 } 301 } 302 303 /** 304 * Writes a location in Genbank format. 305 * @param l the location to write 306 * @return the formatted string representing the location. 307 */ 308 public static String writeLocation(RichLocation l) { 309 //write out location text 310 //use crossrefs to calculate remote location positions 311 //one big join (or order?) with complemented parts 312 if (l instanceof CompoundRichLocation) { 313// return _writeGroupLocation(l.blockIterator(),l.getTerm()); 314 return writeCompoundLocation((CompoundRichLocation) l); 315 } else { 316 return _writeSingleLocation(l); 317 } 318 } 319 320 // writes out a single position 321 private static String _writePosition(Position p) { 322 StringBuffer sb = new StringBuffer(); 323 int s = p.getStart(); 324 int e = p.getEnd(); 325 String t = p.getType(); 326 boolean fs = p.getFuzzyStart(); 327 boolean fe = p.getFuzzyEnd(); 328 boolean useParenthesis=isUsingParenthesis(p); 329// System.out.println("GenbankLocationParser._writePosition-p: ["+p+"], s:"+s+", e:"+e+", t: ["+t+"], fs? "+fs+", fe? "+fe); 330 if (s!=e) { 331 // a range - put in brackets 332 if (useParenthesis) sb.append("("); 333 if (fs) sb.append("<"); 334 sb.append(s); 335 sb.append(t); 336 if (fe) sb.append(">"); 337 sb.append(e); 338 if (useParenthesis) sb.append(")"); 339 } else { 340 // not a range - no brackets 341 if (fs) sb.append("<"); 342 if (fe) sb.append(">"); 343 sb.append(s); 344 } 345 return sb.toString(); 346 } 347 348 // write out a single location 349 private static String _writeSingleLocation(RichLocation l) { 350 StringBuffer loc = new StringBuffer(); 351 if (l.getCrossRef()!=null) { 352 loc.append(l.getCrossRef().getAccession()); 353 final int version = l.getCrossRef().getVersion(); 354 if (version!=0) { 355 loc.append("."); 356 loc.append(version); 357 } 358 loc.append(":"); 359 } 360 loc.append(_writePosition(l.getMinPosition())); 361 if (!l.getMinPosition().equals(l.getMaxPosition())) { 362 loc.append(".."); 363 loc.append(_writePosition(l.getMaxPosition())); 364 } 365 if (l.getStrand()==Strand.NEGATIVE_STRAND) { 366 loc.insert(0,"complement("); 367 loc.append(")"); 368 } 369// if (l.getCrossRef()!=null) { 370// loc.insert(0,":"); 371// int version = l.getCrossRef().getVersion(); 372// if (version!=0) { 373// loc.insert(0,version); 374// loc.insert(0,"."); 375// } 376// loc.insert(0,l.getCrossRef().getAccession()); 377// } 378 return loc.toString(); 379 } 380 381 // write out a group location 382 private static String _writeGroupLocation(Iterator i, ComparableTerm parentTerm) { 383 StringBuffer sb = new StringBuffer(); 384 sb.append(parentTerm.getName()); 385 sb.append("("); 386 while (i.hasNext()) { 387 RichLocation l = (RichLocation)i.next(); 388 if (l instanceof CompoundRichLocation) { 389 sb.append(_writeGroupLocation(l.blockIterator(),l.getTerm())); 390 } else { 391 sb.append(_writeSingleLocation(l)); 392 } 393 if (i.hasNext()) sb.append(","); 394 } 395 sb.append(")"); 396 return sb.toString(); 397 } 398 399 private final static String writeCompoundLocation(final CompoundRichLocation theLocation) { 400 if (isAnyLocationComplemented(theLocation)) { 401 if (isAnyLocationOnDifferentStrand(theLocation)) { 402 return _writeGroupLocation(theLocation.blockIterator(), theLocation.getTerm()); 403 } else { 404 return writeComplementLocation(theLocation.blockIterator()); 405 } 406 } else { 407 return _writeGroupLocation(theLocation.blockIterator(), theLocation.getTerm()); 408 } 409 410 } 411 412 private final static String writeComplementLocation(Iterator i) { 413 StringBuffer sb = new StringBuffer(); 414 while (i.hasNext()) { 415 RichLocation l = (RichLocation)i.next(); 416 if (l instanceof CompoundRichLocation) { 417 sb.insert(0, writeCompoundLocation((CompoundRichLocation) l)); 418 } else { 419 sb.insert(0, writeSingleLocation(l)); 420 } 421 if (i.hasNext()) sb.insert(0, ","); 422 } 423 sb.insert(0, "complement(join("); 424 sb.append("))"); 425 return sb.toString(); 426 } 427 428 private final static String writeSingleLocation(final RichLocation theRichLocation) { 429 StringBuffer loc = new StringBuffer(); 430 if (theRichLocation.getCrossRef()!=null) { 431 loc.append(theRichLocation.getCrossRef().getAccession()); 432 final int version = theRichLocation.getCrossRef().getVersion(); 433 if (version!=0) { 434 loc.append("."); 435 loc.append(version); 436 } 437 loc.append(":"); 438 } 439 loc.append(_writePosition(theRichLocation.getMinPosition())); 440 if (!theRichLocation.getMinPosition().equals(theRichLocation.getMaxPosition())) { 441 loc.append(".."); 442 loc.append(_writePosition(theRichLocation.getMaxPosition())); 443 } 444 return loc.toString(); 445 } 446 447 private final static boolean isAnyLocationComplemented(final CompoundRichLocation theLocation) { 448 final Iterator c = theLocation.blockIterator(); 449 while (c.hasNext()) { 450 final RichLocation location = (RichLocation) c.next(); 451 if (location.getStrand()==Strand.NEGATIVE_STRAND) return true; 452 } 453 return false; 454 } 455 456 private final static boolean isAnyLocationOnDifferentStrand(final CompoundRichLocation theLocation) { 457 final Iterator c = theLocation.blockIterator(); 458 final Strand firstStrand = ((RichLocation) c.next()).getStrand(); 459 while (c.hasNext()) { 460 final RichLocation location = (RichLocation) c.next(); 461 if (location.getStrand() != firstStrand) return true; 462 } 463 return false; 464 } 465 466 private final static boolean isUsingParenthesis(final Position thePosition) { 467 return !isBetweenBases(thePosition); 468 } 469 470 private final static boolean isBetweenBases(final Position thePosition) { 471 return thePosition.getType() != null && thePosition.getType().equals(Position.BETWEEN_BASES); 472 } 473}