001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021 022package org.biojava.bio.seq.io; 023 024import java.util.ArrayList; 025import java.util.List; 026import java.util.Map; 027 028import org.biojava.bio.BioError; 029import org.biojava.bio.BioException; 030import org.biojava.bio.seq.Feature; 031import org.biojava.bio.seq.RemoteFeature; 032import org.biojava.bio.seq.StrandedFeature; 033import org.biojava.bio.symbol.BetweenLocation; 034import org.biojava.bio.symbol.FuzzyLocation; 035import org.biojava.bio.symbol.FuzzyPointLocation; 036import org.biojava.bio.symbol.Location; 037import org.biojava.bio.symbol.LocationTools; 038import org.biojava.bio.symbol.PointLocation; 039import org.biojava.bio.symbol.RangeLocation; 040import org.biojava.utils.ChangeVetoException; 041import org.biojava.utils.SmallMap; 042 043/** 044 * <code>EmblLikeLocationParser</code> parses EMBL/Genbank style 045 * locations. Supported location forms: 046 * 047 * <pre> 048 * 123 049 * <123 or >123 050 * (123.567) 051 * (123.567)..789 052 * 123..(567.789) 053 * (123.345)..(567.789) 054 * 123..456 055 * <123..567 or 123..>567 or <123..>567 056 * 123^567 057 * AL123465:(123..567) 058 * </pre> 059 * 060 * @author Keith James 061 * @author Greg Cox 062 * @since 1.2 063 * @deprecated Use org.biojavax.bio.seq.io framework instead 064 */ 065public class EmblLikeLocationParser 066{ 067 // For the LocationLexer inner classs 068 private String location; 069 private LocationLexer lexer; 070 private int nextCharIndex; 071 private Object thisToken; 072 073 // seq ID of the sequence we are parsing features for 074 private String parentSeqID; 075 076 // Stores join/order/complement instructions 077 private List instructStack = new ArrayList(); 078 // joinType is a hack to store if a compound location is a 079 // join(...) or an order(...) location. If this isn't sufficient 080 // for your needs, feel free to fix it. If you do, please make 081 // sure the AbstractGenEmblFileFormer class is still able to 082 // format join and order correctly. The joinType is stored in the 083 // Feature Annotation under the internal data key 084 // Feature.PROPERTY_DATA_KEY which means that it won't get printed 085 // in flatfile dumps. 086 private String joinType = null; 087 088 // List of sublocations. Used for compound locations on the current 089 // sequence 090 private List subLocations = new ArrayList(); 091 // List of subRegions. Used to store remote regions 092 private List subRegions = new ArrayList(); 093 094 // These hold working data for each (sub)location and are cleared 095 // by calling the processCoords() function 096 private String mRegionSeqID; 097 private List startCoords = new ArrayList(); 098 private List endCoords = new ArrayList(); 099 private boolean isPointLoc = true; 100 private boolean fuzzyCoord = false; 101 private boolean unboundMin = false; 102 private boolean unboundMax = false; 103 private boolean isBetweenLocation = false; 104 105 // Currently set per Feature; this is a deficiency in the current 106 // parser. Features are assumed to be on the positive strand until 107 // complemented. 108 // No features have a strand type of UNKNOWN 109 private StrandedFeature.Strand mStrandType = StrandedFeature.POSITIVE; 110 111 EmblLikeLocationParser(String parentSeqID) 112 { 113 this.lexer = new LocationLexer(); 114 this.parentSeqID = parentSeqID; 115 } 116 117 /** 118 * <code>parseLocation</code> creates a <code>Location</code> from 119 * the String and returns a stranded location. 120 * 121 * @param location a location <code>String</code>. 122 * @param theTemplate the template to be filled with the parsed out location 123 * information. 124 * 125 * @exception BioException if an error occurs. 126 */ 127 public Feature.Template parseLocation(String location, Feature.Template theTemplate) 128 throws BioException 129 { 130 this.location = location; 131 132 //fixme: mrp: removed this check - it may be killing performance 133 //if ((countChar(location, '(')) != (countChar(location, ')'))) 134 // throw new BioException("Unbalanced parentheses in location: " 135 // + location); 136 137 nextCharIndex = 0; 138 139 instructStack.clear(); 140 subLocations.clear(); 141 subRegions.clear(); 142 143 // 'join' vs. 'order' 144 joinType = null; 145 146 thisToken = lexer.getNextToken(); 147 while (thisToken != null) 148 { 149 if (String.class.isInstance(thisToken)) 150 { 151 String toke = (String) thisToken; 152 if (toke.equals("..")) 153 { 154 // This token indicates that this isn't a point 155 isPointLoc = false; 156 } 157 else 158 { 159 instructStack.add(thisToken); 160 } 161 } 162 else if (Integer.class.isInstance(thisToken)) 163 { 164 if (isPointLoc) 165 startCoords.add(thisToken); 166 else 167 endCoords.add(thisToken); 168 } 169 else if (Character.class.isInstance(thisToken)) 170 { 171 char toke = ((Character) thisToken).charValue(); 172 173 switch (toke) 174 { 175 case '(': 176 break; 177 178 case ':': 179 processInstructs(); 180 break; 181 182 case '^': 183 isBetweenLocation = true; 184 break; 185 186 case '<': 187 unboundMin = true; 188 break; 189 190 case '>': 191 unboundMax = true; 192 break; 193 194 case '.': 195 // Catch range: (123.567) 196 fuzzyCoord = true; 197 break; 198 199 case ',': 200 processCoords(); 201 break; 202 203 case ')': 204 // Catch the end of range: (123.567) 205 if (fuzzyCoord) 206 { 207 fuzzyCoord = false; 208 } 209 else 210 { 211 processCoords(); 212 processInstructs(); 213 } 214 break; 215 216 default: 217 throw new BioException("Unknown character '" 218 + toke 219 + "' within location: " 220 + location); 221 } 222 } 223 thisToken = lexer.getNextToken(); 224 } 225 processCoords(); 226 227 // The location has been processed, and now the template gets filled 228 if (subLocations.size() == 1) 229 { 230 theTemplate.location = (Location)subLocations.get(0); 231 } 232 else 233 { 234 // EMBL ordering is in reverse on the complementary strand 235 // but LocationTools sorts them anyway 236 theTemplate.location = LocationTools.union(subLocations); 237 } 238 239 if (theTemplate instanceof StrandedFeature.Template) 240 { 241 ((StrandedFeature.Template) theTemplate).strand = mStrandType; 242 } 243 244 if (subRegions.size() > subLocations.size()) 245 { 246 // This is a remote feature, so a new template has to be made 247 RemoteFeature.Template newTemplate = new RemoteFeature.Template(theTemplate); 248 newTemplate.regions = new ArrayList(subRegions); 249// FIXME: 250// I don't know how to create an appropriate resolver, so I'm leaving it 251// blank. No doubt this will break things. 252// -- Gcox 253 newTemplate.resolver = null; 254 255 theTemplate = newTemplate; 256 } 257 258 if (joinType != null) 259 { 260 try 261 { 262 // Feature.PROPERTY_DATA_KEY signifies internal (not 263 // for flatfile) data 264 Map dat = new SmallMap(); 265 dat.put("JoinType", joinType); 266 theTemplate.annotation.setProperty(Feature.PROPERTY_DATA_KEY, 267 dat); 268 } 269 catch (ChangeVetoException cve) 270 { 271 throw new BioError(cve); 272 } 273 } 274 return theTemplate; 275 } 276 277 /** 278 * <code>processCoords</code> uses the coordinate data in the 279 * start/endCoords Lists to create a Location and add to the 280 * subLocations List. As this code will require further 281 * modification to support fuzzy point locations, please keep any 282 * changes well-commented. 283 * 284 * @exception BioException if an error occurs. 285 */ 286 private void processCoords() throws BioException 287 { 288 int outerMin, innerMin, innerMax, outerMax; 289 Location createdLocation = null; 290 291 // This is expected where two calls to processCoords() are 292 // made sequentially e.g. where two levels of parens are 293 // closed. The second call will have no data to process. 294 if (startCoords.isEmpty() && endCoords.isEmpty()) 295 return; 296 297 // Range of form 5^6 or 5^7 298 if (isBetweenLocation) 299 { 300 // Create a ranged location, and wrap it in a between location 301 int minCoord = ((Integer) startCoords.get(0)).intValue(); 302 int maxCoord = ((Integer) startCoords.get(1)).intValue(); 303 createdLocation = new BetweenLocation(new RangeLocation(minCoord, maxCoord)); 304 } 305 // Range of form: 123 306 else if (startCoords.size() == 1 && endCoords.isEmpty()) 307 { 308 innerMin = outerMin = ((Integer) startCoords.get(0)).intValue(); 309 innerMax = outerMax = innerMin; 310 311 // This looks like a point, but is actually a range which 312 // lies entirely outside the current entry 313 if (unboundMin || unboundMax) 314 { 315 createdLocation = new FuzzyPointLocation(unboundMin ? Integer.MIN_VALUE : innerMin, 316 unboundMax ? Integer.MAX_VALUE : innerMax, 317 FuzzyPointLocation.RESOLVE_AVERAGE); 318 } 319 else if (isPointLoc) 320 { 321 createdLocation = new PointLocation(innerMin); 322 } 323 else 324 { 325 throw new BioException("Internal error in location parsing; parser became confused: " 326 + location); 327 } 328 } 329 // Range of form: (123.567) 330 else if (startCoords.size() == 2 && endCoords.isEmpty()) 331 { 332 innerMin = outerMin = ((Integer) startCoords.get(0)).intValue(); 333 innerMax = outerMax = ((Integer) startCoords.get(1)).intValue(); 334 335 createdLocation = new FuzzyPointLocation(innerMin, 336 innerMax, 337 FuzzyPointLocation.RESOLVE_AVERAGE); 338 } 339 // Range of form: 123..567 or <123..567 or 123..>567 or <123..>567 340 else if (startCoords.size() == 1 && endCoords.size() == 1) 341 { 342 innerMin = outerMin = ((Integer) startCoords.get(0)).intValue(); 343 innerMax = outerMax = ((Integer) endCoords.get(0)).intValue(); 344 345 if (unboundMin || unboundMax) 346 { 347 createdLocation = new FuzzyLocation(unboundMin ? Integer.MIN_VALUE : outerMin, 348 unboundMax ? Integer.MAX_VALUE : outerMax, 349 innerMin, 350 innerMax, 351 FuzzyLocation.RESOLVE_INNER); 352 } 353 else 354 { 355 try 356 { 357 createdLocation = new RangeLocation(outerMin, outerMax); 358 } 359 catch (IndexOutOfBoundsException ioe) 360 { 361 throw new BioException(ioe); 362 } 363 } 364 } 365 // Range of form: (123.567)..789 366 else if (startCoords.size() == 2 && endCoords.size() == 1) 367 { 368 outerMin = ((Integer) startCoords.get(0)).intValue(); 369 innerMin = ((Integer) startCoords.get(1)).intValue(); 370 innerMax = outerMax = ((Integer) endCoords.get(0)).intValue(); 371 372 createdLocation = new FuzzyLocation(outerMin, 373 outerMax, 374 innerMin, 375 innerMax, 376 FuzzyLocation.RESOLVE_INNER); 377 } 378 // Range of form: 123..(567.789) 379 else if (startCoords.size() == 1 && endCoords.size() == 2) 380 { 381 outerMin = innerMin = ((Integer) startCoords.get(0)).intValue(); 382 innerMax = ((Integer) endCoords.get(0)).intValue(); 383 outerMax = ((Integer) endCoords.get(1)).intValue(); 384 385 createdLocation = new FuzzyLocation(outerMin, 386 outerMax, 387 innerMin, 388 innerMax, 389 FuzzyLocation.RESOLVE_INNER); 390 } 391 // Range of form: (123.345)..(567.789) 392 else if (startCoords.size() == 2 && endCoords.size() == 2) 393 { 394 outerMin = ((Integer) startCoords.get(0)).intValue(); 395 innerMin = ((Integer) startCoords.get(1)).intValue(); 396 innerMax = ((Integer) endCoords.get(0)).intValue(); 397 outerMax = ((Integer) endCoords.get(1)).intValue(); 398 399 createdLocation = new FuzzyLocation(outerMin, 400 outerMax, 401 innerMin, 402 innerMax, 403 FuzzyLocation.RESOLVE_INNER); 404 } 405 else 406 { 407 throw new BioException("Internal error in location parsing; parser became confused; " 408 + location); 409 } 410 411 startCoords.clear(); 412 endCoords.clear(); 413 414 if (mRegionSeqID == null) 415 { 416 subLocations.add(createdLocation); 417 subRegions.add(new RemoteFeature.Region(createdLocation, parentSeqID, false)); 418 } 419 else 420 { 421 subRegions.add(new RemoteFeature.Region(createdLocation, mRegionSeqID, true)); 422 } 423 424 mRegionSeqID = null; 425 isPointLoc = true; 426 unboundMin = false; 427 unboundMax = false; 428 fuzzyCoord = false; 429 isBetweenLocation = false; 430 mStrandType = StrandedFeature.POSITIVE; 431 } 432 433 /** 434 * <code>processInstructs</code> pops an instruction off the stack 435 * and applies it to the sub(locations). 436 * 437 * @exception BioException if an unsupported instruction is found. 438 */ 439 private void processInstructs() throws BioException 440 { 441 String instruct = (String) instructStack.remove(instructStack.size() - 1); 442 if (instruct.equals("join") || instruct.equals("order")) 443 { 444 joinType = instruct; 445 } 446 else if (instruct.equals("complement")) 447 { 448 // This should only set the strand for a single range 449 // within a feature. However, BioJava Locations have no 450 // concept of strand and therefore are unable to support 451 // construction of Features where some ranges are on 452 // different strands. As a result the mStrandType 453 // flag currently sets the strand for the whole feature. 454 mStrandType = StrandedFeature.NEGATIVE; 455 } 456 else 457 { 458 // This is a primary accession number 459 // e.g. J00194:(100..202) 460 mRegionSeqID = instruct; 461 } 462 } 463 464 465 /** 466 * <code>LocationLexer</code> is based on the 467 * <code>LocationLexer</code> class in the Artemis source code by 468 * Kim Rutherford. 469 * 470 * @author Kim Rutherford 471 * @author Keith James 472 * @author Greg Cox 473 * @since 1.2 474 */ 475 private class LocationLexer 476 { 477 private StringBuffer intString = new StringBuffer(); 478 private StringBuffer textString = new StringBuffer(); 479 480 /** 481 * <code>getNextToken</code> returns the next token. A null 482 * indicates no more tokens. 483 * 484 * @return an <code>Object</code> value. 485 */ 486 Object getNextToken() 487 { 488 while (true) 489 { 490 if (nextCharIndex == location.length()) 491 return null; 492 493 char thisChar = location.charAt(nextCharIndex); 494 495 switch (thisChar) 496 { 497 case ' ' : case '\t' : 498 nextCharIndex++; 499 continue; 500 501 case ':' : case '^' : case ',' : 502 case '(' : case ')' : case '<' : 503 case '>' : 504 nextCharIndex++; 505 return new Character(thisChar); 506 507 case '.' : 508 if (location.charAt(nextCharIndex + 1) == '.') 509 { 510 nextCharIndex += 2; 511 return ".."; 512 } 513 else 514 { 515 nextCharIndex++; 516 return new Character('.'); 517 } 518 519 case '0' : case '1' : case '2' : case '3' : case '4' : 520 case '5' : case '6' : case '7' : case '8' : case '9' : 521 return followInteger(); 522 523 default : 524 String text = followText(); 525 if (text.equals("")) 526 { 527 nextCharIndex++; 528 return new String("" + thisChar); 529 } 530 else 531 return text; 532 } 533 } 534 } 535 536 /** 537 * <code>followInteger</code> returns single sequence 538 * coordinate. 539 * 540 * @return an <code>Integer</code> value. 541 */ 542 private Integer followInteger() 543 { 544 intString.setLength(0); 545 char thisChar = location.charAt(nextCharIndex); 546 547 while (Character.isDigit(thisChar)) 548 { 549 intString.append(thisChar); 550 nextCharIndex++; 551 552 if (nextCharIndex >= location.length()) 553 break; 554 555 thisChar = location.charAt(nextCharIndex); 556 } 557 558 return new Integer(intString.substring(0)); 559 } 560 561 /** 562 * <code>followText</code> returns a single text string. 563 * 564 * @return a <code>String</code> value. 565 */ 566 private String followText() 567 { 568 textString.setLength(0); 569 char thisChar = location.charAt(nextCharIndex); 570 571 // First character must be a letter 572 if (! Character.isLetter(thisChar)) 573 return ""; 574 575 while (Character.isLetterOrDigit(thisChar) || 576 thisChar == '.') 577 { 578 textString.append(thisChar); 579 nextCharIndex++; 580 581 if (nextCharIndex >= location.length()) 582 break; 583 584 thisChar = location.charAt(nextCharIndex); 585 } 586 587 return textString.substring(0); 588 } 589 } 590}