001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021 022package org.biojava.bio.symbol; 023 024import java.io.Serializable; 025import java.util.ArrayList; 026import java.util.Collections; 027import java.util.Iterator; 028import java.util.List; 029 030import org.biojava.bio.BioError; 031import org.biojava.utils.AssertionFailure; 032 033/** 034 * This implementation of GappedSymbolList wraps a SymbolList, allowing you to 035 * insert gaps. Please note that this is a view onto another SymbolList. Gaps 036 * created and removed are only in the view not the underlying original. This 037 * means that any gaps present in the original cannot be manipulated in this 038 * view. To manipulate the original you would need to use Edit objects. 039 * 040 * @author Matthew Pocock 041 * @since 1.3 042 */ 043public class SimpleGappedSymbolList extends AbstractSymbolList implements 044 GappedSymbolList, Serializable { 045 /** 046 * An aligned block. 047 * <p> 048 * The alignment is actualy stoored as a list of these objects. Each block 049 * is contiguous with the next in the source fields, but may be gapped in 050 * the view fields. 051 * 052 * @author Matthew Pocock 053 */ 054 protected static final class Block implements Serializable { 055 private static final long serialVersionUID = 4090888450309921439L; 056 057 public int sourceStart; 058 public int sourceEnd; 059 public int viewStart; 060 public int viewEnd; 061 062 public Block(Block block) { 063 this.sourceStart = block.sourceStart; 064 this.sourceEnd = block.sourceEnd; 065 this.viewStart = block.viewStart; 066 this.viewEnd = block.viewEnd; 067 068 // fixme: should be using 1.4 assertion syntax 069 // assert isSane() : "Block is a sane shape: " + this.toString(); 070 assert isSane() : "Block is a sane shape: " + this.toString(); 071 } 072 073 public Block(int sourceStart, int sourceEnd, int viewStart, int viewEnd) { 074 this.sourceStart = sourceStart; 075 this.sourceEnd = sourceEnd; 076 this.viewStart = viewStart; 077 this.viewEnd = viewEnd; 078 } 079 080 public boolean isSane() { 081 return (viewEnd - viewStart) == (sourceEnd - sourceStart); 082 } 083 084 public String toString() { 085 return "Block: source=" + sourceStart + "," + sourceEnd + " view=" 086 + viewStart + "," + viewEnd; 087 } 088 } 089 090 private static final long serialVersionUID = 4258621048300499709L; 091 092 /** 093 * The Alphabet - the same as source but guaranteed to include the gap 094 * character. 095 */ 096 private final Alphabet alpha; 097 098 /** 099 * The SymbolList to view. 100 */ 101 private final SymbolList source; 102 103 /** 104 * The list of ungapped blocks that align between source and this view. 105 */ 106 private final List<Block> blocks; 107 108 /** 109 * The total length of the alignment - necessary to allow leading & trailing 110 * gaps. 111 */ 112 private int length; 113 114 /** 115 * Create a new SimpleGappedSymbolList that will view source, inheriting all 116 * existing gaps. 117 * 118 * @param gappedSource 119 * the underlying sequence 120 */ 121 public SimpleGappedSymbolList(GappedSymbolList gappedSource) { 122 this.source = gappedSource.getSourceSymbolList(); 123 this.alpha = this.source.getAlphabet(); 124 this.blocks = new ArrayList<Block>(); 125 this.length = this.source.length(); 126 Block b = new Block(1, length, 1, length); 127 blocks.add(b); 128 int n = 1; 129 for (int i = 1; i <= this.length(); i++) 130 if (this.alpha.getGapSymbol().equals(gappedSource.symbolAt(i))) 131 this.addGapInSource(n); 132 else 133 n++; 134 } 135 136 /** 137 * Create a new SimpleGappedSymbolList that will view source. 138 * 139 * @param source 140 * the underlying sequence 141 */ 142 public SimpleGappedSymbolList(SymbolList source) { 143 this.source = source; 144 this.alpha = source.getAlphabet(); 145 this.blocks = new ArrayList<Block>(); 146 this.length = source.length(); 147 Block b = new Block(1, length, 1, length); 148 blocks.add(b); 149 } 150 151 public SimpleGappedSymbolList(Alphabet alpha) { 152 this.alpha = alpha; 153 this.source = new SimpleSymbolList(this.alpha); 154 this.blocks = new ArrayList<Block>(); 155 this.length = source.length(); 156 // Block b = new Block(1, length, 1, length); 157 // blocks.add(b); 158 } 159 160 public void addGapInSource(int pos) throws IndexOutOfBoundsException { 161 addGapsInSource(pos, 1); 162 } 163 164 public void addGapInView(int pos) throws IndexOutOfBoundsException { 165 addGapsInView(pos, 1); 166 } 167 168 public void addGapsInSource(int pos, int length) { 169 if (pos < 1 || pos > (length() + 1)) { 170 throw new IndexOutOfBoundsException( 171 "Attempted to add a gap outside of this sequence (1.." 172 + length() + ") at " + pos); 173 } 174 175 int i = blocks.size() / 2; 176 int imin = 0; 177 int imax = blocks.size() - 1; 178 179 do { 180 Block b = blocks.get(i); 181 if (b.sourceStart < pos && b.sourceEnd >= pos) { // found block that 182 // need 183 // splitting 184 // with gaps 185 Block c = new Block(pos, b.sourceEnd, sourceToView(b, pos), 186 b.viewEnd); 187 b.viewEnd = sourceToView(b, pos - 1); 188 b.sourceEnd = pos - 1; 189 blocks.add(i + 1, c); 190 renumber(i + 1, length); 191 return; 192 } else { 193 if (b.sourceStart < pos) { 194 imin = i + 1; 195 i = imin + (imax - imin) / 2; 196 } else { 197 imax = i - 1; 198 i = imin + (imax - imin) / 2; 199 } 200 } 201 } while (imin <= imax); 202 203 // extending an already existing run of gaps; 204 if (i < blocks.size()) { 205 Block b = blocks.get(i); 206 if (b.sourceStart <= pos) { 207 i--; 208 } 209 } 210 renumber(i + 1, length); 211 212 assert isSane() : "Data corrupted: " + blocks; 213 } 214 215 public void addGapsInView(int pos, int length) 216 throws IndexOutOfBoundsException { 217 if (pos < 1 || pos > (length() + 1)) { 218 throw new IndexOutOfBoundsException( 219 "Attempted to add a gap outside of this sequence (1.." 220 + length() + ") at " + pos); 221 } 222 223 int i = blocks.size() / 2; 224 int imin = 0; 225 int imax = blocks.size() - 1; 226 227 do { 228 Block b = blocks.get(i); 229 if (b.viewStart < pos && b.viewEnd >= pos) { // found block that 230 // need splitting 231 // with gaps 232 Block c = new Block(viewToSource(b, pos), b.sourceEnd, pos, 233 b.viewEnd); 234 b.viewEnd = pos - 1; 235 b.sourceEnd = viewToSource(b, pos - 1); 236 blocks.add(i + 1, c); 237 renumber(i + 1, length); 238 return; 239 } else { 240 if (b.viewStart < pos) { 241 imin = i + 1; 242 i = imin + (imax - imin) / 2; 243 } else { 244 imax = i - 1; 245 i = imin + (imax - imin) / 2; 246 } 247 } 248 } while (imin <= imax); 249 250 // extending an already existing run of gaps; 251 if (i < blocks.size()) { 252 Block b = blocks.get(i); 253 if (pos <= b.viewStart) { 254 i--; 255 } 256 } else { 257 i--; 258 } 259 renumber(i + 1, length); 260 } 261 262 /** 263 * Get list of the un-gapped region of the SymbolList. 264 * <p> 265 * The gapped symbol list can be represented as a list of ungapped regions. 266 * Given a list of start-stop pairs in the ungapped coordinate system each 267 * with a corresponding pair of start-stop pairs in the gapped view, the 268 * entire gapped list can be reconstructed. 269 * 270 * @return a List of Block instances 271 */ 272 public List<Block> BlockIterator() { 273 return Collections.unmodifiableList(blocks); 274 } 275 276 /** 277 * Debugging method 278 */ 279 public void dumpBlocks() { 280 for (Iterator<Block> i = blocks.iterator(); i.hasNext();) { 281 Block b = i.next(); 282 System.out.println(b.sourceStart + ".." + b.sourceEnd + "\t" 283 + b.viewStart + ".." + b.viewEnd); 284 } 285 } 286 287 public int firstNonGap() { 288 int first = (blocks.get(0)).viewStart; 289 return first; 290 } 291 292 /** 293 * Translates a Location from the gapped view into the underlying sequence. 294 * End points that are in gaps are moved 'inwards' to shorten the location. 295 * 296 * @since 1.3 297 */ 298 299 public Location gappedToLocation(Location l) { 300 if (l.isContiguous()) { 301 return gappedToBlock(l); 302 } else { 303 List<Location> lblocks = new ArrayList<Location>(); 304 for (Iterator<Location> i = l.blockIterator(); i.hasNext();) { 305 lblocks.add(gappedToBlock((Location) i.next())); 306 } 307 return LocationTools.union(lblocks); 308 } 309 } 310 311 public Alphabet getAlphabet() { 312 return alpha; 313 } 314 315 public SymbolList getSourceSymbolList() { 316 return source; 317 } 318 319 public Location getUngappedLocation() { 320 List<Location> locList = new ArrayList<Location>(blocks.size()); 321 for (Iterator<Block> i = blocks.iterator(); i.hasNext();) { 322 Block block = i.next(); 323 locList.add(new RangeLocation(block.viewStart, block.viewEnd)); 324 } 325 326 return LocationTools.union(locList); 327 } 328 329 public int lastNonGap() { 330 int last = (blocks.get(blocks.size() - 1)).viewEnd; 331 return last; 332 } 333 334 public int length() { 335 return length; 336 } 337 338 /** 339 * Translate a Location onto the gapped view, splitting blocks if necessary 340 * 341 * @since 1.3 342 */ 343 344 public Location locationToGapped(Location l) { 345 if (l.isContiguous()) { 346 return blockToGapped(l); 347 } else { 348 List<Location> lblocks = new ArrayList<Location>(); 349 for (Iterator<Location> i = l.blockIterator(); i.hasNext();) 350 lblocks.add(blockToGapped(i.next())); 351 return LocationTools.union(lblocks); 352 } 353 } 354 355 public void removeGap(int pos) throws IndexOutOfBoundsException, 356 IllegalSymbolException { 357 if (pos < 1 || pos > length()) { 358 throw new IndexOutOfBoundsException( 359 "Attempted to remove gap outside of this sequence (1.." 360 + length() + ") at " + pos); 361 } 362 int i = findViewGap(pos); 363 if (i == -2) { 364 throw new IllegalSymbolException( 365 "Attempted to remove a gap at a non-gap index: " + pos 366 + " -> " + symbolAt(pos).getName()); 367 } 368 369 if (i == -1 || i == (blocks.size() - 1)) { // at the beginning or the 370 // end 371 renumber(i + 1, -1); 372 } else { // internal 373 Block l = blocks.get(i); 374 Block r = blocks.get(i + 1); 375 renumber(i + 1, -1); 376 int gap = r.viewStart - l.viewEnd; 377 if (gap == 1) { // removing the last gap - need to join blocks l & r 378 // together 379 l.sourceEnd = r.sourceEnd; 380 l.viewEnd = r.viewEnd; 381 blocks.remove(i + 1); 382 } 383 } 384 385 assert isSane() : "Data corrupted: " + blocks; 386 } 387 388 public void removeGaps(int pos, int length) 389 throws IndexOutOfBoundsException, IllegalSymbolException { 390 int end = pos + length - 1; 391 if (pos < 1 || length < 1 || end > length()) { 392 throw new IndexOutOfBoundsException( 393 "Attempted to remove gap outside of this sequence (1.." 394 + length() + ") at (" + pos + ".." + end + ")"); 395 } 396 int i = findViewGap(pos); 397 if (i == -2) { 398 throw new IllegalSymbolException( 399 "Attempted to remove a gap at a non-gap index: " + pos 400 + " -> " + symbolAt(pos).getName()); 401 } 402 403 if (i == -1) { // removing track at the beginning 404 Block b = blocks.get(0); 405 if (b.viewStart <= end) { 406 throw new IllegalSymbolException( 407 "Attempted to remove some non-gap characters at (" 408 + pos + ".." + end + ")"); 409 } 410 renumber(i + 1, -length); 411 } else if (i == (blocks.size() - 1)) { // removing trailing gaps 412 this.length -= length; 413 } else { // removing internal gaps 414 Block l = blocks.get(i); 415 Block r = blocks.get(i + 1); 416 int gap = r.viewStart - l.viewEnd; 417 if (gap < length) { 418 throw new IllegalSymbolException("Removing " + length 419 + " gaps from + " + i + " but there are only " + gap 420 + " gaps there: " + blocks); 421 } 422 423 renumber(i + 1, -length); 424 if (gap == length) { // deleted an entire gapped region 425 l.sourceEnd = r.sourceEnd; 426 l.viewEnd = r.viewEnd; 427 blocks.remove(i + 1); 428 } 429 } 430 431 assert isSane() : "Data corrupted: removeGaps(" + pos + "," + length 432 + ") " + blocks; 433 } 434 435 public final int sourceToView(int indx) throws IndexOutOfBoundsException { 436 if (indx < 1 || indx > source.length()) { 437 throw new IndexOutOfBoundsException( 438 "Attempted to address source sequence (1.." + length() 439 + ") at " + indx); 440 } 441 int j = findSourceBlock(indx); 442 return sourceToView(blocks.get(j), indx); 443 } 444 445 public Symbol symbolAt(int indx) throws IndexOutOfBoundsException { 446 if (indx > length() || indx < 1) { 447 throw new IndexOutOfBoundsException( 448 "Attempted to read outside of this sequence (1.." 449 + length() + ") at " + indx); 450 } 451 int i = findViewBlock(indx); 452 if (i < 0) { 453 if ((indx < firstNonGap()) || (indx > lastNonGap())) { 454 return Alphabet.EMPTY_ALPHABET.getGapSymbol(); 455 } else { 456 return getAlphabet().getGapSymbol(); 457 } 458 } else { 459 try { 460 Block b = blocks.get(i); 461 return source.symbolAt(b.sourceStart - b.viewStart + indx); 462 } catch (IndexOutOfBoundsException e) { 463 throw new AssertionFailure( 464 "Internal book-keeping error fetching index: " + indx 465 + " of " + length() + " blocks: " + blocks, e); 466 } 467 } 468 } 469 470 public final int viewToSource(int indx) throws IndexOutOfBoundsException { 471 if (indx < 1 || indx > length()) { 472 throw new IndexOutOfBoundsException( 473 "Attempted to address sequence (1.." + length() + ") at " 474 + indx); 475 } 476 int j = findViewBlock(indx); 477 if (j < 0) { 478 int nj = -j - 1; 479 // System.out.println("Converted: " + indx + ":" + j + " to " + nj); 480 if (nj < blocks.size()) { 481 Block b = blocks.get(nj); 482 // System.out.println("Has a following block: " + b); 483 return -b.sourceStart; 484 } else { 485 // System.out.println("Has no following block"); 486 return -(source.length() + 1); 487 } 488 } else { 489 return viewToSource(blocks.get(j), indx); 490 } 491 } 492 493 private Location blockToGapped(Location l) { 494 int start = l.getMin(); 495 int end = l.getMax(); 496 497 List<Location> lblocks = new ArrayList<Location>(); 498 499 while (start <= end) { 500 int containingBlockIdx = findSourceBlock(start); 501 Block containingBlock = blocks.get(containingBlockIdx); 502 int gbstart = start - containingBlock.sourceStart 503 + containingBlock.viewStart; 504 int gbend = Math 505 .min(end - start + gbstart, containingBlock.viewEnd); 506 lblocks.add(new RangeLocation(gbstart, gbend)); 507 start = start + gbend - gbstart + 1; 508 } 509 510 return LocationTools.union(lblocks); 511 } 512 513 private Location gappedToBlock(Location l) { 514 int start = l.getMin(); 515 int end = l.getMax(); 516 517 int startBlockI = findViewBlock(start); 518 int endBlockI = findViewBlock(end); 519 520 if (startBlockI < 0) { // in a gap 521 int sb = -startBlockI - 1; 522 if (sb == blocks.size()) { 523 start = Integer.MAX_VALUE; 524 } else { 525 Block startBlock = blocks.get(sb); 526 527 start = startBlock.sourceStart; 528 } 529 } else { 530 Block startBlock = blocks.get(startBlockI); 531 start = start - startBlock.viewStart + startBlock.sourceStart; 532 } 533 534 if (endBlockI < 0) { // in a gap 535 int eb = -endBlockI - 1; 536 if (eb == 0) { 537 end = Integer.MIN_VALUE; 538 } else { 539 Block endBlock = blocks.get(eb - 1); 540 541 end = endBlock.sourceEnd; 542 } 543 } else { 544 Block endBlock = blocks.get(endBlockI); 545 end = end - endBlock.viewEnd + endBlock.sourceEnd; 546 } 547 548 if (start > end) { 549 return Location.empty; 550 } else { 551 return new RangeLocation(start, end); 552 } 553 } 554 555 /** 556 * Finds the index of the block containing the source coordinate indx. 557 * 558 * @param indx 559 * the index to find 560 * @return the index of the Block containing indx 561 */ 562 protected final int findSourceBlock(int indx) { 563 int i = blocks.size() / 2; 564 int imin = 0; 565 int imax = blocks.size() - 1; 566 567 do { 568 Block b = blocks.get(i); 569 if (b.sourceStart <= indx && b.sourceEnd >= indx) { 570 return i; 571 } else { 572 if (b.sourceStart < indx) { 573 imin = i + 1; 574 i = imin + (imax - imin) / 2; 575 } else { 576 imax = i - 1; 577 i = imin + (imax - imin) / 2; 578 } 579 } 580 } while (imin <= imax); 581 582 throw new BioError( 583 "Something is screwed. Could not find source block containing index " 584 + indx + " in sequence of length " + source.length()); 585 } 586 587 /** 588 * Finds the index of the Block before the gap at indx within the following 589 * gap. 590 * 591 * @param indx 592 * the index to find within a gap 593 * @return the index of the block with indx in the gap 594 */ 595 protected final int findSourceGap(int indx) { 596 int i = blocks.size() / 2; 597 int imin = 0; 598 int imax = blocks.size() - 1; 599 600 do { 601 Block b = blocks.get(i); 602 if (b.sourceStart <= indx && b.sourceEnd >= indx) { 603 return -1; 604 } else { 605 if (b.sourceStart < indx) { 606 imin = i + 1; 607 i = imin + (imax - imin) / 2; 608 } else { 609 imax = i - 1; 610 i = imin + (imax - imin) / 2; 611 } 612 } 613 } while (imin <= imax); 614 615 Block b = blocks.get(i); 616 if (b.viewEnd < indx) { 617 return i; 618 } else { 619 return i - 1; 620 } 621 } 622 623 /** 624 * Finds the index of the Block containing indx within the view ranges. 625 * <p> 626 * If indx is not within a view block, then it is the index of a gap. The 627 * method will return -(indx+1) where indx is the block emediately following 628 * the gap. 629 * 630 * @param indx 631 * the index to find within a view range. 632 * @return the index of the block containing index or one less than the 633 * negative of the index of the block following the gap 634 */ 635 protected final int findViewBlock(int indx) { 636 int i = blocks.size() / 2; 637 int imin = 0; 638 int imax = blocks.size() - 1; 639 // System.out.println("Searching for " + indx); 640 641 Block b; 642 do { 643 // System.out.println(imin + " < " + i + " < " + imax); 644 b = blocks.get(i); 645 // System.out.println("Checking " + b.viewStart + ".." + b.viewEnd); 646 if (b.viewStart <= indx && b.viewEnd >= indx) { 647 // System.out.println("hit"); 648 return i; 649 } else { 650 if (b.viewStart < indx) { 651 // System.out.println("Too low"); 652 imin = i + 1; 653 i = imin + (imax - imin) / 2; 654 } else { 655 // System.out.println("Too high"); 656 imax = i - 1; 657 i = imin + (imax - imin) / 2; 658 } 659 } 660 } while (imin <= imax); 661 662 if (i >= blocks.size()) { 663 return -blocks.size() - 1; 664 } 665 666 if (blocks.get(i) != b) { 667 if (blocks.get(i - 1) == b) { 668 --i; 669 } else if (blocks.get(i + 1) == b) { 670 ++i; 671 } 672 } 673 674 // System.out.println("Finding block for: " + indx + " in " + blocks); 675 // System.out.println("\ti=" + i); 676 // System.out.println("\t" + blocks.get(i)); 677 if (indx > b.viewStart) { // block before gap - move to block after gap 678 // System.out.println("\tAdvancing i"); 679 i++; 680 if (i < blocks.size()) { 681 // System.out.println("\t" + blocks.get(i)); 682 } 683 } 684 return -i - 1; 685 } 686 687 /** 688 * Finds the index of the Block before the gap at indx within the view 689 * range. 690 * <p> 691 * If indx is in-fact a real symbol, then there will be no Block before it. 692 * In this case, the method returns -2. It returns -1 if indx is within the 693 * leading gaps and blocks.size()-1 if it is within the trailing gaps. 694 * 695 * @param indx 696 * the index to find within a view range 697 * @return the index of the block with indx in the following gap 698 */ 699 protected final int findViewGap(int indx) { 700 int i = blocks.size() / 2; 701 int imin = 0; 702 int imax = blocks.size() - 1; 703 704 do { 705 Block b = blocks.get(i); 706 if (b.viewStart <= indx && b.viewEnd >= indx) { 707 return -2; 708 } else { 709 if (b.viewStart < indx) { 710 imin = i + 1; 711 i = imin + (imax - imin) / 2; 712 } else { 713 imax = i - 1; 714 i = imin + (imax - imin) / 2; 715 } 716 } 717 } while (imin <= imax); 718 719 if (i < blocks.size()) { 720 Block b = blocks.get(i); 721 if (b.viewEnd < indx) { 722 return i; 723 } else { 724 return i - 1; 725 } 726 } else { 727 return i - 1; 728 } 729 } 730 731 protected boolean isSane() { 732 for (Iterator<Block> i = blocks.iterator(); i.hasNext();) 733 if (!i.next().isSane()) 734 return false; 735 return true; 736 } 737 738 /** 739 * Renumber the view indexes from block, adding delta to each offset. 740 * <p> 741 * This adjusts viewStart and viewEnd to be += delta for each block 742 * i->blocks.size(), and sets the total length to += delta. 743 * 744 * @param i 745 * the first 746 */ 747 protected final void renumber(int i, int delta) { 748 for (int j = i; j < blocks.size(); j++) { 749 Block b = blocks.get(j); 750 b.viewStart += delta; 751 b.viewEnd += delta; 752 } 753 length += delta; 754 } 755 756 /** 757 * Coordinate conversion from source to view. 758 * 759 * @param b 760 * the block containing indx 761 * @param indx 762 * the index to project 763 * @return the position of indx projected from source to view 764 */ 765 protected final int sourceToView(Block b, int indx) { 766 return indx - b.sourceStart + b.viewStart; 767 } 768 769 /** 770 * Coordinate conversion from view to source. 771 * 772 * @param b 773 * the block containing indx 774 * @param indx 775 * the index to project 776 * @return the position of indx projected from view to source 777 */ 778 protected final int viewToSource(Block b, int indx) { 779 return indx - b.viewStart + b.sourceStart; 780 } 781}