001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021 022package org.biojava.bio.seq.impl; 023 024import java.util.ArrayList; 025import java.util.List; 026import java.util.Set; 027import java.util.SortedMap; 028import java.util.TreeMap; 029import java.util.TreeSet; 030 031import org.biojava.bio.seq.ComponentFeature; 032import org.biojava.bio.seq.DNATools; 033import org.biojava.bio.seq.StrandedFeature; 034import org.biojava.bio.symbol.AbstractSymbolList; 035import org.biojava.bio.symbol.Alphabet; 036import org.biojava.bio.symbol.IllegalSymbolException; 037import org.biojava.bio.symbol.Location; 038import org.biojava.bio.symbol.RangeLocation; 039import org.biojava.bio.symbol.Symbol; 040import org.biojava.bio.symbol.SymbolList; 041 042/** 043 * Support class for applications which need to patch together sections 044 * of sequence into a single SymbolList. This class isn't intended 045 * for direct use in user code -- instead, it is a helper for people 046 * implementing the full BioJava assembly model. See SimpleAssembly 047 * for an example. 048 * 049 * @author Thomas Down 050 * @author Greg Cox 051 * @author Matthew Pocock 052 * @author David Huen (support for overlapped features) 053 * @since 1.1 054 */ 055 056public class NewAssembledSymbolList extends AbstractSymbolList { 057 private boolean autoLength = true; 058 private int length = 0; 059 060 private SortedMap components; // maps between locations and the actual components 061 private List componentList; // an ordered list of component locations in assembly space 062 063 private final Symbol noninformativeSymbol = DNATools.n(); 064 065 066 private class TranslatedSymbolList extends AbstractSymbolList 067 { 068 ComponentFeature cf; 069 int translation; 070 int length; 071 SymbolList underlyingSymList; 072 073 private TranslatedSymbolList(ComponentFeature cf) 074// throws IllegalAlphabetException 075 { 076// if (cf.getComponentSequence().getAlphabet() != DNATools.getDNA()) 077// throw new IllegalAlphabetException("TranslatedSymbolList only accepts DNA."); 078 079 this.cf = cf; 080 081 if (cf.getStrand() == StrandedFeature.POSITIVE) { 082 translation = cf.getLocation().getMin() - cf.getComponentLocation().getMin(); 083 } 084 else { 085 translation = cf.getLocation().getMax() + cf.getComponentLocation().getMin(); 086 } 087 088 // cache these for speed 089 underlyingSymList = cf.getComponentSequence(); 090 length = underlyingSymList.length(); 091 } 092 093 /** 094 * this is the only method that needs implementation. (need to reverse complement?) 095 */ 096 public Symbol symbolAt(int i) 097 { 098 // compute index 099 int idx; 100 101 try { 102 if (cf.getStrand() == StrandedFeature.POSITIVE) { 103 idx = i - translation; 104 if ((idx<1) || (idx>length)) return noninformativeSymbol; 105// System.out.println("##########+++ " + i + " " + idx); 106 return underlyingSymList.symbolAt(idx); 107 } 108 else { 109 idx = translation - i; 110 if ((idx<1) || (idx>length)) return noninformativeSymbol; 111// System.out.println("##########--- " + i + " " + idx); 112 return DNATools.complement(underlyingSymList.symbolAt(idx)); 113 } 114 } 115 catch (IllegalSymbolException ise) { 116 return noninformativeSymbol; 117 } 118 } 119 120 121 /** 122 * should I bother with this at all? 123 */ 124 public Alphabet getAlphabet() { return DNATools.getDNA(); } 125 126 /** 127 * this one has to be faked to see much longer 128 * than it is so the translated locations appear valid. 129 */ 130 public int length() { return cf.getLocation().getMax(); } 131 132 public ComponentFeature getFeature() { return cf; } 133 } 134 135 136 { 137 components = new TreeMap(Location.naturalOrder); 138 componentList = new ArrayList(); 139 } 140 141 public void setLength(int len) { 142 autoLength = false; 143 length = len; 144 } 145 146 public void putComponent(ComponentFeature f) { 147 // when receiving a new entry there are 148 // two passes. 149 150 // in pass one, we remove any redundant entries from the 151 // existing components map and modify the new entry to deal 152 // with existing entries. 153 154 // in pass two, we recreate componentList from the comonents 155 // key. 156 157 // components has as key the assembly sequence ranges 158 // that each component maps. The value is the component 159 // SymbolList itself. 160 161 // componentList stores the ranges as Locations to accelerate 162 // searches for the current component. 163 164 // PASS 1 165 // ====== 166 // when entering a new component, the following checks 167 // are needed:- 168 // 1) if the new range is completely contained by 169 // a current range, discard it and its component. 170 // 2) if the new range completely contains a current 171 // range, discard the current range and its component. 172 // 3) if the new range overlaps an existing range, adjust 173 // the new range to remove the overlap. 174 175 176 // find the first range that contains/right of the 177 // left limit of the current range 178 Location newLoc = new RangeLocation(f.getLocation().getMin(), f.getLocation().getMax()); 179 int startIdx = idxRightOfPoint(f.getLocation().getMin()); 180 181// System.out.println("testing " + f.getLocation() + " result is " + startIdx); 182 183 if (startIdx == NOTFOUND) { 184 // either there are no entries yet or that all of 185 // them are to left of this point. 186 // in this case, we just insert the new entry 187// System.out.println("putting " + f.getLocation() + " " + f.getLocation() + " " + f.getStrand() + " " + f.getComponentSequence().seqString()); 188 components.put(f.getLocation(), new TranslatedSymbolList(f)); 189 recreateList(); 190 return; 191 } 192 else { 193 // we are going to reconstruct the mappings 194 // according to above rules 195 196 for (int i=startIdx; 197 i < componentList.size(); // check for end of componentList 198 i++) { 199 200 Location thisLoc = (Location) componentList.get(i); 201// System.out.println("comparing " + newLoc + " against " + thisLoc); 202 203 // check if there are any other relevant entries 204 if (newLoc.getMin() > thisLoc.getMax()) break; 205 206 if (newLoc.contains(thisLoc)) { 207 // eliminate this entry from both componentList 208 // and from the components 209 // old mapping can also be discarded 210 components.remove(thisLoc); 211 } 212 else if (thisLoc.contains(newLoc)) { 213 // throw away new entry 214 return; 215 } 216 else if (newLoc.overlaps(thisLoc)) { 217 // resolve the overlap 218 if (newLoc.getMin() < thisLoc.getMin()) { 219 // newLoc on left: correct right limit 220 newLoc = new RangeLocation(newLoc.getMin(), thisLoc.getMin() - 1); 221 } 222 else { 223 // right overlap 224 newLoc = new RangeLocation(thisLoc.getMax() + 1, newLoc.getMax()); 225 } 226 } 227 else { 228 // no overlap of any kind at all 229 // this can only mean we can slot in the entry here 230// System.out.println("putting " + newLoc + " " + f.getLocation() + " " + f.getStrand() + " " + f.getComponentSequence().seqString()); 231 components.put(newLoc, new TranslatedSymbolList(f)); 232 recreateList(); 233 return; 234 } 235 } 236 237 // you can get here if you have "consumed" the last location 238 // or overlapped it. It only remains to put in the new 239 // entry. In this case, it is either going to be last 240 // (because you have consumed the last, or second last 241 // (because you have overlapped the last). 242// System.out.println("putting " + newLoc + " " + f.getLocation() + " " + f.getStrand() + " " + f.getComponentSequence().seqString()); 243 components.put(newLoc, new TranslatedSymbolList(f)); 244 recreateList(); 245 return; 246 } 247 248 } 249 250 private void recreateList() 251 { 252 componentList.clear(); 253 componentList.addAll(components.keySet()); 254 } 255 256 public void removeComponent(ComponentFeature f) { 257 // search thru current components to 258 // remove the specified component 259 for (int i=0; i< componentList.size(); i++) { 260 Location loc = (Location) componentList.get(i); 261 TranslatedSymbolList sl = (TranslatedSymbolList) components.get(loc); 262 263 if (sl.getFeature() != f) continue; 264 265 // have a match, remove from both componentList 266 // and component 267 // loc is now the mapping range of the to-be-removed feature 268 componentList.remove(i); 269 270 // at this point the former left and right neighbours of the 271 // removed feature are i-1 and i. 272 273 // check bounds of both neighbouring locations 274 // and extend if possible to cover the void 275 // left by the removal 276 // extension would not occur if the neighbours 277 // are not abutting to begin with 278 int minExtensionFromRight = loc.getMin(); 279 280 // the way in which the locations are assembled 281 // mean that at most, the left and right neighbours can be extended 282 // to some point within the removed range. 283 if (i !=0 ) { 284 // there was a left neighbour, extend it if poss 285 Location leftLoc = (Location) componentList.get(i-1); 286 Location leftCFLoc = (Location) components.get(leftLoc); 287 288 if (leftCFLoc.getMax() > loc.getMin()) { 289 // there is scope for extension 290 // compute the right limit 291 int newRightLimitOnLeftRange = Math.min(leftCFLoc.getMax(), loc.getMax()); 292 293 Location newLeftLoc = new RangeLocation( 294 leftLoc.getMin(), 295 newRightLimitOnLeftRange 296 ); 297 298 // update all tables 299 componentList.remove(i-1); 300 componentList.add(i-1, newLeftLoc); 301 Object value = components.remove(leftLoc); 302 components.put(newLeftLoc, value); 303 304 minExtensionFromRight = newRightLimitOnLeftRange + 1; 305 } 306 } 307 308 if (i != componentList.size()) { // we use i because that's the index of the right neighbour after deletion 309 // there was a right neighbour 310 Location rightLoc = (Location) componentList.get(i); 311 Location rightCFLoc = (Location) components.get(rightLoc); 312 313 if ((loc.getMax() > rightCFLoc.getMin()) && // did the removed range limit the right neighbour? 314 (minExtensionFromRight < rightLoc.getMin()) ) { // is there any potential extension? 315 316 // there is potential for extension 317 int newLeftLimitOnRightRange = Math.max(minExtensionFromRight, rightCFLoc.getMin()); 318 319 Location newRightLoc = new RangeLocation( 320 newLeftLimitOnRightRange, 321 rightLoc.getMax() 322 ); 323 324 // update all tables 325 componentList.remove(i); 326 componentList.add(i, newRightLoc); 327 Object value = components.remove(rightLoc); 328 components.put(newRightLoc, value); 329 330 } 331 } 332 } 333 } 334 335 private SymbolList getComponentSymbols(Location loc) { 336 // the only kind of object in this class is a TranslatedSymbolList 337 // this already handles the coordinate transformations 338 // I only need to select the right one 339 return (SymbolList) components.get(loc); 340 } 341 342 343 public Set getComponentLocationSet() { 344// return components.keySet(); 345 TreeSet newSet = new TreeSet(Location.naturalOrder); 346 newSet.addAll(componentList); 347 return newSet; 348 } 349 350 /** 351 * Find the location containing p in a sorted list of non-overlapping contiguous 352 * locations. 353 */ 354 355 private Location lastLocation = Location.empty; 356 357 // searches for first range that contains the point. 358 // returns null on failure 359 private Location locationOfPoint(int p) { 360 if (lastLocation.contains(p)) { 361 return lastLocation; 362 } 363 364 int first = 0; 365 int last = componentList.size() - 1; 366 367 while (first <= last) { 368 int check = (first + last) / 2; 369 Location checkL = (Location) componentList.get(check); 370 if (checkL.contains(p)) { 371 lastLocation = checkL; 372 return checkL; 373 } 374 375 if (p < checkL.getMin()) { 376 last = check - 1; 377 } else { 378 first = check + 1; 379 } 380 } 381 382 return null; 383 } 384 385 private int NOTFOUND = -1; 386 387 private int idxRightOfPoint(int p) 388 { 389 390 int first = 0; 391 int last = componentList.size() - 1; 392 393 int check = 0; 394 Location checkL = null; 395 while (first <= last) { 396 check = (first + last) / 2; 397 checkL = (Location) componentList.get(check); 398 if (checkL.contains(p)) { 399// System.out.println("idxRightOfPoint("+p+") returning " + check + " on " + checkL); 400 return check; 401 } 402 403 if (p < checkL.getMin()) 404 last = check - 1; 405 else 406 first = check + 1; 407 } 408 409 try { 410 if (p < checkL.getMin()) { 411 return check; 412 } else { 413 int nextIdx = check+1; 414 if ((nextIdx < componentList.size()) 415 && (p < ((Location) componentList.get(nextIdx)).getMin()) ) 416 return nextIdx; 417 else 418 return NOTFOUND; 419 } 420 } 421 catch (NullPointerException npe) { 422 // this deals with the start 423 // when there are no entries 424 return NOTFOUND; 425 } 426 } 427 428 public Alphabet getAlphabet() { 429 return DNATools.getDNA(); 430 } 431 432 public int length() { 433 if(autoLength) { 434 int componentCount = componentList.size(); 435 436 if (componentCount == 0) 437 // there's nothing in 'ere. 438 return 0; 439 else { 440 Location last = (Location) componentList.get(componentCount - 1); 441 return last.getMax(); 442 } 443 } else { 444 return length; 445 } 446 } 447 448 public Symbol symbolAt(int pos) { 449 Location l = locationOfPoint(pos); 450 451 if (l == null) { 452// System.out.println("symbol at " + pos + " is n"); 453 return noninformativeSymbol; 454 } 455 456 SymbolList syms = getComponentSymbols(l); 457 458 try { 459// System.out.println("symbol at " + pos + " is " + DNATools.getDNA().getTokenization("token").tokenizeSymbol(syms.symbolAt(pos)) 460// + " location is " + l + " symbolList is " + syms); 461 } 462 catch (Exception be) { be.printStackTrace(); } 463 return syms.symbolAt(pos); 464 } 465 466 public SymbolList subList(int start, int end) 467 { 468 Location l = locationOfPoint(start); 469 if (l != null && l.contains(end)) { 470 SymbolList symbols = getComponentSymbols(l); 471// int tstart = start - l.getMin() + 1; 472// int tend = end - l.getMin() + 1; 473 return symbols.subList(start, end); 474 } 475 476 // All is lost. Fall back onto `view' subList from AbstractSymbolList 477 478 return super.subList(start, end); 479 } 480/* 481 public Symbol symbolAt(int pos) { 482 // System.out.println(this + " symbolAt(" + pos + ")"); 483 Location l = locationOfPoint(pos); 484 if (l != null) { 485 SymbolList syms = getComponentSymbols(l); 486 return syms.symbolAt(pos - l.getMin() + 1); 487 } 488 489 return noninformativeSymbol; 490 } 491 492 493 public SymbolList subList(int start, int end) { 494 // System.out.println(this + " subList(" + start + ", " + end + ")"); 495 Location l = locationOfPoint(start); 496 if (l != null && l.contains(end)) { 497 SymbolList symbols = getComponentSymbols(l); 498 int tstart = start - l.getMin() + 1; 499 int tend = end - l.getMin() + 1; 500 return symbols.subList(tstart, tend); 501 } 502 503 // All is lost. Fall back onto `view' subList from AbstractSymbolList 504 505 return super.subList(start, end); 506 } 507*/ 508}