001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 * Created on 01-21-2010 021 * 022 * @author Richard Holland 023 * @author Scooter Willis 024 * @author Paolo Pavan 025 * 026 */ 027package org.biojava.nbio.core.sequence.template; 028 029import org.biojava.nbio.core.exceptions.CompoundNotFoundException; 030import org.biojava.nbio.core.sequence.AccessionID; 031import org.biojava.nbio.core.sequence.DataSource; 032import org.biojava.nbio.core.sequence.Strand; 033import org.biojava.nbio.core.sequence.TaxonomyID; 034import org.biojava.nbio.core.sequence.features.*; 035import org.biojava.nbio.core.sequence.loader.UniprotProxySequenceReader; 036import org.biojava.nbio.core.sequence.location.SequenceLocation; 037import org.biojava.nbio.core.sequence.location.SimpleLocation; 038import org.biojava.nbio.core.sequence.location.template.Location; 039import org.biojava.nbio.core.sequence.storage.ArrayListSequenceReader; 040import org.slf4j.Logger; 041import org.slf4j.LoggerFactory; 042 043import java.util.*; 044 045/** 046 * 047 * The base class for DNA, RNA and Protein sequences. 048 * @param <C> 049 */ 050public abstract class AbstractSequence<C extends Compound> implements Sequence<C> { 051 052 private final static Logger logger = LoggerFactory.getLogger(AbstractSequence.class); 053 054 private TaxonomyID taxonomy; 055 private AccessionID accession; 056 private SequenceReader<C> sequenceStorage = null; 057 private CompoundSet<C> compoundSet; 058 private AnnotationType annotationType = AnnotationType.UNKNOWN; 059 private String description; 060 private String originalHeader; 061 private Collection<Object> userCollection; 062 private Integer bioBegin = null; 063 private Integer bioEnd = null; 064 private AbstractSequence<?> parentSequence = null; 065 private String source = null; 066 private ArrayList<String> notesList = new ArrayList<String>(); 067 private Double sequenceScore = null; 068 private FeaturesKeyWordInterface featuresKeyWord = null; 069 private DatabaseReferenceInterface databaseReferences = null; 070 private FeatureRetriever featureRetriever = null; 071 private ArrayList<FeatureInterface<AbstractSequence<C>, C>> features = 072 new ArrayList<FeatureInterface<AbstractSequence<C>, C>>(); 073 private LinkedHashMap<String, ArrayList<FeatureInterface<AbstractSequence<C>, C>>> groupedFeatures = 074 new LinkedHashMap<String, ArrayList<FeatureInterface<AbstractSequence<C>, C>>>(); 075 076 public AbstractSequence() { 077 } 078 079 /** 080 * Create a Sequence from a simple string where the values should be found in compoundSet 081 * @param seqString 082 * @param compoundSet 083 * @throws CompoundNotFoundException 084 */ 085 public AbstractSequence(String seqString, CompoundSet<C> compoundSet) throws CompoundNotFoundException { 086 setCompoundSet(compoundSet); 087 sequenceStorage = new ArrayListSequenceReader<C>(); 088 sequenceStorage.setCompoundSet(this.getCompoundSet()); 089 sequenceStorage.setContents(seqString); 090 } 091 092 /** 093 * A ProxySequenceReader allows abstraction of both the storage of the sequence data and the location 094 * of the sequence data. A variety of use cases are possible. A ProxySequenceReader that knows the offset and of the sequence in 095 * a large fasta file. A ProxySequenceReader that can pull Sequence data from UniProt, NCBI or a custom database. 096 * If the ProxySequenceReader implements various interfaces then the sequence will set those interfaces so that calls to 097 * various methods will be valid. 098 * 099 * @param proxyLoader 100 * @param compoundSet 101 */ 102 public AbstractSequence(SequenceReader<C> proxyLoader, CompoundSet<C> compoundSet) { 103 setCompoundSet(compoundSet); 104 setProxySequenceReader(proxyLoader); 105 } 106 107 /** 108 * Very important method that allows external mappings of sequence data and features. This method 109 * will gain additional interface inspection that allows external data sources with knowledge 110 * of features for a sequence to be supported. 111 * 112 * @param proxyLoader 113 */ 114 public void setProxySequenceReader(SequenceReader<C> proxyLoader) { 115 this.sequenceStorage = proxyLoader; 116 if (proxyLoader instanceof FeaturesKeyWordInterface) { 117 this.setFeaturesKeyWord((FeaturesKeyWordInterface) sequenceStorage); 118 } 119 if (proxyLoader instanceof DatabaseReferenceInterface) { 120 this.setDatabaseReferences((DatabaseReferenceInterface) sequenceStorage); 121 } 122 123 if (proxyLoader instanceof FeatureRetriever) { 124 this.setFeatureRetriever((FeatureRetriever) sequenceStorage); 125 HashMap<String, ArrayList<AbstractFeature>> ff = getFeatureRetriever().getFeatures(); 126 for (String k: ff.keySet()){ 127 for (AbstractFeature f: ff.get(k)){ 128 this.addFeature(f); 129 } 130 } 131 // success of next statement guaranteed because source is a compulsory field 132 //DBReferenceInfo dbQualifier = (DBReferenceInfo)ff.get("source").get(0).getQualifiers().get("db_xref"); 133 ArrayList<DBReferenceInfo> dbQualifiers = (ArrayList)ff.get("source").get(0).getQualifiers().get("db_xref"); 134 DBReferenceInfo dbQualifier = dbQualifiers.get(0); 135 136 if (dbQualifier != null) this.setTaxonomy(new TaxonomyID(dbQualifier.getDatabase()+":"+dbQualifier.getId(), DataSource.UNKNOWN)); 137 } 138 139 if(getAccession() == null && proxyLoader instanceof UniprotProxySequenceReader){ // we have lots of unsupported operations for this call so quick fix to allow this tow rork 140 this.setAccession(proxyLoader.getAccession()); 141 } 142 } 143 144 public SequenceReader<C> getProxySequenceReader() { 145 return sequenceStorage; 146 } 147 148 /** 149 * @return the bioBegin 150 */ 151 public Integer getBioBegin() { 152 if (bioBegin == null) { 153 return 1; 154 } else { 155 return bioBegin; 156 } 157 } 158 159 /** 160 * @param bioBegin the bioBegin to set 161 */ 162 public void setBioBegin(Integer begin) { 163 this.bioBegin = begin; 164 } 165 166 /** 167 * @return the bioEnd 168 */ 169 public Integer getBioEnd() { 170 if (bioEnd == null) { 171 return this.getLength(); 172 } else { 173 return bioEnd; 174 } 175 } 176 177 /** 178 * @param bioEnd the bioEnd to set 179 */ 180 public void setBioEnd(Integer end) { 181 this.bioEnd = end; 182 } 183 184 /** 185 * Provided for convince if the developer needs to associate data with a sequence 186 * 187 * @return 188 */ 189 public Collection<Object> getUserCollection() { 190 191 return userCollection; 192 } 193 194 /** 195 * 196 * @param userCollection 197 */ 198 public void setUserCollection(Collection<Object> userCollection) { 199 this.userCollection = userCollection; 200 } 201 202 /** 203 * @return the annotation 204 */ 205 public AnnotationType getAnnotationType() { 206 return annotationType; 207 } 208 209 /** 210 * @param annotation the annotation to set 211 */ 212 public void setAnnotationType(AnnotationType annotationType) { 213 this.annotationType = annotationType; 214 } 215 216 /** 217 * @return the description 218 */ 219 public String getDescription() { 220 return description; 221 } 222 223 /** 224 * @param description the description to set 225 */ 226 public void setDescription(String description) { 227 this.description = description; 228 } 229 230 /** 231 * @return the originalHeader 232 */ 233 public String getOriginalHeader() { 234 return originalHeader; 235 } 236 237 /** 238 * @param originalHeader the originalHeader to set 239 */ 240 public void setOriginalHeader(String originalHeader) { 241 this.originalHeader = originalHeader; 242 } 243 244 /** 245 * @return the parentSequence 246 */ 247 public AbstractSequence<?> getParentSequence() { 248 return parentSequence; 249 } 250 251 /** 252 * @param parentSequence the parentSequence to set 253 */ 254 public void setParentSequence(AbstractSequence<?> parentSequence) { 255 this.parentSequence = parentSequence; 256 } 257 258 /** 259 * Added support for the source of this sequence for GFF3 export 260 * If a sub sequence doesn't have source then check for parent source 261 * @return the source 262 */ 263 public String getSource() { 264 if (source != null) { 265 return source; 266 } 267 if (parentSequence != null) { 268 return parentSequence.getSource(); 269 } 270 return null; 271 } 272 273 /** 274 * Added support for the source of this sequence for GFF3 export 275 * @param source the source to set 276 */ 277 public void setSource(String source) { 278 279 this.source = source; 280 } 281 282 /** 283 * Add notes about this sequence that will get exported for GFF3 284 * @param note 285 */ 286 public void addNote(String note) { 287 notesList.add(note); 288 } 289 290 public void removeNote(String note) { 291 notesList.remove(note); 292 } 293 294 /** 295 * @return the notesList 296 */ 297 public ArrayList<String> getNotesList() { 298 return notesList; 299 } 300 301 /** 302 * @param notesList the notesList to set 303 */ 304 public void setNotesList(ArrayList<String> notesList) { 305 this.notesList = notesList; 306 } 307 308 /** 309 * Provide place holder for a metric that indicate a score associated with the sequence 310 * @return the sequenceScore 311 */ 312 public Double getSequenceScore() { 313 return sequenceScore; 314 } 315 316 /** 317 * @param sequenceScore the sequenceScore to set 318 */ 319 public void setSequenceScore(Double sequenceScore) { 320 this.sequenceScore = sequenceScore; 321 } 322 323 /** 324 * Return features at a sequence position by type 325 * @param featureType 326 * @param bioSequencePosition 327 * @return 328 */ 329 public List<FeatureInterface<AbstractSequence<C>, C>> getFeatures(String featureType, int bioSequencePosition) { 330 ArrayList<FeatureInterface<AbstractSequence<C>, C>> featureHits = 331 new ArrayList<FeatureInterface<AbstractSequence<C>, C>>(); 332 List<FeatureInterface<AbstractSequence<C>, C>> features = getFeaturesByType(featureType); 333 if (features != null) { 334 for (FeatureInterface<AbstractSequence<C>, C> feature : features) { 335 if (bioSequencePosition >= feature.getLocations().getStart().getPosition() && bioSequencePosition <= feature.getLocations().getEnd().getPosition()) { 336 featureHits.add(feature); 337 } 338 } 339 } 340 return featureHits; 341 } 342 343 /** 344 * Return features at a sequence position 345 * @param featureType 346 * @param bioSequencePosition 347 * @return 348 */ 349 public List<FeatureInterface<AbstractSequence<C>, C>> getFeatures(int bioSequencePosition) { 350 ArrayList<FeatureInterface<AbstractSequence<C>, C>> featureHits = 351 new ArrayList<FeatureInterface<AbstractSequence<C>, C>>(); 352 if (features != null) { 353 for (FeatureInterface<AbstractSequence<C>, C> feature : features) { 354 if (bioSequencePosition >= feature.getLocations().getStart().getPosition() && bioSequencePosition <= feature.getLocations().getEnd().getPosition()) { 355 featureHits.add(feature); 356 } 357 } 358 } 359 return featureHits; 360 } 361 362 /** 363 * 364 * @return 365 */ 366 public List<FeatureInterface<AbstractSequence<C>, C>> getFeatures() { 367 return features; 368 } 369 370 /** 371 * Method to help set the proper details for a feature as it relates to a sequence 372 * where the feature needs to have a location on the sequence 373 * @param bioStart 374 * @param bioEnd 375 * @param feature 376 */ 377 public void addFeature(int bioStart, int bioEnd, FeatureInterface<AbstractSequence<C>, C> feature) { 378 SequenceLocation<AbstractSequence<C>, C> sequenceLocation = 379 new SequenceLocation<AbstractSequence<C>, C>(bioStart, bioEnd, this); 380 feature.setLocation(sequenceLocation); 381 addFeature(feature); 382 } 383 384 /** 385 * Add a feature to this sequence. The feature will be added to the collection where the order is start position and if more than 386 * one feature at the same start position then longest is added first. This helps on doing feature layout for displaying features 387 * in SequenceFeaturePanel 388 * @param feature 389 */ 390 public void addFeature(FeatureInterface<AbstractSequence<C>, C> feature) { 391 features.add(feature); 392 ArrayList<FeatureInterface<AbstractSequence<C>, C>> featureList = groupedFeatures.get(feature.getType()); 393 if (featureList == null) { 394 featureList = new ArrayList<FeatureInterface<AbstractSequence<C>, C>>(); 395 groupedFeatures.put(feature.getType(), featureList); 396 } 397 featureList.add(feature); 398 Collections.sort(features, AbstractFeature.LOCATION_LENGTH); 399 Collections.sort(featureList, AbstractFeature.LOCATION_LENGTH); 400 } 401 402 /** 403 * Remove a feature from the sequence 404 * @param feature 405 */ 406 public void removeFeature(FeatureInterface<AbstractSequence<C>, C> feature) { 407 features.remove(feature); 408 ArrayList<FeatureInterface<AbstractSequence<C>, C>> featureList = groupedFeatures.get(feature.getType()); 409 if (featureList != null) { 410 featureList.remove(feature); 411 if (featureList.isEmpty()) { 412 groupedFeatures.remove(feature.getType()); 413 } 414 } 415 } 416 417 /** 418 * 419 * @param type 420 * @return 421 */ 422 public List<FeatureInterface<AbstractSequence<C>, C>> getFeaturesByType(String type) { 423 List<FeatureInterface<AbstractSequence<C>, C>> features = groupedFeatures.get(type); 424 if (features == null) { 425 features = new ArrayList<FeatureInterface<AbstractSequence<C>, C>>(); 426 } 427 return features; 428 } 429 430 /** 431 * @return the featuresKeyWord 432 */ 433 public FeaturesKeyWordInterface getFeaturesKeyWord() { 434 return featuresKeyWord; 435 } 436 437 /** 438 * @param featuresKeyWord the featuresKeyWord to set 439 */ 440 public void setFeaturesKeyWord(FeaturesKeyWordInterface featuresKeyWord) { 441 this.featuresKeyWord = featuresKeyWord; 442 } 443 444 /** 445 * @return the databaseReferences 446 */ 447 public DatabaseReferenceInterface getDatabaseReferences() { 448 return databaseReferences; 449 } 450 451 /** 452 * @param databaseReferences the databaseReferences to set 453 */ 454 public void setDatabaseReferences(DatabaseReferenceInterface databaseReferences) { 455 this.databaseReferences = databaseReferences; 456 } 457 458 public FeatureRetriever getFeatureRetriever() { 459 return featureRetriever; 460 } 461 462 public void setFeatureRetriever(FeatureRetriever featureRetriever) { 463 this.featureRetriever = featureRetriever; 464 } 465 466 467 468 public enum AnnotationType { 469 470 CURATED, PREDICTED, UNKNOWN; 471 } 472 473 /** 474 * @return the accession 475 */ 476 @Override 477 public AccessionID getAccession() { 478 return accession; 479 } 480 481 /** 482 * @param accession the accession to set 483 */ 484 public void setAccession(AccessionID accession) { 485 this.accession = accession; 486 } 487 488 /** 489 * @return the species 490 */ 491 public TaxonomyID getTaxonomy() { 492 return taxonomy; 493 } 494 495 /** 496 * @param species the species to set 497 */ 498 public void setTaxonomy(TaxonomyID taxonomy) { 499 this.taxonomy = taxonomy; 500 } 501 502 @Override 503 public CompoundSet<C> getCompoundSet() { 504 if (compoundSet != null) { 505 return compoundSet; 506 } 507 // This is invalid since the parentSequence isn't guaranteed to have the same compound set as this sequence, 508 // e.g., the case where the parent sequence for a protein is a CDS. 509 /* 510 if (parentSequence != null) { 511 return parentSequence.getCompoundSet(); 512 } 513 */ 514 return null; 515 516 517 } 518 519 public void setCompoundSet(CompoundSet<C> compoundSet) { 520 this.compoundSet = compoundSet; 521 } 522 523 @Override 524 public String toString() { 525 return getSequenceAsString(); 526 } 527 528 private SequenceReader<C> getSequenceStorage() { 529 if (sequenceStorage != null) { 530 return sequenceStorage; 531 } 532 if (parentSequence != null) { 533 534 //return parentSequence.getSequenceStorage(); 535 536 if ( this.compoundSet.equals(parentSequence.getCompoundSet())){ 537 sequenceStorage = new ArrayListSequenceReader<C>(); 538 sequenceStorage.setCompoundSet(this.getCompoundSet()); 539 try { 540 sequenceStorage.setContents(parentSequence.getSequenceAsString()); 541 } catch (CompoundNotFoundException e) { 542 // TODO is there a better way to handle this exception? 543 logger.error("Problem setting contents from parent sequence, some unrecognised compound: {}",e.getMessage()); 544 } 545 return sequenceStorage; 546 } 547 548 } 549 550 return null; 551 } 552 553 /** 554 * 555 * @param begin 556 * @param end 557 * @param strand 558 * @return 559 */ 560 public String getSequenceAsString(Integer bioStart, Integer bioEnd, Strand strand) { 561 562 Location loc = new SimpleLocation(bioStart, bioEnd, strand); 563 return loc.getSubSequence(this).getSequenceAsString(); 564 } 565 566 /** 567 * Default case is to assume strand is positive because only CDSSequence can be either positive or negative Strand. 568 * @return 569 */ 570 @Override 571 public String getSequenceAsString() { 572 return SequenceMixin.toString(this); 573 574 } 575 576 /** 577 * 578 * @return 579 */ 580 @Override 581 public List<C> getAsList() { 582 return SequenceMixin.toList(this); 583 } 584 585 /** 586 * 587 * @param position The 1-indexed position of the amino acid 588 * @return 589 */ 590 @Override 591 public C getCompoundAt(int position) { 592 593 return getSequenceStorage().getCompoundAt(position); 594 } 595 596 /** 597 * 598 * @param compound 599 * @return The first index of compound in this sequence (1-based) 600 */ 601 @Override 602 public int getIndexOf(C compound) { 603 return getSequenceStorage().getIndexOf(compound); 604 } 605 606 /** 607 * 608 * @param compound 609 * @return The last index of compound in this sequence (1-based) 610 */ 611 @Override 612 public int getLastIndexOf(C compound) { 613 return getSequenceStorage().getLastIndexOf(compound); 614 } 615 616 /** 617 * 618 * @return 619 */ 620 @Override 621 public int getLength() { 622 return getSequenceStorage().getLength(); 623 } 624 625 /** 626 * 627 * @param bioStart 628 * @param bioEnd 629 * @return 630 */ 631 @Override 632 public SequenceView<C> getSubSequence(final Integer bioStart, final Integer bioEnd) { 633 return new SequenceProxyView<C>(this, bioStart, bioEnd); 634 } 635 636 /** 637 * 638 * @return 639 */ 640 @Override 641 public Iterator<C> iterator() { 642 return getSequenceStorage().iterator(); 643 } 644 645 /** 646 * 647 * @param compounds 648 * @return 649 */ 650 @Override 651 public int countCompounds(C... compounds) { 652 return SequenceMixin.countCompounds(this, compounds); 653 } 654 655 /** 656 * 657 * @return 658 */ 659 @Override 660 public SequenceView<C> getInverse() { 661 return SequenceMixin.inverse(this); 662 } 663 664 //TODO needs equals and hashcode 665}