001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 * Created on 01-21-2010 021 * 022 * @author Richard Holland 023 * @author Scooter Willis 024 * @author Paolo Pavan 025 * 026 */ 027package org.biojava.nbio.core.sequence.template; 028 029import org.biojava.nbio.core.exceptions.CompoundNotFoundException; 030import org.biojava.nbio.core.sequence.AccessionID; 031import org.biojava.nbio.core.sequence.DataSource; 032import org.biojava.nbio.core.sequence.Strand; 033import org.biojava.nbio.core.sequence.TaxonomyID; 034import org.biojava.nbio.core.sequence.features.*; 035import org.biojava.nbio.core.sequence.loader.UniprotProxySequenceReader; 036import org.biojava.nbio.core.sequence.location.SequenceLocation; 037import org.biojava.nbio.core.sequence.location.SimpleLocation; 038import org.biojava.nbio.core.sequence.location.template.Location; 039import org.biojava.nbio.core.sequence.reference.AbstractReference; 040import org.biojava.nbio.core.sequence.storage.ArrayListSequenceReader; 041import org.biojava.nbio.core.util.Equals; 042import org.slf4j.Logger; 043import org.slf4j.LoggerFactory; 044 045import java.util.*; 046 047/** 048 * 049 * The base class for DNA, RNA and Protein sequences. 050 * @param <C> the compound type 051 */ 052public abstract class AbstractSequence<C extends Compound> implements Sequence<C> { 053 054 private final static Logger logger = LoggerFactory.getLogger(AbstractSequence.class); 055 056 private TaxonomyID taxonomy; 057 private AccessionID accession; 058 private SequenceReader<C> sequenceStorage = null; 059 private CompoundSet<C> compoundSet; 060 private AnnotationType annotationType = AnnotationType.UNKNOWN; 061 private String description; 062 private String originalHeader; 063 private Collection<Object> userCollection; 064 private Integer bioBegin = null; 065 private Integer bioEnd = null; 066 private AbstractSequence<?> parentSequence = null; 067 private String source = null; 068 private List<String> notesList = new ArrayList<>(); 069 private Double sequenceScore = null; 070 private FeaturesKeyWordInterface featuresKeyWord = null; 071 private DatabaseReferenceInterface databaseReferences = null; 072 private FeatureRetriever featureRetriever = null; 073 private List<FeatureInterface<AbstractSequence<C>, C>> features = 074 new ArrayList<>(); 075 private Map<String, List<FeatureInterface<AbstractSequence<C>, C>>> groupedFeatures = 076 new LinkedHashMap<>(); 077 private List<String> comments = new ArrayList<>(); 078 private List<AbstractReference> references; 079 080 public AbstractSequence() { 081 } 082 083 /** 084 * Create a Sequence from a simple string where the values should be found in compoundSet 085 * @param seqString 086 * @param compoundSet 087 * @throws CompoundNotFoundException 088 */ 089 public AbstractSequence(String seqString, CompoundSet<C> compoundSet) throws CompoundNotFoundException { 090 setCompoundSet(compoundSet); 091 initSequenceStorage(seqString); 092 } 093 094 // so it can be called from subclass constructors 095 protected void initSequenceStorage(String seqString) throws CompoundNotFoundException { 096 sequenceStorage = new ArrayListSequenceReader<>(); 097 sequenceStorage.setCompoundSet(this.getCompoundSet()); 098 sequenceStorage.setContents(seqString); 099 } 100 101 /** 102 * A ProxySequenceReader allows abstraction of both the storage of the sequence data and the location 103 * of the sequence data. A variety of use cases are possible. A ProxySequenceReader that knows the offset and of the sequence in 104 * a large fasta file. A ProxySequenceReader that can pull Sequence data from UniProt, NCBI or a custom database. 105 * If the ProxySequenceReader implements various interfaces then the sequence will set those interfaces so that calls to 106 * various methods will be valid. 107 * 108 * @param proxyLoader 109 * @param compoundSet 110 */ 111 public AbstractSequence(SequenceReader<C> proxyLoader, CompoundSet<C> compoundSet) { 112 setCompoundSet(compoundSet); 113 setProxySequenceReader(proxyLoader); 114 } 115 116 /** 117 * Very important method that allows external mappings of sequence data and features. This method 118 * will gain additional interface inspection that allows external data sources with knowledge 119 * of features for a sequence to be supported. 120 * 121 * @param proxyLoader 122 */ 123 public void setProxySequenceReader(SequenceReader<C> proxyLoader) { 124 this.sequenceStorage = proxyLoader; 125 if (proxyLoader instanceof FeaturesKeyWordInterface) { 126 this.setFeaturesKeyWord((FeaturesKeyWordInterface) sequenceStorage); 127 } 128 if (proxyLoader instanceof DatabaseReferenceInterface) { 129 this.setDatabaseReferences((DatabaseReferenceInterface) sequenceStorage); 130 } 131 132 if (proxyLoader instanceof FeatureRetriever) { 133 this.setFeatureRetriever((FeatureRetriever) sequenceStorage); 134 Map<String, List<AbstractFeature<AbstractSequence<C>, C>>> ff = getFeatureRetriever().getFeatures(); 135 for (String k: ff.keySet()){ 136 for (AbstractFeature f: ff.get(k)){ 137 this.addFeature(f); 138 } 139 } 140 // success of next statement guaranteed because source is a compulsory field 141 //DBReferenceInfo dbQualifier = (DBReferenceInfo)ff.get("source").get(0).getQualifiers().get("db_xref"); 142 List<DBReferenceInfo> dbQualifiers = (ArrayList)ff.get("source").get(0).getQualifiers().get("db_xref"); 143 DBReferenceInfo dbQualifier = dbQualifiers.get(0); 144 145 if (dbQualifier != null) this.setTaxonomy(new TaxonomyID(dbQualifier.getDatabase()+":"+dbQualifier.getId(), DataSource.UNKNOWN)); 146 } 147 148 if(getAccession() == null && proxyLoader instanceof UniprotProxySequenceReader){ // we have lots of unsupported operations for this call so quick fix to allow this tow rork 149 this.setAccession(proxyLoader.getAccession()); 150 } 151 } 152 153 public SequenceReader<C> getProxySequenceReader() { 154 return sequenceStorage; 155 } 156 157 /** 158 * @return the bioBegin 159 */ 160 public Integer getBioBegin() { 161 if (bioBegin == null) { 162 return 1; 163 } else { 164 return bioBegin; 165 } 166 } 167 168 /** 169 * @param bioBegin the bioBegin to set 170 */ 171 public void setBioBegin(Integer bioBegin) { 172 this.bioBegin = bioBegin; 173 } 174 175 /** 176 * @return the bioEnd 177 */ 178 public Integer getBioEnd() { 179 if (bioEnd == null) { 180 return this.getLength(); 181 } else { 182 return bioEnd; 183 } 184 } 185 186 /** 187 * @param bioEnd the bioEnd to set 188 */ 189 public void setBioEnd(Integer bioEnd) { 190 this.bioEnd = bioEnd; 191 } 192 193 /** 194 * Provided for convince if the developer needs to associate data with a sequence 195 * 196 * @return 197 */ 198 public Collection<Object> getUserCollection() { 199 200 return userCollection; 201 } 202 203 /** 204 * 205 * @param userCollection 206 */ 207 public void setUserCollection(Collection<Object> userCollection) { 208 this.userCollection = userCollection; 209 } 210 211 /** 212 * @return the annotation 213 */ 214 public AnnotationType getAnnotationType() { 215 return annotationType; 216 } 217 218 /** 219 * @param annotationType the annotation to set 220 */ 221 public void setAnnotationType(AnnotationType annotationType) { 222 this.annotationType = annotationType; 223 } 224 225 /** 226 * @return the description 227 */ 228 public String getDescription() { 229 return description; 230 } 231 232 /** 233 * @param description the description to set 234 */ 235 public void setDescription(String description) { 236 this.description = description; 237 } 238 239 /** 240 * @return the originalHeader 241 */ 242 public String getOriginalHeader() { 243 return originalHeader; 244 } 245 246 /** 247 * @param originalHeader the originalHeader to set 248 */ 249 public void setOriginalHeader(String originalHeader) { 250 this.originalHeader = originalHeader; 251 } 252 253 /** 254 * @return the parentSequence 255 */ 256 public AbstractSequence<?> getParentSequence() { 257 return parentSequence; 258 } 259 260 /** 261 * @param parentSequence the parentSequence to set 262 */ 263 public void setParentSequence(AbstractSequence<?> parentSequence) { 264 this.parentSequence = parentSequence; 265 } 266 267 /** 268 * Added support for the source of this sequence for GFF3 export 269 * If a sub sequence doesn't have source then check for parent source 270 * @return the source 271 */ 272 public String getSource() { 273 if (source != null) { 274 return source; 275 } 276 if (parentSequence != null) { 277 return parentSequence.getSource(); 278 } 279 return null; 280 } 281 282 /** 283 * Added support for the source of this sequence for GFF3 export 284 * @param source the source to set 285 */ 286 public void setSource(String source) { 287 288 this.source = source; 289 } 290 291 /** 292 * Add notes about this sequence that will get exported for GFF3 293 * @param note 294 */ 295 public void addNote(String note) { 296 notesList.add(note); 297 } 298 299 public void removeNote(String note) { 300 notesList.remove(note); 301 } 302 303 /** 304 * @return the notesList 305 */ 306 public List<String> getNotesList() { 307 return notesList; 308 } 309 310 /** 311 * @param notesList the notesList to set 312 */ 313 public void setNotesList(List<String> notesList) { 314 this.notesList = notesList; 315 } 316 317 /** 318 * Provide place holder for a metric that indicate a score associated with the sequence 319 * @return the sequenceScore 320 */ 321 public Double getSequenceScore() { 322 return sequenceScore; 323 } 324 325 /** 326 * @param sequenceScore the sequenceScore to set 327 */ 328 public void setSequenceScore(Double sequenceScore) { 329 this.sequenceScore = sequenceScore; 330 } 331 332 /** 333 * @since 5.0.0 334 * @return the list of {@link AbstractReference} 335 */ 336 public List<AbstractReference> getReferences() { 337 return references; 338 } 339 340 /** 341 * Set the list of {@link AbstractReference} 342 * @since 5.0.0 343 * @param references 344 */ 345 public void setReferences(List<AbstractReference> references) { 346 this.references = references; 347 } 348 349 /** 350 * Return features at a sequence position by type 351 * @param featureType 352 * @param bioSequencePosition 353 * @return 354 */ 355 public List<FeatureInterface<AbstractSequence<C>, C>> getFeatures(String featureType, int bioSequencePosition) { 356 List<FeatureInterface<AbstractSequence<C>, C>> featureHits = 357 new ArrayList<>(); 358 List<FeatureInterface<AbstractSequence<C>, C>> features = getFeaturesByType(featureType); 359 if (features != null) { 360 for (FeatureInterface<AbstractSequence<C>, C> feature : features) { 361 if (bioSequencePosition >= feature.getLocations().getStart().getPosition() && bioSequencePosition <= feature.getLocations().getEnd().getPosition()) { 362 featureHits.add(feature); 363 } 364 } 365 } 366 return featureHits; 367 } 368 369 /** 370 * Return features at a sequence position 371 * @param bioSequencePosition 372 * @return 373 */ 374 public List<FeatureInterface<AbstractSequence<C>, C>> getFeatures(int bioSequencePosition) { 375 List<FeatureInterface<AbstractSequence<C>, C>> featureHits = 376 new ArrayList<>(); 377 if (features != null) { 378 for (FeatureInterface<AbstractSequence<C>, C> feature : features) { 379 if (bioSequencePosition >= feature.getLocations().getStart().getPosition() && bioSequencePosition <= feature.getLocations().getEnd().getPosition()) { 380 featureHits.add(feature); 381 } 382 } 383 } 384 return featureHits; 385 } 386 387 /** 388 * 389 * @return 390 */ 391 public List<FeatureInterface<AbstractSequence<C>, C>> getFeatures() { 392 return features; 393 } 394 395 /** 396 * Method to help set the proper details for a feature as it relates to a sequence 397 * where the feature needs to have a location on the sequence 398 * @param bioStart 399 * @param bioEnd 400 * @param feature 401 */ 402 public void addFeature(int bioStart, int bioEnd, FeatureInterface<AbstractSequence<C>, C> feature) { 403 SequenceLocation<AbstractSequence<C>, C> sequenceLocation = 404 new SequenceLocation<>(bioStart, bioEnd, this); 405 feature.setLocation(sequenceLocation); 406 addFeature(feature); 407 } 408 409 /** 410 * Add a feature to this sequence. The feature will be added to the collection where the order is start position and if more than 411 * one feature at the same start position then longest is added first. This helps on doing feature layout for displaying features 412 * in SequenceFeaturePanel 413 * @param feature 414 */ 415 public void addFeature(FeatureInterface<AbstractSequence<C>, C> feature) { 416 features.add(feature); 417 List<FeatureInterface<AbstractSequence<C>, C>> featureList = groupedFeatures.get(feature.getType()); 418 if (featureList == null) { 419 featureList = new ArrayList<>(); 420 groupedFeatures.put(feature.getType(), featureList); 421 } 422 featureList.add(feature); 423 Collections.sort(features, AbstractFeature.LOCATION_LENGTH); 424 Collections.sort(featureList, AbstractFeature.LOCATION_LENGTH); 425 } 426 427 /** 428 * Remove a feature from the sequence 429 * @param feature 430 */ 431 public void removeFeature(FeatureInterface<AbstractSequence<C>, C> feature) { 432 features.remove(feature); 433 List<FeatureInterface<AbstractSequence<C>, C>> featureList = groupedFeatures.get(feature.getType()); 434 if (featureList != null) { 435 featureList.remove(feature); 436 if (featureList.isEmpty()) { 437 groupedFeatures.remove(feature.getType()); 438 } 439 } 440 } 441 442 /** 443 * 444 * @param type 445 * @return 446 */ 447 public List<FeatureInterface<AbstractSequence<C>, C>> getFeaturesByType(String type) { 448 List<FeatureInterface<AbstractSequence<C>, C>> features = groupedFeatures.get(type); 449 if (features == null) { 450 features = new ArrayList<>(); 451 } 452 return features; 453 } 454 455 /** 456 * 457 * @return comments 458 */ 459 public List<String> getComments() { 460 return comments; 461 } 462 463 /** 464 * Set comments. 465 * @param comments 466 */ 467 public void setComments(List<String> comments) { 468 this.comments = comments; 469 } 470 471 /** 472 * @return the featuresKeyWord 473 */ 474 public FeaturesKeyWordInterface getFeaturesKeyWord() { 475 return featuresKeyWord; 476 } 477 478 /** 479 * @param featuresKeyWord the featuresKeyWord to set 480 */ 481 public void setFeaturesKeyWord(FeaturesKeyWordInterface featuresKeyWord) { 482 this.featuresKeyWord = featuresKeyWord; 483 } 484 485 /** 486 * @return the databaseReferences 487 */ 488 public DatabaseReferenceInterface getDatabaseReferences() { 489 return databaseReferences; 490 } 491 492 /** 493 * @param databaseReferences the databaseReferences to set 494 */ 495 public void setDatabaseReferences(DatabaseReferenceInterface databaseReferences) { 496 this.databaseReferences = databaseReferences; 497 } 498 499 public FeatureRetriever getFeatureRetriever() { 500 return featureRetriever; 501 } 502 503 public void setFeatureRetriever(FeatureRetriever featureRetriever) { 504 this.featureRetriever = featureRetriever; 505 } 506 507 508 509 public enum AnnotationType { 510 511 CURATED, PREDICTED, UNKNOWN; 512 } 513 514 /** 515 * @return the accession 516 */ 517 @Override 518 public AccessionID getAccession() { 519 return accession; 520 } 521 522 /** 523 * @param accession the accession to set 524 */ 525 public void setAccession(AccessionID accession) { 526 this.accession = accession; 527 } 528 529 /** 530 * @return the species 531 */ 532 public TaxonomyID getTaxonomy() { 533 return taxonomy; 534 } 535 536 /** 537 * @param taxonomy the species to set 538 */ 539 public void setTaxonomy(TaxonomyID taxonomy) { 540 this.taxonomy = taxonomy; 541 } 542 543 @Override 544 public CompoundSet<C> getCompoundSet() { 545 if (compoundSet != null) { 546 return compoundSet; 547 } 548 // This is invalid since the parentSequence isn't guaranteed to have the same compound set as this sequence, 549 // e.g., the case where the parent sequence for a protein is a CDS. 550 /* 551 if (parentSequence != null) { 552 return parentSequence.getCompoundSet(); 553 } 554 */ 555 return null; 556 557 558 } 559 560 public void setCompoundSet(CompoundSet<C> compoundSet) { 561 this.compoundSet = compoundSet; 562 } 563 564 @Override 565 public boolean equals(Object o){ 566 567 if(! Equals.classEqual(this, o)) { 568 return false; 569 } 570 571 Sequence<C> other = (Sequence<C>)o; 572 573 if ( other.getCompoundSet() != getCompoundSet()) 574 return false; 575 576 577 List<C> rawCompounds = getAsList(); 578 List<C> otherCompounds = other.getAsList(); 579 580 if ( rawCompounds.size() != otherCompounds.size()) 581 return false; 582 583 for (int i = 0 ; i < rawCompounds.size() ; i++){ 584 Compound myCompound = rawCompounds.get(i); 585 Compound otherCompound = otherCompounds.get(i); 586 if ( ! myCompound.equalsIgnoreCase(otherCompound)) 587 return false; 588 } 589 return true; 590 } 591 592 @Override 593 public int hashCode(){ 594 String s = getSequenceAsString(); 595 return s.hashCode(); 596 } 597 598 @Override 599 public String toString() { 600 return getSequenceAsString(); 601 } 602 603 private SequenceReader<C> getSequenceStorage() { 604 if (sequenceStorage != null) { 605 return sequenceStorage; 606 } 607 if (parentSequence != null) { 608 609 //return parentSequence.getSequenceStorage(); 610 611 if ( this.compoundSet.equals(parentSequence.getCompoundSet())){ 612 sequenceStorage = new ArrayListSequenceReader<>(); 613 sequenceStorage.setCompoundSet(this.getCompoundSet()); 614 try { 615 sequenceStorage.setContents(parentSequence.getSequenceAsString()); 616 } catch (CompoundNotFoundException e) { 617 // TODO is there a better way to handle this exception? 618 logger.error("Problem setting contents from parent sequence, some unrecognised compound: {}",e.getMessage()); 619 } 620 return sequenceStorage; 621 } 622 623 } 624 625 return null; 626 } 627 628 /** 629 * 630 * @param bioStart 631 * @param bioEnd 632 * @param strand 633 * @return 634 */ 635 public String getSequenceAsString(Integer bioStart, Integer bioEnd, Strand strand) { 636 637 Location loc = new SimpleLocation(bioStart, bioEnd, strand); 638 return loc.getSubSequence(this).getSequenceAsString(); 639 } 640 641 /** 642 * Default case is to assume strand is positive because only CDSSequence can be either positive or negative Strand. 643 * @return 644 */ 645 @Override 646 public String getSequenceAsString() { 647 return SequenceMixin.toString(this); 648 649 } 650 651 /** 652 * 653 * @return 654 */ 655 @Override 656 public List<C> getAsList() { 657 658 return sequenceStorage.getAsList(); 659 } 660 661 /** 662 * 663 * @param position The 1-indexed position of the amino acid 664 * @return 665 */ 666 @Override 667 public C getCompoundAt(int position) { 668 669 return getSequenceStorage().getCompoundAt(position); 670 } 671 672 /** 673 * 674 * @param compound 675 * @return The first index of compound in this sequence (1-based) 676 */ 677 @Override 678 public int getIndexOf(C compound) { 679 return getSequenceStorage().getIndexOf(compound); 680 } 681 682 /** 683 * 684 * @param compound 685 * @return The last index of compound in this sequence (1-based) 686 */ 687 @Override 688 public int getLastIndexOf(C compound) { 689 return getSequenceStorage().getLastIndexOf(compound); 690 } 691 692 /** 693 * 694 * @return 695 */ 696 @Override 697 public int getLength() { 698 return getSequenceStorage().getLength(); 699 } 700 701 /** 702 * 703 * @param bioStart 704 * @param bioEnd 705 * @return 706 */ 707 @Override 708 public SequenceView<C> getSubSequence(final Integer bioStart, final Integer bioEnd) { 709 return new SequenceProxyView<>(this, bioStart, bioEnd); 710 } 711 712 /** 713 * 714 * @return 715 */ 716 @Override 717 public Iterator<C> iterator() { 718 return getSequenceStorage().iterator(); 719 } 720 721 /** 722 * 723 * @param compounds 724 * @return 725 */ 726 @Override 727 public int countCompounds(C... compounds) { 728 return SequenceMixin.countCompounds(this, compounds); 729 } 730 731 /** 732 * 733 * @return 734 */ 735 @Override 736 public SequenceView<C> getInverse() { 737 return SequenceMixin.inverse(this); 738 } 739 740 741}