001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 * Created on 01-21-2010 021 * 022 * @author Richard Holland 023 * @author Scooter Willis 024 * @author Paolo Pavan 025 * 026 */ 027package org.biojava.nbio.core.sequence.template; 028 029import org.biojava.nbio.core.exceptions.CompoundNotFoundException; 030import org.biojava.nbio.core.sequence.AccessionID; 031import org.biojava.nbio.core.sequence.DataSource; 032import org.biojava.nbio.core.sequence.Strand; 033import org.biojava.nbio.core.sequence.TaxonomyID; 034import org.biojava.nbio.core.sequence.features.*; 035import org.biojava.nbio.core.sequence.loader.UniprotProxySequenceReader; 036import org.biojava.nbio.core.sequence.location.SequenceLocation; 037import org.biojava.nbio.core.sequence.location.SimpleLocation; 038import org.biojava.nbio.core.sequence.location.template.Location; 039import org.biojava.nbio.core.sequence.reference.AbstractReference; 040import org.biojava.nbio.core.sequence.storage.ArrayListSequenceReader; 041import org.biojava.nbio.core.util.Equals; 042import org.slf4j.Logger; 043import org.slf4j.LoggerFactory; 044 045import java.util.*; 046 047/** 048 * 049 * The base class for DNA, RNA and Protein sequences. 050 * @param <C> 051 */ 052public abstract class AbstractSequence<C extends Compound> implements Sequence<C> { 053 054 private final static Logger logger = LoggerFactory.getLogger(AbstractSequence.class); 055 056 private TaxonomyID taxonomy; 057 private AccessionID accession; 058 private SequenceReader<C> sequenceStorage = null; 059 private CompoundSet<C> compoundSet; 060 private AnnotationType annotationType = AnnotationType.UNKNOWN; 061 private String description; 062 private String originalHeader; 063 private Collection<Object> userCollection; 064 private Integer bioBegin = null; 065 private Integer bioEnd = null; 066 private AbstractSequence<?> parentSequence = null; 067 private String source = null; 068 private ArrayList<String> notesList = new ArrayList<String>(); 069 private Double sequenceScore = null; 070 private FeaturesKeyWordInterface featuresKeyWord = null; 071 private DatabaseReferenceInterface databaseReferences = null; 072 private FeatureRetriever featureRetriever = null; 073 private ArrayList<FeatureInterface<AbstractSequence<C>, C>> features = 074 new ArrayList<FeatureInterface<AbstractSequence<C>, C>>(); 075 private LinkedHashMap<String, ArrayList<FeatureInterface<AbstractSequence<C>, C>>> groupedFeatures = 076 new LinkedHashMap<String, ArrayList<FeatureInterface<AbstractSequence<C>, C>>>(); 077 private List<String> comments = new ArrayList<>(); 078 private List<AbstractReference> references; 079 080 public AbstractSequence() { 081 } 082 083 /** 084 * Create a Sequence from a simple string where the values should be found in compoundSet 085 * @param seqString 086 * @param compoundSet 087 * @throws CompoundNotFoundException 088 */ 089 public AbstractSequence(String seqString, CompoundSet<C> compoundSet) throws CompoundNotFoundException { 090 setCompoundSet(compoundSet); 091 sequenceStorage = new ArrayListSequenceReader<C>(); 092 sequenceStorage.setCompoundSet(this.getCompoundSet()); 093 sequenceStorage.setContents(seqString); 094 } 095 096 /** 097 * A ProxySequenceReader allows abstraction of both the storage of the sequence data and the location 098 * of the sequence data. A variety of use cases are possible. A ProxySequenceReader that knows the offset and of the sequence in 099 * a large fasta file. A ProxySequenceReader that can pull Sequence data from UniProt, NCBI or a custom database. 100 * If the ProxySequenceReader implements various interfaces then the sequence will set those interfaces so that calls to 101 * various methods will be valid. 102 * 103 * @param proxyLoader 104 * @param compoundSet 105 */ 106 public AbstractSequence(SequenceReader<C> proxyLoader, CompoundSet<C> compoundSet) { 107 setCompoundSet(compoundSet); 108 setProxySequenceReader(proxyLoader); 109 } 110 111 /** 112 * Very important method that allows external mappings of sequence data and features. This method 113 * will gain additional interface inspection that allows external data sources with knowledge 114 * of features for a sequence to be supported. 115 * 116 * @param proxyLoader 117 */ 118 public void setProxySequenceReader(SequenceReader<C> proxyLoader) { 119 this.sequenceStorage = proxyLoader; 120 if (proxyLoader instanceof FeaturesKeyWordInterface) { 121 this.setFeaturesKeyWord((FeaturesKeyWordInterface) sequenceStorage); 122 } 123 if (proxyLoader instanceof DatabaseReferenceInterface) { 124 this.setDatabaseReferences((DatabaseReferenceInterface) sequenceStorage); 125 } 126 127 if (proxyLoader instanceof FeatureRetriever) { 128 this.setFeatureRetriever((FeatureRetriever) sequenceStorage); 129 HashMap<String, ArrayList<AbstractFeature>> ff = getFeatureRetriever().getFeatures(); 130 for (String k: ff.keySet()){ 131 for (AbstractFeature f: ff.get(k)){ 132 this.addFeature(f); 133 } 134 } 135 // success of next statement guaranteed because source is a compulsory field 136 //DBReferenceInfo dbQualifier = (DBReferenceInfo)ff.get("source").get(0).getQualifiers().get("db_xref"); 137 ArrayList<DBReferenceInfo> dbQualifiers = (ArrayList)ff.get("source").get(0).getQualifiers().get("db_xref"); 138 DBReferenceInfo dbQualifier = dbQualifiers.get(0); 139 140 if (dbQualifier != null) this.setTaxonomy(new TaxonomyID(dbQualifier.getDatabase()+":"+dbQualifier.getId(), DataSource.UNKNOWN)); 141 } 142 143 if(getAccession() == null && proxyLoader instanceof UniprotProxySequenceReader){ // we have lots of unsupported operations for this call so quick fix to allow this tow rork 144 this.setAccession(proxyLoader.getAccession()); 145 } 146 } 147 148 public SequenceReader<C> getProxySequenceReader() { 149 return sequenceStorage; 150 } 151 152 /** 153 * @return the bioBegin 154 */ 155 public Integer getBioBegin() { 156 if (bioBegin == null) { 157 return 1; 158 } else { 159 return bioBegin; 160 } 161 } 162 163 /** 164 * @param bioBegin the bioBegin to set 165 */ 166 public void setBioBegin(Integer bioBegin) { 167 this.bioBegin = bioBegin; 168 } 169 170 /** 171 * @return the bioEnd 172 */ 173 public Integer getBioEnd() { 174 if (bioEnd == null) { 175 return this.getLength(); 176 } else { 177 return bioEnd; 178 } 179 } 180 181 /** 182 * @param bioEnd the bioEnd to set 183 */ 184 public void setBioEnd(Integer bioEnd) { 185 this.bioEnd = bioEnd; 186 } 187 188 /** 189 * Provided for convince if the developer needs to associate data with a sequence 190 * 191 * @return 192 */ 193 public Collection<Object> getUserCollection() { 194 195 return userCollection; 196 } 197 198 /** 199 * 200 * @param userCollection 201 */ 202 public void setUserCollection(Collection<Object> userCollection) { 203 this.userCollection = userCollection; 204 } 205 206 /** 207 * @return the annotation 208 */ 209 public AnnotationType getAnnotationType() { 210 return annotationType; 211 } 212 213 /** 214 * @param annotationType the annotation to set 215 */ 216 public void setAnnotationType(AnnotationType annotationType) { 217 this.annotationType = annotationType; 218 } 219 220 /** 221 * @return the description 222 */ 223 public String getDescription() { 224 return description; 225 } 226 227 /** 228 * @param description the description to set 229 */ 230 public void setDescription(String description) { 231 this.description = description; 232 } 233 234 /** 235 * @return the originalHeader 236 */ 237 public String getOriginalHeader() { 238 return originalHeader; 239 } 240 241 /** 242 * @param originalHeader the originalHeader to set 243 */ 244 public void setOriginalHeader(String originalHeader) { 245 this.originalHeader = originalHeader; 246 } 247 248 /** 249 * @return the parentSequence 250 */ 251 public AbstractSequence<?> getParentSequence() { 252 return parentSequence; 253 } 254 255 /** 256 * @param parentSequence the parentSequence to set 257 */ 258 public void setParentSequence(AbstractSequence<?> parentSequence) { 259 this.parentSequence = parentSequence; 260 } 261 262 /** 263 * Added support for the source of this sequence for GFF3 export 264 * If a sub sequence doesn't have source then check for parent source 265 * @return the source 266 */ 267 public String getSource() { 268 if (source != null) { 269 return source; 270 } 271 if (parentSequence != null) { 272 return parentSequence.getSource(); 273 } 274 return null; 275 } 276 277 /** 278 * Added support for the source of this sequence for GFF3 export 279 * @param source the source to set 280 */ 281 public void setSource(String source) { 282 283 this.source = source; 284 } 285 286 /** 287 * Add notes about this sequence that will get exported for GFF3 288 * @param note 289 */ 290 public void addNote(String note) { 291 notesList.add(note); 292 } 293 294 public void removeNote(String note) { 295 notesList.remove(note); 296 } 297 298 /** 299 * @return the notesList 300 */ 301 public ArrayList<String> getNotesList() { 302 return notesList; 303 } 304 305 /** 306 * @param notesList the notesList to set 307 */ 308 public void setNotesList(ArrayList<String> notesList) { 309 this.notesList = notesList; 310 } 311 312 /** 313 * Provide place holder for a metric that indicate a score associated with the sequence 314 * @return the sequenceScore 315 */ 316 public Double getSequenceScore() { 317 return sequenceScore; 318 } 319 320 /** 321 * @param sequenceScore the sequenceScore to set 322 */ 323 public void setSequenceScore(Double sequenceScore) { 324 this.sequenceScore = sequenceScore; 325 } 326 327 /** 328 * @since 5.0.0 329 * @return the list of {@link AbstractReference} 330 */ 331 public List<AbstractReference> getReferences() { 332 return references; 333 } 334 335 /** 336 * Set the list of {@link AbstractReference} 337 * @since 5.0.0 338 * @param references 339 */ 340 public void setReferences(List<AbstractReference> references) { 341 this.references = references; 342 } 343 344 /** 345 * Return features at a sequence position by type 346 * @param featureType 347 * @param bioSequencePosition 348 * @return 349 */ 350 public List<FeatureInterface<AbstractSequence<C>, C>> getFeatures(String featureType, int bioSequencePosition) { 351 ArrayList<FeatureInterface<AbstractSequence<C>, C>> featureHits = 352 new ArrayList<FeatureInterface<AbstractSequence<C>, C>>(); 353 List<FeatureInterface<AbstractSequence<C>, C>> features = getFeaturesByType(featureType); 354 if (features != null) { 355 for (FeatureInterface<AbstractSequence<C>, C> feature : features) { 356 if (bioSequencePosition >= feature.getLocations().getStart().getPosition() && bioSequencePosition <= feature.getLocations().getEnd().getPosition()) { 357 featureHits.add(feature); 358 } 359 } 360 } 361 return featureHits; 362 } 363 364 /** 365 * Return features at a sequence position 366 * @param bioSequencePosition 367 * @return 368 */ 369 public List<FeatureInterface<AbstractSequence<C>, C>> getFeatures(int bioSequencePosition) { 370 ArrayList<FeatureInterface<AbstractSequence<C>, C>> featureHits = 371 new ArrayList<FeatureInterface<AbstractSequence<C>, C>>(); 372 if (features != null) { 373 for (FeatureInterface<AbstractSequence<C>, C> feature : features) { 374 if (bioSequencePosition >= feature.getLocations().getStart().getPosition() && bioSequencePosition <= feature.getLocations().getEnd().getPosition()) { 375 featureHits.add(feature); 376 } 377 } 378 } 379 return featureHits; 380 } 381 382 /** 383 * 384 * @return 385 */ 386 public List<FeatureInterface<AbstractSequence<C>, C>> getFeatures() { 387 return features; 388 } 389 390 /** 391 * Method to help set the proper details for a feature as it relates to a sequence 392 * where the feature needs to have a location on the sequence 393 * @param bioStart 394 * @param bioEnd 395 * @param feature 396 */ 397 public void addFeature(int bioStart, int bioEnd, FeatureInterface<AbstractSequence<C>, C> feature) { 398 SequenceLocation<AbstractSequence<C>, C> sequenceLocation = 399 new SequenceLocation<AbstractSequence<C>, C>(bioStart, bioEnd, this); 400 feature.setLocation(sequenceLocation); 401 addFeature(feature); 402 } 403 404 /** 405 * Add a feature to this sequence. The feature will be added to the collection where the order is start position and if more than 406 * one feature at the same start position then longest is added first. This helps on doing feature layout for displaying features 407 * in SequenceFeaturePanel 408 * @param feature 409 */ 410 public void addFeature(FeatureInterface<AbstractSequence<C>, C> feature) { 411 features.add(feature); 412 ArrayList<FeatureInterface<AbstractSequence<C>, C>> featureList = groupedFeatures.get(feature.getType()); 413 if (featureList == null) { 414 featureList = new ArrayList<FeatureInterface<AbstractSequence<C>, C>>(); 415 groupedFeatures.put(feature.getType(), featureList); 416 } 417 featureList.add(feature); 418 Collections.sort(features, AbstractFeature.LOCATION_LENGTH); 419 Collections.sort(featureList, AbstractFeature.LOCATION_LENGTH); 420 } 421 422 /** 423 * Remove a feature from the sequence 424 * @param feature 425 */ 426 public void removeFeature(FeatureInterface<AbstractSequence<C>, C> feature) { 427 features.remove(feature); 428 ArrayList<FeatureInterface<AbstractSequence<C>, C>> featureList = groupedFeatures.get(feature.getType()); 429 if (featureList != null) { 430 featureList.remove(feature); 431 if (featureList.isEmpty()) { 432 groupedFeatures.remove(feature.getType()); 433 } 434 } 435 } 436 437 /** 438 * 439 * @param type 440 * @return 441 */ 442 public List<FeatureInterface<AbstractSequence<C>, C>> getFeaturesByType(String type) { 443 List<FeatureInterface<AbstractSequence<C>, C>> features = groupedFeatures.get(type); 444 if (features == null) { 445 features = new ArrayList<FeatureInterface<AbstractSequence<C>, C>>(); 446 } 447 return features; 448 } 449 450 /** 451 * 452 * @return comments 453 */ 454 public List<String> getComments() { 455 return comments; 456 } 457 458 /** 459 * Set comments. 460 * @param comments 461 */ 462 public void setComments(List<String> comments) { 463 this.comments = comments; 464 } 465 466 /** 467 * @return the featuresKeyWord 468 */ 469 public FeaturesKeyWordInterface getFeaturesKeyWord() { 470 return featuresKeyWord; 471 } 472 473 /** 474 * @param featuresKeyWord the featuresKeyWord to set 475 */ 476 public void setFeaturesKeyWord(FeaturesKeyWordInterface featuresKeyWord) { 477 this.featuresKeyWord = featuresKeyWord; 478 } 479 480 /** 481 * @return the databaseReferences 482 */ 483 public DatabaseReferenceInterface getDatabaseReferences() { 484 return databaseReferences; 485 } 486 487 /** 488 * @param databaseReferences the databaseReferences to set 489 */ 490 public void setDatabaseReferences(DatabaseReferenceInterface databaseReferences) { 491 this.databaseReferences = databaseReferences; 492 } 493 494 public FeatureRetriever getFeatureRetriever() { 495 return featureRetriever; 496 } 497 498 public void setFeatureRetriever(FeatureRetriever featureRetriever) { 499 this.featureRetriever = featureRetriever; 500 } 501 502 503 504 public enum AnnotationType { 505 506 CURATED, PREDICTED, UNKNOWN; 507 } 508 509 /** 510 * @return the accession 511 */ 512 @Override 513 public AccessionID getAccession() { 514 return accession; 515 } 516 517 /** 518 * @param accession the accession to set 519 */ 520 public void setAccession(AccessionID accession) { 521 this.accession = accession; 522 } 523 524 /** 525 * @return the species 526 */ 527 public TaxonomyID getTaxonomy() { 528 return taxonomy; 529 } 530 531 /** 532 * @param taxonomy the species to set 533 */ 534 public void setTaxonomy(TaxonomyID taxonomy) { 535 this.taxonomy = taxonomy; 536 } 537 538 @Override 539 public CompoundSet<C> getCompoundSet() { 540 if (compoundSet != null) { 541 return compoundSet; 542 } 543 // This is invalid since the parentSequence isn't guaranteed to have the same compound set as this sequence, 544 // e.g., the case where the parent sequence for a protein is a CDS. 545 /* 546 if (parentSequence != null) { 547 return parentSequence.getCompoundSet(); 548 } 549 */ 550 return null; 551 552 553 } 554 555 public void setCompoundSet(CompoundSet<C> compoundSet) { 556 this.compoundSet = compoundSet; 557 } 558 559 @Override 560 public boolean equals(Object o){ 561 562 if(! Equals.classEqual(this, o)) { 563 return false; 564 } 565 566 Sequence<C> other = (Sequence<C>)o; 567 568 if ( other.getCompoundSet() != getCompoundSet()) 569 return false; 570 571 572 List<C> rawCompounds = getAsList(); 573 List<C> otherCompounds = other.getAsList(); 574 575 if ( rawCompounds.size() != otherCompounds.size()) 576 return false; 577 578 for (int i = 0 ; i < rawCompounds.size() ; i++){ 579 Compound myCompound = rawCompounds.get(i); 580 Compound otherCompound = otherCompounds.get(i); 581 if ( ! myCompound.equalsIgnoreCase(otherCompound)) 582 return false; 583 } 584 return true; 585 } 586 587 @Override 588 public int hashCode(){ 589 String s = getSequenceAsString(); 590 return s.hashCode(); 591 } 592 593 @Override 594 public String toString() { 595 return getSequenceAsString(); 596 } 597 598 private SequenceReader<C> getSequenceStorage() { 599 if (sequenceStorage != null) { 600 return sequenceStorage; 601 } 602 if (parentSequence != null) { 603 604 //return parentSequence.getSequenceStorage(); 605 606 if ( this.compoundSet.equals(parentSequence.getCompoundSet())){ 607 sequenceStorage = new ArrayListSequenceReader<C>(); 608 sequenceStorage.setCompoundSet(this.getCompoundSet()); 609 try { 610 sequenceStorage.setContents(parentSequence.getSequenceAsString()); 611 } catch (CompoundNotFoundException e) { 612 // TODO is there a better way to handle this exception? 613 logger.error("Problem setting contents from parent sequence, some unrecognised compound: {}",e.getMessage()); 614 } 615 return sequenceStorage; 616 } 617 618 } 619 620 return null; 621 } 622 623 /** 624 * 625 * @param bioStart 626 * @param bioEnd 627 * @param strand 628 * @return 629 */ 630 public String getSequenceAsString(Integer bioStart, Integer bioEnd, Strand strand) { 631 632 Location loc = new SimpleLocation(bioStart, bioEnd, strand); 633 return loc.getSubSequence(this).getSequenceAsString(); 634 } 635 636 /** 637 * Default case is to assume strand is positive because only CDSSequence can be either positive or negative Strand. 638 * @return 639 */ 640 @Override 641 public String getSequenceAsString() { 642 return SequenceMixin.toString(this); 643 644 } 645 646 /** 647 * 648 * @return 649 */ 650 @Override 651 public List<C> getAsList() { 652 653 return sequenceStorage.getAsList(); 654 } 655 656 /** 657 * 658 * @param position The 1-indexed position of the amino acid 659 * @return 660 */ 661 @Override 662 public C getCompoundAt(int position) { 663 664 return getSequenceStorage().getCompoundAt(position); 665 } 666 667 /** 668 * 669 * @param compound 670 * @return The first index of compound in this sequence (1-based) 671 */ 672 @Override 673 public int getIndexOf(C compound) { 674 return getSequenceStorage().getIndexOf(compound); 675 } 676 677 /** 678 * 679 * @param compound 680 * @return The last index of compound in this sequence (1-based) 681 */ 682 @Override 683 public int getLastIndexOf(C compound) { 684 return getSequenceStorage().getLastIndexOf(compound); 685 } 686 687 /** 688 * 689 * @return 690 */ 691 @Override 692 public int getLength() { 693 return getSequenceStorage().getLength(); 694 } 695 696 /** 697 * 698 * @param bioStart 699 * @param bioEnd 700 * @return 701 */ 702 @Override 703 public SequenceView<C> getSubSequence(final Integer bioStart, final Integer bioEnd) { 704 return new SequenceProxyView<C>(this, bioStart, bioEnd); 705 } 706 707 /** 708 * 709 * @return 710 */ 711 @Override 712 public Iterator<C> iterator() { 713 return getSequenceStorage().iterator(); 714 } 715 716 /** 717 * 718 * @param compounds 719 * @return 720 */ 721 @Override 722 public int countCompounds(C... compounds) { 723 return SequenceMixin.countCompounds(this, compounds); 724 } 725 726 /** 727 * 728 * @return 729 */ 730 @Override 731 public SequenceView<C> getInverse() { 732 return SequenceMixin.inverse(this); 733 } 734 735 736}