001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 * Created on 01-21-2010
021 *
022 * @author Richard Holland
023 * @author Scooter Willis
024 * @author Paolo Pavan
025 *
026 */
027package org.biojava.nbio.core.sequence.template;
028
029import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
030import org.biojava.nbio.core.sequence.AccessionID;
031import org.biojava.nbio.core.sequence.DataSource;
032import org.biojava.nbio.core.sequence.Strand;
033import org.biojava.nbio.core.sequence.TaxonomyID;
034import org.biojava.nbio.core.sequence.features.*;
035import org.biojava.nbio.core.sequence.loader.UniprotProxySequenceReader;
036import org.biojava.nbio.core.sequence.location.SequenceLocation;
037import org.biojava.nbio.core.sequence.location.SimpleLocation;
038import org.biojava.nbio.core.sequence.location.template.Location;
039import org.biojava.nbio.core.sequence.reference.AbstractReference;
040import org.biojava.nbio.core.sequence.storage.ArrayListSequenceReader;
041import org.biojava.nbio.core.util.Equals;
042import org.slf4j.Logger;
043import org.slf4j.LoggerFactory;
044
045import java.util.*;
046
047/**
048 *
049 * The base class for DNA, RNA and Protein sequences.
050 * @param <C>
051 */
052public abstract class AbstractSequence<C extends Compound> implements Sequence<C> {
053
054        private final static Logger logger = LoggerFactory.getLogger(AbstractSequence.class);
055
056        private TaxonomyID taxonomy;
057        private AccessionID accession;
058        private SequenceReader<C> sequenceStorage = null;
059        private CompoundSet<C> compoundSet;
060        private AnnotationType annotationType = AnnotationType.UNKNOWN;
061        private String description;
062        private String originalHeader;
063        private Collection<Object> userCollection;
064        private Integer bioBegin = null;
065        private Integer bioEnd = null;
066        private AbstractSequence<?> parentSequence = null;
067        private String source = null;
068        private ArrayList<String> notesList = new ArrayList<String>();
069        private Double sequenceScore = null;
070        private FeaturesKeyWordInterface featuresKeyWord = null;
071        private DatabaseReferenceInterface databaseReferences = null;
072        private FeatureRetriever featureRetriever = null;
073        private ArrayList<FeatureInterface<AbstractSequence<C>, C>> features =
074                        new ArrayList<FeatureInterface<AbstractSequence<C>, C>>();
075        private LinkedHashMap<String, ArrayList<FeatureInterface<AbstractSequence<C>, C>>> groupedFeatures =
076                        new LinkedHashMap<String, ArrayList<FeatureInterface<AbstractSequence<C>, C>>>();
077        private List<String> comments = new ArrayList<>();
078        private List<AbstractReference> references;
079
080        public AbstractSequence() {
081        }
082
083        /**
084         * Create a Sequence from a simple string where the values should be found in compoundSet
085         * @param seqString
086         * @param compoundSet
087         * @throws CompoundNotFoundException
088         */
089        public AbstractSequence(String seqString, CompoundSet<C> compoundSet) throws CompoundNotFoundException {
090                setCompoundSet(compoundSet);
091                initSequenceStorage(seqString);
092        }
093
094        //  so it can be called from subclass constructors
095        protected void initSequenceStorage(String seqString) throws CompoundNotFoundException {
096                sequenceStorage = new ArrayListSequenceReader<C>();
097                sequenceStorage.setCompoundSet(this.getCompoundSet());
098                sequenceStorage.setContents(seqString);
099        }
100
101        /**
102         * A ProxySequenceReader allows abstraction of both the storage of the sequence data and the location
103         * of the sequence data. A variety of use cases are possible. A ProxySequenceReader that knows the offset and of the sequence in
104         * a large fasta file. A ProxySequenceReader that can pull Sequence data from UniProt, NCBI or a custom database.
105         * If the ProxySequenceReader implements various interfaces then the sequence will set those interfaces so that calls to
106         * various methods will be valid.
107         *
108         * @param proxyLoader
109         * @param compoundSet
110         */
111        public AbstractSequence(SequenceReader<C> proxyLoader, CompoundSet<C> compoundSet) {
112                setCompoundSet(compoundSet);
113                setProxySequenceReader(proxyLoader);
114        }
115
116        /**
117         * Very important method that allows external mappings of sequence data and features. This method
118         * will gain additional interface inspection that allows external data sources with knowledge
119         * of features for a sequence to be supported.
120         *
121         * @param proxyLoader
122         */
123        public void setProxySequenceReader(SequenceReader<C> proxyLoader) {
124                this.sequenceStorage = proxyLoader;
125                if (proxyLoader instanceof FeaturesKeyWordInterface) {
126                        this.setFeaturesKeyWord((FeaturesKeyWordInterface) sequenceStorage);
127                }
128                if (proxyLoader instanceof DatabaseReferenceInterface) {
129                        this.setDatabaseReferences((DatabaseReferenceInterface) sequenceStorage);
130                }
131
132                if (proxyLoader instanceof FeatureRetriever) {
133                        this.setFeatureRetriever((FeatureRetriever) sequenceStorage);
134                        Map<String, List<AbstractFeature<AbstractSequence<C>, C>>> ff = getFeatureRetriever().getFeatures();
135                        for (String k: ff.keySet()){
136                                for (AbstractFeature f: ff.get(k)){
137                                        this.addFeature(f);
138                                }
139                        }
140                        // success of next statement guaranteed because source is a compulsory field
141                        //DBReferenceInfo dbQualifier = (DBReferenceInfo)ff.get("source").get(0).getQualifiers().get("db_xref");
142                        ArrayList<DBReferenceInfo> dbQualifiers = (ArrayList)ff.get("source").get(0).getQualifiers().get("db_xref");
143                        DBReferenceInfo dbQualifier = dbQualifiers.get(0);
144
145                        if (dbQualifier != null) this.setTaxonomy(new TaxonomyID(dbQualifier.getDatabase()+":"+dbQualifier.getId(), DataSource.UNKNOWN));
146                }
147
148                if(getAccession() == null && proxyLoader instanceof UniprotProxySequenceReader){ // we have lots of unsupported operations for this call so quick fix to allow this tow rork
149                        this.setAccession(proxyLoader.getAccession());
150                }
151        }
152
153        public SequenceReader<C> getProxySequenceReader() {
154                return sequenceStorage;
155        }
156
157        /**
158         * @return the bioBegin
159         */
160        public Integer getBioBegin() {
161                if (bioBegin == null) {
162                        return 1;
163                } else {
164                        return bioBegin;
165                }
166        }
167
168        /**
169         * @param bioBegin the bioBegin to set
170         */
171        public void setBioBegin(Integer bioBegin) {
172                this.bioBegin = bioBegin;
173        }
174
175        /**
176         * @return the bioEnd
177         */
178        public Integer getBioEnd() {
179                if (bioEnd == null) {
180                        return this.getLength();
181                } else {
182                        return bioEnd;
183                }
184        }
185
186        /**
187         * @param bioEnd the bioEnd to set
188         */
189        public void setBioEnd(Integer bioEnd) {
190                this.bioEnd = bioEnd;
191        }
192
193        /**
194         * Provided for convince if the developer needs to associate data with a sequence
195         *
196         * @return
197         */
198        public Collection<Object> getUserCollection() {
199
200                return userCollection;
201        }
202
203        /**
204         *
205         * @param userCollection
206         */
207        public void setUserCollection(Collection<Object> userCollection) {
208                this.userCollection = userCollection;
209        }
210
211        /**
212         * @return the annotation
213         */
214        public AnnotationType getAnnotationType() {
215                return annotationType;
216        }
217
218        /**
219         * @param annotationType the annotation to set
220         */
221        public void setAnnotationType(AnnotationType annotationType) {
222                this.annotationType = annotationType;
223        }
224
225        /**
226         * @return the description
227         */
228        public String getDescription() {
229                return description;
230        }
231
232        /**
233         * @param description the description to set
234         */
235        public void setDescription(String description) {
236                this.description = description;
237        }
238
239        /**
240         * @return the originalHeader
241         */
242        public String getOriginalHeader() {
243                return originalHeader;
244        }
245
246        /**
247         * @param originalHeader the originalHeader to set
248         */
249        public void setOriginalHeader(String originalHeader) {
250                this.originalHeader = originalHeader;
251        }
252
253        /**
254         * @return the parentSequence
255         */
256        public AbstractSequence<?> getParentSequence() {
257                return parentSequence;
258        }
259
260        /**
261         * @param parentSequence the parentSequence to set
262         */
263        public void setParentSequence(AbstractSequence<?> parentSequence) {
264                this.parentSequence = parentSequence;
265        }
266
267        /**
268         * Added support for the source of this sequence for GFF3 export
269         * If a sub sequence doesn't have  source then check for parent source
270         * @return the source
271         */
272        public String getSource() {
273                if (source != null) {
274                        return source;
275                }
276                if (parentSequence != null) {
277                        return parentSequence.getSource();
278                }
279                return null;
280        }
281
282        /**
283         * Added support for the source of this sequence for GFF3 export
284         * @param source the source to set
285         */
286        public void setSource(String source) {
287
288                this.source = source;
289        }
290
291        /**
292         * Add notes about this sequence that will get exported for GFF3
293         * @param note
294         */
295        public void addNote(String note) {
296                notesList.add(note);
297        }
298
299        public void removeNote(String note) {
300                notesList.remove(note);
301        }
302
303        /**
304         * @return the notesList
305         */
306        public ArrayList<String> getNotesList() {
307                return notesList;
308        }
309
310        /**
311         * @param notesList the notesList to set
312         */
313        public void setNotesList(ArrayList<String> notesList) {
314                this.notesList = notesList;
315        }
316
317        /**
318         * Provide place holder for a metric that indicate a score associated with the sequence
319         * @return the sequenceScore
320         */
321        public Double getSequenceScore() {
322                return sequenceScore;
323        }
324
325        /**
326         * @param sequenceScore the sequenceScore to set
327         */
328        public void setSequenceScore(Double sequenceScore) {
329                this.sequenceScore = sequenceScore;
330        }
331
332        /**
333         * @since 5.0.0
334         * @return the list of {@link AbstractReference}
335         */
336        public List<AbstractReference> getReferences() {
337                return references;
338        }
339
340        /**
341         * Set the list of {@link AbstractReference}
342         * @since 5.0.0
343         * @param references
344         */
345        public void setReferences(List<AbstractReference> references) {
346                this.references = references;
347        }
348
349        /**
350         * Return features at a sequence position by type
351         * @param featureType
352         * @param bioSequencePosition
353         * @return
354         */
355        public List<FeatureInterface<AbstractSequence<C>, C>> getFeatures(String featureType, int bioSequencePosition) {
356                ArrayList<FeatureInterface<AbstractSequence<C>, C>> featureHits =
357                                new ArrayList<FeatureInterface<AbstractSequence<C>, C>>();
358                List<FeatureInterface<AbstractSequence<C>, C>> features = getFeaturesByType(featureType);
359                if (features != null) {
360                        for (FeatureInterface<AbstractSequence<C>, C> feature : features) {
361                                if (bioSequencePosition >= feature.getLocations().getStart().getPosition() && bioSequencePosition <= feature.getLocations().getEnd().getPosition()) {
362                                        featureHits.add(feature);
363                                }
364                        }
365                }
366                return featureHits;
367        }
368
369        /**
370         * Return features at a sequence position
371         * @param bioSequencePosition
372         * @return
373         */
374        public List<FeatureInterface<AbstractSequence<C>, C>> getFeatures(int bioSequencePosition) {
375                ArrayList<FeatureInterface<AbstractSequence<C>, C>> featureHits =
376                                new ArrayList<FeatureInterface<AbstractSequence<C>, C>>();
377                if (features != null) {
378                        for (FeatureInterface<AbstractSequence<C>, C> feature : features) {
379                                if (bioSequencePosition >= feature.getLocations().getStart().getPosition() && bioSequencePosition <= feature.getLocations().getEnd().getPosition()) {
380                                        featureHits.add(feature);
381                                }
382                        }
383                }
384                return featureHits;
385        }
386
387        /**
388         *
389         * @return
390         */
391        public List<FeatureInterface<AbstractSequence<C>, C>> getFeatures() {
392                return features;
393        }
394
395        /**
396         * Method to help set the proper details for a feature as it relates to a sequence
397         * where the feature needs to have a location on the sequence
398         * @param bioStart
399         * @param bioEnd
400         * @param feature
401         */
402        public void addFeature(int bioStart, int bioEnd, FeatureInterface<AbstractSequence<C>, C> feature) {
403                SequenceLocation<AbstractSequence<C>, C> sequenceLocation =
404                                new SequenceLocation<AbstractSequence<C>, C>(bioStart, bioEnd, this);
405                feature.setLocation(sequenceLocation);
406                addFeature(feature);
407        }
408
409        /**
410         * Add a feature to this sequence. The feature will be added to the collection where the order is start position and if more than
411         * one feature at the same start position then longest is added first. This helps on doing feature layout for displaying features
412         * in SequenceFeaturePanel
413         * @param feature
414         */
415        public void addFeature(FeatureInterface<AbstractSequence<C>, C> feature) {
416                features.add(feature);
417                ArrayList<FeatureInterface<AbstractSequence<C>, C>> featureList = groupedFeatures.get(feature.getType());
418                if (featureList == null) {
419                        featureList = new ArrayList<FeatureInterface<AbstractSequence<C>, C>>();
420                        groupedFeatures.put(feature.getType(), featureList);
421                }
422                featureList.add(feature);
423                Collections.sort(features, AbstractFeature.LOCATION_LENGTH);
424                Collections.sort(featureList, AbstractFeature.LOCATION_LENGTH);
425        }
426
427        /**
428         * Remove a feature from the sequence
429         * @param feature
430         */
431        public void removeFeature(FeatureInterface<AbstractSequence<C>, C> feature) {
432                features.remove(feature);
433                ArrayList<FeatureInterface<AbstractSequence<C>, C>> featureList = groupedFeatures.get(feature.getType());
434                if (featureList != null) {
435                        featureList.remove(feature);
436                        if (featureList.isEmpty()) {
437                                groupedFeatures.remove(feature.getType());
438                        }
439                }
440        }
441
442        /**
443         *
444         * @param type
445         * @return
446         */
447        public List<FeatureInterface<AbstractSequence<C>, C>> getFeaturesByType(String type) {
448                List<FeatureInterface<AbstractSequence<C>, C>> features = groupedFeatures.get(type);
449                if (features == null) {
450                        features = new ArrayList<FeatureInterface<AbstractSequence<C>, C>>();
451                }
452                return features;
453        }
454
455        /**
456         *
457         * @return comments
458         */
459        public List<String> getComments() {
460                return comments;
461        }
462
463        /**
464         * Set comments.
465         * @param comments
466         */
467        public void setComments(List<String> comments) {
468                this.comments = comments;
469        }
470
471        /**
472         * @return the featuresKeyWord
473         */
474        public FeaturesKeyWordInterface getFeaturesKeyWord() {
475                return featuresKeyWord;
476        }
477
478        /**
479         * @param featuresKeyWord the featuresKeyWord to set
480         */
481        public void setFeaturesKeyWord(FeaturesKeyWordInterface featuresKeyWord) {
482                this.featuresKeyWord = featuresKeyWord;
483        }
484
485        /**
486         * @return the databaseReferences
487         */
488        public DatabaseReferenceInterface getDatabaseReferences() {
489                return databaseReferences;
490        }
491
492        /**
493         * @param databaseReferences the databaseReferences to set
494         */
495        public void setDatabaseReferences(DatabaseReferenceInterface databaseReferences) {
496                this.databaseReferences = databaseReferences;
497        }
498
499        public FeatureRetriever getFeatureRetriever() {
500                return featureRetriever;
501        }
502
503        public void setFeatureRetriever(FeatureRetriever featureRetriever) {
504                this.featureRetriever = featureRetriever;
505        }
506
507
508
509        public enum AnnotationType {
510
511                CURATED, PREDICTED, UNKNOWN;
512        }
513
514        /**
515         * @return the accession
516         */
517        @Override
518        public AccessionID getAccession() {
519                return accession;
520        }
521
522        /**
523         * @param accession the accession to set
524         */
525        public void setAccession(AccessionID accession) {
526                this.accession = accession;
527        }
528
529        /**
530         * @return the species
531         */
532        public TaxonomyID getTaxonomy() {
533                return taxonomy;
534        }
535
536        /**
537         * @param taxonomy the species to set
538         */
539        public void setTaxonomy(TaxonomyID taxonomy) {
540                this.taxonomy = taxonomy;
541        }
542
543        @Override
544        public CompoundSet<C> getCompoundSet() {
545                if (compoundSet != null) {
546                        return compoundSet;
547                }
548                // This is invalid since the parentSequence isn't guaranteed to have the same compound set as this sequence,
549                // e.g., the case where the parent sequence for a protein is a CDS.
550                /*
551                if (parentSequence != null) {
552                        return parentSequence.getCompoundSet();
553                }
554                */
555                return null;
556
557
558        }
559
560        public void setCompoundSet(CompoundSet<C> compoundSet) {
561                this.compoundSet = compoundSet;
562        }
563
564        @Override
565        public boolean equals(Object o){
566
567                if(! Equals.classEqual(this, o)) {
568                        return false;
569                }
570
571                Sequence<C> other = (Sequence<C>)o;
572
573                if ( other.getCompoundSet() != getCompoundSet())
574                        return false;
575
576
577                List<C> rawCompounds = getAsList();
578                List<C> otherCompounds = other.getAsList();
579
580                if ( rawCompounds.size() != otherCompounds.size())
581                        return false;
582
583                for (int i = 0 ; i < rawCompounds.size() ; i++){
584                        Compound myCompound = rawCompounds.get(i);
585                        Compound otherCompound = otherCompounds.get(i);
586                        if ( ! myCompound.equalsIgnoreCase(otherCompound))
587                                return false;
588                }
589                return true;
590        }
591
592        @Override
593        public int hashCode(){
594                String s = getSequenceAsString();
595                return s.hashCode();
596        }
597
598        @Override
599        public String toString() {
600                return getSequenceAsString();
601        }
602
603        private SequenceReader<C> getSequenceStorage() {
604                if (sequenceStorage != null) {
605                        return sequenceStorage;
606                }
607                if (parentSequence != null) {
608
609                        //return parentSequence.getSequenceStorage();
610
611                        if ( this.compoundSet.equals(parentSequence.getCompoundSet())){
612                                sequenceStorage = new ArrayListSequenceReader<C>();
613                                sequenceStorage.setCompoundSet(this.getCompoundSet());
614                                try {
615                                        sequenceStorage.setContents(parentSequence.getSequenceAsString());
616                                } catch (CompoundNotFoundException e) {
617                                        // TODO is there a better way to handle this exception?
618                                        logger.error("Problem setting contents from parent sequence, some unrecognised compound: {}",e.getMessage());
619                                }
620                                return sequenceStorage;
621                        }
622
623                }
624
625                return null;
626        }
627
628        /**
629         *
630         * @param bioStart
631         * @param bioEnd
632         * @param strand
633         * @return
634         */
635        public String getSequenceAsString(Integer bioStart, Integer bioEnd, Strand strand) {
636
637                Location loc = new SimpleLocation(bioStart, bioEnd, strand);
638                return loc.getSubSequence(this).getSequenceAsString();
639        }
640
641        /**
642         * Default case is to assume strand is positive because only CDSSequence can be either positive or negative Strand.
643         * @return
644         */
645        @Override
646        public String getSequenceAsString() {
647                return SequenceMixin.toString(this);
648
649        }
650
651        /**
652         *
653         * @return
654         */
655        @Override
656        public List<C> getAsList() {
657
658                return sequenceStorage.getAsList();
659        }
660
661        /**
662         *
663         * @param position The 1-indexed position of the amino acid
664         * @return
665         */
666        @Override
667        public C getCompoundAt(int position) {
668
669                return getSequenceStorage().getCompoundAt(position);
670        }
671
672        /**
673         *
674         * @param compound
675         * @return The first index of compound in this sequence (1-based)
676         */
677        @Override
678        public int getIndexOf(C compound) {
679                return getSequenceStorage().getIndexOf(compound);
680        }
681
682        /**
683         *
684         * @param compound
685         * @return The last index of compound in this sequence (1-based)
686         */
687        @Override
688        public int getLastIndexOf(C compound) {
689                return getSequenceStorage().getLastIndexOf(compound);
690        }
691
692        /**
693         *
694         * @return
695         */
696        @Override
697        public int getLength() {
698                return getSequenceStorage().getLength();
699        }
700
701        /**
702         *
703         * @param bioStart
704         * @param bioEnd
705         * @return
706         */
707        @Override
708        public SequenceView<C> getSubSequence(final Integer bioStart, final Integer bioEnd) {
709                return new SequenceProxyView<C>(this, bioStart, bioEnd);
710        }
711
712        /**
713         *
714         * @return
715         */
716        @Override
717        public Iterator<C> iterator() {
718                return getSequenceStorage().iterator();
719        }
720
721        /**
722         *
723         * @param compounds
724         * @return
725         */
726        @Override
727        public int countCompounds(C... compounds) {
728                return SequenceMixin.countCompounds(this, compounds);
729        }
730
731        /**
732         *
733         * @return
734         */
735        @Override
736        public SequenceView<C> getInverse() {
737                return SequenceMixin.inverse(this);
738        }
739
740
741}