001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 * Created on 01-21-2010
021 *
022 * @author Richard Holland
023 * @author Scooter Willis
024 * @author Paolo Pavan
025 *
026 */
027package org.biojava.nbio.core.sequence.template;
028
029import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
030import org.biojava.nbio.core.sequence.AccessionID;
031import org.biojava.nbio.core.sequence.DataSource;
032import org.biojava.nbio.core.sequence.Strand;
033import org.biojava.nbio.core.sequence.TaxonomyID;
034import org.biojava.nbio.core.sequence.features.*;
035import org.biojava.nbio.core.sequence.loader.UniprotProxySequenceReader;
036import org.biojava.nbio.core.sequence.location.SequenceLocation;
037import org.biojava.nbio.core.sequence.location.SimpleLocation;
038import org.biojava.nbio.core.sequence.location.template.Location;
039import org.biojava.nbio.core.sequence.storage.ArrayListSequenceReader;
040import org.slf4j.Logger;
041import org.slf4j.LoggerFactory;
042
043import java.util.*;
044
045/**
046 *
047 * The base class for DNA, RNA and Protein sequences.
048 * @param <C>
049 */
050public abstract class AbstractSequence<C extends Compound> implements Sequence<C> {
051
052        private final static Logger logger = LoggerFactory.getLogger(AbstractSequence.class);
053
054        private TaxonomyID taxonomy;
055        private AccessionID accession;
056        private SequenceReader<C> sequenceStorage = null;
057        private CompoundSet<C> compoundSet;
058        private AnnotationType annotationType = AnnotationType.UNKNOWN;
059        private String description;
060        private String originalHeader;
061        private Collection<Object> userCollection;
062        private Integer bioBegin = null;
063        private Integer bioEnd = null;
064        private AbstractSequence<?> parentSequence = null;
065        private String source = null;
066        private ArrayList<String> notesList = new ArrayList<String>();
067        private Double sequenceScore = null;
068        private FeaturesKeyWordInterface featuresKeyWord = null;
069        private DatabaseReferenceInterface databaseReferences = null;
070        private FeatureRetriever featureRetriever = null;
071        private ArrayList<FeatureInterface<AbstractSequence<C>, C>> features =
072                        new ArrayList<FeatureInterface<AbstractSequence<C>, C>>();
073        private LinkedHashMap<String, ArrayList<FeatureInterface<AbstractSequence<C>, C>>> groupedFeatures =
074                        new LinkedHashMap<String, ArrayList<FeatureInterface<AbstractSequence<C>, C>>>();
075
076        public AbstractSequence() {
077        }
078
079        /**
080         * Create a Sequence from a simple string where the values should be found in compoundSet
081         * @param seqString
082         * @param compoundSet
083         * @throws CompoundNotFoundException
084         */
085        public AbstractSequence(String seqString, CompoundSet<C> compoundSet) throws CompoundNotFoundException {
086                setCompoundSet(compoundSet);
087                sequenceStorage = new ArrayListSequenceReader<C>();
088                sequenceStorage.setCompoundSet(this.getCompoundSet());
089                sequenceStorage.setContents(seqString);
090        }
091
092        /**
093         * A ProxySequenceReader allows abstraction of both the storage of the sequence data and the location
094         * of the sequence data. A variety of use cases are possible. A ProxySequenceReader that knows the offset and of the sequence in
095         * a large fasta file. A ProxySequenceReader that can pull Sequence data from UniProt, NCBI or a custom database.
096         * If the ProxySequenceReader implements various interfaces then the sequence will set those interfaces so that calls to
097         * various methods will be valid.
098         *
099         * @param proxyLoader
100         * @param compoundSet
101         */
102        public AbstractSequence(SequenceReader<C> proxyLoader, CompoundSet<C> compoundSet) {
103                setCompoundSet(compoundSet);
104                setProxySequenceReader(proxyLoader);
105        }
106
107        /**
108         * Very important method that allows external mappings of sequence data and features. This method
109         * will gain additional interface inspection that allows external data sources with knowledge
110         * of features for a sequence to be supported.
111         *
112         * @param proxyLoader
113         */
114        public void setProxySequenceReader(SequenceReader<C> proxyLoader) {
115                this.sequenceStorage = proxyLoader;
116                if (proxyLoader instanceof FeaturesKeyWordInterface) {
117                        this.setFeaturesKeyWord((FeaturesKeyWordInterface) sequenceStorage);
118                }
119                if (proxyLoader instanceof DatabaseReferenceInterface) {
120                        this.setDatabaseReferences((DatabaseReferenceInterface) sequenceStorage);
121                }
122
123                if (proxyLoader instanceof FeatureRetriever) {
124                        this.setFeatureRetriever((FeatureRetriever) sequenceStorage);
125                        HashMap<String, ArrayList<AbstractFeature>> ff = getFeatureRetriever().getFeatures();
126                        for (String k: ff.keySet()){
127                                for (AbstractFeature f: ff.get(k)){
128                                        this.addFeature(f);
129                                }
130                        }
131                        // success of next statement guaranteed because source is a compulsory field
132                        //DBReferenceInfo dbQualifier = (DBReferenceInfo)ff.get("source").get(0).getQualifiers().get("db_xref");
133                        ArrayList<DBReferenceInfo> dbQualifiers = (ArrayList)ff.get("source").get(0).getQualifiers().get("db_xref");
134                        DBReferenceInfo dbQualifier = dbQualifiers.get(0);
135
136                        if (dbQualifier != null) this.setTaxonomy(new TaxonomyID(dbQualifier.getDatabase()+":"+dbQualifier.getId(), DataSource.UNKNOWN));
137                }
138
139                if(getAccession() == null && proxyLoader instanceof UniprotProxySequenceReader){ // we have lots of unsupported operations for this call so quick fix to allow this tow rork
140                        this.setAccession(proxyLoader.getAccession());
141                }
142        }
143
144        public SequenceReader<C> getProxySequenceReader() {
145                return sequenceStorage;
146        }
147
148        /**
149         * @return the bioBegin
150         */
151        public Integer getBioBegin() {
152                if (bioBegin == null) {
153                        return 1;
154                } else {
155                        return bioBegin;
156                }
157        }
158
159        /**
160         * @param bioBegin the bioBegin to set
161         */
162        public void setBioBegin(Integer begin) {
163                this.bioBegin = begin;
164        }
165
166        /**
167         * @return the bioEnd
168         */
169        public Integer getBioEnd() {
170                if (bioEnd == null) {
171                        return this.getLength();
172                } else {
173                        return bioEnd;
174                }
175        }
176
177        /**
178         * @param bioEnd the bioEnd to set
179         */
180        public void setBioEnd(Integer end) {
181                this.bioEnd = end;
182        }
183
184        /**
185         * Provided for convince if the developer needs to associate data with a sequence
186         *
187         * @return
188         */
189        public Collection<Object> getUserCollection() {
190
191                return userCollection;
192        }
193
194        /**
195         *
196         * @param userCollection
197         */
198        public void setUserCollection(Collection<Object> userCollection) {
199                this.userCollection = userCollection;
200        }
201
202        /**
203         * @return the annotation
204         */
205        public AnnotationType getAnnotationType() {
206                return annotationType;
207        }
208
209        /**
210         * @param annotation the annotation to set
211         */
212        public void setAnnotationType(AnnotationType annotationType) {
213                this.annotationType = annotationType;
214        }
215
216        /**
217         * @return the description
218         */
219        public String getDescription() {
220                return description;
221        }
222
223        /**
224         * @param description the description to set
225         */
226        public void setDescription(String description) {
227                this.description = description;
228        }
229
230        /**
231         * @return the originalHeader
232         */
233        public String getOriginalHeader() {
234                return originalHeader;
235        }
236
237        /**
238         * @param originalHeader the originalHeader to set
239         */
240        public void setOriginalHeader(String originalHeader) {
241                this.originalHeader = originalHeader;
242        }
243
244        /**
245         * @return the parentSequence
246         */
247        public AbstractSequence<?> getParentSequence() {
248                return parentSequence;
249        }
250
251        /**
252         * @param parentSequence the parentSequence to set
253         */
254        public void setParentSequence(AbstractSequence<?> parentSequence) {
255                this.parentSequence = parentSequence;
256        }
257
258        /**
259         * Added support for the source of this sequence for GFF3 export
260         * If a sub sequence doesn't have  source then check for parent source
261         * @return the source
262         */
263        public String getSource() {
264                if (source != null) {
265                        return source;
266                }
267                if (parentSequence != null) {
268                        return parentSequence.getSource();
269                }
270                return null;
271        }
272
273        /**
274         * Added support for the source of this sequence for GFF3 export
275         * @param source the source to set
276         */
277        public void setSource(String source) {
278
279                this.source = source;
280        }
281
282        /**
283         * Add notes about this sequence that will get exported for GFF3
284         * @param note
285         */
286        public void addNote(String note) {
287                notesList.add(note);
288        }
289
290        public void removeNote(String note) {
291                notesList.remove(note);
292        }
293
294        /**
295         * @return the notesList
296         */
297        public ArrayList<String> getNotesList() {
298                return notesList;
299        }
300
301        /**
302         * @param notesList the notesList to set
303         */
304        public void setNotesList(ArrayList<String> notesList) {
305                this.notesList = notesList;
306        }
307
308        /**
309         * Provide place holder for a metric that indicate a score associated with the sequence
310         * @return the sequenceScore
311         */
312        public Double getSequenceScore() {
313                return sequenceScore;
314        }
315
316        /**
317         * @param sequenceScore the sequenceScore to set
318         */
319        public void setSequenceScore(Double sequenceScore) {
320                this.sequenceScore = sequenceScore;
321        }
322
323        /**
324         * Return features at a sequence position by type
325         * @param featureType
326         * @param bioSequencePosition
327         * @return
328         */
329        public List<FeatureInterface<AbstractSequence<C>, C>> getFeatures(String featureType, int bioSequencePosition) {
330                ArrayList<FeatureInterface<AbstractSequence<C>, C>> featureHits =
331                                new ArrayList<FeatureInterface<AbstractSequence<C>, C>>();
332                List<FeatureInterface<AbstractSequence<C>, C>> features = getFeaturesByType(featureType);
333                if (features != null) {
334                        for (FeatureInterface<AbstractSequence<C>, C> feature : features) {
335                                if (bioSequencePosition >= feature.getLocations().getStart().getPosition() && bioSequencePosition <= feature.getLocations().getEnd().getPosition()) {
336                                        featureHits.add(feature);
337                                }
338                        }
339                }
340                return featureHits;
341        }
342
343        /**
344         * Return features at a sequence position
345         * @param featureType
346         * @param bioSequencePosition
347         * @return
348         */
349        public List<FeatureInterface<AbstractSequence<C>, C>> getFeatures(int bioSequencePosition) {
350                ArrayList<FeatureInterface<AbstractSequence<C>, C>> featureHits =
351                                new ArrayList<FeatureInterface<AbstractSequence<C>, C>>();
352                if (features != null) {
353                        for (FeatureInterface<AbstractSequence<C>, C> feature : features) {
354                                if (bioSequencePosition >= feature.getLocations().getStart().getPosition() && bioSequencePosition <= feature.getLocations().getEnd().getPosition()) {
355                                        featureHits.add(feature);
356                                }
357                        }
358                }
359                return featureHits;
360        }
361
362        /**
363         *
364         * @return
365         */
366        public List<FeatureInterface<AbstractSequence<C>, C>> getFeatures() {
367                return features;
368        }
369
370        /**
371         * Method to help set the proper details for a feature as it relates to a sequence
372         * where the feature needs to have a location on the sequence
373         * @param bioStart
374         * @param bioEnd
375         * @param feature
376         */
377        public void addFeature(int bioStart, int bioEnd, FeatureInterface<AbstractSequence<C>, C> feature) {
378                SequenceLocation<AbstractSequence<C>, C> sequenceLocation =
379                                new SequenceLocation<AbstractSequence<C>, C>(bioStart, bioEnd, this);
380                feature.setLocation(sequenceLocation);
381                addFeature(feature);
382        }
383
384        /**
385         * Add a feature to this sequence. The feature will be added to the collection where the order is start position and if more than
386         * one feature at the same start position then longest is added first. This helps on doing feature layout for displaying features
387         * in SequenceFeaturePanel
388         * @param feature
389         */
390        public void addFeature(FeatureInterface<AbstractSequence<C>, C> feature) {
391                features.add(feature);
392                ArrayList<FeatureInterface<AbstractSequence<C>, C>> featureList = groupedFeatures.get(feature.getType());
393                if (featureList == null) {
394                        featureList = new ArrayList<FeatureInterface<AbstractSequence<C>, C>>();
395                        groupedFeatures.put(feature.getType(), featureList);
396                }
397                featureList.add(feature);
398                Collections.sort(features, AbstractFeature.LOCATION_LENGTH);
399                Collections.sort(featureList, AbstractFeature.LOCATION_LENGTH);
400        }
401
402        /**
403         * Remove a feature from the sequence
404         * @param feature
405         */
406        public void removeFeature(FeatureInterface<AbstractSequence<C>, C> feature) {
407                features.remove(feature);
408                ArrayList<FeatureInterface<AbstractSequence<C>, C>> featureList = groupedFeatures.get(feature.getType());
409                if (featureList != null) {
410                        featureList.remove(feature);
411                        if (featureList.isEmpty()) {
412                                groupedFeatures.remove(feature.getType());
413                        }
414                }
415        }
416
417        /**
418         *
419         * @param type
420         * @return
421         */
422        public List<FeatureInterface<AbstractSequence<C>, C>> getFeaturesByType(String type) {
423                List<FeatureInterface<AbstractSequence<C>, C>> features = groupedFeatures.get(type);
424                if (features == null) {
425                        features = new ArrayList<FeatureInterface<AbstractSequence<C>, C>>();
426                }
427                return features;
428        }
429
430        /**
431         * @return the featuresKeyWord
432         */
433        public FeaturesKeyWordInterface getFeaturesKeyWord() {
434                return featuresKeyWord;
435        }
436
437        /**
438         * @param featuresKeyWord the featuresKeyWord to set
439         */
440        public void setFeaturesKeyWord(FeaturesKeyWordInterface featuresKeyWord) {
441                this.featuresKeyWord = featuresKeyWord;
442        }
443
444        /**
445         * @return the databaseReferences
446         */
447        public DatabaseReferenceInterface getDatabaseReferences() {
448                return databaseReferences;
449        }
450
451        /**
452         * @param databaseReferences the databaseReferences to set
453         */
454        public void setDatabaseReferences(DatabaseReferenceInterface databaseReferences) {
455                this.databaseReferences = databaseReferences;
456        }
457
458        public FeatureRetriever getFeatureRetriever() {
459                return featureRetriever;
460        }
461
462        public void setFeatureRetriever(FeatureRetriever featureRetriever) {
463                this.featureRetriever = featureRetriever;
464        }
465
466
467
468        public enum AnnotationType {
469
470                CURATED, PREDICTED, UNKNOWN;
471        }
472
473        /**
474         * @return the accession
475         */
476        @Override
477        public AccessionID getAccession() {
478                return accession;
479        }
480
481        /**
482         * @param accession the accession to set
483         */
484        public void setAccession(AccessionID accession) {
485                this.accession = accession;
486        }
487
488        /**
489         * @return the species
490         */
491        public TaxonomyID getTaxonomy() {
492                return taxonomy;
493        }
494
495        /**
496         * @param species the species to set
497         */
498        public void setTaxonomy(TaxonomyID taxonomy) {
499                this.taxonomy = taxonomy;
500        }
501
502        @Override
503        public CompoundSet<C> getCompoundSet() {
504                if (compoundSet != null) {
505                        return compoundSet;
506                }
507                // This is invalid since the parentSequence isn't guaranteed to have the same compound set as this sequence,
508                // e.g., the case where the parent sequence for a protein is a CDS.
509                /*
510                if (parentSequence != null) {
511                        return parentSequence.getCompoundSet();
512                }
513                */
514                return null;
515
516
517        }
518
519        public void setCompoundSet(CompoundSet<C> compoundSet) {
520                this.compoundSet = compoundSet;
521        }
522
523        @Override
524        public String toString() {
525                return getSequenceAsString();
526        }
527
528        private SequenceReader<C> getSequenceStorage() {
529                if (sequenceStorage != null) {
530                        return sequenceStorage;
531                }
532                if (parentSequence != null) {
533
534                        //return parentSequence.getSequenceStorage();
535
536                        if ( this.compoundSet.equals(parentSequence.getCompoundSet())){
537                                sequenceStorage = new ArrayListSequenceReader<C>();
538                                sequenceStorage.setCompoundSet(this.getCompoundSet());
539                                try {
540                                        sequenceStorage.setContents(parentSequence.getSequenceAsString());
541                                } catch (CompoundNotFoundException e) {
542                                        // TODO is there a better way to handle this exception?
543                                        logger.error("Problem setting contents from parent sequence, some unrecognised compound: {}",e.getMessage());
544                                }
545                                return sequenceStorage;
546                        }
547
548                }
549
550                return null;
551        }
552
553        /**
554         *
555         * @param begin
556         * @param end
557         * @param strand
558         * @return
559         */
560        public String getSequenceAsString(Integer bioStart, Integer bioEnd, Strand strand) {
561
562                Location loc = new SimpleLocation(bioStart, bioEnd, strand);
563                return loc.getSubSequence(this).getSequenceAsString();
564        }
565
566        /**
567         * Default case is to assume strand is positive because only CDSSequence can be either positive or negative Strand.
568         * @return
569         */
570        @Override
571        public String getSequenceAsString() {
572                return SequenceMixin.toString(this);
573
574        }
575
576        /**
577         *
578         * @return
579         */
580        @Override
581        public List<C> getAsList() {
582                return SequenceMixin.toList(this);
583        }
584
585        /**
586         *
587         * @param position The 1-indexed position of the amino acid
588         * @return
589         */
590        @Override
591        public C getCompoundAt(int position) {
592
593                return getSequenceStorage().getCompoundAt(position);
594        }
595
596        /**
597         *
598         * @param compound
599         * @return The first index of compound in this sequence (1-based)
600         */
601        @Override
602        public int getIndexOf(C compound) {
603                return getSequenceStorage().getIndexOf(compound);
604        }
605
606        /**
607         *
608         * @param compound
609         * @return The last index of compound in this sequence (1-based)
610         */
611        @Override
612        public int getLastIndexOf(C compound) {
613                return getSequenceStorage().getLastIndexOf(compound);
614        }
615
616        /**
617         *
618         * @return
619         */
620        @Override
621        public int getLength() {
622                return getSequenceStorage().getLength();
623        }
624
625        /**
626         *
627         * @param bioStart
628         * @param bioEnd
629         * @return
630         */
631        @Override
632        public SequenceView<C> getSubSequence(final Integer bioStart, final Integer bioEnd) {
633                return new SequenceProxyView<C>(this, bioStart, bioEnd);
634        }
635
636        /**
637         *
638         * @return
639         */
640        @Override
641        public Iterator<C> iterator() {
642                return getSequenceStorage().iterator();
643        }
644
645        /**
646         *
647         * @param compounds
648         * @return
649         */
650        @Override
651        public int countCompounds(C... compounds) {
652                return SequenceMixin.countCompounds(this, compounds);
653        }
654
655        /**
656         *
657         * @return
658         */
659        @Override
660        public SequenceView<C> getInverse() {
661                return SequenceMixin.inverse(this);
662        }
663
664        //TODO needs equals and hashcode
665}