001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 * Created on 01-21-2010
021 *
022 * @author Richard Holland
023 * @author Scooter Willis
024 * @author Paolo Pavan
025 *
026 */
027package org.biojava.nbio.core.sequence.template;
028
029import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
030import org.biojava.nbio.core.sequence.AccessionID;
031import org.biojava.nbio.core.sequence.DataSource;
032import org.biojava.nbio.core.sequence.Strand;
033import org.biojava.nbio.core.sequence.TaxonomyID;
034import org.biojava.nbio.core.sequence.features.*;
035import org.biojava.nbio.core.sequence.loader.UniprotProxySequenceReader;
036import org.biojava.nbio.core.sequence.location.SequenceLocation;
037import org.biojava.nbio.core.sequence.location.SimpleLocation;
038import org.biojava.nbio.core.sequence.location.template.Location;
039import org.biojava.nbio.core.sequence.reference.AbstractReference;
040import org.biojava.nbio.core.sequence.storage.ArrayListSequenceReader;
041import org.biojava.nbio.core.util.Equals;
042import org.slf4j.Logger;
043import org.slf4j.LoggerFactory;
044
045import java.util.*;
046
047/**
048 *
049 * The base class for DNA, RNA and Protein sequences.
050 * @param <C>
051 */
052public abstract class AbstractSequence<C extends Compound> implements Sequence<C> {
053
054        private final static Logger logger = LoggerFactory.getLogger(AbstractSequence.class);
055
056        private TaxonomyID taxonomy;
057        private AccessionID accession;
058        private SequenceReader<C> sequenceStorage = null;
059        private CompoundSet<C> compoundSet;
060        private AnnotationType annotationType = AnnotationType.UNKNOWN;
061        private String description;
062        private String originalHeader;
063        private Collection<Object> userCollection;
064        private Integer bioBegin = null;
065        private Integer bioEnd = null;
066        private AbstractSequence<?> parentSequence = null;
067        private String source = null;
068        private ArrayList<String> notesList = new ArrayList<String>();
069        private Double sequenceScore = null;
070        private FeaturesKeyWordInterface featuresKeyWord = null;
071        private DatabaseReferenceInterface databaseReferences = null;
072        private FeatureRetriever featureRetriever = null;
073        private ArrayList<FeatureInterface<AbstractSequence<C>, C>> features =
074                        new ArrayList<FeatureInterface<AbstractSequence<C>, C>>();
075        private LinkedHashMap<String, ArrayList<FeatureInterface<AbstractSequence<C>, C>>> groupedFeatures =
076                        new LinkedHashMap<String, ArrayList<FeatureInterface<AbstractSequence<C>, C>>>();
077        private List<String> comments = new ArrayList<>();
078        private List<AbstractReference> references;
079
080        public AbstractSequence() {
081        }
082
083        /**
084         * Create a Sequence from a simple string where the values should be found in compoundSet
085         * @param seqString
086         * @param compoundSet
087         * @throws CompoundNotFoundException
088         */
089        public AbstractSequence(String seqString, CompoundSet<C> compoundSet) throws CompoundNotFoundException {
090                setCompoundSet(compoundSet);
091                sequenceStorage = new ArrayListSequenceReader<C>();
092                sequenceStorage.setCompoundSet(this.getCompoundSet());
093                sequenceStorage.setContents(seqString);
094        }
095
096        /**
097         * A ProxySequenceReader allows abstraction of both the storage of the sequence data and the location
098         * of the sequence data. A variety of use cases are possible. A ProxySequenceReader that knows the offset and of the sequence in
099         * a large fasta file. A ProxySequenceReader that can pull Sequence data from UniProt, NCBI or a custom database.
100         * If the ProxySequenceReader implements various interfaces then the sequence will set those interfaces so that calls to
101         * various methods will be valid.
102         *
103         * @param proxyLoader
104         * @param compoundSet
105         */
106        public AbstractSequence(SequenceReader<C> proxyLoader, CompoundSet<C> compoundSet) {
107                setCompoundSet(compoundSet);
108                setProxySequenceReader(proxyLoader);
109        }
110
111        /**
112         * Very important method that allows external mappings of sequence data and features. This method
113         * will gain additional interface inspection that allows external data sources with knowledge
114         * of features for a sequence to be supported.
115         *
116         * @param proxyLoader
117         */
118        public void setProxySequenceReader(SequenceReader<C> proxyLoader) {
119                this.sequenceStorage = proxyLoader;
120                if (proxyLoader instanceof FeaturesKeyWordInterface) {
121                        this.setFeaturesKeyWord((FeaturesKeyWordInterface) sequenceStorage);
122                }
123                if (proxyLoader instanceof DatabaseReferenceInterface) {
124                        this.setDatabaseReferences((DatabaseReferenceInterface) sequenceStorage);
125                }
126
127                if (proxyLoader instanceof FeatureRetriever) {
128                        this.setFeatureRetriever((FeatureRetriever) sequenceStorage);
129                        HashMap<String, ArrayList<AbstractFeature>> ff = getFeatureRetriever().getFeatures();
130                        for (String k: ff.keySet()){
131                                for (AbstractFeature f: ff.get(k)){
132                                        this.addFeature(f);
133                                }
134                        }
135                        // success of next statement guaranteed because source is a compulsory field
136                        //DBReferenceInfo dbQualifier = (DBReferenceInfo)ff.get("source").get(0).getQualifiers().get("db_xref");
137                        ArrayList<DBReferenceInfo> dbQualifiers = (ArrayList)ff.get("source").get(0).getQualifiers().get("db_xref");
138                        DBReferenceInfo dbQualifier = dbQualifiers.get(0);
139
140                        if (dbQualifier != null) this.setTaxonomy(new TaxonomyID(dbQualifier.getDatabase()+":"+dbQualifier.getId(), DataSource.UNKNOWN));
141                }
142
143                if(getAccession() == null && proxyLoader instanceof UniprotProxySequenceReader){ // we have lots of unsupported operations for this call so quick fix to allow this tow rork
144                        this.setAccession(proxyLoader.getAccession());
145                }
146        }
147
148        public SequenceReader<C> getProxySequenceReader() {
149                return sequenceStorage;
150        }
151
152        /**
153         * @return the bioBegin
154         */
155        public Integer getBioBegin() {
156                if (bioBegin == null) {
157                        return 1;
158                } else {
159                        return bioBegin;
160                }
161        }
162
163        /**
164         * @param bioBegin the bioBegin to set
165         */
166        public void setBioBegin(Integer bioBegin) {
167                this.bioBegin = bioBegin;
168        }
169
170        /**
171         * @return the bioEnd
172         */
173        public Integer getBioEnd() {
174                if (bioEnd == null) {
175                        return this.getLength();
176                } else {
177                        return bioEnd;
178                }
179        }
180
181        /**
182         * @param bioEnd the bioEnd to set
183         */
184        public void setBioEnd(Integer bioEnd) {
185                this.bioEnd = bioEnd;
186        }
187
188        /**
189         * Provided for convince if the developer needs to associate data with a sequence
190         *
191         * @return
192         */
193        public Collection<Object> getUserCollection() {
194
195                return userCollection;
196        }
197
198        /**
199         *
200         * @param userCollection
201         */
202        public void setUserCollection(Collection<Object> userCollection) {
203                this.userCollection = userCollection;
204        }
205
206        /**
207         * @return the annotation
208         */
209        public AnnotationType getAnnotationType() {
210                return annotationType;
211        }
212
213        /**
214         * @param annotationType the annotation to set
215         */
216        public void setAnnotationType(AnnotationType annotationType) {
217                this.annotationType = annotationType;
218        }
219
220        /**
221         * @return the description
222         */
223        public String getDescription() {
224                return description;
225        }
226
227        /**
228         * @param description the description to set
229         */
230        public void setDescription(String description) {
231                this.description = description;
232        }
233
234        /**
235         * @return the originalHeader
236         */
237        public String getOriginalHeader() {
238                return originalHeader;
239        }
240
241        /**
242         * @param originalHeader the originalHeader to set
243         */
244        public void setOriginalHeader(String originalHeader) {
245                this.originalHeader = originalHeader;
246        }
247
248        /**
249         * @return the parentSequence
250         */
251        public AbstractSequence<?> getParentSequence() {
252                return parentSequence;
253        }
254
255        /**
256         * @param parentSequence the parentSequence to set
257         */
258        public void setParentSequence(AbstractSequence<?> parentSequence) {
259                this.parentSequence = parentSequence;
260        }
261
262        /**
263         * Added support for the source of this sequence for GFF3 export
264         * If a sub sequence doesn't have  source then check for parent source
265         * @return the source
266         */
267        public String getSource() {
268                if (source != null) {
269                        return source;
270                }
271                if (parentSequence != null) {
272                        return parentSequence.getSource();
273                }
274                return null;
275        }
276
277        /**
278         * Added support for the source of this sequence for GFF3 export
279         * @param source the source to set
280         */
281        public void setSource(String source) {
282
283                this.source = source;
284        }
285
286        /**
287         * Add notes about this sequence that will get exported for GFF3
288         * @param note
289         */
290        public void addNote(String note) {
291                notesList.add(note);
292        }
293
294        public void removeNote(String note) {
295                notesList.remove(note);
296        }
297
298        /**
299         * @return the notesList
300         */
301        public ArrayList<String> getNotesList() {
302                return notesList;
303        }
304
305        /**
306         * @param notesList the notesList to set
307         */
308        public void setNotesList(ArrayList<String> notesList) {
309                this.notesList = notesList;
310        }
311
312        /**
313         * Provide place holder for a metric that indicate a score associated with the sequence
314         * @return the sequenceScore
315         */
316        public Double getSequenceScore() {
317                return sequenceScore;
318        }
319
320        /**
321         * @param sequenceScore the sequenceScore to set
322         */
323        public void setSequenceScore(Double sequenceScore) {
324                this.sequenceScore = sequenceScore;
325        }
326
327        /**
328         * @since 5.0.0
329         * @return the list of {@link AbstractReference}
330         */
331        public List<AbstractReference> getReferences() {
332                return references;
333        }
334
335        /**
336         * Set the list of {@link AbstractReference}
337         * @since 5.0.0
338         * @param references
339         */
340        public void setReferences(List<AbstractReference> references) {
341                this.references = references;
342        }
343
344        /**
345         * Return features at a sequence position by type
346         * @param featureType
347         * @param bioSequencePosition
348         * @return
349         */
350        public List<FeatureInterface<AbstractSequence<C>, C>> getFeatures(String featureType, int bioSequencePosition) {
351                ArrayList<FeatureInterface<AbstractSequence<C>, C>> featureHits =
352                                new ArrayList<FeatureInterface<AbstractSequence<C>, C>>();
353                List<FeatureInterface<AbstractSequence<C>, C>> features = getFeaturesByType(featureType);
354                if (features != null) {
355                        for (FeatureInterface<AbstractSequence<C>, C> feature : features) {
356                                if (bioSequencePosition >= feature.getLocations().getStart().getPosition() && bioSequencePosition <= feature.getLocations().getEnd().getPosition()) {
357                                        featureHits.add(feature);
358                                }
359                        }
360                }
361                return featureHits;
362        }
363
364        /**
365         * Return features at a sequence position
366         * @param bioSequencePosition
367         * @return
368         */
369        public List<FeatureInterface<AbstractSequence<C>, C>> getFeatures(int bioSequencePosition) {
370                ArrayList<FeatureInterface<AbstractSequence<C>, C>> featureHits =
371                                new ArrayList<FeatureInterface<AbstractSequence<C>, C>>();
372                if (features != null) {
373                        for (FeatureInterface<AbstractSequence<C>, C> feature : features) {
374                                if (bioSequencePosition >= feature.getLocations().getStart().getPosition() && bioSequencePosition <= feature.getLocations().getEnd().getPosition()) {
375                                        featureHits.add(feature);
376                                }
377                        }
378                }
379                return featureHits;
380        }
381
382        /**
383         *
384         * @return
385         */
386        public List<FeatureInterface<AbstractSequence<C>, C>> getFeatures() {
387                return features;
388        }
389
390        /**
391         * Method to help set the proper details for a feature as it relates to a sequence
392         * where the feature needs to have a location on the sequence
393         * @param bioStart
394         * @param bioEnd
395         * @param feature
396         */
397        public void addFeature(int bioStart, int bioEnd, FeatureInterface<AbstractSequence<C>, C> feature) {
398                SequenceLocation<AbstractSequence<C>, C> sequenceLocation =
399                                new SequenceLocation<AbstractSequence<C>, C>(bioStart, bioEnd, this);
400                feature.setLocation(sequenceLocation);
401                addFeature(feature);
402        }
403
404        /**
405         * Add a feature to this sequence. The feature will be added to the collection where the order is start position and if more than
406         * one feature at the same start position then longest is added first. This helps on doing feature layout for displaying features
407         * in SequenceFeaturePanel
408         * @param feature
409         */
410        public void addFeature(FeatureInterface<AbstractSequence<C>, C> feature) {
411                features.add(feature);
412                ArrayList<FeatureInterface<AbstractSequence<C>, C>> featureList = groupedFeatures.get(feature.getType());
413                if (featureList == null) {
414                        featureList = new ArrayList<FeatureInterface<AbstractSequence<C>, C>>();
415                        groupedFeatures.put(feature.getType(), featureList);
416                }
417                featureList.add(feature);
418                Collections.sort(features, AbstractFeature.LOCATION_LENGTH);
419                Collections.sort(featureList, AbstractFeature.LOCATION_LENGTH);
420        }
421
422        /**
423         * Remove a feature from the sequence
424         * @param feature
425         */
426        public void removeFeature(FeatureInterface<AbstractSequence<C>, C> feature) {
427                features.remove(feature);
428                ArrayList<FeatureInterface<AbstractSequence<C>, C>> featureList = groupedFeatures.get(feature.getType());
429                if (featureList != null) {
430                        featureList.remove(feature);
431                        if (featureList.isEmpty()) {
432                                groupedFeatures.remove(feature.getType());
433                        }
434                }
435        }
436
437        /**
438         *
439         * @param type
440         * @return
441         */
442        public List<FeatureInterface<AbstractSequence<C>, C>> getFeaturesByType(String type) {
443                List<FeatureInterface<AbstractSequence<C>, C>> features = groupedFeatures.get(type);
444                if (features == null) {
445                        features = new ArrayList<FeatureInterface<AbstractSequence<C>, C>>();
446                }
447                return features;
448        }
449
450        /**
451         *
452         * @return comments
453         */
454        public List<String> getComments() {
455                return comments;
456        }
457
458        /**
459         * Set comments.
460         * @param comments
461         */
462        public void setComments(List<String> comments) {
463                this.comments = comments;
464        }
465
466        /**
467         * @return the featuresKeyWord
468         */
469        public FeaturesKeyWordInterface getFeaturesKeyWord() {
470                return featuresKeyWord;
471        }
472
473        /**
474         * @param featuresKeyWord the featuresKeyWord to set
475         */
476        public void setFeaturesKeyWord(FeaturesKeyWordInterface featuresKeyWord) {
477                this.featuresKeyWord = featuresKeyWord;
478        }
479
480        /**
481         * @return the databaseReferences
482         */
483        public DatabaseReferenceInterface getDatabaseReferences() {
484                return databaseReferences;
485        }
486
487        /**
488         * @param databaseReferences the databaseReferences to set
489         */
490        public void setDatabaseReferences(DatabaseReferenceInterface databaseReferences) {
491                this.databaseReferences = databaseReferences;
492        }
493
494        public FeatureRetriever getFeatureRetriever() {
495                return featureRetriever;
496        }
497
498        public void setFeatureRetriever(FeatureRetriever featureRetriever) {
499                this.featureRetriever = featureRetriever;
500        }
501
502
503
504        public enum AnnotationType {
505
506                CURATED, PREDICTED, UNKNOWN;
507        }
508
509        /**
510         * @return the accession
511         */
512        @Override
513        public AccessionID getAccession() {
514                return accession;
515        }
516
517        /**
518         * @param accession the accession to set
519         */
520        public void setAccession(AccessionID accession) {
521                this.accession = accession;
522        }
523
524        /**
525         * @return the species
526         */
527        public TaxonomyID getTaxonomy() {
528                return taxonomy;
529        }
530
531        /**
532         * @param taxonomy the species to set
533         */
534        public void setTaxonomy(TaxonomyID taxonomy) {
535                this.taxonomy = taxonomy;
536        }
537
538        @Override
539        public CompoundSet<C> getCompoundSet() {
540                if (compoundSet != null) {
541                        return compoundSet;
542                }
543                // This is invalid since the parentSequence isn't guaranteed to have the same compound set as this sequence,
544                // e.g., the case where the parent sequence for a protein is a CDS.
545                /*
546                if (parentSequence != null) {
547                        return parentSequence.getCompoundSet();
548                }
549                */
550                return null;
551
552
553        }
554
555        public void setCompoundSet(CompoundSet<C> compoundSet) {
556                this.compoundSet = compoundSet;
557        }
558
559        @Override
560        public boolean equals(Object o){
561
562                if(! Equals.classEqual(this, o)) {
563                        return false;
564                }
565
566                Sequence<C> other = (Sequence<C>)o;
567
568                if ( other.getCompoundSet() != getCompoundSet())
569                        return false;
570
571
572                List<C> rawCompounds = getAsList();
573                List<C> otherCompounds = other.getAsList();
574
575                if ( rawCompounds.size() != otherCompounds.size())
576                        return false;
577
578                for (int i = 0 ; i < rawCompounds.size() ; i++){
579                        Compound myCompound = rawCompounds.get(i);
580                        Compound otherCompound = otherCompounds.get(i);
581                        if ( ! myCompound.equalsIgnoreCase(otherCompound))
582                                return false;
583                }
584                return true;
585        }
586
587        @Override
588        public int hashCode(){
589                String s = getSequenceAsString();
590                return s.hashCode();
591        }
592
593        @Override
594        public String toString() {
595                return getSequenceAsString();
596        }
597
598        private SequenceReader<C> getSequenceStorage() {
599                if (sequenceStorage != null) {
600                        return sequenceStorage;
601                }
602                if (parentSequence != null) {
603
604                        //return parentSequence.getSequenceStorage();
605
606                        if ( this.compoundSet.equals(parentSequence.getCompoundSet())){
607                                sequenceStorage = new ArrayListSequenceReader<C>();
608                                sequenceStorage.setCompoundSet(this.getCompoundSet());
609                                try {
610                                        sequenceStorage.setContents(parentSequence.getSequenceAsString());
611                                } catch (CompoundNotFoundException e) {
612                                        // TODO is there a better way to handle this exception?
613                                        logger.error("Problem setting contents from parent sequence, some unrecognised compound: {}",e.getMessage());
614                                }
615                                return sequenceStorage;
616                        }
617
618                }
619
620                return null;
621        }
622
623        /**
624         *
625         * @param bioStart
626         * @param bioEnd
627         * @param strand
628         * @return
629         */
630        public String getSequenceAsString(Integer bioStart, Integer bioEnd, Strand strand) {
631
632                Location loc = new SimpleLocation(bioStart, bioEnd, strand);
633                return loc.getSubSequence(this).getSequenceAsString();
634        }
635
636        /**
637         * Default case is to assume strand is positive because only CDSSequence can be either positive or negative Strand.
638         * @return
639         */
640        @Override
641        public String getSequenceAsString() {
642                return SequenceMixin.toString(this);
643
644        }
645
646        /**
647         *
648         * @return
649         */
650        @Override
651        public List<C> getAsList() {
652
653                return sequenceStorage.getAsList();
654        }
655
656        /**
657         *
658         * @param position The 1-indexed position of the amino acid
659         * @return
660         */
661        @Override
662        public C getCompoundAt(int position) {
663
664                return getSequenceStorage().getCompoundAt(position);
665        }
666
667        /**
668         *
669         * @param compound
670         * @return The first index of compound in this sequence (1-based)
671         */
672        @Override
673        public int getIndexOf(C compound) {
674                return getSequenceStorage().getIndexOf(compound);
675        }
676
677        /**
678         *
679         * @param compound
680         * @return The last index of compound in this sequence (1-based)
681         */
682        @Override
683        public int getLastIndexOf(C compound) {
684                return getSequenceStorage().getLastIndexOf(compound);
685        }
686
687        /**
688         *
689         * @return
690         */
691        @Override
692        public int getLength() {
693                return getSequenceStorage().getLength();
694        }
695
696        /**
697         *
698         * @param bioStart
699         * @param bioEnd
700         * @return
701         */
702        @Override
703        public SequenceView<C> getSubSequence(final Integer bioStart, final Integer bioEnd) {
704                return new SequenceProxyView<C>(this, bioStart, bioEnd);
705        }
706
707        /**
708         *
709         * @return
710         */
711        @Override
712        public Iterator<C> iterator() {
713                return getSequenceStorage().iterator();
714        }
715
716        /**
717         *
718         * @param compounds
719         * @return
720         */
721        @Override
722        public int countCompounds(C... compounds) {
723                return SequenceMixin.countCompounds(this, compounds);
724        }
725
726        /**
727         *
728         * @return
729         */
730        @Override
731        public SequenceView<C> getInverse() {
732                return SequenceMixin.inverse(this);
733        }
734
735
736}