Source code

001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 * Created on DATE
021 *
022 */
023package org.biojava.nbio.core.sequence;
024
025import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
026import org.biojava.nbio.core.sequence.compound.*;
027import org.biojava.nbio.core.sequence.features.FeatureInterface;
028import org.biojava.nbio.core.sequence.io.DNASequenceCreator;
029import org.biojava.nbio.core.sequence.io.FastaReader;
030import org.biojava.nbio.core.sequence.io.PlainFastaHeaderParser;
031import org.biojava.nbio.core.sequence.location.InsdcParser;
032import org.biojava.nbio.core.sequence.location.template.Location;
033import org.biojava.nbio.core.sequence.template.AbstractSequence;
034import org.biojava.nbio.core.sequence.template.CompoundSet;
035import org.biojava.nbio.core.sequence.template.ProxySequenceReader;
036import org.slf4j.Logger;
037import org.slf4j.LoggerFactory;
038
039import java.io.IOException;
040import java.io.InputStream;
041import java.net.URL;
042import java.util.LinkedHashMap;
043import java.util.List;
044import org.biojava.nbio.core.sequence.features.Qualifier;
045
046/**
047 * The representation of a ProteinSequence
048 *
049 * @author Scooter Willis
050 * @author Paolo Pavan
051 */
052public class ProteinSequence extends AbstractSequence<AminoAcidCompound> {
053
054        private final static Logger logger = LoggerFactory.getLogger(ProteinSequence.class);
055
056        /*
057         private ArrayList<FeatureInterface<AbstractSequence<AminoAcidCompound>, AminoAcidCompound>> features
058         = new ArrayList<FeatureInterface<AbstractSequence<AminoAcidCompound>, AminoAcidCompound>>();
059         private LinkedHashMap<String, ArrayList<FeatureInterface<AbstractSequence<AminoAcidCompound>, AminoAcidCompound>>> groupedFeatures
060         = new LinkedHashMap<String, ArrayList<FeatureInterface<AbstractSequence<AminoAcidCompound>, AminoAcidCompound>>>();
061         */
062        /**
063         * Create a protein from a string
064         *
065         * @param seqString
066         * @throws CompoundNotFoundException
067         */
068        public ProteinSequence(String seqString) throws CompoundNotFoundException {
069                this(seqString, AminoAcidCompoundSet.getAminoAcidCompoundSet());
070        }
071
072        /**
073         * Create a protein from a string with a user defined set of amino acids
074         *
075         * @param seqString
076         * @param compoundSet
077         * @throws CompoundNotFoundException
078         */
079        public ProteinSequence(String seqString, CompoundSet<AminoAcidCompound> compoundSet) throws CompoundNotFoundException {
080                super(seqString, compoundSet);
081        }
082
083        /**
084         * A protein sequence where the storage of the sequence is somewhere else.
085         * Could be loaded from a large Fasta file or via a Uniprot Proxy reader via
086         * Uniprot ID
087         *
088         * @param proxyLoader
089         */
090        public ProteinSequence(ProxySequenceReader<AminoAcidCompound> proxyLoader) {
091                this(proxyLoader, AminoAcidCompoundSet.getAminoAcidCompoundSet());
092        }
093
094        /**
095         * A protein sequence where the storage of the sequence is somewhere else
096         * with user defined set of amino acids. Could be loaded from a large Fasta
097         * file or via a Uniprot Proxy reader via Uniprot ID
098         *
099         * @param proxyLoader
100         * @param compoundSet
101         */
102        public ProteinSequence(ProxySequenceReader<AminoAcidCompound> proxyLoader, CompoundSet<AminoAcidCompound> compoundSet) {
103                super(proxyLoader, compoundSet);
104
105                // do protein-specific tasks
106                // add source if found
107                List<FeatureInterface<AbstractSequence<AminoAcidCompound>, AminoAcidCompound>> CDSFeatures = getFeaturesByType("CDS");
108
109                // cases if a protein has more than 1 parent are not supported yet
110                if (CDSFeatures.size() == 1) {
111                        Qualifier codedBy = CDSFeatures.get(0).getQualifiers().get("coded_by").get(0);
112
113                        if (codedBy != null) {
114                                String codedBySeq = codedBy.getValue();
115
116                                InsdcParser parser = new InsdcParser(DataSource.GENBANK);
117                                Location location = parser.parse(codedBySeq);
118
119                                try {
120                                        DNASequence dnaSeq = new DNASequence(getSequence(location), DNACompoundSet.getDNACompoundSet());
121                                        setParentDNASequence(dnaSeq, location.getStart().getPosition(), location.getEnd().getPosition());
122                                } catch (CompoundNotFoundException e) {
123                                        // TODO is there another solution to handle this exception?
124                                        logger.error("Could not add 'coded_by' parent DNA location feature, unrecognised compounds found in DNA sequence: {}", e.getMessage());
125                                }
126                        }
127                }
128
129        }
130
131        /**
132         * A Protein sequence can be stand alone or loaded from a transcript
133         * sequence. The design goal is to allow the creation of a Protein sequence
134         * from a Uniprot ID or some other Protein ID that based on cross reference
135         * you should be able to get the GeneSequence that codes for the protein if
136         * the CDS/Gene region is known. From the GeneSequence you should then be
137         * able to get the ChromosomeSequence which then allows you explore flaning
138         * regions of the gene sequences. The framework is in place to do this but
139         * currently hasn't been implement in the reverse direction starting from
140         * the Protein sequence.
141         *
142         * @param parentDNASequence
143         * @param begin
144         * @param end
145         */
146        //TODO - Someone needs to check if this is a bug.  Shouldn't a parentDNASequence be something other then AminoAcid?
147        //However, due to the derivation of this class, this is the only possible type argument for this parameter...
148        public void setParentDNASequence(AbstractSequence<NucleotideCompound> parentDNASequence, Integer begin, Integer end) {
149                this.setParentSequence(parentDNASequence);
150                setBioBegin(begin);
151                setBioEnd(end);
152        }
153
154        private DNASequence getRawParentSequence(String accessId) throws IOException {
155                String seqUrlTemplate = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&id=%s&rettype=fasta&retmode=text";
156                URL url = new URL(String.format(seqUrlTemplate, accessId));
157
158                logger.trace("Getting parent DNA sequence from URL: {}", url.toString());
159
160                InputStream is = url.openConnection().getInputStream();
161
162                FastaReader<DNASequence, NucleotideCompound> parentReader
163                                = new FastaReader<DNASequence, NucleotideCompound>(is,
164                                                new PlainFastaHeaderParser<DNASequence, NucleotideCompound>(),
165                                                new DNASequenceCreator(AmbiguityDNACompoundSet.getDNACompoundSet()));
166                LinkedHashMap<String, DNASequence> seq = parentReader.process();
167
168                DNASequence parentSeq = null;
169                if (seq.size() == 1) {
170                        parentSeq = seq.values().iterator().next();
171                }
172                is.close();
173
174                return parentSeq;
175        }
176
177        private String getSequence(Location cdna) {
178                DNASequence rawParent;
179                if (!cdna.isComplex()) {
180                        try {
181                                rawParent = getRawParentSequence(cdna.getAccession().getID());
182                                return cdna.getSubSequence(rawParent).getSequenceAsString();
183                        } catch (IOException e) {
184                                // return null
185                                logger.error("Caught IOException when getting DNA sequence for id {}. Error: {}", cdna.getAccession().getID(), e.getMessage());
186                                return null;
187                        }
188                } else {
189                        // in case of complex
190                        StringBuilder sb = new StringBuilder();
191
192                        for (Location sub : cdna.getSubLocations()) {
193                                String sebStr = getSequence(sub);
194                                sb.append((sebStr == null ? "" : sebStr));
195                        }
196
197                        return sb.toString();
198                }
199        }
200
201}