001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 * Created on DATE
021 *
022 */
023package org.biojava.nbio.core.sequence;
024
025import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
026import org.biojava.nbio.core.sequence.compound.*;
027import org.biojava.nbio.core.sequence.features.FeatureInterface;
028import org.biojava.nbio.core.sequence.io.DNASequenceCreator;
029import org.biojava.nbio.core.sequence.io.FastaReader;
030import org.biojava.nbio.core.sequence.io.PlainFastaHeaderParser;
031import org.biojava.nbio.core.sequence.loader.StringProxySequenceReader;
032import org.biojava.nbio.core.sequence.location.InsdcParser;
033import org.biojava.nbio.core.sequence.location.template.Location;
034import org.biojava.nbio.core.sequence.template.AbstractSequence;
035import org.biojava.nbio.core.sequence.template.CompoundSet;
036import org.biojava.nbio.core.sequence.template.ProxySequenceReader;
037import org.slf4j.Logger;
038import org.slf4j.LoggerFactory;
039
040import java.io.IOException;
041import java.io.InputStream;
042import java.net.URL;
043import java.util.LinkedHashMap;
044import java.util.List;
045import org.biojava.nbio.core.sequence.features.Qualifier;
046
047/**
048 * The representation of a ProteinSequence
049 *
050 * @author Scooter Willis
051 * @author Paolo Pavan
052 */
053public class ProteinSequence extends AbstractSequence<AminoAcidCompound> {
054
055        private final static Logger logger = LoggerFactory.getLogger(ProteinSequence.class);
056
057        /*
058         private ArrayList<FeatureInterface<AbstractSequence<AminoAcidCompound>, AminoAcidCompound>> features
059         = new ArrayList<FeatureInterface<AbstractSequence<AminoAcidCompound>, AminoAcidCompound>>();
060         private LinkedHashMap<String, ArrayList<FeatureInterface<AbstractSequence<AminoAcidCompound>, AminoAcidCompound>>> groupedFeatures
061         = new LinkedHashMap<String, ArrayList<FeatureInterface<AbstractSequence<AminoAcidCompound>, AminoAcidCompound>>>();
062         */
063        /**
064         * Create a protein from a string
065         *
066         * @param seqString
067         * @throws CompoundNotFoundException
068         */
069        public ProteinSequence(String seqString) throws CompoundNotFoundException {
070                this(seqString, AminoAcidCompoundSet.getAminoAcidCompoundSet());
071        }
072
073        /**
074         * Create a protein from a string with a user defined set of amino acids
075         *
076         * @param seqString
077         * @param compoundSet
078         * @throws CompoundNotFoundException
079         */
080        public ProteinSequence(String seqString, CompoundSet<AminoAcidCompound> compoundSet) throws CompoundNotFoundException {
081                super(seqString, compoundSet);
082        }
083
084        /**
085         * A protein sequence where the storage of the sequence is somewhere else.
086         * Could be loaded from a large Fasta file or via a Uniprot Proxy reader via
087         * Uniprot ID
088         *
089         * @param proxyLoader
090         */
091        public ProteinSequence(ProxySequenceReader<AminoAcidCompound> proxyLoader) {
092                this(proxyLoader, AminoAcidCompoundSet.getAminoAcidCompoundSet());
093        }
094
095        /**
096         * A protein sequence where the storage of the sequence is somewhere else
097         * with user defined set of amino acids. Could be loaded from a large Fasta
098         * file or via a Uniprot Proxy reader via Uniprot ID
099         *
100         * @param proxyLoader
101         * @param compoundSet
102         */
103        public ProteinSequence(ProxySequenceReader<AminoAcidCompound> proxyLoader, CompoundSet<AminoAcidCompound> compoundSet) {
104                super(proxyLoader, compoundSet);
105
106                // do protein-specific tasks
107                // add source if found
108                List<FeatureInterface<AbstractSequence<AminoAcidCompound>, AminoAcidCompound>> CDSFeatures = getFeaturesByType("CDS");
109
110                // cases if a protein has more than 1 parent are not supported yet
111                if (CDSFeatures.size() == 1) {
112                        Qualifier codedBy = CDSFeatures.get(0).getQualifiers().get("coded_by").get(0);
113
114                        if (codedBy != null) {
115                                String codedBySeq = codedBy.getValue();
116
117                                InsdcParser parser = new InsdcParser(DataSource.GENBANK);
118                                Location location = parser.parse(codedBySeq);
119
120                                try {
121                                        DNASequence dnaSeq = new DNASequence(getSequence(location), DNACompoundSet.getDNACompoundSet());
122                                        setParentDNASequence(dnaSeq, location.getStart().getPosition(), location.getEnd().getPosition());
123                                } catch (CompoundNotFoundException e) {
124                                        // TODO is there another solution to handle this exception?
125                                        logger.error("Could not add 'coded_by' parent DNA location feature, unrecognised compounds found in DNA sequence: {}", e.getMessage());
126                                }
127                        }
128                }
129
130        }
131
132        /**
133         * A Protein sequence can be stand alone or loaded from a transcript
134         * sequence. The design goal is to allow the creation of a Protein sequence
135         * from a Uniprot ID or some other Protein ID that based on cross reference
136         * you should be able to get the GeneSequence that codes for the protein if
137         * the CDS/Gene region is known. From the GeneSequence you should then be
138         * able to get the ChromosomeSequence which then allows you explore flaning
139         * regions of the gene sequences. The framework is in place to do this but
140         * currently hasn't been implement in the reverse direction starting from
141         * the Protein sequence.
142         *
143         * @param parentDNASequence
144         * @param begin
145         * @param end
146         */
147        //TODO - Someone needs to check if this is a bug.  Shouldn't a parentDNASequence be something other then AminoAcid?
148        //However, due to the derivation of this class, this is the only possible type argument for this parameter...
149        public void setParentDNASequence(AbstractSequence<NucleotideCompound> parentDNASequence, Integer begin, Integer end) {
150                this.setParentSequence(parentDNASequence);
151                setBioBegin(begin);
152                setBioEnd(end);
153        }
154
155        private DNASequence getRawParentSequence(String accessId) throws IOException {
156                String seqUrlTemplate = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&id=%s&rettype=fasta&retmode=text";
157                URL url = new URL(String.format(seqUrlTemplate, accessId));
158
159                logger.trace("Getting parent DNA sequence from URL: {}", url.toString());
160
161                InputStream is = url.openConnection().getInputStream();
162
163                FastaReader<DNASequence, NucleotideCompound> parentReader
164                                = new FastaReader<DNASequence, NucleotideCompound>(is,
165                                                new PlainFastaHeaderParser<DNASequence, NucleotideCompound>(),
166                                                new DNASequenceCreator(AmbiguityDNACompoundSet.getDNACompoundSet()));
167                LinkedHashMap<String, DNASequence> seq = parentReader.process();
168
169                DNASequence parentSeq = null;
170                if (seq.size() == 1) {
171                        parentSeq = seq.values().iterator().next();
172                }
173                is.close();
174
175                return parentSeq;
176        }
177
178        private String getSequence(Location cdna) {
179                DNASequence rawParent;
180                if (!cdna.isComplex()) {
181                        try {
182                                rawParent = getRawParentSequence(cdna.getAccession().getID());
183                                return cdna.getSubSequence(rawParent).getSequenceAsString();
184                        } catch (IOException e) {
185                                // return null
186                                logger.error("Caught IOException when getting DNA sequence for id {}. Error: {}", cdna.getAccession().getID(), e.getMessage());
187                                return null;
188                        }
189                } else {
190                        // in case of complex
191                        StringBuilder sb = new StringBuilder();
192
193                        for (Location sub : cdna.getSubLocations()) {
194                                String sebStr = getSequence(sub);
195                                sb.append((sebStr == null ? "" : sebStr));
196                        }
197
198                        return sb.toString();
199                }
200        }
201
202        public static void main(String[] args) throws Exception {
203                ProteinSequence proteinSequence = new ProteinSequence("ARNDCEQGHILKMFPSTWYVBZJX");
204                logger.info("Protein Sequence: {}", proteinSequence.toString());
205
206                StringProxySequenceReader<AminoAcidCompound> sequenceStringProxyLoader = new StringProxySequenceReader<AminoAcidCompound>("XRNDCEQGHILKMFPSTWYVBZJA", AminoAcidCompoundSet.getAminoAcidCompoundSet());
207                ProteinSequence proteinSequenceFromProxy = new ProteinSequence(sequenceStringProxyLoader);
208                logger.info("Protein Sequence from Proxy: {}", proteinSequenceFromProxy.toString());
209
210        }
211}