/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 * Created on DATE
 *
 */
package org.biojava.nbio.core.sequence;

import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
import org.biojava.nbio.core.sequence.compound.*;
import org.biojava.nbio.core.sequence.features.FeatureInterface;
import org.biojava.nbio.core.sequence.features.Qualifier;
import org.biojava.nbio.core.sequence.io.DNASequenceCreator;
import org.biojava.nbio.core.sequence.io.FastaReader;
import org.biojava.nbio.core.sequence.io.PlainFastaHeaderParser;
import org.biojava.nbio.core.sequence.location.InsdcParser;
import org.biojava.nbio.core.sequence.location.template.Location;
import org.biojava.nbio.core.sequence.template.AbstractSequence;
import org.biojava.nbio.core.sequence.template.CompoundSet;
import org.biojava.nbio.core.sequence.template.ProxySequenceReader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

/**
 * The representation of a ProteinSequence
 *
 * @author Scooter Willis
 * @author Paolo Pavan
 */
public class ProteinSequence extends AbstractSequence<AminoAcidCompound> {

    private static final Logger logger = LoggerFactory.getLogger(ProteinSequence.class);

    /** Feature type inspected to find the DNA that codes for this protein. */
    private static final String CDS_FEATURE_TYPE = "CDS";

    /** Qualifier of a CDS feature whose value is the location of the coding DNA. */
    private static final String CODED_BY_QUALIFIER = "coded_by";

    /** NCBI efetch URL template; the single {@code %s} is replaced by the accession id. */
    private static final String PARENT_SEQUENCE_URL_TEMPLATE =
            "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&id=%s&rettype=fasta&retmode=text";

    /**
     * Create a protein from a string, using the default amino acid compound set.
     *
     * @param seqString the sequence as a string
     * @throws CompoundNotFoundException propagated from the delegated constructor
     */
    public ProteinSequence(String seqString) throws CompoundNotFoundException {
        this(seqString, AminoAcidCompoundSet.getAminoAcidCompoundSet());
    }

    /**
     * Create a protein from a string with a user defined set of amino acids
     *
     * @param seqString the sequence as a string
     * @param compoundSet the amino acid compound set to use
     * @throws CompoundNotFoundException propagated from the superclass constructor
     */
    public ProteinSequence(String seqString, CompoundSet<AminoAcidCompound> compoundSet) throws CompoundNotFoundException {
        super(seqString, compoundSet);
    }

    /**
     * A protein sequence where the storage of the sequence is somewhere else.
     * Could be loaded from a large Fasta file or via a Uniprot Proxy reader via
     * Uniprot ID. Uses the default amino acid compound set.
     *
     * @param proxyLoader the reader that provides the sequence
     */
    public ProteinSequence(ProxySequenceReader<AminoAcidCompound> proxyLoader) {
        this(proxyLoader, AminoAcidCompoundSet.getAminoAcidCompoundSet());
    }

    /**
     * A protein sequence where the storage of the sequence is somewhere else
     * with user defined set of amino acids. Could be loaded from a large Fasta
     * file or via a Uniprot Proxy reader via Uniprot ID.
     * <p>
     * After delegating to the superclass, this constructor looks at the "CDS"
     * features of the sequence. When there is exactly one and it carries a
     * "coded_by" qualifier, the qualifier value is parsed as a GenBank location,
     * the referenced DNA is downloaded (this performs network access) and set
     * as the parent DNA sequence. Any failure along that path is logged and
     * leaves the protein without a parent sequence.
     *
     * @param proxyLoader the reader that provides the sequence
     * @param compoundSet the amino acid compound set to use
     */
    public ProteinSequence(ProxySequenceReader<AminoAcidCompound> proxyLoader, CompoundSet<AminoAcidCompound> compoundSet) {
        super(proxyLoader, compoundSet);

        // do protein-specific tasks: add source if found
        linkParentDNAFromCodedBy();
    }

    /**
     * Sets the parent DNA sequence from the "coded_by" qualifier of the single
     * CDS feature, if there is one. Does nothing when the information is missing.
     */
    private void linkParentDNAFromCodedBy() {
        List<FeatureInterface<AbstractSequence<AminoAcidCompound>, AminoAcidCompound>> cdsFeatures
                = getFeaturesByType(CDS_FEATURE_TYPE);

        // cases where a protein has more than 1 parent are not supported yet
        if (cdsFeatures.size() != 1) {
            return;
        }

        // A CDS feature is not guaranteed to carry a "coded_by" qualifier, so
        // both a missing entry and an empty list must be tolerated.
        List<Qualifier> codedByQualifiers = cdsFeatures.get(0).getQualifiers().get(CODED_BY_QUALIFIER);
        if (codedByQualifiers == null || codedByQualifiers.isEmpty()) {
            return;
        }
        Qualifier codedBy = codedByQualifiers.get(0);
        if (codedBy == null) {
            return;
        }

        InsdcParser parser = new InsdcParser(DataSource.GENBANK);
        Location location = parser.parse(codedBy.getValue());

        String parentDna = getSequence(location);
        if (parentDna == null) {
            // The reason has already been logged by getSequence.
            return;
        }

        try {
            DNASequence dnaSeq = new DNASequence(parentDna, DNACompoundSet.getDNACompoundSet());
            setParentDNASequence(dnaSeq, location.getStart().getPosition(), location.getEnd().getPosition());
        } catch (CompoundNotFoundException e) {
            // TODO is there another solution to handle this exception?
            logger.error("Could not add 'coded_by' parent DNA location feature, unrecognised compounds found in DNA sequence: {}", e.getMessage(), e);
        }
    }

    /**
     * A Protein sequence can be stand alone or loaded from a transcript
     * sequence. The design goal is to allow the creation of a Protein sequence
     * from a Uniprot ID or some other Protein ID that based on cross reference
     * you should be able to get the GeneSequence that codes for the protein if
     * the CDS/Gene region is known. From the GeneSequence you should then be
     * able to get the ChromosomeSequence which then allows you explore flanking
     * regions of the gene sequences. The framework is in place to do this but
     * currently hasn't been implemented in the reverse direction starting from
     * the Protein sequence.
     * <p>
     * Sets the given sequence as the parent sequence and stores {@code begin}
     * and {@code end} as the bio begin and bio end of this protein.
     *
     * @param parentDNASequence the nucleotide sequence to register as parent
     * @param begin value passed to {@code setBioBegin}
     * @param end value passed to {@code setBioEnd}
     */
    public void setParentDNASequence(AbstractSequence<NucleotideCompound> parentDNASequence, Integer begin, Integer end) {
        this.setParentSequence(parentDNASequence);
        setBioBegin(begin);
        setBioEnd(end);
    }

    /**
     * Downloads the nucleotide record with the given id from NCBI as FASTA and
     * parses it.
     *
     * @param accessId the id inserted into the efetch URL
     * @return the parsed sequence, or {@code null} when the response did not
     *         contain exactly one sequence
     * @throws IOException if opening the connection or reading the response fails
     */
    private DNASequence getRawParentSequence(String accessId) throws IOException {
        URL url = new URL(String.format(PARENT_SEQUENCE_URL_TEMPLATE, accessId));

        logger.trace("Getting parent DNA sequence from URL: {}", url);

        // try-with-resources: the stream is released even when parsing fails
        try (InputStream is = url.openConnection().getInputStream()) {
            FastaReader<DNASequence, NucleotideCompound> parentReader
                    = new FastaReader<>(is,
                            new PlainFastaHeaderParser<DNASequence, NucleotideCompound>(),
                            new DNASequenceCreator(AmbiguityDNACompoundSet.getDNACompoundSet()));
            Map<String, DNASequence> seq = parentReader.process();

            if (seq != null && seq.size() == 1) {
                return seq.values().iterator().next();
            }
            return null;
        }
    }

    /**
     * Resolves a location to the DNA string it denotes.
     * <p>
     * For a complex location the strings of all sub-locations are concatenated
     * in iteration order; sub-locations that cannot be resolved contribute
     * nothing. For a simple location the parent record is downloaded and the
     * sub-sequence described by the location is returned.
     *
     * @param cdna the location to resolve
     * @return the DNA string, or {@code null} when a simple location could not
     *         be resolved (the cause is logged)
     */
    private String getSequence(Location cdna) {
        if (cdna.isComplex()) {
            StringBuilder sb = new StringBuilder();
            for (Location sub : cdna.getSubLocations()) {
                String subStr = getSequence(sub);
                if (subStr != null) {
                    sb.append(subStr);
                }
            }
            return sb.toString();
        }

        if (cdna.getAccession() == null) {
            logger.error("Could not get DNA sequence: location {} has no accession", cdna);
            return null;
        }
        String accessionId = cdna.getAccession().getID();

        try {
            DNASequence rawParent = getRawParentSequence(accessionId);
            if (rawParent == null) {
                logger.error("Could not get DNA sequence for id {}: expected exactly one sequence in the response", accessionId);
                return null;
            }
            return cdna.getSubSequence(rawParent).getSequenceAsString();
        } catch (IOException e) {
            logger.error("Caught IOException when getting DNA sequence for id {}. Error: {}", accessionId, e.getMessage(), e);
            return null;
        }
    }

}