001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 * Created on DATE 021 * 022 */ 023package org.biojava.nbio.core.sequence; 024 025import org.biojava.nbio.core.exceptions.CompoundNotFoundException; 026import org.biojava.nbio.core.sequence.compound.*; 027import org.biojava.nbio.core.sequence.features.FeatureInterface; 028import org.biojava.nbio.core.sequence.io.DNASequenceCreator; 029import org.biojava.nbio.core.sequence.io.FastaReader; 030import org.biojava.nbio.core.sequence.io.PlainFastaHeaderParser; 031import org.biojava.nbio.core.sequence.location.InsdcParser; 032import org.biojava.nbio.core.sequence.location.template.Location; 033import org.biojava.nbio.core.sequence.template.AbstractSequence; 034import org.biojava.nbio.core.sequence.template.CompoundSet; 035import org.biojava.nbio.core.sequence.template.ProxySequenceReader; 036import org.slf4j.Logger; 037import org.slf4j.LoggerFactory; 038 039import java.io.IOException; 040import java.io.InputStream; 041import java.net.URL; 042import java.util.LinkedHashMap; 043import java.util.List; 044import org.biojava.nbio.core.sequence.features.Qualifier; 045 046/** 047 * The representation of a ProteinSequence 048 * 049 * @author Scooter Willis 050 * @author Paolo Pavan 051 */ 052public class ProteinSequence extends AbstractSequence<AminoAcidCompound> { 053 054 private final static Logger logger = LoggerFactory.getLogger(ProteinSequence.class); 055 056 /* 057 private ArrayList<FeatureInterface<AbstractSequence<AminoAcidCompound>, AminoAcidCompound>> features 058 = new ArrayList<FeatureInterface<AbstractSequence<AminoAcidCompound>, AminoAcidCompound>>(); 059 private LinkedHashMap<String, ArrayList<FeatureInterface<AbstractSequence<AminoAcidCompound>, AminoAcidCompound>>> groupedFeatures 060 = new LinkedHashMap<String, ArrayList<FeatureInterface<AbstractSequence<AminoAcidCompound>, AminoAcidCompound>>>(); 061 */ 062 /** 063 * Create a protein from a string 064 * 065 * @param seqString 066 * @throws CompoundNotFoundException 067 */ 068 public ProteinSequence(String seqString) throws CompoundNotFoundException { 069 this(seqString, AminoAcidCompoundSet.getAminoAcidCompoundSet()); 070 } 071 072 /** 073 * Create a protein from a string with a user defined set of amino acids 074 * 075 * @param seqString 076 * @param compoundSet 077 * @throws CompoundNotFoundException 078 */ 079 public ProteinSequence(String seqString, CompoundSet<AminoAcidCompound> compoundSet) throws CompoundNotFoundException { 080 super(seqString, compoundSet); 081 } 082 083 /** 084 * A protein sequence where the storage of the sequence is somewhere else. 085 * Could be loaded from a large Fasta file or via a Uniprot Proxy reader via 086 * Uniprot ID 087 * 088 * @param proxyLoader 089 */ 090 public ProteinSequence(ProxySequenceReader<AminoAcidCompound> proxyLoader) { 091 this(proxyLoader, AminoAcidCompoundSet.getAminoAcidCompoundSet()); 092 } 093 094 /** 095 * A protein sequence where the storage of the sequence is somewhere else 096 * with user defined set of amino acids. Could be loaded from a large Fasta 097 * file or via a Uniprot Proxy reader via Uniprot ID 098 * 099 * @param proxyLoader 100 * @param compoundSet 101 */ 102 public ProteinSequence(ProxySequenceReader<AminoAcidCompound> proxyLoader, CompoundSet<AminoAcidCompound> compoundSet) { 103 super(proxyLoader, compoundSet); 104 105 // do protein-specific tasks 106 // add source if found 107 List<FeatureInterface<AbstractSequence<AminoAcidCompound>, AminoAcidCompound>> CDSFeatures = getFeaturesByType("CDS"); 108 109 // cases if a protein has more than 1 parent are not supported yet 110 if (CDSFeatures.size() == 1) { 111 Qualifier codedBy = CDSFeatures.get(0).getQualifiers().get("coded_by").get(0); 112 113 if (codedBy != null) { 114 String codedBySeq = codedBy.getValue(); 115 116 InsdcParser parser = new InsdcParser(DataSource.GENBANK); 117 Location location = parser.parse(codedBySeq); 118 119 try { 120 DNASequence dnaSeq = new DNASequence(getSequence(location), DNACompoundSet.getDNACompoundSet()); 121 setParentDNASequence(dnaSeq, location.getStart().getPosition(), location.getEnd().getPosition()); 122 } catch (CompoundNotFoundException e) { 123 // TODO is there another solution to handle this exception? 124 logger.error("Could not add 'coded_by' parent DNA location feature, unrecognised compounds found in DNA sequence: {}", e.getMessage()); 125 } 126 } 127 } 128 129 } 130 131 /** 132 * A Protein sequence can be stand alone or loaded from a transcript 133 * sequence. The design goal is to allow the creation of a Protein sequence 134 * from a Uniprot ID or some other Protein ID that based on cross reference 135 * you should be able to get the GeneSequence that codes for the protein if 136 * the CDS/Gene region is known. From the GeneSequence you should then be 137 * able to get the ChromosomeSequence which then allows you explore flaning 138 * regions of the gene sequences. The framework is in place to do this but 139 * currently hasn't been implement in the reverse direction starting from 140 * the Protein sequence. 141 * 142 * @param parentDNASequence 143 * @param begin 144 * @param end 145 */ 146 //TODO - Someone needs to check if this is a bug. Shouldn't a parentDNASequence be something other then AminoAcid? 147 //However, due to the derivation of this class, this is the only possible type argument for this parameter... 148 public void setParentDNASequence(AbstractSequence<NucleotideCompound> parentDNASequence, Integer begin, Integer end) { 149 this.setParentSequence(parentDNASequence); 150 setBioBegin(begin); 151 setBioEnd(end); 152 } 153 154 private DNASequence getRawParentSequence(String accessId) throws IOException { 155 String seqUrlTemplate = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&id=%s&rettype=fasta&retmode=text"; 156 URL url = new URL(String.format(seqUrlTemplate, accessId)); 157 158 logger.trace("Getting parent DNA sequence from URL: {}", url.toString()); 159 160 InputStream is = url.openConnection().getInputStream(); 161 162 FastaReader<DNASequence, NucleotideCompound> parentReader 163 = new FastaReader<DNASequence, NucleotideCompound>(is, 164 new PlainFastaHeaderParser<DNASequence, NucleotideCompound>(), 165 new DNASequenceCreator(AmbiguityDNACompoundSet.getDNACompoundSet())); 166 LinkedHashMap<String, DNASequence> seq = parentReader.process(); 167 168 DNASequence parentSeq = null; 169 if (seq.size() == 1) { 170 parentSeq = seq.values().iterator().next(); 171 } 172 is.close(); 173 174 return parentSeq; 175 } 176 177 private String getSequence(Location cdna) { 178 DNASequence rawParent; 179 if (!cdna.isComplex()) { 180 try { 181 rawParent = getRawParentSequence(cdna.getAccession().getID()); 182 return cdna.getSubSequence(rawParent).getSequenceAsString(); 183 } catch (IOException e) { 184 // return null 185 logger.error("Caught IOException when getting DNA sequence for id {}. Error: {}", cdna.getAccession().getID(), e.getMessage()); 186 return null; 187 } 188 } else { 189 // in case of complex 190 StringBuilder sb = new StringBuilder(); 191 192 for (Location sub : cdna.getSubLocations()) { 193 String sebStr = getSequence(sub); 194 sb.append((sebStr == null ? "" : sebStr)); 195 } 196 197 return sb.toString(); 198 } 199 } 200 201}