Source code

001/*
002 * BioJava development code
003 *
004 * This code may be freely distributed and modified under the terms of the GNU Lesser General Public Licence. This
005 * should be distributed with the code. If you do not have a copy, see:
006 *
007 * http://www.gnu.org/copyleft/lesser.html
008 *
009 * Copyright for this code is held jointly by the individual authors. These should be listed in @author doc comments.
010 *
011 * For more information on the BioJava project and its aims, or to join the biojava-l mailing list, visit the home page
012 * at:
013 *
014 * http://www.biojava.org/
015 *
016 * Created on August 13, 2010 Author: Mark Chapman
017 */
018
019package org.biojava.nbio.alignment.io;
020
import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
import org.biojava.nbio.core.sequence.ProteinSequence;
import org.biojava.nbio.core.sequence.RNASequence;
import org.biojava.nbio.core.sequence.template.AbstractCompound;
import org.biojava.nbio.core.sequence.template.AbstractSequence;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
033
034/**
035 * Stores all the content of a Stockholm file. <i><b>N.B.: This structure will undergo several enhancements later on.
036 * Don't depend on it in a final code, otherwise it will be hard to maintain.</b></i>
037 *
038 * In general, Stockholm File contains the alignment mark-up lines.<br>
039 * <br>
040 *
041 * <Table border="1" align="center">
042 * <tr>
043 * <td><b>Header Section</b></td>
044 * </tr>
045 * <tr>
046 * <td><b>Reference Section</b></td>
047 * </tr>
048 * <tr>
049 * <td><b>Comment Section</b></td>
050 * </tr>
051 * <tr>
052 * <td><B>Alignment Section</B></td>
053 * </tr>
054 * </table>
055 *
056 * Sequence letters may include any characters except whitespace. Gaps may be indicated by "." or "-".<br>
057 * Mark-up lines may include any characters except whitespace. Use underscore ("_") instead of space.<br>
058 *
059 * <Table border="1">
060 * <th>section field</th>
061 * <th>preferred location</th>
062 * <tr>
063 * <td>#=GF &lt;feature&gt; &lt;Generic per-File annotation, free text&gt;</td>
064 * <td>Above the alignment</td>
065 * <tr>
066 * <td>#=GC &lt;feature&gt; &lt;Generic per-Column annotation, exactly 1 char per column&gt;</td>
067 * <td>Below the alignment</td>
068 * <tr>
069 * <td>#=GS &lt;seqname&gt; &lt;feature&gt; &lt;Generic per-Sequence annotation, free text&gt;</td>
070 * <td>Above the alignment or just below the corresponding sequence</td>
071 * <tr>
072 * <td>#=GR &lt;seqname&gt; &lt;feature&gt; &lt;Generic per-Residue annotation, exactly 1 char per residue&gt;</td>
073 * <td>Just below the corresponding sequence</td>
074 * </tr>
075 * </table>
076 *
077 * @since 3.0.5
078 * @author Amr ALHOSSARY
079 * @author Marko Vaz
080 *
081 */
082public class StockholmStructure {
083
084        private final static Logger logger = LoggerFactory.getLogger(StockholmStructure.class);
085
086        public static final String PFAM = "PFAM";
087        public static final String RFAM = "RFAM";
088        private final StockholmFileAnnotation fileAnnotation;
089        private final StockholmConsensusAnnotation consAnnotation;
090        private final Map<String, StringBuffer> sequences;
091        private final Map<String, StockholmSequenceAnnotation> seqsAnnotation;
092        private final Map<String, StockholmResidueAnnotation> resAnnotation;
093
094        public StockholmStructure() {
095                fileAnnotation = new StockholmFileAnnotation();
096                consAnnotation = new StockholmConsensusAnnotation();
097                sequences = new HashMap<String, StringBuffer>();
098                seqsAnnotation = new HashMap<String, StockholmSequenceAnnotation>();
099                resAnnotation = new HashMap<String, StockholmResidueAnnotation>();
100        }
101
102        public StockholmFileAnnotation getFileAnnotation() {
103                return fileAnnotation;
104        }
105
106        public StockholmConsensusAnnotation getConsAnnotation() {
107                return consAnnotation;
108        }
109
110        /**
111         * @param seqName
112         * @param seqText
113         */
114        public void appendToSequence(String seqName, String seqText) {
115                StringBuffer seq = sequences.get(seqName);
116                if (seq != null) {
117                        // add sequence without space
118                        seq.append(seqText);
119                } else {
120                        seq = new StringBuffer(seqText);
121                        sequences.put(seqName, seq);
122                }
123        }
124
125        public Map<String, StringBuffer> getSequences() {
126                return sequences;
127        }
128
129        private StockholmSequenceAnnotation getSequenceAnnotation(String seqName) {
130                if (!seqsAnnotation.containsKey(seqName)) {
131                        seqsAnnotation.put(seqName, new StockholmSequenceAnnotation());
132                }
133                return seqsAnnotation.get(seqName);
134        }
135
136        /**
137         * @param seqName
138         * @param text
139         */
140        public void addGSAccessionNumber(String seqName, String text) {
141                getSequenceAnnotation(seqName).setAccessionNumber(text);
142        }
143
144        public void addGSDescription(String seqName, String text) {
145                getSequenceAnnotation(seqName).addToDescription(text);
146        }
147
148        /**
149         * @param seqName
150         * @param text
151         */
152        public void addGSdbReference(String seqName, String text) {
153                getSequenceAnnotation(seqName).addDBReference(text);
154        }
155
156        public void addGSOrganismSpecies(String seqName, String text) {
157                getSequenceAnnotation(seqName).setOrganism(text);
158        }
159
160        public void addGSOrganismClassification(String seqName, String text) {
161                getSequenceAnnotation(seqName).setOrganismClassification(text);
162        }
163
164        public void addGSLook(String seqName, String text) {
165                getSequenceAnnotation(seqName).setLook(text);
166        }
167
168        private StockholmResidueAnnotation getResidueAnnotation(String seqName) {
169                if (!resAnnotation.containsKey(seqName)) {
170                        resAnnotation.put(seqName, new StockholmResidueAnnotation());
171                }
172                return resAnnotation.get(seqName);
173        }
174
175        public void addSurfaceAccessibility(String seqName, String text) {
176                getResidueAnnotation(seqName).setSurfaceAccessibility(text);
177        }
178
179        public void addTransMembrane(String seqName, String text) {
180                getResidueAnnotation(seqName).setTransMembrane(text);
181        }
182
183        public void addPosteriorProbability(String seqName, String text) {
184                getResidueAnnotation(seqName).setPosteriorProbability(text);
185        }
186
187        public void addLigandBinding(String seqName, String text) {
188                getResidueAnnotation(seqName).setLigandBinding(text);
189        }
190
191        public void addActiveSite(String seqName, String text) {
192                getResidueAnnotation(seqName).setActiveSite(text);
193        }
194
195        public void addASPFamPredicted(String seqName, String text) {
196                getResidueAnnotation(seqName).setAsPFamPredicted(text);
197        }
198
199        public void addASSwissProt(String seqName, String text) {
200                getResidueAnnotation(seqName).setAsSwissProt(text);
201        }
202
203        public void addIntron(String seqName, String text) {
204                getResidueAnnotation(seqName).setIntron(text);
205        }
206
207        public void addSecondaryStructure(String seqName, String text) {
208                getResidueAnnotation(seqName).setSecondaryStructure(text);
209        }
210
211        /**
212         * used to retrieve sequences from the structure
213         *
214         * @return Biosequences (case sensitive)
215         * @see #getBioSequences(boolean)
216         * @see #getBioSequences(boolean, String)
217         */
218        public List<AbstractSequence<? extends AbstractCompound>> getBioSequences() {
219                return getBioSequences(false);
220        }
221
222        /**
223         * This function tolerates mixed case letters, and allows for forcing the output biosequence type (PFAM/RFAM).
224         *
225         * @param ignoreCase
226         *            if <code>true</code>, the function will deal with small letters as if they are capital ones
227         * @param forcedSequenceType
228         *            either <code>null</code>, {@link #PFAM}, or {@link #RFAM}.
229         * @return Biosequences according to the criteria specified
230         * @see #getBioSequences()
231         * @see #getBioSequences(boolean)
232         */
233        public List<AbstractSequence<? extends AbstractCompound>> getBioSequences(boolean ignoreCase,
234                        String forcedSequenceType) {
235                if (forcedSequenceType != null && !(forcedSequenceType.equals(PFAM) || forcedSequenceType.equals(RFAM))) {
236                        throw new IllegalArgumentException("Illegal Argument " + forcedSequenceType);
237                }
238                List<AbstractSequence<? extends AbstractCompound>> seqs = new ArrayList<AbstractSequence<? extends AbstractCompound>>();
239                for (String sequencename : sequences.keySet()) {
240                        AbstractSequence<? extends AbstractCompound> seq = null;
241                        String sequence = sequences.get(sequencename).toString();
242                        if (ignoreCase) {
243                                sequence = sequence.toUpperCase();
244                        }
245
246                        try {
247                        if (forcedSequenceType == null)
248                                seq = fileAnnotation.isPFam() ? new ProteinSequence(sequence) : new RNASequence(sequence);
249                        else if (forcedSequenceType.equals(PFAM))
250                                seq = new ProteinSequence(sequence);
251                        else
252                                seq = new RNASequence(sequence);
253                        } catch (CompoundNotFoundException e) {
254                                logger.warn("Could not create sequence because of unknown compounds ({}). Sequence {} will be ignored.",e.getMessage(),sequencename);
255                                continue;
256                        }
257                        String[] seqDetails = splitSeqName(sequencename);
258                        seq.setDescription(seqDetails[0]);
259                        seq.setBioBegin((seqDetails[1] == null || seqDetails[1].trim().equals("") ? null : new Integer(
260                                        seqDetails[1])));
261                        seq.setBioEnd((seqDetails[2] == null || seqDetails[2].trim().equals("") ? null : new Integer(seqDetails[2])));
262
263                        seqs.add(seq);
264                }
265                return seqs;
266        }
267
268        /**
269         * Because some database files have incorrectly small letters (e.g. Pfam23 structure PF00389.22 sequence
270         * TKRA_BACSU/6-322), this function is used to ignore the small letters case.
271         *
272         * @param ignoreCase
273         * @return
274         * @see #getBioSequences()
275         * @see #getBioSequences(boolean, String)
276         */
277        public List<AbstractSequence<? extends AbstractCompound>> getBioSequences(boolean ignoreCase) {
278                return getBioSequences(ignoreCase, null);
279        }
280
281        /**
282         * Returns an array with the following sequence related content: name, start, end.
283         *
284         * @param sequenceName
285         *            the sequence from where to extract the content. It is supposed that it follows the following
286         *            convention name/start-end (e.g.: COATB_BPIKE/30-81)
287         * @return array with the following sequence related content: name, start, end.
288         */
289        private String[] splitSeqName(String sequenceName) {
290                String[] result = new String[3];
291
292                String[] barSplit = sequenceName.split("/");
293                if (barSplit.length == 2) {
294                        result[0] = barSplit[0];
295                        String[] positions = barSplit[1].split("-");
296                        if (positions.length == 2) {
297                                result[1] = positions[0];
298                                result[2] = positions[1];
299                        }
300                } else {
301                        result[0] = sequenceName;
302                        result[1] = null;
303                        result[2] = null;
304                }
305
306                return result;
307        }
308
309        @Override
310        public String toString() {
311                StringBuffer result = new StringBuffer();
312                List<AbstractSequence<? extends AbstractCompound>> bioSeqs = getBioSequences(false);
313                int sequenceLength = -1;
314                for (AbstractSequence<? extends AbstractCompound> sequence : bioSeqs) {
315                        String sequenceAsString = sequence.getSequenceAsString();
316                        sequenceLength = sequenceAsString.length();
317                        if (sequenceLength > 50) {
318                                result.append(sequenceAsString.substring(0, 40));
319                                result.append("...");
320                                result.append(sequenceAsString.substring(sequenceLength - 3, sequenceLength));
321                        } else {
322                                result.append(sequenceAsString);
323                        }
324                        result.append(" " + sequence.getDescription() + "\n");
325                }
326                result.append("Alignment with " + bioSeqs.size() + " rows and " + sequenceLength + " columns");
327
328                return result.toString();
329        }
330
331        public static class DatabaseReference {
332                public static final String EXPERT = "EXPERT";
333                public static final String MIM = "MIM";
334                public static final String PFAMB = "PFAMB";
335                public static final String PRINTS = "PRINTS";
336                public static final String PROSITE = "PROSITE";
337                public static final String PROSITE_PROFILE = "PROSITE_PROFILE";
338                public static final String SCOP = "SCOP";
339                public static final String PDB = "PDB";
340                public static final String SMART = "SMART";
341                public static final String URL = "URL";
342                public static final String LOAD = "LOAD";
343                public static final String HOMSTRAD = "HOMSTRAD";
344                public static final String INTERPRO = "INTERPRO";
345
346                private final String database;
347                /** TODO this field should be subdivided into smaller fields if the database is SCOP or PDB. */
348                private final String reference;
349
350                public DatabaseReference(String database, String reference) {
351                        this.database = database;
352                        this.reference = reference;
353                }
354
355                public DatabaseReference(String representativeAnnotationString) {
356                        int semiColonIndex = representativeAnnotationString.indexOf(';');
357                        this.database = representativeAnnotationString.substring(0, semiColonIndex);
358                        this.reference = representativeAnnotationString.substring(semiColonIndex + 1,
359                                        representativeAnnotationString.lastIndexOf(';')).trim();
360                }
361
362                @Override
363                public String toString() {
364                        return new StringBuilder(this.database).append(';').append(' ').append(this.reference).append(';')
365                                        .toString();
366                }
367
368                public String getDatabase() {
369                        return database;
370                }
371
372                public String getReference() {
373                        return reference;
374                }
375        }
376}