Source code

001/*
002 * BioJava development code
003 *
004 * This code may be freely distributed and modified under the terms of the GNU Lesser General Public Licence. This
005 * should be distributed with the code. If you do not have a copy, see:
006 *
007 * http://www.gnu.org/copyleft/lesser.html
008 *
009 * Copyright for this code is held jointly by the individual authors. These should be listed in @author doc comments.
010 *
011 * For more information on the BioJava project and its aims, or to join the biojava-l mailing list, visit the home page
012 * at:
013 *
014 * http://www.biojava.org/
015 *
016 * Created on August 13, 2010 Author: Mark Chapman
017 */
018
019package org.biojava.nbio.alignment.io;
020
import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
import org.biojava.nbio.core.sequence.ProteinSequence;
import org.biojava.nbio.core.sequence.RNASequence;
import org.biojava.nbio.core.sequence.template.AbstractCompound;
import org.biojava.nbio.core.sequence.template.AbstractSequence;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
033
034/**
035 * Stores all the content of a Stockholm file. <i><b>N.B.: This structure will undergo several enhancements later on.
036 * Don't depend on it in a final code, otherwise it will be hard to maintain.</b></i>
037 *
038 * In general, Stockholm File contains the alignment mark-up lines.<br>
039 * <br>
040 *
041 * <Table border="1" align="center">
042 * <tr>
043 * <td><b>Header Section</b></td>
044 * </tr>
045 * <tr>
046 * <td><b>Reference Section</b></td>
047 * </tr>
048 * <tr>
049 * <td><b>Comment Section</b></td>
050 * </tr>
051 * <tr>
052 * <td><B>Alignment Section</B></td>
053 * </tr>
054 * </table>
055 *
056 * Sequence letters may include any characters except whitespace. Gaps may be indicated by "." or "-".<br>
057 * Mark-up lines may include any characters except whitespace. Use underscore ("_") instead of space.<br>
058 *
059 * <Table border="1">
060 * <th>section field</th>
061 * <th>preferred location</th>
062 * <tr>
063 * <td>#=GF &lt;feature&gt; &lt;Generic per-File annotation, free text&gt;</td>
064 * <td>Above the alignment</td>
065 * <tr>
066 * <td>#=GC &lt;feature&gt; &lt;Generic per-Column annotation, exactly 1 char per column&gt;</td>
067 * <td>Below the alignment</td>
068 * <tr>
069 * <td>#=GS &lt;seqname&gt; &lt;feature&gt; &lt;Generic per-Sequence annotation, free text&gt;</td>
070 * <td>Above the alignment or just below the corresponding sequence</td>
071 * <tr>
072 * <td>#=GR &lt;seqname&gt; &lt;feature&gt; &lt;Generic per-Residue annotation, exactly 1 char per residue&gt;</td>
073 * <td>Just below the corresponding sequence</td>
074 * </tr>
075 * </table>
076 *
077 * @since 3.0.5
078 * @author Amr AL-Hossary
079 * @author Marko Vaz
080 *
081 */
082public class StockholmStructure {
083
084        private final static Logger logger = LoggerFactory.getLogger(StockholmStructure.class);
085
086        public static final String PFAM = "PFAM";
087        public static final String RFAM = "RFAM";
088        private final StockholmFileAnnotation fileAnnotation;
089        private final StockholmConsensusAnnotation consAnnotation;
090        private final Map<String, StringBuffer> sequences;
091        private final Map<String, StockholmSequenceAnnotation> seqsAnnotation;
092        private final Map<String, StockholmResidueAnnotation> resAnnotation;
093
094        public StockholmStructure() {
095                fileAnnotation = new StockholmFileAnnotation();
096                consAnnotation = new StockholmConsensusAnnotation();
097                sequences = new HashMap<String, StringBuffer>();
098                seqsAnnotation = new HashMap<String, StockholmSequenceAnnotation>();
099                resAnnotation = new HashMap<String, StockholmResidueAnnotation>();
100        }
101
102        public StockholmFileAnnotation getFileAnnotation() {
103                return fileAnnotation;
104        }
105
106        public StockholmConsensusAnnotation getConsAnnotation() {
107                return consAnnotation;
108        }
109
110        /**
111         * Actually this function should be called appendToSequence
112         *
113         * @param seqName
114         * @param seqText
115         * @deprecated Use {@link #appendToSequence(String,String)} instead
116         */
117        @Deprecated
118        public void addSequence(String seqName, String seqText) {
119                appendToSequence(seqName, seqText);
120        }
121
122        /**
123         * @param seqName
124         * @param seqText
125         */
126        public void appendToSequence(String seqName, String seqText) {
127                StringBuffer seq = sequences.get(seqName);
128                if (seq != null) {
129                        // add sequence without space
130                        seq.append(seqText);
131                } else {
132                        seq = new StringBuffer(seqText);
133                        sequences.put(seqName, seq);
134                }
135        }
136
137        public Map<String, StringBuffer> getSequences() {
138                return sequences;
139        }
140
141        private StockholmSequenceAnnotation getSequenceAnnotation(String seqName) {
142                if (!seqsAnnotation.containsKey(seqName)) {
143                        seqsAnnotation.put(seqName, new StockholmSequenceAnnotation());
144                }
145                return seqsAnnotation.get(seqName);
146        }
147
148        /**
149         * @param seqName
150         * @param text
151         */
152        public void addGSAccessionNumber(String seqName, String text) {
153                getSequenceAnnotation(seqName).setAccessionNumber(text);
154        }
155
156        public void addGSDescription(String seqName, String text) {
157                getSequenceAnnotation(seqName).addToDescription(text);
158        }
159
160        /**
161         * @param seqName
162         * @param text
163         */
164        public void addGSdbReference(String seqName, String text) {
165                getSequenceAnnotation(seqName).addDBReference(text);
166        }
167
168        public void addGSOrganismSpecies(String seqName, String text) {
169                getSequenceAnnotation(seqName).setOrganism(text);
170        }
171
172        public void addGSOrganismClassification(String seqName, String text) {
173                getSequenceAnnotation(seqName).setOrganismClassification(text);
174        }
175
176        public void addGSLook(String seqName, String text) {
177                getSequenceAnnotation(seqName).setLook(text);
178        }
179
180        private StockholmResidueAnnotation getResidueAnnotation(String seqName) {
181                if (!resAnnotation.containsKey(seqName)) {
182                        resAnnotation.put(seqName, new StockholmResidueAnnotation());
183                }
184                return resAnnotation.get(seqName);
185        }
186
187        public void addSurfaceAccessibility(String seqName, String text) {
188                getResidueAnnotation(seqName).setSurfaceAccessibility(text);
189        }
190
191        public void addTransMembrane(String seqName, String text) {
192                getResidueAnnotation(seqName).setTransMembrane(text);
193        }
194
195        public void addPosteriorProbability(String seqName, String text) {
196                getResidueAnnotation(seqName).setPosteriorProbability(text);
197        }
198
199        public void addLigandBinding(String seqName, String text) {
200                getResidueAnnotation(seqName).setLigandBinding(text);
201        }
202
203        public void addActiveSite(String seqName, String text) {
204                getResidueAnnotation(seqName).setActiveSite(text);
205        }
206
207        public void addASPFamPredicted(String seqName, String text) {
208                getResidueAnnotation(seqName).setAsPFamPredicted(text);
209        }
210
211        public void addASSwissProt(String seqName, String text) {
212                getResidueAnnotation(seqName).setAsSwissProt(text);
213        }
214
215        public void addIntron(String seqName, String text) {
216                getResidueAnnotation(seqName).setIntron(text);
217        }
218
219        public void addSecondaryStructure(String seqName, String text) {
220                getResidueAnnotation(seqName).setSecondaryStructure(text);
221        }
222
223        /**
224         * used to retrieve sequences from the structure
225         *
226         * @return Biosequences (case sensitive)
227         * @see #getBioSequences(boolean)
228         * @see #getBioSequences(boolean, String)
229         */
230        public List<AbstractSequence<? extends AbstractCompound>> getBioSequences() {
231                return getBioSequences(false);
232        }
233
234        /**
235         * This function tolerates mixed case letters, and allows for forcing the output biosequence type (PFAM/RFAM).
236         *
237         * @param ignoreCase
238         *            if <code>true</code>, the function will deal with small letters as if they are capital ones
239         * @param forcedSequenceType
240         *            either <code>null</code>, {@link #PFAM}, or {@link #RFAM}.
241         * @return Biosequences according to the criteria specified
242         * @see #getBioSequences()
243         * @see #getBioSequences(boolean)
244         */
245        public List<AbstractSequence<? extends AbstractCompound>> getBioSequences(boolean ignoreCase,
246                        String forcedSequenceType) {
247                if (forcedSequenceType != null && !(forcedSequenceType.equals(PFAM) | forcedSequenceType.equals(RFAM))) {
248                        throw new IllegalArgumentException("Illegal Argument " + forcedSequenceType);
249                }
250                List<AbstractSequence<? extends AbstractCompound>> seqs = new ArrayList<AbstractSequence<? extends AbstractCompound>>();
251                for (String sequencename : sequences.keySet()) {
252                        AbstractSequence<? extends AbstractCompound> seq = null;
253                        String sequence = sequences.get(sequencename).toString();
254                        if (ignoreCase) {
255                                sequence = sequence.toUpperCase();
256                        }
257
258                        try {
259                        if (forcedSequenceType == null)
260                                seq = fileAnnotation.isPFam() ? new ProteinSequence(sequence) : new RNASequence(sequence);
261                        else if (forcedSequenceType.equals(PFAM))
262                                seq = new ProteinSequence(sequence);
263                        else
264                                seq = new RNASequence(sequence);
265                        } catch (CompoundNotFoundException e) {
266                                logger.warn("Could not create sequence because of unknown compounds ({}). Sequence {} will be ignored.",e.getMessage(),sequencename);
267                                continue;
268                        }
269                        String[] seqDetails = splitSeqName(sequencename);
270                        seq.setDescription(seqDetails[0]);
271                        seq.setBioBegin((seqDetails[1] == null || seqDetails[1].trim().equals("") ? null : new Integer(
272                                        seqDetails[1])));
273                        seq.setBioEnd((seqDetails[2] == null || seqDetails[2].trim().equals("") ? null : new Integer(seqDetails[2])));
274
275                        seqs.add(seq);
276                }
277                return seqs;
278        }
279
280        /**
281         * Because some database files have incorrectly small letters (e.g. Pfam23 structure PF00389.22 sequence
282         * TKRA_BACSU/6-322), this function is used to ignore the small letters case.
283         *
284         * @param ignoreCase
285         * @return
286         * @see #getBioSequences()
287         * @see #getBioSequences(boolean, String)
288         */
289        public List<AbstractSequence<? extends AbstractCompound>> getBioSequences(boolean ignoreCase) {
290                return getBioSequences(ignoreCase, null);
291        }
292
293        /**
294         * Returns an array with the following sequence related content: name, start, end.
295         *
296         * @param sequenceName
297         *            the sequence from where to extract the content. It is supposed that it follows the following
298         *            convention name/start-end (e.g.: COATB_BPIKE/30-81)
299         * @return array with the following sequence related content: name, start, end.
300         */
301        private String[] splitSeqName(String sequenceName) {
302                String[] result = new String[3];
303
304                String[] barSplit = sequenceName.toString().split("/");
305                if (barSplit.length == 2) {
306                        result[0] = barSplit[0];
307                        String[] positions = barSplit[1].split("-");
308                        if (positions.length == 2) {
309                                result[1] = positions[0];
310                                result[2] = positions[1];
311                        }
312                } else {
313                        result[0] = sequenceName;
314                        result[1] = null;
315                        result[2] = null;
316                }
317
318                return result;
319        }
320
321        @Override
322        public String toString() {
323                StringBuffer result = new StringBuffer();
324                List<AbstractSequence<? extends AbstractCompound>> bioSeqs = getBioSequences(false);
325                int sequenceLength = -1;
326                for (AbstractSequence<? extends AbstractCompound> sequence : bioSeqs) {
327                        String sequenceAsString = sequence.getSequenceAsString();
328                        sequenceLength = sequenceAsString.length();
329                        if (sequenceLength > 50) {
330                                result.append(sequenceAsString.substring(0, 40));
331                                result.append("...");
332                                result.append(sequenceAsString.substring(sequenceLength - 3, sequenceLength));
333                        } else {
334                                result.append(sequenceAsString);
335                        }
336                        result.append(" " + sequence.getDescription() + "\n");
337                }
338                result.append("Alignment with " + bioSeqs.size() + " rows and " + sequenceLength + " columns");
339
340                return result.toString();
341        }
342
343        public static class DatabaseReference {
344                public static final String EXPERT = "EXPERT";
345                public static final String MIM = "MIM";
346                public static final String PFAMB = "PFAMB";
347                public static final String PRINTS = "PRINTS";
348                public static final String PROSITE = "PROSITE";
349                public static final String PROSITE_PROFILE = "PROSITE_PROFILE";
350                public static final String SCOP = "SCOP";
351                public static final String PDB = "PDB";
352                public static final String SMART = "SMART";
353                public static final String URL = "URL";
354                public static final String LOAD = "LOAD";
355                public static final String HOMSTRAD = "HOMSTRAD";
356                public static final String INTERPRO = "INTERPRO";
357
358                private final String database;
359                /** TODO this field should be subdivided into smaller fields if the database is SCOP or PDB. */
360                private final String reference;
361
362                public DatabaseReference(String database, String reference) {
363                        this.database = database;
364                        this.reference = reference;
365                }
366
367                public DatabaseReference(String representativeAnnotationString) {
368                        int semiColonIndex = representativeAnnotationString.indexOf(';');
369                        this.database = representativeAnnotationString.substring(0, semiColonIndex);
370                        this.reference = representativeAnnotationString.substring(semiColonIndex + 1,
371                                        representativeAnnotationString.lastIndexOf(';')).trim();
372                }
373
374                @Override
375                public String toString() {
376                        return new StringBuilder(this.database).append(';').append(' ').append(this.reference).append(';')
377                                        .toString();
378                }
379
380                public String getDatabase() {
381                        return database;
382                }
383
384                public String getReference() {
385                        return reference;
386                }
387        }
388}