Source code

001/*
002 * BioJava development code
003 *
004 * This code may be freely distributed and modified under the terms of the GNU Lesser General Public Licence. This
005 * should be distributed with the code. If you do not have a copy, see:
006 *
007 * http://www.gnu.org/copyleft/lesser.html
008 *
009 * Copyright for this code is held jointly by the individual authors. These should be listed in @author doc comments.
010 *
011 * For more information on the BioJava project and its aims, or to join the biojava-l mailing list, visit the home page
012 * at:
013 *
014 * http://www.biojava.org/
015 *
016 * Created on August 13, 2010 Author: Mark Chapman
017 */
018
019package org.biojava.nbio.alignment.io;
020
import org.biojava.nbio.core.exceptions.CompoundNotFoundException;
import org.biojava.nbio.core.sequence.ProteinSequence;
import org.biojava.nbio.core.sequence.RNASequence;
import org.biojava.nbio.core.sequence.template.AbstractCompound;
import org.biojava.nbio.core.sequence.template.AbstractSequence;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
033
034/**
035 * Stores all the content of a Stockholm file. <i><b>N.B.: This structure will undergo several enhancements later on.
036 * Don't depend on it in a final code, otherwise it will be hard to maintain.</b></i>
037 *
038 * In general, Stockholm File contains the alignment mark-up lines.<br>
039 * <br>
040 *
041 * <table>
042 * <caption></caption>
043 * <tr>
044 * <td><b>Header Section</b></td>
045 * </tr>
046 * <tr>
047 * <td><b>Reference Section</b></td>
048 * </tr>
049 * <tr>
050 * <td><b>Comment Section</b></td>
051 * </tr>
052 * <tr>
053 * <td><B>Alignment Section</B></td>
054 * </tr>
055 * </table>
056 *
057 * Sequence letters may include any characters except whitespace. Gaps may be indicated by "." or "-".<br>
058 * Mark-up lines may include any characters except whitespace. Use underscore ("_") instead of space.<br>
059 *
060 * <table>
061 * <caption></caption>
062 * <tr>
063 * <th>section field</th>
064 * <th>preferred location</th>
065 * </tr>
066 * <tr>
067 * <td>#=GF &lt;feature&gt; &lt;Generic per-File annotation, free text&gt;</td>
068 * <td>Above the alignment</td>
069 * </tr>
070 * <tr>
071 * <td>#=GC &lt;feature&gt; &lt;Generic per-Column annotation, exactly 1 char per column&gt;</td>
072 * <td>Below the alignment</td>
073 * </tr>
074 * <tr>
075 * <td>#=GS &lt;seqname&gt; &lt;feature&gt; &lt;Generic per-Sequence annotation, free text&gt;</td>
076 * <td>Above the alignment or just below the corresponding sequence</td>
077 * </tr>
078 * <tr>
079 * <td>#=GR &lt;seqname&gt; &lt;feature&gt; &lt;Generic per-Residue annotation, exactly 1 char per residue&gt;</td>
080 * <td>Just below the corresponding sequence</td>
081 * </tr>
082 * </table>
083 *
084 * @since 3.0.5
085 * @author Amr ALHOSSARY
086 * @author Marko Vaz
087 *
088 */
089public class StockholmStructure {
090
091        private final static Logger logger = LoggerFactory.getLogger(StockholmStructure.class);
092
093        public static final String PFAM = "PFAM";
094        public static final String RFAM = "RFAM";
095        private final StockholmFileAnnotation fileAnnotation;
096        private final StockholmConsensusAnnotation consAnnotation;
097        private final Map<String, StringBuffer> sequences;
098        private final Map<String, StockholmSequenceAnnotation> seqsAnnotation;
099        private final Map<String, StockholmResidueAnnotation> resAnnotation;
100
101        public StockholmStructure() {
102                fileAnnotation = new StockholmFileAnnotation();
103                consAnnotation = new StockholmConsensusAnnotation();
104                sequences = new HashMap<>();
105                seqsAnnotation = new HashMap<>();
106                resAnnotation = new HashMap<>();
107        }
108
109        public StockholmFileAnnotation getFileAnnotation() {
110                return fileAnnotation;
111        }
112
113        public StockholmConsensusAnnotation getConsAnnotation() {
114                return consAnnotation;
115        }
116
117        /**
118         * @param seqName
119         * @param seqText
120         */
121        public void appendToSequence(String seqName, String seqText) {
122                StringBuffer seq = sequences.get(seqName);
123                if (seq != null) {
124                        // add sequence without space
125                        seq.append(seqText);
126                } else {
127                        seq = new StringBuffer(seqText);
128                        sequences.put(seqName, seq);
129                }
130        }
131
132        public Map<String, StringBuffer> getSequences() {
133                return sequences;
134        }
135
136        private StockholmSequenceAnnotation getSequenceAnnotation(String seqName) {
137                if (!seqsAnnotation.containsKey(seqName)) {
138                        seqsAnnotation.put(seqName, new StockholmSequenceAnnotation());
139                }
140                return seqsAnnotation.get(seqName);
141        }
142
143        /**
144         * @param seqName
145         * @param text
146         */
147        public void addGSAccessionNumber(String seqName, String text) {
148                getSequenceAnnotation(seqName).setAccessionNumber(text);
149        }
150
151        public void addGSDescription(String seqName, String text) {
152                getSequenceAnnotation(seqName).addToDescription(text);
153        }
154
155        /**
156         * @param seqName
157         * @param text
158         */
159        public void addGSdbReference(String seqName, String text) {
160                getSequenceAnnotation(seqName).addDBReference(text);
161        }
162
163        public void addGSOrganismSpecies(String seqName, String text) {
164                getSequenceAnnotation(seqName).setOrganism(text);
165        }
166
167        public void addGSOrganismClassification(String seqName, String text) {
168                getSequenceAnnotation(seqName).setOrganismClassification(text);
169        }
170
171        public void addGSLook(String seqName, String text) {
172                getSequenceAnnotation(seqName).setLook(text);
173        }
174
175        private StockholmResidueAnnotation getResidueAnnotation(String seqName) {
176                if (!resAnnotation.containsKey(seqName)) {
177                        resAnnotation.put(seqName, new StockholmResidueAnnotation());
178                }
179                return resAnnotation.get(seqName);
180        }
181
182        public void addSurfaceAccessibility(String seqName, String text) {
183                getResidueAnnotation(seqName).setSurfaceAccessibility(text);
184        }
185
186        public void addTransMembrane(String seqName, String text) {
187                getResidueAnnotation(seqName).setTransMembrane(text);
188        }
189
190        public void addPosteriorProbability(String seqName, String text) {
191                getResidueAnnotation(seqName).setPosteriorProbability(text);
192        }
193
194        public void addLigandBinding(String seqName, String text) {
195                getResidueAnnotation(seqName).setLigandBinding(text);
196        }
197
198        public void addActiveSite(String seqName, String text) {
199                getResidueAnnotation(seqName).setActiveSite(text);
200        }
201
202        public void addASPFamPredicted(String seqName, String text) {
203                getResidueAnnotation(seqName).setAsPFamPredicted(text);
204        }
205
206        public void addASSwissProt(String seqName, String text) {
207                getResidueAnnotation(seqName).setAsSwissProt(text);
208        }
209
210        public void addIntron(String seqName, String text) {
211                getResidueAnnotation(seqName).setIntron(text);
212        }
213
214        public void addSecondaryStructure(String seqName, String text) {
215                getResidueAnnotation(seqName).setSecondaryStructure(text);
216        }
217
218        /**
219         * used to retrieve sequences from the structure
220         *
221         * @return Biosequences (case sensitive)
222         * @see #getBioSequences(boolean)
223         * @see #getBioSequences(boolean, String)
224         */
225        public List<AbstractSequence<? extends AbstractCompound>> getBioSequences() {
226                return getBioSequences(false);
227        }
228
229        /**
230         * This function tolerates mixed case letters, and allows for forcing the output biosequence type (PFAM/RFAM).
231         *
232         * @param ignoreCase
233         *            if <code>true</code>, the function will deal with small letters as if they are capital ones
234         * @param forcedSequenceType
235         *            either <code>null</code>, {@link #PFAM}, or {@link #RFAM}.
236         * @return Biosequences according to the criteria specified
237         * @see #getBioSequences()
238         * @see #getBioSequences(boolean)
239         */
240        public List<AbstractSequence<? extends AbstractCompound>> getBioSequences(boolean ignoreCase,
241                        String forcedSequenceType) {
242                if (forcedSequenceType != null && !(forcedSequenceType.equals(PFAM) || forcedSequenceType.equals(RFAM))) {
243                        throw new IllegalArgumentException("Illegal Argument " + forcedSequenceType);
244                }
245                List<AbstractSequence<? extends AbstractCompound>> seqs = new ArrayList<>();
246                for (String sequencename : sequences.keySet()) {
247                        AbstractSequence<? extends AbstractCompound> seq = null;
248                        String sequence = sequences.get(sequencename).toString();
249                        if (ignoreCase) {
250                                sequence = sequence.toUpperCase();
251                        }
252
253                        try {
254                        if (forcedSequenceType == null)
255                                seq = fileAnnotation.isPFam() ? new ProteinSequence(sequence) : new RNASequence(sequence);
256                        else if (forcedSequenceType.equals(PFAM))
257                                seq = new ProteinSequence(sequence);
258                        else
259                                seq = new RNASequence(sequence);
260                        } catch (CompoundNotFoundException e) {
261                                logger.warn("Could not create sequence because of unknown compounds ({}). Sequence {} will be ignored.",e.getMessage(),sequencename);
262                                continue;
263                        }
264                        String[] seqDetails = splitSeqName(sequencename);
265                        seq.setDescription(seqDetails[0]);
266                        seq.setBioBegin((seqDetails[1] == null || "".equals(seqDetails[1].trim()) ? null : Integer.valueOf(seqDetails[1])));
267                        seq.setBioEnd((seqDetails[2] == null || "".equals(seqDetails[2].trim()) ? null : Integer.valueOf(seqDetails[2])));
268
269                        seqs.add(seq);
270                }
271                return seqs;
272        }
273
274        /**
275         * Because some database files have incorrectly small letters (e.g. Pfam23 structure PF00389.22 sequence
276         * TKRA_BACSU/6-322), this function is used to ignore the small letters case.
277         *
278         * @param ignoreCase
279         * @return
280         * @see #getBioSequences()
281         * @see #getBioSequences(boolean, String)
282         */
283        public List<AbstractSequence<? extends AbstractCompound>> getBioSequences(boolean ignoreCase) {
284                return getBioSequences(ignoreCase, null);
285        }
286
287        /**
288         * Returns an array with the following sequence related content: name, start, end.
289         *
290         * @param sequenceName
291         *            the sequence from where to extract the content. It is supposed that it follows the following
292         *            convention name/start-end (e.g.: COATB_BPIKE/30-81)
293         * @return array with the following sequence related content: name, start, end.
294         */
295        private String[] splitSeqName(String sequenceName) {
296                String[] result = new String[3];
297
298                String[] barSplit = sequenceName.split("/");
299                if (barSplit.length == 2) {
300                        result[0] = barSplit[0];
301                        String[] positions = barSplit[1].split("-");
302                        if (positions.length == 2) {
303                                result[1] = positions[0];
304                                result[2] = positions[1];
305                        }
306                } else {
307                        result[0] = sequenceName;
308                        result[1] = null;
309                        result[2] = null;
310                }
311
312                return result;
313        }
314
315        @Override
316        public String toString() {
317                StringBuffer result = new StringBuffer();
318                List<AbstractSequence<? extends AbstractCompound>> bioSeqs = getBioSequences(false);
319                int sequenceLength = -1;
320                for (AbstractSequence<? extends AbstractCompound> sequence : bioSeqs) {
321                        String sequenceAsString = sequence.getSequenceAsString();
322                        sequenceLength = sequenceAsString.length();
323                        if (sequenceLength > 50) {
324                                result.append(sequenceAsString.substring(0, 40));
325                                result.append("...");
326                                result.append(sequenceAsString.substring(sequenceLength - 3, sequenceLength));
327                        } else {
328                                result.append(sequenceAsString);
329                        }
330                        result.append(" " + sequence.getDescription() + "\n");
331                }
332                result.append("Alignment with " + bioSeqs.size() + " rows and " + sequenceLength + " columns");
333
334                return result.toString();
335        }
336
337        public static class DatabaseReference {
338                public static final String EXPERT = "EXPERT";
339                public static final String MIM = "MIM";
340                public static final String PFAMB = "PFAMB";
341                public static final String PRINTS = "PRINTS";
342                public static final String PROSITE = "PROSITE";
343                public static final String PROSITE_PROFILE = "PROSITE_PROFILE";
344                public static final String SCOP = "SCOP";
345                public static final String PDB = "PDB";
346                public static final String SMART = "SMART";
347                public static final String URL = "URL";
348                public static final String LOAD = "LOAD";
349                public static final String HOMSTRAD = "HOMSTRAD";
350                public static final String INTERPRO = "INTERPRO";
351
352                private final String database;
353                /** TODO this field should be subdivided into smaller fields if the database is SCOP or PDB. */
354                private final String reference;
355
356                public DatabaseReference(String database, String reference) {
357                        this.database = database;
358                        this.reference = reference;
359                }
360
361                public DatabaseReference(String representativeAnnotationString) {
362                        int semiColonIndex = representativeAnnotationString.indexOf(';');
363                        this.database = representativeAnnotationString.substring(0, semiColonIndex);
364                        this.reference = representativeAnnotationString.substring(semiColonIndex + 1,
365                                        representativeAnnotationString.lastIndexOf(';')).trim();
366                }
367
368                @Override
369                public String toString() {
370                        return new StringBuilder(this.database).append(';').append(' ').append(this.reference).append(';')
371                                        .toString();
372                }
373
374                public String getDatabase() {
375                        return database;
376                }
377
378                public String getReference() {
379                        return reference;
380                }
381        }
382}