Source code

001/*
002 * BioJava development code
003 *
004 * This code may be freely distributed and modified under the terms of the GNU Lesser General Public Licence. This
005 * should be distributed with the code. If you do not have a copy, see:
006 *
007 * http://www.gnu.org/copyleft/lesser.html
008 *
009 * Copyright for this code is held jointly by the individual authors. These should be listed in @author doc comments.
010 *
011 * For more information on the BioJava project and its aims, or to join the biojava-l mailing list, visit the home page
012 * at:
013 *
014 * http://www.biojava.org/
015 *
016 * Created on August 13, 2010 Author: Mark Chapman
017 */
018
019package org.biojava.nbio.alignment.io;
020
021import org.biojava.nbio.alignment.io.StockholmFileAnnotation.StockholmFileAnnotationReference;
022import org.biojava.nbio.core.exceptions.ParserException;
023import org.biojava.nbio.core.util.InputStreamProvider;
024import org.slf4j.Logger;
025import org.slf4j.LoggerFactory;
026
027import java.io.IOException;
028import java.io.InputStream;
029import java.util.ArrayList;
030import java.util.List;
031import java.util.Map;
032import java.util.Scanner;
033
034/**
035 * Stockholm file parser.<br>
036 * for more information about the format refer to
037 * <ul>
038 * <li><a
039 * href="ftp://ftp.sanger.ac.uk/pub/databases/Pfam/current_release/userman.txt">ftp://ftp.sanger.ac.uk/pub/databases
040 * /Pfam/current_release/userman.txt</a>.</li>
041 * <li><a
042 * href="ftp://ftp.sanger.ac.uk/pub/databases/Rfam/CURRENT/USERMAN">ftp://ftp.sanger.ac.uk/pub/databases/Rfam/CURRENT
043 * /USERMAN</a>.</li>
044 * <li><a href="http://sonnhammer.sbc.su.se/Stockholm.html">http://sonnhammer.sbc.su.se/Stockholm.html</a>.</li>
045 * </ul>
046 *
047 * <pre>
048 * Pfam DESCRIPTION OF FIELDS
049 *
050 *    Compulsory fields:
051 *    ------------------
052 *
053 *    AC   Accession number:           Accession number in form PFxxxxx.version or PBxxxxxx.
054 *    ID   Identification:             One word name for family.
055 *    DE   Definition:                 Short description of family.
056 *    AU   Author:                     Authors of the entry.
057 *    SE   Source of seed:             The source suggesting the seed members belong to one family.
058 *    GA   Gathering method:           Search threshold to build the full alignment.
059 *    TC   Trusted Cutoff:             Lowest sequence score and domain score of match in the full alignment.
060 *    NC   Noise Cutoff:               Highest sequence score and domain score of match not in full alignment.
061 *    TP   Type:                       Type of family -- presently Family, Domain, Motif or Repeat.
062 *    SQ   Sequence:                   Number of sequences in alignment.
063 *    //                               End of alignment.
064 *
065 *    Optional fields:
066 *    ----------------
067 *
068 *    DC   Database Comment:           Comment about database reference.
069 *    DR   Database Reference:         Reference to external database.
070 *    RC   Reference Comment:          Comment about literature reference.
071 *    RN   Reference Number:           Reference Number.
072 *    RM   Reference Medline:          Eight digit medline UI number.
073 *    RT   Reference Title:            Reference Title.
074 *    RA   Reference Author:           Reference Author
075 *    RL   Reference Location:         Journal location.
076 *    PI   Previous identifier:        Record of all previous ID lines.
077 *    KW   Keywords:                   Keywords.
078 *    CC   Comment:                    Comments.
079 *    NE   Pfam accession:             Indicates a nested domain.
080 *    NL   Location:                   Location of nested domains - sequence ID, start and end of insert.
081 *    WK   Wikipedia Reference:        Reference to wikipedia.
082 *
083 *    Obsolete fields:
084 *    -----------
085 *    AL   Alignment method of seed:   The method used to align the seed members.
086 *    AM   Alignment Method:        The order ls and fs hits are aligned to the model to build the full align.
087 *
088 * </pre>
089 *
090 * @since 3.0.5
091 * @author Amr AL-Hossary
092 * @author Marko Vaz
093 *
094 */
095public class StockholmFileParser {
096
097        private final static Logger logger = LoggerFactory.getLogger(StockholmFileParser.class);
098
099        /** indicates reading as much as possible, without limits */
100        public static final int INFINITY = -1;
101        /** #=GF &lt;feature&gt; &lt;Generic per-File annotation, free text&gt; */
102        private static final String GENERIC_PER_FILE_ANNOTATION = "GF";
103        /** #=GC &lt;feature&gt; &lt;Generic per-Column annotation, exactly 1 char per column&gt; */
104        private static final String GENERIC_PER_CONSENSUS_ANNOTATION = "GC";
105        /** #=GS &lt;seqname&gt; &lt;feature&gt; &lt;Generic per-Sequence annotation, free text&gt; */
106        private static final String GENERIC_PER_SEQUENCE_ANNOTATION = "GS";
107        /** #=GR &lt;seqname&gt; &lt;feature&gt; &lt;Generic per-Residue annotation, exactly 1 char per residue&gt; */
108        private static final String GENERIC_PER_RESIDUE_ANNOTATION = "GR";
109
110        // COMPULSORY FIELDS
111        /** Accession number in form PFxxxxx (Pfam) or RFxxxxx (Rfam). */
112        private static final String GF_ACCESSION_NUMBER = "AC";
113        /** One word name for family. */
114        private static final String GF_IDENTIFICATION = "ID";
115        /** Short description of family. */
116        private static final String GF_DEFINITION = "DE";
117        /** Authors of the entry. */
118        private static final String GF_AUTHOR = "AU";
119        /**
120         * Indicates the order that ls and fs matches are aligned to the model to give the full alignment. (OBSOLETE IN
121         * HMMER3)
122         */
123        private static final String GF_ALIGNMENT_METHOD = "AM";
124        /** Command line used to generate the model */
125        private static final String GF_BUILD_METHOD = "BM";
126        /** Command line used to perform the search */
127        private static final String GF_SEARCH_METHOD = "SM";
128        /** The source suggesting the seed members belong to one family. */
129        private static final String GF_SOURCE_SEED = "SE";
130        /** The source (prediction or publication) of the consensus RNA secondary structure used by Rfam. */
131        private static final String GF_SOURCE_STRUCTURE = "SS";
132        /** Search threshold to build the full alignment. */
133        private static final String GF_GATHERING_THRESHOLD = "GA";
134        /** Lowest sequence score (and domain score for Pfam) of match in the full alignment. */
135        private static final String GF_TRUSTED_CUTOFF = "TC";
136        /** Highest sequence score (and domain score for Pfam) of match not in full alignment. */
137        private static final String GF_NOISE_CUTOFF = "NC";
138        /**
139         * Type of family -- presently Family, Domain, Motif or Repeat for Pfam. -- a tree with roots Gene, Intron or
140         * Cis-reg for Rfam.
141         */
142        private static final String GF_TYPE_FIELD = "TP";
143        /** Number of sequences in alignment, and start of MSA. */
144        private static final String GF_SEQUENCE = "SQ";
145
146        // OPTIONAL FIELDS
147
148        /** Comment about database reference. */
149        private static final String GF_DB_COMMENT = "DC";
150        /** Reference to external database. */
151        private static final String GF_DB_REFERENCE = "DR";
152        /** Comment about literature reference. */
153        private static final String GF_REFERENCE_COMMENT = "RC";
154        /** Reference Number. */
155        private static final String GF_REFERENCE_NUMBER = "RN";
156        /** Eight digit medline UI number. */
157        private static final String GF_REFERENCE_MEDLINE = "RM";
158        /** Reference Title. */
159        private static final String GF_REFERENCE_TITLE = "RT";
160        /** Reference Author. */
161        private static final String GF_REFERENCE_AUTHOR = "RA";
162        /** Journal Location. */
163        private static final String GF_REFERENCE_LOCALTION = "RL";
164        /** Record of all previous ID lines. */
165        private static final String GF_PREVIOUS_IDS = "PI";
166        /** Keywords */
167        private static final String GF_KEYWORDS = "KW";
168        /** Comments */
169        private static final String GF_COMMENT = "CC";
170        /** Indicates a nested domain */
171        private static final String GF_PFAM_ACCESSION = "NE";
172        /** Location of nested domains - sequence ID, start and end of insert. */
173        private static final String GF_LOCATION = "NL";
174        /** Wikipedia page */
175        private static final String GF_WIKIPEDIA_LINK = "WK";
176        /** Clan accession */
177        private static final String GF_CLAN = "CL";
178        /** Used for listing Clan membership */
179        private static final String GF_MEMBERSHIP = "MB";
180
181        /** FOR EMBEDDING TREES **/
182
183        /** A tree in New Hampshire eXtended format. */
184        private static final String GF_NEW_HAMPSHIRE = "NH";
185        /** A unique identifier for the next tree. */
186        private static final String GF_TREE_ID = "TN";
187
188        // OTHER
189
190        /**
191         * A method used to set the bit score threshold based on the ratio of expected false positives to true positives.
192         * Floating point number between 0 and 1.
193         */
194        private static final String GF_FALSE_DISCOVERY_RATE = "FR";
195
196        // #=GS <seqname> <feature> <Generic per-Sequence annotation, free text>
197
198        private static final String GS_ACCESSION_NUMBER = "AC";
199        private static final String GS_DESCRIPTION = "DE";
200        private static final String GS_DATABASE_REFERENCE = "DR";
201        private static final String GS_ORGANISM_SPECIES = "OS";
202        private static final String GS_ORGANISM_CLASSIFICATION = "OC";
203        private static final String GS_LOOK = "LO";
204
205        // #=GR <seqname> <feature> <Generic per-Residue annotation, exactly 1 char per residue>
206
207        /**
208         * For RNA [.,;<>(){}[]AaBb...],<br>
209         * For protein [HGIEBTSCX]
210         */
211        private static final String GR_SECONDARY_STRUCTURE = "SS";
212        /**
213         * [0-9X]<br>
214         * (0=0%-10%; ...; 9=90%-100%)
215         */
216        private static final String GR_SURFACE_ACCESSIBILITY = "SA";
217
218        /** [Mio] */
219        private static final String GR_TRANS_MEMBRANE = "TM";
220        /**
221         * [0-9*]<br>
222         * (0=0.00-0.05; 1=0.05-0.15; *=0.95-1.00)
223         */
224        private static final String GR_POSTERIOR_PROBABILITY = "PP";
225        /** [*] */
226        private static final String GR_LIGAND_BINDING = "LI";
227        /** [*] */
228        private static final String GR_ACTIVE_SITE = "AS";
229        /** [*] */
230        private static final String GR_AS_PFAM_PREDICTED = "pAS";
231        /** [*] */
232        private static final String GR_AS_SWISSPROT = "sAS";
233        /** [0-2] */
234        private static final String GR_INTRON = "IN";
235
236        // #=GC <feature> <Generic per-Column annotation, exactly 1 char per column>
237
238        private static final String GC_SEQUENSE_CONSENSUS = "seq_cons";
239        private static final String GC_SECONDARY_STRUCTURE = "SS_cons";
240        private static final String GC_SURFACE_ACCESSIBILITY = "SA_cons";
241        private static final String GC_TRANS_MEMBRANE = "TM_cons";
242        private static final String GC_POSTERIOR_PROBABILITY = "PP_cons";
243        private static final String GC_LIGAND_BINDING = "LI_cons";
244        private static final String GC_ACTIVE_SITE = "AS_cons";
245        private static final String GC_AS_PFAM_PREDICTED = "pAS_cons";
246        private static final String GC_AS_SWISSPROT = "sAS_cons";
247        private static final String GC_INTRON = "IN_cons";
248        /**
249         * Often the consensus RNA or protein sequence is used as a reference Any non-gap character (eg. x's) can indicate
250         * consensus/conserved/match columns .'s or -'s indicate insert columns ~'s indicate unaligned insertions Upper and
251         * lower case can be used to discriminate strong and weakly conserved residues respectively
252         */
253        private static final String GC_REFERENCE_ANNOTATION = "RF";
254        /**
255         * Indicates which columns in an alignment should be masked, such that the emission probabilities for match states
256         * corresponding to those columns will be the background distribution.
257         */
258        private static final String GC_MODEL_MASK = "MM";
259
260        private StockholmStructure stockholmStructure;
261        // private boolean endFile = false;
262
263        // private static final int STATUS_OUTSIDE_FILE = 0;
264        // private static final int STATUS_INSIDE_FILE = 10;
265        // private static final int STATUS_IN_SEQUENCE = 20;
266        //
267        // private int status=STATUS_OUTSIDE_FILE;
268
269        Scanner internalScanner = null;
270        private InputStream cashedInputStream;
271
272        /**
273         * Parses a Stockholm file and returns a {@link StockholmStructure} object with its content.<br>
274         * This function is meant to be used for single access to specific file and it closes the file after doing its
275         * assigned job. Any subsequent call to {@link #parseNext(int)} will throw an exception or will function with
276         * unpredicted behavior.
277         *
278         * @param filename
279         *            complete(?) path to the file from where to read the content
280         * @return stockholm file content
281         * @throws IOException
282         *             when an exception occurred while opening/reading/closing the file+
283         * @throws ParserException
284         *             if unexpected format is encountered
285         */
286        public StockholmStructure parse(String filename) throws IOException, ParserException {
287                InputStream inStream = new InputStreamProvider().getInputStream(filename);
288                StockholmStructure structure = parse(inStream);
289                inStream.close();
290                return structure;
291        }
292
293        /**
294         * Parses a Stockholm file and returns a {@link StockholmStructure} object with its content.<br>
295         * This function doesn't close the file after doing its assigned job; to allow for further calls of
296         * {@link #parseNext(int)}.
297         *
298         * @see #parseNext(int)
299         *
300         * @param filename
301         *            file from where to read the content. see {@link InputStreamProvider} for more details.
302         * @param max
303         *            maximum number of files to read, {@link #INFINITY} for all.
304         * @return a vector of {@link StockholmStructure} containing parsed structures.
305         * @throws IOException
306         *             when an exception occurred while opening/reading/closing the file.
307         * @throws ParserException
308         *             if unexpected format is encountered
309         */
310        public List<StockholmStructure> parse(String filename, int max) throws IOException, ParserException {
311                InputStreamProvider isp = new InputStreamProvider();
312                InputStream inStream = isp.getInputStream(filename);
313                return parse(inStream, max);
314        }
315
316        /**
317         * parses {@link InputStream} and returns a the first contained alignment in a {@link StockholmStructure} object.
318         * Used mainly for multiple files within the same input stream, (e.g. when reading from Pfam flat files. <br>
319         * This method leaves the stream open for further calls of {@link #parseNext(int)}.
320         *
321         * @see #parseNext(int)
322         * @param inStream
323         *            the {@link InputStream} containing the file to read.
324         * @return a {@link StockholmStructure} object representing file contents.
325         * @throws IOException
326         * @throws ParserException
327         */
328        public StockholmStructure parse(InputStream inStream) throws ParserException, IOException {
329                return parse(inStream, 1).get(0);
330        }
331
332        /**
333         * parses an {@link InputStream} and returns at maximum <code>max</code> objects contained in that file.<br>
334         * This method leaves the stream open for further calls of {@link #parse(InputStream, int)} (same function) or
335         * {@link #parseNext(int)}.
336         *
337         * @see #parseNext(int)
338         * @param inStream
339         *            the stream to parse
340         * @param max
341         *            maximum number of structures to try to parse, {@link #INFINITY} to try to obtain as much as possible.
342         * @return a {@link List} of {@link StockholmStructure} objects. If there are no more structures, an empty list is
343         *         returned.
344         * @throws IOException
345         *             in case an I/O Exception occurred.
346         */
347        public List<StockholmStructure> parse(InputStream inStream, int max) throws IOException {
348                if (max < INFINITY) {
349                        throw new IllegalArgumentException("max can't be -ve value " + max);
350                }
351                if (inStream != this.cashedInputStream) {
352                        this.cashedInputStream = inStream;
353                        this.internalScanner = null;
354                }
355
356                if (internalScanner == null) {
357                        internalScanner = new Scanner(inStream);
358                }
359                ArrayList<StockholmStructure> structures = new ArrayList<StockholmStructure>();
360                while (max != INFINITY && max-- > 0) {
361                        StockholmStructure structure = parse(internalScanner);
362                        if (structure != null) {
363                                structures.add(structure);
364                        } else {
365                                break;
366                        }
367                }
368                return structures;
369        }
370
371        /**
372         * Tries to parse and return as maximum as <code>max</code> structures in the last used file or input stream.<br>
373         * Please consider calling either {@link #parse(InputStream)}, {@link #parse(InputStream, int)}, or
374         * {@link #parse(String, int)} before calling this function.
375         *
376         * @param max
377         * @return
378         * @throws IOException
379         */
380        public List<StockholmStructure> parseNext(int max) throws IOException {
381                return parse(this.cashedInputStream, max);
382        }
383
384        /**
385         * Parses a Stockholm file and returns a {@link StockholmStructure} object with its content. This method returns
386         * just after reaching the end of structure delimiter line ("//"), leaving any remaining empty lines unconsumed.
387         *
388         * @param scanner
389         *            from where to read the file content
390         * @return Stockholm file content, <code>null</code> if couldn't or no more structures.
391         * @throws IOException
392         * @throws Exception
393         */
394        StockholmStructure parse(Scanner scanner) throws ParserException, IOException {
395                if (scanner == null) {
396                        if (internalScanner != null) {
397                                scanner = internalScanner;
398                        } else {
399                                throw new IllegalArgumentException("No Scanner defined");
400                        }
401                }
402                String line = null;
403                int linesCount = 0;
404                try {
405                        while (scanner.hasNextLine()) {
406                                line = scanner.nextLine();
407                                // if the file is empty
408                                // this condition will not happen, just left in case we decided to go for buffereedReader again for
409                                // performance purpose.
410                                if (linesCount == 0 && line == null) {
411                                        throw new IOException("Could not parse Stockholm file, BufferedReader returns null!");
412                                }
413
414                                // ignore empty lines
415                                if ((/* status==STATUS_INSIDE_FILE && */line == null) || line.trim().length() == 0) {
416                                        continue;
417                                }
418
419                                if (line.startsWith("#=G")) {
420                                        // // comment line or metadata
421                                        // line = line.substring(1).trim();
422                                        // line = line.substring(1).trim();
423                                        if (line.startsWith(GENERIC_PER_FILE_ANNOTATION, 2)) {
424                                                // #=GF <featurename> <generic per-file annotation, free text>
425                                                int firstSpaceIndex = line.indexOf(' ', 5);
426                                                String featureName = line.substring(5, firstSpaceIndex);
427                                                String value = line.substring(firstSpaceIndex).trim();
428                                                handleFileAnnotation(featureName, value);
429                                        } else if (line.startsWith(GENERIC_PER_CONSENSUS_ANNOTATION, 2)) {
430                                                // Being in a consensus means we are no longer in a sequence.
431                                                // this.status = STATUS_INSIDE_FILE;
432                                                // #=GC <featurename> <generic per-column annotation, exactly 1 char per column>
433                                                int firstSpaceIndex = line.indexOf(' ', 5);
434                                                String featureName = line.substring(5, firstSpaceIndex);
435                                                String value = line.substring(firstSpaceIndex).trim();
436                                                handleConsensusAnnotation(featureName, value);
437                                        } else if (line.startsWith(GENERIC_PER_SEQUENCE_ANNOTATION, 2)) {
438                                                // #=GS <seqname> <featurename> <generic per-sequence annotation, free text>
439                                                int index1 = line.indexOf(' ', 5);
440                                                String seqName = line.substring(5, index1);
441                                                while (line.charAt(++index1) <= ' ')
442                                                        // i.e. white space
443                                                        ;// keep advancing
444                                                int index2 = line.indexOf(' ', index1);
445                                                String featureName = line.substring(index1, index2);
446                                                String value = line.substring(index2).trim();
447                                                handleSequenceAnnotation(seqName, featureName, value);
448                                        } else if (line.startsWith(GENERIC_PER_RESIDUE_ANNOTATION, 2)) {
449                                                // #=GR <seqname> <featurename> <generic per-sequence AND per-column mark-up, exactly 1
450                                                // character per column>
451                                                int index1 = line.indexOf(' ', 5);
452                                                String seqName = line.substring(5, index1);
453                                                while (line.charAt(++index1) == ' ')
454                                                        ;// keep advancing
455                                                int index2 = line.indexOf(' ', index1);
456                                                String featureName = line.substring(index1, index2);
457                                                String value = line.substring(index2).trim();
458                                                handleResidueAnnotation(seqName, featureName, value);
459                                        }
460                                } else if (line.startsWith("# STOCKHOLM")) { // it is the header line
461                                        // if (status == STATUS_OUTSIDE_FILE) {
462                                        // status = STATUS_INSIDE_FILE;
463                                        // String[] header = line.split("\\s+");
464                                        // this.stockholmStructure = new StockholmStructure();
465                                        // this.stockholmStructure.getFileAnnotation().setFormat(header[1]);
466                                        // this.stockholmStructure.getFileAnnotation().setVersion(header[2]);
467                                        // } else {
468                                        // throw new ParserException("Uexpected Format line: [" + line + "]");
469                                        // }
470                                        String[] header = line.split("\\s+");
471                                        this.stockholmStructure = new StockholmStructure();
472                                        this.stockholmStructure.getFileAnnotation().setFormat(header[1]);
473                                        this.stockholmStructure.getFileAnnotation().setVersion(header[2]);
474                                } else if (line.trim().equals("//")) {
475                                        // status = STATUS_OUTSIDE_FILE;
476                                        break;// should we just break immediately or jump next empty lines?
477                                } else /* if (!line.startsWith("#")) */{
478                                        // most probably This line corresponds to a sequence. Something like:
479                                        // O83071/192-246 MTCRAQLIAVPRASSLAEAIACAQKMRVSRVPVYERS
480                                        // N.B. as long as we don't check the status now, it is somehow error prone
481                                        handleSequenceLine(line);
482                                        // //============removed status==========================
483                                        // if (status == STATUS_IN_SEQUENCE) {
484                                        // // This line corresponds to a sequence. Something like:
485                                        // // O83071/192-246 MTCRAQLIAVPRASSLAEAIACAQKMRVSRVPVYERS
486                                        // handleSequenceLine(line);
487                                        // // }else if (status==STATUS_OUTSIDE_FILE) {
488                                        // // throw new
489                                        // //
490                                        // ParserException("The end of file character was allready reached but there are still sequence lines");
491                                        // } else {
492                                        // System.err.println("Error: Unknown or unexpected line [" + line
493                                        // + "].\nPlease contact the Biojava team.");
494                                        // throw new ParserException("Error: Unknown or unexpected line [" + line + "].");
495                                        // }
496                                        // //============removed status==========================
497                                }
498                                linesCount++;
499                        }
500                } catch (IOException e) {
501                        // TODO: Best practice is to catch or throw Exception, never both
502                        logger.error("IOException: ", e);
503                        throw new IOException("Error parsing Stockholm file");
504                }
505                StockholmStructure structure = this.stockholmStructure;
506                this.stockholmStructure = null;
507                if (structure != null) {
508                        int length = -1;
509                        Map<String, StringBuffer> sequences = structure.getSequences();
510                        for (String sequencename : sequences.keySet()) {
511                                StringBuffer sequence = sequences.get(sequencename);
512                                if (length == -1) {
513                                        length = sequence.length();
514                                } else if (length != sequence.length()) {
515                                        throw new RuntimeException("Sequences have different lengths");
516                                }
517                        }
518                }
519                return structure;
520        }
521
522        /**
523         * Handles a line that corresponds to a sequence. <br>
524         * e.g.: COATB_BPIKE/30-81 AEPNAATNYATEAMDSLKTQAIDLISQTWPVVTTVVVAGLVIRLFKKFSSKA<br>
525         * N.B.: This function can't tolerate sequences with intrinsic white space.
526         *
527         * @param line
528         *            the line to be parsed
529         * @throws Exception
530         */
531        private void handleSequenceLine(String line) throws ParserException {
532                String[] lineContent = line.split("\\s+");
533                if (lineContent.length != 2) {
534                        throw new ParserException("Could not split sequence line into sequence name and sequence:\n" + line);
535                }
536                stockholmStructure.appendToSequence(lineContent[0], lineContent[1]);
537        }
538
539        /**
540         * #=GF &lt;feature&gt; &lt;Generic per-File annotation, free text&gt;
541         *
542         * @param featureName
543         * @param value
544         *            the line to be parsed
545         */
546        private void handleFileAnnotation(String featureName, String value) {
547                if (featureName.equals(GF_ACCESSION_NUMBER)) {
548                        stockholmStructure.getFileAnnotation().setGFAccessionNumber(value);
549                } else if (featureName.equals(GF_IDENTIFICATION)) {
550                        stockholmStructure.getFileAnnotation().setGFIdentification(value);
551                } else if (featureName.equals(GF_DB_REFERENCE)) {
552                        stockholmStructure.getFileAnnotation().addDBReference(value);
553                } else if (featureName.equals(GF_DEFINITION)) {
554                        stockholmStructure.getFileAnnotation().setGFDefinition(value);
555                } else if (featureName.equals(GF_AUTHOR)) {
556                        stockholmStructure.getFileAnnotation().setGFAuthors(value);
557                } else if (featureName.equals(GF_ALIGNMENT_METHOD)) {
558                        stockholmStructure.getFileAnnotation().setAlignmentMethod(value);
559                } else if (featureName.equals(GF_BUILD_METHOD)) {
560                        stockholmStructure.getFileAnnotation().addGFBuildMethod(value);
561                } else if (featureName.equals(GF_SEARCH_METHOD)) {
562                        stockholmStructure.getFileAnnotation().setGFSearchMethod(value);
563                } else if (featureName.equals(GF_SOURCE_SEED)) {
564                        stockholmStructure.getFileAnnotation().setGFSourceSeed(value);
565                } else if (featureName.equals(GF_SOURCE_STRUCTURE)) {
566                        stockholmStructure.getFileAnnotation().setGFSourceStructure(value);
567                } else if (featureName.equals(GF_GATHERING_THRESHOLD)) {
568                        stockholmStructure.getFileAnnotation().setGFGatheringThreshs(value);
569                } else if (featureName.equals(GF_TRUSTED_CUTOFF)) {
570                        stockholmStructure.getFileAnnotation().setGFTrustedCutoffs(value);
571                } else if (featureName.equals(GF_NOISE_CUTOFF)) {
572                        stockholmStructure.getFileAnnotation().setGFNoiseCutoffs(value);
573                } else if (featureName.equals(GF_TYPE_FIELD)) {
574                        stockholmStructure.getFileAnnotation().setGFTypeField(value);
575                } else if (featureName.equals(GF_PREVIOUS_IDS)) {
576                        stockholmStructure.getFileAnnotation().setGFPreviousIDs(value);
577                } else if (featureName.equals(GF_SEQUENCE)) {
578                        // status = STATUS_IN_SEQUENCE;
579                        stockholmStructure.getFileAnnotation().setGFNumSequences(value);
580                } else if (featureName.equals(GF_DB_COMMENT)) {
581                        stockholmStructure.getFileAnnotation().setGFDBComment(value);
582                        // } else if (featureName.equals(GF_DB_REFERENCE)) {
583                        // stockholmStructure.getFileAnnotation().addDBReference(value);
584                } else if (featureName.equals(GF_REFERENCE_COMMENT)) {
585                        stockholmStructure.getFileAnnotation().setGFRefComment(value);
586                } else if (featureName.equals(GF_REFERENCE_NUMBER)) {
587                        StockholmFileAnnotationReference reference = new StockholmFileAnnotationReference();
588                        stockholmStructure.getFileAnnotation().getReferences().add(reference);
589                } else if (featureName.equals(GF_REFERENCE_MEDLINE)) {
590                        stockholmStructure.getFileAnnotation().getReferences().lastElement().setRefMedline(value);
591                } else if (featureName.equals(GF_REFERENCE_TITLE)) {
592                        stockholmStructure.getFileAnnotation().getReferences().lastElement().addToRefTitle(value);
593                } else if (featureName.equals(GF_REFERENCE_AUTHOR)) {
594                        stockholmStructure.getFileAnnotation().getReferences().lastElement().addToRefAuthor(value);
595                } else if (featureName.equals(GF_REFERENCE_LOCALTION)) {
596                        stockholmStructure.getFileAnnotation().getReferences().lastElement().setRefLocation(value);
597                } else if (featureName.equals(GF_KEYWORDS)) {
598                        stockholmStructure.getFileAnnotation().setGFKeywords(value);
599                } else if (featureName.equals(GF_COMMENT)) {
600                        stockholmStructure.getFileAnnotation().addToGFComment(value);
601                } else if (featureName.equals(GF_PFAM_ACCESSION)) {
602                        stockholmStructure.getFileAnnotation().setGFPfamAccession(value);
603                } else if (featureName.equals(GF_LOCATION)) {
604                        stockholmStructure.getFileAnnotation().setGFLocation(value);
605                } else if (featureName.equals(GF_WIKIPEDIA_LINK)) {
606                        stockholmStructure.getFileAnnotation().setGFWikipediaLink(value);
607                } else if (featureName.equals(GF_CLAN)) {
608                        stockholmStructure.getFileAnnotation().setGFClan(value);
609                } else if (featureName.equals(GF_MEMBERSHIP)) {
610                        stockholmStructure.getFileAnnotation().setGFMembership(value);
611                } else if (featureName.equals(GF_NEW_HAMPSHIRE)) {
612                        stockholmStructure.getFileAnnotation().addGFNewHampshire(value);
613                } else if (featureName.equals(GF_TREE_ID)) {
614                        stockholmStructure.getFileAnnotation().addGFTreeID(value);
615                } else if (featureName.equals(GF_FALSE_DISCOVERY_RATE)) {
616                        stockholmStructure.getFileAnnotation().addGFFalseDiscoveryRate(value);
617                } else {
618                        // unknown feature
619                        logger.warn("Unknown File Feature [{}].\nPlease contact the Biojava team.", featureName);
620                }
621        }
622
623        /**
624         * usually a single line of:<br>
625         * #=GC &lt;feature&gt; &lt;Generic per-Column annotation, exactly 1 char per column&gt;
626         *
         * @param featureName
         *            the feature tag that follows <code>#=GC</code>
         * @param value
         *            the per-column annotation string for that feature
631         */
632        private void handleConsensusAnnotation(String featureName, String value) {
633                if (featureName.equals(GC_SECONDARY_STRUCTURE)) {
634                        stockholmStructure.getConsAnnotation().setSecondaryStructure(value);
635                } else if (featureName.equals(GC_SEQUENSE_CONSENSUS)) {
636                        stockholmStructure.getConsAnnotation().setSequenceConsensus(value);
637                } else if (featureName.equals(GC_SURFACE_ACCESSIBILITY)) {
638                        stockholmStructure.getConsAnnotation().setSurfaceAccessibility(value);
639                } else if (featureName.equals(GC_TRANS_MEMBRANE)) {
640                        stockholmStructure.getConsAnnotation().setTransMembrane(value);
641                } else if (featureName.equals(GC_POSTERIOR_PROBABILITY)) {
642                        stockholmStructure.getConsAnnotation().setPosteriorProbability(value);
643                } else if (featureName.equals(GC_LIGAND_BINDING)) {
644                        stockholmStructure.getConsAnnotation().setLigandBinding(value);
645                } else if (featureName.equals(GC_ACTIVE_SITE)) {
646                        stockholmStructure.getConsAnnotation().setActiveSite(value);
647                } else if (featureName.equals(GC_AS_PFAM_PREDICTED)) {
648                        stockholmStructure.getConsAnnotation().setAsPFamPredicted(value);
649                } else if (featureName.equals(GC_AS_SWISSPROT)) {
650                        stockholmStructure.getConsAnnotation().setAsSwissProt(value);
651                } else if (featureName.equals(GC_INTRON)) {
652                        stockholmStructure.getConsAnnotation().setIntron(value);
653                } else if (featureName.equals(GC_REFERENCE_ANNOTATION)) {
654                        stockholmStructure.getConsAnnotation().setReferenceAnnotation(value);
655                } else if (featureName.equals(GC_MODEL_MASK)) {
656                        stockholmStructure.getConsAnnotation().setModelMask(value);
657                } else {
658                        // unknown feature
659                        logger.warn("Unknown Consensus Feature [{}].\nPlease contact the Biojava team.", featureName);
660                }
661        }
662
663        /**
664         * #=GS &lt;seqname&gt; &lt;feature&gt; &lt;Generic per-Sequence annotation, free text&gt;
665         *
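         * @param seqName
         *            the name of the sequence the annotation belongs to
         * @param featureName
         *            the feature tag that follows the sequence name
         * @param value
         *            the free-text annotation value of that feature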
668         */
669        private void handleSequenceAnnotation(String seqName, String featureName, String value) {
670                if (featureName.equals(GS_ACCESSION_NUMBER)) {
671                        stockholmStructure.addGSAccessionNumber(seqName, value);
672                } else if (featureName.equals(GS_DESCRIPTION)) {
673                        stockholmStructure.addGSDescription(seqName, value);
674                } else if (featureName.equals(GS_DATABASE_REFERENCE)) {
675                        stockholmStructure.addGSdbReference(seqName, value);
676                } else if (featureName.equals(GS_ORGANISM_SPECIES)) {
677                        stockholmStructure.addGSOrganismSpecies(seqName, value);
678                } else if (featureName.equals(GS_ORGANISM_CLASSIFICATION)) {
679                        stockholmStructure.addGSOrganismClassification(seqName, value);
680                } else if (featureName.equals(GS_LOOK)) {
681                        stockholmStructure.addGSLook(seqName, value);
682                } else {
683                        // unknown feature
684                        logger.warn("Unknown Sequence Feature [{}].\nPlease contact the Biojava team.", featureName);
685                }
686        }
687
688        /**
689         * #=GR &lt;seqname&gt; &lt;feature&gt; &lt;Generic per-Residue annotation, exactly 1 char per residue&gt;
690         *
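         * @param seqName
         *            the name of the sequence the annotation belongs to
         * @param featureName
         *            the feature tag that follows the sequence name
         * @param value
         *            the per-residue annotation string for that feature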
693         */
694        private void handleResidueAnnotation(String seqName, String featureName, String value) {
695
696                if (featureName.equals(GR_SURFACE_ACCESSIBILITY)) {
697                        stockholmStructure.addSurfaceAccessibility(seqName, value);
698                } else if (featureName.equals(GR_TRANS_MEMBRANE)) {
699                        stockholmStructure.addTransMembrane(seqName, value);
700                } else if (featureName.equals(GR_POSTERIOR_PROBABILITY)) {
701                        stockholmStructure.addPosteriorProbability(seqName, value);
702                } else if (featureName.equals(GR_LIGAND_BINDING)) {
703                        stockholmStructure.addLigandBinding(seqName, value);
704                } else if (featureName.equals(GR_ACTIVE_SITE)) {
705                        stockholmStructure.addActiveSite(seqName, value);
706                } else if (featureName.equals(GR_AS_PFAM_PREDICTED)) {
707                        stockholmStructure.addASPFamPredicted(seqName, value);
708                } else if (featureName.equals(GR_AS_SWISSPROT)) {
709                        stockholmStructure.addASSwissProt(seqName, value);
710                } else if (featureName.equals(GR_INTRON)) {
711                        stockholmStructure.addIntron(seqName, value);
712                } else if (featureName.equals(GR_SECONDARY_STRUCTURE)) {
713                        stockholmStructure.addSecondaryStructure(seqName, value);
714                } else {
715                        // unknown feature
716                        logger.warn("Unknown Residue Feature [{}].\nPlease contact the Biojava team.", featureName);
717                }
718        }
719}