/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the terms of the GNU Lesser General Public Licence. This
 * should be distributed with the code. If you do not have a copy, see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual authors. These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims, or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 * Created on August 13, 2010 Author: Mark Chapman
 */

package org.biojava.nbio.alignment.io;

import org.biojava.nbio.alignment.io.StockholmFileAnnotation.StockholmFileAnnotationReference;
import org.biojava.nbio.core.exceptions.ParserException;
import org.biojava.nbio.core.util.InputStreamProvider;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Scanner;

/**
 * Stockholm file parser.<br>
 * For more information about the format refer to
 * <ul>
 * <li><a
 * href="ftp://ftp.sanger.ac.uk/pub/databases/Pfam/current_release/userman.txt">ftp://ftp.sanger.ac.uk/pub/databases
 * /Pfam/current_release/userman.txt</a>.</li>
 * <li><a
 * href="ftp://ftp.sanger.ac.uk/pub/databases/Rfam/CURRENT/USERMAN">ftp://ftp.sanger.ac.uk/pub/databases/Rfam/CURRENT
 * /USERMAN</a>.</li>
 * <li><a href="http://sonnhammer.sbc.su.se/Stockholm.html">http://sonnhammer.sbc.su.se/Stockholm.html</a>.</li>
 * </ul>
 *
 * <pre>
 * Pfam DESCRIPTION OF FIELDS
 *
 *    Compulsory fields:
 *    ------------------
 *
 *    AC   Accession number:           Accession number in form PFxxxxx.version or PBxxxxxx.
 *    ID   Identification:             One word name for family.
 *    DE   Definition:                 Short description of family.
 *    AU   Author:                     Authors of the entry.
 *    SE   Source of seed:             The source suggesting the seed members belong to one family.
 *    GA   Gathering method:           Search threshold to build the full alignment.
 *    TC   Trusted Cutoff:             Lowest sequence score and domain score of match in the full alignment.
 *    NC   Noise Cutoff:               Highest sequence score and domain score of match not in full alignment.
 *    TP   Type:                       Type of family -- presently Family, Domain, Motif or Repeat.
 *    SQ   Sequence:                   Number of sequences in alignment.
 *    //                               End of alignment.
 *
 *    Optional fields:
 *    ----------------
 *
 *    DC   Database Comment:           Comment about database reference.
 *    DR   Database Reference:         Reference to external database.
 *    RC   Reference Comment:          Comment about literature reference.
 *    RN   Reference Number:           Reference Number.
 *    RM   Reference Medline:          Eight digit medline UI number.
 *    RT   Reference Title:            Reference Title.
 *    RA   Reference Author:           Reference Author
 *    RL   Reference Location:         Journal location.
 *    PI   Previous identifier:        Record of all previous ID lines.
 *    KW   Keywords:                   Keywords.
 *    CC   Comment:                    Comments.
 *    NE   Pfam accession:             Indicates a nested domain.
 *    NL   Location:                   Location of nested domains - sequence ID, start and end of insert.
 *    WK   Wikipedia Reference:        Reference to wikipedia.
 *
 *    Obsolete fields:
 *    -----------
 *    AL   Alignment method of seed:   The method used to align the seed members.
 *    AM   Alignment Method:           The order ls and fs hits are aligned to the model to build the full align.
 *
 * </pre>
 *
 * @since 3.0.5
 * @author Amr ALHOSSARY
 * @author Marko Vaz
 *
 */
public class StockholmFileParser {

    private static final Logger logger = LoggerFactory.getLogger(StockholmFileParser.class);

    /** indicates reading as much as possible, without limits */
    public static final int INFINITY = -1;
    /** {@code #=GF <feature> <Generic per-File annotation, free text>} */
    private static final String GENERIC_PER_FILE_ANNOTATION = "GF";
    /** {@code #=GC <feature> <Generic per-Column annotation, exactly 1 char per column>} */
    private static final String GENERIC_PER_CONSENSUS_ANNOTATION = "GC";
    /** {@code #=GS <seqname> <feature> <Generic per-Sequence annotation, free text>} */
    private static final String GENERIC_PER_SEQUENCE_ANNOTATION = "GS";
    /** {@code #=GR <seqname> <feature> <Generic per-Residue annotation, exactly 1 char per residue>} */
    private static final String GENERIC_PER_RESIDUE_ANNOTATION = "GR";

    /** Index just past the four-character {@code #=Gx} markup prefix of an annotation line. */
    private static final int MARKUP_PREFIX_LENGTH = 4;

    // COMPULSORY FIELDS
    /** Accession number in form PFxxxxx (Pfam) or RFxxxxx (Rfam). */
    private static final String GF_ACCESSION_NUMBER = "AC";
    /** One word name for family. */
    private static final String GF_IDENTIFICATION = "ID";
    /** Short description of family. */
    private static final String GF_DEFINITION = "DE";
    /** Authors of the entry. */
    private static final String GF_AUTHOR = "AU";
    /**
     * Indicates the order that ls and fs matches are aligned to the model to give the full alignment. (OBSOLETE IN
     * HMMER3)
     */
    private static final String GF_ALIGNMENT_METHOD = "AM";
    /** Command line used to generate the model */
    private static final String GF_BUILD_METHOD = "BM";
    /** Command line used to perform the search */
    private static final String GF_SEARCH_METHOD = "SM";
    /** The source suggesting the seed members belong to one family. */
    private static final String GF_SOURCE_SEED = "SE";
    /** The source (prediction or publication) of the consensus RNA secondary structure used by Rfam. */
    private static final String GF_SOURCE_STRUCTURE = "SS";
    /** Search threshold to build the full alignment. */
    private static final String GF_GATHERING_THRESHOLD = "GA";
    /** Lowest sequence score (and domain score for Pfam) of match in the full alignment. */
    private static final String GF_TRUSTED_CUTOFF = "TC";
    /** Highest sequence score (and domain score for Pfam) of match not in full alignment. */
    private static final String GF_NOISE_CUTOFF = "NC";
    /**
     * Type of family -- presently Family, Domain, Motif or Repeat for Pfam. -- a tree with roots Gene, Intron or
     * Cis-reg for Rfam.
     */
    private static final String GF_TYPE_FIELD = "TP";
    /** Number of sequences in alignment, and start of MSA. */
    private static final String GF_SEQUENCE = "SQ";

    // OPTIONAL FIELDS

    /** Comment about database reference. */
    private static final String GF_DB_COMMENT = "DC";
    /** Reference to external database. */
    private static final String GF_DB_REFERENCE = "DR";
    /** Comment about literature reference. */
    private static final String GF_REFERENCE_COMMENT = "RC";
    /** Reference Number. */
    private static final String GF_REFERENCE_NUMBER = "RN";
    /** Eight digit medline UI number. */
    private static final String GF_REFERENCE_MEDLINE = "RM";
    /** Reference Title. */
    private static final String GF_REFERENCE_TITLE = "RT";
    /** Reference Author. */
    private static final String GF_REFERENCE_AUTHOR = "RA";
    /** Journal Location. */
    private static final String GF_REFERENCE_LOCATION = "RL";
    /** Record of all previous ID lines. */
    private static final String GF_PREVIOUS_IDS = "PI";
    /** Keywords */
    private static final String GF_KEYWORDS = "KW";
    /** Comments */
    private static final String GF_COMMENT = "CC";
    /** Indicates a nested domain */
    private static final String GF_PFAM_ACCESSION = "NE";
    /** Location of nested domains - sequence ID, start and end of insert. */
    private static final String GF_LOCATION = "NL";
    /** Wikipedia page */
    private static final String GF_WIKIPEDIA_LINK = "WK";
    /** Clan accession */
    private static final String GF_CLAN = "CL";
    /** Used for listing Clan membership */
    private static final String GF_MEMBERSHIP = "MB";

    // FOR EMBEDDING TREES

    /** A tree in New Hampshire eXtended format. */
    private static final String GF_NEW_HAMPSHIRE = "NH";
    /** A unique identifier for the next tree. */
    private static final String GF_TREE_ID = "TN";

    // OTHER

    /**
     * A method used to set the bit score threshold based on the ratio of expected false positives to true positives.
     * Floating point number between 0 and 1.
     */
    private static final String GF_FALSE_DISCOVERY_RATE = "FR";

    // #=GS <seqname> <feature> <Generic per-Sequence annotation, free text>

    private static final String GS_ACCESSION_NUMBER = "AC";
    private static final String GS_DESCRIPTION = "DE";
    private static final String GS_DATABASE_REFERENCE = "DR";
    private static final String GS_ORGANISM_SPECIES = "OS";
    private static final String GS_ORGANISM_CLASSIFICATION = "OC";
    private static final String GS_LOOK = "LO";

    // #=GR <seqname> <feature> <Generic per-Residue annotation, exactly 1 char per residue>

    /**
     * For RNA {@code [.,;<>(){}[]AaBb...]},<br>
     * For protein {@code [HGIEBTSCX]}
     */
    private static final String GR_SECONDARY_STRUCTURE = "SS";
    /**
     * {@code [0-9X]}<br>
     * (0=0%-10%; ...; 9=90%-100%)
     */
    private static final String GR_SURFACE_ACCESSIBILITY = "SA";

    /** {@code [Mio]} */
    private static final String GR_TRANS_MEMBRANE = "TM";
    /**
     * {@code [0-9*]}<br>
     * (0=0.00-0.05; 1=0.05-0.15; *=0.95-1.00)
     */
    private static final String GR_POSTERIOR_PROBABILITY = "PP";
    /** {@code [*]} */
    private static final String GR_LIGAND_BINDING = "LI";
    /** {@code [*]} */
    private static final String GR_ACTIVE_SITE = "AS";
    /** {@code [*]} */
    private static final String GR_AS_PFAM_PREDICTED = "pAS";
    /** {@code [*]} */
    private static final String GR_AS_SWISSPROT = "sAS";
    /** {@code [0-2]} */
    private static final String GR_INTRON = "IN";

    // #=GC <feature> <Generic per-Column annotation, exactly 1 char per column>

    private static final String GC_SEQUENCE_CONSENSUS = "seq_cons";
    private static final String GC_SECONDARY_STRUCTURE = "SS_cons";
    private static final String GC_SURFACE_ACCESSIBILITY = "SA_cons";
    private static final String GC_TRANS_MEMBRANE = "TM_cons";
    private static final String GC_POSTERIOR_PROBABILITY = "PP_cons";
    private static final String GC_LIGAND_BINDING = "LI_cons";
    private static final String GC_ACTIVE_SITE = "AS_cons";
    private static final String GC_AS_PFAM_PREDICTED = "pAS_cons";
    private static final String GC_AS_SWISSPROT = "sAS_cons";
    private static final String GC_INTRON = "IN_cons";
    /**
     * Often the consensus RNA or protein sequence is used as a reference. Any non-gap character (eg. x's) can
     * indicate consensus/conserved/match columns; .'s or -'s indicate insert columns; ~'s indicate unaligned
     * insertions. Upper and lower case can be used to discriminate strong and weakly conserved residues
     * respectively.
     */
    private static final String GC_REFERENCE_ANNOTATION = "RF";
    /**
     * Indicates which columns in an alignment should be masked, such that the emission probabilities for match states
     * corresponding to those columns will be the background distribution.
     */
    private static final String GC_MODEL_MASK = "MM";

    /** The structure currently being filled; {@code null} until a {@code # STOCKHOLM} header line has been read. */
    private StockholmStructure stockholmStructure;

    /** Scanner over {@link #cachedInputStream}; kept between calls so that {@link #parseNext(int)} can continue. */
    Scanner internalScanner = null;
    /** The stream the {@link #internalScanner} was created for. */
    private InputStream cachedInputStream;

    /**
     * Parses a Stockholm file and returns a {@link StockholmStructure} object with its content.<br>
     * This function is meant to be used for single access to a specific file and it closes the file after doing its
     * assigned job (also when parsing fails). A subsequent call to {@link #parseNext(int)} without opening another
     * source first will throw an {@link IllegalStateException}.
     *
     * @param filename
     *            path to the file from where to read the content. see {@link InputStreamProvider} for more details.
     * @return stockholm file content
     * @throws IOException
     *             when an exception occurred while opening/reading/closing the file
     * @throws ParserException
     *             if unexpected format is encountered, or the file contains no alignment
     */
    public StockholmStructure parse(String filename) throws IOException {
        try (InputStream inStream = new InputStreamProvider().getInputStream(filename)) {
            try {
                return parse(inStream);
            } finally {
                // The stream is about to be closed: forget it so that nobody keeps reading from a dead scanner.
                if (cachedInputStream == inStream) {
                    cachedInputStream = null;
                    internalScanner = null;
                }
            }
        }
    }

    /**
     * Parses a Stockholm file and returns the {@link StockholmStructure} objects it contains.<br>
     * This function doesn't close the file after doing its assigned job; to allow for further calls of
     * {@link #parseNext(int)}.
     *
     * @see #parseNext(int)
     *
     * @param filename
     *            file from where to read the content. see {@link InputStreamProvider} for more details.
     * @param max
     *            maximum number of files to read, {@link #INFINITY} for all.
     * @return a list of {@link StockholmStructure} containing parsed structures.
     * @throws IOException
     *             when an exception occurred while opening/reading the file.
     * @throws ParserException
     *             if unexpected format is encountered
     */
    public List<StockholmStructure> parse(String filename, int max) throws IOException {
        InputStreamProvider isp = new InputStreamProvider();
        InputStream inStream = isp.getInputStream(filename);
        return parse(inStream, max);
    }

    /**
     * Parses an {@link InputStream} and returns the first contained alignment in a {@link StockholmStructure} object.
     * Used mainly for multiple files within the same input stream (e.g. when reading from Pfam flat files). <br>
     * This method leaves the stream open for further calls of {@link #parseNext(int)}.
     *
     * @see #parseNext(int)
     * @param inStream
     *            the {@link InputStream} containing the file to read.
     * @return a {@link StockholmStructure} object representing file contents.
     * @throws IOException
     *             in case an I/O Exception occurred.
     * @throws ParserException
     *             if unexpected format is encountered, or the stream contains no (further) alignment
     */
    public StockholmStructure parse(InputStream inStream) throws IOException {
        List<StockholmStructure> structures = parse(inStream, 1);
        if (structures.isEmpty()) {
            throw new ParserException("No Stockholm structure found in the input stream");
        }
        return structures.get(0);
    }

    /**
     * Parses an {@link InputStream} and returns at maximum <code>max</code> objects contained in that file.<br>
     * This method leaves the stream open for further calls of {@link #parse(InputStream, int)} (same function) or
     * {@link #parseNext(int)}.
     *
     * @see #parseNext(int)
     * @param inStream
     *            the stream to parse
     * @param max
     *            maximum number of structures to try to parse, {@link #INFINITY} to try to obtain as much as possible.
     * @return a {@link List} of {@link StockholmStructure} objects. If there are no more structures, an empty list is
     *         returned.
     * @throws IOException
     *             in case an I/O Exception occurred.
     * @throws IllegalArgumentException
     *             if <code>max</code> is negative and not {@link #INFINITY}
     */
    public List<StockholmStructure> parse(InputStream inStream, int max) throws IOException {
        if (max < INFINITY) {
            throw new IllegalArgumentException("max can't be -ve value " + max);
        }
        // A different stream invalidates the scanner that was wrapping the previous one.
        if (inStream != this.cachedInputStream) {
            this.cachedInputStream = inStream;
            this.internalScanner = null;
        }
        if (internalScanner == null) {
            internalScanner = new Scanner(inStream);
        }

        List<StockholmStructure> structures = new ArrayList<>();
        // INFINITY means "no limit": keep going until the source runs out of structures.
        while (max == INFINITY || max-- > 0) {
            StockholmStructure structure = parse(internalScanner);
            if (structure == null) {
                break;
            }
            structures.add(structure);
        }
        return structures;
    }

    /**
     * Tries to parse and return at most <code>max</code> structures from the last used file or input stream.<br>
     * One of {@link #parse(InputStream)}, {@link #parse(InputStream, int)}, or {@link #parse(String, int)} must have
     * been called before calling this function.
     *
     * @param max
     *            maximum number of structures to try to parse, {@link #INFINITY} to try to obtain as much as possible.
     * @return a {@link List} of {@link StockholmStructure} objects, empty if there are no more structures.
     * @throws IOException
     *             in case an I/O Exception occurred.
     * @throws IllegalStateException
     *             if there is no open source to continue reading from
     */
    public List<StockholmStructure> parseNext(int max) throws IOException {
        if (this.cachedInputStream == null) {
            throw new IllegalStateException(
                    "No open input stream to continue from; call parse(InputStream[, int]) or parse(String, int) first");
        }
        return parse(this.cachedInputStream, max);
    }

    /**
     * Parses a Stockholm file and returns a {@link StockholmStructure} object with its content. This method returns
     * just after reaching the end of structure delimiter line ("//"), leaving any remaining empty lines unconsumed.
     *
     * @param scanner
     *            from where to read the file content; if <code>null</code>, the scanner of the last parsed stream is
     *            used
     * @return Stockholm file content, <code>null</code> if couldn't or no more structures.
     * @throws IOException
     *             if the underlying source failed while being read
     * @throws ParserException
     *             if a malformed line is encountered
     * @throws IllegalArgumentException
     *             if <code>scanner</code> is <code>null</code> and no previous scanner is available
     */
    StockholmStructure parse(Scanner scanner) throws IOException {
        if (scanner == null) {
            if (internalScanner == null) {
                throw new IllegalArgumentException("No Scanner defined");
            }
            scanner = internalScanner;
        }

        while (scanner.hasNextLine()) {
            String line = scanner.nextLine();

            // ignore empty lines
            if (line.trim().isEmpty()) {
                continue;
            }

            if (line.startsWith("#=G")) {
                parseAnnotationLine(line);
            } else if (line.startsWith("# STOCKHOLM")) {
                startStructure(line);
            } else if ("//".equals(line.trim())) {
                // end of this alignment; following blank lines are skipped by the next call
                break;
            } else {
                // Most probably this line corresponds to a sequence. Something like:
                // O83071/192-246 MTCRAQLIAVPRASSLAEAIACAQKMRVSRVPVYERS
                requireHeader(line);
                handleSequenceLine(line);
            }
        }

        StockholmStructure structure = this.stockholmStructure;
        this.stockholmStructure = null;

        // Scanner swallows I/O failures and merely stops iterating; surface them instead of returning partial data.
        IOException readFailure = scanner.ioException();
        if (readFailure != null) {
            throw new IOException("Error parsing Stockholm file", readFailure);
        }

        if (structure != null) {
            // all aligned sequences must have the same number of columns
            int length = -1;
            Map<String, StringBuffer> sequences = structure.getSequences();
            for (StringBuffer sequence : sequences.values()) {
                if (length == -1) {
                    length = sequence.length();
                } else if (length != sequence.length()) {
                    throw new RuntimeException("Sequences have different lengths");
                }
            }
        }
        return structure;
    }

    /**
     * Starts a new structure from the header line, e.g. <code># STOCKHOLM 1.0</code>.
     *
     * @param line
     *            the header line
     * @throws ParserException
     *             if the format name or version is missing
     */
    private void startStructure(String line) {
        String[] header = line.split("\\s+");
        if (header.length < 3) {
            throw new ParserException("Malformed Stockholm header line, expected '# STOCKHOLM <version>': [" + line
                    + "]");
        }
        this.stockholmStructure = new StockholmStructure();
        this.stockholmStructure.getFileAnnotation().setFormat(header[1]);
        this.stockholmStructure.getFileAnnotation().setVersion(header[2]);
    }

    /**
     * Splits a <code>#=GF</code>, <code>#=GC</code>, <code>#=GS</code> or <code>#=GR</code> markup line into its
     * tokens and dispatches it to the matching handler. Other <code>#=G?</code> lines are silently ignored.
     *
     * @param line
     *            a line starting with <code>#=G</code>
     * @throws ParserException
     *             if the line lacks a mandatory token or precedes the header line
     */
    private void parseAnnotationLine(String line) {
        boolean perFile = line.startsWith(GENERIC_PER_FILE_ANNOTATION, 2);
        boolean perColumn = !perFile && line.startsWith(GENERIC_PER_CONSENSUS_ANNOTATION, 2);
        boolean perSequence = !perFile && !perColumn && line.startsWith(GENERIC_PER_SEQUENCE_ANNOTATION, 2);
        boolean perResidue = !perFile && !perColumn && !perSequence
                && line.startsWith(GENERIC_PER_RESIDUE_ANNOTATION, 2);
        if (!perFile && !perColumn && !perSequence && !perResidue) {
            return; // unknown markup type
        }
        requireHeader(line);

        int firstStart = skipWhitespace(line, MARKUP_PREFIX_LENGTH);
        int firstEnd = skipToken(line, firstStart);
        String firstToken = requireToken(line, firstStart, firstEnd);

        if (perFile) {
            // #=GF <featurename> <generic per-file annotation, free text>
            handleFileAnnotation(firstToken, line.substring(firstEnd).trim());
            return;
        }
        if (perColumn) {
            // #=GC <featurename> <generic per-column annotation, exactly 1 char per column>
            handleConsensusAnnotation(firstToken, line.substring(firstEnd).trim());
            return;
        }

        // #=GS <seqname> <featurename> <generic per-sequence annotation, free text>
        // #=GR <seqname> <featurename> <generic per-residue annotation, exactly 1 char per residue>
        int featureStart = skipWhitespace(line, firstEnd);
        int featureEnd = skipToken(line, featureStart);
        String featureName = requireToken(line, featureStart, featureEnd);
        String value = line.substring(featureEnd).trim();
        if (perSequence) {
            handleSequenceAnnotation(firstToken, featureName, value);
        } else {
            handleResidueAnnotation(firstToken, featureName, value);
        }
    }

    /** Returns the index of the first non-whitespace character at or after <code>from</code>, or the line length. */
    private static int skipWhitespace(String line, int from) {
        int index = from;
        while (index < line.length() && line.charAt(index) <= ' ') {
            index++;
        }
        return index;
    }

    /** Returns the index of the first whitespace character at or after <code>from</code>, or the line length. */
    private static int skipToken(String line, int from) {
        int index = from;
        while (index < line.length() && line.charAt(index) > ' ') {
            index++;
        }
        return index;
    }

    /** Extracts the token <code>[start, end)</code>, failing with a {@link ParserException} if it is empty. */
    private static String requireToken(String line, int start, int end) {
        if (start >= end) {
            throw new ParserException("Malformed annotation line, missing sequence or feature name: [" + line + "]");
        }
        return line.substring(start, end);
    }

    /** Fails with a {@link ParserException} if content shows up before a <code># STOCKHOLM</code> header line. */
    private void requireHeader(String line) {
        if (stockholmStructure == null) {
            throw new ParserException("Unexpected line before the '# STOCKHOLM' header: [" + line + "]");
        }
    }

    /**
     * Handles a line that corresponds to a sequence. <br>
     * e.g.: COATB_BPIKE/30-81 AEPNAATNYATEAMDSLKTQAIDLISQTWPVVTTVVVAGLVIRLFKKFSSKA<br>
     * N.B.: This function can't tolerate sequences with intrinsic white space.
     *
     * @param line
     *            the line to be parsed
     * @throws ParserException
     *             if the line does not consist of exactly a sequence name and a sequence
     */
    private void handleSequenceLine(String line) {
        String[] lineContent = line.split("\\s+");
        if (lineContent.length != 2) {
            throw new ParserException("Could not split sequence line into sequence name and sequence:\n" + line);
        }
        stockholmStructure.appendToSequence(lineContent[0], lineContent[1]);
    }

    /**
     * {@code #=GF <feature> <Generic per-File annotation, free text>}
     *
     * @param featureName
     *            the two-letter feature code
     * @param value
     *            the annotation text
     */
    private void handleFileAnnotation(String featureName, String value) {
        switch (featureName) {
        case GF_ACCESSION_NUMBER:
            stockholmStructure.getFileAnnotation().setGFAccessionNumber(value);
            break;
        case GF_IDENTIFICATION:
            stockholmStructure.getFileAnnotation().setGFIdentification(value);
            break;
        case GF_DB_REFERENCE:
            stockholmStructure.getFileAnnotation().addDBReference(value);
            break;
        case GF_DEFINITION:
            stockholmStructure.getFileAnnotation().setGFDefinition(value);
            break;
        case GF_AUTHOR:
            stockholmStructure.getFileAnnotation().setGFAuthors(value);
            break;
        case GF_ALIGNMENT_METHOD:
            stockholmStructure.getFileAnnotation().setAlignmentMethod(value);
            break;
        case GF_BUILD_METHOD:
            stockholmStructure.getFileAnnotation().addGFBuildMethod(value);
            break;
        case GF_SEARCH_METHOD:
            stockholmStructure.getFileAnnotation().setGFSearchMethod(value);
            break;
        case GF_SOURCE_SEED:
            stockholmStructure.getFileAnnotation().setGFSourceSeed(value);
            break;
        case GF_SOURCE_STRUCTURE:
            stockholmStructure.getFileAnnotation().setGFSourceStructure(value);
            break;
        case GF_GATHERING_THRESHOLD:
            stockholmStructure.getFileAnnotation().setGFGatheringThreshs(value);
            break;
        case GF_TRUSTED_CUTOFF:
            stockholmStructure.getFileAnnotation().setGFTrustedCutoffs(value);
            break;
        case GF_NOISE_CUTOFF:
            stockholmStructure.getFileAnnotation().setGFNoiseCutoffs(value);
            break;
        case GF_TYPE_FIELD:
            stockholmStructure.getFileAnnotation().setGFTypeField(value);
            break;
        case GF_PREVIOUS_IDS:
            stockholmStructure.getFileAnnotation().setGFPreviousIDs(value);
            break;
        case GF_SEQUENCE:
            stockholmStructure.getFileAnnotation().setGFNumSequences(value);
            break;
        case GF_DB_COMMENT:
            stockholmStructure.getFileAnnotation().setGFDBComment(value);
            break;
        case GF_REFERENCE_COMMENT:
            stockholmStructure.getFileAnnotation().setGFRefComment(value);
            break;
        case GF_REFERENCE_NUMBER:
            // a reference number opens a new literature reference; the RM/RT/RA/RL lines that follow fill it in
            StockholmFileAnnotationReference reference = new StockholmFileAnnotationReference();
            stockholmStructure.getFileAnnotation().getReferences().add(reference);
            break;
        case GF_REFERENCE_MEDLINE:
            stockholmStructure.getFileAnnotation().getReferences().lastElement().setRefMedline(value);
            break;
        case GF_REFERENCE_TITLE:
            stockholmStructure.getFileAnnotation().getReferences().lastElement().addToRefTitle(value);
            break;
        case GF_REFERENCE_AUTHOR:
            stockholmStructure.getFileAnnotation().getReferences().lastElement().addToRefAuthor(value);
            break;
        case GF_REFERENCE_LOCATION:
            stockholmStructure.getFileAnnotation().getReferences().lastElement().setRefLocation(value);
            break;
        case GF_KEYWORDS:
            stockholmStructure.getFileAnnotation().setGFKeywords(value);
            break;
        case GF_COMMENT:
            stockholmStructure.getFileAnnotation().addToGFComment(value);
            break;
        case GF_PFAM_ACCESSION:
            stockholmStructure.getFileAnnotation().setGFPfamAccession(value);
            break;
        case GF_LOCATION:
            stockholmStructure.getFileAnnotation().setGFLocation(value);
            break;
        case GF_WIKIPEDIA_LINK:
            stockholmStructure.getFileAnnotation().setGFWikipediaLink(value);
            break;
        case GF_CLAN:
            stockholmStructure.getFileAnnotation().setGFClan(value);
            break;
        case GF_MEMBERSHIP:
            stockholmStructure.getFileAnnotation().setGFMembership(value);
            break;
        case GF_NEW_HAMPSHIRE:
            stockholmStructure.getFileAnnotation().addGFNewHampshire(value);
            break;
        case GF_TREE_ID:
            stockholmStructure.getFileAnnotation().addGFTreeID(value);
            break;
        case GF_FALSE_DISCOVERY_RATE:
            stockholmStructure.getFileAnnotation().addGFFalseDiscoveryRate(value);
            break;
        default:
            // unknown feature
            logger.warn("Unknown File Feature [{}].\nPlease contact the Biojava team.", featureName);
            break;
        }
    }

    /**
     * usually a single line of:<br>
     * {@code #=GC <feature> <Generic per-Column annotation, exactly 1 char per column>}
     *
     * @param featureName
     *            the feature name
     * @param value
     *            the per-column annotation string
     */
    private void handleConsensusAnnotation(String featureName, String value) {
        switch (featureName) {
        case GC_SECONDARY_STRUCTURE:
            stockholmStructure.getConsAnnotation().setSecondaryStructure(value);
            break;
        case GC_SEQUENCE_CONSENSUS:
            stockholmStructure.getConsAnnotation().setSequenceConsensus(value);
            break;
        case GC_SURFACE_ACCESSIBILITY:
            stockholmStructure.getConsAnnotation().setSurfaceAccessibility(value);
            break;
        case GC_TRANS_MEMBRANE:
            stockholmStructure.getConsAnnotation().setTransMembrane(value);
            break;
        case GC_POSTERIOR_PROBABILITY:
            stockholmStructure.getConsAnnotation().setPosteriorProbability(value);
            break;
        case GC_LIGAND_BINDING:
            stockholmStructure.getConsAnnotation().setLigandBinding(value);
            break;
        case GC_ACTIVE_SITE:
            stockholmStructure.getConsAnnotation().setActiveSite(value);
            break;
        case GC_AS_PFAM_PREDICTED:
            stockholmStructure.getConsAnnotation().setAsPFamPredicted(value);
            break;
        case GC_AS_SWISSPROT:
            stockholmStructure.getConsAnnotation().setAsSwissProt(value);
            break;
        case GC_INTRON:
            stockholmStructure.getConsAnnotation().setIntron(value);
            break;
        case GC_REFERENCE_ANNOTATION:
            stockholmStructure.getConsAnnotation().setReferenceAnnotation(value);
            break;
        case GC_MODEL_MASK:
            stockholmStructure.getConsAnnotation().setModelMask(value);
            break;
        default:
            // unknown feature
            logger.warn("Unknown Consensus Feature [{}].\nPlease contact the Biojava team.", featureName);
            break;
        }
    }

    /**
     * {@code #=GS <seqname> <feature> <Generic per-Sequence annotation, free text>}
     *
     * @param seqName
     *            name of the sequence the annotation belongs to
     * @param featureName
     *            the two-letter feature code
     * @param value
     *            the annotation text
     */
    private void handleSequenceAnnotation(String seqName, String featureName, String value) {
        switch (featureName) {
        case GS_ACCESSION_NUMBER:
            stockholmStructure.addGSAccessionNumber(seqName, value);
            break;
        case GS_DESCRIPTION:
            stockholmStructure.addGSDescription(seqName, value);
            break;
        case GS_DATABASE_REFERENCE:
            stockholmStructure.addGSdbReference(seqName, value);
            break;
        case GS_ORGANISM_SPECIES:
            stockholmStructure.addGSOrganismSpecies(seqName, value);
            break;
        case GS_ORGANISM_CLASSIFICATION:
            stockholmStructure.addGSOrganismClassification(seqName, value);
            break;
        case GS_LOOK:
            stockholmStructure.addGSLook(seqName, value);
            break;
        default:
            // unknown feature
            logger.warn("Unknown Sequence Feature [{}].\nPlease contact the Biojava team.", featureName);
            break;
        }
    }

    /**
     * {@code #=GR <seqname> <feature> <Generic per-Residue annotation, exactly 1 char per residue>}
     *
     * @param seqName
     *            name of the sequence the annotation belongs to
     * @param featureName
     *            the feature code
     * @param value
     *            the per-residue annotation string
     */
    private void handleResidueAnnotation(String seqName, String featureName, String value) {
        switch (featureName) {
        case GR_SURFACE_ACCESSIBILITY:
            stockholmStructure.addSurfaceAccessibility(seqName, value);
            break;
        case GR_TRANS_MEMBRANE:
            stockholmStructure.addTransMembrane(seqName, value);
            break;
        case GR_POSTERIOR_PROBABILITY:
            stockholmStructure.addPosteriorProbability(seqName, value);
            break;
        case GR_LIGAND_BINDING:
            stockholmStructure.addLigandBinding(seqName, value);
            break;
        case GR_ACTIVE_SITE:
            stockholmStructure.addActiveSite(seqName, value);
            break;
        case GR_AS_PFAM_PREDICTED:
            stockholmStructure.addASPFamPredicted(seqName, value);
            break;
        case GR_AS_SWISSPROT:
            stockholmStructure.addASSwissProt(seqName, value);
            break;
        case GR_INTRON:
            stockholmStructure.addIntron(seqName, value);
            break;
        case GR_SECONDARY_STRUCTURE:
            stockholmStructure.addSecondaryStructure(seqName, value);
            break;
        default:
            // unknown feature
            logger.warn("Unknown Residue Feature [{}].\nPlease contact the Biojava team.", featureName);
            break;
        }
    }
}