001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021 022package org.biojavax.bio.taxa.io; 023 024import java.io.BufferedReader; 025import java.io.IOException; 026 027import org.biojava.bio.seq.io.ParseException; 028import org.biojava.utils.ChangeVetoException; 029import org.biojavax.RichObjectFactory; 030import org.biojavax.bio.taxa.NCBITaxon; 031import org.biojavax.bio.taxa.SimpleNCBITaxon; 032 033/** 034 * Loads NCBI taxon information from names.dmp and nodes.dmp, which are 035 * two of the files in the archive downloadable at ftp://ftp.ncbi.nih.gov/pub/taxonomy/ . 036 * This simple implementation makes no attempt to process deletions 037 * or merges - it merely creates instances as it goes along, reusing 038 * any that may already exist. 039 * 040 * @author Richard Holland 041 * @since 1.5 042 */ 043public class SimpleNCBITaxonomyLoader implements NCBITaxonomyLoader { 044 045 /** 046 * {@inheritDoc} 047 */ 048 public NCBITaxon readNode(BufferedReader nodes) throws IOException, ParseException { 049 if (nodes==null) throw new IllegalArgumentException("Nodes file cannot be null"); 050 String line; 051 // parse nodes first 052 if ((line=nodes.readLine())!=null) { 053 /* separated by '\t|\t' 054 tax_id -- node id in GenBank taxonomy database 055 parent tax_id -- parent node id in GenBank taxonomy database 056 rank -- rank of this node (superkingdom, kingdom, ...) 057 embl code -- locus-name prefix; not unique 058 division id -- see division.dmp file 059 inherited div flag (1 or 0) -- 1 if node inherits division from parent 060 genetic code id -- see gencode.dmp file 061 inherited GC flag (1 or 0) -- 1 if node inherits genetic code from parent 062 mitochondrial genetic code id -- see gencode.dmp file 063 inherited MGC flag (1 or 0) -- 1 if node inherits mitochondrial gencode from parent 064 GenBank hidden flag (1 or 0) -- 1 if name is suppressed in GenBank entry lineage 065 hidden subtree root flag (1 or 0) -- 1 if this subtree has no sequence data yet 066 comments -- free-text comments and citations 067 */ 068 String[] parts = line.split("\\|"); 069 Integer tax_id = Integer.valueOf(parts[0].trim()); 070 String pti = parts[1].trim(); 071 Integer parent_tax_id = pti.length()>0?new Integer(pti):null; 072 String rank = parts[2].trim(); 073 Integer genetic_code = new Integer(parts[6].trim()); 074 Integer mito_code = new Integer(parts[8].trim()); 075 String isTaxonHidden = parts[10].trim();// either "0" or "1" 076 // by getting it from the factory, it auto-creates. If the user is using the 077 // HibernateRichObjectFactory, then it even auto-persists. Magic! 078 NCBITaxon t = findTaxon(new Object[]{tax_id}); 079 try { 080 t.setParentNCBITaxID(parent_tax_id); 081 t.setNodeRank(rank); 082 t.setGeneticCode(genetic_code); 083 t.setMitoGeneticCode(mito_code); 084 t.setTaxonHidden(Integer.parseInt(isTaxonHidden)==1); 085 } catch (ChangeVetoException e) { 086 throw new ParseException(e); 087 } 088 // return the node 089 return t; 090 } else return null; 091 } 092 093 protected NCBITaxon findTaxon(final Object[] theKeys) {// allows subclass to override and cast 094 return (SimpleNCBITaxon)RichObjectFactory.getObject(SimpleNCBITaxon.class, theKeys); 095 } 096 097 /** 098 * {@inheritDoc} 099 */ 100 public NCBITaxon readName(BufferedReader names) throws IOException, ParseException { 101 if (names==null) throw new IllegalArgumentException("Names file cannot be null"); 102 String line; 103 if ((line=names.readLine())!=null) { 104 /* separated by '\t|\t' 105 tax_id -- the id of node associated with this name 106 name_txt -- name itself 107 unique name -- the unique variant of this name if name not unique 108 name class -- (synonym, common name, ...) 109 */ 110 String[] parts = line.split("\\|"); 111 Integer tax_id = Integer.valueOf(parts[0].trim()); 112 String name = parts[1].trim(); 113 String nameClass = parts[3].trim(); 114 // look up the taxon from the factory 115 NCBITaxon t = (NCBITaxon)RichObjectFactory.getObject(SimpleNCBITaxon.class,new Object[]{tax_id}); 116 // add the name 117 try { 118 t.addName(nameClass,name); 119 } catch (ChangeVetoException e) { 120 throw new ParseException(e); 121 } 122 return t; 123 } else return null; 124 } 125}