001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021
022package org.biojavax.bio.taxa.io;
023
024import java.io.BufferedReader;
025import java.io.IOException;
026
027import org.biojava.bio.seq.io.ParseException;
028import org.biojava.utils.ChangeVetoException;
029import org.biojavax.RichObjectFactory;
030import org.biojavax.bio.taxa.NCBITaxon;
031import org.biojavax.bio.taxa.SimpleNCBITaxon;
032
033/**
034 * Loads NCBI taxon information from names.dmp and nodes.dmp, which are
035 * two of the files in the archive downloadable at ftp://ftp.ncbi.nih.gov/pub/taxonomy/ .
036 * This simple implementation makes no attempt to process deletions
037 * or merges - it merely creates instances as it goes along, reusing
038 * any that may already exist.
039 *
040 * @author Richard Holland
041 * @since 1.5
042 */
043public class SimpleNCBITaxonomyLoader implements NCBITaxonomyLoader {
044    
045    /**
046     * {@inheritDoc}
047     */
048    public NCBITaxon readNode(BufferedReader nodes) throws IOException, ParseException {
049        if (nodes==null) throw new IllegalArgumentException("Nodes file cannot be null");
050        String line;
051        // parse nodes first
052        if ((line=nodes.readLine())!=null) {
053                /* separated by '\t|\t'
054        tax_id                                  -- node id in GenBank taxonomy database
055        parent tax_id                           -- parent node id in GenBank taxonomy database
056        rank                                    -- rank of this node (superkingdom, kingdom, ...)
057        embl code                               -- locus-name prefix; not unique
058        division id                             -- see division.dmp file
059        inherited div flag  (1 or 0)            -- 1 if node inherits division from parent
060        genetic code id                         -- see gencode.dmp file
061        inherited GC  flag  (1 or 0)            -- 1 if node inherits genetic code from parent
062        mitochondrial genetic code id           -- see gencode.dmp file
063        inherited MGC flag  (1 or 0)            -- 1 if node inherits mitochondrial gencode from parent
064        GenBank hidden flag (1 or 0)            -- 1 if name is suppressed in GenBank entry lineage
065        hidden subtree root flag (1 or 0)       -- 1 if this subtree has no sequence data yet
066        comments                                -- free-text comments and citations
067                 */
068            String[] parts = line.split("\\|");
069            Integer tax_id = Integer.valueOf(parts[0].trim());
070            String pti = parts[1].trim();
071            Integer parent_tax_id = pti.length()>0?new Integer(pti):null;
072            String rank = parts[2].trim();
073            Integer genetic_code = new Integer(parts[6].trim());
074            Integer mito_code = new Integer(parts[8].trim());
075            String isTaxonHidden = parts[10].trim();// either "0" or "1"
076            // by getting it from the factory, it auto-creates. If the user is using the
077            // HibernateRichObjectFactory, then it even auto-persists. Magic!
078            NCBITaxon t = findTaxon(new Object[]{tax_id});
079            try {
080                t.setParentNCBITaxID(parent_tax_id);
081                t.setNodeRank(rank);
082                t.setGeneticCode(genetic_code);
083                t.setMitoGeneticCode(mito_code);
084                t.setTaxonHidden(Integer.parseInt(isTaxonHidden)==1);
085            } catch (ChangeVetoException e) {
086                throw new ParseException(e);
087            }
088            // return the node
089            return t;
090        } else return null;
091    }
092    
093    protected NCBITaxon findTaxon(final Object[] theKeys) {// allows subclass to override and cast
094        return (SimpleNCBITaxon)RichObjectFactory.getObject(SimpleNCBITaxon.class, theKeys);
095    }
096    
097    /**
098     * {@inheritDoc}
099     */
100    public NCBITaxon readName(BufferedReader names) throws IOException, ParseException {
101        if (names==null) throw new IllegalArgumentException("Names file cannot be null");
102        String line;
103        if ((line=names.readLine())!=null) {
104                /* separated by '\t|\t'
105        tax_id                                  -- the id of node associated with this name
106        name_txt                                -- name itself
107        unique name                             -- the unique variant of this name if name not unique
108        name class                              -- (synonym, common name, ...)
109                 */
110            String[] parts = line.split("\\|");
111            Integer tax_id = Integer.valueOf(parts[0].trim());
112            String name = parts[1].trim();
113            String nameClass = parts[3].trim();
114            // look up the taxon from the factory
115            NCBITaxon t = (NCBITaxon)RichObjectFactory.getObject(SimpleNCBITaxon.class,new Object[]{tax_id});
116            // add the name
117            try {
118                t.addName(nameClass,name);
119            } catch (ChangeVetoException e) {
120                throw new ParseException(e);
121            }
122            return t;
123        } else return null;
124    }
125}