001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 * created at 28 Jan 2014
021 * Author: Andreas Prlic
022 */
023
024package org.biojava.nbio.genome.parsers.genename;
025
026import org.biojava.nbio.core.util.InputStreamProvider;
027import org.slf4j.Logger;
028import org.slf4j.LoggerFactory;
029
030import java.io.BufferedReader;
031import java.io.IOException;
032import java.io.InputStream;
033import java.io.InputStreamReader;
034import java.net.URL;
035import java.util.ArrayList;
036import java.util.List;
037
038/** 
039 * Parses a file from the www.genenames.org website that contains a mapping of human gene names to other databases
040 *
041 * @author Andreas Prlic
042 *
043 */
044public class GeneNamesParser {
045
046        private static final Logger logger = LoggerFactory.getLogger(GeneNamesParser.class);
047
048        public static final String DEFAULT_GENENAMES_URL = "https://www.genenames.org/cgi-bin/download?title=HGNC+output+data&hgnc_dbtag=on&col=gd_app_sym&col=gd_app_name&col=gd_status&col=gd_prev_sym&col=gd_prev_name&col=gd_aliases&col=gd_pub_chrom_map&col=gd_pub_acc_ids&col=md_mim_id&col=gd_pub_refseq_ids&col=md_ensembl_id&col=md_prot_id&col=gd_hgnc_id" +
049                         "&status=Approved&status_opt=2&where=((gd_pub_chrom_map%20not%20like%20%27%patch%%27%20and%20gd_pub_chrom_map%20not%20like%20%27%ALT_REF%%27)%20or%20gd_pub_chrom_map%20IS%20NULL)%20and%20gd_locus_group%20%3d%20%27protein-coding%20gene%27&order_by=gd_app_sym_sort&format=text&limit=&submit=submit&.cgifields=&.cgifields=chr&.cgifields=status&.cgifields=hgnc_dbtag";
050
051        /** parses a file from the genenames website
052         *
053         * @param args
054         */
055        public static void main(String[] args) {
056
057                try {
058
059                        List<GeneName> geneNames = getGeneNames();
060
061                        logger.info("got {} gene names", geneNames.size());
062
063                        for ( GeneName g : geneNames){
064                                if ( g.getApprovedSymbol().equals("FOLH1"))
065                                        logger.info("Gene Name: {}", g);
066                        }
067                        // and returns a list of beans that contains key-value pairs for each gene name
068
069                } catch (Exception e) {
070                        // TODO Auto-generated catch block
071                        logger.error("Exception: ", e);
072                }
073
074        }
075
076
077        public static List<GeneName> getGeneNames() throws IOException{
078                URL url = new URL(DEFAULT_GENENAMES_URL);
079
080                InputStreamProvider prov = new InputStreamProvider();
081
082                InputStream inStream = prov.getInputStream(url);
083
084                return getGeneNames(inStream);
085        }
086
087        /** Get a list of GeneNames from an input stream.
088         *
089         * @param inStream
090         * @return list of geneNames
091         * @throws IOException
092         */
093        public static List<GeneName> getGeneNames(InputStream inStream) throws IOException{
094
095                ArrayList<GeneName> geneNames = new ArrayList<GeneName>();
096                BufferedReader reader = new BufferedReader(new InputStreamReader(inStream));
097
098                // skip reading first line (it is the legend)
099                String line = reader.readLine();
100
101                while ((line = reader.readLine()) != null) {
102                        // process line...
103                        //System.out.println(Arrays.toString(line.split("\t")));
104
105                        GeneName  geneName = getGeneName(line);
106                        if ( geneName != null)
107                                geneNames.add(geneName);
108                                //System.out.println(geneName);
109
110                }
111
112                // since this is a large list, let's free up unused space...
113                geneNames.trimToSize();
114                return geneNames;
115        }
116
117        private static GeneName getGeneName(String line) {
118                // data is in this order:
119                //[HGNC ID, Approved Symbol, Approved Name, Status, Previous Symbols,
120                // Previous Names, Synonyms, Chromosome, Accession Numbers, RefSeq IDs, UniProt ID(supplied by UniProt)]
121
122                if (line == null)
123                        return null;
124
125                String[] s = line.split("\t");
126
127                if ( s.length != 13) {
128                        logger.warn("Line does not contain 13 data items, but {}: {}", s.length, line);
129                        logger.warn(line.replaceAll("\t", "|---|"));
130                        return null;
131                }
132                GeneName gn = new GeneName();
133
134
135                gn.setApprovedSymbol(s[0]);
136                gn.setApprovedName(s[1]);
137                gn.setStatus(s[2]);
138                gn.setPreviousSymbols(s[3]);
139                gn.setPreviousNames(s[4]);
140                gn.setSynonyms(s[5]);
141                gn.setChromosome(s[6]);
142                gn.setAccessionNr(s[7]);
143                gn.setOmimId(s[8]);
144                gn.setRefseqIds(s[9]);
145                gn.setEnsemblGeneId(s[10]);
146                gn.setUniprot(s[11]);
147                gn.setHgncId(s[12]);
148
149                return gn;
150
151        }
152
153}