/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 * created at 28 Jan 2014
 * Author: Andreas Prlic
 */

package org.biojava.nbio.genome.parsers.genename;

import org.biojava.nbio.core.util.InputStreamProvider;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;

/**
 * Parses a file from the www.genenames.org website that contains a mapping of human gene names to other databases.
 * Each data line of the tab-separated download becomes one {@link GeneName} bean.
 *
 * @author Andreas Prlic
 *
 */
public class GeneNamesParser {

    private static final Logger logger = LoggerFactory.getLogger(GeneNamesParser.class);

    /**
     * Download URL used by {@link #getGeneNames()}. The order of the {@code col=} parameters
     * is the order in which the columns are read by the parser, so the two must be kept in sync.
     */
    public static final String DEFAULT_GENENAMES_URL = "https://www.genenames.org/cgi-bin/download?title=HGNC+output+data&hgnc_dbtag=on&col=gd_app_sym&col=gd_app_name&col=gd_status&col=gd_prev_sym&col=gd_prev_name&col=gd_aliases&col=gd_pub_chrom_map&col=gd_pub_acc_ids&col=md_mim_id&col=gd_pub_refseq_ids&col=md_ensembl_id&col=md_prot_id&col=gd_hgnc_id" +
            "&status=Approved&status_opt=2&where=((gd_pub_chrom_map%20not%20like%20%27%patch%%27%20and%20gd_pub_chrom_map%20not%20like%20%27%ALT_REF%%27)%20or%20gd_pub_chrom_map%20IS%20NULL)%20and%20gd_locus_group%20%3d%20%27protein-coding%20gene%27&order_by=gd_app_sym_sort&format=text&limit=&submit=submit&.cgifields=&.cgifields=chr&.cgifields=status&.cgifields=hgnc_dbtag";

    /** Number of tab-separated columns a data line must have to be accepted. */
    private static final int EXPECTED_COLUMNS = 13;

    /**
     * Downloads the gene names from {@link #DEFAULT_GENENAMES_URL}, logs how many were parsed
     * and logs the entry whose approved symbol is {@code FOLH1}. Any exception is logged
     * rather than propagated.
     *
     * @param args ignored
     */
    public static void main(String[] args) {

        try {

            List<GeneName> geneNames = getGeneNames();

            logger.info("got {} gene names", geneNames.size());

            for (GeneName g : geneNames) {
                if ("FOLH1".equals(g.getApprovedSymbol())) {
                    logger.info("Gene Name: {}", g);
                }
            }

        } catch (Exception e) {
            // top-level entry point: report the failure instead of letting it escape
            logger.error("Exception: ", e);
        }

    }

    /**
     * Fetches {@link #DEFAULT_GENENAMES_URL} and parses the response.
     *
     * @return list of geneNames, one per accepted data line
     * @throws IOException if the URL cannot be opened or read
     */
    public static List<GeneName> getGeneNames() throws IOException {
        URL url = new URL(DEFAULT_GENENAMES_URL);

        InputStreamProvider prov = new InputStreamProvider();

        // This method opens the stream, so it is responsible for closing it,
        // also when parsing fails half-way through.
        try (InputStream inStream = prov.getInputStream(url)) {
            return getGeneNames(inStream);
        }
    }

    /**
     * Get a list of GeneNames from an input stream. The first line is treated as the legend and
     * skipped; every following line that has the expected number of columns yields one entry,
     * all other lines are logged and dropped.
     *
     * <p>The stream is read to its end but is not closed here; closing it remains the caller's job.
     *
     * @param inStream tab-separated gene name data, read as UTF-8
     * @return list of geneNames
     * @throws IOException if reading from the stream fails
     */
    public static List<GeneName> getGeneNames(InputStream inStream) throws IOException {

        ArrayList<GeneName> geneNames = new ArrayList<>();

        // Decode with a fixed charset instead of the platform default so that the result does not
        // depend on the machine the parser runs on.
        // NOTE(review): assumes the HGNC download is UTF-8 encoded - confirm.
        BufferedReader reader = new BufferedReader(new InputStreamReader(inStream, StandardCharsets.UTF_8));

        // skip the first line (it is the legend)
        reader.readLine();

        String line;
        while ((line = reader.readLine()) != null) {
            GeneName geneName = getGeneName(line);
            if (geneName != null) {
                geneNames.add(geneName);
            }
        }

        // since this is a large list, let's free up unused space...
        geneNames.trimToSize();
        return geneNames;
    }

    /**
     * Converts one tab-separated data line into a {@link GeneName}.
     *
     * @param line a single data line, may be null
     * @return the populated bean, or null if the line is null or does not split into
     *         exactly {@value #EXPECTED_COLUMNS} columns
     */
    private static GeneName getGeneName(String line) {
        // data is in this order (same as the col= parameters of DEFAULT_GENENAMES_URL):
        // [Approved Symbol, Approved Name, Status, Previous Symbols, Previous Names, Synonyms,
        //  Chromosome, Accession Numbers, OMIM ID, RefSeq IDs, Ensembl Gene ID, UniProt ID, HGNC ID]

        if (line == null) {
            return null;
        }

        String[] s = line.split("\t");

        if (s.length != EXPECTED_COLUMNS) {
            logger.warn("Line does not contain 13 data items, but {}: {}", s.length, line);
            // second message makes the column boundaries visible
            logger.warn(line.replace("\t", "|---|"));
            return null;
        }

        GeneName gn = new GeneName();

        gn.setApprovedSymbol(s[0]);
        gn.setApprovedName(s[1]);
        gn.setStatus(s[2]);
        gn.setPreviousSymbols(s[3]);
        gn.setPreviousNames(s[4]);
        gn.setSynonyms(s[5]);
        gn.setChromosome(s[6]);
        gn.setAccessionNr(s[7]);
        gn.setOmimId(s[8]);
        gn.setRefseqIds(s[9]);
        gn.setEnsemblGeneId(s[10]);
        gn.setUniprot(s[11]);
        gn.setHgncId(s[12]);

        return gn;

    }

}