/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 * created at 28 Jan 2014
 * Author: ap3
 */

package org.biojava.nbio.genome.parsers.genename;

import org.biojava.nbio.genome.App;
import org.biojava.nbio.core.util.InputStreamProvider;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;

/**
 * A parser for the UCSC genome browser file that maps gene names to chromosome positions.
 *
 * <p>Every input line is expected to consist of 11 tab-separated columns, read in this order:
 * gene name, genebank id, chromosome, orientation, transcription start, transcription end,
 * CDS start, CDS end, exon count, comma-separated exon starts, comma-separated exon ends.
 * Lines that do not match this layout are logged at WARN level and skipped.
 *
 * @author Andreas Prlic
 *
 */
public class GeneChromosomePositionParser {

    private static final Logger logger = LoggerFactory.getLogger(GeneChromosomePositionParser.class);

    /** Number of tab-separated columns a well-formed line must have. */
    private static final int EXPECTED_COLUMNS = 11;

    public static final String DEFAULT_MAPPING_URL="http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/refFlat.txt.gz";

    /**
     * Small demo: downloads the default mapping file, logs how many gene positions were read
     * and logs the first entry whose gene name is {@code FOLH1}.
     *
     * @param args ignored
     */
    public static void main(String[] args){
        try {

            List<GeneChromosomePosition> genePositions = getChromosomeMappings();
            logger.info("got {} gene positions", genePositions.size());

            for (GeneChromosomePosition pos : genePositions){
                if ( "FOLH1".equals(pos.getGeneName())) {
                    logger.info("Gene Position: {}", pos);
                    break;
                }
            }

        } catch(Exception e){
            // top-level boundary of the demo: report anything that went wrong
            logger.error("Exception: ", e);
        }
    }

    /**
     * Fetches {@link #DEFAULT_MAPPING_URL} and parses it.
     *
     * <p>The stream opened here is closed before this method returns, whether parsing
     * succeeds or fails.
     *
     * @return the gene positions parsed from the default mapping file
     * @throws IOException if the URL cannot be opened or read
     */
    public static List<GeneChromosomePosition> getChromosomeMappings() throws IOException {

        URL url = new URL(DEFAULT_MAPPING_URL);

        // NOTE(review): the URL points at a .gz file; presumably InputStreamProvider
        // takes care of decompression - confirm against its documentation.
        InputStreamProvider prov = new InputStreamProvider();

        try (InputStream inStream = prov.getInputStream(url)) {
            return getChromosomeMappings(inStream);
        }
    }

    /**
     * Parses gene positions from an already opened stream, one record per line.
     *
     * <p>The stream is read to its end but is <em>not</em> closed by this method; closing it
     * remains the responsibility of the caller.
     *
     * @param inStream the stream to read the tab-separated mapping data from
     * @return the parsed gene positions, in file order; malformed lines are left out
     * @throws IOException if reading from the stream fails
     */
    public static List<GeneChromosomePosition> getChromosomeMappings(InputStream inStream) throws IOException {
        // Explicit charset so that decoding does not depend on the platform default.
        BufferedReader reader = new BufferedReader(new InputStreamReader(inStream, StandardCharsets.UTF_8));

        ArrayList<GeneChromosomePosition> gcps = new ArrayList<>();

        String line;
        while ((line = reader.readLine()) != null) {
            GeneChromosomePosition gcp = getGeneChromosomePosition(line);
            if (gcp != null) {
                gcps.add(gcp);
            }
        }

        // since this is a large list, remove empty content.
        gcps.trimToSize();
        return gcps;
    }

    /**
     * Converts a single line of the mapping file into a {@link GeneChromosomePosition}.
     *
     * @param line one line of the file, may be {@code null}
     * @return the parsed record, or {@code null} if the line is {@code null}, does not have
     *         exactly 11 tab-separated columns, has an empty orientation column, or contains
     *         a numeric column that cannot be parsed as an integer
     */
    private static GeneChromosomePosition getGeneChromosomePosition(String line) {
        if (line == null) {
            return null;
        }
        String[] spl = line.split("\t");

        if (spl.length != EXPECTED_COLUMNS) {
            logger.warn("Line does not have 11 data items, but {}: {}", spl.length, line);
            return null;
        }

        // charAt(0) below would throw on an empty column, so reject it up front
        if (spl[3].isEmpty()) {
            logger.warn("Line has an empty orientation column: {}", line);
            return null;
        }

        try {
            GeneChromosomePosition g = new GeneChromosomePosition();

            g.setGeneName(spl[0]);
            g.setGenebankId(spl[1]);
            g.setChromosome(spl[2]);
            g.setOrientation(spl[3].charAt(0));
            g.setTranscriptionStart(Integer.parseInt(spl[4]));
            g.setTranscriptionEnd(Integer.parseInt(spl[5]));
            g.setCdsStart(Integer.parseInt(spl[6]));
            g.setCdsEnd(Integer.parseInt(spl[7]));
            g.setExonCount(Integer.parseInt(spl[8]));
            g.setExonStarts(getIntegerList(spl[9]));
            g.setExonEnds(getIntegerList(spl[10]));

            return g;
        } catch (NumberFormatException e) {
            // Treat a malformed number like any other malformed line: skip it instead of
            // aborting the parse of the whole file.
            logger.warn("Line contains a value that is not a valid integer ({}): {}", e.getMessage(), line);
            return null;
        }
    }

    /**
     * Parses a comma-separated list of integers.
     *
     * <p>Empty tokens (for example from an empty input or a leading comma) are ignored.
     *
     * @param lst the comma-separated integers
     * @return the parsed values in input order
     * @throws NumberFormatException if a non-empty token is not a valid integer
     */
    private static List<Integer> getIntegerList(String lst){
        String[] spl = lst.split(",");
        // the token count is an upper bound for the result size, so no later trimming is needed
        List<Integer> l = new ArrayList<>(spl.length);
        for (String s : spl){
            if (s.isEmpty()) {
                continue;
            }
            l.add(Integer.parseInt(s));
        }
        return l;
    }
}