/*
 * BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence. This should
 * be distributed with the code. If you do not have a copy,
 * see:
 *
 * http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors. These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 * http://www.biojava.org/
 *
 * created at 20 Feb 2014
 * Author: ap3
 */

package org.biojava.nbio.genome.parsers.cytoband;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.zip.GZIPInputStream;

/**
 * Parses the cytoband (karyotype) file from UCSC.
 *
 * <p>Each input line is split on tab characters into five columns, read in
 * this order: chromosome, start, end, locus and stain type.
 */
public class CytobandParser {

	private static final Logger logger = LoggerFactory
			.getLogger(CytobandParser.class);

	/** Location of the gzip-compressed cytoband file that {@link #main(String[])} downloads. */
	public static final String DEFAULT_LOCATION = "http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/cytoBand.txt.gz";

	/** Number of tab-separated columns a well-formed line carries. */
	private static final int EXPECTED_COLUMNS = 5;

	/**
	 * Downloads the file at {@link #DEFAULT_LOCATION}, logs every parsed
	 * cytoband and finally logs the set of distinct stain types encountered.
	 * Any failure is logged rather than propagated.
	 *
	 * @param args ignored
	 */
	public static void main(String[] args) {

		CytobandParser me = new CytobandParser();
		try {
			SortedSet<Cytoband> cytobands = me.getAllCytobands(new URL(
					DEFAULT_LOCATION));
			SortedSet<StainType> types = new TreeSet<>();
			for (Cytoband c : cytobands) {
				logger.info("Cytoband: {}", c);
				// The parser stores null for stain names it does not
				// recognise, and a naturally ordered TreeSet rejects null
				// with a NullPointerException, so such entries are left out.
				StainType type = c.getType();
				if (type != null) {
					types.add(type);
				}
			}
			logger.info("Strain Type: {}", types);
		} catch (Exception e) {
			// Top-level boundary of a command line tool: report and exit normally.
			logger.error("Exception: ", e);
		}

	}

	/**
	 * Reads and parses a gzip-compressed cytoband file from a URL. The
	 * connection opened here is closed before this method returns, whether
	 * parsing succeeds or fails.
	 *
	 * @param u location of the gzip-compressed cytoband file
	 * @return the parsed cytobands, in their natural order
	 * @throws IOException if the URL cannot be opened, the content is not
	 *             valid gzip data, or reading fails
	 */
	public SortedSet<Cytoband> getAllCytobands(URL u) throws IOException {
		// Two resources so that the raw stream is still closed when the
		// GZIPInputStream constructor itself fails (e.g. bad gzip header).
		try (InputStream raw = u.openStream();
				InputStream stream = new GZIPInputStream(raw)) {
			return getAllCytobands(stream);
		}
	}

	/**
	 * Parses an uncompressed cytoband file from a stream, one cytoband per
	 * line. The stream is read to its end but is <em>not</em> closed; that
	 * remains the responsibility of the caller that opened it.
	 *
	 * <p>Lines whose column count differs from five are reported with a
	 * warning. Lines with fewer than five columns (including blank lines)
	 * cannot be parsed and are skipped; lines with extra columns are parsed
	 * from their first five columns. A stain name that
	 * {@code StainType.getStainTypeFromString} does not resolve is reported
	 * with a warning and the cytoband is stored with a {@code null} type.
	 *
	 * @param instream stream delivering the tab-separated cytoband text
	 * @return the parsed cytobands, in their natural order
	 * @throws IOException if reading from the stream fails
	 * @throws NumberFormatException if a start or end column is not an
	 *             integer
	 */
	public SortedSet<Cytoband> getAllCytobands(InputStream instream)
			throws IOException {
		// Explicit charset so that parsing does not depend on the platform default.
		BufferedReader reader = new BufferedReader(new InputStreamReader(
				instream, StandardCharsets.UTF_8));
		String line = null;
		SortedSet<Cytoband> cytobands = new TreeSet<>();
		while ((line = reader.readLine()) != null) {
			String[] spl = line.split("\t");
			if (spl.length != EXPECTED_COLUMNS) {
				logger.warn(
						"WRONG LINE LENGHT, expected 5, but got {} for: {}",
						spl.length, line);
				if (spl.length < EXPECTED_COLUMNS) {
					// Indexing the missing columns would throw
					// ArrayIndexOutOfBoundsException; skip the line instead.
					continue;
				}
			}

			Cytoband b = new Cytoband();
			b.setChromosome(spl[0]);
			b.setStart(Integer.parseInt(spl[1]));
			b.setEnd(Integer.parseInt(spl[2]));
			b.setLocus(spl[3]);
			StainType type = StainType.getStainTypeFromString(spl[4]);
			if (type == null) {
				logger.warn("unknown type: {}", spl[4]);
			}
			b.setType(type);
			cytobands.add(b);
		}

		return cytobands;
	}

}