001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 * created at 20 Feb 2014
021 * Author: ap3
022 */
023
024package org.biojava.nbio.genome.parsers.cytoband;
025
026import org.slf4j.Logger;
027import org.slf4j.LoggerFactory;
028
029import java.io.BufferedReader;
030import java.io.IOException;
031import java.io.InputStream;
032import java.io.InputStreamReader;
033import java.net.URL;
034import java.util.SortedSet;
035import java.util.TreeSet;
036import java.util.zip.GZIPInputStream;
037
038/**
039 * Parses the cytoband (karyotype) file from UCSC.
040 *
041 */
042public class CytobandParser {
043
044        private static final Logger logger = LoggerFactory
045                        .getLogger(CytobandParser.class);
046
047        public static final String DEFAULT_LOCATION = "http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/cytoBand.txt.gz";
048
049        public static void main(String[] args) {
050
051                CytobandParser me = new CytobandParser();
052                try {
053                        SortedSet<Cytoband> cytobands = me.getAllCytobands(new URL(
054                                        DEFAULT_LOCATION));
055                        SortedSet<StainType> types = new TreeSet<>();
056                        for (Cytoband c : cytobands) {
057                                logger.info("Cytoband: {}", c);
058                                if (!types.contains(c.getType()))
059                                        types.add(c.getType());
060                        }
061                        logger.info("Strain Type: {}", types);
062                } catch (Exception e) {
063                        // TODO Auto-generated catch block
064                        logger.error("Exception: ", e);
065                }
066
067        }
068
069        public SortedSet<Cytoband> getAllCytobands(URL u) throws IOException {
070                InputStream stream = new GZIPInputStream(u.openStream());
071                return getAllCytobands(stream);
072
073        }
074
075        public SortedSet<Cytoband> getAllCytobands(InputStream instream)
076                        throws IOException {
077                BufferedReader reader = new BufferedReader(new InputStreamReader(
078                                instream));
079                String line = null;
080                SortedSet<Cytoband> cytobands = new TreeSet<>();
081                while ((line = reader.readLine()) != null) {
082                        String[] spl = line.split("\t");
083                        if (spl.length != 5) {
084                                logger.warn(
085                                                "WRONG LINE LENGHT, expected 5, but got {} for: {}",
086                                                spl.length, line);
087                        }
088
089                        Cytoband b = new Cytoband();
090                        b.setChromosome(spl[0]);
091                        b.setStart(Integer.parseInt(spl[1]));
092                        b.setEnd(Integer.parseInt(spl[2]));
093                        b.setLocus(spl[3]);
094                        StainType type = StainType.getStainTypeFromString(spl[4]);
095                        if (type == null)
096                                logger.warn("unknown type: {}", spl[4]);
097                        b.setType(type);
098                        cytobands.add(b);
099                }
100
101                return cytobands;
102        }
103
104}