001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 * created at 28 Jan 2014
021 * Author: ap3
022 */
023
024package org.biojava.nbio.genome.parsers.genename;
025
import org.biojava.nbio.core.util.InputStreamProvider;
import org.biojava.nbio.genome.App;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
038
039/** A parser that parses a file from the UCSC genome browser that contains mapping of gene name to chromosome positions
040 *
041 * @author Andreas Prlic
042 *
043 */
044public class GeneChromosomePositionParser {
045
046        private static final Logger logger = LoggerFactory.getLogger(App.class);
047
048        public static final String DEFAULT_MAPPING_URL="http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/refFlat.txt.gz";
049
050        public static void main(String[] args){
051                try {
052
053                        List<GeneChromosomePosition> genePositions=     getChromosomeMappings();
054                        logger.info("got {} gene positions", genePositions.size());
055
056                        for (GeneChromosomePosition pos : genePositions){
057                                if ( "FOLH1".equals(pos.getGeneName())) {
058                                        logger.info("Gene Position: {}", pos);
059                                        break;
060                                }
061                        }
062
063                } catch(Exception e){
064                        logger.error("Exception: ", e);
065                }
066        }
067
068        public static List<GeneChromosomePosition> getChromosomeMappings() throws IOException {
069
070                URL url = new URL(DEFAULT_MAPPING_URL);
071
072                InputStreamProvider prov = new InputStreamProvider();
073
074                InputStream inStream = prov.getInputStream(url);
075
076                return getChromosomeMappings(inStream);
077        }
078
079        public static List<GeneChromosomePosition> getChromosomeMappings(InputStream inStream) throws IOException {
080                BufferedReader reader = new BufferedReader(new InputStreamReader(inStream));
081
082                ArrayList<GeneChromosomePosition> gcps = new ArrayList<>();
083
084                String line = null;
085                while ((line = reader.readLine()) != null) {
086                        GeneChromosomePosition gcp = getGeneChromosomePosition(line);
087                        if ( gcp != null)
088                                gcps.add(gcp);
089                }
090
091                // since this is a large list, remove empty content.
092                gcps.trimToSize();
093                return gcps;
094        }
095
096        private static GeneChromosomePosition getGeneChromosomePosition(String line) {
097                if ( line == null)
098                        return null;
099                String[] spl = line.split("\t");
100
101                if ( spl.length != 11) {
102                        logger.warn("Line does not have 11 data items, but {}: {}", spl.length, line);
103                        return null;
104                }
105
106                GeneChromosomePosition g = new GeneChromosomePosition();
107
108                g.setGeneName(spl[0]);
109                g.setGenebankId(spl[1]);
110                g.setChromosome(spl[2]);
111                g.setOrientation(spl[3].charAt(0));
112                g.setTranscriptionStart(Integer.parseInt(spl[4]));
113                g.setTranscriptionEnd(Integer.parseInt(spl[5]));
114                g.setCdsStart(Integer.parseInt(spl[6]));
115                g.setCdsEnd(Integer.parseInt(spl[7]));
116                g.setExonCount(Integer.parseInt(spl[8]));
117                String exonStarts = spl[9];
118                String exonEnds = spl[10];
119                g.setExonStarts(getIntegerList(exonStarts));
120                g.setExonEnds(getIntegerList(exonEnds));
121
122                //System.out.println(line);
123                //System.out.println(Arrays.asList(spl) + " " + spl.length);
124                return g;
125        }
126
127        private static List<Integer> getIntegerList(String lst){
128                String[] spl = lst.split(",");
129                ArrayList<Integer> l = new ArrayList<>();
130                for (String s : spl){
131                        l.add(Integer.parseInt(s));
132                }
133                l.trimToSize();
134                return l;
135        }
136}