001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021package org.biojava.nbio.genome.parsers.twobit;
022
023
024import org.slf4j.Logger;
025import org.slf4j.LoggerFactory;
026
027import java.io.File;
028import java.io.IOException;
029import java.io.InputStream;
030import java.io.RandomAccessFile;
031import java.util.HashMap;
032
033/**
034 * downloaded from http://storage.bioinf.fbb.msu.ru/~roman/TwoBitParser.java
035 *
036 * Class is a parser of UCSC Genome Browser file format .2bit used to store
037 * nucleotide sequence information. This class extends InputStream and can
038 * be used as it after choosing one of names of containing sequences. This
039 * parser can be used to do some work like UCSC tool named twoBitToFa. For
040 * it just run this class with input file path as single parameter and set
041 * stdout stream into output file. If you have any problems or ideas don't
042 * hesitate to contact me through email: rsutormin[at]gmail.com.
043 * @author Roman Sutormin
044 */
045public class TwoBitParser extends InputStream {
046
047        private static final Logger logger = LoggerFactory.getLogger(TwoBitParser.class);
048
049        public int DEFAULT_BUFFER_SIZE = 10000;
050        //
051        private RandomAccessFile raf;
052        private File f;
053        private boolean reverse = false;
054        private String[] seq_names;
055        private HashMap<String,Long> seq2pos = new HashMap<String,Long>();
056        private String cur_seq_name;
057        private long[][] cur_nn_blocks;
058        private long[][] cur_mask_blocks;
059        private long cur_seq_pos;
060        private long cur_dna_size;
061        private int cur_nn_block_num;
062        private int cur_mask_block_num;
063        private int[] cur_bits;
064        private byte[] buffer;
065        private long buffer_size;
066        private long buffer_pos;
067        private long start_file_pos;
068        private long file_pos;
069        //
070        private static final char[] bit_chars = {
071                        'T','C','A','G'
072        };
073
074        public TwoBitParser(File f) throws Exception {
075                this.f = f;
076                raf = new RandomAccessFile(f,"r");
077                long sign = readFourBytes();
078                if(sign==0x1A412743) {
079                        logger.debug("2bit: Normal number architecture");
080                }
081                else if(sign==0x4327411A) {
082                        reverse = true;
083                        logger.debug("2bit: Reverse number architecture");
084                }
085                else throw new Exception("Wrong start signature in 2BIT format");
086                readFourBytes();
087                int seq_qnt = (int)readFourBytes();
088                readFourBytes();
089                seq_names = new String[seq_qnt];
090                for(int i=0;i<seq_qnt;i++) {
091                        int name_len = raf.read();
092                        char[] chars = new char[name_len];
093                        for(int j=0;j<name_len;j++) chars[j] = (char)raf.read();
094                        seq_names[i] = new String(chars);
095                        long pos = readFourBytes();
096                        seq2pos.put(seq_names[i],pos);
097                        logger.debug("2bit: Sequence name=[{}], pos={}", seq_names[i], pos);
098                }
099        }
100        private long readFourBytes() throws Exception {
101                long ret = 0;
102                if(!reverse) {
103                        ret = raf.read();
104                        ret += raf.read()*0x100;
105                        ret += raf.read()*0x10000;
106                        ret += raf.read()*0x1000000;
107                }
108                else {
109                        ret = raf.read()*0x1000000;
110                        ret += raf.read()*0x10000;
111                        ret += raf.read()*0x100;
112                        ret += raf.read();
113                }
114                return ret;
115        }
116        public String[] getSequenceNames() {
117                String[] ret = new String[seq_names.length];
118                System.arraycopy(seq_names,0,ret,0,seq_names.length);
119                return ret;
120        }
121        /**
122         * Method open nucleotide stream for sequence with given name.
123         * @param seq_name name of sequence (one of returned by getSequenceNames()).
124         * @throws Exception
125         */
126        public void setCurrentSequence(String seq_name) throws Exception {
127                if(cur_seq_name!=null) {
128                        throw new Exception("Sequence ["+cur_seq_name+"] was not closed");
129                }
130                if(seq2pos.get(seq_name)==null) {
131                        throw new Exception("Sequence ["+seq_name+"] was not found in 2bit file");
132                }
133                cur_seq_name = seq_name;
134                long pos = seq2pos.get(seq_name);
135                raf.seek(pos);
136                long dna_size = readFourBytes();
137                logger.debug("2bit: Sequence name=[{}], dna_size={}", cur_seq_name, dna_size);
138                cur_dna_size = dna_size;
139                int nn_block_qnt = (int)readFourBytes();
140                cur_nn_blocks = new long[nn_block_qnt][2];
141                for(int i=0;i<nn_block_qnt;i++) {
142                        cur_nn_blocks[i][0] = readFourBytes();
143                }
144                for(int i=0;i<nn_block_qnt;i++) {
145                        cur_nn_blocks[i][1] = readFourBytes();
146                }
147
148                for(int i=0;i<nn_block_qnt;i++) {
149                        logger.debug("NN-block: [{},{}] ", cur_nn_blocks[i][0], cur_nn_blocks[i][1]);
150                }
151
152                int mask_block_qnt = (int)readFourBytes();
153                cur_mask_blocks = new long[mask_block_qnt][2];
154                for(int i=0;i<mask_block_qnt;i++) {
155                        cur_mask_blocks[i][0] = readFourBytes();
156                }
157                for(int i=0;i<mask_block_qnt;i++) {
158                        cur_mask_blocks[i][1] = readFourBytes();
159                }
160
161                for(int i=0;i<mask_block_qnt;i++) {
162                        logger.debug("[{},{}] ", cur_mask_blocks[i][0], cur_mask_blocks[i][1]);
163                }
164
165                readFourBytes();
166                start_file_pos = raf.getFilePointer();
167                reset();
168        }
169        /**
170         * Method resets current position to the begining of sequence stream.
171         */
172        @Override
173        public synchronized void reset() throws IOException {
174                cur_seq_pos = 0;
175                cur_nn_block_num = (cur_nn_blocks.length>0)?0:-1;
176                cur_mask_block_num = (cur_mask_blocks.length>0)?0:-1;
177                cur_bits = new int[4];
178                file_pos = start_file_pos;
179                buffer_size = 0;
180                buffer_pos = -1;
181        }
182        /**
183         * @return number (starting from 0) of next readable nucleotide in sequence stream.
184         */
185        public long getCurrentSequencePosition() {
186                if(cur_seq_name==null) throw new RuntimeException("Sequence is not set");
187                return cur_seq_pos;
188        }
189        public void setCurrentSequencePosition(long pos) throws IOException {
190                if(cur_seq_name==null) throw new RuntimeException("Sequence is not set");
191                if(pos>cur_dna_size) throw new RuntimeException(
192                                "Postion is too high (more than "+cur_dna_size+")");
193                if(cur_seq_pos>pos) {
194                        reset();
195                }
196                skip(pos-cur_seq_pos);
197        }
198        private void loadBits() throws IOException {
199                if((buffer==null)||(buffer_pos<0)||(file_pos<buffer_pos)||
200                                (file_pos>=buffer_pos+buffer_size)) {
201                        if((buffer==null)||(buffer.length!=DEFAULT_BUFFER_SIZE)) {
202                                buffer = new byte[DEFAULT_BUFFER_SIZE];
203                        }
204                        buffer_pos = file_pos;
205                        buffer_size = raf.read(buffer);
206                }
207                int cur_byte = buffer[(int)(file_pos-buffer_pos)]& 0xff;
208                for(int i=0;i<4;i++) {
209                        cur_bits[3-i] = cur_byte%4;
210                        cur_byte /= 4;
211                }
212        }
213        /**
214         * Method reads 1 nucleotide from sequence stream. You should set current sequence
215         * before use it.
216         */
217        @Override
218        public int read() throws IOException {
219                if(cur_seq_name==null) throw new IOException("Sequence is not set");
220                if(cur_seq_pos==cur_dna_size) {
221                        logger.debug("End of sequence (file position:{})", raf.getFilePointer());
222                        return -1;
223                }
224                int bit_num = (int)cur_seq_pos%4;
225                if(bit_num==0) {
226                        loadBits();
227                }
228                else if(bit_num==3) {
229                        file_pos++;
230                }
231                char ret = 'N';
232                if((cur_nn_block_num>=0)&&
233                                (cur_nn_blocks[cur_nn_block_num][0]<=cur_seq_pos)) {
234                        if(cur_bits[bit_num]!=0) {
235                                throw new IOException("Wrong data in NN-block ("+cur_bits[bit_num]+") "+
236                                                "at position "+cur_seq_pos);
237                        }
238                        if(cur_nn_blocks[cur_nn_block_num][0]+cur_nn_blocks[cur_nn_block_num][1]==cur_seq_pos+1) {
239                                cur_nn_block_num++;
240                                if(cur_nn_block_num>=cur_nn_blocks.length) {
241                                        cur_nn_block_num = -1;
242                                }
243                        }
244                        ret = 'N';
245                }
246                else {
247                        ret = bit_chars[cur_bits[bit_num]];
248                }
249                if((cur_mask_block_num>=0)&&
250                                (cur_mask_blocks[cur_mask_block_num][0]<=cur_seq_pos)) {
251                        ret = Character.toLowerCase(ret);
252                        if(cur_mask_blocks[cur_mask_block_num][0]+cur_mask_blocks[cur_mask_block_num][1]==cur_seq_pos+1) {
253                                cur_mask_block_num++;
254                                if(cur_mask_block_num>=cur_mask_blocks.length) {
255                                        cur_mask_block_num = -1;
256                                }
257                        }
258                }
259                cur_seq_pos++;
260                return ret;
261        }
262        /**
263         * Method skips n nucleotides in sequence stream. You should set current sequence
264         * before use it.
265         */
266        @Override
267        public synchronized long skip(long n) throws IOException {
268                if(cur_seq_name==null) throw new IOException("Sequence is not set");
269                if(n<4) {
270                        int ret = 0;
271                        while((ret<n)&&(read()>=0)) ret++;
272                        return ret;
273                }
274                if(n>cur_dna_size-cur_seq_pos) {
275                        n = cur_dna_size-cur_seq_pos;
276                }
277                cur_seq_pos += n;
278                file_pos = start_file_pos+(cur_seq_pos/4);
279                raf.seek(file_pos);
280                if((cur_seq_pos%4)!=0) {
281                        loadBits();
282                }
283                while((cur_nn_block_num>=0)&&
284                                (cur_nn_blocks[cur_nn_block_num][0]+cur_nn_blocks[cur_nn_block_num][1]<=cur_seq_pos)) {
285                        cur_nn_block_num++;
286                        if(cur_nn_block_num>=cur_nn_blocks.length) cur_nn_block_num = -1;
287                }
288                while((cur_mask_block_num>=0)&&
289                                (cur_mask_blocks[cur_mask_block_num][0]+cur_mask_blocks[cur_mask_block_num][1]<=cur_seq_pos)) {
290                        cur_mask_block_num++;
291                        if(cur_mask_block_num>=cur_mask_blocks.length) cur_mask_block_num = -1;
292                }
293                return n;
294        }
295        /**
296         * Method closes current sequence and it's necessary to invoke it before setting
297         * new current sequence.
298         */
299        @Override
300        public void close() throws IOException {
301                cur_seq_name = null;
302                cur_nn_blocks = null;
303                cur_mask_blocks = null;
304                cur_seq_pos = -1;
305                cur_dna_size = -1;
306                cur_nn_block_num = -1;
307                cur_mask_block_num = -1;
308                cur_bits = null;
309                buffer_size = 0;
310                buffer_pos = -1;
311                file_pos = -1;
312                start_file_pos = -1;
313        }
314        @Override
315        public int available() throws IOException {
316                if(cur_seq_name==null) throw new IOException("Sequence is not set");
317                return (int)(cur_dna_size-cur_seq_pos);
318        }
319        /**
320         * Method closes random access file descriptor. You can't use any reading methods
321         * after it.
322         * @throws Exception
323         */
324        public void closeParser() throws Exception {
325                raf.close();
326        }
327        public File getFile() {
328                return f;
329        }
330        public String loadFragment(long seq_pos,int len) throws IOException {
331                if(cur_seq_name==null) throw new IOException("Sequence is not set");
332                setCurrentSequencePosition(seq_pos);
333                char[] ret = new char[len];
334                int i = 0;
335                for(;i<len;i++) {
336                        int ch = read();
337                        if(ch<0) break;
338                        ret[i] = (char)ch;
339                }
340                return new String(ret,0,i);
341        }
342        public void printFastaSequence() throws IOException {
343                if(cur_seq_name==null) throw new RuntimeException("Sequence is not set");
344                printFastaSequence(cur_dna_size-cur_seq_pos);
345        }
346        public void printFastaSequence(long len) throws IOException {
347                if(cur_seq_name==null) throw new RuntimeException("Sequence is not set");
348                logger.info(">{} pos={}, len={}", cur_seq_name, cur_seq_pos, len);
349                char[] line = new char[60];
350                boolean end = false;
351                long qnt_all = 0;
352                while(!end) {
353                        int qnt = 0;
354                        for(;(qnt<line.length)&&(qnt_all<len);qnt++,qnt_all++) {
355                                int ch = read();
356                                if(ch<0) {
357                                        end = true;
358                                        break;
359                                }
360                                line[qnt] = (char)ch;
361                        }
362                        if(qnt>0) {
363                                logger.info(new String(line,0,qnt));
364                        }
365                        if(qnt_all>=len) end = true;
366                }
367        }
368        public static void main(String[] args) throws Exception {
369                if(args.length==0) {
370                        logger.info("Usage: <program> <input.2bit> [<seq_name> [<start> [<length>]]]");
371                        logger.info("Resulting fasta data will be written in stdout.");
372                        return;
373                }
374                TwoBitParser p = new TwoBitParser(new File(args[0]));
375                if(args.length==1) {
376                        String[] names = p.getSequenceNames();
377                        for(int i=0;i<names.length;i++) {
378                                p.setCurrentSequence(names[i]);
379                                p.printFastaSequence();
380                                p.close();
381                        }
382                }
383                else {
384                        String name = args[1];
385                        p.setCurrentSequence(name);
386                        if(args.length>2) {
387                                long start = Long.parseLong(args[2]);
388                                p.skip(start);
389                        }
390                        if(args.length>3) {
391                                long len = Long.parseLong(args[3]);
392                                p.printFastaSequence(len);
393                        }
394                        else {
395                                p.printFastaSequence();
396                        }
397                        p.close();
398                }
399                p.closeParser();
400        }
401}
402