001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021package org.biojava.nbio.genome.parsers.twobit; 022 023 024import org.slf4j.Logger; 025import org.slf4j.LoggerFactory; 026 027import java.io.File; 028import java.io.IOException; 029import java.io.InputStream; 030import java.io.RandomAccessFile; 031import java.util.HashMap; 032 033/** 034 * downloaded from http://storage.bioinf.fbb.msu.ru/~roman/TwoBitParser.java 035 * 036 * Class is a parser of UCSC Genome Browser file format .2bit used to store 037 * nucleotide sequence information. This class extends InputStream and can 038 * be used as it after choosing one of names of containing sequences. This 039 * parser can be used to do some work like UCSC tool named twoBitToFa. For 040 * it just run this class with input file path as single parameter and set 041 * stdout stream into output file. If you have any problems or ideas don't 042 * hesitate to contact me through email: rsutormin[at]gmail.com. 043 * @author Roman Sutormin 044 */ 045public class TwoBitParser extends InputStream { 046 047 private static final Logger logger = LoggerFactory.getLogger(TwoBitParser.class); 048 049 public int DEFAULT_BUFFER_SIZE = 10000; 050 // 051 private RandomAccessFile raf; 052 private File f; 053 private boolean reverse = false; 054 private String[] seq_names; 055 private HashMap<String,Long> seq2pos = new HashMap<String,Long>(); 056 private String cur_seq_name; 057 private long[][] cur_nn_blocks; 058 private long[][] cur_mask_blocks; 059 private long cur_seq_pos; 060 private long cur_dna_size; 061 private int cur_nn_block_num; 062 private int cur_mask_block_num; 063 private int[] cur_bits; 064 private byte[] buffer; 065 private long buffer_size; 066 private long buffer_pos; 067 private long start_file_pos; 068 private long file_pos; 069 // 070 private static final char[] bit_chars = { 071 'T','C','A','G' 072 }; 073 074 public TwoBitParser(File f) throws Exception { 075 this.f = f; 076 raf = new RandomAccessFile(f,"r"); 077 long sign = readFourBytes(); 078 if(sign==0x1A412743) { 079 logger.debug("2bit: Normal number architecture"); 080 } 081 else if(sign==0x4327411A) { 082 reverse = true; 083 logger.debug("2bit: Reverse number architecture"); 084 } 085 else throw new Exception("Wrong start signature in 2BIT format"); 086 readFourBytes(); 087 int seq_qnt = (int)readFourBytes(); 088 readFourBytes(); 089 seq_names = new String[seq_qnt]; 090 for(int i=0;i<seq_qnt;i++) { 091 int name_len = raf.read(); 092 char[] chars = new char[name_len]; 093 for(int j=0;j<name_len;j++) chars[j] = (char)raf.read(); 094 seq_names[i] = new String(chars); 095 long pos = readFourBytes(); 096 seq2pos.put(seq_names[i],pos); 097 logger.debug("2bit: Sequence name=[{}], pos={}", seq_names[i], pos); 098 } 099 } 100 private long readFourBytes() throws Exception { 101 long ret = 0; 102 if(!reverse) { 103 ret = raf.read(); 104 ret += raf.read()*0x100; 105 ret += raf.read()*0x10000; 106 ret += raf.read()*0x1000000; 107 } 108 else { 109 ret = raf.read()*0x1000000; 110 ret += raf.read()*0x10000; 111 ret += raf.read()*0x100; 112 ret += raf.read(); 113 } 114 return ret; 115 } 116 public String[] getSequenceNames() { 117 String[] ret = new String[seq_names.length]; 118 System.arraycopy(seq_names,0,ret,0,seq_names.length); 119 return ret; 120 } 121 /** 122 * Method open nucleotide stream for sequence with given name. 123 * @param seq_name name of sequence (one of returned by getSequenceNames()). 124 * @throws Exception 125 */ 126 public void setCurrentSequence(String seq_name) throws Exception { 127 if(cur_seq_name!=null) { 128 throw new Exception("Sequence ["+cur_seq_name+"] was not closed"); 129 } 130 if(seq2pos.get(seq_name)==null) { 131 throw new Exception("Sequence ["+seq_name+"] was not found in 2bit file"); 132 } 133 cur_seq_name = seq_name; 134 long pos = seq2pos.get(seq_name); 135 raf.seek(pos); 136 long dna_size = readFourBytes(); 137 logger.debug("2bit: Sequence name=[{}], dna_size={}", cur_seq_name, dna_size); 138 cur_dna_size = dna_size; 139 int nn_block_qnt = (int)readFourBytes(); 140 cur_nn_blocks = new long[nn_block_qnt][2]; 141 for(int i=0;i<nn_block_qnt;i++) { 142 cur_nn_blocks[i][0] = readFourBytes(); 143 } 144 for(int i=0;i<nn_block_qnt;i++) { 145 cur_nn_blocks[i][1] = readFourBytes(); 146 } 147 148 for(int i=0;i<nn_block_qnt;i++) { 149 logger.debug("NN-block: [{},{}] ", cur_nn_blocks[i][0], cur_nn_blocks[i][1]); 150 } 151 152 int mask_block_qnt = (int)readFourBytes(); 153 cur_mask_blocks = new long[mask_block_qnt][2]; 154 for(int i=0;i<mask_block_qnt;i++) { 155 cur_mask_blocks[i][0] = readFourBytes(); 156 } 157 for(int i=0;i<mask_block_qnt;i++) { 158 cur_mask_blocks[i][1] = readFourBytes(); 159 } 160 161 for(int i=0;i<mask_block_qnt;i++) { 162 logger.debug("[{},{}] ", cur_mask_blocks[i][0], cur_mask_blocks[i][1]); 163 } 164 165 readFourBytes(); 166 start_file_pos = raf.getFilePointer(); 167 reset(); 168 } 169 /** 170 * Method resets current position to the begining of sequence stream. 171 */ 172 @Override 173 public synchronized void reset() throws IOException { 174 cur_seq_pos = 0; 175 cur_nn_block_num = (cur_nn_blocks.length>0)?0:-1; 176 cur_mask_block_num = (cur_mask_blocks.length>0)?0:-1; 177 cur_bits = new int[4]; 178 file_pos = start_file_pos; 179 buffer_size = 0; 180 buffer_pos = -1; 181 } 182 /** 183 * @return number (starting from 0) of next readable nucleotide in sequence stream. 184 */ 185 public long getCurrentSequencePosition() { 186 if(cur_seq_name==null) throw new RuntimeException("Sequence is not set"); 187 return cur_seq_pos; 188 } 189 public void setCurrentSequencePosition(long pos) throws IOException { 190 if(cur_seq_name==null) throw new RuntimeException("Sequence is not set"); 191 if(pos>cur_dna_size) throw new RuntimeException( 192 "Postion is too high (more than "+cur_dna_size+")"); 193 if(cur_seq_pos>pos) { 194 reset(); 195 } 196 skip(pos-cur_seq_pos); 197 } 198 private void loadBits() throws IOException { 199 if((buffer==null)||(buffer_pos<0)||(file_pos<buffer_pos)|| 200 (file_pos>=buffer_pos+buffer_size)) { 201 if((buffer==null)||(buffer.length!=DEFAULT_BUFFER_SIZE)) { 202 buffer = new byte[DEFAULT_BUFFER_SIZE]; 203 } 204 buffer_pos = file_pos; 205 buffer_size = raf.read(buffer); 206 } 207 int cur_byte = buffer[(int)(file_pos-buffer_pos)]& 0xff; 208 for(int i=0;i<4;i++) { 209 cur_bits[3-i] = cur_byte%4; 210 cur_byte /= 4; 211 } 212 } 213 /** 214 * Method reads 1 nucleotide from sequence stream. You should set current sequence 215 * before use it. 216 */ 217 @Override 218 public int read() throws IOException { 219 if(cur_seq_name==null) throw new IOException("Sequence is not set"); 220 if(cur_seq_pos==cur_dna_size) { 221 logger.debug("End of sequence (file position:{})", raf.getFilePointer()); 222 return -1; 223 } 224 int bit_num = (int)cur_seq_pos%4; 225 if(bit_num==0) { 226 loadBits(); 227 } 228 else if(bit_num==3) { 229 file_pos++; 230 } 231 char ret = 'N'; 232 if((cur_nn_block_num>=0)&& 233 (cur_nn_blocks[cur_nn_block_num][0]<=cur_seq_pos)) { 234 if(cur_bits[bit_num]!=0) { 235 throw new IOException("Wrong data in NN-block ("+cur_bits[bit_num]+") "+ 236 "at position "+cur_seq_pos); 237 } 238 if(cur_nn_blocks[cur_nn_block_num][0]+cur_nn_blocks[cur_nn_block_num][1]==cur_seq_pos+1) { 239 cur_nn_block_num++; 240 if(cur_nn_block_num>=cur_nn_blocks.length) { 241 cur_nn_block_num = -1; 242 } 243 } 244 ret = 'N'; 245 } 246 else { 247 ret = bit_chars[cur_bits[bit_num]]; 248 } 249 if((cur_mask_block_num>=0)&& 250 (cur_mask_blocks[cur_mask_block_num][0]<=cur_seq_pos)) { 251 ret = Character.toLowerCase(ret); 252 if(cur_mask_blocks[cur_mask_block_num][0]+cur_mask_blocks[cur_mask_block_num][1]==cur_seq_pos+1) { 253 cur_mask_block_num++; 254 if(cur_mask_block_num>=cur_mask_blocks.length) { 255 cur_mask_block_num = -1; 256 } 257 } 258 } 259 cur_seq_pos++; 260 return ret; 261 } 262 /** 263 * Method skips n nucleotides in sequence stream. You should set current sequence 264 * before use it. 265 */ 266 @Override 267 public synchronized long skip(long n) throws IOException { 268 if(cur_seq_name==null) throw new IOException("Sequence is not set"); 269 if(n<4) { 270 int ret = 0; 271 while((ret<n)&&(read()>=0)) ret++; 272 return ret; 273 } 274 if(n>cur_dna_size-cur_seq_pos) { 275 n = cur_dna_size-cur_seq_pos; 276 } 277 cur_seq_pos += n; 278 file_pos = start_file_pos+(cur_seq_pos/4); 279 raf.seek(file_pos); 280 if((cur_seq_pos%4)!=0) { 281 loadBits(); 282 } 283 while((cur_nn_block_num>=0)&& 284 (cur_nn_blocks[cur_nn_block_num][0]+cur_nn_blocks[cur_nn_block_num][1]<=cur_seq_pos)) { 285 cur_nn_block_num++; 286 if(cur_nn_block_num>=cur_nn_blocks.length) cur_nn_block_num = -1; 287 } 288 while((cur_mask_block_num>=0)&& 289 (cur_mask_blocks[cur_mask_block_num][0]+cur_mask_blocks[cur_mask_block_num][1]<=cur_seq_pos)) { 290 cur_mask_block_num++; 291 if(cur_mask_block_num>=cur_mask_blocks.length) cur_mask_block_num = -1; 292 } 293 return n; 294 } 295 /** 296 * Method closes current sequence and it's necessary to invoke it before setting 297 * new current sequence. 298 */ 299 @Override 300 public void close() throws IOException { 301 cur_seq_name = null; 302 cur_nn_blocks = null; 303 cur_mask_blocks = null; 304 cur_seq_pos = -1; 305 cur_dna_size = -1; 306 cur_nn_block_num = -1; 307 cur_mask_block_num = -1; 308 cur_bits = null; 309 buffer_size = 0; 310 buffer_pos = -1; 311 file_pos = -1; 312 start_file_pos = -1; 313 } 314 @Override 315 public int available() throws IOException { 316 if(cur_seq_name==null) throw new IOException("Sequence is not set"); 317 return (int)(cur_dna_size-cur_seq_pos); 318 } 319 /** 320 * Method closes random access file descriptor. You can't use any reading methods 321 * after it. 322 * @throws Exception 323 */ 324 public void closeParser() throws Exception { 325 raf.close(); 326 } 327 public File getFile() { 328 return f; 329 } 330 public String loadFragment(long seq_pos,int len) throws IOException { 331 if(cur_seq_name==null) throw new IOException("Sequence is not set"); 332 setCurrentSequencePosition(seq_pos); 333 char[] ret = new char[len]; 334 int i = 0; 335 for(;i<len;i++) { 336 int ch = read(); 337 if(ch<0) break; 338 ret[i] = (char)ch; 339 } 340 return new String(ret,0,i); 341 } 342 public void printFastaSequence() throws IOException { 343 if(cur_seq_name==null) throw new RuntimeException("Sequence is not set"); 344 printFastaSequence(cur_dna_size-cur_seq_pos); 345 } 346 public void printFastaSequence(long len) throws IOException { 347 if(cur_seq_name==null) throw new RuntimeException("Sequence is not set"); 348 logger.info(">{} pos={}, len={}", cur_seq_name, cur_seq_pos, len); 349 char[] line = new char[60]; 350 boolean end = false; 351 long qnt_all = 0; 352 while(!end) { 353 int qnt = 0; 354 for(;(qnt<line.length)&&(qnt_all<len);qnt++,qnt_all++) { 355 int ch = read(); 356 if(ch<0) { 357 end = true; 358 break; 359 } 360 line[qnt] = (char)ch; 361 } 362 if(qnt>0) { 363 logger.info(new String(line,0,qnt)); 364 } 365 if(qnt_all>=len) end = true; 366 } 367 } 368 public static void main(String[] args) throws Exception { 369 if(args.length==0) { 370 logger.info("Usage: <program> <input.2bit> [<seq_name> [<start> [<length>]]]"); 371 logger.info("Resulting fasta data will be written in stdout."); 372 return; 373 } 374 TwoBitParser p = new TwoBitParser(new File(args[0])); 375 if(args.length==1) { 376 String[] names = p.getSequenceNames(); 377 for(int i=0;i<names.length;i++) { 378 p.setCurrentSequence(names[i]); 379 p.printFastaSequence(); 380 p.close(); 381 } 382 } 383 else { 384 String name = args[1]; 385 p.setCurrentSequence(name); 386 if(args.length>2) { 387 long start = Long.parseLong(args[2]); 388 p.skip(start); 389 } 390 if(args.length>3) { 391 long len = Long.parseLong(args[3]); 392 p.printFastaSequence(len); 393 } 394 else { 395 p.printFastaSequence(); 396 } 397 p.close(); 398 } 399 p.closeParser(); 400 } 401} 402