001/*
002 *                    BioJava development code
003 *
004 * This code may be freely distributed and modified under the
005 * terms of the GNU Lesser General Public Licence.  This should
006 * be distributed with the code.  If you do not have a copy,
007 * see:
008 *
009 *      http://www.gnu.org/copyleft/lesser.html
010 *
011 * Copyright for this code is held jointly by the individual
012 * authors.  These should be listed in @author doc comments.
013 *
014 * For more information on the BioJava project and its aims,
015 * or to join the biojava-l mailing list, visit the home page
016 * at:
017 *
018 *      http://www.biojava.org/
019 *
020 */
021/*
022 * @(#)UncompressInputStream.java           0.3-3 06/05/2001
023 *
024 *  This file is part of the HTTPClient package
025 *  Copyright (C) 1996-2001 Ronald Tschalar
026 *
027 *  This library is free software; you can redistribute it and/or
028 *  modify it under the terms of the GNU Lesser General Public
029 *  License as published by the Free Software Foundation; either
030 *  version 2 of the License, or (at your option) any later version.
031 *
032 *  This library is distributed in the hope that it will be useful,
033 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
034 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
035 *  Lesser General Public License for more details.
036 *
037 *  You should have received a copy of the GNU Lesser General Public
038 *  License along with this library; if not, write to the Free
039 *  Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
040 *  MA 02111-1307, USA
041 *
042 *  For questions, suggestions, bug-reports, enhancement-requests etc.
043 *  I may be contacted at:
044 *
045 *  ronald@xxxxxxxxxxxxx
046 *
047 *  The HTTPClient's home page is located at:
048 *
049 *  http://www.innovation.ch/java/HTTPClient/
050 */
051
052package org.biojava.nbio.core.util;
053
054import org.slf4j.Logger;
055import org.slf4j.LoggerFactory;
056
057import java.io.*;
058
059
060/**
061 * This class decompresses an input stream containing data compressed with
062 * the unix "compress" utility (LZC, a LZW variant). This code is based
063 * heavily on the <var>unlzw.c</var> code in <var>gzip-1.2.4</var> (written
064 * by Peter Jannesen) and the original compress code.
065 *
066 *
067 *  This version has been modified from the original 0.3-3 version by the
068 *  Unidata Program Center (support@xxxxxxxxxxxxxxxx) to make the constructor
069 *  public and to fix a couple of bugs.
070 *  Also:
071 *   - markSupported() returns false
072 *   - add uncompress() static method
073 *
074 * @version 0.3-3 06/05/2001
075 * @author  Ronald Tschalar
076 * @author  Unidata Program Center
077 * @author  Richard Holland - making LZW_MAGIC package-visible.
078 *
079 * @version 0.3-5 2008/01/19
080 * @author  Fred Hansen (zweibieren@yahoo.com)
081 *      Fixed available() and the EOF condition for mainloop.
082 *      Also added some comments.
083 * 
084 * @version 1.0 2018/01/08
085 * @author      Fred Hansen (zweibieren@yahoo.com)
086 *      added uncompress(InputStream,OutputStream)
087 *          and called it from main(String[])
088 *          and uncompress(String, FileOutputStream)
089 *      normalize indentation 
090 *      rewrite skip method
091 *      amend logging code in uncompress(String, FileOutputStream)  
092 */
093public class UncompressInputStream extends FilterInputStream {
094        private final static Logger logger 
095                = LoggerFactory.getLogger(UncompressInputStream.class);
096
097        /**
098         * @param is the input stream to decompress
099         * @throws IOException if the header is malformed
100         */
101        public UncompressInputStream(InputStream is) throws IOException {
102                super(is);
103                parse_header();
104        }
105
106
107        @Override
108        public synchronized int read() throws IOException {
109                byte[] one = new byte[1];
110                int b = read(one, 0, 1);
111                if (b == 1)
112                        return (one[0] & 0xff);
113                else
114                        return -1;
115        }
116
117
118        // string table stuff
119        private static final int TBL_CLEAR = 0x100;
120        private static final int TBL_FIRST = TBL_CLEAR + 1;
121
122        private int[] tab_prefix;
123        private byte[] tab_suffix;
124        final private int[] zeros = new int[256];
125        private byte[] stack;
126
127        // various state
128        private boolean block_mode;
129        private int n_bits;
130        private int maxbits;
131        private int maxmaxcode;
132        private int maxcode;
133        private int bitmask;
134        private int oldcode;
135        private byte finchar;
136        private int stackp;
137        private int free_ent;
138
139        /* input buffer
140                The input stream must be considered in chunks 
141                Each chunk is of length eight times the current code length.
142                Thus the chunk contains eight codes; NOT on byte boundaries.
143        */
144        final private byte[] data = new byte[10000];
145        private int 
146            bit_pos = 0,  // current bitwise location in bitstream
147            end = 0,      // index of next byte to fill in data
148            got = 0;      // number of bytes gotten by most recent read()
149        private boolean eof = false;
150        private static final int EXTRA = 64;
151
152
153        @Override
154        public synchronized int read(byte[] buf, int off, int len)
155                        throws IOException {
156                if (eof) return -1;
157                int start = off;
158
159                /* Using local copies of various variables speeds things up by as
160                 * much as 30% !
161                 */
162                int[] l_tab_prefix = tab_prefix;
163                byte[] l_tab_suffix = tab_suffix;
164                byte[] l_stack = stack;
165                int l_n_bits = n_bits;
166                int l_maxcode = maxcode;
167                int l_maxmaxcode = maxmaxcode;
168                int l_bitmask = bitmask;
169                int l_oldcode = oldcode;
170                byte l_finchar = finchar;
171                int l_stackp = stackp;
172                int l_free_ent = free_ent;
173                byte[] l_data = data;
174                int l_bit_pos = bit_pos;
175
176                // empty stack if stuff still left
177                int s_size = l_stack.length - l_stackp;
178                if (s_size > 0) {
179                        int num = (s_size >= len) ? len : s_size;
180                        System.arraycopy(l_stack, l_stackp, buf, off, num);
181                        off += num;
182                        len -= num;
183                        l_stackp += num;
184                }
185
186                if (len == 0) {
187                        stackp = l_stackp;
188                        return off - start;
189                }
190
191                // loop, filling local buffer until enough data has been decompressed
192                main_loop: do {
193                        if (end < EXTRA) fill();
194
195                        int bit_end = (got > 0) 
196                            ?  (end - end % l_n_bits) << 3      // set to a "chunk" boundary
197                            :  (end << 3) - (l_n_bits - 1);     // no more data, set to last code
198
199                        while (l_bit_pos < bit_end) {           // handle 1-byte reads correctly
200                                if (len == 0) {
201                                        n_bits = l_n_bits;
202                                        maxcode = l_maxcode;
203                                        maxmaxcode = l_maxmaxcode;
204                                        bitmask = l_bitmask;
205                                        oldcode = l_oldcode;
206                                        finchar = l_finchar;
207                                        stackp = l_stackp;
208                                        free_ent = l_free_ent;
209                                        bit_pos = l_bit_pos;
210
211                                        return off - start;
212                                }
213
214                                // check for code-width expansion
215
216                                if (l_free_ent > l_maxcode) {
217                                        int n_bytes = l_n_bits << 3;
218                                        l_bit_pos = (l_bit_pos - 1) +
219                                                        n_bytes - (l_bit_pos - 1 + n_bytes) % n_bytes;
220
221                                        l_n_bits++;
222                                        l_maxcode = (l_n_bits == maxbits) ? l_maxmaxcode :
223                                                        (1 << l_n_bits) - 1;
224
225                                        logger.debug("Code-width expanded to ", l_n_bits);
226
227                                        l_bitmask = (1 << l_n_bits) - 1;
228                                        l_bit_pos = resetbuf(l_bit_pos);
229                                        continue main_loop;
230                                }
231
232
233                                // read next code
234
235                                int pos = l_bit_pos >> 3;
236                                int code = (((l_data[pos] & 0xFF) | ((l_data[pos + 1] & 0xFF) << 8) |
237                                                ((l_data[pos + 2] & 0xFF) << 16))
238                                                >> (l_bit_pos & 0x7)) & l_bitmask;
239                                l_bit_pos += l_n_bits;
240
241
242                                // handle first iteration
243
244                                if (l_oldcode == -1) {
245                                        if (code >= 256)
246                                                throw new IOException("corrupt input: " + code +
247                                                                " > 255");
248                                        l_finchar = (byte) (l_oldcode = code);
249                                        buf[off++] = l_finchar;
250                                        len--;
251                                        continue;
252                                }
253
254
255                                // handle CLEAR code
256
257                                if (code == TBL_CLEAR && block_mode) {
258                                        System.arraycopy(zeros, 0, l_tab_prefix, 0, zeros.length);
259                                        l_free_ent = TBL_FIRST - 1;
260
261                                        int n_bytes = l_n_bits << 3;
262                                        l_bit_pos = (l_bit_pos - 1) +
263                                                        n_bytes - (l_bit_pos - 1 + n_bytes) % n_bytes;
264                                        l_n_bits = INIT_BITS;
265                                        l_maxcode = (1 << l_n_bits) - 1;
266                                        l_bitmask = l_maxcode;
267
268                                        logger.debug("Code tables reset");
269
270                                        l_bit_pos = resetbuf(l_bit_pos);
271                                        continue main_loop;
272                                }
273
274
275                                // setup
276
277                                int incode = code;
278                                l_stackp = l_stack.length;
279
280
281                                // Handle KwK case
282
283                                if (code >= l_free_ent) {
284                                        if (code > l_free_ent)
285                                                throw new IOException("corrupt input: code=" + code +
286                                                                ", free_ent=" + l_free_ent);
287
288                                        l_stack[--l_stackp] = l_finchar;
289                                        code = l_oldcode;
290                                }
291
292
293                                // Generate output characters in reverse order
294
295                                while (code >= 256) {
296                                        l_stack[--l_stackp] = l_tab_suffix[code];
297                                        code = l_tab_prefix[code];
298                                }
299                                l_finchar = l_tab_suffix[code];
300                                buf[off++] = l_finchar;
301                                len--;
302
303
304                                // And put them out in forward order
305
306                                s_size = l_stack.length - l_stackp;
307                                int num = (s_size >= len) ? len : s_size;
308                                System.arraycopy(l_stack, l_stackp, buf, off, num);
309                                off += num;
310                                len -= num;
311                                l_stackp += num;
312
313
314                                // generate new entry in table
315
316                                if (l_free_ent < l_maxmaxcode) {
317                                        l_tab_prefix[l_free_ent] = l_oldcode;
318                                        l_tab_suffix[l_free_ent] = l_finchar;
319                                        l_free_ent++;
320                                }
321
322
323                                // Remember previous code
324
325                                l_oldcode = incode;
326
327
328                                // if output buffer full, then return
329
330                                if (len == 0) {
331                                        n_bits = l_n_bits;
332                                        maxcode = l_maxcode;
333                                        bitmask = l_bitmask;
334                                        oldcode = l_oldcode;
335                                        finchar = l_finchar;
336                                        stackp = l_stackp;
337                                        free_ent = l_free_ent;
338                                        bit_pos = l_bit_pos;
339
340                                        return off - start;
341                                }
342                        }
343
344                        l_bit_pos = resetbuf(l_bit_pos);
345                } while
346                        // old code: (got>0)  fails if code width expands near EOF
347                        (got > 0            // usually true
348                        || l_bit_pos < (end << 3) - (l_n_bits - 1));  // last few bytes
349
350                n_bits = l_n_bits;
351                maxcode = l_maxcode;
352                bitmask = l_bitmask;
353                oldcode = l_oldcode;
354                finchar = l_finchar;
355                stackp = l_stackp;
356                free_ent = l_free_ent;
357                bit_pos = l_bit_pos;
358
359                eof = true;
360                return off - start;
361        }
362
363
364        /**
365         * Moves the unread data in the buffer to the beginning and resets
366         * the pointers.
367         */
368        private int resetbuf(int bit_pos) {
369                int pos = bit_pos >> 3;
370                System.arraycopy(data, pos, data, 0, end - pos);
371                end -= pos;
372                return 0;
373        }
374
375
376        private void fill() throws IOException {
377                got = in.read(data, end, data.length - 1 - end);
378                if (got > 0) end += got;
379        }
380
381
382        @Override
383        public synchronized long skip(long num) throws IOException {
384                return Math.max(0, read(new byte[(int) num]));
385        }
386
387
388        @Override
389        public synchronized int available() throws IOException {
390                if (eof) return 0;
391                // the old code was:    return in.available();
392                // it fails because this.read() can return bytes 
393                // even after in.available()  is zero
394                // -- zweibieren
395                int avail = in.available(); 
396                return (avail == 0) ? 1 : avail;
397        }
398
399
400        static final int LZW_MAGIC = 0x1f9d;
401        private static final int MAX_BITS = 16;
402        private static final int INIT_BITS = 9;
403        private static final int HDR_MAXBITS = 0x1f;
404        private static final int HDR_EXTENDED = 0x20;
405        private static final int HDR_FREE = 0x40;
406        private static final int HDR_BLOCK_MODE = 0x80;
407
408        private void parse_header() throws IOException {
409                // read in and check magic number
410                int t = in.read();
411                if (t < 0) throw new EOFException("Failed to read magic number");
412                int magic = (t & 0xff) << 8;
413                t = in.read();
414                if (t < 0) throw new EOFException("Failed to read magic number");
415                magic += t & 0xff;
416                if (magic != LZW_MAGIC)
417                        throw new IOException("Input not in compress format (read " +
418                                        "magic number 0x" +
419                                        Integer.toHexString(magic) + ")");
420
421                // read in header byte
422                int header = in.read();
423                if (header < 0) throw new EOFException("Failed to read header");
424
425                block_mode = (header & HDR_BLOCK_MODE) > 0;
426                maxbits = header & HDR_MAXBITS;
427
428                if (maxbits > MAX_BITS)
429                        throw new IOException("Stream compressed with " + maxbits +
430                                        " bits, but can only handle " + MAX_BITS +
431                                        " bits");
432
433                if ((header & HDR_EXTENDED) > 0)
434                        throw new IOException("Header extension bit set");
435
436                if ((header & HDR_FREE) > 0)
437                        throw new IOException("Header bit 6 set");
438
439                logger.debug("block mode: {}", block_mode);
440                logger.debug("max bits:   {}", maxbits);
441
442                // initialize stuff
443                maxmaxcode = 1 << maxbits;
444                n_bits = INIT_BITS;
445                maxcode = (1 << n_bits) - 1;
446                bitmask = maxcode;
447                oldcode = -1;
448                finchar = 0;
449                free_ent = block_mode ? TBL_FIRST : 256;
450
451                tab_prefix = new int[1 << maxbits];
452                tab_suffix = new byte[1 << maxbits];
453                stack = new byte[1 << maxbits];
454                stackp = stack.length;
455
456                for (int idx = 255; idx >= 0; idx--)
457                        tab_suffix[idx] = (byte) idx;
458        }
459
460        /**
461         * This stream does not support mark/reset on the stream.
462         *
463         * @return false
464         */
465        @Override
466        public boolean markSupported() {
467                return false;
468        }
469
470        /**
471         * Read a named file and uncompress it.
472         * @param fileInName Name of compressed file.
473         * @param out A destination for the result. It is closed after data is sent.
474         * @return number of bytes sent to the output stream,
475         * @throws IOException for any error
476         */
477        public static long uncompress(String fileInName, FileOutputStream out) 
478                        throws IOException {
479                long start = System.currentTimeMillis();
480                long total;
481                try (InputStream fin = new FileInputStream(fileInName)) {
482                        total = uncompress(fin, out);
483                }
484                out.close();
485
486                if (debugTiming) {
487                        long end = System.currentTimeMillis();
488                        logger.info("Decompressed {} bytes", total);
489                        UncompressInputStream.logger.info("Time: {} seconds", (end - start) / 1000);
490                }
491                return total;
492        }
493
494        /**
495        * Read an input stream and uncompress it to an output stream.
496        * @param in the incoming InputStream. It is NOT closed.
497        * @param out the destination OutputStream. It is NOT closed.
498        * @return number of bytes sent to the output stream
499        * @throws IOException for any error
500        */
501        public static long uncompress(InputStream in, OutputStream out) 
502                throws IOException {
503                UncompressInputStream ucis = new UncompressInputStream(in);
504                long total = 0;
505                byte[] buffer = new byte[100000];
506                while (true) {
507                        int bytesRead = ucis.read(buffer);
508                        if (bytesRead == -1) break;
509                        out.write(buffer, 0, bytesRead);
510                        total += bytesRead;
511                }
512                return total;
513        }
514
515        private static final boolean debugTiming = false;
516
517        /**
518         * Reads a file, uncompresses it, and sends the result to stdout.
519         * Also writes trivial statistics to stderr.
520         * @param args An array with one String element, the name of the file to read.
521         * @throws IOException for any failure
522         */
523        public static void main(String[] args) throws Exception {
524                if (args.length != 1) {
525                        logger.info("Usage: UncompressInputStream <file>");
526                        System.exit(1);
527                }
528                long beg = System.currentTimeMillis();
529
530                long tot;
531                try (InputStream in = new FileInputStream(args[0])) {
532                        tot = uncompress(in, System.out);
533                }
534
535                long end = System.currentTimeMillis();
536                logger.info("Decompressed {} bytes", tot);
537                logger.info("Time: {} seconds", (end - beg) / 1000);
538        }
539}