| 1 |
747 |
jeremybenn |
// Copyright 2011 The Go Authors. All rights reserved.
|
| 2 |
|
|
// Use of this source code is governed by a BSD-style
|
| 3 |
|
|
// license that can be found in the LICENSE file.
|
| 4 |
|
|
|
| 5 |
|
|
// Package bzip2 implements bzip2 decompression.
|
| 6 |
|
|
package bzip2
|
| 7 |
|
|
|
| 8 |
|
|
import "io"
|
| 9 |
|
|
|
| 10 |
|
|
// There's no RFC for bzip2. I used the Wikipedia page for reference and a lot
|
| 11 |
|
|
// of guessing: http://en.wikipedia.org/wiki/Bzip2
|
| 12 |
|
|
// The source code to pyflate was useful for debugging:
|
| 13 |
|
|
// http://www.paul.sladen.org/projects/pyflate
|
| 14 |
|
|
|
| 15 |
|
|
// A StructuralError is returned when the bzip2 data is found to be
|
| 16 |
|
|
// syntactically invalid.
|
| 17 |
|
|
type StructuralError string
|
| 18 |
|
|
|
| 19 |
|
|
func (s StructuralError) Error() string {
|
| 20 |
|
|
return "bzip2 data invalid: " + string(s)
|
| 21 |
|
|
}
|
| 22 |
|
|
|
| 23 |
|
|
// A reader decompresses bzip2 compressed data.
|
| 24 |
|
|
type reader struct {
|
| 25 |
|
|
br bitReader
|
| 26 |
|
|
setupDone bool // true if we have parsed the bzip2 header.
|
| 27 |
|
|
blockSize int // blockSize in bytes, i.e. 900 * 1024.
|
| 28 |
|
|
eof bool
|
| 29 |
|
|
buf []byte // stores Burrows-Wheeler transformed data.
|
| 30 |
|
|
c [256]uint // the `C' array for the inverse BWT.
|
| 31 |
|
|
tt []uint32 // mirrors the `tt' array in the bzip2 source and contains the P array in the upper 24 bits.
|
| 32 |
|
|
tPos uint32 // Index of the next output byte in tt.
|
| 33 |
|
|
|
| 34 |
|
|
preRLE []uint32 // contains the RLE data still to be processed.
|
| 35 |
|
|
preRLEUsed int // number of entries of preRLE used.
|
| 36 |
|
|
lastByte int // the last byte value seen.
|
| 37 |
|
|
byteRepeats uint // the number of repeats of lastByte seen.
|
| 38 |
|
|
repeats uint // the number of copies of lastByte to output.
|
| 39 |
|
|
}
|
| 40 |
|
|
|
| 41 |
|
|
// NewReader returns an io.Reader which decompresses bzip2 data from r.
|
| 42 |
|
|
func NewReader(r io.Reader) io.Reader {
|
| 43 |
|
|
bz2 := new(reader)
|
| 44 |
|
|
bz2.br = newBitReader(r)
|
| 45 |
|
|
return bz2
|
| 46 |
|
|
}
|
| 47 |
|
|
|
| 48 |
|
|
const bzip2FileMagic = 0x425a // "BZ"
|
| 49 |
|
|
const bzip2BlockMagic = 0x314159265359
|
| 50 |
|
|
const bzip2FinalMagic = 0x177245385090
|
| 51 |
|
|
|
| 52 |
|
|
// setup parses the bzip2 header.
|
| 53 |
|
|
func (bz2 *reader) setup() error {
|
| 54 |
|
|
br := &bz2.br
|
| 55 |
|
|
|
| 56 |
|
|
magic := br.ReadBits(16)
|
| 57 |
|
|
if magic != bzip2FileMagic {
|
| 58 |
|
|
return StructuralError("bad magic value")
|
| 59 |
|
|
}
|
| 60 |
|
|
|
| 61 |
|
|
t := br.ReadBits(8)
|
| 62 |
|
|
if t != 'h' {
|
| 63 |
|
|
return StructuralError("non-Huffman entropy encoding")
|
| 64 |
|
|
}
|
| 65 |
|
|
|
| 66 |
|
|
level := br.ReadBits(8)
|
| 67 |
|
|
if level < '1' || level > '9' {
|
| 68 |
|
|
return StructuralError("invalid compression level")
|
| 69 |
|
|
}
|
| 70 |
|
|
|
| 71 |
|
|
bz2.blockSize = 100 * 1024 * (int(level) - '0')
|
| 72 |
|
|
bz2.tt = make([]uint32, bz2.blockSize)
|
| 73 |
|
|
return nil
|
| 74 |
|
|
}
|
| 75 |
|
|
|
| 76 |
|
|
func (bz2 *reader) Read(buf []byte) (n int, err error) {
|
| 77 |
|
|
if bz2.eof {
|
| 78 |
|
|
return 0, io.EOF
|
| 79 |
|
|
}
|
| 80 |
|
|
|
| 81 |
|
|
if !bz2.setupDone {
|
| 82 |
|
|
err = bz2.setup()
|
| 83 |
|
|
brErr := bz2.br.Err()
|
| 84 |
|
|
if brErr != nil {
|
| 85 |
|
|
err = brErr
|
| 86 |
|
|
}
|
| 87 |
|
|
if err != nil {
|
| 88 |
|
|
return 0, err
|
| 89 |
|
|
}
|
| 90 |
|
|
bz2.setupDone = true
|
| 91 |
|
|
}
|
| 92 |
|
|
|
| 93 |
|
|
n, err = bz2.read(buf)
|
| 94 |
|
|
brErr := bz2.br.Err()
|
| 95 |
|
|
if brErr != nil {
|
| 96 |
|
|
err = brErr
|
| 97 |
|
|
}
|
| 98 |
|
|
return
|
| 99 |
|
|
}
|
| 100 |
|
|
|
| 101 |
|
|
func (bz2 *reader) read(buf []byte) (n int, err error) {
|
| 102 |
|
|
// bzip2 is a block based compressor, except that it has a run-length
|
| 103 |
|
|
// preprocessing step. The block based nature means that we can
|
| 104 |
|
|
// preallocate fixed-size buffers and reuse them. However, the RLE
|
| 105 |
|
|
// preprocessing would require allocating huge buffers to store the
|
| 106 |
|
|
// maximum expansion. Thus we process blocks all at once, except for
|
| 107 |
|
|
// the RLE which we decompress as required.
|
| 108 |
|
|
|
| 109 |
|
|
for (bz2.repeats > 0 || bz2.preRLEUsed < len(bz2.preRLE)) && n < len(buf) {
|
| 110 |
|
|
// We have RLE data pending.
|
| 111 |
|
|
|
| 112 |
|
|
// The run-length encoding works like this:
|
| 113 |
|
|
// Any sequence of four equal bytes is followed by a length
|
| 114 |
|
|
// byte which contains the number of repeats of that byte to
|
| 115 |
|
|
// include. (The number of repeats can be zero.) Because we are
|
| 116 |
|
|
// decompressing on-demand our state is kept in the reader
|
| 117 |
|
|
// object.
|
| 118 |
|
|
|
| 119 |
|
|
if bz2.repeats > 0 {
|
| 120 |
|
|
buf[n] = byte(bz2.lastByte)
|
| 121 |
|
|
n++
|
| 122 |
|
|
bz2.repeats--
|
| 123 |
|
|
if bz2.repeats == 0 {
|
| 124 |
|
|
bz2.lastByte = -1
|
| 125 |
|
|
}
|
| 126 |
|
|
continue
|
| 127 |
|
|
}
|
| 128 |
|
|
|
| 129 |
|
|
bz2.tPos = bz2.preRLE[bz2.tPos]
|
| 130 |
|
|
b := byte(bz2.tPos)
|
| 131 |
|
|
bz2.tPos >>= 8
|
| 132 |
|
|
bz2.preRLEUsed++
|
| 133 |
|
|
|
| 134 |
|
|
if bz2.byteRepeats == 3 {
|
| 135 |
|
|
bz2.repeats = uint(b)
|
| 136 |
|
|
bz2.byteRepeats = 0
|
| 137 |
|
|
continue
|
| 138 |
|
|
}
|
| 139 |
|
|
|
| 140 |
|
|
if bz2.lastByte == int(b) {
|
| 141 |
|
|
bz2.byteRepeats++
|
| 142 |
|
|
} else {
|
| 143 |
|
|
bz2.byteRepeats = 0
|
| 144 |
|
|
}
|
| 145 |
|
|
bz2.lastByte = int(b)
|
| 146 |
|
|
|
| 147 |
|
|
buf[n] = b
|
| 148 |
|
|
n++
|
| 149 |
|
|
}
|
| 150 |
|
|
|
| 151 |
|
|
if n > 0 {
|
| 152 |
|
|
return
|
| 153 |
|
|
}
|
| 154 |
|
|
|
| 155 |
|
|
// No RLE data is pending so we need to read a block.
|
| 156 |
|
|
|
| 157 |
|
|
br := &bz2.br
|
| 158 |
|
|
magic := br.ReadBits64(48)
|
| 159 |
|
|
if magic == bzip2FinalMagic {
|
| 160 |
|
|
br.ReadBits64(32) // ignored CRC
|
| 161 |
|
|
bz2.eof = true
|
| 162 |
|
|
return 0, io.EOF
|
| 163 |
|
|
} else if magic != bzip2BlockMagic {
|
| 164 |
|
|
return 0, StructuralError("bad magic value found")
|
| 165 |
|
|
}
|
| 166 |
|
|
|
| 167 |
|
|
err = bz2.readBlock()
|
| 168 |
|
|
if err != nil {
|
| 169 |
|
|
return 0, err
|
| 170 |
|
|
}
|
| 171 |
|
|
|
| 172 |
|
|
return bz2.read(buf)
|
| 173 |
|
|
}
|
| 174 |
|
|
|
| 175 |
|
|
// readBlock reads a bzip2 block. The magic number should already have been consumed.
|
| 176 |
|
|
func (bz2 *reader) readBlock() (err error) {
|
| 177 |
|
|
br := &bz2.br
|
| 178 |
|
|
br.ReadBits64(32) // skip checksum. TODO: check it if we can figure out what it is.
|
| 179 |
|
|
randomized := br.ReadBits(1)
|
| 180 |
|
|
if randomized != 0 {
|
| 181 |
|
|
return StructuralError("deprecated randomized files")
|
| 182 |
|
|
}
|
| 183 |
|
|
origPtr := uint(br.ReadBits(24))
|
| 184 |
|
|
|
| 185 |
|
|
// If not every byte value is used in the block (i.e., it's text) then
|
| 186 |
|
|
// the symbol set is reduced. The symbols used are stored as a
|
| 187 |
|
|
// two-level, 16x16 bitmap.
|
| 188 |
|
|
symbolRangeUsedBitmap := br.ReadBits(16)
|
| 189 |
|
|
symbolPresent := make([]bool, 256)
|
| 190 |
|
|
numSymbols := 0
|
| 191 |
|
|
for symRange := uint(0); symRange < 16; symRange++ {
|
| 192 |
|
|
if symbolRangeUsedBitmap&(1<<(15-symRange)) != 0 {
|
| 193 |
|
|
bits := br.ReadBits(16)
|
| 194 |
|
|
for symbol := uint(0); symbol < 16; symbol++ {
|
| 195 |
|
|
if bits&(1<<(15-symbol)) != 0 {
|
| 196 |
|
|
symbolPresent[16*symRange+symbol] = true
|
| 197 |
|
|
numSymbols++
|
| 198 |
|
|
}
|
| 199 |
|
|
}
|
| 200 |
|
|
}
|
| 201 |
|
|
}
|
| 202 |
|
|
|
| 203 |
|
|
// A block uses between two and six different Huffman trees.
|
| 204 |
|
|
numHuffmanTrees := br.ReadBits(3)
|
| 205 |
|
|
if numHuffmanTrees < 2 || numHuffmanTrees > 6 {
|
| 206 |
|
|
return StructuralError("invalid number of Huffman trees")
|
| 207 |
|
|
}
|
| 208 |
|
|
|
| 209 |
|
|
// The Huffman tree can switch every 50 symbols so there's a list of
|
| 210 |
|
|
// tree indexes telling us which tree to use for each 50 symbol block.
|
| 211 |
|
|
numSelectors := br.ReadBits(15)
|
| 212 |
|
|
treeIndexes := make([]uint8, numSelectors)
|
| 213 |
|
|
|
| 214 |
|
|
// The tree indexes are move-to-front transformed and stored as unary
|
| 215 |
|
|
// numbers.
|
| 216 |
|
|
mtfTreeDecoder := newMTFDecoderWithRange(numHuffmanTrees)
|
| 217 |
|
|
for i := range treeIndexes {
|
| 218 |
|
|
c := 0
|
| 219 |
|
|
for {
|
| 220 |
|
|
inc := br.ReadBits(1)
|
| 221 |
|
|
if inc == 0 {
|
| 222 |
|
|
break
|
| 223 |
|
|
}
|
| 224 |
|
|
c++
|
| 225 |
|
|
}
|
| 226 |
|
|
if c >= numHuffmanTrees {
|
| 227 |
|
|
return StructuralError("tree index too large")
|
| 228 |
|
|
}
|
| 229 |
|
|
treeIndexes[i] = uint8(mtfTreeDecoder.Decode(c))
|
| 230 |
|
|
}
|
| 231 |
|
|
|
| 232 |
|
|
// The list of symbols for the move-to-front transform is taken from
|
| 233 |
|
|
// the previously decoded symbol bitmap.
|
| 234 |
|
|
symbols := make([]byte, numSymbols)
|
| 235 |
|
|
nextSymbol := 0
|
| 236 |
|
|
for i := 0; i < 256; i++ {
|
| 237 |
|
|
if symbolPresent[i] {
|
| 238 |
|
|
symbols[nextSymbol] = byte(i)
|
| 239 |
|
|
nextSymbol++
|
| 240 |
|
|
}
|
| 241 |
|
|
}
|
| 242 |
|
|
mtf := newMTFDecoder(symbols)
|
| 243 |
|
|
|
| 244 |
|
|
numSymbols += 2 // to account for RUNA and RUNB symbols
|
| 245 |
|
|
huffmanTrees := make([]huffmanTree, numHuffmanTrees)
|
| 246 |
|
|
|
| 247 |
|
|
// Now we decode the arrays of code-lengths for each tree.
|
| 248 |
|
|
lengths := make([]uint8, numSymbols)
|
| 249 |
|
|
for i := 0; i < numHuffmanTrees; i++ {
|
| 250 |
|
|
// The code lengths are delta encoded from a 5-bit base value.
|
| 251 |
|
|
length := br.ReadBits(5)
|
| 252 |
|
|
for j := 0; j < numSymbols; j++ {
|
| 253 |
|
|
for {
|
| 254 |
|
|
if !br.ReadBit() {
|
| 255 |
|
|
break
|
| 256 |
|
|
}
|
| 257 |
|
|
if br.ReadBit() {
|
| 258 |
|
|
length--
|
| 259 |
|
|
} else {
|
| 260 |
|
|
length++
|
| 261 |
|
|
}
|
| 262 |
|
|
}
|
| 263 |
|
|
if length < 0 || length > 20 {
|
| 264 |
|
|
return StructuralError("Huffman length out of range")
|
| 265 |
|
|
}
|
| 266 |
|
|
lengths[j] = uint8(length)
|
| 267 |
|
|
}
|
| 268 |
|
|
huffmanTrees[i], err = newHuffmanTree(lengths)
|
| 269 |
|
|
if err != nil {
|
| 270 |
|
|
return err
|
| 271 |
|
|
}
|
| 272 |
|
|
}
|
| 273 |
|
|
|
| 274 |
|
|
selectorIndex := 1 // the next tree index to use
|
| 275 |
|
|
currentHuffmanTree := huffmanTrees[treeIndexes[0]]
|
| 276 |
|
|
bufIndex := 0 // indexes bz2.buf, the output buffer.
|
| 277 |
|
|
// The output of the move-to-front transform is run-length encoded and
|
| 278 |
|
|
// we merge the decoding into the Huffman parsing loop. These two
|
| 279 |
|
|
// variables accumulate the repeat count. See the Wikipedia page for
|
| 280 |
|
|
// details.
|
| 281 |
|
|
repeat := 0
|
| 282 |
|
|
repeat_power := 0
|
| 283 |
|
|
|
| 284 |
|
|
// The `C' array (used by the inverse BWT) needs to be zero initialized.
|
| 285 |
|
|
for i := range bz2.c {
|
| 286 |
|
|
bz2.c[i] = 0
|
| 287 |
|
|
}
|
| 288 |
|
|
|
| 289 |
|
|
decoded := 0 // counts the number of symbols decoded by the current tree.
|
| 290 |
|
|
for {
|
| 291 |
|
|
if decoded == 50 {
|
| 292 |
|
|
currentHuffmanTree = huffmanTrees[treeIndexes[selectorIndex]]
|
| 293 |
|
|
selectorIndex++
|
| 294 |
|
|
decoded = 0
|
| 295 |
|
|
}
|
| 296 |
|
|
|
| 297 |
|
|
v := currentHuffmanTree.Decode(br)
|
| 298 |
|
|
decoded++
|
| 299 |
|
|
|
| 300 |
|
|
if v < 2 {
|
| 301 |
|
|
// This is either the RUNA or RUNB symbol.
|
| 302 |
|
|
if repeat == 0 {
|
| 303 |
|
|
repeat_power = 1
|
| 304 |
|
|
}
|
| 305 |
|
|
repeat += repeat_power << v
|
| 306 |
|
|
repeat_power <<= 1
|
| 307 |
|
|
|
| 308 |
|
|
// This limit of 2 million comes from the bzip2 source
|
| 309 |
|
|
// code. It prevents repeat from overflowing.
|
| 310 |
|
|
if repeat > 2*1024*1024 {
|
| 311 |
|
|
return StructuralError("repeat count too large")
|
| 312 |
|
|
}
|
| 313 |
|
|
continue
|
| 314 |
|
|
}
|
| 315 |
|
|
|
| 316 |
|
|
if repeat > 0 {
|
| 317 |
|
|
// We have decoded a complete run-length so we need to
|
| 318 |
|
|
// replicate the last output symbol.
|
| 319 |
|
|
for i := 0; i < repeat; i++ {
|
| 320 |
|
|
b := byte(mtf.First())
|
| 321 |
|
|
bz2.tt[bufIndex] = uint32(b)
|
| 322 |
|
|
bz2.c[b]++
|
| 323 |
|
|
bufIndex++
|
| 324 |
|
|
}
|
| 325 |
|
|
repeat = 0
|
| 326 |
|
|
}
|
| 327 |
|
|
|
| 328 |
|
|
if int(v) == numSymbols-1 {
|
| 329 |
|
|
// This is the EOF symbol. Because it's always at the
|
| 330 |
|
|
// end of the move-to-front list, and never gets moved
|
| 331 |
|
|
// to the front, it has this unique value.
|
| 332 |
|
|
break
|
| 333 |
|
|
}
|
| 334 |
|
|
|
| 335 |
|
|
// Since two metasymbols (RUNA and RUNB) have values 0 and 1,
|
| 336 |
|
|
// one would expect |v-2| to be passed to the MTF decoder.
|
| 337 |
|
|
// However, the front of the MTF list is never referenced as 0,
|
| 338 |
|
|
// it's always referenced with a run-length of 1. Thus 0
|
| 339 |
|
|
// doesn't need to be encoded and we have |v-1| in the next
|
| 340 |
|
|
// line.
|
| 341 |
|
|
b := byte(mtf.Decode(int(v - 1)))
|
| 342 |
|
|
bz2.tt[bufIndex] = uint32(b)
|
| 343 |
|
|
bz2.c[b]++
|
| 344 |
|
|
bufIndex++
|
| 345 |
|
|
}
|
| 346 |
|
|
|
| 347 |
|
|
if origPtr >= uint(bufIndex) {
|
| 348 |
|
|
return StructuralError("origPtr out of bounds")
|
| 349 |
|
|
}
|
| 350 |
|
|
|
| 351 |
|
|
// We have completed the entropy decoding. Now we can perform the
|
| 352 |
|
|
// inverse BWT and setup the RLE buffer.
|
| 353 |
|
|
bz2.preRLE = bz2.tt[:bufIndex]
|
| 354 |
|
|
bz2.preRLEUsed = 0
|
| 355 |
|
|
bz2.tPos = inverseBWT(bz2.preRLE, origPtr, bz2.c[:])
|
| 356 |
|
|
bz2.lastByte = -1
|
| 357 |
|
|
bz2.byteRepeats = 0
|
| 358 |
|
|
bz2.repeats = 0
|
| 359 |
|
|
|
| 360 |
|
|
return nil
|
| 361 |
|
|
}
|
| 362 |
|
|
|
| 363 |
|
|
// inverseBWT implements the inverse Burrows-Wheeler transform as described in
|
| 364 |
|
|
// http://www.hpl.hp.com/techreports/Compaq-DEC/SRC-RR-124.pdf, section 4.2.
|
| 365 |
|
|
// In that document, origPtr is called `I' and c is the `C' array after the
|
| 366 |
|
|
// first pass over the data. It's an argument here because we merge the first
|
| 367 |
|
|
// pass with the Huffman decoding.
|
| 368 |
|
|
//
|
| 369 |
|
|
// This also implements the `single array' method from the bzip2 source code
|
| 370 |
|
|
// which leaves the output, still shuffled, in the bottom 8 bits of tt with the
|
| 371 |
|
|
// index of the next byte in the top 24-bits. The index of the first byte is
|
| 372 |
|
|
// returned.
|
| 373 |
|
|
func inverseBWT(tt []uint32, origPtr uint, c []uint) uint32 {
|
| 374 |
|
|
sum := uint(0)
|
| 375 |
|
|
for i := 0; i < 256; i++ {
|
| 376 |
|
|
sum += c[i]
|
| 377 |
|
|
c[i] = sum - c[i]
|
| 378 |
|
|
}
|
| 379 |
|
|
|
| 380 |
|
|
for i := range tt {
|
| 381 |
|
|
b := tt[i] & 0xff
|
| 382 |
|
|
tt[c[b]] |= uint32(i) << 8
|
| 383 |
|
|
c[b]++
|
| 384 |
|
|
}
|
| 385 |
|
|
|
| 386 |
|
|
return tt[origPtr] >> 8
|
| 387 |
|
|
}
|