OpenCores
URL https://opencores.org/ocsvn/openrisc/openrisc/trunk

Subversion Repositories openrisc

[/] [openrisc/] [trunk/] [gnu-dev/] [or1k-gcc/] [libgo/] [go/] [compress/] [bzip2/] [bzip2.go] - Blame information for rev 747

Details | Compare with Previous | View Log

Line No. Rev Author Line
1 747 jeremybenn
// Copyright 2011 The Go Authors. All rights reserved.
2
// Use of this source code is governed by a BSD-style
3
// license that can be found in the LICENSE file.
4
 
5
// Package bzip2 implements bzip2 decompression.
6
package bzip2
7
 
8
import "io"
9
 
10
// There's no RFC for bzip2. I used the Wikipedia page for reference and a lot
11
// of guessing: http://en.wikipedia.org/wiki/Bzip2
12
// The source code to pyflate was useful for debugging:
13
// http://www.paul.sladen.org/projects/pyflate
14
 
15
// A StructuralError is returned when the bzip2 data is found to be
16
// syntactically invalid.
17
type StructuralError string
18
 
19
func (s StructuralError) Error() string {
20
        return "bzip2 data invalid: " + string(s)
21
}
22
 
23
// A reader decompresses bzip2 compressed data.
24
type reader struct {
25
        br        bitReader
26
        setupDone bool // true if we have parsed the bzip2 header.
27
        blockSize int  // blockSize in bytes, i.e. 900 * 1024.
28
        eof       bool
29
        buf       []byte    // stores Burrows-Wheeler transformed data.
30
        c         [256]uint // the `C' array for the inverse BWT.
31
        tt        []uint32  // mirrors the `tt' array in the bzip2 source and contains the P array in the upper 24 bits.
32
        tPos      uint32    // Index of the next output byte in tt.
33
 
34
        preRLE      []uint32 // contains the RLE data still to be processed.
35
        preRLEUsed  int      // number of entries of preRLE used.
36
        lastByte    int      // the last byte value seen.
37
        byteRepeats uint     // the number of repeats of lastByte seen.
38
        repeats     uint     // the number of copies of lastByte to output.
39
}
40
 
41
// NewReader returns an io.Reader which decompresses bzip2 data from r.
42
func NewReader(r io.Reader) io.Reader {
43
        bz2 := new(reader)
44
        bz2.br = newBitReader(r)
45
        return bz2
46
}
47
 
48
const bzip2FileMagic = 0x425a // "BZ"
49
const bzip2BlockMagic = 0x314159265359
50
const bzip2FinalMagic = 0x177245385090
51
 
52
// setup parses the bzip2 header.
53
func (bz2 *reader) setup() error {
54
        br := &bz2.br
55
 
56
        magic := br.ReadBits(16)
57
        if magic != bzip2FileMagic {
58
                return StructuralError("bad magic value")
59
        }
60
 
61
        t := br.ReadBits(8)
62
        if t != 'h' {
63
                return StructuralError("non-Huffman entropy encoding")
64
        }
65
 
66
        level := br.ReadBits(8)
67
        if level < '1' || level > '9' {
68
                return StructuralError("invalid compression level")
69
        }
70
 
71
        bz2.blockSize = 100 * 1024 * (int(level) - '0')
72
        bz2.tt = make([]uint32, bz2.blockSize)
73
        return nil
74
}
75
 
76
func (bz2 *reader) Read(buf []byte) (n int, err error) {
77
        if bz2.eof {
78
                return 0, io.EOF
79
        }
80
 
81
        if !bz2.setupDone {
82
                err = bz2.setup()
83
                brErr := bz2.br.Err()
84
                if brErr != nil {
85
                        err = brErr
86
                }
87
                if err != nil {
88
                        return 0, err
89
                }
90
                bz2.setupDone = true
91
        }
92
 
93
        n, err = bz2.read(buf)
94
        brErr := bz2.br.Err()
95
        if brErr != nil {
96
                err = brErr
97
        }
98
        return
99
}
100
 
101
func (bz2 *reader) read(buf []byte) (n int, err error) {
102
        // bzip2 is a block based compressor, except that it has a run-length
103
        // preprocessing step. The block based nature means that we can
104
        // preallocate fixed-size buffers and reuse them. However, the RLE
105
        // preprocessing would require allocating huge buffers to store the
106
        // maximum expansion. Thus we process blocks all at once, except for
107
        // the RLE which we decompress as required.
108
 
109
        for (bz2.repeats > 0 || bz2.preRLEUsed < len(bz2.preRLE)) && n < len(buf) {
110
                // We have RLE data pending.
111
 
112
                // The run-length encoding works like this:
113
                // Any sequence of four equal bytes is followed by a length
114
                // byte which contains the number of repeats of that byte to
115
                // include. (The number of repeats can be zero.) Because we are
116
                // decompressing on-demand our state is kept in the reader
117
                // object.
118
 
119
                if bz2.repeats > 0 {
120
                        buf[n] = byte(bz2.lastByte)
121
                        n++
122
                        bz2.repeats--
123
                        if bz2.repeats == 0 {
124
                                bz2.lastByte = -1
125
                        }
126
                        continue
127
                }
128
 
129
                bz2.tPos = bz2.preRLE[bz2.tPos]
130
                b := byte(bz2.tPos)
131
                bz2.tPos >>= 8
132
                bz2.preRLEUsed++
133
 
134
                if bz2.byteRepeats == 3 {
135
                        bz2.repeats = uint(b)
136
                        bz2.byteRepeats = 0
137
                        continue
138
                }
139
 
140
                if bz2.lastByte == int(b) {
141
                        bz2.byteRepeats++
142
                } else {
143
                        bz2.byteRepeats = 0
144
                }
145
                bz2.lastByte = int(b)
146
 
147
                buf[n] = b
148
                n++
149
        }
150
 
151
        if n > 0 {
152
                return
153
        }
154
 
155
        // No RLE data is pending so we need to read a block.
156
 
157
        br := &bz2.br
158
        magic := br.ReadBits64(48)
159
        if magic == bzip2FinalMagic {
160
                br.ReadBits64(32) // ignored CRC
161
                bz2.eof = true
162
                return 0, io.EOF
163
        } else if magic != bzip2BlockMagic {
164
                return 0, StructuralError("bad magic value found")
165
        }
166
 
167
        err = bz2.readBlock()
168
        if err != nil {
169
                return 0, err
170
        }
171
 
172
        return bz2.read(buf)
173
}
174
 
175
// readBlock reads a bzip2 block. The magic number should already have been consumed.
176
func (bz2 *reader) readBlock() (err error) {
177
        br := &bz2.br
178
        br.ReadBits64(32) // skip checksum. TODO: check it if we can figure out what it is.
179
        randomized := br.ReadBits(1)
180
        if randomized != 0 {
181
                return StructuralError("deprecated randomized files")
182
        }
183
        origPtr := uint(br.ReadBits(24))
184
 
185
        // If not every byte value is used in the block (i.e., it's text) then
186
        // the symbol set is reduced. The symbols used are stored as a
187
        // two-level, 16x16 bitmap.
188
        symbolRangeUsedBitmap := br.ReadBits(16)
189
        symbolPresent := make([]bool, 256)
190
        numSymbols := 0
191
        for symRange := uint(0); symRange < 16; symRange++ {
192
                if symbolRangeUsedBitmap&(1<<(15-symRange)) != 0 {
193
                        bits := br.ReadBits(16)
194
                        for symbol := uint(0); symbol < 16; symbol++ {
195
                                if bits&(1<<(15-symbol)) != 0 {
196
                                        symbolPresent[16*symRange+symbol] = true
197
                                        numSymbols++
198
                                }
199
                        }
200
                }
201
        }
202
 
203
        // A block uses between two and six different Huffman trees.
204
        numHuffmanTrees := br.ReadBits(3)
205
        if numHuffmanTrees < 2 || numHuffmanTrees > 6 {
206
                return StructuralError("invalid number of Huffman trees")
207
        }
208
 
209
        // The Huffman tree can switch every 50 symbols so there's a list of
210
        // tree indexes telling us which tree to use for each 50 symbol block.
211
        numSelectors := br.ReadBits(15)
212
        treeIndexes := make([]uint8, numSelectors)
213
 
214
        // The tree indexes are move-to-front transformed and stored as unary
215
        // numbers.
216
        mtfTreeDecoder := newMTFDecoderWithRange(numHuffmanTrees)
217
        for i := range treeIndexes {
218
                c := 0
219
                for {
220
                        inc := br.ReadBits(1)
221
                        if inc == 0 {
222
                                break
223
                        }
224
                        c++
225
                }
226
                if c >= numHuffmanTrees {
227
                        return StructuralError("tree index too large")
228
                }
229
                treeIndexes[i] = uint8(mtfTreeDecoder.Decode(c))
230
        }
231
 
232
        // The list of symbols for the move-to-front transform is taken from
233
        // the previously decoded symbol bitmap.
234
        symbols := make([]byte, numSymbols)
235
        nextSymbol := 0
236
        for i := 0; i < 256; i++ {
237
                if symbolPresent[i] {
238
                        symbols[nextSymbol] = byte(i)
239
                        nextSymbol++
240
                }
241
        }
242
        mtf := newMTFDecoder(symbols)
243
 
244
        numSymbols += 2 // to account for RUNA and RUNB symbols
245
        huffmanTrees := make([]huffmanTree, numHuffmanTrees)
246
 
247
        // Now we decode the arrays of code-lengths for each tree.
248
        lengths := make([]uint8, numSymbols)
249
        for i := 0; i < numHuffmanTrees; i++ {
250
                // The code lengths are delta encoded from a 5-bit base value.
251
                length := br.ReadBits(5)
252
                for j := 0; j < numSymbols; j++ {
253
                        for {
254
                                if !br.ReadBit() {
255
                                        break
256
                                }
257
                                if br.ReadBit() {
258
                                        length--
259
                                } else {
260
                                        length++
261
                                }
262
                        }
263
                        if length < 0 || length > 20 {
264
                                return StructuralError("Huffman length out of range")
265
                        }
266
                        lengths[j] = uint8(length)
267
                }
268
                huffmanTrees[i], err = newHuffmanTree(lengths)
269
                if err != nil {
270
                        return err
271
                }
272
        }
273
 
274
        selectorIndex := 1 // the next tree index to use
275
        currentHuffmanTree := huffmanTrees[treeIndexes[0]]
276
        bufIndex := 0 // indexes bz2.buf, the output buffer.
277
        // The output of the move-to-front transform is run-length encoded and
278
        // we merge the decoding into the Huffman parsing loop. These two
279
        // variables accumulate the repeat count. See the Wikipedia page for
280
        // details.
281
        repeat := 0
282
        repeat_power := 0
283
 
284
        // The `C' array (used by the inverse BWT) needs to be zero initialized.
285
        for i := range bz2.c {
286
                bz2.c[i] = 0
287
        }
288
 
289
        decoded := 0 // counts the number of symbols decoded by the current tree.
290
        for {
291
                if decoded == 50 {
292
                        currentHuffmanTree = huffmanTrees[treeIndexes[selectorIndex]]
293
                        selectorIndex++
294
                        decoded = 0
295
                }
296
 
297
                v := currentHuffmanTree.Decode(br)
298
                decoded++
299
 
300
                if v < 2 {
301
                        // This is either the RUNA or RUNB symbol.
302
                        if repeat == 0 {
303
                                repeat_power = 1
304
                        }
305
                        repeat += repeat_power << v
306
                        repeat_power <<= 1
307
 
308
                        // This limit of 2 million comes from the bzip2 source
309
                        // code. It prevents repeat from overflowing.
310
                        if repeat > 2*1024*1024 {
311
                                return StructuralError("repeat count too large")
312
                        }
313
                        continue
314
                }
315
 
316
                if repeat > 0 {
317
                        // We have decoded a complete run-length so we need to
318
                        // replicate the last output symbol.
319
                        for i := 0; i < repeat; i++ {
320
                                b := byte(mtf.First())
321
                                bz2.tt[bufIndex] = uint32(b)
322
                                bz2.c[b]++
323
                                bufIndex++
324
                        }
325
                        repeat = 0
326
                }
327
 
328
                if int(v) == numSymbols-1 {
329
                        // This is the EOF symbol. Because it's always at the
330
                        // end of the move-to-front list, and never gets moved
331
                        // to the front, it has this unique value.
332
                        break
333
                }
334
 
335
                // Since two metasymbols (RUNA and RUNB) have values 0 and 1,
336
                // one would expect |v-2| to be passed to the MTF decoder.
337
                // However, the front of the MTF list is never referenced as 0,
338
                // it's always referenced with a run-length of 1. Thus 0
339
                // doesn't need to be encoded and we have |v-1| in the next
340
                // line.
341
                b := byte(mtf.Decode(int(v - 1)))
342
                bz2.tt[bufIndex] = uint32(b)
343
                bz2.c[b]++
344
                bufIndex++
345
        }
346
 
347
        if origPtr >= uint(bufIndex) {
348
                return StructuralError("origPtr out of bounds")
349
        }
350
 
351
        // We have completed the entropy decoding. Now we can perform the
352
        // inverse BWT and setup the RLE buffer.
353
        bz2.preRLE = bz2.tt[:bufIndex]
354
        bz2.preRLEUsed = 0
355
        bz2.tPos = inverseBWT(bz2.preRLE, origPtr, bz2.c[:])
356
        bz2.lastByte = -1
357
        bz2.byteRepeats = 0
358
        bz2.repeats = 0
359
 
360
        return nil
361
}
362
 
363
// inverseBWT implements the inverse Burrows-Wheeler transform as described in
364
// http://www.hpl.hp.com/techreports/Compaq-DEC/SRC-RR-124.pdf, section 4.2.
365
// In that document, origPtr is called `I' and c is the `C' array after the
366
// first pass over the data. It's an argument here because we merge the first
367
// pass with the Huffman decoding.
368
//
369
// This also implements the `single array' method from the bzip2 source code
370
// which leaves the output, still shuffled, in the bottom 8 bits of tt with the
371
// index of the next byte in the top 24-bits. The index of the first byte is
372
// returned.
373
func inverseBWT(tt []uint32, origPtr uint, c []uint) uint32 {
374
        sum := uint(0)
375
        for i := 0; i < 256; i++ {
376
                sum += c[i]
377
                c[i] = sum - c[i]
378
        }
379
 
380
        for i := range tt {
381
                b := tt[i] & 0xff
382
                tt[c[b]] |= uint32(i) << 8
383
                c[b]++
384
        }
385
 
386
        return tt[origPtr] >> 8
387
}

powered by: WebSVN 2.1.0

© copyright 1999-2024 OpenCores.org, equivalent to Oliscience, all rights reserved. OpenCores®, registered trademark.