OpenCores
URL https://opencores.org/ocsvn/mpeg2fpga/mpeg2fpga/trunk

Subversion Repositories mpeg2fpga

[/] [mpeg2fpga/] [trunk/] [rtl/] [mpeg2/] [idct.v] - Blame information for rev 2

Details | Compare with Previous | View Log

Line No. Rev Author Line
1 2 kdv
/*
2
 * idct.v
3
 *
4
 * Copyright (c) 2007 Koen De Vleeschauwer.
5
 *
6
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
7
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
8
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
9
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
10
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
11
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
12
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
13
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
14
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
15
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
16
 * SUCH DAMAGE.
17
 */
18
 
19
/*
20
 * Inverse Discrete Cosine Transform.
21
 */
22
 
23
`include "timescale.v"
24
 
25
`undef DEBUG
26
//`define DEBUG 1
27
//`define DEBUG_IDCT_1D 1
28
//`define DEBUG_TRANSPOSE 1
29
`undef CHECK
30
`ifdef __IVERILOG__
31
`define CHECK 1
32
`endif
33
 
34
  /*
35
   * 2-dimensional inverse discrete cosine transform.
36
   *
37
   * Uses row/column decomposition method:
38
   * 1. do a one-dimensional idct on the rows
39
   * 2. swap rows and columns,
40
   * 3. do a one-dimensional idct on the columns
41
   * 4. swap rows and columns to go back to row order.
42
   *
43
   * Thought to meet or exceed the former IEEE 1180-1990 standard.
44
   * Can do streaming.
45
   * Uses 12 multipliers, all smaller than 18x18, and 2 dual-ported rams.
46
   *
47
   * The 8-point 1-dimensional inverse discrete cosine transform can be written as:
48
   *
49
   * | y0 |   1   |  a  b  a  c |   | x0 |   1   |  d  e  f  g |   | x1 |
50
   * | y1 | = - * |  a  c -a -b | * | x2 | + - * |  e -g -d -f | * | x3 |
51
   * | y2 |   2   |  a -c -a  b |   | x4 |   2   |  f -d  g  e |   | x5 |
52
   * | y3 |       |  a -b  a -c |   | x6 |       |  g -f  e -d |   | x7 |
53
   *
54
   * | y7 |   1   |  a  b  a  c |   | x0 |   1   |  d  e  f  g |   | x1 |
55
   * | y6 | = - * |  a  c -a -b | * | x2 | - - * |  e -g -d -f | * | x3 |
56
   * | y5 |   2   |  a -c -a  b |   | x4 |   2   |  f -d  g  e |   | x5 |
57
   * | y4 |       |  a -b  a -c |   | x6 |       |  g -f  e -d |   | x7 |
58
   *
59
   * where
60
   *   a = cos (pi/4)
61
   *   b = cos (pi/8)
62
   *   c = sin (pi/8)
63
   *   d = cos (pi/16)
64
   *   e = cos (3*pi/16)
65
   *   f = sin (3*pi/16)
66
   *   g = sin (pi/16)
67
   *
68
   * For fixed-point calculations, a..g are multiplied by sqrt(8) * 2**scale
69
   * where scale = 13 or 14, depending upon accuracy desired.
70
   * Multiplying by sqrt(8) causes a to be a power of two.
71
   * This way a*x0 and a*x4 can be calculated using shifts, saving two multipliers.
72
   *
73
   * Multipliers and adders are dimensioned according to:
74
   * "Systematic approach of Fixed Point 8x8 IDCT and DCT Design and Implementation",
75
   * Zhang, Wang, Yu.
76
   *
77
   * We choose:
78
   *   scheme = 4
79
   *   scale = 14
80
   *   row_shift = 10
81
   *   col_shift = 21
82
   *
83
   * Calculation of theoretical register sizes:
84
   *   sample = 8 (in mpeg2 video)
85
   *   input of idct_row:
86
   *     input_bits =  sample_bits + 4 = 8 + 4 = 12 (Form. 6)
87
   *   output of idct_row:
88
   *     output_bits_row = scale - row_shift + sample_bits + 5 = 14 - 10 + 8 + 5 = 17 (Form. 11)
89
   *   size of internal registers during calculation of idct_row:
90
   *     max_inter_bits_row = scale + sample_bits + 5 = 14 + 8 + 5 = 27
91
   *   output of idct_col:
92
   *     output_bits_col = sample_bits  + 3 = 8 + 3 = 11 (Form. 12)
93
   *   size of internal registers during calculation of idct_col:
94
   *     max_inter_bits_col = col_shift + sample_bits  + 3 = 21 + 8 + 3 = 32 (Form. 13)
95
   *
96
   * We choose:
97
   *   register for idct_row: 32 bits
98
   *   output of idct_row: 22 bits
99
   *   registers for idct_col: 42 bits
100
   *   output of idct_col: 21 bits (reg_width - dta_shift = 42 - 21)
101
   *
102
   */
103
 
104
/*
 * Top level of the 2-dimensional idct.
 *
 * Data path, in order:
 *   idct1d_row -> transpose (row2col) -> idct1d_col -> clip_col -> transpose (col2row)
 *
 * iquant_eob is not wired into the data path; it is only referenced by the
 * DEBUG trace at the bottom of this module. idct_eob is produced by the
 * col2row transpose.
 */
module idct(clk, clk_en, rst,
            iquant_level, iquant_eob, iquant_valid,
            idct_data, idct_valid, idct_eob);

  input              clk;                       // clock
  input              clk_en;                    // clock enable
  input              rst;                       // synchronous active low reset
  input signed [11:0]iquant_level;              // inverse quantized dct coefficient
  input              iquant_eob;                // asserted at last inverse quantized dct coefficient of block
  input              iquant_valid;              // asserted when inverse quantized dct coefficient valid
  output signed [8:0]idct_data;                 // idct result, clipped to 9 bits, back in row order
  output             idct_eob;                  // end-of-block flag, driven by the col2row transpose
  output             idct_valid;                // asserted when idct_data, idct_eob valid

  wire signed  [21:0]idct_row_data;             // row idct result:    reg_width - dta_shift = 32 - 10 = 22 bits
  wire               idct_row_valid;
  wire signed  [21:0]idct_col_data_in;          // row idct result after row/column swap
  wire               idct_col_valid_in;
  wire signed  [20:0]idct_col_data_out;         // column idct result: reg_width - dta_shift = 42 - 21 = 21 bits
  wire               idct_col_valid_out;
  wire signed   [8:0]idct_col_clip_data_out;    // column idct result, clipped to 9 bits
  wire               idct_col_clip_valid_out;

  /* apply 1-d idct to rows */
  idct1d_row      #(.scale(14), .dta_in_width(12), .dta_shift(10), .reg_width(32))
                  idct_row(.clk(clk), .clk_en(clk_en), .rst(rst),
                  .dta_in(iquant_level), .dta_in_valid(iquant_valid),
                  .dta_out(idct_row_data), .dta_out_valid(idct_row_valid));

  /*
   * Result from idct_row is 22 bit wide.
   */

  /* swap rows and columns */
  transpose       #(.dta_width(22))
                  row2col(.clk(clk), .clk_en(clk_en), .rst(rst),
                  .dta_in(idct_row_data), .dta_in_valid(idct_row_valid),
                  .dta_out(idct_col_data_in), .dta_out_valid(idct_col_valid_in), .dta_out_eob());

  /* apply 1-d idct to columns */
  idct1d_col      #(.scale(14), .dta_in_width(22), .dta_shift(21), .reg_width(42))
                  idct_col(.clk(clk), .clk_en(clk_en), .rst(rst),
                  .dta_in(idct_col_data_in), .dta_in_valid(idct_col_valid_in),
                  .dta_out(idct_col_data_out), .dta_out_valid(idct_col_valid_out));

  /*
   * Result from idct_col is 21 bits,
   * Clip to 9 bits.
   */

  clip_col        clip_col(.clk(clk), .clk_en(clk_en), .rst(rst),
                  .dta_in(idct_col_data_out), .dta_in_valid(idct_col_valid_out),
                  .dta_out(idct_col_clip_data_out), .dta_out_valid(idct_col_clip_valid_out));

  /* swap back to rows */
  transpose       #(.dta_width(9))
                  col2row(.clk(clk), .clk_en(clk_en), .rst(rst),
                  .dta_in(idct_col_clip_data_out), .dta_in_valid(idct_col_clip_valid_out),
                  .dta_out(idct_data), .dta_out_valid(idct_valid), .dta_out_eob(idct_eob));

`ifdef DEBUG
/*
 * Stop the simulation when a valid output carries an unknown value.
 * The reduction xor evaluates to x as soon as any single bit of idct_data is
 * x or z; comparing against 9'bx would only match if all nine bits were x.
 */
always @(posedge clk)
  if (rst && clk_en && idct_valid && (^idct_data === 1'bx))
    begin
      $display ("%m\t*** Error: idct value undefined ***");
      $stop;
    end

/* trace of the data at every stage of the pipeline */
always @(posedge clk)
  if (clk_en && iquant_valid)
    begin
      if (iquant_eob)
        begin
          #0 $display("%m\t\tidct input: %d (eob)", iquant_level);
        end
      else
        begin
          #0 $display("%m\t\tidct input: %d", iquant_level);
        end
    end

always @(posedge clk)
  if (clk_en && idct_row_valid)
    begin
        #0 $display("%m\t\tafter idct_row: %d", idct_row_data);
    end

always @(posedge clk)
  if (clk_en && idct_col_valid_in)
    begin
        #0 $display("%m\t\tafter row2col: %d", idct_col_data_in);
    end

always @(posedge clk)
  if (clk_en && idct_col_valid_out)
    begin
        #0 $display("%m\t\tafter idct_col: %d", idct_col_data_out);
    end

always @(posedge clk)
  if (clk_en && idct_col_clip_valid_out)
    begin
        #0 $display("%m\t\tafter clipping: %d", idct_col_clip_data_out);
    end

always @(posedge clk)
  if (clk_en && idct_valid)
    begin
      #0 $display("%m\t\tafter col2row: %d", idct_data);
    end

`endif

endmodule
218
 
219
/*
220
 * 8-point 1-dimensional inverse discrete cosine transform. Row transform.
221
 */
222
 
223
/*
 * 8-point 1-dimensional idct, row transform.
 *
 * Eight consecutive valid dta_in samples are shifted into q0..q7 and then
 * copied to x0..x7. A state machine then walks through STATE_0..STATE_7,
 * one output sample per state, in the order y0 y1 y2 y3 y4 y5 y6 y7.
 *
 * Pipeline, per output sample (each stage is one enabled clock):
 *   1. prod0..prod7 = coefficient * xi      (dta_out_val_0, add_0)
 *   2. sum02, sum46, sum13, sum57           (dta_out_val_1, add_1)
 *   3. sum0246, sum1357                     (dta_out_val_2, add_2)
 *   4. y = sum0246 +/- sum1357              (dta_out_val_3)
 *   5. dta_out = y >>> dta_shift            (dta_out_valid)
 *
 * Note: the COSVAL_* table below is fixed at a scaling of 2**14
 * (COSVAL_A = 16384), while a*x0 and a*x4 are formed by shifting left by
 * 'scale' bits. The two only agree when the module is instantiated with
 * scale = 14; the default scale = 13 does not match the table.
 */
module idct1d_row (clk, clk_en, rst, dta_in, dta_in_valid, dta_out, dta_out_valid);
  parameter                            dta_in_width=12,          // width of dta_in
                                       dta_shift=11,             // how much to shift result to the right
                                       reg_width=29,             // width of internal registers
                                       scale=13,                 // cosine values scaled by 2**scale
                                       dta_out_width=reg_width-dta_shift, // width of dta_out
                                       cosval_width=16,          // width of COSVAL_A .. COSVAL_G
                                       prod_width=dta_in_width+cosval_width;          // width of COSVAL_i * xi

  input                                clk;                      // clock
  input                                clk_en;                   // clock enable
  input                                rst;                      // synchronous active low reset
  input signed       [dta_in_width-1:0]dta_in;                   // data in
  input                                dta_in_valid;             // asserted when dta_in valid
  output reg signed [dta_out_width-1:0]dta_out;                  // data out - dta_out_width bits (18 with the default parameters)
  output reg                           dta_out_valid;            // asserted when dta_out valid

  /* declared signed, like 'offset' below: half of these constants are negative */
  parameter signed [cosval_width-1:0]
    COSVAL_A      =  16'sd16384,  /*   SQRT(8)/2 * 2**14 * cos (pi/4) */
    COSVAL_MINUSA = -16'sd16384,  /* - SQRT(8)/2 * 2**14 * cos (pi/4) */
    COSVAL_B      =  16'sd21407,  /*   SQRT(8)/2 * 2**14 * cos (pi/8) */
    COSVAL_MINUSB = -16'sd21407,  /* - SQRT(8)/2 * 2**14 * cos (pi/8) */
    COSVAL_C      =  16'sd8867,   /*   SQRT(8)/2 * 2**14 * sin (pi/8) */
    COSVAL_MINUSC = -16'sd8867,   /* - SQRT(8)/2 * 2**14 * sin (pi/8) */
    COSVAL_D      =  16'sd22725,  /*   SQRT(8)/2 * 2**14 * cos (pi/16) */
    COSVAL_MINUSD = -16'sd22725,  /* - SQRT(8)/2 * 2**14 * cos (pi/16) */
    COSVAL_E      =  16'sd19266,  /*   SQRT(8)/2 * 2**14 * cos (3*pi/16) */
    COSVAL_MINUSE = -16'sd19266,  /* - SQRT(8)/2 * 2**14 * cos (3*pi/16) */
    COSVAL_F      =  16'sd12873,  /*   SQRT(8)/2 * 2**14 * sin (3*pi/16) */
    COSVAL_MINUSF = -16'sd12873,  /* - SQRT(8)/2 * 2**14 * sin (3*pi/16) */
    COSVAL_G      =  16'sd4520,   /*   SQRT(8)/2 * 2**14 * sin (pi/16) */
    COSVAL_MINUSG = -16'sd4520;   /* - SQRT(8)/2 * 2**14 * sin (pi/16) */

  /* dct coefficients input shift register */
  reg signed [dta_in_width-1:0]q0;
  reg signed [dta_in_width-1:0]q1;
  reg signed [dta_in_width-1:0]q2;
  reg signed [dta_in_width-1:0]q3;
  reg signed [dta_in_width-1:0]q4;
  reg signed [dta_in_width-1:0]q5;
  reg signed [dta_in_width-1:0]q6;
  reg signed [dta_in_width-1:0]q7;

  /* the eight coefficients of the row being transformed */
  reg signed [dta_in_width-1:0]x0;
  reg signed [dta_in_width-1:0]x1;
  reg signed [dta_in_width-1:0]x2;
  reg signed [dta_in_width-1:0]x3;
  reg signed [dta_in_width-1:0]x4;
  reg signed   [dta_in_width:0]minus_x4; // needs one bit more than x4, else two's complement of most negative x4 doesn't fit.
  reg signed [dta_in_width-1:0]x5;
  reg signed [dta_in_width-1:0]x6;
  reg signed [dta_in_width-1:0]x7;

  /* coefficient applied to xi for the current output sample; x0 and x4 use shifts instead */
  reg signed [cosval_width-1:0]cos1;
  reg signed [cosval_width-1:0]cos2;
  reg signed [cosval_width-1:0]cos3;
  reg signed [cosval_width-1:0]cos5;
  reg signed [cosval_width-1:0]cos6;
  reg signed [cosval_width-1:0]cos7;

  reg signed [prod_width-1:0]prod0; // product of xi * cosvali
  reg signed [prod_width-1:0]prod1;
  reg signed [prod_width-1:0]prod2;
  reg signed [prod_width-1:0]prod3;
  reg signed [prod_width-1:0]prod4;
  reg signed [prod_width-1:0]prod5;
  reg signed [prod_width-1:0]prod6;
  reg signed [prod_width-1:0]prod7;

  reg signed [reg_width-1:0]sum02; // sum of prodi and prodj
  reg signed [reg_width-1:0]sum46;
  reg signed [reg_width-1:0]sum13;
  reg signed [reg_width-1:0]sum57;
  reg signed [reg_width-1:0]sum0246; // sum of sumij and sumpq
  reg signed [reg_width-1:0]sum1357;

  reg signed [reg_width-1:0]y; // sum or difference of sum0246 and sum1357

  reg [3:0]dta_in_cntr; // number of samples in the input shift register, 0..8

  /* valid flag, delayed along with the data pipeline */
  reg dta_out_val_0;
  reg dta_out_val_1;
  reg dta_out_val_2;
  reg dta_out_val_3;

  /* add/subtract select, delayed along with the data pipeline */
  reg add_0;
  reg add_1;
  reg add_2;

  // an offset which is added to x0 to round the results.
  parameter signed [reg_width-1:0] offset = {2'b01, {(dta_shift-1){1'b0}}};

  parameter [3:0]
    STATE_IDLE  = 4'd0,
    STATE_0     = 4'd1,
    STATE_1     = 4'd2,
    STATE_2     = 4'd3,
    STATE_3     = 4'd4,
    STATE_4     = 4'd5,
    STATE_5     = 4'd6,
    STATE_6     = 4'd7,
    STATE_7     = 4'd8;

  reg [3:0]state;
  reg [3:0]next;

  /*
   * IDCT data input
   */

  /* input shift register; registers hold their value when not shifting */
  always @(posedge clk)
    if (~rst)
      begin
        q0 <= 'sd0;
        q1 <= 'sd0;
        q2 <= 'sd0;
        q3 <= 'sd0;
        q4 <= 'sd0;
        q5 <= 'sd0;
        q6 <= 'sd0;
        q7 <= 'sd0;
      end
    else if (clk_en && dta_in_valid)
      begin
        q0 <= q1;
        q1 <= q2;
        q2 <= q3;
        q3 <= q4;
        q4 <= q5;
        q5 <= q6;
        q6 <= q7;
        q7 <= dta_in;
      end

  /* copy a complete row from the shift register once eight samples are in */
  always @(posedge clk)
    if (~rst)
      begin
        x0 <= 'sd0;
        x1 <= 'sd0;
        x2 <= 'sd0;
        x3 <= 'sd0;
        x4 <= 'sd0;
        minus_x4 <= 'sd0;
        x5 <= 'sd0;
        x6 <= 'sd0;
        x7 <= 'sd0;
      end
    else if (clk_en && (dta_in_cntr == 4'd8))
      begin
        x0 <= q0;
        x1 <= q1;
        x2 <= q2;
        x3 <= q3;
        x4 <= q4;
        minus_x4 <= ~{q4[dta_in_width-1], q4}+1'b1; // -q4, sign-extended by one bit first
        x5 <= q5;
        x6 <= q6;
        x7 <= q7;
      end

  /*
   * input counter. When the counter is at 8 it restarts at 1 if a sample
   * arrives in the same clock, else at 0.
   */
  always @(posedge clk)
    if (~rst) dta_in_cntr <= 4'd0;
    else if (clk_en && (dta_in_cntr == 4'd8) && dta_in_valid) dta_in_cntr <= 4'd1;
    else if (clk_en && (dta_in_cntr == 4'd8)) dta_in_cntr <= 4'd0;
    else if (clk_en && dta_in_valid) dta_in_cntr <= dta_in_cntr + 4'd1;

  /*
   * IDCT calculation
   */

  /* next state logic */
  always @*
    case (state)
      STATE_IDLE:   if (dta_in_cntr == 4'd8) next = STATE_0;
                    else next = STATE_IDLE;
      STATE_0:      next = STATE_1;
      STATE_1:      next = STATE_2;
      STATE_2:      next = STATE_3;
      STATE_3:      next = STATE_4;
      STATE_4:      next = STATE_5;
      STATE_5:      next = STATE_6;
      STATE_6:      next = STATE_7;
      STATE_7:      if (dta_in_cntr == 4'd8) next = STATE_0; // next row already waiting
                    else next = STATE_IDLE;
      default       next = STATE_IDLE;
    endcase

  /* state */
  always @(posedge clk)
    if(~rst) state <= STATE_IDLE;
    else if (clk_en) state <= next;

  /*
   * Coefficient selection.
   * The cos registers are loaded one clock ahead: the values written while
   * in a given state are the ones multiplied during the following state.
   * Hence the entry for STATE_0 holds the coefficients of y1, the entry for
   * STATE_1 those of y2, and so on; STATE_6, STATE_7 and idle load the
   * coefficients of y0 (used in STATE_7 and in the next STATE_0).
   */
  always @(posedge clk)
    if (~rst)
      begin
        cos1 <= COSVAL_D;
        cos2 <= COSVAL_B;
        cos3 <= COSVAL_E;
        cos5 <= COSVAL_F;
        cos6 <= COSVAL_C;
        cos7 <= COSVAL_G;
      end
    else if (clk_en)
      case (state)
        STATE_0,
        STATE_5:
          begin
            cos1 <= COSVAL_E;
            cos2 <= COSVAL_C;
            cos3 <= COSVAL_MINUSG;
            cos5 <= COSVAL_MINUSD;
            cos6 <= COSVAL_MINUSB;
            cos7 <= COSVAL_MINUSF;
          end
        STATE_1,
        STATE_4:
          begin
            cos1 <= COSVAL_F;
            cos2 <= COSVAL_MINUSC;
            cos3 <= COSVAL_MINUSD;
            cos5 <= COSVAL_G;
            cos6 <= COSVAL_B;
            cos7 <= COSVAL_E;
          end
        STATE_2,
        STATE_3:
          begin
            cos1 <= COSVAL_G;
            cos2 <= COSVAL_MINUSB;
            cos3 <= COSVAL_MINUSF;
            cos5 <= COSVAL_E;
            cos6 <= COSVAL_MINUSC;
            cos7 <= COSVAL_MINUSD;
          end
        default // STATE_6, STATE_7, STATE_IDLE
          begin
            cos1 <= COSVAL_D;
            cos2 <= COSVAL_B;
            cos3 <= COSVAL_E;
            cos5 <= COSVAL_F;
            cos6 <= COSVAL_C;
            cos7 <= COSVAL_G;
          end
      endcase

  /* multiply and accumulate: pipeline stages 1 to 3 */
  always @(posedge clk)
    if (~rst)
      begin
        prod0 <= 'sd0;
        prod1 <= 'sd0;
        prod2 <= 'sd0;
        prod3 <= 'sd0;
        prod4 <= 'sd0;
        prod5 <= 'sd0;
        prod6 <= 'sd0;
        prod7 <= 'sd0;
        sum02 <= 'sd0;
        sum46 <= 'sd0;
        sum13 <= 'sd0;
        sum57 <= 'sd0;
        sum0246 <= 'sd0;
        sum1357 <= 'sd0;
      end
    else if (clk_en)
      begin
        /*
         * Next line implements
         * prod0 <= (cos0 * x0) + offset;
         * using shifts; offset added for proper rounding. Avoids a multiplier.
         */

        prod0 <= {{(reg_width - dta_in_width){x0[dta_in_width-1]}}, x0, {scale{1'b0}}} + offset;

        /*
         * These ought to map to a hardware multiplier in the fpga.
         */
        prod1 <= cos1 * x1;
        prod2 <= cos2 * x2;
        prod3 <= cos3 * x3;

        /*
         * case implements
         *  prod4 <= cos4 * x4;
         * using shifts, saving a multiplier.
         * Unlike cos1..cos7 the sign is selected from the current state.
         */

        case (state)
          STATE_0,
          STATE_3,
          STATE_4,
          STATE_7: prod4 <=  {{(reg_width - dta_in_width){x4[dta_in_width-1]}}, x4, {scale{1'b0}}};
          STATE_1,
          STATE_2,
          STATE_5,
          STATE_6: prod4 <=  {{(reg_width - dta_in_width-1){minus_x4[dta_in_width]}}, minus_x4, {scale{1'b0}}};
          default  prod4 <=  {{(reg_width - dta_in_width){x4[dta_in_width-1]}}, x4, {scale{1'b0}}};
        endcase

        prod5 <= cos5 * x5;
        prod6 <= cos6 * x6;
        prod7 <= cos7 * x7;

        /* products are sign-extended to reg_width before adding */
        sum02 <= {{(reg_width-prod_width){prod0[prod_width-1]}}, prod0} + {{(reg_width-prod_width){prod2[prod_width-1]}}, prod2};
        sum46 <= {{(reg_width-prod_width){prod4[prod_width-1]}}, prod4} + {{(reg_width-prod_width){prod6[prod_width-1]}}, prod6};
        sum13 <= {{(reg_width-prod_width){prod1[prod_width-1]}}, prod1} + {{(reg_width-prod_width){prod3[prod_width-1]}}, prod3};
        sum57 <= {{(reg_width-prod_width){prod5[prod_width-1]}}, prod5} + {{(reg_width-prod_width){prod7[prod_width-1]}}, prod7};
        sum0246 <= sum02 + sum46;
        sum1357 <= sum13 + sum57;
      end

  /* valid flag travels down the pipeline together with the data */
  always @(posedge clk)
    if (~rst)
      begin
        dta_out_val_0 <= 1'b0;
        dta_out_val_1 <= 1'b0;
        dta_out_val_2 <= 1'b0;
        dta_out_val_3 <= 1'b0;
        dta_out_valid <= 1'b0;
      end
    else if (clk_en)
      begin
        dta_out_val_0 <= (state != STATE_IDLE);
        dta_out_val_1 <= dta_out_val_0;
        dta_out_val_2 <= dta_out_val_1;
        dta_out_val_3 <= dta_out_val_2;
        dta_out_valid <= dta_out_val_3;
      end

  /*
   * Looking at the equation for the 1d idct, the final step when calculating
   * y0..y3 is addition, when calculating y4..y7 subtraction.
   * register add_0 is 1 when one needs to add, 0 when one needs to subtract.
   */

  always @(posedge clk)
    if (~rst)
      add_0 <= 1'd0;
    else if (clk_en)
      case (state)
        STATE_0,
        STATE_1,
        STATE_2,
        STATE_3:       add_0 <= 1'b1;
        default        add_0 <= 1'b0; // STATE_4..STATE_7, STATE_IDLE
      endcase

  /* delay add_0 until the matching sum0246/sum1357 are available */
  always @(posedge clk)
    if (~rst)
      begin
        add_1 <= 1'b0;
        add_2 <= 1'b0;
      end
    else if (clk_en)
      begin
        add_1 <= add_0;
        add_2 <= add_1;
      end

  /* pipeline stage 4: combine even and odd part */
  always @(posedge clk)
    if (~rst)
      y <= 'sd0;
    else if (clk_en)
      begin
        if (add_2) y <= sum0246 + sum1357;
        else y <= sum0246 - sum1357;
      end

  /* pipeline stage 5: scale down; y is signed, so >>> is an arithmetic shift */
  always @(posedge clk)
    if (~rst) dta_out <= 'sd0;
    else if (clk_en) dta_out <=  y >>> dta_shift;

`ifdef DEBUG_IDCT_1D
  always @(posedge clk)
    begin
      $strobe("%m\toffset: %d", offset);
      $strobe("%m\tcos0: -------- cos1: %8d cos2: %8d cos3: %8d cos4: -------- cos5: %8d cos6: %8d cos7: %8d", cos1, cos2, cos3, cos5, cos6, cos7);
      $strobe("%m\t  x0: %8d   x1: %8d   x2: %8d   x3: %8d   x4: %8d   x5: %8d   x6: %8d   x7: %8d",   x0,   x1,   x2,   x3,   x4,   x5,   x6,   x7);
      $strobe("%m\tprod0: %d prod1: %d prod2: %d prod3: %d prod4: %d prod5: %d prod6: %d prod7: %d", prod0, prod1, prod2, prod3, prod4, prod5, prod6, prod7);
      $strobe("%m\tsum02: %8d sum46: %8d sum13: %8d sum57: %8d", sum02, sum46, sum13, sum57);
      $strobe("%m\tsum0246: %8d sum1357: %8d", sum0246, sum1357);
      $strobe("%m\ty: %8d", y);
      $strobe("%m\tdta_out: %8d", dta_out);
    end
`endif

endmodule
729
 
730
/*
 * 8-point 1-dimensional inverse discrete cosine transform. Column transform.
 *
 * Mathematically identical to the row transform.
 * However, the 22x16 multipliers are not built from two 18x18 multipliers each:
 * every mult22x16 instance uses a single 18x18 multiplier plus a few shifters
 * and adders. This saves six multipliers. Clock speed improves, too.
 *
 * Data flow, as implemented below:
 *  - dta_in is shifted into q0..q7 whenever clk_en and dta_in_valid are high;
 *  - when dta_in_cntr reaches 8, q0..q7 are copied to x0..x7 and the state
 *    machine walks STATE_0 .. STATE_7, selecting one set of cosine constants
 *    per state;
 *  - the products are summed pairwise (sum02, sum46, sum13, sum57), then into
 *    sum0246 and sum1357, which are added or subtracted to form y;
 *  - dta_out is y, arithmetically shifted right by dta_shift.
 *
 * All registers have a synchronous, active low reset and keep their value
 * while clk_en is low.
 */

module idct1d_col (clk, clk_en, rst, dta_in, dta_in_valid, dta_out, dta_out_valid);
  parameter                            dta_in_width=12,          // width of dta_in
                                       dta_shift=11,             // how much to shift result to the right
                                       reg_width=29,             // width of internal registers
                                       scale=13,                 // cosine values scaled by 2**scale
                                       dta_out_width=reg_width-dta_shift, // width of dta_out
                                       cosval_width=16,          // width of COSVAL_A .. COSVAL_G
                                       prod_width=dta_in_width+cosval_width;          // width of COSVAL_i * xi

  input                                clk;                      // clock
  input                                clk_en;                   // clock enable
  input                                rst;                      // synchronous active low reset
  input signed       [dta_in_width-1:0]dta_in;                   // data in
  input                                dta_in_valid;             // high when dta_in carries a coefficient
  output reg signed [dta_out_width-1:0]dta_out;                  // data out
  output reg                           dta_out_valid;            // high when dta_out carries a result

  /* scaled cosine constants, and their negatives */
  parameter [cosval_width-1:0]
    COSVAL_A      =  16'sd16384,  /*   SQRT(8)/2 * 2**14 * cos (pi/4) */
    COSVAL_MINUSA = -16'sd16384,  /* - SQRT(8)/2 * 2**14 * cos (pi/4) */
    COSVAL_B      =  16'sd21407,  /*   SQRT(8)/2 * 2**14 * cos (pi/8) */
    COSVAL_MINUSB = -16'sd21407,  /* - SQRT(8)/2 * 2**14 * cos (pi/8) */
    COSVAL_C      =  16'sd8867,   /*   SQRT(8)/2 * 2**14 * sin (pi/8) */
    COSVAL_MINUSC = -16'sd8867,   /* - SQRT(8)/2 * 2**14 * sin (pi/8) */
    COSVAL_D      =  16'sd22725,  /*   SQRT(8)/2 * 2**14 * cos (pi/16) */
    COSVAL_MINUSD = -16'sd22725,  /* - SQRT(8)/2 * 2**14 * cos (pi/16) */
    COSVAL_E      =  16'sd19266,  /*   SQRT(8)/2 * 2**14 * cos (3*pi/16) */
    COSVAL_MINUSE = -16'sd19266,  /* - SQRT(8)/2 * 2**14 * cos (3*pi/16) */
    COSVAL_F      =  16'sd12873,  /*   SQRT(8)/2 * 2**14 * sin (3*pi/16) */
    COSVAL_MINUSF = -16'sd12873,  /* - SQRT(8)/2 * 2**14 * sin (3*pi/16) */
    COSVAL_G      =  16'sd4520,   /*   SQRT(8)/2 * 2**14 * sin (pi/16) */
    COSVAL_MINUSG = -16'sd4520;   /* - SQRT(8)/2 * 2**14 * sin (pi/16) */

  /* dct coefficients: input shift register */
  reg signed [dta_in_width-1:0]q0;
  reg signed [dta_in_width-1:0]q1;
  reg signed [dta_in_width-1:0]q2;
  reg signed [dta_in_width-1:0]q3;
  reg signed [dta_in_width-1:0]q4;
  reg signed [dta_in_width-1:0]q5;
  reg signed [dta_in_width-1:0]q6;
  reg signed [dta_in_width-1:0]q7;

  /* dct coefficients: snapshot held while the outputs are being computed */
  reg signed [dta_in_width-1:0]x0;
  reg signed [dta_in_width-1:0]x1;
  reg signed [dta_in_width-1:0]x2;
  reg signed [dta_in_width-1:0]x3;
  reg signed [dta_in_width-1:0]x4;
  reg signed   [dta_in_width:0]minus_x4; // one bit wider than x4, else the two's complement of the most negative x4 doesn't fit.
  reg signed [dta_in_width-1:0]x5;
  reg signed [dta_in_width-1:0]x6;
  reg signed [dta_in_width-1:0]x7;

  /* cosine constant currently applied to x1, x2, x3, x5, x6, x7 */
  reg signed [cosval_width-1:0]cos1;
  reg signed [cosval_width-1:0]cos2;
  reg signed [cosval_width-1:0]cos3;
  reg signed [cosval_width-1:0]cos5;
  reg signed [cosval_width-1:0]cos6;
  reg signed [cosval_width-1:0]cos7;

  /* products of xi * cosvali */
  reg  signed [prod_width-1:0]prod0;
  reg  signed [prod_width-1:0]prod0_delayed;
  wire signed [prod_width-1:0]prod1;
  wire signed [prod_width-1:0]prod2;
  wire signed [prod_width-1:0]prod3;
  reg  signed [prod_width-1:0]prod4;
  reg  signed [prod_width-1:0]prod4_delayed;
  wire signed [prod_width-1:0]prod5;
  wire signed [prod_width-1:0]prod6;
  wire signed [prod_width-1:0]prod7;

  /* adder tree */
  reg signed [reg_width-1:0]sum02;   // prod0 + prod2
  reg signed [reg_width-1:0]sum46;   // prod4 + prod6
  reg signed [reg_width-1:0]sum13;   // prod1 + prod3
  reg signed [reg_width-1:0]sum57;   // prod5 + prod7
  reg signed [reg_width-1:0]sum0246; // sum02 + sum46
  reg signed [reg_width-1:0]sum1357; // sum13 + sum57

  reg signed [reg_width-1:0]y;       // sum or difference of sum0246 and sum1357

  reg [3:0]dta_in_cntr;              // number of coefficients shifted in so far; 8 means q0..q7 are complete

  /* delay line between "state machine busy" and dta_out_valid */
  reg dta_out_val_0;
  reg dta_out_val_1;
  reg dta_out_val_2;
  reg dta_out_val_3;
  reg dta_out_val_4;

  /* delay line for the add/subtract select of the final stage */
  reg add_0;
  reg add_1;
  reg add_2;
  reg add_3;

  // an offset which is added to x0 to round the results.
  parameter signed [reg_width-1:0] offset = {2'b01, {(dta_shift-1){1'b0}}};

  parameter [3:0]
    STATE_IDLE  = 4'd0,
    STATE_0     = 4'd1,
    STATE_1     = 4'd2,
    STATE_2     = 4'd3,
    STATE_3     = 4'd4,
    STATE_4     = 4'd5,
    STATE_5     = 4'd6,
    STATE_6     = 4'd7,
    STATE_7     = 4'd8;

  reg [3:0]state;
  reg [3:0]next;

  /*
   * IDCT data input
   */

  /* input shift register: newest coefficient enters at q7, oldest sits in q0 */
  always @(posedge clk)
    if (~rst)
      {q0, q1, q2, q3, q4, q5, q6, q7} <= {(8*dta_in_width){1'b0}};
    else if (clk_en && dta_in_valid)
      {q0, q1, q2, q3, q4, q5, q6, q7} <= {q1, q2, q3, q4, q5, q6, q7, dta_in};

  /* take a snapshot of the eight coefficients once all of them have arrived */
  always @(posedge clk)
    if (~rst)
      begin
        {x0, x1, x2, x3, x4, x5, x6, x7} <= {(8*dta_in_width){1'b0}};
        minus_x4 <= 'sd0;
      end
    else if (clk_en && (dta_in_cntr == 4'd8))
      begin
        {x0, x1, x2, x3, x4, x5, x6, x7} <= {q0, q1, q2, q3, q4, q5, q6, q7};
        minus_x4 <= ~{q4[dta_in_width-1], q4}+1'b1; // sign-extend by one bit, then negate
      end

  /* input counter: restarts at 1 if a new coefficient arrives in the very cycle the count is 8 */
  always @(posedge clk)
    if (~rst)
      dta_in_cntr <= 4'd0;
    else if (clk_en)
      begin
        if (dta_in_cntr == 4'd8)
          begin
            if (dta_in_valid) dta_in_cntr <= 4'd1;
            else dta_in_cntr <= 4'd0;
          end
        else if (dta_in_valid)
          dta_in_cntr <= dta_in_cntr + 4'd1;
      end

  /*
   * IDCT calculation
   */

  /* next state logic: a run starts (or restarts back-to-back) when eight coefficients are ready */
  always @*
    begin
      next = STATE_IDLE;
      case (state)
        STATE_IDLE,
        STATE_7:      if (dta_in_cntr == 4'd8) next = STATE_0;
        STATE_0:      next = STATE_1;
        STATE_1:      next = STATE_2;
        STATE_2:      next = STATE_3;
        STATE_3:      next = STATE_4;
        STATE_4:      next = STATE_5;
        STATE_5:      next = STATE_6;
        STATE_6:      next = STATE_7;
        default       next = STATE_IDLE;
      endcase
    end

  /* state */
  always @(posedge clk)
    if (~rst) state <= STATE_IDLE;
    else if (clk_en) state <= next;

  /*
   * Cosine constant selection, one row per state.
   * The table is symmetric: STATE_0/STATE_5, STATE_1/STATE_4 and STATE_2/STATE_3
   * load the same constants; STATE_6, STATE_7 and idle load the reset values.
   */
  always @(posedge clk)
    if (~rst)
      begin
        cos1 <= COSVAL_D;
        cos2 <= COSVAL_B;
        cos3 <= COSVAL_E;
        cos5 <= COSVAL_F;
        cos6 <= COSVAL_C;
        cos7 <= COSVAL_G;
      end
    else if (clk_en)
      case (state)
        STATE_0,
        STATE_5:
          begin
            cos1 <= COSVAL_E;
            cos2 <= COSVAL_C;
            cos3 <= COSVAL_MINUSG;
            cos5 <= COSVAL_MINUSD;
            cos6 <= COSVAL_MINUSB;
            cos7 <= COSVAL_MINUSF;
          end
        STATE_1,
        STATE_4:
          begin
            cos1 <= COSVAL_F;
            cos2 <= COSVAL_MINUSC;
            cos3 <= COSVAL_MINUSD;
            cos5 <= COSVAL_G;
            cos6 <= COSVAL_B;
            cos7 <= COSVAL_E;
          end
        STATE_2,
        STATE_3:
          begin
            cos1 <= COSVAL_G;
            cos2 <= COSVAL_MINUSB;
            cos3 <= COSVAL_MINUSF;
            cos5 <= COSVAL_E;
            cos6 <= COSVAL_MINUSC;
            cos7 <= COSVAL_MINUSD;
          end
        default
          begin
            cos1 <= COSVAL_D;
            cos2 <= COSVAL_B;
            cos3 <= COSVAL_E;
            cos5 <= COSVAL_F;
            cos6 <= COSVAL_C;
            cos7 <= COSVAL_G;
          end
      endcase

  /* The 22x16 multipliers */

  /*
   * prod0 <= cos0 * x0 + offset; offset added for proper rounding.
   * Uses shifts, avoids a multiplier. Two register stages, as in mult22x16.
   */
  always @(posedge clk)
    if (~rst)
      begin
        prod0_delayed <= 'sd0;
        prod0 <= 'sd0;
      end
    else if (clk_en)
      begin
        prod0_delayed <= {{(reg_width - dta_in_width){x0[dta_in_width-1]}}, x0, {scale{1'b0}}} + offset;
        prod0 <= prod0_delayed;
      end

  mult22x16 mult_prod1(.clk(clk), .clk_en(clk_en), .rst(rst), .product(prod1), .multiplicand(cos1), .multiplier(x1));      /* prod1 <= cos1 * x1; */
  mult22x16 mult_prod2(.clk(clk), .clk_en(clk_en), .rst(rst), .product(prod2), .multiplicand(cos2), .multiplier(x2));      /* prod2 <= cos2 * x2; */
  mult22x16 mult_prod3(.clk(clk), .clk_en(clk_en), .rst(rst), .product(prod3), .multiplicand(cos3), .multiplier(x3));      /* prod3 <= cos3 * x3; */

  /*
   * prod4 <= cos4 * x4. Uses shifts, avoids a multiplier.
   * The sign of the term depends on the state: minus_x4 in STATE_1, 2, 5, 6; x4 otherwise.
   */
  always @(posedge clk)
    if (~rst)
      begin
        prod4_delayed <= 'sd0;
        prod4 <= 'sd0;
      end
    else if (clk_en)
      begin
        case (state)
          STATE_1,
          STATE_2,
          STATE_5,
          STATE_6: prod4_delayed <=  {{(reg_width - dta_in_width-1){minus_x4[dta_in_width]}}, minus_x4, {scale{1'b0}}};
          default  prod4_delayed <=  {{(reg_width - dta_in_width){x4[dta_in_width-1]}}, x4, {scale{1'b0}}};
        endcase
        prod4 <= prod4_delayed;
      end

  mult22x16 mult_prod5(.clk(clk), .clk_en(clk_en), .rst(rst), .product(prod5), .multiplicand(cos5), .multiplier(x5));      /* prod5 <= cos5 * x5; */
  mult22x16 mult_prod6(.clk(clk), .clk_en(clk_en), .rst(rst), .product(prod6), .multiplicand(cos6), .multiplier(x6));      /* prod6 <= cos6 * x6; */
  mult22x16 mult_prod7(.clk(clk), .clk_en(clk_en), .rst(rst), .product(prod7), .multiplicand(cos7), .multiplier(x7));      /* prod7 <= cos7 * x7; */

  /* two-level adder tree; products are sign-extended to reg_width before adding */
  always @(posedge clk)
    if (~rst)
      begin
        sum02 <= 'sd0;
        sum46 <= 'sd0;
        sum13 <= 'sd0;
        sum57 <= 'sd0;
        sum0246 <= 'sd0;
        sum1357 <= 'sd0;
      end
    else if (clk_en)
      begin
        /* first level */
        sum02 <= {{(reg_width-prod_width){prod0[prod_width-1]}}, prod0} + {{(reg_width-prod_width){prod2[prod_width-1]}}, prod2};
        sum46 <= {{(reg_width-prod_width){prod4[prod_width-1]}}, prod4} + {{(reg_width-prod_width){prod6[prod_width-1]}}, prod6};
        sum13 <= {{(reg_width-prod_width){prod1[prod_width-1]}}, prod1} + {{(reg_width-prod_width){prod3[prod_width-1]}}, prod3};
        sum57 <= {{(reg_width-prod_width){prod5[prod_width-1]}}, prod5} + {{(reg_width-prod_width){prod7[prod_width-1]}}, prod7};
        /* second level */
        sum0246 <= sum02 + sum46;
        sum1357 <= sum13 + sum57;
      end

  /* dta_out_valid: "state machine busy", delayed by six register stages */
  always @(posedge clk)
    if (~rst)
      {dta_out_valid, dta_out_val_4, dta_out_val_3, dta_out_val_2, dta_out_val_1, dta_out_val_0} <= 6'b0;
    else if (clk_en)
      {dta_out_valid, dta_out_val_4, dta_out_val_3, dta_out_val_2, dta_out_val_1, dta_out_val_0}
        <= {dta_out_val_4, dta_out_val_3, dta_out_val_2, dta_out_val_1, dta_out_val_0, (state != STATE_IDLE)};

  /*
   * Looking at the equation for the 1d idct, the final step when calculating
   * y0..y3 is addition, when calculating y4..y7 subtraction.
   * register add_0 is 1 when one needs to add, 0 when one needs to subtract.
   * add_1 .. add_3 delay add_0 so that it lines up with sum0246 and sum1357.
   */

  always @(posedge clk)
    if (~rst)
      {add_3, add_2, add_1, add_0} <= 4'b0;
    else if (clk_en)
      begin
        case (state)
          STATE_0,
          STATE_1,
          STATE_2,
          STATE_3:       add_0 <= 1'b1;
          default        add_0 <= 1'b0;
        endcase
        {add_3, add_2, add_1} <= {add_2, add_1, add_0};
      end

  /* final butterfly stage */
  always @(posedge clk)
    if (~rst)
      y <= 'sd0;
    else if (clk_en)
      begin
        if (add_3) y <= sum0246 + sum1357;
        else y <= sum0246 - sum1357;
      end

  /* scale the result back down */
  always @(posedge clk)
    if (~rst) dta_out <= 'sd0;
    else if (clk_en) dta_out <=  y >>> dta_shift;

`ifdef DEBUG_IDCT_1D
  always @(posedge clk)
    begin
      $strobe("%m\toffset: %d", offset);
      $strobe("%m\tcos0: -------- cos1: %8d cos2: %8d cos3: %8d cos4: -------- cos5: %8d cos6: %8d cos7: %8d", cos1, cos2, cos3, cos5, cos6, cos7);
      $strobe("%m\t  x0: %8d   x1: %8d   x2: %8d   x3: %8d   x4: %8d   x5: %8d   x6: %8d   x7: %8d",   x0,   x1,   x2,   x3,   x4,   x5,   x6,   x7);
      $strobe("%m\tprod0: %d prod1: %d prod2: %d prod3: %d prod4: %d prod5: %d prod6: %d prod7: %d", prod0, prod1, prod2, prod3, prod4, prod5, prod6, prod7);
      $strobe("%m\tsum02: %8d sum46: %8d sum13: %8d sum57: %8d", sum02, sum46, sum13, sum57);
      $strobe("%m\tsum0246: %8d sum1357: %8d", sum0246, sum1357);
      $strobe("%m\ty: %8d", y);
      $strobe("%m\tdta_out: %8d", dta_out);
    end
`endif

endmodule
1244
 
1245
/*
 * 8x8 transpose ram. Swaps rows and columns.
 *
 * Incoming words are written to consecutive addresses of a dual-port ram
 * that holds two 8x8 matrices. As soon as the writer has moved on to the other
 * 64-word half, the completed half is read back with the row and column
 * address bits exchanged.
 *
 * dta_out is driven directly by the ram; dta_out_valid and dta_out_eob are
 * rd_en delayed by one register stage.
 * NOTE(review): their alignment with dta_out depends on the read latency of
 * dpram_sc, which is not visible here.
 */

module transpose(clk, clk_en, rst, dta_in, dta_in_valid, dta_out, dta_out_valid, dta_out_eob);
  parameter  dta_width=16;                           // data width;
  input                    clk;                      // clock
  input                    clk_en;                   // clock enable
  input                    rst;                      // synchronous active low reset

  input    [dta_width -1:0]dta_in;                   // data in
  input                    dta_in_valid;
  output   [dta_width -1:0]dta_out;                  // transposed data out
  output reg               dta_out_valid;
  output reg               dta_out_eob;              // set when rd_addr[5:0] was 63 during an enabled read

  /* write side */
  reg                 [7:0]wr_cnt;                   // running count of words accepted
  reg                 [6:0]wr_addr;
  reg                      wr_en;
  reg      [dta_width -1:0]wr_din;

  /* read side */
  reg                 [7:0]rd_cnt;                   // running count of words read
  reg                 [6:0]rd_addr;
  reg                      rd_en;

  /*
   * We've got one dual-port ram, sufficient for two 8x8 matrices, with simultaneous reads and writes.
   *
   * Bits [7:6] of the counters number the 64-word blocks. While the write and
   * read counters disagree in those bits, at least one complete block is
   * waiting to be read.
   */
  wire                     block_waiting = (wr_cnt[7:6] != rd_cnt[7:6]);

  /*
   * write side
   * write data cyclically in dual-port ram.
   * wr_addr, wr_en and wr_din are registered together, so the address
   * written is the value wr_cnt had when the word arrived.
   */

  always @(posedge clk)
    if (~rst)
      begin
        wr_cnt  <= 8'd0;
        wr_addr <= 7'd0;
        wr_en   <= 1'b0;
        wr_din  <= {dta_width{1'b0}};
      end
    else if (clk_en)
      begin
        wr_en  <= dta_in_valid;
        wr_din <= dta_in;
        if (dta_in_valid)
          begin
            wr_cnt  <= wr_cnt + 8'd1;
            wr_addr <= wr_cnt[6:0];
          end
      end

  /*
   * read side
   */

  always @(posedge clk)
    if (~rst)
      begin
        rd_cnt        <= 8'd0;
        rd_addr       <= 7'd0;
        rd_en         <= 1'b0;
        dta_out_valid <= 1'b0;
        dta_out_eob   <= 1'b0;
      end
    else if (clk_en)
      begin
        if (block_waiting) rd_cnt <= rd_cnt + 8'd1;
        rd_addr       <= {rd_cnt[6], rd_cnt[2:0], rd_cnt[5:3]}; // swap rows and columns in address
        rd_en         <= block_waiting;
        dta_out_valid <= rd_en;
        dta_out_eob   <= rd_en && (rd_addr[5:0] == 6'd63);
      end

  /* transposition memory */

  dpram_sc
    #(.addr_width(7),                                         // number of bits in address bus
    .dta_width(dta_width))                                    // number of bits in data bus
    ram0 (
    .rst(rst),                                                // reset, active low
    .clk(clk),                                                // clock, rising edge trigger
    .wr_en(wr_en),                                            // write enable, active high
    .wr_addr(wr_addr),                                        // write address
    .din(wr_din),                                             // data input
    .rd_en(rd_en),                                            // read enable, active high
    .rd_addr(rd_addr),                                        // read address
    .dout(dta_out)                                            // data output
    );

`ifdef DEBUG_TRANSPOSE
  always @(posedge clk)
    begin
      $strobe("%m\twr_cnt: %d rd_cnt: %d dta_in: %d dta_in_valid: %d dta_out: %d dta_out_valid: %d dta_out_eob: %d",
      wr_cnt, rd_cnt, dta_in, dta_in_valid, dta_out, dta_out_valid, dta_out_eob);
      $strobe("%m\twr_en: %d wr_addr: %d wr_din: %d rd_en: %d rd_addr: %d dta_out: %d",
      wr_en, wr_addr, wr_din, rd_en, rd_addr, dta_out);
    end
`endif

endmodule
1352
 
1353
 
1354
/*
 * Clips idct output to -256..255
 *
 * A 21-bit signed input fits in 9 signed bits exactly when bits 20..8 are all
 * equal (all ones or all zeroes). Values outside that range saturate to
 * 255 (positive input) or -256 (negative input).
 * dta_out and dta_out_valid are registered.
 */

module clip_col(clk, clk_en, rst, dta_in, dta_in_valid, dta_out, dta_out_valid);
  input                  clk;                      // clock
  input                  clk_en;                   // clock enable
  input                  rst;                      // synchronous active low reset
  input signed     [20:0]dta_in;                   // data in
  input                  dta_in_valid;
  output reg signed [8:0]dta_out;                  // data out, saturated
  output reg             dta_out_valid;

  wire             [12:0]upper_bits = dta_in[20:8];  // sign bit of the 9-bit result plus everything above it
  wire                   in_range   = (upper_bits == {13{1'b1}}) || (upper_bits == {13{1'b0}});

  always @(posedge clk)
    if (~rst)
      begin
        dta_out       <= 'sd0;
        dta_out_valid <= 'sd0;
      end
    else if (clk_en)
      begin
        dta_out_valid <= dta_in_valid;
        if (in_range)
          dta_out <= dta_in[8:0];
        else
          dta_out <= {dta_in[20], {8{~dta_in[20]}}}; // clipping: 9'b011111111 if positive, 9'b100000000 if negative
      end

endmodule
1379
 
1380
module mult22x16(clk, clk_en, rst, product, multiplicand, multiplier);
   input         clk;                     // clock
   input         clk_en;                  // clock enable
   input         rst;                     // synchronous active low reset
   input signed [21:0]  multiplier;
   input signed [15:0]  multiplicand;
   output reg signed [37:0] product;      // multiplier * multiplicand, two register stages after the inputs

/*
 * the following code implements
 *   always @(posedge clk)
 *     product <= multiplier * multiplicand;
 * using only a single 18x18 multiplier, a few shifts and adders.
 *
 * The multiplier is split in a signed upper part (bits 21..4) and an
 * unsigned lower nibble (bits 3..0):
 *   multiplier * multiplicand = (upper * multiplicand) * 16 + nibble * multiplicand
 * The first term uses the 18x18 multiplier, the second one four
 * conditional shift-and-add terms. Both are registered, then summed.
 *
 * See "Expanding Virtex-II" by Ken Chapman, Xilinx UK, 06/30/2001 for
 * a discussion about expanding multipliers.
 *
 */
   wire /* unsigned */  [3:0] nibble    = multiplier[3:0];
   wire signed         [17:0] upper     = multiplier[21:4];
   wire signed         [19:0] mcand_ext = {{4{multiplicand[15]}}, multiplicand};  // multiplicand, sign-extended to 20 bits

   reg signed [19:0] pp_nibble;           // nibble * multiplicand
   reg signed [33:0] pp_upper;            // upper * multiplicand

   always @(posedge clk)
     if (~rst)
       begin
         pp_upper  <= 34'b0;
         pp_nibble <= 20'b0;
         product   <= 38'b0;
       end
     else if (clk_en)
       begin
         /* stage 1: partial products */
         pp_upper  <= upper * multiplicand;
         pp_nibble <= (nibble[0] ?  mcand_ext       : 20'b0) +
                      (nibble[1] ? (mcand_ext << 1) : 20'b0) +
                      (nibble[2] ? (mcand_ext << 2) : 20'b0) +
                      (nibble[3] ? (mcand_ext << 3) : 20'b0);
         /* stage 2: weigh the upper partial product by 16, sign-extend the lower one, and add */
         product   <= {pp_upper, 4'b0} + { {18{pp_nibble[19]}}, pp_nibble};
       end

endmodule
1427
 
1428
 /*
  idct_fifo

  Groups idct coefficients into a row of eight.

  Input: 9-bit signed idct coefficients
  Output: one row of 72 bits, consisting of 8 idct coefficients,
  which is the  'prediction error', to be added to the motion compensation prediction.

  Coefficients are shifted in at the least significant end, so the first
  coefficient of a row ends up in bits 71..63 and the last one in bits 8..0.
  After every eighth valid coefficient the assembled row is written to the fifo.
  */

module idct_fifo(
  rst, clk_en, clk,
  idct_data, idct_valid, idct_eob,
  idct_wr_dta_full, idct_wr_dta_almost_full, idct_wr_dta_overflow,
  idct_rd_dta_empty, idct_rd_dta_almost_empty, idct_rd_dta_valid,
  idct_rd_dta_en, idct_rd_dta
  );

  input              rst;                      // synchronous active low reset
  input              clk_en;                   // clock enable
  input              clk;                      // clock

  input signed  [8:0]idct_data;                // idct coefficient
  input              idct_eob;                 // only used in the DEBUG trace below
  input              idct_valid;               // high when idct_data carries a coefficient

  /* idct coefficients fifo */
  /* idct coefficients fifo: writing */
  output             idct_wr_dta_full;
  output             idct_wr_dta_almost_full;
  output             idct_wr_dta_overflow;
  reg          [71:0]idct_wr_dta;              // row being assembled
  reg                idct_wr_dta_en;           // high for the clk_en cycle in which idct_wr_dta holds a complete row
  /* idct coefficients fifo: reading */
  output             idct_rd_dta_empty;
  output             idct_rd_dta_almost_empty;
  input              idct_rd_dta_en;
  output       [71:0]idct_rd_dta;
  output             idct_rd_dta_valid;

  /*
   * One-hot ring counter; cnt[i] is set while coefficient i of the row is expected.
   * Eight bits wide, matching the 8'b1 reset value and the 8-bit rotate below.
   */
  reg           [7:0]cnt;

`include "fifo_size.v"

  /* shift the new coefficient in at the least significant end */
  always @(posedge clk)
    if (~rst) idct_wr_dta <= 72'b0;
    else if (clk_en && idct_valid) idct_wr_dta <= {idct_wr_dta[62:0], idct_data};

  /* rotate the ring counter once for every coefficient accepted */
  always @(posedge clk)
    if (~rst) cnt <= 8'b1;
    else if (clk_en && idct_valid) cnt <= {cnt[6:0], cnt[7]};

  /* cnt[7] marks the eighth coefficient: the row is complete once it has been shifted in */
  always @(posedge clk)
    if (~rst) idct_wr_dta_en <= 1'b0;
    else if (clk_en) idct_wr_dta_en <= cnt[7] && idct_valid;

  /*
     prediction error fifo. (f[y][x] in Figure 7-5).
     addr_width = 6 > big enough to hold all blocks of a macroblock (6 blocks for 4:2:0, 8 for 4:4:4)

     Writes are qualified with clk_en, so a row is written only once even if
     idct_wr_dta_en is held while clk_en is low.

     Note one can read data from the fifo even when clk_en is low.
     This allows motcomp to drain the fifo.

     NOTE(review): PREDICT_DEPTH and PREDICT_THRESHOLD are presumably defined in fifo_size.v - confirm.
   */

  fifo_sc
    #(.addr_width(PREDICT_DEPTH),
    .dta_width(9'd72),
    .prog_thresh(PREDICT_THRESHOLD))
    predict_err_fifo (
    .rst(rst),
    .clk(clk),
    .din(idct_wr_dta),
    .wr_en(idct_wr_dta_en && clk_en),
    .full(idct_wr_dta_full),
    .wr_ack(),
    .overflow(idct_wr_dta_overflow),
    .prog_full(idct_wr_dta_almost_full),
    .dout(idct_rd_dta),
    .rd_en(idct_rd_dta_en),
    .empty(idct_rd_dta_empty),
    .valid(idct_rd_dta_valid),
    .underflow(),
    .prog_empty(idct_rd_dta_almost_empty)
    );

`ifdef CHECK
  /* simulation only: stop as soon as a row is lost */
  always @(posedge clk)
    if (idct_wr_dta_overflow)
      begin
        #0 $display("%m\t*** error: idct fifo overflow ***");
        $stop;
      end
`endif

//`define DEBUG 1
`ifdef DEBUG
  always @(posedge clk)
    $strobe("%m\tclk_en: %d idct_data: %5d valid: %d  eob: %d", clk_en, idct_data, idct_valid, idct_eob);

  /* the fifo output, split into its eight coefficients; predict_err_0 is the first one shifted in */
  wire signed [8:0]predict_err_0;
  wire signed [8:0]predict_err_1;
  wire signed [8:0]predict_err_2;
  wire signed [8:0]predict_err_3;
  wire signed [8:0]predict_err_4;
  wire signed [8:0]predict_err_5;
  wire signed [8:0]predict_err_6;
  wire signed [8:0]predict_err_7;

  assign {predict_err_0, predict_err_1, predict_err_2, predict_err_3, predict_err_4, predict_err_5, predict_err_6, predict_err_7} = idct_rd_dta;

  always @(posedge clk)
    $strobe("%m\tpredict_err: %5d %5d %5d %5d %5d %5d %5d %5d valid: %d", predict_err_0, predict_err_1, predict_err_2, predict_err_3, predict_err_4, predict_err_5, predict_err_6, predict_err_7, idct_rd_dta_valid);

  always @(posedge clk)
    $strobe("%m\tidct_rd_dta: %18h idct_rd_dta_valid: %d idct_rd_dta_en: %d idct_rd_dta_empty: %d", idct_rd_dta, idct_rd_dta_valid, idct_rd_dta_en, idct_rd_dta_empty);
`endif
endmodule
1548
/* not truncated */

powered by: WebSVN 2.1.0

© copyright 1999-2024 OpenCores.org, equivalent to Oliscience, all rights reserved. OpenCores®, registered trademark.