OpenCores
URL https://opencores.org/ocsvn/mpeg2fpga/mpeg2fpga/trunk

Subversion Repositories mpeg2fpga

[/] [mpeg2fpga/] [trunk/] [tools/] [ieee1180/] [ieee1180/] [idct.v] - Blame information for rev 2

Details | Compare with Previous | View Log

Line No. Rev Author Line
1 2 kdv
/*
2
 * Inverse Discrete Cosine Transform.
3
 *
4
 * Copyright Koen De Vleeschauwer, 2006. All rights reserved.
5
 * Sat Jul 22 17:27:43 CEST 2006
6
 *
7
 */
8
 
9
`include "timescale.v"
10
 
11
`undef DEBUG
12
//`define DEBUG 1
13
//`define DEBUG_IDCT_1D 1
14
//`define DEBUG_TRANSPOSE 1
15
 
16
  /*
17
   * 2-dimensional inverse discrete cosine transform.
18
   *
19
   * Uses row/column decomposition method:
20
   * 1. do a one-dimensional idct on the rows
21
   * 2. swap rows and columns,
22
   * 3. do a one-dimensional idct on the columns
23
   * 4. swap rows and columns to go back to row order.
24
   *
25
   * Thought to meet or exceed the former IEEE 1180-1990 standard.
26
   * Can do streaming.
27
   * Uses 12 multipliers, all smaller than 18x18, and 2 dual-ported rams.
28
   *
29
   * The 8-point 1-dimensional inverse discrete cosine transform can be written as:
30
   *
31
   * | y0 |   1   |  a  b  a  c |   | x0 |   1   |  d  e  f  g |   | x1 |
32
   * | y1 | = - * |  a  c -a -b | * | x2 | + - * |  e -g -d -f | * | x3 |
33
   * | y2 |   2   |  a -c -a  b |   | x4 |   2   |  f -d  g  e |   | x5 |
34
   * | y3 |       |  a -b  a -c |   | x6 |       |  g -f  e -d |   | x7 |
35
   *
36
   * | y7 |   1   |  a  b  a  c |   | x0 |   1   |  d  e  f  g |   | x1 |
37
   * | y6 | = - * |  a  c -a -b | * | x2 | - - * |  e -g -d -f | * | x3 |
38
   * | y5 |   2   |  a -c -a  b |   | x4 |   2   |  f -d  g  e |   | x5 |
39
   * | y4 |       |  a -b  a -c |   | x6 |       |  g -f  e -d |   | x7 |
40
   *
41
   * where
42
   *   a = cos (pi/4)
43
   *   b = cos (pi/8)
44
   *   c = sin (pi/8)
45
   *   d = cos (pi/16)
46
   *   e = cos (3*pi/16)
47
   *   f = sin (3*pi/16)
48
   *   g = sin (pi/16)
49
   *
50
   * For fixed-point calculations, a..g are multiplied by sqrt(8) * 2**scale
51
   * where scale = 13 or 14, depending upon accuracy desired.
52
   * Multiplying by sqrt(8) causes a to be a power of two.
53
   * This way a*x0 and a*x4 can be calculated using shifts, saving two multipliers.
54
   *
55
   * There's a Call For Proposals for a Fixed-Point 8x8 IDCT within MPEG.
56
   * This implementation follows one of the proposals:
57
   * "Systematic approach of Fixed Point 8x8 IDCT and DCT Design and Implementation",
58
   * Zhang, Wang, Yu.
59
   *
60
   * We choose:
61
   *   scheme = 4
62
   *   scale = 14
63
   *   row_shift = 10
64
   *   col_shift = 21
65
   *
66
   * Calculation of theoretical register sizes:
67
   *   sample = 8 (in mpeg2 video)
68
   *   input of idct_row:
69
   *     input_bits =  sample_bits + 4 = 8 + 4 = 12 (Form. 6)
70
   *   output of idct_row:
71
   *     output_bits_row = scale - row_shift + sample_bits + 5 = 14 - 10 + 8 + 5 = 17 (Form. 11)
72
   *   size of internal registers during calculation of idct_row:
73
   *     max_inter_bits_row = scale + sample_bits + 5 = 14 + 8 + 5 = 27
74
   *   output of idct_col:
75
   *     output_bits_col = sample_bits  + 3 = 8 + 3 = 11 (Form. 12)
76
   *   size of internal registers during calculation of idct_col:
77
   *     max_inter_bits_col = col_shift + sample_bits  + 3 = 21 + 8 + 3 = 32 (Form. 13)
78
   *
79
   * We choose:
80
   *   register for idct_row: 32 bits
81
   *   output of idct_row: 22 bits
82
   *   registers for idct_col: 42 bits
83
   *   output of idct_col: 21 bits (42-bit registers shifted right by col_shift = 21)
84
   *
85
   */
86
 
87
module idct(clk, clk_en, rst,
            iquant_level, iquant_eob, iquant_valid,
            idct_data, idct_valid, idct_eob);

  /*
   * Top level of the 2-dimensional idct. Data flows through
   *   idct_row -> row2col -> idct_col -> clip_col -> col2row
   * Every stage shares clk, clk_en and rst, and hands its result to the next
   * stage together with a "valid" strobe.
   */

  input              clk;                       // clock
  input              clk_en;                    // clock enable
  input              rst;                       // synchronous active low reset
  input signed [11:0]iquant_level;              // inverse quantized dct coefficient
  input              iquant_eob;                // asserted at last inverse quantized dct coefficient of block; only read by the DEBUG trace in this module
  input              iquant_valid;              // asserted when inverse quantized dct coefficient valid
  output signed [8:0]idct_data;                 // idct result, clipped to 9 bits, back in row order
  output             idct_eob;                  // driven by the dta_out_eob port of the col2row transpose
  output             idct_valid;                // asserted when idct_data, idct_eob valid

  wire signed  [21:0]idct_row_data;             // row transform result
  wire               idct_row_valid;
  wire signed  [21:0]idct_col_data_in;          // row transform result after transposition
  wire               idct_col_valid_in;
  wire signed  [20:0]idct_col_data_out;         // column transform result
  wire               idct_col_valid_out;
  wire signed   [8:0]idct_col_clip_data_out;    // column transform result after clipping
  wire               idct_col_clip_valid_out;

  /* apply 1-d idct to rows */
  idct1d_row      #(.scale(14), .dta_in_width(12), .dta_shift(10), .reg_width(32))
                  idct_row(.clk(clk), .clk_en(clk_en), .rst(rst),
                  .dta_in(iquant_level), .dta_in_valid(iquant_valid),
                  .dta_out(idct_row_data), .dta_out_valid(idct_row_valid));

  /*
   * Result from idct_row is 22 bit wide (reg_width - dta_shift = 32 - 10).
   */

  /* swap rows and columns */
  transpose       #(.dta_width(22))
                  row2col(.clk(clk), .clk_en(clk_en), .rst(rst),
                  .dta_in(idct_row_data), .dta_in_valid(idct_row_valid),
                  .dta_out(idct_col_data_in), .dta_out_valid(idct_col_valid_in));

  /* apply 1-d idct to columns */
  idct1d_col      #(.scale(14), .dta_in_width(22), .dta_shift(21), .reg_width(42))
                  idct_col(.clk(clk), .clk_en(clk_en), .rst(rst),
                  .dta_in(idct_col_data_in), .dta_in_valid(idct_col_valid_in),
                  .dta_out(idct_col_data_out), .dta_out_valid(idct_col_valid_out));

  /*
   * Result from idct_col is 21 bits (reg_width - dta_shift = 42 - 21).
   * Clip to 9 bits.
   */

  clip_col        clip_col(.clk(clk), .clk_en(clk_en), .rst(rst),
                  .dta_in(idct_col_data_out), .dta_in_valid(idct_col_valid_out),
                  .dta_out(idct_col_clip_data_out), .dta_out_valid(idct_col_clip_valid_out));

  /* swap back to rows */
  transpose       #(.dta_width(9))
                  col2row(.clk(clk), .clk_en(clk_en), .rst(rst),
                  .dta_in(idct_col_clip_data_out), .dta_in_valid(idct_col_clip_valid_out),
                  .dta_out(idct_data), .dta_out_valid(idct_valid), .dta_out_eob(idct_eob));

`ifdef DEBUG
/*
 * Simulation-only trace: print the data at each stage boundary whenever that
 * stage's valid strobe is set. The #0 delay postpones the $display within the
 * current time step.
 */
always @(posedge clk)
  if (clk_en && iquant_valid)
    begin
      if (iquant_eob)
        begin
          #0 $display("%m\t\tidct input: %d (eob)", iquant_level);
        end
      else
        begin
          #0 $display("%m\t\tidct input: %d", iquant_level);
        end
    end

always @(posedge clk)
  if (clk_en && idct_row_valid)
    begin
        #0 $display("%m\t\tafter idct_row: %d", idct_row_data);
    end

always @(posedge clk)
  if (clk_en && idct_col_valid_in)
    begin
        #0 $display("%m\t\tafter row2col: %d", idct_col_data_in);
    end

always @(posedge clk)
  if (clk_en && idct_col_valid_out)
    begin
        #0 $display("%m\t\tafter idct_col: %d", idct_col_data_out);
    end

always @(posedge clk)
  if (clk_en && idct_col_clip_valid_out)
    begin
        #0 $display("%m\t\tafter clipping: %d", idct_col_clip_data_out);
    end

always @(posedge clk)
  if (clk_en && idct_valid)
    begin
      #0 $display("%m\t\tafter col2row: %d", idct_data);
    end

`endif

endmodule
194
 
195
/*
196
 * 8-point 1-dimensional inverse discrete cosine transform. Row transform.
197
 */
198
 
199
module idct1d_row (clk, clk_en, rst, dta_in, dta_in_valid, dta_out, dta_out_valid);
  /*
   * Operation:
   *  - dta_in is shifted into q0..q7 whenever dta_in_valid is set.
   *  - Once eight samples have been counted they are copied to x0..x7 and the
   *    state machine steps through STATE_0 .. STATE_7, one result per state.
   *  - STATE_0..STATE_3 produce y0..y3 (even part + odd part), STATE_4..STATE_7
   *    produce y4..y7 (even part - odd part), so results leave in order y0..y7.
   *  - Pipeline: products -> pairwise sums -> sum0246/sum1357 -> y -> dta_out;
   *    dta_out_valid is delayed by the same number of stages.
   *
   * NOTE: the COSVAL_* constants below are fixed at a scaling of 2**14, while
   * the default of the "scale" parameter is 13. "scale" only controls the
   * shift used for the x0 and x4 terms, so instantiate with .scale(14) to keep
   * all eight terms at the same scaling.
   */
  parameter                            dta_in_width=12,          // width of dta_in
                                       dta_shift=11,             // how much to shift result to the right
                                       reg_width=29,             // width of internal registers
                                       scale=13,                 // cosine values scaled by 2**scale
                                       dta_out_width=reg_width-dta_shift, // width of dta_out
                                       cosval_width=16,          // width of COSVAL_A .. COSVAL_G
                                       prod_width=dta_in_width+cosval_width;          // width of COSVAL_i * xi

  input                                clk;                      // clock
  input                                clk_en;                   // clock enable
  input                                rst;                      // synchronous active low reset
  input signed       [dta_in_width-1:0]dta_in;                   // data in
  input                                dta_in_valid;             // asserted when dta_in valid
  output reg signed [dta_out_width-1:0]dta_out;                  // data out - dta_out_width (= reg_width - dta_shift) bits wide
  output reg                           dta_out_valid;            // asserted when dta_out valid

  parameter [cosval_width-1:0]
    COSVAL_A      =  16'sd16384,  /*   SQRT(8)/2 * 2**14 * cos (pi/4) */
    COSVAL_MINUSA = -16'sd16384,  /* - SQRT(8)/2 * 2**14 * cos (pi/4) */
    COSVAL_B      =  16'sd21407,  /*   SQRT(8)/2 * 2**14 * cos (pi/8) */
    COSVAL_MINUSB = -16'sd21407,  /* - SQRT(8)/2 * 2**14 * cos (pi/8) */
    COSVAL_C      =  16'sd8867,   /*   SQRT(8)/2 * 2**14 * sin (pi/8) */
    COSVAL_MINUSC = -16'sd8867,   /* - SQRT(8)/2 * 2**14 * sin (pi/8) */
    COSVAL_D      =  16'sd22725,  /*   SQRT(8)/2 * 2**14 * cos (pi/16) */
    COSVAL_MINUSD = -16'sd22725,  /* - SQRT(8)/2 * 2**14 * cos (pi/16) */
    COSVAL_E      =  16'sd19266,  /*   SQRT(8)/2 * 2**14 * cos (3*pi/16) */
    COSVAL_MINUSE = -16'sd19266,  /* - SQRT(8)/2 * 2**14 * cos (3*pi/16) */
    COSVAL_F      =  16'sd12873,  /*   SQRT(8)/2 * 2**14 * sin (3*pi/16) */
    COSVAL_MINUSF = -16'sd12873,  /* - SQRT(8)/2 * 2**14 * sin (3*pi/16) */
    COSVAL_G      =  16'sd4520,   /*   SQRT(8)/2 * 2**14 * sin (pi/16) */
    COSVAL_MINUSG = -16'sd4520;   /* - SQRT(8)/2 * 2**14 * sin (pi/16) */

  /* dct coefficients input shift register */
  reg signed [dta_in_width-1:0]q0;
  reg signed [dta_in_width-1:0]q1;
  reg signed [dta_in_width-1:0]q2;
  reg signed [dta_in_width-1:0]q3;
  reg signed [dta_in_width-1:0]q4;
  reg signed [dta_in_width-1:0]q5;
  reg signed [dta_in_width-1:0]q6;
  reg signed [dta_in_width-1:0]q7;

  /* the eight samples currently being transformed */
  reg signed [dta_in_width-1:0]x0;
  reg signed [dta_in_width-1:0]x1;
  reg signed [dta_in_width-1:0]x2;
  reg signed [dta_in_width-1:0]x3;
  reg signed [dta_in_width-1:0]x4;
  reg signed   [dta_in_width:0]minus_x4; // needs one bit more than x4, else two's complement of most negative x4 doesn't fit.
  reg signed [dta_in_width-1:0]x5;
  reg signed [dta_in_width-1:0]x6;
  reg signed [dta_in_width-1:0]x7;

  /* coefficient applied to xi in the current state; x0 and x4 use shifts instead */
  reg signed [cosval_width-1:0]cos1;
  reg signed [cosval_width-1:0]cos2;
  reg signed [cosval_width-1:0]cos3;
  reg signed [cosval_width-1:0]cos5;
  reg signed [cosval_width-1:0]cos6;
  reg signed [cosval_width-1:0]cos7;

  reg signed [prod_width-1:0]prod0; // product of xi * cosvali
  reg signed [prod_width-1:0]prod1;
  reg signed [prod_width-1:0]prod2;
  reg signed [prod_width-1:0]prod3;
  reg signed [prod_width-1:0]prod4;
  reg signed [prod_width-1:0]prod5;
  reg signed [prod_width-1:0]prod6;
  reg signed [prod_width-1:0]prod7;

  reg signed [reg_width-1:0]sum02; // sum of prodi and prodj
  reg signed [reg_width-1:0]sum46;
  reg signed [reg_width-1:0]sum13;
  reg signed [reg_width-1:0]sum57;
  reg signed [reg_width-1:0]sum0246; // sum of sumij and sumpq
  reg signed [reg_width-1:0]sum1357;

  reg signed [reg_width-1:0]y; // y sum or difference of sum0246 and sum1357

  reg [3:0]dta_in_cntr; // number of samples shifted in since the last block was handed over; 0..8

  /* delay line which keeps dta_out_valid aligned with dta_out */
  reg dta_out_val_0;
  reg dta_out_val_1;
  reg dta_out_val_2;
  reg dta_out_val_3;

  /* delay line which keeps the add/subtract select aligned with sum0246/sum1357 */
  reg add_0;
  reg add_1;
  reg add_2;

  // an offset which is added to x0 to round the results: 2**(dta_shift-1), half of the final divisor.
  parameter signed [reg_width-1:0] offset = {2'b01, {(dta_shift-1){1'b0}}};

  parameter [3:0]
    STATE_IDLE  = 4'd0,
    STATE_0     = 4'd1,
    STATE_1     = 4'd2,
    STATE_2     = 4'd3,
    STATE_3     = 4'd4,
    STATE_4     = 4'd5,
    STATE_5     = 4'd6,
    STATE_6     = 4'd7,
    STATE_7     = 4'd8;

  reg [3:0]state;
  reg [3:0]next;

  /*
   * IDCT data input
   */

  /* input shift register; the oldest sample ends up in q0 */
  always @(posedge clk)
    if (~rst)
      begin
        q0 <= 'sd0;
        q1 <= 'sd0;
        q2 <= 'sd0;
        q3 <= 'sd0;
        q4 <= 'sd0;
        q5 <= 'sd0;
        q6 <= 'sd0;
        q7 <= 'sd0;
      end
    else if (clk_en && dta_in_valid)
      begin
        q0 <= q1;
        q1 <= q2;
        q2 <= q3;
        q3 <= q4;
        q4 <= q5;
        q5 <= q6;
        q6 <= q7;
        q7 <= dta_in;
      end

  /* copy a complete set of eight samples out of the shift register */
  always @(posedge clk)
    if (~rst)
      begin
        x0 <= 'sd0;
        x1 <= 'sd0;
        x2 <= 'sd0;
        x3 <= 'sd0;
        x4 <= 'sd0;
        minus_x4 <= 'sd0;
        x5 <= 'sd0;
        x6 <= 'sd0;
        x7 <= 'sd0;
      end
    else if (clk_en && (dta_in_cntr == 4'd8))
      begin
        x0 <= q0;
        x1 <= q1;
        x2 <= q2;
        x3 <= q3;
        x4 <= q4;
        minus_x4 <= ~{q4[dta_in_width-1], q4}+1'b1; // -q4, computed on the sign-extended value
        x5 <= q5;
        x6 <= q6;
        x7 <= q7;
      end

  /*
   * input counter.
   * When the count is 8 the block is handed over to x0..x7; a sample arriving
   * in that same cycle already belongs to the next block, so restart at 1.
   */
  always @(posedge clk)
    if (~rst) dta_in_cntr <= 4'd0;
    else if (clk_en)
      begin
        if (dta_in_cntr == 4'd8) dta_in_cntr <= dta_in_valid ? 4'd1 : 4'd0;
        else if (dta_in_valid) dta_in_cntr <= dta_in_cntr + 4'd1;
      end

  /*
   * IDCT calculation
   */

  /* next state logic */
  always @*
    case (state)
      STATE_IDLE:   if (dta_in_cntr == 4'd8) next = STATE_0;
                    else next = STATE_IDLE;
      STATE_0:      next = STATE_1;
      STATE_1:      next = STATE_2;
      STATE_2:      next = STATE_3;
      STATE_3:      next = STATE_4;
      STATE_4:      next = STATE_5;
      STATE_5:      next = STATE_6;
      STATE_6:      next = STATE_7;
      STATE_7:      if (dta_in_cntr == 4'd8) next = STATE_0; // next block already complete: keep streaming
                    else next = STATE_IDLE;
      default       next = STATE_IDLE;
    endcase

  /* state */
  always @(posedge clk)
    if (~rst) state <= STATE_IDLE;
    else if (clk_en) state <= next;

  /*
   * Coefficient schedule.
   * The values loaded during a state are the ones used during the following
   * state. Reset, idle, STATE_6 and STATE_7 all load the first matrix row,
   * which is what STATE_0 (y0) and STATE_7 (y7) need.
   */
  always @(posedge clk)
    if (~rst)
      begin
        cos1 <= COSVAL_D;
        cos2 <= COSVAL_B;
        cos3 <= COSVAL_E;
        cos5 <= COSVAL_F;
        cos6 <= COSVAL_C;
        cos7 <= COSVAL_G;
      end
    else if (clk_en)
      case (state)
        STATE_0,
        STATE_5: // coefficients for y1 / y6
          begin
            cos1 <= COSVAL_E;
            cos2 <= COSVAL_C;
            cos3 <= COSVAL_MINUSG;
            cos5 <= COSVAL_MINUSD;
            cos6 <= COSVAL_MINUSB;
            cos7 <= COSVAL_MINUSF;
          end
        STATE_1,
        STATE_4: // coefficients for y2 / y5
          begin
            cos1 <= COSVAL_F;
            cos2 <= COSVAL_MINUSC;
            cos3 <= COSVAL_MINUSD;
            cos5 <= COSVAL_G;
            cos6 <= COSVAL_B;
            cos7 <= COSVAL_E;
          end
        STATE_2,
        STATE_3: // coefficients for y3 / y4
          begin
            cos1 <= COSVAL_G;
            cos2 <= COSVAL_MINUSB;
            cos3 <= COSVAL_MINUSF;
            cos5 <= COSVAL_E;
            cos6 <= COSVAL_MINUSC;
            cos7 <= COSVAL_MINUSD;
          end
        default  // STATE_6, STATE_7, idle: coefficients for y0 / y7
          begin
            cos1 <= COSVAL_D;
            cos2 <= COSVAL_B;
            cos3 <= COSVAL_E;
            cos5 <= COSVAL_F;
            cos6 <= COSVAL_C;
            cos7 <= COSVAL_G;
          end
      endcase

  /* products and adder tree */
  always @(posedge clk)
    if (~rst)
      begin
        prod0 <= 'sd0;
        prod1 <= 'sd0;
        prod2 <= 'sd0;
        prod3 <= 'sd0;
        prod4 <= 'sd0;
        prod5 <= 'sd0;
        prod6 <= 'sd0;
        prod7 <= 'sd0;
        sum02 <= 'sd0;
        sum46 <= 'sd0;
        sum13 <= 'sd0;
        sum57 <= 'sd0;
        sum0246 <= 'sd0;
        sum1357 <= 'sd0;
      end
    else if (clk_en)
      begin
        /*
         * Next line implements
         * prod0 <= (cos0 * x0) + offset;
         * using shifts; offset added for proper rounding. Avoids a multiplier.
         */
        prod0 <= {{(reg_width - dta_in_width){x0[dta_in_width-1]}}, x0, {scale{1'b0}}} + offset;

        /*
         * These ought to map to a hardware multiplier in the fpga.
         */
        prod1 <= cos1 * x1;
        prod2 <= cos2 * x2;
        prod3 <= cos3 * x3;
        prod5 <= cos5 * x5;
        prod6 <= cos6 * x6;
        prod7 <= cos7 * x7;

        /*
         * case implements
         *  prod4 <= cos4 * x4;
         * using shifts, saving a multiplier.
         * The coefficient of x4 is -a for y1, y2, y5 and y6, +a otherwise.
         */
        case (state)
          STATE_1,
          STATE_2,
          STATE_5,
          STATE_6: prod4 <=  {{(reg_width - dta_in_width-1){minus_x4[dta_in_width]}}, minus_x4, {scale{1'b0}}};
          default  prod4 <=  {{(reg_width - dta_in_width){x4[dta_in_width-1]}}, x4, {scale{1'b0}}};
        endcase

        /* sign-extend the products to reg_width before adding */
        sum02 <= {{(reg_width-prod_width){prod0[prod_width-1]}}, prod0} + {{(reg_width-prod_width){prod2[prod_width-1]}}, prod2};
        sum46 <= {{(reg_width-prod_width){prod4[prod_width-1]}}, prod4} + {{(reg_width-prod_width){prod6[prod_width-1]}}, prod6};
        sum13 <= {{(reg_width-prod_width){prod1[prod_width-1]}}, prod1} + {{(reg_width-prod_width){prod3[prod_width-1]}}, prod3};
        sum57 <= {{(reg_width-prod_width){prod5[prod_width-1]}}, prod5} + {{(reg_width-prod_width){prod7[prod_width-1]}}, prod7};
        sum0246 <= sum02 + sum46;
        sum1357 <= sum13 + sum57;
      end

  /* valid strobe, delayed to match the calculation pipeline */
  always @(posedge clk)
    if (~rst)
      begin
        dta_out_val_0 <= 1'b0;
        dta_out_val_1 <= 1'b0;
        dta_out_val_2 <= 1'b0;
        dta_out_val_3 <= 1'b0;
        dta_out_valid <= 1'b0;
      end
    else if (clk_en)
      begin
        dta_out_val_0 <= (state != STATE_IDLE);
        dta_out_val_1 <= dta_out_val_0;
        dta_out_val_2 <= dta_out_val_1;
        dta_out_val_3 <= dta_out_val_2;
        dta_out_valid <= dta_out_val_3;
      end

  /*
   * Looking at the equation for the 1d idct, the final step when calculating
   * y0..y3 is addition, when calculating y4..y7 subtraction.
   * register add_0 is 1 when one needs to add, 0 when one needs to subtract.
   */

  always @(posedge clk)
    if (~rst)
      add_0 <= 1'b0;
    else if (clk_en)
      case (state)
        STATE_0,
        STATE_1,
        STATE_2,
        STATE_3:       add_0 <= 1'b1;
        default        add_0 <= 1'b0;
      endcase

  always @(posedge clk)
    if (~rst)
      begin
        add_1 <= 1'b0;
        add_2 <= 1'b0;
      end
    else if (clk_en)
      begin
        add_1 <= add_0;
        add_2 <= add_1;
      end

  /* combine even and odd part */
  always @(posedge clk)
    if (~rst)
      y <= 'sd0;
    else if (clk_en)
      begin
        if (add_2) y <= sum0246 + sum1357;
        else y <= sum0246 - sum1357;
      end

  /* scale back; arithmetic shift keeps the sign, rounding was prepared by adding offset to prod0 */
  always @(posedge clk)
    if (~rst) dta_out <= 'sd0;
    else if (clk_en) dta_out <=  y >>> dta_shift;

`ifdef DEBUG_IDCT_1D
  /* simulation-only dump of the whole pipeline, once per clock */
  always @(posedge clk)
    begin
      $strobe("%m\toffset: %d", offset);
      $strobe("%m\tcos0: -------- cos1: %8d cos2: %8d cos3: %8d cos4: -------- cos5: %8d cos6: %8d cos7: %8d", cos1, cos2, cos3, cos5, cos6, cos7);
      $strobe("%m\t  x0: %8d   x1: %8d   x2: %8d   x3: %8d   x4: %8d   x5: %8d   x6: %8d   x7: %8d",   x0,   x1,   x2,   x3,   x4,   x5,   x6,   x7);
      $strobe("%m\tprod0: %d prod1: %d prod2: %d prod3: %d prod4: %d prod5: %d prod6: %d prod7: %d", prod0, prod1, prod2, prod3, prod4, prod5, prod6, prod7);
      $strobe("%m\tsum02: %8d sum46: %8d sum13: %8d sum57: %8d", sum02, sum46, sum13, sum57);
      $strobe("%m\tsum0246: %8d sum1357: %8d", sum0246, sum1357);
      $strobe("%m\ty: %8d", y);
      $strobe("%m\tdta_out: %8d", dta_out);
    end
`endif

endmodule
705
 
706
/*
707
 * 8-point 1-dimensional inverse discrete cosine transform. Column transform.
708
 *
709
 * Mathematically identical to the row transform.
710
 * However, the 22x16 multipliers have not been implemented as two 18x18 multipliers,
711
 * but as an 18x18 multiplier with a few shifters and adders added.
712
 * This saves six multipliers. Clock speed improves, too.
713
 */
714
 
715
module idct1d_col (clk, clk_en, rst, dta_in, dta_in_valid, dta_out, dta_out_valid);
  /*
   * Column pass of the 8-point 1-dimensional idct.
   *
   * Eight coefficients are shifted in through q0..q7, copied to x0..x7 once
   * eight have arrived, and then one output sample is computed per state
   * (STATE_0 .. STATE_7). prod0 and prod4 are built by shifting, the six
   * other products come from mult22x16 instances.
   * All registers are clocked on posedge clk, gated by clk_en, and cleared
   * by the synchronous active-low reset rst.
   */
  parameter dta_in_width  = 12;                      // width of dta_in
  parameter dta_shift     = 11;                      // how much to shift result to the right
  parameter reg_width     = 29;                      // width of internal registers
  // Number of zero bits appended to x0 and x4 when forming prod0 and prod4.
  // NOTE(review): COSVAL_A below equals 2**14 while the default here is 13;
  // confirm the instantiation overrides scale accordingly.
  parameter scale         = 13;
  parameter dta_out_width = reg_width-dta_shift;     // width of dta_out
  parameter cosval_width  = 16;                      // width of COSVAL_A .. COSVAL_G
  parameter prod_width    = dta_in_width+cosval_width; // width of COSVAL_i * xi

  input                                clk;          // clock
  input                                clk_en;       // clock enable
  input                                rst;          // synchronous active low reset
  input signed       [dta_in_width-1:0]dta_in;       // data in
  input                                dta_in_valid; // high when dta_in carries a coefficient
  output reg signed [dta_out_width-1:0]dta_out;      // data out, reg_width-dta_shift bits (18 with the defaults)
  output reg                           dta_out_valid;// high when dta_out carries a result

  /* cosine table, each entry is SQRT(8)/2 * 2**14 * the stated value */
  parameter [cosval_width-1:0]
    COSVAL_A      =  16'sd16384,  /*   cos (pi/4)     */
    COSVAL_MINUSA = -16'sd16384,  /* - cos (pi/4)     */
    COSVAL_B      =  16'sd21407,  /*   cos (pi/8)     */
    COSVAL_MINUSB = -16'sd21407,  /* - cos (pi/8)     */
    COSVAL_C      =  16'sd8867,   /*   sin (pi/8)     */
    COSVAL_MINUSC = -16'sd8867,   /* - sin (pi/8)     */
    COSVAL_D      =  16'sd22725,  /*   cos (pi/16)    */
    COSVAL_MINUSD = -16'sd22725,  /* - cos (pi/16)    */
    COSVAL_E      =  16'sd19266,  /*   cos (3*pi/16)  */
    COSVAL_MINUSE = -16'sd19266,  /* - cos (3*pi/16)  */
    COSVAL_F      =  16'sd12873,  /*   sin (3*pi/16)  */
    COSVAL_MINUSF = -16'sd12873,  /* - sin (3*pi/16)  */
    COSVAL_G      =  16'sd4520,   /*   sin (pi/16)    */
    COSVAL_MINUSG = -16'sd4520;   /* - sin (pi/16)    */

  // rounding offset, added to the x0 term: a single 1 at bit position dta_shift-1.
  parameter signed [reg_width-1:0] offset = {2'b01, {(dta_shift-1){1'b0}}};

  parameter [3:0]
    STATE_IDLE  = 4'd0,
    STATE_0     = 4'd1,
    STATE_1     = 4'd2,
    STATE_2     = 4'd3,
    STATE_3     = 4'd4,
    STATE_4     = 4'd5,
    STATE_5     = 4'd6,
    STATE_6     = 4'd7,
    STATE_7     = 4'd8;

  /* input shift register */
  reg signed [dta_in_width-1:0]q0;
  reg signed [dta_in_width-1:0]q1;
  reg signed [dta_in_width-1:0]q2;
  reg signed [dta_in_width-1:0]q3;
  reg signed [dta_in_width-1:0]q4;
  reg signed [dta_in_width-1:0]q5;
  reg signed [dta_in_width-1:0]q6;
  reg signed [dta_in_width-1:0]q7;

  /* coefficients held stable while the eight outputs are computed */
  reg signed [dta_in_width-1:0]x0;
  reg signed [dta_in_width-1:0]x1;
  reg signed [dta_in_width-1:0]x2;
  reg signed [dta_in_width-1:0]x3;
  reg signed [dta_in_width-1:0]x4;
  reg signed   [dta_in_width:0]minus_x4; // one bit wider than x4, so that the negation of the most negative x4 fits.
  reg signed [dta_in_width-1:0]x5;
  reg signed [dta_in_width-1:0]x6;
  reg signed [dta_in_width-1:0]x7;

  /* cosine factor currently applied to x1, x2, x3, x5, x6, x7 */
  reg signed [cosval_width-1:0]cos1;
  reg signed [cosval_width-1:0]cos2;
  reg signed [cosval_width-1:0]cos3;
  reg signed [cosval_width-1:0]cos5;
  reg signed [cosval_width-1:0]cos6;
  reg signed [cosval_width-1:0]cos7;

  /* products xi * cosvali */
  reg  signed [prod_width-1:0]prod0;
  reg  signed [prod_width-1:0]prod0_delayed;
  wire signed [prod_width-1:0]prod1;
  wire signed [prod_width-1:0]prod2;
  wire signed [prod_width-1:0]prod3;
  reg  signed [prod_width-1:0]prod4;
  reg  signed [prod_width-1:0]prod4_delayed;
  wire signed [prod_width-1:0]prod5;
  wire signed [prod_width-1:0]prod6;
  wire signed [prod_width-1:0]prod7;

  /* adder tree */
  reg signed [reg_width-1:0]sum02;   // prod0 + prod2
  reg signed [reg_width-1:0]sum46;   // prod4 + prod6
  reg signed [reg_width-1:0]sum13;   // prod1 + prod3
  reg signed [reg_width-1:0]sum57;   // prod5 + prod7
  reg signed [reg_width-1:0]sum0246; // sum02 + sum46
  reg signed [reg_width-1:0]sum1357; // sum13 + sum57

  reg signed [reg_width-1:0]y;       // sum or difference of sum0246 and sum1357

  reg [3:0]dta_in_cntr;              // number of coefficients shifted in, 0..8

  /* delay line for dta_out_valid */
  reg dta_out_val_0;
  reg dta_out_val_1;
  reg dta_out_val_2;
  reg dta_out_val_3;
  reg dta_out_val_4;

  /* delay line for the add/subtract select */
  reg add_0;
  reg add_1;
  reg add_2;
  reg add_3;

  reg [3:0]state;
  reg [3:0]next;

  /*
   * IDCT data input
   */

  /* shift a new coefficient in at q7 whenever dta_in is valid */
  always @(posedge clk)
    if (~rst)
      {q0, q1, q2, q3, q4, q5, q6, q7} <= {(8*dta_in_width){1'b0}};
    else if (clk_en && dta_in_valid)
      {q0, q1, q2, q3, q4, q5, q6, q7} <= {q1, q2, q3, q4, q5, q6, q7, dta_in};

  /* once eight coefficients are in, copy them to x0..x7 */
  always @(posedge clk)
    if (~rst)
      begin
        {x0, x1, x2, x3, x4, x5, x6, x7} <= {(8*dta_in_width){1'b0}};
        minus_x4 <= 'sd0;
      end
    else if (clk_en && (dta_in_cntr == 4'd8))
      begin
        {x0, x1, x2, x3, x4, x5, x6, x7} <= {q0, q1, q2, q3, q4, q5, q6, q7};
        minus_x4 <= ~{q4[dta_in_width-1], q4}+1'b1; // two's complement of sign-extended q4
      end

  /* input counter: restarts at 1 if a coefficient arrives in the cycle the count is 8, else at 0 */
  always @(posedge clk)
    if (~rst)
      dta_in_cntr <= 4'd0;
    else if (clk_en)
      begin
        if (dta_in_cntr == 4'd8)
          begin
            if (dta_in_valid) dta_in_cntr <= 4'd1;
            else dta_in_cntr <= 4'd0;
          end
        else if (dta_in_valid)
          dta_in_cntr <= dta_in_cntr + 4'd1;
      end

  /*
   * IDCT calculation
   */

  /* next state logic: STATE_0 .. STATE_7 run back to back; a new run starts when eight coefficients are waiting */
  always @*
    begin
      next = STATE_IDLE;
      case (state)
        STATE_IDLE,
        STATE_7:      if (dta_in_cntr == 4'd8) next = STATE_0;
        STATE_0:      next = STATE_1;
        STATE_1:      next = STATE_2;
        STATE_2:      next = STATE_3;
        STATE_3:      next = STATE_4;
        STATE_4:      next = STATE_5;
        STATE_5:      next = STATE_6;
        STATE_6:      next = STATE_7;
        default:      next = STATE_IDLE;
      endcase
    end

  /* state */
  always @(posedge clk)
    if (~rst) state <= STATE_IDLE;
    else if (clk_en) state <= next;

  /*
   * Cosine factor selection. The table is mirror-symmetric:
   * STATE_0/STATE_5, STATE_1/STATE_4 and STATE_2/STATE_3 load the same
   * values; every other state loads the same values as reset.
   */
  always @(posedge clk)
    if (~rst)
      begin
        cos1 <= COSVAL_D;
        cos2 <= COSVAL_B;
        cos3 <= COSVAL_E;
        cos5 <= COSVAL_F;
        cos6 <= COSVAL_C;
        cos7 <= COSVAL_G;
      end
    else if (clk_en)
      case (state)
        STATE_0,
        STATE_5:
          begin
            cos1 <= COSVAL_E;
            cos2 <= COSVAL_C;
            cos3 <= COSVAL_MINUSG;
            cos5 <= COSVAL_MINUSD;
            cos6 <= COSVAL_MINUSB;
            cos7 <= COSVAL_MINUSF;
          end
        STATE_1,
        STATE_4:
          begin
            cos1 <= COSVAL_F;
            cos2 <= COSVAL_MINUSC;
            cos3 <= COSVAL_MINUSD;
            cos5 <= COSVAL_G;
            cos6 <= COSVAL_B;
            cos7 <= COSVAL_E;
          end
        STATE_2,
        STATE_3:
          begin
            cos1 <= COSVAL_G;
            cos2 <= COSVAL_MINUSB;
            cos3 <= COSVAL_MINUSF;
            cos5 <= COSVAL_E;
            cos6 <= COSVAL_MINUSC;
            cos7 <= COSVAL_MINUSD;
          end
        default:
          begin
            cos1 <= COSVAL_D;
            cos2 <= COSVAL_B;
            cos3 <= COSVAL_E;
            cos5 <= COSVAL_F;
            cos6 <= COSVAL_C;
            cos7 <= COSVAL_G;
          end
      endcase

  /*
   * Products.
   * prod0 and prod4 are formed by shifting instead of multiplying and pass
   * through two registers (prodN_delayed, prodN).
   */

  /* prod0: x0 shifted left by scale, plus the rounding offset */
  always @(posedge clk)
    if (~rst)
      begin
        prod0_delayed <= 'sd0;
        prod0 <= 'sd0;
      end
    else if (clk_en)
      begin
        prod0_delayed <= {{(reg_width - dta_in_width){x0[dta_in_width-1]}}, x0, {scale{1'b0}}} + offset;
        prod0 <= prod0_delayed;
      end

  mult22x16 mult_prod1(.clk(clk), .clk_en(clk_en), .rst(rst), .product(prod1), .multiplicand(cos1), .multiplier(x1)); /* prod1 <= cos1 * x1; */
  mult22x16 mult_prod2(.clk(clk), .clk_en(clk_en), .rst(rst), .product(prod2), .multiplicand(cos2), .multiplier(x2)); /* prod2 <= cos2 * x2; */
  mult22x16 mult_prod3(.clk(clk), .clk_en(clk_en), .rst(rst), .product(prod3), .multiplicand(cos3), .multiplier(x3)); /* prod3 <= cos3 * x3; */

  /* prod4: x4 or minus_x4, depending on state, shifted left by scale */
  always @(posedge clk)
    if (~rst)
      begin
        prod4_delayed <= 'sd0;
        prod4 <= 'sd0;
      end
    else if (clk_en)
      begin
        case (state)
          STATE_1,
          STATE_2,
          STATE_5,
          STATE_6: prod4_delayed <=  {{(reg_width - dta_in_width-1){minus_x4[dta_in_width]}}, minus_x4, {scale{1'b0}}};
          default: prod4_delayed <=  {{(reg_width - dta_in_width){x4[dta_in_width-1]}}, x4, {scale{1'b0}}};
        endcase
        prod4 <= prod4_delayed;
      end

  mult22x16 mult_prod5(.clk(clk), .clk_en(clk_en), .rst(rst), .product(prod5), .multiplicand(cos5), .multiplier(x5)); /* prod5 <= cos5 * x5; */
  mult22x16 mult_prod6(.clk(clk), .clk_en(clk_en), .rst(rst), .product(prod6), .multiplicand(cos6), .multiplier(x6)); /* prod6 <= cos6 * x6; */
  mult22x16 mult_prod7(.clk(clk), .clk_en(clk_en), .rst(rst), .product(prod7), .multiplicand(cos7), .multiplier(x7)); /* prod7 <= cos7 * x7; */

  /* two-level adder tree; products are sign-extended to reg_width first */
  always @(posedge clk)
    if (~rst)
      begin
        sum02 <= 'sd0;
        sum46 <= 'sd0;
        sum13 <= 'sd0;
        sum57 <= 'sd0;
        sum0246 <= 'sd0;
        sum1357 <= 'sd0;
      end
    else if (clk_en)
      begin
        sum02 <= {{(reg_width-prod_width){prod0[prod_width-1]}}, prod0} + {{(reg_width-prod_width){prod2[prod_width-1]}}, prod2};
        sum46 <= {{(reg_width-prod_width){prod4[prod_width-1]}}, prod4} + {{(reg_width-prod_width){prod6[prod_width-1]}}, prod6};
        sum13 <= {{(reg_width-prod_width){prod1[prod_width-1]}}, prod1} + {{(reg_width-prod_width){prod3[prod_width-1]}}, prod3};
        sum57 <= {{(reg_width-prod_width){prod5[prod_width-1]}}, prod5} + {{(reg_width-prod_width){prod7[prod_width-1]}}, prod7};
        sum0246 <= sum02 + sum46;
        sum1357 <= sum13 + sum57;
      end

  /* dta_out_valid: "state is not idle", delayed by six register stages */
  always @(posedge clk)
    if (~rst)
      begin
        dta_out_val_0 <= 1'b0;
        dta_out_val_1 <= 1'b0;
        dta_out_val_2 <= 1'b0;
        dta_out_val_3 <= 1'b0;
        dta_out_val_4 <= 1'b0;
        dta_out_valid <= 1'b0;
      end
    else if (clk_en)
      begin
        dta_out_val_0 <= (state != STATE_IDLE);
        dta_out_val_1 <= dta_out_val_0;
        dta_out_val_2 <= dta_out_val_1;
        dta_out_val_3 <= dta_out_val_2;
        dta_out_val_4 <= dta_out_val_3;
        dta_out_valid <= dta_out_val_4;
      end

  /*
   * Looking at the equation for the 1d idct, the final step when calculating
   * y0..y3 is addition, when calculating y4..y7 subtraction.
   * Register add_0 is 1 when one needs to add, 0 when one needs to subtract;
   * add_1..add_3 delay it until it is applied to y.
   */

  always @(posedge clk)
    if (~rst)
      add_0 <= 1'd0;
    else if (clk_en)
      case (state)
        STATE_0,
        STATE_1,
        STATE_2,
        STATE_3:       add_0 <= 1'b1;
        default:       add_0 <= 1'b0;
      endcase

  always @(posedge clk)
    if (~rst)
      begin
        add_1 <= 1'b0;
        add_2 <= 1'b0;
        add_3 <= 1'b0;
      end
    else if (clk_en)
      begin
        add_1 <= add_0;
        add_2 <= add_1;
        add_3 <= add_2;
      end

  /* final butterfly */
  always @(posedge clk)
    if (~rst)
      y <= 'sd0;
    else if (clk_en)
      begin
        if (add_3) y <= sum0246 + sum1357;
        else y <= sum0246 - sum1357;
      end

  /* scale the result down; y is signed, so >>> is an arithmetic shift */
  always @(posedge clk)
    if (~rst) dta_out <= 'sd0;
    else if (clk_en) dta_out <=  y >>> dta_shift;

`ifdef DEBUG_IDCT_1D
  always @(posedge clk)
    begin
      $strobe("%m\toffset: %d", offset);
      $strobe("%m\tcos0: -------- cos1: %8d cos2: %8d cos3: %8d cos4: -------- cos5: %8d cos6: %8d cos7: %8d", cos1, cos2, cos3, cos5, cos6, cos7);
      $strobe("%m\t  x0: %8d   x1: %8d   x2: %8d   x3: %8d   x4: %8d   x5: %8d   x6: %8d   x7: %8d",   x0,   x1,   x2,   x3,   x4,   x5,   x6,   x7);
      $strobe("%m\tprod0: %d prod1: %d prod2: %d prod3: %d prod4: %d prod5: %d prod6: %d prod7: %d", prod0, prod1, prod2, prod3, prod4, prod5, prod6, prod7);
      $strobe("%m\tsum02: %8d sum46: %8d sum13: %8d sum57: %8d", sum02, sum46, sum13, sum57);
      $strobe("%m\tsum0246: %8d sum1357: %8d", sum0246, sum1357);
      $strobe("%m\ty: %8d", y);
      $strobe("%m\tdta_out: %8d", dta_out);
    end
`endif

endmodule
1220
 
1221
/*
1222
 * 8x8 transpose ram. Swaps rows and columns.
1223
 * Modeled after the chasing pointers algorithm described in
1224
 * "Architecture and Bus-Arbitration Schemes for MPEG-2 Video Decoder",
1225
 * Jui-Hua Li and Nam Ling,
1226
 * IEEE Transactions on Circuits and Systems for Video Technology, Vol. 9, No. 5, August 1999, p.727-736.
1227
 */
1228
 
1229
module transpose(clk, clk_en, rst, dta_in, dta_in_valid, dta_out, dta_out_valid, dta_out_eob);
  parameter  dta_width=16;                           // data width;
  input                    clk;                      // clock
  input                    clk_en;                   // clock enable
  input                    rst;                      // synchronous active low reset
  input    [dta_width -1:0]dta_in;                   // data in
  input                    dta_in_valid;             // high when dta_in is to be written
  output   [dta_width -1:0]dta_out;                  // data out, driven by the ram read port
  output reg               dta_out_valid;            // registered "a read was issued" flag
  output reg               dta_out_eob;              // registered (rd_cnt == 63) flag

  reg                      read_direction;           // 1: read address is rd_cnt, 0: rows and columns swapped
  reg                      write_direction;          // 1: write address is wr_cnt, 0: rows and columns swapped
  reg                 [5:0]rd_cnt;
  wire                [5:0]rd_addr;
  reg                 [5:0]wr_cnt;
  wire                [5:0]wr_addr;

  parameter [1:0]
    STATE_IDLE        = 2'd0,
    STATE_READING     = 2'd1;

  reg [1:0]state = STATE_IDLE;
  reg [1:0]next;

  /*
   * We've got one dual-port ram, with simultaneous reads and writes.
   *
   * 1. Wait reading data until address 50 is being written.
   * 2. When address 50 is being written, read address 0.
   * 3. Once address 50 has been written, whenever the write address increments, the read address also increments.
   * 4. When address 63 has been written, writing direction reverses:
   *    if we were writing in row order, we begin writing in column order,
   *    if we were writing in column order, we begin writing in row order.
   * 5. When address 63 has been read, reading direction reverses:
   *    if we were reading in row order, we begin reading in column order,
   *    if we were reading in column order, we begin reading in row order.
   */

  /* high in exactly those cycles in which the read counter advances */
  wire rd_advance = clk_en && (state == STATE_READING) && dta_in_valid;

  /* next state logic: start reading once write count 49 is accepted; only reset returns to idle */
  always @*
    case (state)
      STATE_IDLE:        if (dta_in_valid && (wr_cnt == 6'd49)) next = STATE_READING;
                         else next = STATE_IDLE;

      STATE_READING:     next = STATE_READING;

      default            next = STATE_IDLE;
    endcase

  /* state */
    always @(posedge clk)
      if(~rst) state <= STATE_IDLE;
      else if (clk_en) state <= next;
      else  state <= state;

  /* read counter, wraps from 63 to 0 */
  always @(posedge clk)
    if (~rst) rd_cnt <= 6'b0;
    else if (rd_advance) rd_cnt <= rd_cnt + 6'd1;
    else rd_cnt <= rd_cnt;

  always @(posedge clk)
    if (~rst) dta_out_valid <= 1'b0;
    else if (clk_en) dta_out_valid <= dta_in_valid && (state == STATE_READING);
    else dta_out_valid <= dta_out_valid;

  /*
   * NOTE(review): dta_out_eob is not qualified by dta_in_valid, so it stays
   * high for as long as rd_cnt rests at 63. Left unchanged because consumers
   * are not visible here; presumably they qualify it with dta_out_valid.
   */
  always @(posedge clk)
    if (~rst) dta_out_eob <= 1'b0;
    else if (clk_en) dta_out_eob <= (rd_cnt == 6'd63);
    else dta_out_eob <= dta_out_eob;

  /* write counter, wraps from 63 to 0 */
  always @(posedge clk)
    if (~rst) wr_cnt <= 6'b0;
    else if (clk_en && dta_in_valid) wr_cnt <= wr_cnt + 6'd1;
    else wr_cnt <= wr_cnt;

  /*
   * Reverse the read direction only in the cycle rd_cnt wraps from 63 to 0.
   * Toggling on (rd_cnt == 63) alone would flip the direction once per clock
   * for as long as the input stalls at that count, leaving it wrong for the
   * next block whenever the stall lasts an odd number of cycles.
   */
  always @(posedge clk)
    if (~rst) read_direction <= 1'b0;
    else if (rd_advance && (rd_cnt == 6'd63)) read_direction <= ~read_direction;
    else read_direction <= read_direction;

  /* reverse the write direction in the cycle wr_cnt wraps from 63 to 0 */
  always @(posedge clk)
    if (~rst) write_direction <= 1'b1;
    else if (clk_en && dta_in_valid && (wr_cnt == 6'd63)) write_direction <= ~write_direction;
    else write_direction <= write_direction;

  assign wr_addr = write_direction ? wr_cnt : {wr_cnt[2:0], wr_cnt[5:3]}; // swap rows and columns in address

  assign rd_addr = read_direction ? rd_cnt : {rd_cnt[2:0], rd_cnt[5:3]}; // swap rows and columns in address

  /* transposition memory */

  // NOTE(review): rrst/wrst are described as active high by the ram, yet are
  // driven by the active-low rst; confirm against generic_dpram.
  generic_dpram
    #(.aw(6),                                                 // number of bits in address bus
    .dw(dta_width))                                           // number of bits in data bus
    ram0 (
    // read port
    .rclk(clk),                                               // read clock, rising edge trigger
    .rrst(rst),                                               // read port reset
    .rce(1'b1),                                               // read port chip enable, active high
    .oe(1'b1),                                                // output enable, active high
    .raddr(rd_addr),                                          // read address
    .do(dta_out),                                             // data output
    // write port
    .wclk(clk),                                               // write clock, rising edge trigger
    .wrst(rst),                                               // write port reset
    .wce(clk_en),                                             // write port chip enable, active high
    .we(dta_in_valid),                                        // write enable, active high
    .waddr(wr_addr),                                          // write address, in row or column order
    .di(dta_in)                                               // data input
    );

`ifdef DEBUG_TRANSPOSE
  always @(posedge clk)
    begin
      case (state)
        STATE_IDLE:       $strobe("%m\tSTATE_IDLE       wr_cnt: %d rd_cnt: %d wr_dir: %d rd_dir: %d dta_in: %d dta_in_valid: %d dta_out: %d dta_out_valid: %d dta_out_eob: %d", wr_cnt, rd_cnt, write_direction, read_direction, dta_in, dta_in_valid, dta_out, dta_out_valid, dta_out_eob);
        STATE_READING:    $strobe("%m\tSTATE_READING    wr_cnt: %d rd_cnt: %d wr_dir: %d rd_dir: %d dta_in: %d dta_in_valid: %d dta_out: %d dta_out_valid: %d dta_out_eob: %d", wr_cnt, rd_cnt, write_direction, read_direction, dta_in, dta_in_valid, dta_out, dta_out_valid, dta_out_eob);
        default           $strobe("%m\t***Unknown state wr_cnt: %d rd_cnt: %d wr_dir: %d rd_dir: %d dta_in: %d dta_in_valid: %d dta_out: %d dta_out_valid: %d dta_out_eob: %d", wr_cnt, rd_cnt, write_direction, read_direction, dta_in, dta_in_valid, dta_out, dta_out_valid, dta_out_eob);
      endcase
    end
`endif

endmodule
1354
 
1355
/*
1356
 * Clips idct output to -256..255
1357
 */
1358
 
1359
module clip_col(clk, clk_en, rst, dta_in, dta_in_valid, dta_out, dta_out_valid);
  input                  clk;                      // clock
  input                  clk_en;                   // clock enable
  input                  rst;                      // synchronous active low reset
  input signed     [20:0]dta_in;                   // data in
  input                  dta_in_valid;
  output reg signed [8:0]dta_out;                  // data out, saturated to 9 bits
  output reg             dta_out_valid;            // dta_in_valid, delayed by one register stage

  // dta_in fits in 9 signed bits when bits 20..8 are all equal (all ones or all zeroes).
  wire [12:0] upper_bits = dta_in[20:8];
  wire        fits       = (upper_bits == 13'h1fff) || (upper_bits == 13'h0000);

  always @(posedge clk)
    if (~rst)
      dta_out <= 9'sd0;
    else if (clk_en)
      begin
        if (fits) dta_out <= dta_in[8:0];                     // in range, pass through
        else dta_out <= {dta_in[20], {8{~dta_in[20]}}};       // clipping: 255 if positive, -256 if negative
      end

  always @(posedge clk)
    if (~rst) dta_out_valid <= 1'b0;
    else if (clk_en) dta_out_valid <= dta_in_valid;

endmodule
1380
 
1381
module mult22x16(clk, clk_en, rst, product, multiplicand, multiplier);
   input         clk;                       // clock
   input         clk_en;                    // clock enable
   input         rst;                       // synchronous active low reset
   input signed [21:0]  multiplier;
   input signed [15:0]  multiplicand;
   output reg signed [37:0] product;        // multiplier * multiplicand, two register stages after the inputs

/*
 * Implements
 *   always @(posedge clk)
 *     product <= multiplier * multiplicand;
 * with a single 18x16 signed multiplication plus a few shifts and adders:
 * the multiplier is split into its upper 18 bits (signed) and its lower
 * 4 bits (unsigned); the lower part is multiplied by shift-and-add.
 *
 * See "Expanding Virtex-II" by Ken Chapman, Xilinx UK, 06/30/2001 for
 * a discussion about expanding multipliers.
 */
   wire /* unsigned */  [3:0] multiplier_lsb = multiplier[3:0];
   wire signed         [17:0] multiplier_msb = multiplier[21:4];

   reg signed [19:0] partial_product_1;     // multiplier_lsb * multiplicand
   reg signed [33:0] partial_product_2;     // multiplier_msb * multiplicand

   /* multiplicand times 1, 2, 4 and 8, each sign-extended to 20 bits */
   wire [19:0] mcand_x1 = {{4{multiplicand[15]}}, multiplicand      };
   wire [19:0] mcand_x2 = {{3{multiplicand[15]}}, multiplicand, 1'b0};
   wire [19:0] mcand_x4 = {{2{multiplicand[15]}}, multiplicand, 2'b0};
   wire [19:0] mcand_x8 = {{1{multiplicand[15]}}, multiplicand, 3'b0};

   /* add up the terms selected by the four low multiplier bits */
   wire [19:0] lsb_sum = (multiplier_lsb[0] ? mcand_x1 : 20'b0) +
                         (multiplier_lsb[1] ? mcand_x2 : 20'b0) +
                         (multiplier_lsb[2] ? mcand_x4 : 20'b0) +
                         (multiplier_lsb[3] ? mcand_x8 : 20'b0);

   /* upper partial product weighted by 16, lower one sign-extended to 38 bits */
   wire [37:0] msb_term = {partial_product_2, 4'b0};
   wire [37:0] lsb_term = {{18{partial_product_1[19]}}, partial_product_1};

   always @(posedge clk)
     if (~rst)
       begin
         partial_product_1 <= 20'b0;
         partial_product_2 <= 34'b0;
         product <= 38'b0;
       end
     else if (clk_en)
       begin
         partial_product_1 <= lsb_sum;
         partial_product_2 <= multiplier_msb * multiplicand;
         product <= msb_term + lsb_term;
       end

endmodule
1428
 
1429
/* not truncated */

powered by: WebSVN 2.1.0

© copyright 1999-2019 OpenCores.org, equivalent to Oliscience, all rights reserved. OpenCores®, registered trademark.