OpenCores
URL https://opencores.org/ocsvn/mpeg2fpga/mpeg2fpga/trunk

Subversion Repositories mpeg2fpga

[/] [mpeg2fpga/] [trunk/] [tools/] [mpeg2dec/] [mmxidct.S] - Blame information for rev 2

Details | Compare with Previous | View Log

Line No. Rev Author Line
1 2 kdv
/*
2
 * the input data is transposed and each 16 bit element in the 8x8 matrix
3
 * is left aligned:
4
 * for example in 11...1110000 format
5
 * If the iDCT is of an I macroblock then 0.5 needs to be added to the DC component
6
 * (element[0][0] of the matrix)
7
 */
8
 
9
.data

/*
 * preSC: 8 rows x 8 signed 16-bit prescale multipliers (128 bytes).
 * IDCT_mmx points %ecx at this table and, for each quadword of the input
 * block, loads the quadword at the same byte offset from here and multiplies
 * the two with pmulhw.
 */
        .balign 16
        .type   preSC,@object
preSC:
        .short  16384, 22725, 21407, 19266, 16384, 12873,  8867,  4520  /* row 0 */
        .short  22725, 31521, 29692, 26722, 22725, 17855, 12299,  6270  /* row 1 */
        .short  21407, 29692, 27969, 25172, 21407, 16819, 11585,  5906  /* row 2 */
        .short  19266, 26722, 25172, 22654, 19266, 15137, 10426,  5315  /* row 3 */
        .short  16384, 22725, 21407, 19266, 16384, 12873,  8867,  4520  /* row 4 */
        .short  12873, 17855, 16819, 15137, 25746, 20228, 13933,  7103  /* row 5 */
        .short  17734, 24598, 23170, 20853, 17734, 13933,  9597,  4892  /* row 6 */
        .short  18081, 25080, 23624, 21261, 18081, 14206,  9785,  4988  /* row 7 */
        .size   preSC, . - preSC        /* 8 * 8 * 2 = 128 bytes */
21
        .align 8
22
        .type   x0005000200010001,@object
23
        .size   x0005000200010001,8
24
x0005000200010001:
25
        .long   0x00010001,0x00050002
26
        .align 8
27
        .type   x0040000000000000,@object
28
        .size   x0040000000000000,8
29
x0040000000000000:
30
        .long   0, 0x00400000
31
        .align 8
32
        .type   x5a825a825a825a82,@object
33
        .size   x5a825a825a825a82,8
34
x5a825a825a825a82:
35
        .long   0x5a825a82, 0x5a825a82
36
        .align 8
37
        .type   x539f539f539f539f,@object
38
        .size   x539f539f539f539f,8
39
x539f539f539f539f:
40
        .long   0x539f539f,0x539f539f
41
        .align 8
42
        .type   x4546454645464546,@object
43
        .size   x4546454645464546,8
44
x4546454645464546:
45
        .long   0x45464546,0x45464546
46
        .align 8
47
        .type   x61f861f861f861f8,@object
48
        .size   x61f861f861f861f8,8
49
x61f861f861f861f8:
50
        .long   0x61f861f8,0x61f861f8
51
        .align 8
52
        .type    scratch1,@object
53
        .size    scratch1,8
54
scratch1:
55
        .long 0,0
56
        .align 8
57
        .type    scratch3,@object
58
        .size    scratch3,8
59
scratch3:
60
        .long 0,0
61
        .align 8
62
        .type    scratch5,@object
63
        .size    scratch5,8
64
scratch5:
65
        .long 0,0
66
        .align 8
67
        .type    scratch7,@object
68
        .size    scratch7,8
69
scratch7:
70
        .long 0,0
71
        .type    x0,@object
72
        .size    x0,8
73
x0:
74
        .long 0,0
75
        .align 8
76
.text
77
        .align 4
78
/*
 * IDCT_mmx: in-place MMX inverse DCT of one 8x8 block of 16-bit words.
 *
 * In:   first stack argument (read from 8(%ebp) once the frame is set up):
 *       pointer to the block; 128 bytes are read and written through %esi.
 * Regs: %ecx = address of the preSC prescale table, %esi = block pointer,
 *       %mm0-%mm7 are the working registers throughout.
 *       %ebx, %ecx, %edx, %esi and %edi are pushed here and popped again
 *       before ret; %ebx, %edx and %edi are not otherwise touched.
 * Also uses the static quadwords scratch1/3/5/7 as temporaries.
 * NOTE(review): the C prototype is presumably `void IDCT_mmx(short *block)`
 * -- confirm against the caller.
 */
.globl IDCT_mmx
79
        .type    IDCT_mmx,@function
80
IDCT_mmx:
81
        pushl %ebp
82
        movl %esp,%ebp
83
        pushl %ebx
84
        pushl %ecx
85
        pushl %edx
86
        pushl %esi
87
        pushl %edi
88
        leal preSC, %ecx                /* %ecx = prescale table base (absolute address) */
89
        movl 8(%ebp),%esi               /* source matrix */
90
        /*
         * Shift every 16-bit word of the 128-byte block left by 4, in place.
         * Done in two passes of eight quadwords each (offsets 0..56, then
         * 64..120): load mm0-mm7, psllw $4, store back.
         * NOTE(review): the file header says the input is expected to be
         * left aligned; this shift appears to perform that alignment here --
         * confirm that callers pass unshifted coefficients.
         */
        movq (%esi), %mm0
91
        movq 8(%esi), %mm1
92
        movq 16(%esi), %mm2
93
        movq 24(%esi), %mm3
94
        movq 32(%esi), %mm4
95
        movq 40(%esi), %mm5
96
        movq 48(%esi), %mm6
97
        movq 56(%esi), %mm7
98
        psllw $4, %mm0
99
        psllw $4, %mm1
100
        psllw $4, %mm2
101
        psllw $4, %mm3
102
        psllw $4, %mm4
103
        psllw $4, %mm5
104
        psllw $4, %mm6
105
        psllw $4, %mm7
106
        movq %mm0,  (%esi)
107
        movq %mm1, 8(%esi)
108
        movq %mm2,16(%esi)
109
        movq %mm3,24(%esi)
110
        movq %mm4,32(%esi)
111
        movq %mm5,40(%esi)
112
        movq %mm6,48(%esi)
113
        movq %mm7,56(%esi)
114
        /* second half of the block: offsets 64..120 */
        movq 64(%esi), %mm0
115
        movq 72(%esi), %mm1
116
        movq 80(%esi), %mm2
117
        movq 88(%esi), %mm3
118
        movq 96(%esi), %mm4
119
        movq 104(%esi), %mm5
120
        movq 112(%esi), %mm6
121
        movq 120(%esi), %mm7
122
        psllw $4, %mm0
123
        psllw $4, %mm1
124
        psllw $4, %mm2
125
        psllw $4, %mm3
126
        psllw $4, %mm4
127
        psllw $4, %mm5
128
        psllw $4, %mm6
129
        psllw $4, %mm7
130
        movq %mm0,64(%esi)
131
        movq %mm1,72(%esi)
132
        movq %mm2,80(%esi)
133
        movq %mm3,88(%esi)
134
        movq %mm4,96(%esi)
135
        movq %mm5,104(%esi)
136
        movq %mm6,112(%esi)
137
        movq %mm7,120(%esi)
138
/*
 * Each input quadword is first multiplied (pmulhw, i.e. high 16 bits of the
 * signed 16x16 product) by the preSC quadword at the same offset.
 * On exit: V22 is stored at 8*4(%esi), V23 at 8*12(%esi), V24 at (%esi),
 * V25 stays in mm6, and mm7 already holds the preSC factors for V10.
 */
/* column 0: even part
139
 * use V4, V12, V0, V8 to produce V22..V25
140
 */
141
        movq 8*12(%ecx), %mm0   /* maybe the first mul can be done together */
142
                                /* with the dequantization in iHuff module */
143
        pmulhw 8*12(%esi), %mm0         /* V12 */
144
        movq 8*4(%ecx), %mm1
145
        pmulhw 8*4(%esi), %mm1          /* V4 */
146
        movq (%ecx), %mm3
147
        psraw $1, %mm0                  /* t64=t66 */
148
        pmulhw (%esi), %mm3             /* V0 */
149
        movq 8*8(%ecx), %mm5            /* preSC factors for V8 */
150
        movq %mm1, %mm2                 /* duplicate V4 (added 11/1/96) */
151
        pmulhw 8*8(%esi),%mm5           /* V8 */
152
        psubsw %mm0, %mm1               /* V16 */
153
        pmulhw x5a825a825a825a82, %mm1  /* 23170 ->V18 */
154
        paddsw %mm0, %mm2               /* V17 */
155
        movq %mm2, %mm0                 /* duplicate V17 */
156
        psraw $1, %mm2                  /* t75=t82 */
157
        psraw $2, %mm0                  /* t72 */
158
        movq %mm3, %mm4                 /* duplicate V0 */
159
        paddsw %mm5, %mm3               /* V19 */
160
        psubsw %mm5, %mm4               /* V20 ;mm5 free */
161
/* moved from the block below */
162
        movq 8*10(%ecx), %mm7
163
        psraw $1, %mm3                  /* t74=t81 */
164
        movq %mm3, %mm6                 /* duplicate t74=t81 */
165
        psraw $2, %mm4                  /* t77=t79 */
166
        psubsw %mm0, %mm1               /* V21 ; mm0 free */
167
        paddsw %mm2, %mm3               /* V22 */
168
        movq %mm1, %mm5                 /* duplicate V21 */
169
        paddsw %mm4, %mm1               /* V23 */
170
        movq %mm3, 8*4(%esi)            /* V22 */
171
        psubsw %mm5, %mm4               /* V24; mm5 free */
172
        movq %mm1, 8*12(%esi)           /* V23 */
173
        psubsw %mm2, %mm6               /* V25; mm2 free */
174
        movq %mm4, (%esi)               /* V24 */
175
/* keep mm6 alive all along the next block */
176
        /* movq %mm6, 8*8(%esi)         V25 */
177
/*
 * On exit: V31 in mm5, V39 in mm0, V40 in mm4, V41 in mm1.
 * Already prepared for the output butterfly: mm6 = V25 with a copy in mm3,
 * mm7 = V22 (reloaded from 8*4(%esi)), mm2 = V23 shifted left by 1.
 */
/* column 0: odd part
178
 * use V2, V6, V10, V14 to produce V31, V39, V40, V41
179
 */
180
/* moved above: movq 8*10(%ecx), %mm7 */
181
 
182
        pmulhw 8*10(%esi), %mm7         /* V10 */
183
        movq 8*6(%ecx), %mm0
184
        pmulhw 8*6(%esi), %mm0          /* V6 */
185
        movq 8*2(%ecx), %mm5
186
        movq %mm7, %mm3                 /* duplicate V10 */
187
        pmulhw 8*2(%esi), %mm5          /* V2 */
188
        movq 8*14(%ecx), %mm4
189
        psubsw %mm0, %mm7               /* V26 */
190
        pmulhw 8*14(%esi), %mm4         /* V14 */
191
        paddsw %mm0, %mm3               /* V29 ; free mm0 */
192
        movq %mm7, %mm1                 /* duplicate V26 */
193
        psraw $1, %mm3                  /* t91=t94 */
194
        pmulhw x539f539f539f539f,%mm7   /* V33 */
195
        psraw $1, %mm1                  /* t96 */
196
        movq %mm5, %mm0                 /* duplicate V2 */
197
        psraw $2, %mm4                  /* t85=t87 */
198
        paddsw %mm4,%mm5                /* V27 */
199
        psubsw %mm4, %mm0               /* V28 ; free mm4 */
200
        movq %mm0, %mm2                 /* duplicate V28 */
201
        psraw $1, %mm5                  /* t90=t93 */
202
        pmulhw x4546454645464546,%mm0   /* V35 */
203
        psraw $1, %mm2                  /* t97 */
204
        movq %mm5, %mm4                 /* duplicate t90=t93 */
205
        psubsw %mm2, %mm1               /* V32 ; free mm2 */
206
        pmulhw x61f861f861f861f8,%mm1   /* V36 */
207
        psllw $1, %mm7                  /* t107 */
208
        paddsw %mm3, %mm5               /* V31 */
209
        psubsw %mm3, %mm4               /* V30 ; free mm3 */
210
        pmulhw x5a825a825a825a82,%mm4   /* V34 */
211
        nop
212
        psubsw %mm1, %mm0               /* V38 */
213
        psubsw %mm7, %mm1               /* V37 ; free mm7 */
214
        psllw $1, %mm1                  /* t114 */
215
/* moved from the next block */
216
        movq %mm6, %mm3                 /* duplicate V25 */
217
/* moved from the next block */
218
        movq 8*4(%esi), %mm7            /* V22 */
219
        psllw $1, %mm0                  /* t110 */
220
        psubsw %mm5, %mm0               /* V39 (mm5 needed for next block) */
221
        psllw $2, %mm4                  /* t112 */
222
/* moved from the next block */
223
        movq 8*12(%esi), %mm2           /* V23 */
224
        psubsw %mm0, %mm4               /* V40 */
225
        paddsw %mm4, %mm1               /* V41; free mm0 */
226
/* moved from the next block */
227
        psllw $1, %mm2                  /* t117=t125 */
228
/*
 * Combines the even results (V22..V25) with the odd results (V31, V39, V40,
 * V41) and stores tm0, tm2, ..., tm14 at 8*0, 8*2, ..., 8*14(%esi).
 * The first column 1 multiply (V5 into mm1) is interleaved near the end.
 */
/* column 0: output butterfly */
229
/* moved above:
230
 * movq %mm6, %mm3                      duplicate V25
231
 * movq 8*4(%esi), %mm7                 V22
232
 * movq 8*12(%esi), %mm2                V23
233
 * psllw $1, %mm2                       t117=t125
234
 */
235
        psubsw %mm1, %mm6               /* tm6 */
236
        paddsw %mm1, %mm3               /* tm8; free mm1 */
237
        movq %mm7, %mm1                 /* duplicate V22 */
238
        paddsw %mm5, %mm7               /* tm0 */
239
        movq %mm3, 8*8(%esi)            /* tm8; free mm3 */
240
        psubsw %mm5, %mm1               /* tm14; free mm5 */
241
        movq %mm6, 8*6(%esi)            /* tm6; free mm6 */
242
        movq %mm2, %mm3                 /* duplicate t117=t125 */
243
        movq (%esi), %mm6               /* V24 */
244
        paddsw %mm0, %mm2               /* tm2 */
245
        movq %mm7, (%esi)               /* tm0; free mm7 */
246
        psubsw %mm0, %mm3               /* tm12; free mm0 */
247
        movq %mm1, 8*14(%esi)           /* tm14; free mm1 */
248
        psllw $1, %mm6                  /* t119=t123 */
249
        movq %mm2, 8*2(%esi)            /* tm2; free mm2 */
250
        movq %mm6, %mm0                 /* duplicate t119=t123 */
251
        movq %mm3, 8*12(%esi)           /* tm12; free mm3 */
252
        paddsw %mm4, %mm6               /* tm4 */
253
/* moved from next block */
254
        movq 8*5(%ecx), %mm1
255
        psubsw %mm4, %mm0               /* tm10; free mm4 */
256
/* moved from next block */
257
        pmulhw 8*5(%esi), %mm1          /* V5 */
258
        movq %mm6, 8*4(%esi)            /* tm4; free mm6 */
259
        movq %mm0, 8*10(%esi)           /* tm10; free mm0 */
260
/*
 * Same structure as the column 0 even part, on the odd-numbered quadwords.
 * On entry mm1 already holds V5.  On exit: V56 is stored at 8*5(%esi),
 * V57 at 8*13(%esi), V58 at 8*9(%esi), V59 stays in mm7, and mm0 holds the
 * preSC factors for V11.
 */
/* column 1: even part
261
 * use V5, V13, V1, V9 to produce V56..V59
262
 */
263
/* moved to prev block:
264
 *      movq 8*5(%ecx), %mm1
265
 *      pmulhw 8*5(%esi), %mm1           V5
266
 */
267
        movq 8*13(%ecx), %mm7
268
        psllw $1, %mm1                  /* t128=t130 */
269
        pmulhw 8*13(%esi), %mm7         /* V13 */
270
        movq %mm1, %mm2                 /* duplicate t128=t130 */
271
        movq 8(%ecx), %mm3
272
        pmulhw 8(%esi), %mm3            /* V1 */
273
        movq 8*9(%ecx), %mm5
274
        psubsw %mm7, %mm1               /* V50 */
275
        pmulhw 8*9(%esi), %mm5          /* V9 */
276
        paddsw %mm7, %mm2               /* V51 */
277
        pmulhw x5a825a825a825a82, %mm1  /* 23170 ->V52 */
278
        movq %mm2, %mm6                 /* duplicate V51 */
279
        psraw $1, %mm2                  /* t138=t144 */
280
        movq %mm3, %mm4                 /* duplicate V1 */
281
        psraw $2, %mm6                  /* t136 */
282
        paddsw %mm5, %mm3               /* V53 */
283
        psubsw %mm5, %mm4               /* V54 ;mm5 free */
284
        movq %mm3, %mm7                 /* duplicate V53 */
285
/* moved from next block */
286
        movq 8*11(%ecx), %mm0
287
        psraw $1, %mm4                  /* t140=t142 */
288
        psubsw %mm6, %mm1               /* V55 ; mm6 free */
289
        paddsw %mm2, %mm3               /* V56 */
290
        movq %mm4, %mm5                 /* duplicate t140=t142 */
291
        paddsw %mm1, %mm4               /* V57 */
292
        movq %mm3, 8*5(%esi)            /* V56 */
293
        psubsw %mm1, %mm5               /* V58; mm1 free */
294
        movq %mm4, 8*13(%esi)           /* V57 */
295
        psubsw %mm2, %mm7               /* V59; mm2 free */
296
        movq %mm5, 8*9(%esi)            /* V58 */
297
/* keep mm7 alive all along the next block
298
 *      movq %mm7, 8(%esi)              V59
299
 * moved above
300
 *      movq 8*11(%ecx), %mm0
301
 */
302
        /*
         * column 1: odd part
         * use V3, V7, V11, V15 to produce V65, V73, V74, V75
         * (mm0 holds the preSC factors for V11 on entry; mm7 = V59 must stay
         * untouched).  The first steps of the output butterfly are
         * interleaved: V56 is reloaded and tm1 is formed in mm0 at the end.
         */
        pmulhw 8*11(%esi), %mm0         /* V11 */
303
        movq 8*7(%ecx), %mm6
304
        pmulhw 8*7(%esi), %mm6          /* V7 */
305
        movq 8*15(%ecx), %mm4
306
        movq %mm0, %mm3                 /* duplicate V11 */
307
        pmulhw 8*15(%esi), %mm4         /* V15 */
308
        movq 8*3(%ecx), %mm5
309
        psllw $1, %mm6                  /* t146=t152 */
310
        pmulhw 8*3(%esi), %mm5          /* V3 */
311
        paddsw %mm6, %mm0               /* V63 */
312
/* note that V15 computation has a correction step:
313
 * this is a 'magic' constant that rebiases the results to be closer to the
314
 * expected result.  this magic constant can be refined to reduce the error
315
 * even more by doing the correction step in a later stage when the number
316
 * is actually multiplied by 16
317
 */
318
        paddw x0005000200010001, %mm4   /* wrapping (non-saturating) add of 1,1,2,5 per lane */
319
        psubsw %mm6, %mm3               /* V60 ; free mm6 */
320
        psraw $1, %mm0                  /* t154=t156 */
321
        movq %mm3, %mm1                 /* duplicate V60 */
322
        pmulhw x539f539f539f539f, %mm1  /* V67 */
323
        movq %mm5, %mm6                 /* duplicate V3 */
324
        psraw $2, %mm4                  /* t148=t150 */
325
        paddsw %mm4, %mm5               /* V61 */
326
        psubsw %mm4, %mm6               /* V62 ; free mm4 */
327
        movq %mm5, %mm4                 /* duplicate V61 */
328
        psllw $1, %mm1                  /* t169 */
329
        paddsw %mm0, %mm5               /* V65 -> result */
330
        psubsw %mm0, %mm4               /* V64 ; free mm0 */
331
        pmulhw x5a825a825a825a82, %mm4  /* V68 */
332
        psraw $1, %mm3                  /* t158 */
333
        psubsw %mm6, %mm3               /* V66 */
334
        movq %mm5, %mm2                 /* duplicate V65 */
335
        pmulhw x61f861f861f861f8, %mm3  /* V70 */
336
        psllw $1, %mm6                  /* t165 */
337
        pmulhw x4546454645464546, %mm6  /* V69 */
338
        psraw $1, %mm2                  /* t172 */
339
/* moved from next block */
340
        movq 8*5(%esi), %mm0            /* V56 */
341
        psllw $1, %mm4                  /* t174 */
342
/* moved from next block */
343
        psraw $1, %mm0                  /* t177=t188 */
344
        nop
345
        psubsw %mm3, %mm6               /* V72 */
346
        psubsw %mm1, %mm3               /* V71 ; free mm1 */
347
        psubsw %mm2, %mm6               /* V73 ; free mm2 */
348
/* moved from next block */
349
        psraw $1, %mm5                  /* t178=t189 */
350
        psubsw %mm6, %mm4               /* V74 */
351
/* moved from next block */
352
        movq %mm0, %mm1                 /* duplicate t177=t188 */
353
        paddsw %mm4, %mm3               /* V75 */
354
/* moved from next block */
355
        paddsw %mm5, %mm0               /* tm1 */
356
/*
 * column 1: output butterfly.
 * Stores tm1, tm3, tm5, tm7 at 8*1, 8*3, 8*5, 8*7(%esi).  tm9, tm11, tm13 and
 * tm15 are not stored: they stay in registers (tm9 in mm5 with a copy in mm2,
 * tm11 in mm6, tm13/tm15 already interleaved into mm3 and mm7) and feed the
 * M4 transpose that follows.
 */
/* location
357
 *  5 - V56
358
 * 13 - V57
359
 *  9 - V58
360
 *  X - V59, mm7
361
 *  X - V65, mm5
362
 *  X - V73, mm6
363
 *  X - V74, mm4
364
 *  X - V75, mm3
365
 * free mm0, mm1 & mm2
366
 * moved above
367
 *      movq 8*5(%esi), %mm0            V56
368
 *      psraw $1, %mm0                  t177=t188 ! new !!
369
 *      psraw $1, %mm5                  t178=t189 ! new !!
370
 *      movq %mm0, %mm1                 duplicate t177=t188
371
 *      paddsw %mm5, %mm0               tm1
372
 */
373
        movq 8*13(%esi), %mm2           /* V57 */
374
        psubsw %mm5, %mm1               /* tm15; free mm5 */
375
        movq %mm0, 8(%esi)              /* tm1; free mm0 */
376
        psraw $1, %mm7                  /* t182=t184 ! new !! */
377
/* save the store as used directly in the transpose
378
 *      movq %mm1, 120(%esi)            tm15; free mm1
379
 */
380
        movq %mm7, %mm5                 /* duplicate t182=t184 */
381
        psubsw %mm3, %mm7               /* tm7 */
382
        paddsw %mm3, %mm5               /* tm9; free mm3 */
383
        movq 8*9(%esi), %mm0            /* V58 */
384
        movq %mm2, %mm3                 /* duplicate V57 */
385
        movq %mm7, 8*7(%esi)            /* tm7; free mm7 */
386
        psubsw %mm6, %mm3               /* tm13 */
387
        paddsw %mm6, %mm2               /* tm3 ; free mm6 */
388
/* moved up from the transpose */
389
        movq %mm3, %mm7                 /* duplicate tm13 */
390
/* moved up from the transpose */
391
        punpcklwd %mm1, %mm3            /* interleave low words of tm13 and tm15 */
392
        movq %mm0, %mm6                 /* duplicate V58 */
393
        movq %mm2, 8*3(%esi)            /* tm3; free mm2 */
394
        paddsw %mm4, %mm0               /* tm5 */
395
        psubsw %mm4, %mm6               /* tm11; free mm4 */
396
/* moved up from the transpose */
397
        punpckhwd %mm1, %mm7            /* interleave high words of tm13 and tm15 */
398
        movq %mm0, 8*5(%esi)            /* tm5; free mm0 */
399
/* moved up from the transpose */
400
        movq %mm5, %mm2                 /* duplicate tm9 */
401
/*
 * M4 part: transposes the 4x4 word quadrant held in registers (tm9, tm11,
 * tm13, tm15) and writes tmt9, tmt11, tmt13, tmt15 to 8*9, 8*11, 8*13 and
 * 8*15(%esi) (tmt9/tmt13 as two dword stores each, tmt11/tmt15 as quadwords).
 * M3 part: loads 8*8, 8*10, 8*12, 8*14(%esi) and leaves the transposed rows
 * in registers: mm0 and mm1 (called tmt1 and tmt3 by the following code),
 * mm3 (tmt5, copied to mm5) and mm7 (unlabelled in the original comments;
 * presumably tmt7).  mm4 is preloaded with tmt13.
 */
/* transpose - M4 part
402
 *  ---------       ---------
403
 * | M1 | M2 |     | M1'| M3'|
404
 *  ---------  -->  ---------
405
 * | M3 | M4 |     | M2'| M4'|
406
 *  ---------       ---------
407
 * Two alternatives: use full mmword approach so the following code can be
408
 * scheduled before the transpose is done without stores, or use the faster
409
 * half mmword stores (when possible)
410
 */
411
        movd %mm3, 8*9+4(%esi)          /* MS part of tmt9 */
412
        punpcklwd %mm6, %mm5
413
        movd %mm7, 8*13+4(%esi)         /* MS part of tmt13 */
414
        punpckhwd %mm6, %mm2
415
        movd %mm5, 8*9(%esi)            /* LS part of tmt9 */
416
        punpckhdq %mm3, %mm5            /* free mm3 */
417
        movd %mm2, 8*13(%esi)           /* LS part of tmt13 */
418
        punpckhdq %mm7, %mm2            /* free mm7 */
419
/* moved up from the M3 transpose */
420
        movq 8*8(%esi), %mm0
421
/* moved up from the M3 transpose */
422
        movq 8*10(%esi), %mm1
423
/* moved up from the M3 transpose */
424
        movq %mm0, %mm3
425
/* shuffle the rest of the data, and write it with 2 mmword writes */
426
        movq %mm5, 8*11(%esi)           /* tmt11 */
427
/* moved up from the M3 transpose */
428
        punpcklwd %mm1, %mm0
429
        movq %mm2, 8*15(%esi)           /* tmt15 */
430
/* moved up from the M3 transpose */
431
        punpckhwd %mm1, %mm3
432
/* transpose - M3 part
433
 * moved up to previous code section
434
 *      movq 8*8(%esi), %mm0
435
 *      movq 8*10(%esi), %mm1
436
 *      movq %mm0, %mm3
437
 *      punpcklwd %mm1, %mm0
438
 *      punpckhwd %mm1, %mm3
439
 */
440
        movq 8*12(%esi), %mm6
441
        movq 8*14(%esi), %mm4
442
        movq %mm6, %mm2
443
/* shuffle the data and write the lower parts of the transposed in 4 dwords */
444
        punpcklwd %mm4, %mm6
445
        movq %mm0, %mm1
446
        punpckhdq %mm6, %mm1
447
        movq %mm3, %mm7
448
        punpckhwd %mm4, %mm2            /* free mm4 */
449
        punpckldq %mm6, %mm0            /* free mm6 */
450
/* moved from next block */
451
        movq 8*13(%esi), %mm4           /* tmt13 */
452
        punpckldq %mm2, %mm3
453
        punpckhdq %mm2, %mm7            /* free mm2 */
454
/* moved from next block */
455
        movq %mm3, %mm5                 /* duplicate tmt5 */
456
/*
 * Second pass (no preSC multiply here): inputs are tmt1 (mm0), tmt5 (mm3 and
 * mm5), tmt13 (mm4) and tmt9 (loaded from 8*9(%esi)).
 * On exit: V140 is stored at 8*9(%esi), V141 at 8*13(%esi), V142 is in mm4,
 * V143 in mm6, and tmt11 has been loaded into mm0 with a copy in mm2.
 */
/* column 1: even part (after transpose)
457
* moved above
458
*       movq %mm3, %mm5                 duplicate tmt5
459
*       movq 8*13(%esi), %mm4           tmt13
460
*/
461
        psubsw %mm4, %mm3               /* V134 */
462
        pmulhw x5a825a825a825a82, %mm3  /* 23170 ->V136 */
463
        movq 8*9(%esi), %mm6            /* tmt9 */
464
        paddsw %mm4, %mm5               /* V135 ; mm4 free */
465
        movq %mm0, %mm4                 /* duplicate tmt1 */
466
        paddsw %mm6, %mm0               /* V137 */
467
        psubsw %mm6, %mm4               /* V138 ; mm6 free */
468
        psllw $2, %mm3                  /* t290 */
469
        psubsw %mm5, %mm3               /* V139 */
470
        movq %mm0, %mm6                 /* duplicate V137 */
471
        paddsw %mm5, %mm0               /* V140 */
472
        movq %mm4, %mm2                 /* duplicate V138 */
473
        paddsw %mm3, %mm2               /* V141 */
474
        psubsw %mm3, %mm4               /* V142 ; mm3 free */
475
        movq %mm0, 8*9(%esi)            /* V140 */
476
        psubsw %mm5, %mm6               /* V143 ; mm5 free */
477
/* moved from next block */
478
        movq 8*11(%esi), %mm0           /* tmt11 */
479
        movq %mm2, 8*13(%esi)           /* V141 */
480
/* moved from next block */
481
        movq %mm0, %mm2                 /* duplicate tmt11 */
482
/*
 * Inputs: tmt11 (mm0 and mm2), tmt3 (mm1), tmt15 (loaded from 8*15(%esi))
 * and the row left in mm7 by the M3 transpose.
 * On exit: V149 in mm7, V157 in mm1, V158 in mm5, V159 in mm3; mm2 holds a
 * copy of V143 (mm6) and mm0 holds V141 reloaded from 8*13(%esi).
 */
/* column 1: odd part (after transpose) */
483
/* moved up to the prev block
484
 *      movq 8*11(%esi), %mm0           tmt11
485
 *      movq %mm0, %mm2                 duplicate tmt11
486
 */
487
        movq 8*15(%esi), %mm5           /* tmt15 */
488
        psubsw %mm7, %mm0               /* V144 */
489
        movq %mm0, %mm3                 /* duplicate V144 */
490
        paddsw %mm7, %mm2               /* V147 ; free mm7 */
491
        pmulhw x539f539f539f539f, %mm0  /* 21407-> V151 */
492
        movq %mm1, %mm7                 /* duplicate tmt3 */
493
        paddsw %mm5, %mm7               /* V145 */
494
        psubsw %mm5, %mm1               /* V146 ; free mm5 */
495
        psubsw %mm1, %mm3               /* V150 */
496
        movq %mm7, %mm5                 /* duplicate V145 */
497
        pmulhw x4546454645464546, %mm1  /* 17734-> V153 */
498
        psubsw %mm2, %mm5               /* V148 */
499
        pmulhw x61f861f861f861f8, %mm3  /* 25080-> V154 */
500
        psllw $2, %mm0                  /* t311 */
501
        pmulhw x5a825a825a825a82, %mm5  /* 23170-> V152 */
502
        paddsw %mm2, %mm7               /* V149 ; free mm2 */
503
        psllw $1, %mm1                  /* t313 */
504
        nop     /* without the nop - freeze here for one clock */
505
        movq %mm3, %mm2                 /* duplicate V154 */
506
        psubsw %mm0, %mm3               /* V155 ; free mm0 */
507
        psubsw %mm2, %mm1               /* V156 ; free mm2 */
508
/* moved from the next block */
509
        movq %mm6, %mm2                 /* duplicate V143 */
510
/* moved from the next block */
511
        movq 8*13(%esi), %mm0           /* V141 */
512
        psllw $1, %mm1                  /* t315 */
513
        psubsw %mm7, %mm1               /* V157 (keep V149) */
514
        psllw $2, %mm5                  /* t317 */
515
        psubsw %mm1, %mm5               /* V158 */
516
        psllw $1, %mm3                  /* t319 */
517
        paddsw %mm5, %mm3               /* V159 */
518
/*
 * out9, out11, out13 and out15 are shifted right by 4 (psraw) and stored at
 * 8*9, 8*11, 8*13 and 8*15(%esi).  out1, out3, out5 and out7 are written
 * unshifted to scratch1/3/5/7 instead of the block, because 8*1, 8*3, 8*5 and
 * 8*7(%esi) still hold first-pass rows that the M2 transpose reads (the loads
 * of 8(%esi) and 8*3(%esi) for it are already interleaved below).
 */
/* column 1: output butterfly (after transform)
519
 * moved to the prev block
520
 *      movq %mm6, %mm2                 duplicate V143
521
 *      movq 8*13(%esi), %mm0           V141
522
 */
523
        psubsw %mm3, %mm2               /* V163 */
524
        paddsw %mm3, %mm6               /* V164 ; free mm3 */
525
        movq %mm4, %mm3                 /* duplicate V142 */
526
        psubsw %mm5, %mm4               /* V165 ; free mm5 */
527
        movq %mm2, scratch7             /* out7 */
528
        psraw $4, %mm6
529
        psraw $4, %mm4
530
        paddsw %mm5, %mm3               /* V162 */
531
        movq 8*9(%esi), %mm2            /* V140 */
532
        movq %mm0, %mm5                 /* duplicate V141 */
533
/* in order not to percolate this line up,
534
 * we read 72(%esi) very near to this location
535
 */
536
        movq %mm6, 8*9(%esi)            /* out9 */
537
        paddsw %mm1, %mm0               /* V161 */
538
        movq %mm3, scratch5             /* out5 */
539
        psubsw %mm1, %mm5               /* V166 ; free mm1 */
540
        movq %mm4, 8*11(%esi)           /* out11 */
541
        psraw $4, %mm5
542
        movq %mm0, scratch3             /* out3 */
543
        movq %mm2, %mm4                 /* duplicate V140 */
544
        movq %mm5, 8*13(%esi)           /* out13 */
545
        paddsw %mm7, %mm2               /* V160 */
546
/* moved from the next block */
547
        movq 8(%esi), %mm0
548
        psubsw %mm7, %mm4               /* V167 ; free mm7 */
549
/* moved from the next block */
550
        movq 8*3(%esi), %mm7
551
        psraw $4, %mm4
552
        movq %mm2, scratch1             /* out1 */
553
/* moved from the next block */
554
        movq %mm0, %mm1
555
        movq %mm4, 8*15(%esi)           /* out15 */
556
/* moved from the next block */
557
        punpcklwd %mm7, %mm0
558
/*
 * M2 part: transposes the rows at 8*1, 8*3, 8*5, 8*7(%esi); tmt8 and tmt12
 * are written to 8*8 and 8*12(%esi) as two dword stores each, tmt10 and tmt14
 * stay in mm0 and mm1.
 * M1 part: transposes the rows at 8*0, 8*2, 8*4, 8*6(%esi) entirely in
 * registers: tmt0 in mm7, tmt2 in mm2 (copy in mm5), tmt4 in mm6, tmt6 in
 * mm4.  mm3 ends up holding a copy of tmt10.
 */
/* transpose - M2 parts
559
 * moved up to the prev block
560
 *      movq 8(%esi), %mm0
561
 *      movq 8*3(%esi), %mm7
562
 *      movq %mm0, %mm1
563
 *      punpcklwd %mm7, %mm0
564
 */
565
        movq 8*5(%esi), %mm5
566
        punpckhwd %mm7, %mm1
567
        movq 8*7(%esi), %mm4
568
        movq %mm5, %mm3
569
/* shuffle the data and write the lower parts of the transposed in 4 dwords */
570
        movd %mm0, 8*8(%esi)            /* LS part of tmt8 */
571
        punpcklwd %mm4, %mm5
572
        movd %mm1, 8*12(%esi)           /* LS part of tmt12 */
573
        punpckhwd %mm4, %mm3
574
        movd %mm5, 8*8+4(%esi)          /* MS part of tmt8 */
575
        punpckhdq %mm5, %mm0            /* tmt10 */
576
        movd %mm3, 8*12+4(%esi)         /* MS part of tmt12 */
577
        punpckhdq %mm3, %mm1            /* tmt14 */
578
/* transpose - M1 parts */
579
        movq (%esi), %mm7
580
        movq 8*2(%esi), %mm2
581
        movq %mm7, %mm6
582
        movq 8*4(%esi), %mm5
583
        punpcklwd %mm2, %mm7
584
        movq 8*6(%esi), %mm4
585
        punpckhwd %mm2, %mm6            /* free mm2 */
586
        movq %mm5, %mm3
587
        punpcklwd %mm4, %mm5
588
        punpckhwd %mm4, %mm3            /* free mm4 */
589
        movq %mm7, %mm2
590
        movq %mm6, %mm4
591
        punpckldq %mm5, %mm7            /* tmt0 */
592
        punpckhdq %mm5, %mm2            /* tmt2 ; free mm5 */
593
/* shuffle the rest of the data, and write it with 2 mmword writes */
594
        punpckldq %mm3, %mm6            /* tmt4 */
595
/* moved from next block */
596
        movq %mm2, %mm5                 /* duplicate tmt2 */
597
        punpckhdq %mm3, %mm4            /* tmt6 ; free mm3 */
598
/* moved from next block */
599
        movq %mm0, %mm3                 /* duplicate tmt10 */
600
/*
 * Inputs: tmt10 (mm0 and mm3), tmt6 (mm4), tmt2 (mm2 and mm5), tmt14 (mm1).
 * mm6 (tmt4) and mm7 (tmt0) are not touched here; the even part needs them.
 * On exit: V115 is saved at (%esi), V123 at 8*2(%esi), V124 is in mm2 and
 * V125 in mm4.
 */
/* column 0: odd part (after transpose)
601
 *moved up to prev block
602
 *      movq %mm0, %mm3                 duplicate tmt10
603
 *      movq %mm2, %mm5                 duplicate tmt2
604
 */
605
        psubsw %mm4, %mm0               /* V110 */
606
        paddsw %mm4, %mm3               /* V113 ; free mm4 */
607
        movq %mm0, %mm4                 /* duplicate V110 */
608
        paddsw %mm1, %mm2               /* V111 */
609
        pmulhw x539f539f539f539f, %mm0  /* 21407-> V117 */
610
        psubsw %mm1, %mm5               /* V112 ; free mm1 */
611
        psubsw %mm5, %mm4               /* V116 */
612
        movq %mm2, %mm1                 /* duplicate V111 */
613
        pmulhw x4546454645464546, %mm5  /* 17734-> V119 */
614
        psubsw %mm3, %mm2               /* V114 */
615
        pmulhw x61f861f861f861f8, %mm4  /* 25080-> V120 */
616
        paddsw %mm3, %mm1               /* V115 ; free mm3 */
617
        pmulhw x5a825a825a825a82, %mm2  /* 23170-> V118 */
618
        psllw $2, %mm0                  /* t266 */
619
        movq %mm1, (%esi)               /* save V115 */
620
        psllw $1, %mm5                  /* t268 */
621
        psubsw %mm4, %mm5               /* V122 */
622
        psubsw %mm0, %mm4               /* V121 ; free mm0 */
623
        psllw $1, %mm5                  /* t270 */
624
        psubsw %mm1, %mm5               /* V123 ; free mm1 */
625
        psllw $2, %mm2                  /* t272 */
626
        psubsw %mm5, %mm2               /* V124 (keep V123) */
627
        psllw $1, %mm4                  /* t274 */
628
        movq %mm5, 8*2(%esi)            /* save V123 ; free mm5 */
629
        paddsw %mm2, %mm4               /* V125 (keep V124) */
630
/*
 * Inputs: tmt4 (mm6), tmt0 (mm7), tmt12 and tmt8 (loaded from 8*12 and
 * 8*8(%esi)).  mm2 (V124) and mm4 (V125) from the odd part are left alone.
 * On exit: V106 in mm7, V107 in mm5, V108 in mm1, V109 in mm0.
 */
/* column 0: even part (after transpose) */
631
        movq 8*12(%esi), %mm0           /* tmt12 */
632
        movq %mm6, %mm3                 /* duplicate tmt4 */
633
        psubsw %mm0, %mm6               /* V100 */
634
        paddsw %mm0, %mm3               /* V101 ; free mm0 */
635
        pmulhw x5a825a825a825a82, %mm6  /* 23170 ->V102 */
636
        movq %mm7, %mm5                 /* duplicate tmt0 */
637
        movq 8*8(%esi), %mm1            /* tmt8 */
638
        paddsw %mm1, %mm7               /* V103 */
639
        psubsw %mm1, %mm5               /* V104 ; free mm1 */
640
        movq %mm7, %mm0                 /* duplicate V103 */
641
        psllw $2, %mm6                  /* t245 */
642
        paddsw %mm3, %mm7               /* V106 */
643
        movq %mm5, %mm1                 /* duplicate V104 */
644
        psubsw %mm3, %mm6               /* V105 */
645
        psubsw %mm3, %mm0               /* V109; free mm3 */
646
        paddsw %mm6, %mm5               /* V107 */
647
        psubsw %mm6, %mm1               /* V108 ; free mm6 */
648
/*
 * Forms out0, out2, ..., out14 from V106..V109, V115, V123, V124, V125,
 * shifts each right by 4 (psraw) and stores them at 8*0, 8*2, ..., 8*14(%esi).
 * Interleaved with that, the rows parked in scratch1/3/5/7 are reloaded,
 * shifted right by 4 and stored at 8*1, 8*3, 8*5 and 8*7(%esi).
 */
/* column 0: output butterfly (after transform) */
649
        movq %mm1, %mm3                 /* duplicate V108 */
650
        paddsw %mm2, %mm1               /* out4 */
651
        psraw $4, %mm1
652
        psubsw %mm2, %mm3               /* out10 ; free mm2 */
653
        psraw $4, %mm3
654
        movq %mm0, %mm6                 /* duplicate V109 */
655
        movq %mm1, 8*4(%esi)            /* out4 ; free mm1 */
656
        psubsw %mm4, %mm0               /* out6 */
657
        movq %mm3, 8*10(%esi)           /* out10 ; free mm3 */
658
        psraw $4, %mm0
659
        paddsw %mm4, %mm6               /* out8 ; free mm4 */
660
        movq %mm7, %mm1                 /* duplicate V106 */
661
        movq %mm0, 8*6(%esi)            /* out6 ; free mm0 */
662
        psraw $4, %mm6
663
        movq (%esi), %mm4               /* V115 */
664
        movq %mm6, 8*8(%esi)            /* out8 ; free mm6 */
665
        movq %mm5, %mm2                 /* duplicate V107 */
666
        movq 8*2(%esi), %mm3            /* V123 */
667
        paddsw %mm4, %mm7               /* out0 */
668
/* moved up from next block */
669
        movq scratch3, %mm0
670
        psraw $4, %mm7
671
/* moved up from next block */
672
        movq scratch5, %mm6
673
        psubsw %mm4, %mm1               /* out14 ; free mm4 */
674
        paddsw %mm3, %mm5               /* out2 */
675
        psraw $4, %mm1
676
        movq %mm7, (%esi)               /* out0 ; free mm7 */
677
        psraw $4, %mm5
678
        movq %mm1, 8*14(%esi)           /* out14 ; free mm1 */
679
        psubsw %mm3, %mm2               /* out12 ; free mm3 */
680
        movq %mm5, 8*2(%esi)            /* out2 ; free mm5 */
681
        psraw $4, %mm2
682
/* moved up to the prev block */
683
        movq scratch7, %mm4
684
/* moved up to the prev block */
685
        psraw $4, %mm0
686
        movq %mm2, 8*12(%esi)           /* out12 ; free mm2 */
687
/* moved up to the prev block */
688
        psraw $4, %mm6
689
/* move back the data to its correct place
690
* moved up to the prev block
691
 *      movq scratch3, %mm0
692
 *      movq scratch5, %mm6
693
 *      movq scratch7, %mm4
694
 *      psraw $4, %mm0
695
 *      psraw $4, %mm6
696
*/
697
        movq scratch1, %mm1
698
        psraw $4, %mm4
699
        movq %mm0, 8*3(%esi)            /* out3 */
700
        psraw $4, %mm1
701
        movq %mm6, 8*5(%esi)            /* out5 */
702
        movq %mm4, 8*7(%esi)            /* out7 */
703
        movq %mm1, 8(%esi)              /* out1 */
704
        /*
         * Leave MMX state before returning.  The MMX registers alias the x87
         * register stack; without emms the x87 tag word stays marked as in
         * use and any floating-point code the caller runs afterwards can
         * misbehave.  All results are already stored in the block, so no MMX
         * register contents are needed past this point.
         */
        emms
        /* restore the registers saved in the prologue, in reverse push order */
        popl %edi
        popl %esi
        popl %edx
        popl %ecx
        popl %ebx
        movl %ebp,%esp                  /* drop the frame */
        popl %ebp
        ret
.Lfe1:
        .size    IDCT_mmx,.Lfe1-IDCT_mmx

powered by: WebSVN 2.1.0

© copyright 1999-2024 OpenCores.org, equivalent to Oliscience, all rights reserved. OpenCores®, registered trademark.