OpenCores
URL https://opencores.org/ocsvn/mpeg2fpga/mpeg2fpga/trunk

Subversion Repositories mpeg2fpga

[/] [mpeg2fpga/] [trunk/] [tools/] [mpeg2dec/] [mmxupsampleyuvrgb.txt] - Blame information for rev 2

Details | Compare with Previous | View Log

Line No. Rev Author Line
1 2 kdv
// Packed 4 x 16-bit multipliers used as pmulhw operands by the MMX loop in
// h2v2_merged_upsample.  The lane labels in the trailing comments are listed
// from the most-significant word to the least-significant word; the lanes
// labelled Cr_b and Cb_r hold 0x0000.
// const1/const2 are applied to the first chroma sample of a pair,
// const5/const6 to the second (same factors, lanes rotated).
__int64 const1 = 0x59BA0000D24B59BA; // Cr_r Cr_b Cr_g Cr_r
2
__int64 const2 = 0x00007168E9FA0000; // Cb_r Cb_b Cb_g Cb_r
3
__int64 const5 = 0x0000D24B59BA0000; // Cr_b Cr_g Cr_r Cr_b
4
__int64 const6 = 0x7168E9FA00007168; // Cb_b Cb_g Cb_r Cb_b
5
 
6
// constants for factors (One_Half/fix(x)) << 2
// Each constNN5 below is added (paddsw) to the shifted chroma difference
// right before the pmulhw by the matching constant above
// (const05 -> const1, const15 -> const2, const45 -> const5, const55 -> const6);
// the lane labels follow the same order as that matching constant.
7
__int64 const05 = 0x0001000000000001; // Cr_r Cr_b Cr_g Cr_r
8
__int64 const15 = 0x00000001FFFA0000; // Cb_r Cb_b Cb_g Cb_r
9
__int64 const45 = 0x0000000000010000; // Cr_b Cr_g Cr_r Cr_b
10
__int64 const55 = 0x0001FFFA00000001; // Cb_b Cb_g Cb_r Cb_b
11
 
12
/*
13
* Upsample and color convert for the case of 2:1 horizontal and 2:1
14
vertical.
15
*/
16
 
17
METHODDEF(void)
18
h2v2_merged_upsample (j_decompress_ptr cinfo,
19
JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
20
JSAMPARRAY output_buf)
21
{
22
// added for MMX
23
 
24
__int64 const128 = 0x0080008000800080;
25
 
26
__int64 empty = 0x0000000000000000;
27
 
28
__int64 davemask = 0x0000FFFFFFFF0000;
29
 
30
////////////////////////////////////
31
 
32
my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
33
register int y, cred, cgreen, cblue;
34
int cb, cr;
35
register JSAMPROW outptr0, outptr1;
36
JSAMPROW inptr00, inptr01, inptr1, inptr2;
37
JDIMENSION col;
38
/* copy these pointers into registers if possible */
39
register JSAMPLE * range_limit = cinfo->sample_range_limit;
40
int * Crrtab = upsample->Cr_r_tab;
41
int * Cbbtab = upsample->Cb_b_tab;
42
INT32 * Crgtab = upsample->Cr_g_tab;
43
INT32 * Cbgtab = upsample->Cb_g_tab;
44
SHIFT_TEMPS
45
 
46
// Added for MMX
47
 
48
register int width = cinfo->image_width;
49
int cols = cinfo->output_width;
50
int cols_asm = (cols >> 3);
51
int diff = cols - (cols_asm<<3);
52
int cols_asm_copy = cols_asm;
53
 
54
///////////////////////////////////////
55
 
56
inptr00 = input_buf[0][in_row_group_ctr*2];
57
inptr01 = input_buf[0][in_row_group_ctr*2 + 1];
58
inptr1 = input_buf[1][in_row_group_ctr];
59
inptr2 = input_buf[2][in_row_group_ctr];
60
outptr0 = output_buf[0];
61
outptr1 = output_buf[1];
62
/* Loop for each group of output pixels */
63
 
64
_asm
65
{
66
mov esi, inptr00
67
mov eax, inptr01
68
mov ebx, inptr2
69
mov ecx, inptr1
70
mov edi, outptr0
71
mov edx, outptr1
72
do_next16:
73
 
74
movd mm0, [ebx] ; 0 0 0 0 Cr3 Cr2 Cr1 Cr0
75
pxor mm6, mm6
76
punpcklbw mm0, mm0 ; Cr3 Cr3 Cr2 Cr2 Cr1 Cr1 Cr0 Cr0
77
movq mm7, const128
78
punpcklwd mm0, mm0 ; Cr1 Cr1 Cr1 Cr1 Cr0 Cr0 Cr0 Cr0
79
movq mm4, mm0
80
punpcklbw mm0, mm6 ; Cr0 Cr0 Cr0 Cr0
81
psubsw mm0, mm7 ; Cr0 - 128:Cr0-128:Cr0-128:Cr0 -128
82
movd mm1, [ecx] ; 0 0 0 0 Cb3 Cb2 Cb1 Cb0
83
psllw mm0, 2 ; left shift by 2 bits
84
punpcklbw mm1, mm1 ; Cb3 Cb3 Cb2 Cb2 Cb1 Cb1 Cb0 Cb0
85
paddsw mm0, const05 ; add (one_half/fix(x)) << 2
86
punpcklwd mm1, mm1 ; Cb1 Cb1 Cb1 Cb1 Cb0 Cb0 Cb0 Cb0
87
movq mm5, mm1
88
pmulhw mm0, const1 ; multiply by (fix(x) >> 1)
89
punpcklbw mm1, mm6 ; Cb0 Cb0 Cb0 Cb0
90
punpckhbw mm4, mm6 ; Cr1 Cr1 Cr1 Cr1
91
psubsw mm1, mm7 ; Cb0 - 128:Cb0-128:Cb0-128:Cb0 -128
92
punpckhbw mm5, mm6 ; Cb1 Cb1 Cb1 Cb1
93
psllw mm1, 2 ; left shift by 2 bits
94
paddsw mm1, const15 ; add (one_half/fix(x)) << 2
95
psubsw mm4, mm7 ; Cr1 - 128:Cr1-128:Cr1-128:Cr1 -128
96
psubsw mm5, mm7 ; Cb1 - 128:Cb1-128:Cb1-128:Cb1 -128
97
pmulhw mm1, const2 ; multiply by (fix(x) >> 1)
98
psllw mm4, 2 ; left shift by 2 bits
99
psllw mm5, 2 ; left shift by 2 bits
100
paddsw mm4, const45 ; add (one_half/fix(x)) << 2
101
movd mm7, [esi] ; Y13 Y12 Y9 Y8 Y5 Y4 Y1 Y0
102
pmulhw mm4, const5 ; multiply by (fix(x) >> 1)
103
movq mm6, mm7
104
punpcklbw mm7, mm7 ; Y5 Y5 Y4 Y4 Y1 Y1 Y0 Y0
105
paddsw mm5, const55 ; add (one_half/fix(x)) << 2
106
paddsw mm0, mm1 ; cred0 cbl0 cgr0 cred0
107
movq mm1, mm7
108
pmulhw mm5, const6 ; multiply by (fix(x) >> 1)
109
movq mm2, mm0 ; cred0 cbl0 cgr0 cred0
110
punpcklwd mm7, mm6 ; Y5 Y4 Y1 Y1 Y1 Y0 Y0 Y0
111
pand mm2, davemask ; 0 cbl0 cgr0 0
112
psrlq mm1, 16 ; 0 0 Y5 Y5 Y4 Y4 Y1 Y1
113
psrlq mm2, 16 ; 0 0 cbl0 cgr0
114
punpcklbw mm7, empty ; Y1 Y0 Y0 Y0
115
paddsw mm4, mm5 ; cbl1 cgr1 cred1 cbl1
116
movq mm3, mm4 ; cbl1 cgr1 cred1 cbl1
117
pand mm3, davemask ; 0 cgr1 cred1 0
118
paddsw mm7, mm0 ; r1 b0 g0 r0
119
psllq mm3, 16 ; cgr1 cred1 0 0
120
movq mm6, mm1 ; 0 0 Y5 Y5 Y4 Y4 Y1 Y1
121
por mm2, mm3 ; cgr1 cred1 cbl0 cgr0
122
punpcklbw mm6, empty ; Y4 Y4 Y1 Y1
123
movd mm3, [eax] ; Y15 Y14 Y11 Y10 Y7 Y6 Y3 Y2
124
paddsw mm6, mm2 ; g4 r4 b1 g1
125
packuswb mm7, mm6 ; g4 r4 b1 g1 r1 b0 g0 r0
126
movq mm6, mm3 ; Y15 Y14 Y11 Y10 Y7 Y6 Y3 Y2
127
punpcklbw mm3, mm3 ; Y7 Y7 Y6 Y6 Y3 Y3 Y2 Y2
128
movq [edi], mm7 ; move to memory g4 r4 b1 g1 r1 b0 g0 r0
129
movq mm5, mm3 ; Y7 Y7 Y6 Y6 Y3 Y3 Y2 Y2
130
punpcklwd mm3, mm6 ; X X X X Y3 Y2 Y2 Y2
131
punpcklbw mm3, empty ; Y3 Y2 Y2 Y2
132
psrlq mm5, 16 ; 0 0 Y7 Y7 Y6 Y6 Y3 Y3
133
paddsw mm3, mm0 ; r3 b2 g2 r2
134
movq mm6, mm5 ; 0 0 Y7 Y7 Y6 Y6 Y3 Y3
135
movq mm0, mm1 ; 0 0 Y5 Y5 Y4 Y4 Y1 Y1
136
punpckldq mm6, mm6 ; X X X X Y6 Y6 Y3 Y3
137
punpcklbw mm6, empty ; Y6 Y6 Y3 Y3
138
psrlq mm1, 24 ; 0 0 0 0 0 Y5 Y5 Y4
139
paddsw mm6, mm2 ; g6 r6 b3 g3
140
packuswb mm3, mm6 ; g6 r6 b3 g3 r3 b2 g2 r2
141
movq mm2, mm5 ; 0 0 Y7 Y7 Y6 Y6 Y3 Y3
142
psrlq mm0, 32 ; 0 0 0 0 0 0 Y5 Y5
143
movq [edx], mm3 ; move to memory g6 r6 b3 g3 r3 b2 g2 r2
144
punpcklwd mm1, mm0 ; X X X X Y5 Y5 Y5 Y4
145
psrlq mm5, 24 ; 0 0 0 0 0 Y7 Y7 Y6
146
movd mm0, [ebx] ; 0 0 0 0 Cr5 Cr4 Cr3 Cr2
147
psrlq mm2, 32 ; 0 0 0 0 0 0 Y7 Y7
148
psrlq mm0, 16
149
punpcklbw mm1, empty ; Y5 Y5 Y5 Y4
150
punpcklwd mm5, mm2 ; X X X X Y7 Y7 Y7 Y6
151
paddsw mm1, mm4 ; b5 g5 r5 b4
152
punpcklbw mm5, empty ; Y7 Y7 Y7 Y6
153
pxor mm6, mm6 ; clear mm6 registr
154
punpcklbw mm0, mm0 ; X X X X Cr3 Cr3 Cr2 Cr2
155
paddsw mm5, mm4 ; b7 g7 r7 b6
156
punpcklwd mm0, mm0 ; Cr3 Cr3 Cr3 Cr3 Cr2 Cr2 Cr2 Cr2
157
movq mm4, mm0
158
movd mm3, [ecx] ; 0 0 0 0 Cb5 Cb4 Cb3 Cb2
159
punpcklbw mm0, mm6 ; Cr2 Cr2 Cr2 Cr2
160
psrlq mm3, 16
161
psubsw mm0, const128 ; Cr2 - 128:Cr2-128:Cr2-128:Cr2 -128
162
punpcklbw mm3, mm3 ; X X X X Cb3 Cb3 Cb2 Cb2
163
psllw mm0, 2 ; left shift by 2 bits
164
paddsw mm0, const05 ; add (one_half/fix(x)) << 2
165
punpcklwd mm3, mm3 ; Cb3 Cb3 Cb3 Cb3 Cb2 Cb2 Cb2 Cb2
166
movq mm7, mm3
167
pmulhw mm0, const1 ; multiply by (fix(x) >> 1)
168
punpcklbw mm3, mm6 ; Cb2 Cb2 Cb2 Cb2
169
psubsw mm3, const128 ; Cb0 - 128:Cb0-128:Cb0-128:Cb0 -128
170
punpckhbw mm4, mm6 ; Cr3 Cr3 Cr3 Cr3
171
psllw mm3, 2 ; left shift by 2 bits
172
paddsw mm3, const15 ; add (one_half/fix(x)) << 2
173
punpckhbw mm7, mm6 ; Cb3 Cb3 Cb3 Cb3
174
pmulhw mm3, const2 ; multiply by (fix(x) >> 1)
175
psubsw mm7, const128 ; Cb3 - 128:Cb3-128:Cb3-128:Cb3 -128
176
paddsw mm0, mm3 ; cred2 cbl2 cgr2 cred2
177
psllw mm7, 2 ; left shift by 2 bits
178
psubsw mm4, const128 ; Cr3 - 128:Cr3-128:Cr3-128:Cr3 -128
179
movd mm3, [esi+4] ; Y21 Y20 Y17 Y16 Y13 Y12 Y9 Y8
180
psllw mm4, 2 ; left shift by 2 bits
181
paddsw mm7, const55 ; add (one_half/fix(x)) << 2
182
movq mm6, mm3 ; Y21 Y20 Y17 Y16 Y13 Y12 Y9 Y8
183
movq mm2, mm0
184
pand mm2, davemask
185
punpcklbw mm3, mm3 ; Y13 Y13 Y12 Y12 Y9 Y9 Y8 Y8
186
psrlq mm2, 16
187
paddsw mm4, const45 ; add (one_half/fix(x)) << 2
188
punpcklwd mm3, mm6 ; X X X X Y9 Y8 Y8 Y8
189
pmulhw mm4, const5 ; multiply by (fix(x) >> 1)
190
pmulhw mm7, const6 ; multiply by (fix(x) >> 1)
191
punpcklbw mm3, empty ; Y9 Y8 Y8 Y8
192
paddsw mm4, mm7 ; cbl3 cgr3 cred3 cbl3
193
paddsw mm3, mm0 ; r9 b8 g8 r8
194
movq mm7, mm4
195
packuswb mm1, mm3 ; r9 b8 g8 r8 b5 g5 r5 b4
196
movd mm3, [eax+4] ; Y23 Y22 Y19 Y18 Y15 Y14 Y11 Y10
197
pand mm7, davemask
198
psrlq mm6, 8 ; 0 Y21 Y20 Y17 Y16 Y13 Y12 Y9
199
psllq mm7, 16
200
movq [edi+8], mm1 ; move to memory r9 b8 g8 r8 b5 g5 r5 b4
201
por mm2, mm7
202
movq mm7, mm3 ; Y23 Y22 Y19 Y18 Y15 Y14 Y11 Y10
203
punpcklbw mm3, mm3 ; X X X X Y11 Y11 Y10 Y10
204
pxor mm1, mm1
205
punpcklwd mm3, mm7 ; X X X X Y11 Y10 Y10 Y10
206
punpcklbw mm3, mm1 ; Y11 Y10 Y10 Y10
207
psrlq mm7, 8 ; 0 Y23 Y22 Y19 Y18 Y15 Y14 Y11
208
paddsw mm3, mm0 ; r11 b10 g10 r10
209
movq mm0, mm7 ; 0 Y23 Y22 Y19 Y18 Y15 Y14 Y11
210
packuswb mm5, mm3 ; r11 b10 g10 r10 b7 g7 r7 b6
211
punpcklbw mm7, mm7 ; X X X X Y14 Y14 Y11 Y11
212
movq [edx+8], mm5 ; move to memory r11 b10 g10 r10 b7 g7 r7 b6
213
movq mm3, mm6 ; 0 Y21 Y20 Y17 Y16 Y13 Y12 Y9
214
punpcklbw mm6, mm6 ; X X X X Y12 Y12 Y9 Y9
215
punpcklbw mm7, mm1 ; Y14 Y14 Y11 Y11
216
punpcklbw mm6, mm1 ; Y12 Y12 Y9 Y9
217
paddsw mm7, mm2 ; g14 r14 b11 g11
218
paddsw mm6, mm2 ; g12 r12 b9 g9
219
psrlq mm3, 8 ; 0 0 Y21 Y20 Y17 Y16 Y13 Y12
220
movq mm1, mm3 ; 0 0 Y21 Y20 Y17 Y16 Y13 Y12
221
punpcklbw mm3, mm3 ; X X X X Y13 Y13 Y12 Y12
222
add esi, 8
223
psrlq mm3, 16 ; X X X X X X Y13 Y13 modified on 09/24
224
punpcklwd mm1, mm3 ; X X X X Y13 Y13 Y13 Y12
225
add eax, 8
226
psrlq mm0, 8 ; 0 0 Y23 Y22 Y19 Y18 Y15 Y14
227
punpcklbw mm1, empty ; Y13 Y13 Y13 Y12
228
movq mm5, mm0 ; 0 0 Y23 Y22 Y19 Y18 Y15 Y14
229
punpcklbw mm0, mm0 ; X X X X Y15 Y15 Y14 Y14
230
paddsw mm1, mm4 ; b13 g13 r13 b12
231
psrlq mm0, 16 ; X X X X X X Y15 Y15
232
add edi, 24
233
punpcklwd mm5, mm0 ; X X X X Y15 Y15 Y15 Y14
234
packuswb mm6, mm1 ; b13 g13 r13 b12 g12 r12 b9 g9
235
add edx, 24
236
punpcklbw mm5, empty ; Y15 Y15 Y15 Y14
237
add ebx, 4
238
paddsw mm5, mm4 ; b15 g15 r15 b14
239
movq [edi-8], mm6 ; move to memory b13 g13 r13 b12 g12 r12 b9 g9
240
packuswb mm7, mm5 ; b15 g15 r15 b14 g14 r14 b11 g11
241
add ecx, 4
242
movq [edx-8], mm7 ; move to memory b15 g15 r15 b14 g14 r14 b11 g11
243
dec cols_asm
244
jnz do_next16
245
EMMS
246
}
247
 
248
inptr1 += (cols_asm_copy<<2);
249
inptr2 += (cols_asm_copy<<2);
250
inptr00 += (cols_asm_copy<<3);
251
inptr01 += (cols_asm_copy<<3);
252
outptr0 += cols_asm_copy*24;
253
outptr1 += cols_asm_copy*24;
254
 
255
// Process the stragglers in C
256
for (col = diff >> 1; col > 0; col--) {
257
/* Do the chroma part of the calculation */
258
cb = GETJSAMPLE(*inptr1++);
259
cr = GETJSAMPLE(*inptr2++);
260
cred = Crrtab[cr];
261
cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
262
cblue = Cbbtab[cb];
263
/* Fetch 4 Y values and emit 4 pixels */
264
y = GETJSAMPLE(*inptr00++);
265
outptr0[RGB_RED] = range_limit[y + cred];
266
outptr0[RGB_GREEN] = range_limit[y + cgreen];
267
outptr0[RGB_BLUE] = range_limit[y + cblue];
268
outptr0 += RGB_PIXELSIZE;
269
y = GETJSAMPLE(*inptr00++);
270
outptr0[RGB_RED] = range_limit[y + cred];
271
outptr0[RGB_GREEN] = range_limit[y + cgreen];
272
outptr0[RGB_BLUE] = range_limit[y + cblue];
273
outptr0 += RGB_PIXELSIZE;
274
y = GETJSAMPLE(*inptr01++);
275
outptr1[RGB_RED] = range_limit[y + cred];
276
outptr1[RGB_GREEN] = range_limit[y + cgreen];
277
outptr1[RGB_BLUE] = range_limit[y + cblue];
278
outptr1 += RGB_PIXELSIZE;
279
y = GETJSAMPLE(*inptr01++);
280
outptr1[RGB_RED] = range_limit[y + cred];
281
outptr1[RGB_GREEN] = range_limit[y + cgreen];
282
outptr1[RGB_BLUE] = range_limit[y + cblue];
283
outptr1 += RGB_PIXELSIZE;
284
}
285
 
286
/* If image width is odd, do the last output column separately */
287
//if (cinfo->output_width & 1) {
288
if (diff & 1) {
289
cb = GETJSAMPLE(*inptr1);
290
cr = GETJSAMPLE(*inptr2);
291
cred = Crrtab[cr];
292
cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
293
cblue = Cbbtab[cb];
294
y = GETJSAMPLE(*inptr00);
295
outptr0[RGB_RED] = range_limit[y + cred];
296
outptr0[RGB_GREEN] = range_limit[y + cgreen];
297
outptr0[RGB_BLUE] = range_limit[y + cblue];
298
y = GETJSAMPLE(*inptr01);
299
outptr1[RGB_RED] = range_limit[y + cred];
300
outptr1[RGB_GREEN] = range_limit[y + cgreen];
301
outptr1[RGB_BLUE] = range_limit[y + cblue];
302
}
303
 
304
}

powered by: WebSVN 2.1.0

© copyright 1999-2024 OpenCores.org, equivalent to Oliscience, all rights reserved. OpenCores®, registered trademark.