/*
 * PREFETCH: mnemonic text pasted into inline-asm strings.
 * With COMPILE_TEMPLATE_MMXEXT it is "prefetchnta"; the other definition is
 * " # nop", which GNU as on x86 treats as a comment, so no instruction is
 * emitted.
 * NOTE(review): the #else/#endif that must separate the two definitions are
 * not visible in this excerpt -- confirm against the full file.
 */
25 #if COMPILE_TEMPLATE_MMXEXT
26 #define PREFETCH "prefetchnta"
28 #define PREFETCH " # nop"
/*
 * MOVNTQ(a, b): asm string fragment that moves a into b (AT&T operand order).
 * With COMPILE_TEMPLATE_MMXEXT the instruction is "movntq", otherwise the
 * alternative definition uses plain "movq".
 * MOVNTQ forwards to REAL_MOVNTQ so that its arguments are macro-expanded
 * before the # operator turns them into string literals.
 * NOTE(review): the #else/#endif that must separate the two REAL_MOVNTQ
 * definitions are not visible in this excerpt -- confirm.
 */
31 #if COMPILE_TEMPLATE_MMXEXT
32 #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
34 #define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
36 #define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
/*
 * YSCALEYUV2PACKEDX_UV: asm fragment accumulating the chroma filter taps.
 *
 * Setup: REG_a is zeroed, REG_d points at CHR_MMX_FILTER_OFFSET(%0), REG_S
 * receives the pointer stored at (REG_d), and mm3/mm4 are both seeded with
 * the value at VROUNDER_OFFSET(%0).
 *
 * Per 16-byte filter entry at REG_d: the coefficient at offset 8 is loaded
 * into mm0; 8 bytes are read at REG_S+REG_a (mm2) and, after adding operand
 * %6 to REG_S, 8 more bytes (mm5). Both are multiplied by mm0 with pmulhw
 * (signed, high 16 bits kept) and added into mm3 and mm4 respectively.
 * REG_d then advances by 16 and REG_S is reloaded with the next entry's
 * pointer; the final "test" sets the flags from that pointer.
 *
 * Results: mm3 and mm4 (the two chroma sums). Scratch: mm0, mm2, mm5,
 * REG_a, REG_d, REG_S.
 * NOTE(review): the loop label and the conditional jump consuming the
 * "test" result are not visible in this excerpt; presumably the entry list
 * is terminated by a null pointer -- confirm.
 */
38 #define YSCALEYUV2PACKEDX_UV \
40 "xor %%"REG_a", %%"REG_a" \n\t"\
44 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
45 "mov (%%"REG_d"), %%"REG_S" \n\t"\
46 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
47 "movq %%mm3, %%mm4 \n\t"\
50 "movq 8(%%"REG_d"), %%mm0 \n\t" \
51 "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" \
52 "add %6, %%"REG_S" \n\t" \
53 "movq (%%"REG_S", %%"REG_a"), %%mm5 \n\t" \
54 "add $16, %%"REG_d" \n\t"\
55 "mov (%%"REG_d"), %%"REG_S" \n\t"\
56 "pmulhw %%mm0, %%mm2 \n\t"\
57 "pmulhw %%mm0, %%mm5 \n\t"\
58 "paddw %%mm2, %%mm3 \n\t"\
59 "paddw %%mm5, %%mm4 \n\t"\
60 "test %%"REG_S", %%"REG_S" \n\t"\
/*
 * YSCALEYUV2PACKEDX_YA(offset, coeff, src1, src2, dst1, dst2): asm fragment
 * accumulating filter taps for 8 consecutive 16-bit samples.
 *
 *   offset     - displacement from %0 of the filter entry list
 *   coeff      - MMX register that receives each entry's coefficient
 *   src1, src2 - MMX scratch registers for the two 8-byte sample loads
 *   dst1, dst2 - MMX accumulators, both seeded from VROUNDER_OFFSET(%0)
 *
 * Per 16-byte entry at REG_d: the pointer at offset 0 is in REG_S and the
 * coefficient at offset 8 goes to coeff; samples are read at
 * REG_S + 2*REG_a and 8 bytes further, multiplied by coeff with pmulhw and
 * added into dst1/dst2. REG_d advances by 16, REG_S is reloaded, and "test"
 * sets the flags from the new pointer.
 * REG_a is read but not initialised here.
 * NOTE(review): the loop label and the conditional jump are not visible in
 * this excerpt -- confirm.
 */
63 #define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
64 "lea "offset"(%0), %%"REG_d" \n\t"\
65 "mov (%%"REG_d"), %%"REG_S" \n\t"\
66 "movq "VROUNDER_OFFSET"(%0), "#dst1" \n\t"\
67 "movq "#dst1", "#dst2" \n\t"\
70 "movq 8(%%"REG_d"), "#coeff" \n\t" \
71 "movq (%%"REG_S", %%"REG_a", 2), "#src1" \n\t" \
72 "movq 8(%%"REG_S", %%"REG_a", 2), "#src2" \n\t" \
73 "add $16, %%"REG_d" \n\t"\
74 "mov (%%"REG_d"), %%"REG_S" \n\t"\
75 "pmulhw "#coeff", "#src1" \n\t"\
76 "pmulhw "#coeff", "#src2" \n\t"\
77 "paddw "#src1", "#dst1" \n\t"\
78 "paddw "#src2", "#dst2" \n\t"\
79 "test %%"REG_S", %%"REG_S" \n\t"\
/*
 * YSCALEYUV2PACKEDX: chroma accumulation (YSCALEYUV2PACKEDX_UV, results in
 * mm3/mm4) followed by the same accumulation over the entry list at
 * LUM_MMX_FILTER_OFFSET, using mm0 as coefficient, mm2/mm5 as scratch and
 * mm1/mm7 as the result registers.
 */
82 #define YSCALEYUV2PACKEDX \
83 YSCALEYUV2PACKEDX_UV \
84 YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \
/*
 * YSCALEYUV2PACKEDX_END: operand and clobber lists closing an asm statement
 * built from the YSCALEYUV2PACKEDX* fragments.
 * No outputs. Inputs:
 *   %0      "r"  &c->redDither (base address for the *_OFFSET displacements)
 *   %1..%3  "m"  dummy (placeholders that keep the operand numbering)
 *   %4      "r"  dest
 *   %5      "m"  dstW_reg
 *   %6      "m"  uv_off
 * Clobbers: REG_a, REG_d, REG_S.
 * The names c, dummy, dest, dstW_reg and uv_off must exist at the point of
 * expansion.
 */
86 #define YSCALEYUV2PACKEDX_END \
87 :: "r" (&c->redDither), \
88 "m" (dummy), "m" (dummy), "m" (dummy),\
89 "r" (dest), "m" (dstW_reg), "m"(uv_off) \
90 : "%"REG_a, "%"REG_d, "%"REG_S \
/*
 * YSCALEYUV2PACKEDX_ACCURATE_UV: chroma accumulation with 32-bit
 * intermediate sums.
 *
 * Setup: REG_a = 0, REG_d = address of CHR_MMX_FILTER_OFFSET(%0), REG_S =
 * pointer stored at (REG_d); accumulators mm4..mm7 are zeroed.
 *
 * Each entry at REG_d supplies two source pointers (offset 0 and
 * APCK_PTR2) and one coefficient quadword (APCK_COEF). For the first plane
 * (address REG_S+REG_a) and the second plane (the same after adding operand
 * %6 to REG_S), the words of the two sources are interleaved with
 * punpcklwd/punpckhwd and fed to pmaddwd, so each 32-bit lane receives
 * sample_a*coeff_a + sample_b*coeff_b. Lanes are summed into mm4/mm5 (first
 * plane, low/high half) and mm6/mm7 (second plane). REG_d advances by
 * APCK_SIZE; the "test" on the next pointer is issued before the
 * second-plane arithmetic, which consists only of MMX instructions.
 *
 * Tail: the sums are shifted right arithmetically by 16, packed to signed
 * words, the value at VROUNDER_OFFSET(%0) is added, and the two results are
 * stored to U_TEMP(%0) and V_TEMP(%0).
 * NOTE(review): the loop label and the conditional jump are not visible in
 * this excerpt -- confirm where the loop closes.
 */
93 #define YSCALEYUV2PACKEDX_ACCURATE_UV \
95 "xor %%"REG_a", %%"REG_a" \n\t"\
99 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
100 "mov (%%"REG_d"), %%"REG_S" \n\t"\
101 "pxor %%mm4, %%mm4 \n\t"\
102 "pxor %%mm5, %%mm5 \n\t"\
103 "pxor %%mm6, %%mm6 \n\t"\
104 "pxor %%mm7, %%mm7 \n\t"\
107 "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" \
108 "add %6, %%"REG_S" \n\t" \
109 "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" \
110 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
111 "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" \
112 "movq %%mm0, %%mm3 \n\t"\
113 "punpcklwd %%mm1, %%mm0 \n\t"\
114 "punpckhwd %%mm1, %%mm3 \n\t"\
115 "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1 \n\t" \
116 "pmaddwd %%mm1, %%mm0 \n\t"\
117 "pmaddwd %%mm1, %%mm3 \n\t"\
118 "paddd %%mm0, %%mm4 \n\t"\
119 "paddd %%mm3, %%mm5 \n\t"\
120 "add %6, %%"REG_S" \n\t" \
121 "movq (%%"REG_S", %%"REG_a"), %%mm3 \n\t" \
122 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
123 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
124 "test %%"REG_S", %%"REG_S" \n\t"\
125 "movq %%mm2, %%mm0 \n\t"\
126 "punpcklwd %%mm3, %%mm2 \n\t"\
127 "punpckhwd %%mm3, %%mm0 \n\t"\
128 "pmaddwd %%mm1, %%mm2 \n\t"\
129 "pmaddwd %%mm1, %%mm0 \n\t"\
130 "paddd %%mm2, %%mm6 \n\t"\
131 "paddd %%mm0, %%mm7 \n\t"\
133 "psrad $16, %%mm4 \n\t"\
134 "psrad $16, %%mm5 \n\t"\
135 "psrad $16, %%mm6 \n\t"\
136 "psrad $16, %%mm7 \n\t"\
137 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
138 "packssdw %%mm5, %%mm4 \n\t"\
139 "packssdw %%mm7, %%mm6 \n\t"\
140 "paddw %%mm0, %%mm4 \n\t"\
141 "paddw %%mm0, %%mm6 \n\t"\
142 "movq %%mm4, "U_TEMP"(%0) \n\t"\
143 "movq %%mm6, "V_TEMP"(%0) \n\t"\
/*
 * YSCALEYUV2PACKEDX_ACCURATE_YA(offset): 32-bit-accumulating counterpart of
 * YSCALEYUV2PACKEDX_YA for 8 consecutive 16-bit samples.
 *
 *   offset - displacement from %0 of the filter entry list
 *
 * Accumulators mm1/mm5 (samples 0-3, low/high pairs) and mm7/mm6
 * (samples 4-7) are zeroed. Each entry at REG_d supplies two source
 * pointers (offset 0 and APCK_PTR2) and a coefficient quadword (APCK_COEF,
 * loaded into mm4). Samples are read at pointer + 2*REG_a and 8 bytes
 * further; words from the two sources are interleaved and combined with
 * pmaddwd, then added into the accumulators. REG_d advances by APCK_SIZE
 * and "test" sets the flags from the next pointer.
 *
 * Tail: sums are shifted right by 16, packed to signed words (mm1 and mm7),
 * the value at VROUNDER_OFFSET(%0) is added, and mm3/mm4 are reloaded from
 * U_TEMP(%0)/V_TEMP(%0).
 * REG_a is read but not initialised here.
 * NOTE(review): the loop label and the conditional jump are not visible in
 * this excerpt -- confirm.
 */
145 #define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
146 "lea "offset"(%0), %%"REG_d" \n\t"\
147 "mov (%%"REG_d"), %%"REG_S" \n\t"\
148 "pxor %%mm1, %%mm1 \n\t"\
149 "pxor %%mm5, %%mm5 \n\t"\
150 "pxor %%mm7, %%mm7 \n\t"\
151 "pxor %%mm6, %%mm6 \n\t"\
154 "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" \
155 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" \
156 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
157 "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" \
158 "movq %%mm0, %%mm3 \n\t"\
159 "punpcklwd %%mm4, %%mm0 \n\t"\
160 "punpckhwd %%mm4, %%mm3 \n\t"\
161 "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4 \n\t" \
162 "pmaddwd %%mm4, %%mm0 \n\t"\
163 "pmaddwd %%mm4, %%mm3 \n\t"\
164 "paddd %%mm0, %%mm1 \n\t"\
165 "paddd %%mm3, %%mm5 \n\t"\
166 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" \
167 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
168 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
169 "test %%"REG_S", %%"REG_S" \n\t"\
170 "movq %%mm2, %%mm0 \n\t"\
171 "punpcklwd %%mm3, %%mm2 \n\t"\
172 "punpckhwd %%mm3, %%mm0 \n\t"\
173 "pmaddwd %%mm4, %%mm2 \n\t"\
174 "pmaddwd %%mm4, %%mm0 \n\t"\
175 "paddd %%mm2, %%mm7 \n\t"\
176 "paddd %%mm0, %%mm6 \n\t"\
178 "psrad $16, %%mm1 \n\t"\
179 "psrad $16, %%mm5 \n\t"\
180 "psrad $16, %%mm7 \n\t"\
181 "psrad $16, %%mm6 \n\t"\
182 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
183 "packssdw %%mm5, %%mm1 \n\t"\
184 "packssdw %%mm6, %%mm7 \n\t"\
185 "paddw %%mm0, %%mm1 \n\t"\
186 "paddw %%mm0, %%mm7 \n\t"\
187 "movq "U_TEMP"(%0), %%mm3 \n\t"\
188 "movq "V_TEMP"(%0), %%mm4 \n\t"\
/*
 * YSCALEYUV2PACKEDX_ACCURATE: the 32-bit-accumulating chroma pass followed
 * by the matching pass over the entry list at LUM_MMX_FILTER_OFFSET.
 * On exit mm3/mm4 hold the chroma results (reloaded from U_TEMP/V_TEMP)
 * and mm1/mm7 hold the results of the second pass.
 */
190 #define YSCALEYUV2PACKEDX_ACCURATE \
191 YSCALEYUV2PACKEDX_ACCURATE_UV \
192 YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)
/*
 * YSCALEYUV2RGBX: colour-space conversion of 8 pixels held in MMX registers.
 *
 * In:  mm3, mm4 = 4 chroma words each (U and V going by the *_OFFSET and
 *      *_COEFF names applied to them); mm1, mm7 = 4 luma words each.
 *      %0 is the base address for the U/V/Y offset and coefficient tables.
 * Steps: the offsets are subtracted; each chroma register is copied so one
 *      copy is scaled by UG_COEFF/VG_COEFF (summed into mm4) and the other
 *      by UB_COEFF (mm2) / VR_COEFF (mm5); luma is offset and scaled by
 *      Y_COEFF. punpcklwd/punpckhwd of a register with itself duplicates
 *      every chroma word, so each chroma term is added to two adjacent luma
 *      values (mm1 for the low pair-set, mm7 for the high one).
 * Out: three registers of 8 unsigned-saturated bytes each:
 *      mm2 = luma + UB term, mm4 = luma + UG + VG terms, mm5 = luma + VR term.
 * Scratch: mm0, mm3, mm6.
 */
194 #define YSCALEYUV2RGBX \
195 "psubw "U_OFFSET"(%0), %%mm3 \n\t" \
196 "psubw "V_OFFSET"(%0), %%mm4 \n\t" \
197 "movq %%mm3, %%mm2 \n\t" \
198 "movq %%mm4, %%mm5 \n\t" \
199 "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
200 "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
202 "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
203 "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
204 "psubw "Y_OFFSET"(%0), %%mm1 \n\t" \
205 "psubw "Y_OFFSET"(%0), %%mm7 \n\t" \
206 "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
207 "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
209 "paddw %%mm3, %%mm4 \n\t"\
210 "movq %%mm2, %%mm0 \n\t"\
211 "movq %%mm5, %%mm6 \n\t"\
212 "movq %%mm4, %%mm3 \n\t"\
213 "punpcklwd %%mm2, %%mm2 \n\t"\
214 "punpcklwd %%mm5, %%mm5 \n\t"\
215 "punpcklwd %%mm4, %%mm4 \n\t"\
216 "paddw %%mm1, %%mm2 \n\t"\
217 "paddw %%mm1, %%mm5 \n\t"\
218 "paddw %%mm1, %%mm4 \n\t"\
219 "punpckhwd %%mm0, %%mm0 \n\t"\
220 "punpckhwd %%mm6, %%mm6 \n\t"\
221 "punpckhwd %%mm3, %%mm3 \n\t"\
222 "paddw %%mm7, %%mm0 \n\t"\
223 "paddw %%mm7, %%mm6 \n\t"\
224 "paddw %%mm7, %%mm3 \n\t"\
226 "packuswb %%mm0, %%mm2 \n\t"\
227 "packuswb %%mm6, %%mm5 \n\t"\
228 "packuswb %%mm3, %%mm4 \n\t"\
/*
 * REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t):
 * interleaves four registers of 8 bytes each into 8 four-byte pixels and
 * stores them.
 *
 *   dst, index - base register and pixel index; the store address is
 *                dst + index*4
 *   dstw       - operand compared against index after the increment
 *   b, g, r, a - source MMX registers, 8 bytes each; b and r are overwritten
 *   q0, q2, q3, t - MMX scratch registers
 *
 * Byte order in memory per pixel is b, g, r, a. The four quadwords are
 * written with MOVNTQ in the order q0, b, q2, q3 (pixels 0-1, 2-3, 4-5,
 * 6-7). index is then advanced by 8 and "cmp dstw, index" sets the flags
 * for a branch supplied by the user of the macro.
 *
 * WRITEBGR32 forwards to REAL_WRITEBGR32 so arguments are macro-expanded
 * before being stringified.
 */
230 #define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
231 "movq "#b", "#q2" \n\t" \
232 "movq "#r", "#t" \n\t" \
233 "punpcklbw "#g", "#b" \n\t" \
234 "punpcklbw "#a", "#r" \n\t" \
235 "punpckhbw "#g", "#q2" \n\t" \
236 "punpckhbw "#a", "#t" \n\t" \
237 "movq "#b", "#q0" \n\t" \
238 "movq "#q2", "#q3" \n\t" \
239 "punpcklwd "#r", "#q0" \n\t" \
240 "punpckhwd "#r", "#b" \n\t" \
241 "punpcklwd "#t", "#q2" \n\t" \
242 "punpckhwd "#t", "#q3" \n\t" \
244 MOVNTQ( q0, (dst, index, 4))\
245 MOVNTQ( b, 8(dst, index, 4))\
246 MOVNTQ( q2, 16(dst, index, 4))\
247 MOVNTQ( q3, 24(dst, index, 4))\
249 "add $8, "#index" \n\t"\
250 "cmp "#dstw", "#index" \n\t"\
252 #define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
255 const int16_t **lumSrc,
int lumFilterSize,
256 const int16_t *chrFilter,
const int16_t **chrUSrc,
257 const int16_t **chrVSrc,
258 int chrFilterSize,
const int16_t **alpSrc,
263 x86_reg uv_off = c->uv_off_byte;
268 "movq %%mm2, "U_TEMP"(%0) \n\t"
269 "movq %%mm4, "V_TEMP"(%0) \n\t"
270 "movq %%mm5, "Y_TEMP"(%0) \n\t"
272 "movq "Y_TEMP"(%0), %%mm5 \n\t"
273 "psraw $3, %%mm1 \n\t"
274 "psraw $3, %%mm7 \n\t"
275 "packuswb %%mm7, %%mm1 \n\t"
276 WRITEBGR32(%4, %5, %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
281 "pcmpeqd %%mm7, %%mm7 \n\t"
282 WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
288 const int16_t **lumSrc,
int lumFilterSize,
289 const int16_t *chrFilter,
const int16_t **chrUSrc,
290 const int16_t **chrVSrc,
291 int chrFilterSize,
const int16_t **alpSrc,
296 x86_reg uv_off = c->uv_off_byte;
302 "psraw $3, %%mm1 \n\t"
303 "psraw $3, %%mm7 \n\t"
304 "packuswb %%mm7, %%mm1 \n\t"
305 WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
310 "pcmpeqd %%mm7, %%mm7 \n\t"
311 WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
/*
 * REAL_WRITERGB16(dst, dstw, index): packs 8 pixels from mm2/mm4/mm5
 * (8 bytes each) into 16-bit words and stores 16 bytes at dst + index*2.
 *
 * Bit layout of every output word:
 *   bits  0-4  : top 5 bits of the mm2 byte (masked with bF8, shifted right 3)
 *   bits  5-10 : top 6 bits of the mm4 byte (masked with bFC, widened to a
 *                word and shifted left 3)
 *   bits 11-15 : top 5 bits of the mm5 byte (masked with bF8, placed in the
 *                high byte by punpck*bw)
 * Masking before the 64-bit psrlq/psllq keeps bits from leaking between
 * neighbouring lanes.
 * mm7 is used as the zero high byte when widening mm4, so it must be zero
 * on entry. Scratch: mm1, mm3.
 * Afterwards index is advanced by 8 and "cmp dstw, index" sets the flags.
 *
 * WRITERGB16 forwards to REAL_WRITERGB16 so arguments are macro-expanded
 * before being stringified.
 */
316 #define REAL_WRITERGB16(dst, dstw, index) \
317 "pand "MANGLE(bF8)", %%mm2 \n\t" \
318 "pand "MANGLE(bFC)", %%mm4 \n\t" \
319 "pand "MANGLE(bF8)", %%mm5 \n\t" \
320 "psrlq $3, %%mm2 \n\t"\
322 "movq %%mm2, %%mm1 \n\t"\
323 "movq %%mm4, %%mm3 \n\t"\
325 "punpcklbw %%mm7, %%mm3 \n\t"\
326 "punpcklbw %%mm5, %%mm2 \n\t"\
327 "punpckhbw %%mm7, %%mm4 \n\t"\
328 "punpckhbw %%mm5, %%mm1 \n\t"\
330 "psllq $3, %%mm3 \n\t"\
331 "psllq $3, %%mm4 \n\t"\
333 "por %%mm3, %%mm2 \n\t"\
334 "por %%mm4, %%mm1 \n\t"\
336 MOVNTQ(%%mm2, (dst, index, 2))\
337 MOVNTQ(%%mm1, 8(dst, index, 2))\
339 "add $8, "#index" \n\t"\
340 "cmp "#dstw", "#index" \n\t"\
342 #define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index)
345 const int16_t **lumSrc,
int lumFilterSize,
346 const int16_t *chrFilter,
const int16_t **chrUSrc,
347 const int16_t **chrVSrc,
348 int chrFilterSize,
const int16_t **alpSrc,
353 x86_reg uv_off = c->uv_off_byte;
357 "pxor %%mm7, %%mm7 \n\t"
369 const int16_t **lumSrc,
int lumFilterSize,
370 const int16_t *chrFilter,
const int16_t **chrUSrc,
371 const int16_t **chrVSrc,
372 int chrFilterSize,
const int16_t **alpSrc,
377 x86_reg uv_off = c->uv_off_byte;
381 "pxor %%mm7, %%mm7 \n\t"
/*
 * REAL_WRITERGB15(dst, dstw, index): packs 8 pixels from mm2/mm4/mm5
 * (8 bytes each) into 15-bit-colour words and stores 16 bytes at
 * dst + index*2.
 *
 * Bit layout of every output word (bit 15 stays clear):
 *   bits  0-4  : top 5 bits of the mm2 byte (masked with bF8, shifted right 3)
 *   bits  5-9  : top 5 bits of the mm4 byte (masked with bF8, widened to a
 *                word and shifted left 2)
 *   bits 10-14 : top 5 bits of the mm5 byte (masked with bF8, shifted right
 *                1, placed in the high byte by punpck*bw)
 * mm7 is used as the zero high byte when widening mm4, so it must be zero
 * on entry. Scratch: mm1, mm3.
 * Afterwards index is advanced by 8 and "cmp dstw, index" sets the flags.
 *
 * WRITERGB15 forwards to REAL_WRITERGB15 so arguments are macro-expanded
 * before being stringified.
 */
392 #define REAL_WRITERGB15(dst, dstw, index) \
393 "pand "MANGLE(bF8)", %%mm2 \n\t" \
394 "pand "MANGLE(bF8)", %%mm4 \n\t" \
395 "pand "MANGLE(bF8)", %%mm5 \n\t" \
396 "psrlq $3, %%mm2 \n\t"\
397 "psrlq $1, %%mm5 \n\t"\
399 "movq %%mm2, %%mm1 \n\t"\
400 "movq %%mm4, %%mm3 \n\t"\
402 "punpcklbw %%mm7, %%mm3 \n\t"\
403 "punpcklbw %%mm5, %%mm2 \n\t"\
404 "punpckhbw %%mm7, %%mm4 \n\t"\
405 "punpckhbw %%mm5, %%mm1 \n\t"\
407 "psllq $2, %%mm3 \n\t"\
408 "psllq $2, %%mm4 \n\t"\
410 "por %%mm3, %%mm2 \n\t"\
411 "por %%mm4, %%mm1 \n\t"\
413 MOVNTQ(%%mm2, (dst, index, 2))\
414 MOVNTQ(%%mm1, 8(dst, index, 2))\
416 "add $8, "#index" \n\t"\
417 "cmp "#dstw", "#index" \n\t"\
419 #define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index)
422 const int16_t **lumSrc,
int lumFilterSize,
423 const int16_t *chrFilter,
const int16_t **chrUSrc,
424 const int16_t **chrVSrc,
425 int chrFilterSize,
const int16_t **alpSrc,
430 x86_reg uv_off = c->uv_off_byte;
434 "pxor %%mm7, %%mm7 \n\t"
446 const int16_t **lumSrc,
int lumFilterSize,
447 const int16_t *chrFilter,
const int16_t **chrUSrc,
448 const int16_t **chrVSrc,
449 int chrFilterSize,
const int16_t **alpSrc,
454 x86_reg uv_off = c->uv_off_byte;
458 "pxor %%mm7, %%mm7 \n\t"
/*
 * WRITEBGR24MMX(dst, dstw, index): plain-MMX store of 8 pixels as 24 packed
 * bytes (3 bytes per pixel, taken from mm2, mm4, mm5 in that order).
 *
 * 1. mm2/mm4/mm5 are byte-interleaved with mm7 as the fourth byte, giving
 *    four quadwords of two 4-byte pixels each (mm0, mm2, mm1, mm3).
 *    mm7 is expected to be zero on entry.
 * 2. "psllq $40" plus "punpckhdq" with the unshifted copy squeezes each
 *    pair into 6 contiguous bytes (bytes 1..6 of the quadword).
 * 3. Shifts and ORs splice the four 6-byte groups into three quadwords,
 *    written with MOVNTQ to dst, 8(dst) and 16(dst).
 *
 * dst itself is advanced by 24 (it is a pointer register that is modified),
 * index by 8, and "cmp dstw, index" sets the flags.
 * All of mm0..mm7 are overwritten.
 */
469 #define WRITEBGR24MMX(dst, dstw, index) \
471 "movq %%mm2, %%mm1 \n\t" \
472 "movq %%mm5, %%mm6 \n\t" \
473 "punpcklbw %%mm4, %%mm2 \n\t" \
474 "punpcklbw %%mm7, %%mm5 \n\t" \
475 "punpckhbw %%mm4, %%mm1 \n\t" \
476 "punpckhbw %%mm7, %%mm6 \n\t" \
477 "movq %%mm2, %%mm0 \n\t" \
478 "movq %%mm1, %%mm3 \n\t" \
479 "punpcklwd %%mm5, %%mm0 \n\t" \
480 "punpckhwd %%mm5, %%mm2 \n\t" \
481 "punpcklwd %%mm6, %%mm1 \n\t" \
482 "punpckhwd %%mm6, %%mm3 \n\t" \
484 "movq %%mm0, %%mm4 \n\t" \
485 "movq %%mm2, %%mm6 \n\t" \
486 "movq %%mm1, %%mm5 \n\t" \
487 "movq %%mm3, %%mm7 \n\t" \
489 "psllq $40, %%mm0 \n\t" \
490 "psllq $40, %%mm2 \n\t" \
491 "psllq $40, %%mm1 \n\t" \
492 "psllq $40, %%mm3 \n\t" \
494 "punpckhdq %%mm4, %%mm0 \n\t" \
495 "punpckhdq %%mm6, %%mm2 \n\t" \
496 "punpckhdq %%mm5, %%mm1 \n\t" \
497 "punpckhdq %%mm7, %%mm3 \n\t" \
499 "psrlq $8, %%mm0 \n\t" \
500 "movq %%mm2, %%mm6 \n\t" \
501 "psllq $40, %%mm2 \n\t" \
502 "por %%mm2, %%mm0 \n\t" \
503 MOVNTQ(%%mm0, (dst))\
505 "psrlq $24, %%mm6 \n\t" \
506 "movq %%mm1, %%mm5 \n\t" \
507 "psllq $24, %%mm1 \n\t" \
508 "por %%mm1, %%mm6 \n\t" \
509 MOVNTQ(%%mm6, 8(dst))\
511 "psrlq $40, %%mm5 \n\t" \
512 "psllq $8, %%mm3 \n\t" \
513 "por %%mm3, %%mm5 \n\t" \
514 MOVNTQ(%%mm5, 16(dst))\
516 "add $24, "#dst" \n\t"\
518 "add $8, "#index" \n\t"\
519 "cmp "#dstw", "#index" \n\t"\
/*
 * WRITEBGR24MMXEXT(dst, dstw, index): pshufw-based store of 8 pixels as 24
 * packed bytes, sourced from mm2, mm4 and mm5 (8 bytes each).
 *
 * mm0 and mm7 are loaded with the byte-lane masks ff_M24A and ff_M24C;
 * ff_M24B is applied straight from memory. For each of the three output
 * quadwords, pshufw replicates the needed source words into mm1/mm3/mm6,
 * the masks keep only the byte lanes that belong to each component, and
 * "por" merges them into mm6, which is written with MOVNTQ to dst, 8(dst)
 * and 16(dst). mm4 is aligned by byte shifts (a left shift of the shuffled
 * copy for the first quadword, a right shift of mm4 itself before the
 * remaining two).
 *
 * dst is advanced by 24 (the pointer register is modified), index by 8,
 * and "cmp dstw, index" sets the flags.
 * Overwrites mm0, mm1, mm3, mm4, mm6, mm7; mm2 and mm5 are only read.
 */
522 #define WRITEBGR24MMXEXT(dst, dstw, index) \
524 "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
525 "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
526 "pshufw $0x50, %%mm2, %%mm1 \n\t" \
527 "pshufw $0x50, %%mm4, %%mm3 \n\t" \
528 "pshufw $0x00, %%mm5, %%mm6 \n\t" \
530 "pand %%mm0, %%mm1 \n\t" \
531 "pand %%mm0, %%mm3 \n\t" \
532 "pand %%mm7, %%mm6 \n\t" \
534 "psllq $8, %%mm3 \n\t" \
535 "por %%mm1, %%mm6 \n\t"\
536 "por %%mm3, %%mm6 \n\t"\
537 MOVNTQ(%%mm6, (dst))\
539 "psrlq $8, %%mm4 \n\t" \
540 "pshufw $0xA5, %%mm2, %%mm1 \n\t" \
541 "pshufw $0x55, %%mm4, %%mm3 \n\t" \
542 "pshufw $0xA5, %%mm5, %%mm6 \n\t" \
544 "pand "MANGLE(ff_M24B)", %%mm1 \n\t" \
545 "pand %%mm7, %%mm3 \n\t" \
546 "pand %%mm0, %%mm6 \n\t" \
548 "por %%mm1, %%mm3 \n\t" \
549 "por %%mm3, %%mm6 \n\t"\
550 MOVNTQ(%%mm6, 8(dst))\
552 "pshufw $0xFF, %%mm2, %%mm1 \n\t" \
553 "pshufw $0xFA, %%mm4, %%mm3 \n\t" \
554 "pshufw $0xFA, %%mm5, %%mm6 \n\t" \
556 "pand %%mm7, %%mm1 \n\t" \
557 "pand %%mm0, %%mm3 \n\t" \
558 "pand "MANGLE(ff_M24B)", %%mm6 \n\t" \
560 "por %%mm1, %%mm3 \n\t"\
561 "por %%mm3, %%mm6 \n\t"\
562 MOVNTQ(%%mm6, 16(dst))\
564 "add $24, "#dst" \n\t"\
566 "add $8, "#index" \n\t"\
567 "cmp "#dstw", "#index" \n\t"\
/*
 * WRITEBGR24(dst, dstw, index): selects the 24-bit writer for this
 * template instance -- WRITEBGR24MMXEXT when COMPILE_TEMPLATE_MMXEXT is
 * set, WRITEBGR24MMX in the alternative definition.
 * NOTE(review): the #else/#endif that must separate the two definitions are
 * not visible in this excerpt -- confirm against the full file.
 */
570 #if COMPILE_TEMPLATE_MMXEXT
572 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMXEXT(dst, dstw, index)
575 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
579 const int16_t **lumSrc,
int lumFilterSize,
580 const int16_t *chrFilter,
const int16_t **chrUSrc,
581 const int16_t **chrVSrc,
582 int chrFilterSize,
const int16_t **alpSrc,
587 x86_reg uv_off = c->uv_off_byte;
591 "pxor %%mm7, %%mm7 \n\t"
592 "lea (%%"REG_a
", %%"REG_a
", 2), %%"REG_c
"\n\t"
593 "add %4, %%"REG_c
" \n\t"
595 ::
"r" (&c->redDither),
596 "m" (dummy),
"m" (dummy),
"m" (dummy),
597 "r" (dest),
"m" (dstW_reg),
"m"(uv_off)
598 :
"%"REG_a,
"%"REG_c,
"%"REG_d,
"%"REG_S
603 const int16_t **lumSrc,
int lumFilterSize,
604 const int16_t *chrFilter,
const int16_t **chrUSrc,
605 const int16_t **chrVSrc,
606 int chrFilterSize,
const int16_t **alpSrc,
611 x86_reg uv_off = c->uv_off_byte;
615 "pxor %%mm7, %%mm7 \n\t"
616 "lea (%%"REG_a
", %%"REG_a
", 2), %%"REG_c
" \n\t"
617 "add %4, %%"REG_c
" \n\t"
619 ::
"r" (&c->redDither),
620 "m" (dummy),
"m" (dummy),
"m" (dummy),
621 "r" (dest),
"m" (dstW_reg),
"m"(uv_off)
622 :
"%"REG_a,
"%"REG_c,
"%"REG_d,
"%"REG_S
/*
 * REAL_WRITEYUY2(dst, dstw, index): packs 8 pixels into 16 bytes and stores
 * them at dst + index*2.
 *
 * In:  mm1, mm7 = 4 words each for the 8 per-pixel samples;
 *      mm3, mm4 = 4 words each for the two subsampled planes.
 * All inputs are narrowed to unsigned-saturated bytes. mm3 and mm4 bytes
 * are interleaved (mm3 first), and the result is interleaved with the
 * per-pixel bytes, so memory holds: sample, mm3 byte, sample, mm4 byte, ...
 * (Y U Y V when mm1/mm7 carry luma and mm3/mm4 carry U and V, as the macro
 * name suggests).
 * Afterwards index is advanced by 8 and "cmp dstw, index" sets the flags.
 * mm1, mm3, mm4 and mm7 are overwritten.
 *
 * WRITEYUY2 forwards to REAL_WRITEYUY2 so arguments are macro-expanded
 * before being stringified.
 */
626 #define REAL_WRITEYUY2(dst, dstw, index) \
627 "packuswb %%mm3, %%mm3 \n\t"\
628 "packuswb %%mm4, %%mm4 \n\t"\
629 "packuswb %%mm7, %%mm1 \n\t"\
630 "punpcklbw %%mm4, %%mm3 \n\t"\
631 "movq %%mm1, %%mm7 \n\t"\
632 "punpcklbw %%mm3, %%mm1 \n\t"\
633 "punpckhbw %%mm3, %%mm7 \n\t"\
635 MOVNTQ(%%mm1, (dst, index, 2))\
636 MOVNTQ(%%mm7, 8(dst, index, 2))\
638 "add $8, "#index" \n\t"\
639 "cmp "#dstw", "#index" \n\t"\
641 #define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
644 const int16_t **lumSrc,
int lumFilterSize,
645 const int16_t *chrFilter,
const int16_t **chrUSrc,
646 const int16_t **chrVSrc,
647 int chrFilterSize,
const int16_t **alpSrc,
652 x86_reg uv_off = c->uv_off_byte;
656 "psraw $3, %%mm3 \n\t"
657 "psraw $3, %%mm4 \n\t"
658 "psraw $3, %%mm1 \n\t"
659 "psraw $3, %%mm7 \n\t"
665 const int16_t **lumSrc,
int lumFilterSize,
666 const int16_t *chrFilter,
const int16_t **chrUSrc,
667 const int16_t **chrVSrc,
668 int chrFilterSize,
const int16_t **alpSrc,
673 x86_reg uv_off = c->uv_off_byte;
677 "psraw $3, %%mm3 \n\t"
678 "psraw $3, %%mm4 \n\t"
679 "psraw $3, %%mm1 \n\t"
680 "psraw $3, %%mm7 \n\t"
/*
 * REAL_YSCALEYUV2RGB_UV(index, c): chroma half of the two-line blend plus
 * the first part of the colour conversion.
 *
 *   index - register used as byte index; it is zeroed here
 *   c     - base register for the context-relative displacements
 *
 * Operands %2 and %3 are the two chroma line pointers. 8 bytes are read
 * from each at index, and again after temporarily adding UV_OFF_BYTE(c) to
 * index (the subtraction restores it). Each pair is blended as
 *     line3 >> 4  +  pmulhw(line2 - line3, coefficient)
 * with the coefficient taken from CHR_MMX_FILTER_OFFSET+8(c), giving mm3
 * (first plane) and mm4 (second plane). U_OFFSET/V_OFFSET are subtracted,
 * copies are kept in mm2/mm5, and mm3/mm4 are scaled by UG_COEFF/VG_COEFF.
 * Scratch: mm0.
 * NOTE(review): index is cleared at the top, so a loop label presumably
 * follows the "xor" in the full file; it is not visible here -- confirm.
 */
685 #define REAL_YSCALEYUV2RGB_UV(index, c) \
686 "xor "#index", "#index" \n\t"\
689 "movq (%2, "#index"), %%mm2 \n\t" \
690 "movq (%3, "#index"), %%mm3 \n\t" \
691 "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
692 "movq (%2, "#index"), %%mm5 \n\t" \
693 "movq (%3, "#index"), %%mm4 \n\t" \
694 "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
695 "psubw %%mm3, %%mm2 \n\t" \
696 "psubw %%mm4, %%mm5 \n\t" \
697 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
698 "pmulhw %%mm0, %%mm2 \n\t" \
699 "pmulhw %%mm0, %%mm5 \n\t" \
700 "psraw $4, %%mm3 \n\t" \
701 "psraw $4, %%mm4 \n\t" \
702 "paddw %%mm2, %%mm3 \n\t" \
703 "paddw %%mm5, %%mm4 \n\t" \
704 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" \
705 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" \
706 "movq %%mm3, %%mm2 \n\t" \
707 "movq %%mm4, %%mm5 \n\t" \
708 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
709 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
/*
 * REAL_YSCALEYUV2RGB_YA(index, c, b1, b2): blends 8 consecutive 16-bit
 * samples from two lines.
 *
 *   index  - register holding the sample index (addresses use index*2)
 *   c      - base register for LUM_MMX_FILTER_OFFSET
 *   b1, b2 - the two line pointers
 *
 * Result: mm1 (samples 0-3) and mm7 (samples 4-7), each computed as
 *     b2 >> 4  +  pmulhw(b1 - b2, coefficient at LUM_MMX_FILTER_OFFSET+8(c))
 * Scratch: mm0, mm6.
 */
712 #define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
713 "movq ("#b1", "#index", 2), %%mm0 \n\t" \
714 "movq ("#b2", "#index", 2), %%mm1 \n\t" \
715 "movq 8("#b1", "#index", 2), %%mm6 \n\t" \
716 "movq 8("#b2", "#index", 2), %%mm7 \n\t" \
717 "psubw %%mm1, %%mm0 \n\t" \
718 "psubw %%mm7, %%mm6 \n\t" \
719 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" \
720 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" \
721 "psraw $4, %%mm1 \n\t" \
722 "psraw $4, %%mm7 \n\t" \
723 "paddw %%mm0, %%mm1 \n\t" \
724 "paddw %%mm6, %%mm7 \n\t" \
/*
 * REAL_YSCALEYUV2RGB_COEFF(c): second part of the colour conversion, with
 * all tables addressed relative to register c.
 *
 * In:  mm2, mm5 = offset-corrected chroma copies; mm3, mm4 = chroma already
 *      scaled by UG_COEFF/VG_COEFF; mm1, mm7 = 4 luma words each.
 * Steps: mm2 and mm5 are scaled by UB_COEFF and VR_COEFF; luma has
 *      Y_OFFSET subtracted and is scaled by Y_COEFF; mm3 is added into mm4.
 *      Every chroma word is duplicated (punpck?wd of a register with
 *      itself) so it is added to two adjacent luma values.
 * Out: 8 unsigned-saturated bytes each in mm2 (luma + UB term),
 *      mm4 (luma + UG + VG terms) and mm5 (luma + VR term).
 * Scratch: mm0, mm3, mm6.
 */
726 #define REAL_YSCALEYUV2RGB_COEFF(c) \
727 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
728 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
729 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" \
730 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" \
731 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
732 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
734 "paddw %%mm3, %%mm4 \n\t"\
735 "movq %%mm2, %%mm0 \n\t"\
736 "movq %%mm5, %%mm6 \n\t"\
737 "movq %%mm4, %%mm3 \n\t"\
738 "punpcklwd %%mm2, %%mm2 \n\t"\
739 "punpcklwd %%mm5, %%mm5 \n\t"\
740 "punpcklwd %%mm4, %%mm4 \n\t"\
741 "paddw %%mm1, %%mm2 \n\t"\
742 "paddw %%mm1, %%mm5 \n\t"\
743 "paddw %%mm1, %%mm4 \n\t"\
744 "punpckhwd %%mm0, %%mm0 \n\t"\
745 "punpckhwd %%mm6, %%mm6 \n\t"\
746 "punpckhwd %%mm3, %%mm3 \n\t"\
747 "paddw %%mm7, %%mm0 \n\t"\
748 "paddw %%mm7, %%mm6 \n\t"\
749 "paddw %%mm7, %%mm3 \n\t"\
751 "packuswb %%mm0, %%mm2 \n\t"\
752 "packuswb %%mm6, %%mm5 \n\t"\
753 "packuswb %%mm3, %%mm4 \n\t"\
/*
 * YSCALEYUV2RGB_YA forwards to REAL_YSCALEYUV2RGB_YA so its arguments are
 * macro-expanded before being stringified.
 *
 * YSCALEYUV2RGB(index, c): complete two-line blend and colour conversion:
 * chroma blend (operands %2/%3), luma blend using operands %0 and %1 as the
 * two line pointers, then the coefficient stage. Leaves the three 8-byte
 * colour components in mm2, mm4 and mm5.
 */
755 #define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)
757 #define YSCALEYUV2RGB(index, c) \
758 REAL_YSCALEYUV2RGB_UV(index, c) \
759 REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
760 REAL_YSCALEYUV2RGB_COEFF(c)
766 const int16_t *ubuf[2],
const int16_t *vbuf[2],
767 const int16_t *abuf[2],
uint8_t *dest,
768 int dstW,
int yalpha,
int uvalpha,
int y)
770 const int16_t *buf0 = buf[0], *buf1 = buf[1],
771 *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
774 const int16_t *abuf0 = abuf[0], *abuf1 = abuf[1];
779 "psraw $3, %%mm1 \n\t"
780 "psraw $3, %%mm7 \n\t"
781 "packuswb %%mm7, %%mm1 \n\t"
782 WRITEBGR32(%4, 8280(%5), %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
783 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"r" (dest),
785 "r" (abuf0),
"r" (abuf1)
789 *(
const uint16_t **)(&c->u_temp)=abuf0;
790 *(
const uint16_t **)(&c->v_temp)=abuf1;
793 "mov %4, %%"REG_b
" \n\t"
794 "push %%"REG_BP
" \n\t"
798 "mov "U_TEMP"(%5), %0 \n\t"
799 "mov "V_TEMP"(%5), %1 \n\t"
801 "psraw $3, %%mm1 \n\t"
802 "psraw $3, %%mm7 \n\t"
803 "packuswb %%mm7, %%mm1 \n\t"
806 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
807 "pop %%"REG_BP
" \n\t"
809 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
816 "mov %4, %%"REG_b
" \n\t"
817 "push %%"REG_BP
" \n\t"
819 "pcmpeqd %%mm7, %%mm7 \n\t"
820 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
821 "pop %%"REG_BP
" \n\t"
823 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
830 const int16_t *ubuf[2],
const int16_t *vbuf[2],
831 const int16_t *abuf[2],
uint8_t *dest,
832 int dstW,
int yalpha,
int uvalpha,
int y)
834 const int16_t *buf0 = buf[0], *buf1 = buf[1],
835 *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
840 "mov %4, %%"REG_b
" \n\t"
841 "push %%"REG_BP
" \n\t"
843 "pxor %%mm7, %%mm7 \n\t"
845 "pop %%"REG_BP
" \n\t"
847 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
853 const int16_t *ubuf[2],
const int16_t *vbuf[2],
854 const int16_t *abuf[2],
uint8_t *dest,
855 int dstW,
int yalpha,
int uvalpha,
int y)
857 const int16_t *buf0 = buf[0], *buf1 = buf[1],
858 *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
863 "mov %4, %%"REG_b
" \n\t"
864 "push %%"REG_BP
" \n\t"
866 "pxor %%mm7, %%mm7 \n\t"
874 "pop %%"REG_BP
" \n\t"
876 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
882 const int16_t *ubuf[2],
const int16_t *vbuf[2],
883 const int16_t *abuf[2],
uint8_t *dest,
884 int dstW,
int yalpha,
int uvalpha,
int y)
886 const int16_t *buf0 = buf[0], *buf1 = buf[1],
887 *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
892 "mov %4, %%"REG_b
" \n\t"
893 "push %%"REG_BP
" \n\t"
895 "pxor %%mm7, %%mm7 \n\t"
903 "pop %%"REG_BP
" \n\t"
905 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
/*
 * REAL_YSCALEYUV2PACKED(index, c): two-line blend for packed output, with
 * no colour conversion.
 *
 *   index - register used as index; it is zeroed here
 *   c     - base register for the context-relative displacements
 *
 * NOTE: the first six instructions load the coefficients stored at
 * CHR_MMX_FILTER_OFFSET+8(c) and LUM_MMX_FILTER_OFFSET+8(c), shift them
 * right by 3 and write them back, i.e. the memory addressed through c is
 * modified whenever this fragment runs.
 *
 * Chroma: operands %2/%3 are read at index and at index + UV_OFF_BYTE(c);
 * results mm3 and mm4 = line3 >> 7 + pmulhw(line2 - line3, coefficient).
 * Luma: operands %0/%1 are read at index*2 and 8 bytes further; results
 * mm1 and mm7 = line1 >> 7 + pmulhw(line0 - line1, coefficient).
 * Scratch: mm0, mm2, mm5, mm6.
 * NOTE(review): a loop label presumably follows the "xor" in the full file;
 * it is not visible here -- confirm.
 *
 * YSCALEYUV2PACKED forwards to REAL_YSCALEYUV2PACKED so arguments are
 * macro-expanded before being stringified.
 */
910 #define REAL_YSCALEYUV2PACKED(index, c) \
911 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
912 "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
913 "psraw $3, %%mm0 \n\t"\
914 "psraw $3, %%mm1 \n\t"\
915 "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
916 "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
917 "xor "#index", "#index" \n\t"\
920 "movq (%2, "#index"), %%mm2 \n\t" \
921 "movq (%3, "#index"), %%mm3 \n\t" \
922 "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
923 "movq (%2, "#index"), %%mm5 \n\t" \
924 "movq (%3, "#index"), %%mm4 \n\t" \
925 "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
926 "psubw %%mm3, %%mm2 \n\t" \
927 "psubw %%mm4, %%mm5 \n\t" \
928 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
929 "pmulhw %%mm0, %%mm2 \n\t" \
930 "pmulhw %%mm0, %%mm5 \n\t" \
931 "psraw $7, %%mm3 \n\t" \
932 "psraw $7, %%mm4 \n\t" \
933 "paddw %%mm2, %%mm3 \n\t" \
934 "paddw %%mm5, %%mm4 \n\t" \
935 "movq (%0, "#index", 2), %%mm0 \n\t" \
936 "movq (%1, "#index", 2), %%mm1 \n\t" \
937 "movq 8(%0, "#index", 2), %%mm6 \n\t" \
938 "movq 8(%1, "#index", 2), %%mm7 \n\t" \
939 "psubw %%mm1, %%mm0 \n\t" \
940 "psubw %%mm7, %%mm6 \n\t" \
941 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" \
942 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" \
943 "psraw $7, %%mm1 \n\t" \
944 "psraw $7, %%mm7 \n\t" \
945 "paddw %%mm0, %%mm1 \n\t" \
946 "paddw %%mm6, %%mm7 \n\t" \
948 #define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
951 const int16_t *ubuf[2],
const int16_t *vbuf[2],
952 const int16_t *abuf[2],
uint8_t *dest,
953 int dstW,
int yalpha,
int uvalpha,
int y)
955 const int16_t *buf0 = buf[0], *buf1 = buf[1],
956 *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
961 "mov %4, %%"REG_b
" \n\t"
962 "push %%"REG_BP
" \n\t"
965 "pop %%"REG_BP
" \n\t"
967 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
/*
 * REAL_YSCALEYUV2RGB1(index, c): colour conversion of 8 pixels from a
 * single input line (no blending between lines).
 *
 *   index - register used as index; it is zeroed here
 *   c     - base register for the offset and coefficient tables
 *
 * Chroma is read only through operand %2: 8 bytes at index (mm3) and 8
 * bytes at index + UV_OFF_BYTE(c) (mm4), both shifted right by 4. Luma is
 * read through operand %0 at index*2 and 8 bytes further, also shifted
 * right by 4. The remaining steps subtract U/V/Y offsets, apply the
 * UG/VG/UB/VR/Y coefficients with pmulhw, duplicate each chroma word so it
 * pairs with two luma values, and pack with unsigned saturation.
 * Out: mm2 (luma + UB term), mm4 (luma + UG + VG terms), mm5 (luma + VR
 * term), 8 bytes each. Scratch: mm0, mm1, mm3, mm6, mm7.
 * NOTE(review): a loop label presumably follows the "xor" in the full file;
 * it is not visible here -- confirm.
 *
 * YSCALEYUV2RGB1 forwards to REAL_YSCALEYUV2RGB1 so arguments are
 * macro-expanded before being stringified.
 */
972 #define REAL_YSCALEYUV2RGB1(index, c) \
973 "xor "#index", "#index" \n\t"\
976 "movq (%2, "#index"), %%mm3 \n\t" \
977 "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
978 "movq (%2, "#index"), %%mm4 \n\t" \
979 "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
980 "psraw $4, %%mm3 \n\t" \
981 "psraw $4, %%mm4 \n\t" \
982 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" \
983 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" \
984 "movq %%mm3, %%mm2 \n\t" \
985 "movq %%mm4, %%mm5 \n\t" \
986 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
987 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
989 "movq (%0, "#index", 2), %%mm1 \n\t" \
990 "movq 8(%0, "#index", 2), %%mm7 \n\t" \
991 "psraw $4, %%mm1 \n\t" \
992 "psraw $4, %%mm7 \n\t" \
993 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
994 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
995 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" \
996 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" \
997 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
998 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
1000 "paddw %%mm3, %%mm4 \n\t"\
1001 "movq %%mm2, %%mm0 \n\t"\
1002 "movq %%mm5, %%mm6 \n\t"\
1003 "movq %%mm4, %%mm3 \n\t"\
1004 "punpcklwd %%mm2, %%mm2 \n\t"\
1005 "punpcklwd %%mm5, %%mm5 \n\t"\
1006 "punpcklwd %%mm4, %%mm4 \n\t"\
1007 "paddw %%mm1, %%mm2 \n\t"\
1008 "paddw %%mm1, %%mm5 \n\t"\
1009 "paddw %%mm1, %%mm4 \n\t"\
1010 "punpckhwd %%mm0, %%mm0 \n\t"\
1011 "punpckhwd %%mm6, %%mm6 \n\t"\
1012 "punpckhwd %%mm3, %%mm3 \n\t"\
1013 "paddw %%mm7, %%mm0 \n\t"\
1014 "paddw %%mm7, %%mm6 \n\t"\
1015 "paddw %%mm7, %%mm3 \n\t"\
1017 "packuswb %%mm0, %%mm2 \n\t"\
1018 "packuswb %%mm6, %%mm5 \n\t"\
1019 "packuswb %%mm3, %%mm4 \n\t"\
1021 #define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
/*
 * REAL_YSCALEYUV2RGB1b(index, c): variant of REAL_YSCALEYUV2RGB1 that
 * combines two chroma lines.
 *
 *   index - register used as index; it is zeroed here
 *   c     - base register for the offset and coefficient tables
 *
 * Chroma is read through operands %2 and %3 (at index and at
 * index + UV_OFF_BYTE(c)); the two lines are added and the sum is shifted
 * right by 5, i.e. their mean at the same scale as the ">> 4" used for a
 * single line. The shift is psrlw (logical), so the 16-bit sum is treated
 * as unsigned. Luma is read through operand %0 only and shifted right by 4.
 * The conversion tail is the same sequence as in REAL_YSCALEYUV2RGB1.
 * Out: mm2, mm4, mm5 (8 bytes each). Scratch: mm0, mm1, mm3, mm6, mm7.
 * NOTE(review): a loop label presumably follows the "xor" in the full file;
 * it is not visible here -- confirm.
 *
 * YSCALEYUV2RGB1b forwards to REAL_YSCALEYUV2RGB1b so arguments are
 * macro-expanded before being stringified.
 */
1024 #define REAL_YSCALEYUV2RGB1b(index, c) \
1025 "xor "#index", "#index" \n\t"\
1028 "movq (%2, "#index"), %%mm2 \n\t" \
1029 "movq (%3, "#index"), %%mm3 \n\t" \
1030 "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
1031 "movq (%2, "#index"), %%mm5 \n\t" \
1032 "movq (%3, "#index"), %%mm4 \n\t" \
1033 "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
1034 "paddw %%mm2, %%mm3 \n\t" \
1035 "paddw %%mm5, %%mm4 \n\t" \
1036 "psrlw $5, %%mm3 \n\t" \
1037 "psrlw $5, %%mm4 \n\t" \
1038 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" \
1039 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" \
1040 "movq %%mm3, %%mm2 \n\t" \
1041 "movq %%mm4, %%mm5 \n\t" \
1042 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
1043 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
1045 "movq (%0, "#index", 2), %%mm1 \n\t" \
1046 "movq 8(%0, "#index", 2), %%mm7 \n\t" \
1047 "psraw $4, %%mm1 \n\t" \
1048 "psraw $4, %%mm7 \n\t" \
1049 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
1050 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
1051 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" \
1052 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" \
1053 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
1054 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
1056 "paddw %%mm3, %%mm4 \n\t"\
1057 "movq %%mm2, %%mm0 \n\t"\
1058 "movq %%mm5, %%mm6 \n\t"\
1059 "movq %%mm4, %%mm3 \n\t"\
1060 "punpcklwd %%mm2, %%mm2 \n\t"\
1061 "punpcklwd %%mm5, %%mm5 \n\t"\
1062 "punpcklwd %%mm4, %%mm4 \n\t"\
1063 "paddw %%mm1, %%mm2 \n\t"\
1064 "paddw %%mm1, %%mm5 \n\t"\
1065 "paddw %%mm1, %%mm4 \n\t"\
1066 "punpckhwd %%mm0, %%mm0 \n\t"\
1067 "punpckhwd %%mm6, %%mm6 \n\t"\
1068 "punpckhwd %%mm3, %%mm3 \n\t"\
1069 "paddw %%mm7, %%mm0 \n\t"\
1070 "paddw %%mm7, %%mm6 \n\t"\
1071 "paddw %%mm7, %%mm3 \n\t"\
1073 "packuswb %%mm0, %%mm2 \n\t"\
1074 "packuswb %%mm6, %%mm5 \n\t"\
1075 "packuswb %%mm3, %%mm4 \n\t"\
1077 #define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
/*
 * REAL_YSCALEYUV2RGB1_ALPHA(index): loads 8 consecutive 16-bit samples
 * through operand %1 at index*2, shifts each right arithmetically by 7 and
 * packs them with unsigned saturation into the 8 bytes of mm7.
 * mm1 is used as scratch for samples 4-7.
 *
 * YSCALEYUV2RGB1_ALPHA forwards to REAL_YSCALEYUV2RGB1_ALPHA so the
 * argument is macro-expanded before being stringified.
 */
1079 #define REAL_YSCALEYUV2RGB1_ALPHA(index) \
1080 "movq (%1, "#index", 2), %%mm7 \n\t" \
1081 "movq 8(%1, "#index", 2), %%mm1 \n\t" \
1082 "psraw $7, %%mm7 \n\t" \
1083 "psraw $7, %%mm1 \n\t" \
1084 "packuswb %%mm1, %%mm7 \n\t"
1085 #define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index)
1091 const int16_t *ubuf[2],
const int16_t *vbuf[2],
1092 const int16_t *abuf0,
uint8_t *dest,
1093 int dstW,
int uvalpha,
int y)
1095 const int16_t *ubuf0 = ubuf[0];
1096 const int16_t *buf1= buf0;
1098 if (uvalpha < 2048) {
1099 const int16_t *ubuf1 = ubuf[0];
1103 "mov %4, %%"REG_b
" \n\t"
1104 "push %%"REG_BP
" \n\t"
1107 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1108 "pop %%"REG_BP
" \n\t"
1110 ::
"c" (buf0),
"d" (abuf0),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
1116 "mov %4, %%"REG_b
" \n\t"
1117 "push %%"REG_BP
" \n\t"
1119 "pcmpeqd %%mm7, %%mm7 \n\t"
1120 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1121 "pop %%"REG_BP
" \n\t"
1123 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
1128 const int16_t *ubuf1 = ubuf[1];
1132 "mov %4, %%"REG_b
" \n\t"
1133 "push %%"REG_BP
" \n\t"
1136 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1137 "pop %%"REG_BP
" \n\t"
1139 ::
"c" (buf0),
"d" (abuf0),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
1145 "mov %4, %%"REG_b
" \n\t"
1146 "push %%"REG_BP
" \n\t"
1148 "pcmpeqd %%mm7, %%mm7 \n\t"
1149 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1150 "pop %%"REG_BP
" \n\t"
1152 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
1160 const int16_t *ubuf[2],
const int16_t *vbuf[2],
1161 const int16_t *abuf0,
uint8_t *dest,
1162 int dstW,
int uvalpha,
int y)
1164 const int16_t *ubuf0 = ubuf[0];
1165 const int16_t *buf1= buf0;
1167 if (uvalpha < 2048) {
1168 const int16_t *ubuf1 = ubuf[0];
1171 "mov %4, %%"REG_b
" \n\t"
1172 "push %%"REG_BP
" \n\t"
1174 "pxor %%mm7, %%mm7 \n\t"
1176 "pop %%"REG_BP
" \n\t"
1178 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
1182 const int16_t *ubuf1 = ubuf[1];
1185 "mov %4, %%"REG_b
" \n\t"
1186 "push %%"REG_BP
" \n\t"
1188 "pxor %%mm7, %%mm7 \n\t"
1190 "pop %%"REG_BP
" \n\t"
1192 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
1199 const int16_t *ubuf[2],
const int16_t *vbuf[2],
1200 const int16_t *abuf0,
uint8_t *dest,
1201 int dstW,
int uvalpha,
int y)
1203 const int16_t *ubuf0 = ubuf[0];
1204 const int16_t *buf1= buf0;
1206 if (uvalpha < 2048) {
1207 const int16_t *ubuf1 = ubuf[0];
1210 "mov %4, %%"REG_b
" \n\t"
1211 "push %%"REG_BP
" \n\t"
1213 "pxor %%mm7, %%mm7 \n\t"
1221 "pop %%"REG_BP
" \n\t"
1223 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
1227 const int16_t *ubuf1 = ubuf[1];
1230 "mov %4, %%"REG_b
" \n\t"
1231 "push %%"REG_BP
" \n\t"
1233 "pxor %%mm7, %%mm7 \n\t"
1241 "pop %%"REG_BP
" \n\t"
1243 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
1250 const int16_t *ubuf[2],
const int16_t *vbuf[2],
1251 const int16_t *abuf0,
uint8_t *dest,
1252 int dstW,
int uvalpha,
int y)
1254 const int16_t *ubuf0 = ubuf[0];
1255 const int16_t *buf1= buf0;
1257 if (uvalpha < 2048) {
1258 const int16_t *ubuf1 = ubuf[0];
1261 "mov %4, %%"REG_b
" \n\t"
1262 "push %%"REG_BP
" \n\t"
1264 "pxor %%mm7, %%mm7 \n\t"
1272 "pop %%"REG_BP
" \n\t"
1274 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
1278 const int16_t *ubuf1 = ubuf[1];
1281 "mov %4, %%"REG_b
" \n\t"
1282 "push %%"REG_BP
" \n\t"
1284 "pxor %%mm7, %%mm7 \n\t"
1292 "pop %%"REG_BP
" \n\t"
1294 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
/*
 * REAL_YSCALEYUV2PACKED1(index, c): single-line load for packed output.
 *
 *   index - register used as index; it is zeroed here
 *   c     - base register for UV_OFF_BYTE
 *
 * Chroma is read through operand %2 only: 8 bytes at index (mm3) and 8
 * bytes at index + UV_OFF_BYTE(c) (mm4). Luma is read through operand %0 at
 * index*2 (mm1) and 8 bytes further (mm7). All four registers are shifted
 * right arithmetically by 7.
 * NOTE(review): a loop label presumably follows the "xor" in the full file;
 * it is not visible here -- confirm.
 *
 * YSCALEYUV2PACKED1 forwards to REAL_YSCALEYUV2PACKED1 so arguments are
 * macro-expanded before being stringified.
 */
1300 #define REAL_YSCALEYUV2PACKED1(index, c) \
1301 "xor "#index", "#index" \n\t"\
1304 "movq (%2, "#index"), %%mm3 \n\t" \
1305 "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
1306 "movq (%2, "#index"), %%mm4 \n\t" \
1307 "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
1308 "psraw $7, %%mm3 \n\t" \
1309 "psraw $7, %%mm4 \n\t" \
1310 "movq (%0, "#index", 2), %%mm1 \n\t" \
1311 "movq 8(%0, "#index", 2), %%mm7 \n\t" \
1312 "psraw $7, %%mm1 \n\t" \
1313 "psraw $7, %%mm7 \n\t" \
1315 #define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)
/*
 * REAL_YSCALEYUV2PACKED1b(index, c): variant of REAL_YSCALEYUV2PACKED1 that
 * combines two chroma lines.
 *
 *   index - register used as index; it is zeroed here
 *   c     - base register for UV_OFF_BYTE
 *
 * Chroma is read through operands %2 and %3 (at index and at
 * index + UV_OFF_BYTE(c)); the two lines are added and the sum is shifted
 * right by 8 with psrlw (logical shift), giving mm3 and mm4 -- the mean of
 * the two lines at the same scale as the ">> 7" applied to a single line.
 * Luma is read through operand %0 only and shifted right arithmetically by
 * 7 (mm1, mm7). Scratch: mm2, mm5.
 * NOTE(review): a loop label presumably follows the "xor" in the full file;
 * it is not visible here -- confirm.
 *
 * YSCALEYUV2PACKED1b forwards to REAL_YSCALEYUV2PACKED1b so arguments are
 * macro-expanded before being stringified.
 */
1317 #define REAL_YSCALEYUV2PACKED1b(index, c) \
1318 "xor "#index", "#index" \n\t"\
1321 "movq (%2, "#index"), %%mm2 \n\t" \
1322 "movq (%3, "#index"), %%mm3 \n\t" \
1323 "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
1324 "movq (%2, "#index"), %%mm5 \n\t" \
1325 "movq (%3, "#index"), %%mm4 \n\t" \
1326 "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
1327 "paddw %%mm2, %%mm3 \n\t" \
1328 "paddw %%mm5, %%mm4 \n\t" \
1329 "psrlw $8, %%mm3 \n\t" \
1330 "psrlw $8, %%mm4 \n\t" \
1331 "movq (%0, "#index", 2), %%mm1 \n\t" \
1332 "movq 8(%0, "#index", 2), %%mm7 \n\t" \
1333 "psraw $7, %%mm1 \n\t" \
1334 "psraw $7, %%mm7 \n\t"
1335 #define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
1338 const int16_t *ubuf[2],
const int16_t *vbuf[2],
1339 const int16_t *abuf0,
uint8_t *dest,
1340 int dstW,
int uvalpha,
int y)
1342 const int16_t *ubuf0 = ubuf[0];
1343 const int16_t *buf1= buf0;
1345 if (uvalpha < 2048) {
1346 const int16_t *ubuf1 = ubuf[0];
1349 "mov %4, %%"REG_b
" \n\t"
1350 "push %%"REG_BP
" \n\t"
1353 "pop %%"REG_BP
" \n\t"
1355 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
1359 const int16_t *ubuf1 = ubuf[1];
1362 "mov %4, %%"REG_b
" \n\t"
1363 "push %%"REG_BP
" \n\t"
1366 "pop %%"REG_BP
" \n\t"
1368 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
/*
 * Fragments of the MMXEXT luma horizontal scaler (presumably hyscale_fast;
 * the line carrying the function name is missing from this listing).
 * Compiled only when COMPILE_TEMPLATE_MMXEXT is non-zero.
 * NOTE(review): many original lines are absent here; the embedded listing
 * numbers show the gaps.
 */
1374 #if COMPILE_TEMPLATE_MMXEXT
1376 int dstWidth,
const uint8_t *src,
/* Luma filter position/coefficient tables and the filter-code pointer are read from the context. */
1379 int32_t *filterPos = c->hLumFilterPos;
1380 int16_t *
filter = c->hLumFilter;
1381 void *mmxextFilterCode = c->lumMmxextFilterCode;
/*
 * Prologue, two variants (the preprocessor conditionals choosing between
 * them are not in this listing). First variant: store REG_b into operand %5
 * and copy the quadword at -8(%rsp) through REG_a into operand %6.
 * Operands %5/%6 lie beyond the operands visible below; presumably they are
 * scratch save slots -- TODO confirm.
 */
1392 "mov %%"REG_b
", %5 \n\t"
1394 "mov -8(%%rsp), %%"REG_a
" \n\t"
1395 "mov %%"REG_a
", %6 \n\t"
/* Second variant: only the quadword at -8(%rsp) is saved, into operand %5. */
1399 "mov -8(%%rsp), %%"REG_a
" \n\t"
1400 "mov %%"REG_a
", %5 \n\t"
/* Zero mm7, then load src -> REG_c, dst -> REG_D, filter -> REG_d, filterPos -> REG_b and clear REG_a. */
1403 "pxor %%mm7, %%mm7 \n\t"
1404 "mov %0, %%"REG_c
" \n\t"
1405 "mov %1, %%"REG_D
" \n\t"
1406 "mov %2, %%"REG_d
" \n\t"
1407 "mov %3, %%"REG_b
" \n\t"
1408 "xor %%"REG_a
", %%"REG_a
" \n\t"
/*
 * CALL_MMXEXT_FILTER_CODE is defined twice (the conditionals selecting one
 * definition are not in this listing; numbers 1416 and 1425 are also
 * missing). Visible steps of the first definition: load the 32-bit value at
 * (REG_b) into esi; load the 32-bit value at (REG_b + REG_a) into esi; add
 * REG_S to REG_c; add REG_a to REG_D; clear REG_a. The second definition
 * instead adds the 32-bit value at (REG_b + REG_a) directly to REG_c.
 * The macro is then expanded eight times in a row.
 */
1414 #define CALL_MMXEXT_FILTER_CODE \
1415 "movl (%%"REG_b"), %%esi \n\t"\
1417 "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
1418 "add %%"REG_S", %%"REG_c" \n\t"\
1419 "add %%"REG_a", %%"REG_D" \n\t"\
1420 "xor %%"REG_a", %%"REG_a" \n\t"\
1423 #define CALL_MMXEXT_FILTER_CODE \
1424 "movl (%%"REG_b"), %%esi \n\t"\
1426 "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
1427 "add %%"REG_a", %%"REG_D" \n\t"\
1428 "xor %%"REG_a", %%"REG_a" \n\t"\
1432 CALL_MMXEXT_FILTER_CODE
1433 CALL_MMXEXT_FILTER_CODE
1434 CALL_MMXEXT_FILTER_CODE
1435 CALL_MMXEXT_FILTER_CODE
1436 CALL_MMXEXT_FILTER_CODE
1437 CALL_MMXEXT_FILTER_CODE
1438 CALL_MMXEXT_FILTER_CODE
1439 CALL_MMXEXT_FILTER_CODE
/* Epilogue mirroring the first prologue variant: restore REG_b from %5 and write %6 back to -8(%rsp). */
1442 "mov %5, %%"REG_b
" \n\t"
1444 "mov %6, %%"REG_a
" \n\t"
1445 "mov %%"REG_a
", -8(%%rsp) \n\t"
/* Epilogue mirroring the second prologue variant: write %5 back to -8(%rsp). */
1449 "mov %5, %%"REG_a
" \n\t"
1450 "mov %%"REG_a
", -8(%%rsp) \n\t"
/* No outputs; operands 0-4 are memory operands: src, dst, filter, filterPos, mmxextFilterCode. */
1453 ::
"m" (src),
"m" (dst),
"m" (
filter),
"m" (filterPos),
1454 "m" (mmxextFilterCode)
/* Registers declared as clobbered by the asm statement. */
1461 :
"%"REG_a,
"%"REG_c,
"%"REG_d,
"%"REG_S,
"%"REG_D
/*
 * Walking back from the last output sample: while the 16.16 fixed-point
 * source position (i*xInc)>>16 is at or beyond the last source pixel,
 * overwrite dst[i] with that last source pixel multiplied by 128.
 */
1467 for (i=dstWidth-1; (i*xInc)>>16 >=
srcW-1; i--)
1468 dst[i] = src[
srcW-1]*128;
/*
 * Fragments of the MMXEXT chroma horizontal scaler (presumably hcscale_fast;
 * the line carrying the function name is missing from this listing). It
 * processes two planes: src1 -> dst1 and src2 -> dst2.
 * NOTE(review): many original lines are absent here; the embedded listing
 * numbers show the gaps.
 */
1472 int dstWidth,
const uint8_t *src1,
/* Chroma filter position/coefficient tables and the filter-code pointer are read from the context. */
1475 int32_t *filterPos = c->hChrFilterPos;
1476 int16_t *filter = c->hChrFilter;
1477 void *mmxextFilterCode = c->chrMmxextFilterCode;
/*
 * Prologue, two variants (the selecting preprocessor conditionals are not in
 * this listing). First variant: store REG_b into operand %7 and copy the
 * quadword at -8(%rsp) through REG_a into operand %8. Operands %7/%8 lie
 * beyond the operands visible below; presumably scratch save slots --
 * TODO confirm.
 */
1488 "mov %%"REG_b
", %7 \n\t"
1490 "mov -8(%%rsp), %%"REG_a
" \n\t"
1491 "mov %%"REG_a
", %8 \n\t"
/* Second variant: only the quadword at -8(%rsp) is saved, into operand %7. */
1495 "mov -8(%%rsp), %%"REG_a
" \n\t"
1496 "mov %%"REG_a
", %7 \n\t"
/* First plane: zero mm7, load src1 -> REG_c, dst1 -> REG_D, filter -> REG_d, filterPos -> REG_b, clear REG_a. */
1499 "pxor %%mm7, %%mm7 \n\t"
1500 "mov %0, %%"REG_c
" \n\t"
1501 "mov %1, %%"REG_D
" \n\t"
1502 "mov %2, %%"REG_d
" \n\t"
1503 "mov %3, %%"REG_b
" \n\t"
1504 "xor %%"REG_a
", %%"REG_a
" \n\t"
/* Four expansions of the filter-call macro for the first plane. */
1509 CALL_MMXEXT_FILTER_CODE
1510 CALL_MMXEXT_FILTER_CODE
1511 CALL_MMXEXT_FILTER_CODE
1512 CALL_MMXEXT_FILTER_CODE
/* Second plane: clear REG_a, then load src2 (operand %5) -> REG_c and dst2 (operand %6) -> REG_D. */
1513 "xor %%"REG_a
", %%"REG_a
" \n\t"
1514 "mov %5, %%"REG_c
" \n\t"
1515 "mov %6, %%"REG_D
" \n\t"
/* Four expansions of the filter-call macro for the second plane. */
1520 CALL_MMXEXT_FILTER_CODE
1521 CALL_MMXEXT_FILTER_CODE
1522 CALL_MMXEXT_FILTER_CODE
1523 CALL_MMXEXT_FILTER_CODE
/* Epilogue mirroring the first prologue variant: restore REG_b from %7 and write %8 back to -8(%rsp). */
1526 "mov %7, %%"REG_b
" \n\t"
1528 "mov %8, %%"REG_a
" \n\t"
1529 "mov %%"REG_a
", -8(%%rsp) \n\t"
/* Epilogue mirroring the second prologue variant: write %7 back to -8(%rsp). */
1533 "mov %7, %%"REG_a
" \n\t"
1534 "mov %%"REG_a
", -8(%%rsp) \n\t"
/* No outputs; operands 0-6 are memory operands: src1, dst1, filter, filterPos, mmxextFilterCode, src2, dst2. */
1537 ::
"m" (src1),
"m" (dst1),
"m" (
filter),
"m" (filterPos),
1538 "m" (mmxextFilterCode),
"m" (src2),
"m"(dst2)
/* Registers declared as clobbered by the asm statement. */
1545 :
"%"REG_a,
"%"REG_c,
"%"REG_d,
"%"REG_S,
"%"REG_D
/*
 * Walking back from the last output sample: while the 16.16 fixed-point
 * source position (i*xInc)>>16 is at or beyond the last source pixel,
 * overwrite both outputs with their plane's last source pixel times 128.
 */
1551 for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) {
1552 dst1[i] = src1[srcW-1]*128;
1553 dst2[i] = src2[srcW-1]*128;
/*
 * Fragments of an initialisation routine (presumably sws_init_swscale; its
 * header line is not part of this listing). Three separate switches on
 * c->dstFormat are visible; their case labels and the conditions that
 * enclose them are missing from this view.
 */
1567 switch (c->dstFormat) {
1578 switch (c->dstFormat) {
1590 switch (c->dstFormat) {
/* The code under this condition applies only to 8-bit sources with a destination depth of at most 10 bits. */
1617 if (c->srcBpc == 8 && c->dstBpc <= 10) {
1619 #if COMPILE_TEMPLATE_MMXEXT
/* Both fast-scaler hooks are cleared to NULL here; the condition guarding these assignments is not visible in this listing. */
1625 c->hyscale_fast =
NULL;
1626 c->hcscale_fast =
NULL;
1627 #if COMPILE_TEMPLATE_MMXEXT
void(* hcscale_fast)(struct SwsContext *c, int16_t *dst1, int16_t *dst2, int dstWidth, const uint8_t *src1, const uint8_t *src2, int srcW, int xInc)
#define YSCALEYUV2RGB1_ALPHA(index)
#define ALP_MMX_FILTER_OFFSET
static void RENAME() yuv2rgb32_X_ar(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize, const int16_t *chrFilter, const int16_t **chrUSrc, const int16_t **chrVSrc, int chrFilterSize, const int16_t **alpSrc, uint8_t *dest, int dstW, int dstY)
#define YSCALEYUV2PACKED1(index, c)
static void RENAME() yuv2yuyv422_X(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize, const int16_t *chrFilter, const int16_t **chrUSrc, const int16_t **chrVSrc, int chrFilterSize, const int16_t **alpSrc, uint8_t *dest, int dstW, int dstY)
#define DECLARE_ALIGNED(n, t, v)
static av_always_inline int is16BPS(enum AVPixelFormat pix_fmt)
int dstY
Last destination vertical line output from last slice.
#define YSCALEYUV2PACKEDX_END
#define SWS_FULL_CHR_H_INT
#define SWS_FAST_BILINEAR
enum AVPixelFormat dstFormat
Destination pixel format.
#define WRITERGB15(dst, dstw, index)
static void RENAME() yuv2rgb565_2(SwsContext *c, const int16_t *buf[2], const int16_t *ubuf[2], const int16_t *vbuf[2], const int16_t *abuf[2], uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
static void RENAME() yuv2rgb32_X(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize, const int16_t *chrFilter, const int16_t **chrUSrc, const int16_t **chrVSrc, int chrFilterSize, const int16_t **alpSrc, uint8_t *dest, int dstW, int dstY)
static void RENAME() yuv2rgb565_X_ar(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize, const int16_t *chrFilter, const int16_t **chrUSrc, const int16_t **chrVSrc, int chrFilterSize, const int16_t **alpSrc, uint8_t *dest, int dstW, int dstY)
#define YSCALEYUV2PACKEDX
static void RENAME() yuv2rgb32_1(SwsContext *c, const int16_t *buf0, const int16_t *ubuf[2], const int16_t *vbuf[2], const int16_t *abuf0, uint8_t *dest, int dstW, int uvalpha, int y)
YV12 to RGB without scaling or interpolating.
#define CONFIG_SWSCALE_ALPHA
#define WRITEBGR24(dst, dstw, index)
planar YUV 4:2:0, 12bpp, 1 plane for Y and 1 plane for the UV components, which are interleaved (first byte U and the following byte V)
#define YSCALEYUV2RGB1b(index, c)
#define YSCALEYUV2RGB_YA(index, c, b1, b2)
#define YSCALEYUV2PACKED(index, c)
static void RENAME() yuv2rgb565_X(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize, const int16_t *chrFilter, const int16_t **chrUSrc, const int16_t **chrVSrc, int chrFilterSize, const int16_t **alpSrc, uint8_t *dest, int dstW, int dstY)
static void RENAME() yuv2bgr24_X(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize, const int16_t *chrFilter, const int16_t **chrUSrc, const int16_t **chrVSrc, int chrFilterSize, const int16_t **alpSrc, uint8_t *dest, int dstW, int dstY)
#define WRITERGB16(dst, dstw, index)
static void filter(MpegAudioContext *s, int ch, const short *samples, int incr)
as above, but U and V bytes are swapped
#define YSCALEYUV2PACKEDX_ACCURATE
static av_always_inline int is9_OR_10BPS(enum AVPixelFormat pix_fmt)
static void RENAME() yuv2rgb555_1(SwsContext *c, const int16_t *buf0, const int16_t *ubuf[2], const int16_t *vbuf[2], const int16_t *abuf0, uint8_t *dest, int dstW, int uvalpha, int y)
#define YSCALEYUV2PACKEDX_YA(offset, coeff, src1, src2, dst1, dst2)
static void RENAME() yuv2yuyv422_1(SwsContext *c, const int16_t *buf0, const int16_t *ubuf[2], const int16_t *vbuf[2], const int16_t *abuf0, uint8_t *dest, int dstW, int uvalpha, int y)
static av_cold void RENAME() sws_init_swscale(SwsContext *c)
packed RGB 8:8:8, 24bpp, BGRBGR...
int dstW
Width of destination luma/alpha planes.
static void RENAME() yuv2yuyv422_2(SwsContext *c, const int16_t *buf[2], const int16_t *ubuf[2], const int16_t *vbuf[2], const int16_t *abuf[2], uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
static void RENAME() yuv2rgb555_X(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize, const int16_t *chrFilter, const int16_t **chrUSrc, const int16_t **chrVSrc, int chrFilterSize, const int16_t **alpSrc, uint8_t *dest, int dstW, int dstY)
#define YSCALEYUV2RGB(index, c)
packed YUV 4:2:2, 16bpp, Y0 Cb Y1 Cr
#define YSCALEYUV2PACKED1b(index, c)
static void RENAME() yuv2rgb555_X_ar(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize, const int16_t *chrFilter, const int16_t **chrUSrc, const int16_t **chrVSrc, int chrFilterSize, const int16_t **alpSrc, uint8_t *dest, int dstW, int dstY)
static void RENAME() yuv2bgr24_X_ar(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize, const int16_t *chrFilter, const int16_t **chrUSrc, const int16_t **chrVSrc, int chrFilterSize, const int16_t **alpSrc, uint8_t *dest, int dstW, int dstY)
static void RENAME() yuv2yuyv422_X_ar(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize, const int16_t *chrFilter, const int16_t **chrUSrc, const int16_t **chrVSrc, int chrFilterSize, const int16_t **alpSrc, uint8_t *dest, int dstW, int dstY)
static void RENAME() yuv2rgb32_2(SwsContext *c, const int16_t *buf[2], const int16_t *ubuf[2], const int16_t *vbuf[2], const int16_t *abuf[2], uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
vertical bilinear scale YV12 to RGB
#define AV_PIX_FMT_RGB555
void(* hyscale_fast)(struct SwsContext *c, int16_t *dst, int dstWidth, const uint8_t *src, int srcW, int xInc)
Scale one horizontal line of input data using a bilinear filter to produce one line of output data...
static void RENAME() yuv2rgb565_1(SwsContext *c, const int16_t *buf0, const int16_t *ubuf[2], const int16_t *vbuf[2], const int16_t *abuf0, uint8_t *dest, int dstW, int uvalpha, int y)
#define YSCALEYUV2PACKEDX_ACCURATE_YA(offset)
#define AV_PIX_FMT_RGB565
static void RENAME() yuv2bgr24_2(SwsContext *c, const int16_t *buf[2], const int16_t *ubuf[2], const int16_t *vbuf[2], const int16_t *abuf[2], uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
static void RENAME() yuv2rgb555_2(SwsContext *c, const int16_t *buf[2], const int16_t *ubuf[2], const int16_t *vbuf[2], const int16_t *abuf[2], uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
#define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
#define WRITEYUY2(dst, dstw, index)
int srcW
Width of source luma/alpha planes.
AVPixelFormat
Pixel format.
static void RENAME() yuv2bgr24_1(SwsContext *c, const int16_t *buf0, const int16_t *ubuf[2], const int16_t *vbuf[2], const int16_t *abuf0, uint8_t *dest, int dstW, int uvalpha, int y)
#define YSCALEYUV2RGB1(index, c)