/*
 * CPU-variant instruction-name selection (fragment: the #else/#endif
 * lines of these conditionals are elided in this extract, and the
 * leading decimal numbers on each line are extraction artifacts, not
 * part of the original source).
 *
 * PREFETCH: "prefetch" on 3DNow!, "prefetchnta" on MMXEXT, and a
 *           no-op asm comment (" # nop") otherwise.
 * PAVGB:    3DNow! byte average ("pavgusb"); the MMXEXT branch is not
 *           visible here -- presumably "pavgb", TODO confirm.
 * MOVNTQ/SFENCE: non-temporal store and store fence on MMXEXT; SFENCE
 *           falls back to a no-op comment, and the MOVNTQ fallback
 *           line is elided from this view.
 */
37 #if COMPILE_TEMPLATE_AMD3DNOW
38 #define PREFETCH "prefetch"
39 #define PAVGB "pavgusb"
40 #elif COMPILE_TEMPLATE_MMXEXT
41 #define PREFETCH "prefetchnta"
44 #define PREFETCH " # nop"
47 #if COMPILE_TEMPLATE_AMD3DNOW
54 #if COMPILE_TEMPLATE_MMXEXT
55 #define MOVNTQ "movntq"
56 #define SFENCE "sfence"
59 #define SFENCE " # nop"
62 #if !COMPILE_TEMPLATE_SSE2
64 #if !COMPILE_TEMPLATE_AMD3DNOW
73 __asm__
volatile(
PREFETCH" %0"::
"m"(*s):
"memory");
75 __asm__
volatile(
"movq %0, %%mm7"::
"m"(mask32a):
"memory");
79 "movd (%1), %%mm0 \n\t"
80 "punpckldq 3(%1), %%mm0 \n\t"
81 "movd 6(%1), %%mm1 \n\t"
82 "punpckldq 9(%1), %%mm1 \n\t"
83 "movd 12(%1), %%mm2 \n\t"
84 "punpckldq 15(%1), %%mm2 \n\t"
85 "movd 18(%1), %%mm3 \n\t"
86 "punpckldq 21(%1), %%mm3 \n\t"
87 "por %%mm7, %%mm0 \n\t"
88 "por %%mm7, %%mm1 \n\t"
89 "por %%mm7, %%mm2 \n\t"
90 "por %%mm7, %%mm3 \n\t"
93 MOVNTQ" %%mm2, 16(%0) \n\t"
100 __asm__
volatile(
SFENCE:::
"memory");
101 __asm__
volatile(
EMMS:::
"memory");
/*
 * STORE_BGR24_MMX: inline-asm macro body that repacks the pixel data
 * held in mm0..mm7 (four register pairs, each pair masked with the
 * external mask24l/mask24h constants) into 24 contiguous bytes of
 * packed 24-bit pixels, written with three MOVNTQ stores through the
 * pointer in asm operand %0.  Clobbers mm0-mm7.  mask24l / mask24h /
 * mask24hh / mask24hhh / mask24hhhh are constants declared elsewhere
 * in the file; MANGLE() applies platform symbol-name decoration.
 *
 * NOTE(review): the blank "\"-only separator lines of the original
 * macro, and the leading decimal numbers on each line, are artifacts
 * of the extraction -- the macro's instruction sequence itself is
 * complete as shown.
 */
110 #define STORE_BGR24_MMX \
111 "psrlq $8, %%mm2 \n\t" \
112 "psrlq $8, %%mm3 \n\t" \
113 "psrlq $8, %%mm6 \n\t" \
114 "psrlq $8, %%mm7 \n\t" \
115 "pand "MANGLE(mask24l)", %%mm0\n\t" \
116 "pand "MANGLE(mask24l)", %%mm1\n\t" \
117 "pand "MANGLE(mask24l)", %%mm4\n\t" \
118 "pand "MANGLE(mask24l)", %%mm5\n\t" \
119 "pand "MANGLE(mask24h)", %%mm2\n\t" \
120 "pand "MANGLE(mask24h)", %%mm3\n\t" \
121 "pand "MANGLE(mask24h)", %%mm6\n\t" \
122 "pand "MANGLE(mask24h)", %%mm7\n\t" \
123 "por %%mm2, %%mm0 \n\t" \
124 "por %%mm3, %%mm1 \n\t" \
125 "por %%mm6, %%mm4 \n\t" \
126 "por %%mm7, %%mm5 \n\t" \
128 "movq %%mm1, %%mm2 \n\t" \
129 "movq %%mm4, %%mm3 \n\t" \
130 "psllq $48, %%mm2 \n\t" \
131 "psllq $32, %%mm3 \n\t" \
132 "pand "MANGLE(mask24hh)", %%mm2\n\t" \
133 "pand "MANGLE(mask24hhh)", %%mm3\n\t" \
134 "por %%mm2, %%mm0 \n\t" \
135 "psrlq $16, %%mm1 \n\t" \
136 "psrlq $32, %%mm4 \n\t" \
137 "psllq $16, %%mm5 \n\t" \
138 "por %%mm3, %%mm1 \n\t" \
139 "pand "MANGLE(mask24hhhh)", %%mm5\n\t" \
140 "por %%mm5, %%mm4 \n\t" \
142 MOVNTQ" %%mm0, (%0) \n\t" \
143 MOVNTQ" %%mm1, 8(%0) \n\t" \
144 MOVNTQ" %%mm4, 16(%0)"
154 __asm__
volatile(
PREFETCH" %0"::
"m"(*s):
"memory");
159 "movq (%1), %%mm0 \n\t"
160 "movq 8(%1), %%mm1 \n\t"
161 "movq 16(%1), %%mm4 \n\t"
162 "movq 24(%1), %%mm5 \n\t"
163 "movq %%mm0, %%mm2 \n\t"
164 "movq %%mm1, %%mm3 \n\t"
165 "movq %%mm4, %%mm6 \n\t"
166 "movq %%mm5, %%mm7 \n\t"
173 __asm__
volatile(
SFENCE:::
"memory");
174 __asm__
volatile(
EMMS:::
"memory");
196 __asm__
volatile(
PREFETCH" %0"::
"m"(*s));
197 __asm__
volatile(
"movq %0, %%mm4"::
"m"(mask15s));
202 "movq (%1), %%mm0 \n\t"
203 "movq 8(%1), %%mm2 \n\t"
204 "movq %%mm0, %%mm1 \n\t"
205 "movq %%mm2, %%mm3 \n\t"
206 "pand %%mm4, %%mm0 \n\t"
207 "pand %%mm4, %%mm2 \n\t"
208 "paddw %%mm1, %%mm0 \n\t"
209 "paddw %%mm3, %%mm2 \n\t"
217 __asm__
volatile(
SFENCE:::
"memory");
218 __asm__
volatile(
EMMS:::
"memory");
/*
 * Scalar tail loops of what appears to be a 15-bit -> 16-bit RGB
 * conversion (the enclosing while-loop headers and pointer increments
 * are elided from this extract -- TODO confirm against the full file).
 *
 * The trick: for a 0RRRRRGGGGGBBBBB word, adding (x & 0x7FE0) to x
 * shifts the red+green fields (bits 5..14) up by one bit position
 * while leaving the 5 blue bits in place, i.e. widens green from 5 to
 * 6 bits.  The first pair of lines processes two pixels per 32-bit
 * load; the second handles a single remaining 16-bit pixel.
 */
221 register unsigned x= *((
const uint32_t *)s);
222 *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
227 register unsigned short x= *((
const uint16_t *)s);
228 *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
239 __asm__
volatile(
PREFETCH" %0"::
"m"(*s));
240 __asm__
volatile(
"movq %0, %%mm7"::
"m"(mask15rg));
241 __asm__
volatile(
"movq %0, %%mm6"::
"m"(mask15b));
246 "movq (%1), %%mm0 \n\t"
247 "movq 8(%1), %%mm2 \n\t"
248 "movq %%mm0, %%mm1 \n\t"
249 "movq %%mm2, %%mm3 \n\t"
250 "psrlq $1, %%mm0 \n\t"
251 "psrlq $1, %%mm2 \n\t"
252 "pand %%mm7, %%mm0 \n\t"
253 "pand %%mm7, %%mm2 \n\t"
254 "pand %%mm6, %%mm1 \n\t"
255 "pand %%mm6, %%mm3 \n\t"
256 "por %%mm1, %%mm0 \n\t"
257 "por %%mm3, %%mm2 \n\t"
265 __asm__
volatile(
SFENCE:::
"memory");
266 __asm__
volatile(
EMMS:::
"memory");
/*
 * Scalar tail loops of the inverse conversion -- apparently 16-bit ->
 * 15-bit RGB (loop headers and pointer increments elided from this
 * extract; TODO confirm against the full file).
 *
 * ((x>>1) & 0x7FE0) moves the red+green fields down one bit (dropping
 * the least-significant green bit), while (x & 0x001F) preserves the
 * 5 blue bits.  First pair: two pixels per 32-bit word; second pair:
 * one remaining 16-bit pixel.
 */
269 register uint32_t x= *((
const uint32_t*)s);
270 *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
275 register uint16_t x= *((
const uint16_t*)s);
276 *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
285 uint16_t *d = (uint16_t *)dst;
289 "movq %3, %%mm5 \n\t"
290 "movq %4, %%mm6 \n\t"
291 "movq %5, %%mm7 \n\t"
296 "movd (%1), %%mm0 \n\t"
297 "movd 4(%1), %%mm3 \n\t"
298 "punpckldq 8(%1), %%mm0 \n\t"
299 "punpckldq 12(%1), %%mm3 \n\t"
300 "movq %%mm0, %%mm1 \n\t"
301 "movq %%mm3, %%mm4 \n\t"
302 "pand %%mm6, %%mm0 \n\t"
303 "pand %%mm6, %%mm3 \n\t"
304 "pmaddwd %%mm7, %%mm0 \n\t"
305 "pmaddwd %%mm7, %%mm3 \n\t"
306 "pand %%mm5, %%mm1 \n\t"
307 "pand %%mm5, %%mm4 \n\t"
308 "por %%mm1, %%mm0 \n\t"
309 "por %%mm4, %%mm3 \n\t"
310 "psrld $5, %%mm0 \n\t"
311 "pslld $11, %%mm3 \n\t"
312 "por %%mm3, %%mm0 \n\t"
320 :
"r" (mm_end),
"m" (mask3216g),
"m" (mask3216br),
"m" (mul3216)
322 __asm__
volatile(
SFENCE:::
"memory");
323 __asm__
volatile(
EMMS:::
"memory");
325 register int rgb = *(
const uint32_t*)s; s += 4;
326 *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
335 uint16_t *d = (uint16_t *)dst;
337 __asm__
volatile(
PREFETCH" %0"::
"m"(*src):
"memory");
339 "movq %0, %%mm7 \n\t"
340 "movq %1, %%mm6 \n\t"
341 ::
"m"(red_16mask),
"m"(green_16mask));
346 "movd (%1), %%mm0 \n\t"
347 "movd 4(%1), %%mm3 \n\t"
348 "punpckldq 8(%1), %%mm0 \n\t"
349 "punpckldq 12(%1), %%mm3 \n\t"
350 "movq %%mm0, %%mm1 \n\t"
351 "movq %%mm0, %%mm2 \n\t"
352 "movq %%mm3, %%mm4 \n\t"
353 "movq %%mm3, %%mm5 \n\t"
354 "psllq $8, %%mm0 \n\t"
355 "psllq $8, %%mm3 \n\t"
356 "pand %%mm7, %%mm0 \n\t"
357 "pand %%mm7, %%mm3 \n\t"
358 "psrlq $5, %%mm1 \n\t"
359 "psrlq $5, %%mm4 \n\t"
360 "pand %%mm6, %%mm1 \n\t"
361 "pand %%mm6, %%mm4 \n\t"
362 "psrlq $19, %%mm2 \n\t"
363 "psrlq $19, %%mm5 \n\t"
364 "pand %2, %%mm2 \n\t"
365 "pand %2, %%mm5 \n\t"
366 "por %%mm1, %%mm0 \n\t"
367 "por %%mm4, %%mm3 \n\t"
368 "por %%mm2, %%mm0 \n\t"
369 "por %%mm5, %%mm3 \n\t"
370 "psllq $16, %%mm3 \n\t"
371 "por %%mm3, %%mm0 \n\t"
373 ::
"r"(d),
"r"(s),
"m"(blue_16mask):
"memory");
377 __asm__
volatile(
SFENCE:::
"memory");
378 __asm__
volatile(
EMMS:::
"memory");
380 register int rgb = *(
const uint32_t*)s; s += 4;
381 *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
390 uint16_t *d = (uint16_t *)dst;
394 "movq %3, %%mm5 \n\t"
395 "movq %4, %%mm6 \n\t"
396 "movq %5, %%mm7 \n\t"
401 "movd (%1), %%mm0 \n\t"
402 "movd 4(%1), %%mm3 \n\t"
403 "punpckldq 8(%1), %%mm0 \n\t"
404 "punpckldq 12(%1), %%mm3 \n\t"
405 "movq %%mm0, %%mm1 \n\t"
406 "movq %%mm3, %%mm4 \n\t"
407 "pand %%mm6, %%mm0 \n\t"
408 "pand %%mm6, %%mm3 \n\t"
409 "pmaddwd %%mm7, %%mm0 \n\t"
410 "pmaddwd %%mm7, %%mm3 \n\t"
411 "pand %%mm5, %%mm1 \n\t"
412 "pand %%mm5, %%mm4 \n\t"
413 "por %%mm1, %%mm0 \n\t"
414 "por %%mm4, %%mm3 \n\t"
415 "psrld $6, %%mm0 \n\t"
416 "pslld $10, %%mm3 \n\t"
417 "por %%mm3, %%mm0 \n\t"
425 :
"r" (mm_end),
"m" (mask3215g),
"m" (mask3216br),
"m" (mul3215)
427 __asm__
volatile(
SFENCE:::
"memory");
428 __asm__
volatile(
EMMS:::
"memory");
430 register int rgb = *(
const uint32_t*)s; s += 4;
431 *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
440 uint16_t *d = (uint16_t *)dst;
442 __asm__
volatile(
PREFETCH" %0"::
"m"(*src):
"memory");
444 "movq %0, %%mm7 \n\t"
445 "movq %1, %%mm6 \n\t"
446 ::
"m"(red_15mask),
"m"(green_15mask));
451 "movd (%1), %%mm0 \n\t"
452 "movd 4(%1), %%mm3 \n\t"
453 "punpckldq 8(%1), %%mm0 \n\t"
454 "punpckldq 12(%1), %%mm3 \n\t"
455 "movq %%mm0, %%mm1 \n\t"
456 "movq %%mm0, %%mm2 \n\t"
457 "movq %%mm3, %%mm4 \n\t"
458 "movq %%mm3, %%mm5 \n\t"
459 "psllq $7, %%mm0 \n\t"
460 "psllq $7, %%mm3 \n\t"
461 "pand %%mm7, %%mm0 \n\t"
462 "pand %%mm7, %%mm3 \n\t"
463 "psrlq $6, %%mm1 \n\t"
464 "psrlq $6, %%mm4 \n\t"
465 "pand %%mm6, %%mm1 \n\t"
466 "pand %%mm6, %%mm4 \n\t"
467 "psrlq $19, %%mm2 \n\t"
468 "psrlq $19, %%mm5 \n\t"
469 "pand %2, %%mm2 \n\t"
470 "pand %2, %%mm5 \n\t"
471 "por %%mm1, %%mm0 \n\t"
472 "por %%mm4, %%mm3 \n\t"
473 "por %%mm2, %%mm0 \n\t"
474 "por %%mm5, %%mm3 \n\t"
475 "psllq $16, %%mm3 \n\t"
476 "por %%mm3, %%mm0 \n\t"
478 ::
"r"(d),
"r"(s),
"m"(blue_15mask):
"memory");
482 __asm__
volatile(
SFENCE:::
"memory");
483 __asm__
volatile(
EMMS:::
"memory");
485 register int rgb = *(
const uint32_t*)s; s += 4;
486 *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
495 uint16_t *d = (uint16_t *)dst;
497 __asm__
volatile(
PREFETCH" %0"::
"m"(*src):
"memory");
499 "movq %0, %%mm7 \n\t"
500 "movq %1, %%mm6 \n\t"
501 ::
"m"(red_16mask),
"m"(green_16mask));
506 "movd (%1), %%mm0 \n\t"
507 "movd 3(%1), %%mm3 \n\t"
508 "punpckldq 6(%1), %%mm0 \n\t"
509 "punpckldq 9(%1), %%mm3 \n\t"
510 "movq %%mm0, %%mm1 \n\t"
511 "movq %%mm0, %%mm2 \n\t"
512 "movq %%mm3, %%mm4 \n\t"
513 "movq %%mm3, %%mm5 \n\t"
514 "psrlq $3, %%mm0 \n\t"
515 "psrlq $3, %%mm3 \n\t"
516 "pand %2, %%mm0 \n\t"
517 "pand %2, %%mm3 \n\t"
518 "psrlq $5, %%mm1 \n\t"
519 "psrlq $5, %%mm4 \n\t"
520 "pand %%mm6, %%mm1 \n\t"
521 "pand %%mm6, %%mm4 \n\t"
522 "psrlq $8, %%mm2 \n\t"
523 "psrlq $8, %%mm5 \n\t"
524 "pand %%mm7, %%mm2 \n\t"
525 "pand %%mm7, %%mm5 \n\t"
526 "por %%mm1, %%mm0 \n\t"
527 "por %%mm4, %%mm3 \n\t"
528 "por %%mm2, %%mm0 \n\t"
529 "por %%mm5, %%mm3 \n\t"
530 "psllq $16, %%mm3 \n\t"
531 "por %%mm3, %%mm0 \n\t"
533 ::
"r"(d),
"r"(s),
"m"(blue_16mask):
"memory");
537 __asm__
volatile(
SFENCE:::
"memory");
538 __asm__
volatile(
EMMS:::
"memory");
543 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
552 uint16_t *d = (uint16_t *)dst;
554 __asm__
volatile(
PREFETCH" %0"::
"m"(*src):
"memory");
556 "movq %0, %%mm7 \n\t"
557 "movq %1, %%mm6 \n\t"
558 ::
"m"(red_16mask),
"m"(green_16mask));
563 "movd (%1), %%mm0 \n\t"
564 "movd 3(%1), %%mm3 \n\t"
565 "punpckldq 6(%1), %%mm0 \n\t"
566 "punpckldq 9(%1), %%mm3 \n\t"
567 "movq %%mm0, %%mm1 \n\t"
568 "movq %%mm0, %%mm2 \n\t"
569 "movq %%mm3, %%mm4 \n\t"
570 "movq %%mm3, %%mm5 \n\t"
571 "psllq $8, %%mm0 \n\t"
572 "psllq $8, %%mm3 \n\t"
573 "pand %%mm7, %%mm0 \n\t"
574 "pand %%mm7, %%mm3 \n\t"
575 "psrlq $5, %%mm1 \n\t"
576 "psrlq $5, %%mm4 \n\t"
577 "pand %%mm6, %%mm1 \n\t"
578 "pand %%mm6, %%mm4 \n\t"
579 "psrlq $19, %%mm2 \n\t"
580 "psrlq $19, %%mm5 \n\t"
581 "pand %2, %%mm2 \n\t"
582 "pand %2, %%mm5 \n\t"
583 "por %%mm1, %%mm0 \n\t"
584 "por %%mm4, %%mm3 \n\t"
585 "por %%mm2, %%mm0 \n\t"
586 "por %%mm5, %%mm3 \n\t"
587 "psllq $16, %%mm3 \n\t"
588 "por %%mm3, %%mm0 \n\t"
590 ::
"r"(d),
"r"(s),
"m"(blue_16mask):
"memory");
594 __asm__
volatile(
SFENCE:::
"memory");
595 __asm__
volatile(
EMMS:::
"memory");
600 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
609 uint16_t *d = (uint16_t *)dst;
611 __asm__
volatile(
PREFETCH" %0"::
"m"(*src):
"memory");
613 "movq %0, %%mm7 \n\t"
614 "movq %1, %%mm6 \n\t"
615 ::
"m"(red_15mask),
"m"(green_15mask));
620 "movd (%1), %%mm0 \n\t"
621 "movd 3(%1), %%mm3 \n\t"
622 "punpckldq 6(%1), %%mm0 \n\t"
623 "punpckldq 9(%1), %%mm3 \n\t"
624 "movq %%mm0, %%mm1 \n\t"
625 "movq %%mm0, %%mm2 \n\t"
626 "movq %%mm3, %%mm4 \n\t"
627 "movq %%mm3, %%mm5 \n\t"
628 "psrlq $3, %%mm0 \n\t"
629 "psrlq $3, %%mm3 \n\t"
630 "pand %2, %%mm0 \n\t"
631 "pand %2, %%mm3 \n\t"
632 "psrlq $6, %%mm1 \n\t"
633 "psrlq $6, %%mm4 \n\t"
634 "pand %%mm6, %%mm1 \n\t"
635 "pand %%mm6, %%mm4 \n\t"
636 "psrlq $9, %%mm2 \n\t"
637 "psrlq $9, %%mm5 \n\t"
638 "pand %%mm7, %%mm2 \n\t"
639 "pand %%mm7, %%mm5 \n\t"
640 "por %%mm1, %%mm0 \n\t"
641 "por %%mm4, %%mm3 \n\t"
642 "por %%mm2, %%mm0 \n\t"
643 "por %%mm5, %%mm3 \n\t"
644 "psllq $16, %%mm3 \n\t"
645 "por %%mm3, %%mm0 \n\t"
647 ::
"r"(d),
"r"(s),
"m"(blue_15mask):
"memory");
651 __asm__
volatile(
SFENCE:::
"memory");
652 __asm__
volatile(
EMMS:::
"memory");
657 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
666 uint16_t *d = (uint16_t *)dst;
668 __asm__
volatile(
PREFETCH" %0"::
"m"(*src):
"memory");
670 "movq %0, %%mm7 \n\t"
671 "movq %1, %%mm6 \n\t"
672 ::
"m"(red_15mask),
"m"(green_15mask));
677 "movd (%1), %%mm0 \n\t"
678 "movd 3(%1), %%mm3 \n\t"
679 "punpckldq 6(%1), %%mm0 \n\t"
680 "punpckldq 9(%1), %%mm3 \n\t"
681 "movq %%mm0, %%mm1 \n\t"
682 "movq %%mm0, %%mm2 \n\t"
683 "movq %%mm3, %%mm4 \n\t"
684 "movq %%mm3, %%mm5 \n\t"
685 "psllq $7, %%mm0 \n\t"
686 "psllq $7, %%mm3 \n\t"
687 "pand %%mm7, %%mm0 \n\t"
688 "pand %%mm7, %%mm3 \n\t"
689 "psrlq $6, %%mm1 \n\t"
690 "psrlq $6, %%mm4 \n\t"
691 "pand %%mm6, %%mm1 \n\t"
692 "pand %%mm6, %%mm4 \n\t"
693 "psrlq $19, %%mm2 \n\t"
694 "psrlq $19, %%mm5 \n\t"
695 "pand %2, %%mm2 \n\t"
696 "pand %2, %%mm5 \n\t"
697 "por %%mm1, %%mm0 \n\t"
698 "por %%mm4, %%mm3 \n\t"
699 "por %%mm2, %%mm0 \n\t"
700 "por %%mm5, %%mm3 \n\t"
701 "psllq $16, %%mm3 \n\t"
702 "por %%mm3, %%mm0 \n\t"
704 ::
"r"(d),
"r"(s),
"m"(blue_15mask):
"memory");
708 __asm__
volatile(
SFENCE:::
"memory");
709 __asm__
volatile(
EMMS:::
"memory");
714 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
742 const uint16_t *mm_end;
744 const uint16_t *s = (
const uint16_t*)src;
745 end = s + src_size/2;
746 __asm__
volatile(
PREFETCH" %0"::
"m"(*s):
"memory");
751 "movq (%1), %%mm0 \n\t"
752 "movq (%1), %%mm1 \n\t"
753 "movq (%1), %%mm2 \n\t"
754 "pand %2, %%mm0 \n\t"
755 "pand %3, %%mm1 \n\t"
756 "pand %4, %%mm2 \n\t"
757 "psllq $3, %%mm0 \n\t"
758 "psrlq $2, %%mm1 \n\t"
759 "psrlq $7, %%mm2 \n\t"
760 "movq %%mm0, %%mm3 \n\t"
761 "movq %%mm1, %%mm4 \n\t"
762 "movq %%mm2, %%mm5 \n\t"
763 "punpcklwd %5, %%mm0 \n\t"
764 "punpcklwd %5, %%mm1 \n\t"
765 "punpcklwd %5, %%mm2 \n\t"
766 "punpckhwd %5, %%mm3 \n\t"
767 "punpckhwd %5, %%mm4 \n\t"
768 "punpckhwd %5, %%mm5 \n\t"
769 "psllq $8, %%mm1 \n\t"
770 "psllq $16, %%mm2 \n\t"
771 "por %%mm1, %%mm0 \n\t"
772 "por %%mm2, %%mm0 \n\t"
773 "psllq $8, %%mm4 \n\t"
774 "psllq $16, %%mm5 \n\t"
775 "por %%mm4, %%mm3 \n\t"
776 "por %%mm5, %%mm3 \n\t"
778 "movq %%mm0, %%mm6 \n\t"
779 "movq %%mm3, %%mm7 \n\t"
781 "movq 8(%1), %%mm0 \n\t"
782 "movq 8(%1), %%mm1 \n\t"
783 "movq 8(%1), %%mm2 \n\t"
784 "pand %2, %%mm0 \n\t"
785 "pand %3, %%mm1 \n\t"
786 "pand %4, %%mm2 \n\t"
787 "psllq $3, %%mm0 \n\t"
788 "psrlq $2, %%mm1 \n\t"
789 "psrlq $7, %%mm2 \n\t"
790 "movq %%mm0, %%mm3 \n\t"
791 "movq %%mm1, %%mm4 \n\t"
792 "movq %%mm2, %%mm5 \n\t"
793 "punpcklwd %5, %%mm0 \n\t"
794 "punpcklwd %5, %%mm1 \n\t"
795 "punpcklwd %5, %%mm2 \n\t"
796 "punpckhwd %5, %%mm3 \n\t"
797 "punpckhwd %5, %%mm4 \n\t"
798 "punpckhwd %5, %%mm5 \n\t"
799 "psllq $8, %%mm1 \n\t"
800 "psllq $16, %%mm2 \n\t"
801 "por %%mm1, %%mm0 \n\t"
802 "por %%mm2, %%mm0 \n\t"
803 "psllq $8, %%mm4 \n\t"
804 "psllq $16, %%mm5 \n\t"
805 "por %%mm4, %%mm3 \n\t"
806 "por %%mm5, %%mm3 \n\t"
809 :
"r"(s),
"m"(mask15b),
"m"(mask15g),
"m"(mask15r),
"m"(mmx_null)
813 "movq %%mm0, %%mm4 \n\t"
814 "movq %%mm3, %%mm5 \n\t"
815 "movq %%mm6, %%mm0 \n\t"
816 "movq %%mm7, %%mm1 \n\t"
818 "movq %%mm4, %%mm6 \n\t"
819 "movq %%mm5, %%mm7 \n\t"
820 "movq %%mm0, %%mm2 \n\t"
821 "movq %%mm1, %%mm3 \n\t"
830 __asm__
volatile(
SFENCE:::
"memory");
831 __asm__
volatile(
EMMS:::
"memory");
833 register uint16_t bgr;
835 *d++ = (bgr&0x1F)<<3;
836 *d++ = (bgr&0x3E0)>>2;
837 *d++ = (bgr&0x7C00)>>7;
844 const uint16_t *mm_end;
846 const uint16_t *s = (
const uint16_t *)src;
847 end = s + src_size/2;
848 __asm__
volatile(
PREFETCH" %0"::
"m"(*s):
"memory");
853 "movq (%1), %%mm0 \n\t"
854 "movq (%1), %%mm1 \n\t"
855 "movq (%1), %%mm2 \n\t"
856 "pand %2, %%mm0 \n\t"
857 "pand %3, %%mm1 \n\t"
858 "pand %4, %%mm2 \n\t"
859 "psllq $3, %%mm0 \n\t"
860 "psrlq $3, %%mm1 \n\t"
861 "psrlq $8, %%mm2 \n\t"
862 "movq %%mm0, %%mm3 \n\t"
863 "movq %%mm1, %%mm4 \n\t"
864 "movq %%mm2, %%mm5 \n\t"
865 "punpcklwd %5, %%mm0 \n\t"
866 "punpcklwd %5, %%mm1 \n\t"
867 "punpcklwd %5, %%mm2 \n\t"
868 "punpckhwd %5, %%mm3 \n\t"
869 "punpckhwd %5, %%mm4 \n\t"
870 "punpckhwd %5, %%mm5 \n\t"
871 "psllq $8, %%mm1 \n\t"
872 "psllq $16, %%mm2 \n\t"
873 "por %%mm1, %%mm0 \n\t"
874 "por %%mm2, %%mm0 \n\t"
875 "psllq $8, %%mm4 \n\t"
876 "psllq $16, %%mm5 \n\t"
877 "por %%mm4, %%mm3 \n\t"
878 "por %%mm5, %%mm3 \n\t"
880 "movq %%mm0, %%mm6 \n\t"
881 "movq %%mm3, %%mm7 \n\t"
883 "movq 8(%1), %%mm0 \n\t"
884 "movq 8(%1), %%mm1 \n\t"
885 "movq 8(%1), %%mm2 \n\t"
886 "pand %2, %%mm0 \n\t"
887 "pand %3, %%mm1 \n\t"
888 "pand %4, %%mm2 \n\t"
889 "psllq $3, %%mm0 \n\t"
890 "psrlq $3, %%mm1 \n\t"
891 "psrlq $8, %%mm2 \n\t"
892 "movq %%mm0, %%mm3 \n\t"
893 "movq %%mm1, %%mm4 \n\t"
894 "movq %%mm2, %%mm5 \n\t"
895 "punpcklwd %5, %%mm0 \n\t"
896 "punpcklwd %5, %%mm1 \n\t"
897 "punpcklwd %5, %%mm2 \n\t"
898 "punpckhwd %5, %%mm3 \n\t"
899 "punpckhwd %5, %%mm4 \n\t"
900 "punpckhwd %5, %%mm5 \n\t"
901 "psllq $8, %%mm1 \n\t"
902 "psllq $16, %%mm2 \n\t"
903 "por %%mm1, %%mm0 \n\t"
904 "por %%mm2, %%mm0 \n\t"
905 "psllq $8, %%mm4 \n\t"
906 "psllq $16, %%mm5 \n\t"
907 "por %%mm4, %%mm3 \n\t"
908 "por %%mm5, %%mm3 \n\t"
910 :
"r"(s),
"m"(mask16b),
"m"(mask16g),
"m"(mask16r),
"m"(mmx_null)
914 "movq %%mm0, %%mm4 \n\t"
915 "movq %%mm3, %%mm5 \n\t"
916 "movq %%mm6, %%mm0 \n\t"
917 "movq %%mm7, %%mm1 \n\t"
919 "movq %%mm4, %%mm6 \n\t"
920 "movq %%mm5, %%mm7 \n\t"
921 "movq %%mm0, %%mm2 \n\t"
922 "movq %%mm1, %%mm3 \n\t"
931 __asm__
volatile(
SFENCE:::
"memory");
932 __asm__
volatile(
EMMS:::
"memory");
934 register uint16_t bgr;
936 *d++ = (bgr&0x1F)<<3;
937 *d++ = (bgr&0x7E0)>>3;
938 *d++ = (bgr&0xF800)>>8;
950 "packuswb %%mm7, %%mm0 \n\t" \
951 "packuswb %%mm7, %%mm1 \n\t" \
952 "packuswb %%mm7, %%mm2 \n\t" \
953 "punpcklbw %%mm1, %%mm0 \n\t" \
954 "punpcklbw %%mm6, %%mm2 \n\t" \
955 "movq %%mm0, %%mm3 \n\t" \
956 "punpcklwd %%mm2, %%mm0 \n\t" \
957 "punpckhwd %%mm2, %%mm3 \n\t" \
958 MOVNTQ" %%mm0, (%0) \n\t" \
959 MOVNTQ" %%mm3, 8(%0) \n\t" \
964 const uint16_t *mm_end;
966 const uint16_t *s = (
const uint16_t *)src;
967 end = s + src_size/2;
968 __asm__
volatile(
PREFETCH" %0"::
"m"(*s):
"memory");
969 __asm__
volatile(
"pxor %%mm7,%%mm7 \n\t":::
"memory");
970 __asm__
volatile(
"pcmpeqd %%mm6,%%mm6 \n\t":::
"memory");
975 "movq (%1), %%mm0 \n\t"
976 "movq (%1), %%mm1 \n\t"
977 "movq (%1), %%mm2 \n\t"
978 "pand %2, %%mm0 \n\t"
979 "pand %3, %%mm1 \n\t"
980 "pand %4, %%mm2 \n\t"
981 "psllq $3, %%mm0 \n\t"
982 "psrlq $2, %%mm1 \n\t"
983 "psrlq $7, %%mm2 \n\t"
985 ::
"r"(d),
"r"(s),
"m"(mask15b),
"m"(mask15g),
"m"(mask15r)
990 __asm__
volatile(
SFENCE:::
"memory");
991 __asm__
volatile(
EMMS:::
"memory");
993 register uint16_t bgr;
995 *d++ = (bgr&0x1F)<<3;
996 *d++ = (bgr&0x3E0)>>2;
997 *d++ = (bgr&0x7C00)>>7;
1004 const uint16_t *end;
1005 const uint16_t *mm_end;
1007 const uint16_t *s = (
const uint16_t*)src;
1008 end = s + src_size/2;
1009 __asm__
volatile(
PREFETCH" %0"::
"m"(*s):
"memory");
1010 __asm__
volatile(
"pxor %%mm7,%%mm7 \n\t":::
"memory");
1011 __asm__
volatile(
"pcmpeqd %%mm6,%%mm6 \n\t":::
"memory");
1013 while (s < mm_end) {
1016 "movq (%1), %%mm0 \n\t"
1017 "movq (%1), %%mm1 \n\t"
1018 "movq (%1), %%mm2 \n\t"
1019 "pand %2, %%mm0 \n\t"
1020 "pand %3, %%mm1 \n\t"
1021 "pand %4, %%mm2 \n\t"
1022 "psllq $3, %%mm0 \n\t"
1023 "psrlq $3, %%mm1 \n\t"
1024 "psrlq $8, %%mm2 \n\t"
1026 ::
"r"(d),
"r"(s),
"m"(mask16b),
"m"(mask16g),
"m"(mask16r)
1031 __asm__
volatile(
SFENCE:::
"memory");
1032 __asm__
volatile(
EMMS:::
"memory");
1034 register uint16_t bgr;
1036 *d++ = (bgr&0x1F)<<3;
1037 *d++ = (bgr&0x7E0)>>3;
1038 *d++ = (bgr&0xF800)>>8;
1052 "movq %3, %%mm7 \n\t"
1053 "pxor %4, %%mm7 \n\t"
1054 "movq %%mm7, %%mm6 \n\t"
1055 "pxor %5, %%mm7 \n\t"
1059 "movq (%1, %0), %%mm0 \n\t"
1060 "movq 8(%1, %0), %%mm1 \n\t"
1061 # if COMPILE_TEMPLATE_MMXEXT
1062 "pshufw $177, %%mm0, %%mm3 \n\t"
1063 "pshufw $177, %%mm1, %%mm5 \n\t"
1064 "pand %%mm7, %%mm0 \n\t"
1065 "pand %%mm6, %%mm3 \n\t"
1066 "pand %%mm7, %%mm1 \n\t"
1067 "pand %%mm6, %%mm5 \n\t"
1068 "por %%mm3, %%mm0 \n\t"
1069 "por %%mm5, %%mm1 \n\t"
1071 "movq %%mm0, %%mm2 \n\t"
1072 "movq %%mm1, %%mm4 \n\t"
1073 "pand %%mm7, %%mm0 \n\t"
1074 "pand %%mm6, %%mm2 \n\t"
1075 "pand %%mm7, %%mm1 \n\t"
1076 "pand %%mm6, %%mm4 \n\t"
1077 "movq %%mm2, %%mm3 \n\t"
1078 "movq %%mm4, %%mm5 \n\t"
1079 "pslld $16, %%mm2 \n\t"
1080 "psrld $16, %%mm3 \n\t"
1081 "pslld $16, %%mm4 \n\t"
1082 "psrld $16, %%mm5 \n\t"
1083 "por %%mm2, %%mm0 \n\t"
1084 "por %%mm4, %%mm1 \n\t"
1085 "por %%mm3, %%mm0 \n\t"
1086 "por %%mm5, %%mm1 \n\t"
1088 MOVNTQ" %%mm0, (%2, %0) \n\t"
1089 MOVNTQ" %%mm1, 8(%2, %0) \n\t"
1096 :
"r" (s),
"r" (d),
"m" (mask32b),
"m" (mask32r),
"m" (mmx_one)
1098 for (; idx<15; idx+=4) {
1099 register int v = *(
const uint32_t *)&s[idx],
g = v & 0xff00ff00;
1101 *(uint32_t *)&d[idx] = (v>>16) +
g + (v<<16);
1108 x86_reg mmx_size= 23 - src_size;
1110 "test %%"REG_a
", %%"REG_a
" \n\t"
1112 "movq "MANGLE(mask24r)
", %%mm5 \n\t"
1113 "movq "MANGLE(mask24g)
", %%mm6 \n\t"
1114 "movq "MANGLE(mask24b)
", %%mm7 \n\t"
1118 "movq (%1, %%"REG_a
"), %%mm0 \n\t"
1119 "movq (%1, %%"REG_a
"), %%mm1 \n\t"
1120 "movq 2(%1, %%"REG_a
"), %%mm2 \n\t"
1121 "psllq $16, %%mm0 \n\t"
1122 "pand %%mm5, %%mm0 \n\t"
1123 "pand %%mm6, %%mm1 \n\t"
1124 "pand %%mm7, %%mm2 \n\t"
1125 "por %%mm0, %%mm1 \n\t"
1126 "por %%mm2, %%mm1 \n\t"
1127 "movq 6(%1, %%"REG_a
"), %%mm0 \n\t"
1128 MOVNTQ" %%mm1, (%2, %%"REG_a
") \n\t"
1129 "movq 8(%1, %%"REG_a
"), %%mm1 \n\t"
1130 "movq 10(%1, %%"REG_a
"), %%mm2 \n\t"
1131 "pand %%mm7, %%mm0 \n\t"
1132 "pand %%mm5, %%mm1 \n\t"
1133 "pand %%mm6, %%mm2 \n\t"
1134 "por %%mm0, %%mm1 \n\t"
1135 "por %%mm2, %%mm1 \n\t"
1136 "movq 14(%1, %%"REG_a
"), %%mm0 \n\t"
1137 MOVNTQ" %%mm1, 8(%2, %%"REG_a
") \n\t"
1138 "movq 16(%1, %%"REG_a
"), %%mm1 \n\t"
1139 "movq 18(%1, %%"REG_a
"), %%mm2 \n\t"
1140 "pand %%mm6, %%mm0 \n\t"
1141 "pand %%mm7, %%mm1 \n\t"
1142 "pand %%mm5, %%mm2 \n\t"
1143 "por %%mm0, %%mm1 \n\t"
1144 "por %%mm2, %%mm1 \n\t"
1145 MOVNTQ" %%mm1, 16(%2, %%"REG_a
") \n\t"
1146 "add $24, %%"REG_a
" \n\t"
1150 :
"r" (src-mmx_size),
"r"(dst-mmx_size)
1153 __asm__
volatile(
SFENCE:::
"memory");
1154 __asm__
volatile(
EMMS:::
"memory");
1156 if (mmx_size==23)
return;
1160 src_size= 23-mmx_size;
1163 for (i=0; i<src_size; i+=3) {
1166 dst[i + 1] = src[i + 1];
1167 dst[i + 2] = src[i + 0];
1174 int lumStride,
int chromStride,
int dstStride,
int vertLumPerChroma)
1178 for (y=0; y<
height; y++) {
1181 "xor %%"REG_a
", %%"REG_a
" \n\t"
1184 PREFETCH" 32(%1, %%"REG_a
", 2) \n\t"
1187 "movq (%2, %%"REG_a
"), %%mm0 \n\t"
1188 "movq %%mm0, %%mm2 \n\t"
1189 "movq (%3, %%"REG_a
"), %%mm1 \n\t"
1190 "punpcklbw %%mm1, %%mm0 \n\t"
1191 "punpckhbw %%mm1, %%mm2 \n\t"
1193 "movq (%1, %%"REG_a
",2), %%mm3 \n\t"
1194 "movq 8(%1, %%"REG_a
",2), %%mm5 \n\t"
1195 "movq %%mm3, %%mm4 \n\t"
1196 "movq %%mm5, %%mm6 \n\t"
1197 "punpcklbw %%mm0, %%mm3 \n\t"
1198 "punpckhbw %%mm0, %%mm4 \n\t"
1199 "punpcklbw %%mm2, %%mm5 \n\t"
1200 "punpckhbw %%mm2, %%mm6 \n\t"
1202 MOVNTQ" %%mm3, (%0, %%"REG_a
", 4) \n\t"
1203 MOVNTQ" %%mm4, 8(%0, %%"REG_a
", 4) \n\t"
1204 MOVNTQ" %%mm5, 16(%0, %%"REG_a
", 4) \n\t"
1205 MOVNTQ" %%mm6, 24(%0, %%"REG_a
", 4) \n\t"
1207 "add $8, %%"REG_a
" \n\t"
1208 "cmp %4, %%"REG_a
" \n\t"
1210 ::
"r"(dst),
"r"(ysrc),
"r"(usrc),
"r"(vsrc),
"g" (chromWidth)
1213 if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
1214 usrc += chromStride;
1215 vsrc += chromStride;
1231 int lumStride,
int chromStride,
int dstStride)
1239 int lumStride,
int chromStride,
int dstStride,
int vertLumPerChroma)
1243 for (y=0; y<
height; y++) {
1246 "xor %%"REG_a
", %%"REG_a
" \n\t"
1249 PREFETCH" 32(%1, %%"REG_a
", 2) \n\t"
1252 "movq (%2, %%"REG_a
"), %%mm0 \n\t"
1253 "movq %%mm0, %%mm2 \n\t"
1254 "movq (%3, %%"REG_a
"), %%mm1 \n\t"
1255 "punpcklbw %%mm1, %%mm0 \n\t"
1256 "punpckhbw %%mm1, %%mm2 \n\t"
1258 "movq (%1, %%"REG_a
",2), %%mm3 \n\t"
1259 "movq 8(%1, %%"REG_a
",2), %%mm5 \n\t"
1260 "movq %%mm0, %%mm4 \n\t"
1261 "movq %%mm2, %%mm6 \n\t"
1262 "punpcklbw %%mm3, %%mm0 \n\t"
1263 "punpckhbw %%mm3, %%mm4 \n\t"
1264 "punpcklbw %%mm5, %%mm2 \n\t"
1265 "punpckhbw %%mm5, %%mm6 \n\t"
1267 MOVNTQ" %%mm0, (%0, %%"REG_a
", 4) \n\t"
1268 MOVNTQ" %%mm4, 8(%0, %%"REG_a
", 4) \n\t"
1269 MOVNTQ" %%mm2, 16(%0, %%"REG_a
", 4) \n\t"
1270 MOVNTQ" %%mm6, 24(%0, %%"REG_a
", 4) \n\t"
1272 "add $8, %%"REG_a
" \n\t"
1273 "cmp %4, %%"REG_a
" \n\t"
1275 ::
"r"(dst),
"r"(ysrc),
"r"(usrc),
"r"(vsrc),
"g" (chromWidth)
1278 if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
1279 usrc += chromStride;
1280 vsrc += chromStride;
1296 int lumStride,
int chromStride,
int dstStride)
1307 int lumStride,
int chromStride,
int dstStride)
1317 int lumStride,
int chromStride,
int dstStride)
1328 int lumStride,
int chromStride,
int srcStride)
1332 for (y=0; y<
height; y+=2) {
1334 "xor %%"REG_a
", %%"REG_a
" \n\t"
1335 "pcmpeqw %%mm7, %%mm7 \n\t"
1336 "psrlw $8, %%mm7 \n\t"
1339 PREFETCH" 64(%0, %%"REG_a
", 4) \n\t"
1340 "movq (%0, %%"REG_a
", 4), %%mm0 \n\t"
1341 "movq 8(%0, %%"REG_a
", 4), %%mm1 \n\t"
1342 "movq %%mm0, %%mm2 \n\t"
1343 "movq %%mm1, %%mm3 \n\t"
1344 "psrlw $8, %%mm0 \n\t"
1345 "psrlw $8, %%mm1 \n\t"
1346 "pand %%mm7, %%mm2 \n\t"
1347 "pand %%mm7, %%mm3 \n\t"
1348 "packuswb %%mm1, %%mm0 \n\t"
1349 "packuswb %%mm3, %%mm2 \n\t"
1351 MOVNTQ" %%mm2, (%1, %%"REG_a
", 2) \n\t"
1353 "movq 16(%0, %%"REG_a
", 4), %%mm1 \n\t"
1354 "movq 24(%0, %%"REG_a
", 4), %%mm2 \n\t"
1355 "movq %%mm1, %%mm3 \n\t"
1356 "movq %%mm2, %%mm4 \n\t"
1357 "psrlw $8, %%mm1 \n\t"
1358 "psrlw $8, %%mm2 \n\t"
1359 "pand %%mm7, %%mm3 \n\t"
1360 "pand %%mm7, %%mm4 \n\t"
1361 "packuswb %%mm2, %%mm1 \n\t"
1362 "packuswb %%mm4, %%mm3 \n\t"
1364 MOVNTQ" %%mm3, 8(%1, %%"REG_a
", 2) \n\t"
1366 "movq %%mm0, %%mm2 \n\t"
1367 "movq %%mm1, %%mm3 \n\t"
1368 "psrlw $8, %%mm0 \n\t"
1369 "psrlw $8, %%mm1 \n\t"
1370 "pand %%mm7, %%mm2 \n\t"
1371 "pand %%mm7, %%mm3 \n\t"
1372 "packuswb %%mm1, %%mm0 \n\t"
1373 "packuswb %%mm3, %%mm2 \n\t"
1375 MOVNTQ" %%mm0, (%3, %%"REG_a
") \n\t"
1376 MOVNTQ" %%mm2, (%2, %%"REG_a
") \n\t"
1378 "add $8, %%"REG_a
" \n\t"
1379 "cmp %4, %%"REG_a
" \n\t"
1381 ::
"r"(src),
"r"(ydst),
"r"(udst),
"r"(vdst),
"g" (chromWidth)
1382 :
"memory",
"%"REG_a
1389 "xor %%"REG_a
", %%"REG_a
" \n\t"
1392 PREFETCH" 64(%0, %%"REG_a
", 4) \n\t"
1393 "movq (%0, %%"REG_a
", 4), %%mm0 \n\t"
1394 "movq 8(%0, %%"REG_a
", 4), %%mm1 \n\t"
1395 "movq 16(%0, %%"REG_a
", 4), %%mm2 \n\t"
1396 "movq 24(%0, %%"REG_a
", 4), %%mm3 \n\t"
1397 "pand %%mm7, %%mm0 \n\t"
1398 "pand %%mm7, %%mm1 \n\t"
1399 "pand %%mm7, %%mm2 \n\t"
1400 "pand %%mm7, %%mm3 \n\t"
1401 "packuswb %%mm1, %%mm0 \n\t"
1402 "packuswb %%mm3, %%mm2 \n\t"
1404 MOVNTQ" %%mm0, (%1, %%"REG_a
", 2) \n\t"
1405 MOVNTQ" %%mm2, 8(%1, %%"REG_a
", 2) \n\t"
1407 "add $8, %%"REG_a
" \n\t"
1408 "cmp %4, %%"REG_a
" \n\t"
1411 ::
"r"(src),
"r"(ydst),
"r"(udst),
"r"(vdst),
"g" (chromWidth)
1412 :
"memory",
"%"REG_a
1414 udst += chromStride;
1415 vdst += chromStride;
1419 __asm__
volatile(
EMMS" \n\t"
1425 #if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
1433 for (x=0; x<srcWidth-1; x++) {
1434 dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1435 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1437 dst[2*srcWidth-1]= src[srcWidth-1];
1441 for (y=1; y<srcHeight; y++) {
1442 const x86_reg mmxSize= srcWidth&~15;
1444 "mov %4, %%"REG_a
" \n\t"
1445 "movq "MANGLE(mmx_ff)
", %%mm0 \n\t"
1446 "movq (%0, %%"REG_a
"), %%mm4 \n\t"
1447 "movq %%mm4, %%mm2 \n\t"
1448 "psllq $8, %%mm4 \n\t"
1449 "pand %%mm0, %%mm2 \n\t"
1450 "por %%mm2, %%mm4 \n\t"
1451 "movq (%1, %%"REG_a
"), %%mm5 \n\t"
1452 "movq %%mm5, %%mm3 \n\t"
1453 "psllq $8, %%mm5 \n\t"
1454 "pand %%mm0, %%mm3 \n\t"
1455 "por %%mm3, %%mm5 \n\t"
1457 "movq (%0, %%"REG_a
"), %%mm0 \n\t"
1458 "movq (%1, %%"REG_a
"), %%mm1 \n\t"
1459 "movq 1(%0, %%"REG_a
"), %%mm2 \n\t"
1460 "movq 1(%1, %%"REG_a
"), %%mm3 \n\t"
1461 PAVGB
" %%mm0, %%mm5 \n\t"
1462 PAVGB
" %%mm0, %%mm3 \n\t"
1463 PAVGB
" %%mm0, %%mm5 \n\t"
1464 PAVGB
" %%mm0, %%mm3 \n\t"
1465 PAVGB
" %%mm1, %%mm4 \n\t"
1466 PAVGB
" %%mm1, %%mm2 \n\t"
1467 PAVGB
" %%mm1, %%mm4 \n\t"
1468 PAVGB
" %%mm1, %%mm2 \n\t"
1469 "movq %%mm5, %%mm7 \n\t"
1470 "movq %%mm4, %%mm6 \n\t"
1471 "punpcklbw %%mm3, %%mm5 \n\t"
1472 "punpckhbw %%mm3, %%mm7 \n\t"
1473 "punpcklbw %%mm2, %%mm4 \n\t"
1474 "punpckhbw %%mm2, %%mm6 \n\t"
1475 MOVNTQ" %%mm5, (%2, %%"REG_a
", 2) \n\t"
1476 MOVNTQ" %%mm7, 8(%2, %%"REG_a
", 2) \n\t"
1477 MOVNTQ" %%mm4, (%3, %%"REG_a
", 2) \n\t"
1478 MOVNTQ" %%mm6, 8(%3, %%"REG_a
", 2) \n\t"
1479 "add $8, %%"REG_a
" \n\t"
1480 "movq -1(%0, %%"REG_a
"), %%mm4 \n\t"
1481 "movq -1(%1, %%"REG_a
"), %%mm5 \n\t"
1483 ::
"r" (src + mmxSize ),
"r" (src + srcStride + mmxSize ),
1484 "r" (dst + mmxSize*2),
"r" (dst + dstStride + mmxSize*2),
1489 for (x=mmxSize-1; x<srcWidth-1; x++) {
1490 dst[2*x +1]= (3*src[x+0] + src[x+srcStride+1])>>2;
1491 dst[2*x+dstStride+2]= ( src[x+0] + 3*src[x+srcStride+1])>>2;
1492 dst[2*x+dstStride+1]= ( src[x+1] + 3*src[x+srcStride ])>>2;
1493 dst[2*x +2]= (3*src[x+1] + src[x+srcStride ])>>2;
1495 dst[srcWidth*2 -1 ]= (3*src[srcWidth-1] + src[srcWidth-1 + srcStride])>>2;
1496 dst[srcWidth*2 -1 + dstStride]= ( src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
1505 for (x=0; x<srcWidth-1; x++) {
1506 dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1507 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1509 dst[2*srcWidth-1]= src[srcWidth-1];
1511 __asm__
volatile(
EMMS" \n\t"
1517 #if !COMPILE_TEMPLATE_AMD3DNOW
1526 int lumStride,
int chromStride,
int srcStride)
1529 const x86_reg chromWidth= width>>1;
1530 for (y=0; y<
height; y+=2) {
1532 "xor %%"REG_a
", %%"REG_a
" \n\t"
1533 "pcmpeqw %%mm7, %%mm7 \n\t"
1534 "psrlw $8, %%mm7 \n\t"
1537 PREFETCH" 64(%0, %%"REG_a
", 4) \n\t"
1538 "movq (%0, %%"REG_a
", 4), %%mm0 \n\t"
1539 "movq 8(%0, %%"REG_a
", 4), %%mm1 \n\t"
1540 "movq %%mm0, %%mm2 \n\t"
1541 "movq %%mm1, %%mm3 \n\t"
1542 "pand %%mm7, %%mm0 \n\t"
1543 "pand %%mm7, %%mm1 \n\t"
1544 "psrlw $8, %%mm2 \n\t"
1545 "psrlw $8, %%mm3 \n\t"
1546 "packuswb %%mm1, %%mm0 \n\t"
1547 "packuswb %%mm3, %%mm2 \n\t"
1549 MOVNTQ" %%mm2, (%1, %%"REG_a
", 2) \n\t"
1551 "movq 16(%0, %%"REG_a
", 4), %%mm1 \n\t"
1552 "movq 24(%0, %%"REG_a
", 4), %%mm2 \n\t"
1553 "movq %%mm1, %%mm3 \n\t"
1554 "movq %%mm2, %%mm4 \n\t"
1555 "pand %%mm7, %%mm1 \n\t"
1556 "pand %%mm7, %%mm2 \n\t"
1557 "psrlw $8, %%mm3 \n\t"
1558 "psrlw $8, %%mm4 \n\t"
1559 "packuswb %%mm2, %%mm1 \n\t"
1560 "packuswb %%mm4, %%mm3 \n\t"
1562 MOVNTQ" %%mm3, 8(%1, %%"REG_a
", 2) \n\t"
1564 "movq %%mm0, %%mm2 \n\t"
1565 "movq %%mm1, %%mm3 \n\t"
1566 "psrlw $8, %%mm0 \n\t"
1567 "psrlw $8, %%mm1 \n\t"
1568 "pand %%mm7, %%mm2 \n\t"
1569 "pand %%mm7, %%mm3 \n\t"
1570 "packuswb %%mm1, %%mm0 \n\t"
1571 "packuswb %%mm3, %%mm2 \n\t"
1573 MOVNTQ" %%mm0, (%3, %%"REG_a
") \n\t"
1574 MOVNTQ" %%mm2, (%2, %%"REG_a
") \n\t"
1576 "add $8, %%"REG_a
" \n\t"
1577 "cmp %4, %%"REG_a
" \n\t"
1579 ::
"r"(src),
"r"(ydst),
"r"(udst),
"r"(vdst),
"g" (chromWidth)
1580 :
"memory",
"%"REG_a
1587 "xor %%"REG_a
", %%"REG_a
" \n\t"
1590 PREFETCH" 64(%0, %%"REG_a
", 4) \n\t"
1591 "movq (%0, %%"REG_a
", 4), %%mm0 \n\t"
1592 "movq 8(%0, %%"REG_a
", 4), %%mm1 \n\t"
1593 "movq 16(%0, %%"REG_a
", 4), %%mm2 \n\t"
1594 "movq 24(%0, %%"REG_a
", 4), %%mm3 \n\t"
1595 "psrlw $8, %%mm0 \n\t"
1596 "psrlw $8, %%mm1 \n\t"
1597 "psrlw $8, %%mm2 \n\t"
1598 "psrlw $8, %%mm3 \n\t"
1599 "packuswb %%mm1, %%mm0 \n\t"
1600 "packuswb %%mm3, %%mm2 \n\t"
1602 MOVNTQ" %%mm0, (%1, %%"REG_a
", 2) \n\t"
1603 MOVNTQ" %%mm2, 8(%1, %%"REG_a
", 2) \n\t"
1605 "add $8, %%"REG_a
" \n\t"
1606 "cmp %4, %%"REG_a
" \n\t"
1609 ::
"r"(src),
"r"(ydst),
"r"(udst),
"r"(vdst),
"g" (chromWidth)
1610 :
"memory",
"%"REG_a
1612 udst += chromStride;
1613 vdst += chromStride;
1617 __asm__
volatile(
EMMS" \n\t"
1632 int lumStride,
int chromStride,
int srcStride)
1635 const x86_reg chromWidth= width>>1;
1636 for (y=0; y<height-2; y+=2) {
1638 for (i=0; i<2; i++) {
1640 "mov %2, %%"REG_a
" \n\t"
1641 "movq "MANGLE(ff_bgr2YCoeff)
", %%mm6 \n\t"
1642 "movq "MANGLE(ff_w1111)
", %%mm5 \n\t"
1643 "pxor %%mm7, %%mm7 \n\t"
1644 "lea (%%"REG_a
", %%"REG_a
", 2), %%"REG_d
" \n\t"
1648 "movd (%0, %%"REG_d
"), %%mm0 \n\t"
1649 "movd 3(%0, %%"REG_d
"), %%mm1 \n\t"
1650 "punpcklbw %%mm7, %%mm0 \n\t"
1651 "punpcklbw %%mm7, %%mm1 \n\t"
1652 "movd 6(%0, %%"REG_d
"), %%mm2 \n\t"
1653 "movd 9(%0, %%"REG_d
"), %%mm3 \n\t"
1654 "punpcklbw %%mm7, %%mm2 \n\t"
1655 "punpcklbw %%mm7, %%mm3 \n\t"
1656 "pmaddwd %%mm6, %%mm0 \n\t"
1657 "pmaddwd %%mm6, %%mm1 \n\t"
1658 "pmaddwd %%mm6, %%mm2 \n\t"
1659 "pmaddwd %%mm6, %%mm3 \n\t"
1660 #ifndef FAST_BGR2YV12
1661 "psrad $8, %%mm0 \n\t"
1662 "psrad $8, %%mm1 \n\t"
1663 "psrad $8, %%mm2 \n\t"
1664 "psrad $8, %%mm3 \n\t"
1666 "packssdw %%mm1, %%mm0 \n\t"
1667 "packssdw %%mm3, %%mm2 \n\t"
1668 "pmaddwd %%mm5, %%mm0 \n\t"
1669 "pmaddwd %%mm5, %%mm2 \n\t"
1670 "packssdw %%mm2, %%mm0 \n\t"
1671 "psraw $7, %%mm0 \n\t"
1673 "movd 12(%0, %%"REG_d
"), %%mm4 \n\t"
1674 "movd 15(%0, %%"REG_d
"), %%mm1 \n\t"
1675 "punpcklbw %%mm7, %%mm4 \n\t"
1676 "punpcklbw %%mm7, %%mm1 \n\t"
1677 "movd 18(%0, %%"REG_d
"), %%mm2 \n\t"
1678 "movd 21(%0, %%"REG_d
"), %%mm3 \n\t"
1679 "punpcklbw %%mm7, %%mm2 \n\t"
1680 "punpcklbw %%mm7, %%mm3 \n\t"
1681 "pmaddwd %%mm6, %%mm4 \n\t"
1682 "pmaddwd %%mm6, %%mm1 \n\t"
1683 "pmaddwd %%mm6, %%mm2 \n\t"
1684 "pmaddwd %%mm6, %%mm3 \n\t"
1685 #ifndef FAST_BGR2YV12
1686 "psrad $8, %%mm4 \n\t"
1687 "psrad $8, %%mm1 \n\t"
1688 "psrad $8, %%mm2 \n\t"
1689 "psrad $8, %%mm3 \n\t"
1691 "packssdw %%mm1, %%mm4 \n\t"
1692 "packssdw %%mm3, %%mm2 \n\t"
1693 "pmaddwd %%mm5, %%mm4 \n\t"
1694 "pmaddwd %%mm5, %%mm2 \n\t"
1695 "add $24, %%"REG_d
" \n\t"
1696 "packssdw %%mm2, %%mm4 \n\t"
1697 "psraw $7, %%mm4 \n\t"
1699 "packuswb %%mm4, %%mm0 \n\t"
1700 "paddusb "MANGLE(ff_bgr2YOffset)
", %%mm0 \n\t"
1702 MOVNTQ" %%mm0, (%1, %%"REG_a
") \n\t"
1703 "add $8, %%"REG_a
" \n\t"
1705 : :
"r" (src+width*3),
"r" (ydst+width),
"g" ((
x86_reg)-width)
1706 :
"%"REG_a,
"%"REG_d
1713 "mov %4, %%"REG_a
" \n\t"
1714 "movq "MANGLE(ff_w1111)
", %%mm5 \n\t"
1715 "movq "MANGLE(ff_bgr2UCoeff)
", %%mm6 \n\t"
1716 "pxor %%mm7, %%mm7 \n\t"
1717 "lea (%%"REG_a
", %%"REG_a
", 2), %%"REG_d
" \n\t"
1718 "add %%"REG_d
", %%"REG_d
" \n\t"
1723 #if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
1724 "movq (%0, %%"REG_d
"), %%mm0 \n\t"
1725 "movq (%1, %%"REG_d
"), %%mm1 \n\t"
1726 "movq 6(%0, %%"REG_d
"), %%mm2 \n\t"
1727 "movq 6(%1, %%"REG_d
"), %%mm3 \n\t"
1728 PAVGB
" %%mm1, %%mm0 \n\t"
1729 PAVGB
" %%mm3, %%mm2 \n\t"
1730 "movq %%mm0, %%mm1 \n\t"
1731 "movq %%mm2, %%mm3 \n\t"
1732 "psrlq $24, %%mm0 \n\t"
1733 "psrlq $24, %%mm2 \n\t"
1734 PAVGB
" %%mm1, %%mm0 \n\t"
1735 PAVGB
" %%mm3, %%mm2 \n\t"
1736 "punpcklbw %%mm7, %%mm0 \n\t"
1737 "punpcklbw %%mm7, %%mm2 \n\t"
1739 "movd (%0, %%"REG_d
"), %%mm0 \n\t"
1740 "movd (%1, %%"REG_d
"), %%mm1 \n\t"
1741 "movd 3(%0, %%"REG_d
"), %%mm2 \n\t"
1742 "movd 3(%1, %%"REG_d
"), %%mm3 \n\t"
1743 "punpcklbw %%mm7, %%mm0 \n\t"
1744 "punpcklbw %%mm7, %%mm1 \n\t"
1745 "punpcklbw %%mm7, %%mm2 \n\t"
1746 "punpcklbw %%mm7, %%mm3 \n\t"
1747 "paddw %%mm1, %%mm0 \n\t"
1748 "paddw %%mm3, %%mm2 \n\t"
1749 "paddw %%mm2, %%mm0 \n\t"
1750 "movd 6(%0, %%"REG_d
"), %%mm4 \n\t"
1751 "movd 6(%1, %%"REG_d
"), %%mm1 \n\t"
1752 "movd 9(%0, %%"REG_d
"), %%mm2 \n\t"
1753 "movd 9(%1, %%"REG_d
"), %%mm3 \n\t"
1754 "punpcklbw %%mm7, %%mm4 \n\t"
1755 "punpcklbw %%mm7, %%mm1 \n\t"
1756 "punpcklbw %%mm7, %%mm2 \n\t"
1757 "punpcklbw %%mm7, %%mm3 \n\t"
1758 "paddw %%mm1, %%mm4 \n\t"
1759 "paddw %%mm3, %%mm2 \n\t"
1760 "paddw %%mm4, %%mm2 \n\t"
1761 "psrlw $2, %%mm0 \n\t"
1762 "psrlw $2, %%mm2 \n\t"
1764 "movq "MANGLE(ff_bgr2VCoeff)
", %%mm1 \n\t"
1765 "movq "MANGLE(ff_bgr2VCoeff)
", %%mm3 \n\t"
1767 "pmaddwd %%mm0, %%mm1 \n\t"
1768 "pmaddwd %%mm2, %%mm3 \n\t"
1769 "pmaddwd %%mm6, %%mm0 \n\t"
1770 "pmaddwd %%mm6, %%mm2 \n\t"
1771 #ifndef FAST_BGR2YV12
1772 "psrad $8, %%mm0 \n\t"
1773 "psrad $8, %%mm1 \n\t"
1774 "psrad $8, %%mm2 \n\t"
1775 "psrad $8, %%mm3 \n\t"
1777 "packssdw %%mm2, %%mm0 \n\t"
1778 "packssdw %%mm3, %%mm1 \n\t"
1779 "pmaddwd %%mm5, %%mm0 \n\t"
1780 "pmaddwd %%mm5, %%mm1 \n\t"
1781 "packssdw %%mm1, %%mm0 \n\t"
1782 "psraw $7, %%mm0 \n\t"
1784 #if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
1785 "movq 12(%0, %%"REG_d
"), %%mm4 \n\t"
1786 "movq 12(%1, %%"REG_d
"), %%mm1 \n\t"
1787 "movq 18(%0, %%"REG_d
"), %%mm2 \n\t"
1788 "movq 18(%1, %%"REG_d
"), %%mm3 \n\t"
1789 PAVGB
" %%mm1, %%mm4 \n\t"
1790 PAVGB
" %%mm3, %%mm2 \n\t"
1791 "movq %%mm4, %%mm1 \n\t"
1792 "movq %%mm2, %%mm3 \n\t"
1793 "psrlq $24, %%mm4 \n\t"
1794 "psrlq $24, %%mm2 \n\t"
1795 PAVGB
" %%mm1, %%mm4 \n\t"
1796 PAVGB
" %%mm3, %%mm2 \n\t"
1797 "punpcklbw %%mm7, %%mm4 \n\t"
1798 "punpcklbw %%mm7, %%mm2 \n\t"
1800 "movd 12(%0, %%"REG_d
"), %%mm4 \n\t"
1801 "movd 12(%1, %%"REG_d
"), %%mm1 \n\t"
1802 "movd 15(%0, %%"REG_d
"), %%mm2 \n\t"
1803 "movd 15(%1, %%"REG_d
"), %%mm3 \n\t"
1804 "punpcklbw %%mm7, %%mm4 \n\t"
1805 "punpcklbw %%mm7, %%mm1 \n\t"
1806 "punpcklbw %%mm7, %%mm2 \n\t"
1807 "punpcklbw %%mm7, %%mm3 \n\t"
1808 "paddw %%mm1, %%mm4 \n\t"
1809 "paddw %%mm3, %%mm2 \n\t"
1810 "paddw %%mm2, %%mm4 \n\t"
1811 "movd 18(%0, %%"REG_d
"), %%mm5 \n\t"
1812 "movd 18(%1, %%"REG_d
"), %%mm1 \n\t"
1813 "movd 21(%0, %%"REG_d
"), %%mm2 \n\t"
1814 "movd 21(%1, %%"REG_d
"), %%mm3 \n\t"
1815 "punpcklbw %%mm7, %%mm5 \n\t"
1816 "punpcklbw %%mm7, %%mm1 \n\t"
1817 "punpcklbw %%mm7, %%mm2 \n\t"
1818 "punpcklbw %%mm7, %%mm3 \n\t"
1819 "paddw %%mm1, %%mm5 \n\t"
1820 "paddw %%mm3, %%mm2 \n\t"
1821 "paddw %%mm5, %%mm2 \n\t"
1822 "movq "MANGLE(ff_w1111)
", %%mm5 \n\t"
1823 "psrlw $2, %%mm4 \n\t"
1824 "psrlw $2, %%mm2 \n\t"
1826 "movq "MANGLE(ff_bgr2VCoeff)
", %%mm1 \n\t"
1827 "movq "MANGLE(ff_bgr2VCoeff)
", %%mm3 \n\t"
1829 "pmaddwd %%mm4, %%mm1 \n\t"
1830 "pmaddwd %%mm2, %%mm3 \n\t"
1831 "pmaddwd %%mm6, %%mm4 \n\t"
1832 "pmaddwd %%mm6, %%mm2 \n\t"
1833 #ifndef FAST_BGR2YV12
1834 "psrad $8, %%mm4 \n\t"
1835 "psrad $8, %%mm1 \n\t"
1836 "psrad $8, %%mm2 \n\t"
1837 "psrad $8, %%mm3 \n\t"
1839 "packssdw %%mm2, %%mm4 \n\t"
1840 "packssdw %%mm3, %%mm1 \n\t"
1841 "pmaddwd %%mm5, %%mm4 \n\t"
1842 "pmaddwd %%mm5, %%mm1 \n\t"
1843 "add $24, %%"REG_d
" \n\t"
1844 "packssdw %%mm1, %%mm4 \n\t"
1845 "psraw $7, %%mm4 \n\t"
1847 "movq %%mm0, %%mm1 \n\t"
1848 "punpckldq %%mm4, %%mm0 \n\t"
1849 "punpckhdq %%mm4, %%mm1 \n\t"
1850 "packsswb %%mm1, %%mm0 \n\t"
1851 "paddb "MANGLE(ff_bgr2UVOffset)
", %%mm0 \n\t"
1852 "movd %%mm0, (%2, %%"REG_a
") \n\t"
1853 "punpckhdq %%mm0, %%mm0 \n\t"
1854 "movd %%mm0, (%3, %%"REG_a
") \n\t"
1855 "add $4, %%"REG_a
" \n\t"
1857 : :
"r" (src+chromWidth*6),
"r" (src+srcStride+chromWidth*6),
"r" (udst+chromWidth),
"r" (vdst+chromWidth),
"g" (-chromWidth)
1858 :
"%"REG_a,
"%"REG_d
1861 udst += chromStride;
1862 vdst += chromStride;
1866 __asm__
volatile(
EMMS" \n\t"
1870 rgb24toyv12_c(src, ydst, udst, vdst, width, height-y, lumStride, chromStride, srcStride);
1874 #if !COMPILE_TEMPLATE_AMD3DNOW && !COMPILE_TEMPLATE_AVX
1877 int src2Stride,
int dstStride)
1881 for (h=0; h <
height; h++) {
1884 #if COMPILE_TEMPLATE_SSE2
1886 "xor %%"REG_a
", %%"REG_a
" \n\t"
1890 "movdqa (%1, %%"REG_a
"), %%xmm0 \n\t"
1891 "movdqa (%1, %%"REG_a
"), %%xmm1 \n\t"
1892 "movdqa (%2, %%"REG_a
"), %%xmm2 \n\t"
1893 "punpcklbw %%xmm2, %%xmm0 \n\t"
1894 "punpckhbw %%xmm2, %%xmm1 \n\t"
1895 "movntdq %%xmm0, (%0, %%"REG_a
", 2) \n\t"
1896 "movntdq %%xmm1, 16(%0, %%"REG_a
", 2) \n\t"
1897 "add $16, %%"REG_a
" \n\t"
1898 "cmp %3, %%"REG_a
" \n\t"
1900 ::
"r"(dest),
"r"(src1),
"r"(src2),
"r" ((
x86_reg)width-15)
1901 :
"memory",
"%"REG_a
""
1905 "xor %%"REG_a
", %%"REG_a
" \n\t"
1909 "movq (%1, %%"REG_a
"), %%mm0 \n\t"
1910 "movq 8(%1, %%"REG_a
"), %%mm2 \n\t"
1911 "movq %%mm0, %%mm1 \n\t"
1912 "movq %%mm2, %%mm3 \n\t"
1913 "movq (%2, %%"REG_a
"), %%mm4 \n\t"
1914 "movq 8(%2, %%"REG_a
"), %%mm5 \n\t"
1915 "punpcklbw %%mm4, %%mm0 \n\t"
1916 "punpckhbw %%mm4, %%mm1 \n\t"
1917 "punpcklbw %%mm5, %%mm2 \n\t"
1918 "punpckhbw %%mm5, %%mm3 \n\t"
1919 MOVNTQ" %%mm0, (%0, %%"REG_a
", 2) \n\t"
1920 MOVNTQ" %%mm1, 8(%0, %%"REG_a
", 2) \n\t"
1921 MOVNTQ" %%mm2, 16(%0, %%"REG_a
", 2) \n\t"
1922 MOVNTQ" %%mm3, 24(%0, %%"REG_a
", 2) \n\t"
1923 "add $16, %%"REG_a
" \n\t"
1924 "cmp %3, %%"REG_a
" \n\t"
1926 ::
"r"(dest),
"r"(src1),
"r"(src2),
"r" ((
x86_reg)width-15)
1927 :
"memory",
"%"REG_a
1930 for (w= (width&(~15)); w <
width; w++) {
1931 dest[2*w+0] = src1[w];
1932 dest[2*w+1] = src2[w];
1946 #if !COMPILE_TEMPLATE_AMD3DNOW && (ARCH_X86_32 || COMPILE_TEMPLATE_SSE2) && COMPILE_TEMPLATE_MMXEXT == COMPILE_TEMPLATE_SSE2 && HAVE_YASM
1952 int dst1Stride,
int dst2Stride)
1956 for (h = 0; h <
height; h++) {
1970 #if !COMPILE_TEMPLATE_SSE2
1971 #if !COMPILE_TEMPLATE_AMD3DNOW
1975 int srcStride1,
int srcStride2,
1976 int dstStride1,
int dstStride2)
1980 w=width/2; h=height/2;
1984 ::
"m"(*(src1+srcStride1)),
"m"(*(src2+srcStride2)):
"memory");
1986 const uint8_t* s1=src1+srcStride1*(y>>1);
1989 for (;x<w-31;x+=32) {
1992 "movq (%1,%2), %%mm0 \n\t"
1993 "movq 8(%1,%2), %%mm2 \n\t"
1994 "movq 16(%1,%2), %%mm4 \n\t"
1995 "movq 24(%1,%2), %%mm6 \n\t"
1996 "movq %%mm0, %%mm1 \n\t"
1997 "movq %%mm2, %%mm3 \n\t"
1998 "movq %%mm4, %%mm5 \n\t"
1999 "movq %%mm6, %%mm7 \n\t"
2000 "punpcklbw %%mm0, %%mm0 \n\t"
2001 "punpckhbw %%mm1, %%mm1 \n\t"
2002 "punpcklbw %%mm2, %%mm2 \n\t"
2003 "punpckhbw %%mm3, %%mm3 \n\t"
2004 "punpcklbw %%mm4, %%mm4 \n\t"
2005 "punpckhbw %%mm5, %%mm5 \n\t"
2006 "punpcklbw %%mm6, %%mm6 \n\t"
2007 "punpckhbw %%mm7, %%mm7 \n\t"
2008 MOVNTQ" %%mm0, (%0,%2,2) \n\t"
2009 MOVNTQ" %%mm1, 8(%0,%2,2) \n\t"
2010 MOVNTQ" %%mm2, 16(%0,%2,2) \n\t"
2011 MOVNTQ" %%mm3, 24(%0,%2,2) \n\t"
2012 MOVNTQ" %%mm4, 32(%0,%2,2) \n\t"
2013 MOVNTQ" %%mm5, 40(%0,%2,2) \n\t"
2014 MOVNTQ" %%mm6, 48(%0,%2,2) \n\t"
2015 MOVNTQ" %%mm7, 56(%0,%2,2)"
2016 ::
"r"(d),
"r"(s1),
"r"(x)
2019 for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
2022 const uint8_t* s2=src2+srcStride2*(y>>1);
2025 for (;x<w-31;x+=32) {
2028 "movq (%1,%2), %%mm0 \n\t"
2029 "movq 8(%1,%2), %%mm2 \n\t"
2030 "movq 16(%1,%2), %%mm4 \n\t"
2031 "movq 24(%1,%2), %%mm6 \n\t"
2032 "movq %%mm0, %%mm1 \n\t"
2033 "movq %%mm2, %%mm3 \n\t"
2034 "movq %%mm4, %%mm5 \n\t"
2035 "movq %%mm6, %%mm7 \n\t"
2036 "punpcklbw %%mm0, %%mm0 \n\t"
2037 "punpckhbw %%mm1, %%mm1 \n\t"
2038 "punpcklbw %%mm2, %%mm2 \n\t"
2039 "punpckhbw %%mm3, %%mm3 \n\t"
2040 "punpcklbw %%mm4, %%mm4 \n\t"
2041 "punpckhbw %%mm5, %%mm5 \n\t"
2042 "punpcklbw %%mm6, %%mm6 \n\t"
2043 "punpckhbw %%mm7, %%mm7 \n\t"
2044 MOVNTQ" %%mm0, (%0,%2,2) \n\t"
2045 MOVNTQ" %%mm1, 8(%0,%2,2) \n\t"
2046 MOVNTQ" %%mm2, 16(%0,%2,2) \n\t"
2047 MOVNTQ" %%mm3, 24(%0,%2,2) \n\t"
2048 MOVNTQ" %%mm4, 32(%0,%2,2) \n\t"
2049 MOVNTQ" %%mm5, 40(%0,%2,2) \n\t"
2050 MOVNTQ" %%mm6, 48(%0,%2,2) \n\t"
2051 MOVNTQ" %%mm7, 56(%0,%2,2)"
2052 ::
"r"(d),
"r"(s2),
"r"(x)
2055 for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
2067 int srcStride1,
int srcStride2,
2068 int srcStride3,
int dstStride)
2074 const uint8_t* yp=src1+srcStride1*y;
2075 const uint8_t* up=src2+srcStride2*(y>>2);
2076 const uint8_t* vp=src3+srcStride3*(y>>2);
2084 "movq (%1, %0, 4), %%mm0 \n\t"
2085 "movq (%2, %0), %%mm1 \n\t"
2086 "movq (%3, %0), %%mm2 \n\t"
2087 "movq %%mm0, %%mm3 \n\t"
2088 "movq %%mm1, %%mm4 \n\t"
2089 "movq %%mm2, %%mm5 \n\t"
2090 "punpcklbw %%mm1, %%mm1 \n\t"
2091 "punpcklbw %%mm2, %%mm2 \n\t"
2092 "punpckhbw %%mm4, %%mm4 \n\t"
2093 "punpckhbw %%mm5, %%mm5 \n\t"
2095 "movq %%mm1, %%mm6 \n\t"
2096 "punpcklbw %%mm2, %%mm1 \n\t"
2097 "punpcklbw %%mm1, %%mm0 \n\t"
2098 "punpckhbw %%mm1, %%mm3 \n\t"
2099 MOVNTQ" %%mm0, (%4, %0, 8) \n\t"
2100 MOVNTQ" %%mm3, 8(%4, %0, 8) \n\t"
2102 "punpckhbw %%mm2, %%mm6 \n\t"
2103 "movq 8(%1, %0, 4), %%mm0 \n\t"
2104 "movq %%mm0, %%mm3 \n\t"
2105 "punpcklbw %%mm6, %%mm0 \n\t"
2106 "punpckhbw %%mm6, %%mm3 \n\t"
2107 MOVNTQ" %%mm0, 16(%4, %0, 8) \n\t"
2108 MOVNTQ" %%mm3, 24(%4, %0, 8) \n\t"
2110 "movq %%mm4, %%mm6 \n\t"
2111 "movq 16(%1, %0, 4), %%mm0 \n\t"
2112 "movq %%mm0, %%mm3 \n\t"
2113 "punpcklbw %%mm5, %%mm4 \n\t"
2114 "punpcklbw %%mm4, %%mm0 \n\t"
2115 "punpckhbw %%mm4, %%mm3 \n\t"
2116 MOVNTQ" %%mm0, 32(%4, %0, 8) \n\t"
2117 MOVNTQ" %%mm3, 40(%4, %0, 8) \n\t"
2119 "punpckhbw %%mm5, %%mm6 \n\t"
2120 "movq 24(%1, %0, 4), %%mm0 \n\t"
2121 "movq %%mm0, %%mm3 \n\t"
2122 "punpcklbw %%mm6, %%mm0 \n\t"
2123 "punpckhbw %%mm6, %%mm3 \n\t"
2124 MOVNTQ" %%mm0, 48(%4, %0, 8) \n\t"
2125 MOVNTQ" %%mm3, 56(%4, %0, 8) \n\t"
2128 :
"r"(yp),
"r" (up),
"r"(vp),
"r"(d)
2132 const int x2 = x<<2;
2135 d[8*x+2] = yp[x2+1];
2137 d[8*x+4] = yp[x2+2];
2139 d[8*x+6] = yp[x2+3];
2160 "pcmpeqw %%mm7, %%mm7 \n\t"
2161 "psrlw $8, %%mm7 \n\t"
2163 "movq -30(%1, %0, 2), %%mm0 \n\t"
2164 "movq -22(%1, %0, 2), %%mm1 \n\t"
2165 "movq -14(%1, %0, 2), %%mm2 \n\t"
2166 "movq -6(%1, %0, 2), %%mm3 \n\t"
2167 "pand %%mm7, %%mm0 \n\t"
2168 "pand %%mm7, %%mm1 \n\t"
2169 "pand %%mm7, %%mm2 \n\t"
2170 "pand %%mm7, %%mm3 \n\t"
2171 "packuswb %%mm1, %%mm0 \n\t"
2172 "packuswb %%mm3, %%mm2 \n\t"
2173 MOVNTQ" %%mm0,-15(%2, %0) \n\t"
2174 MOVNTQ" %%mm2,- 7(%2, %0) \n\t"
2178 :
"r"(src),
"r"(dst)
2183 dst[count]= src[2*count];
2188 #if !COMPILE_TEMPLATE_AMD3DNOW
2198 "pcmpeqw %%mm7, %%mm7 \n\t"
2199 "psrlw $8, %%mm7 \n\t"
2201 "movq -28(%1, %0, 4), %%mm0 \n\t"
2202 "movq -20(%1, %0, 4), %%mm1 \n\t"
2203 "movq -12(%1, %0, 4), %%mm2 \n\t"
2204 "movq -4(%1, %0, 4), %%mm3 \n\t"
2205 "pand %%mm7, %%mm0 \n\t"
2206 "pand %%mm7, %%mm1 \n\t"
2207 "pand %%mm7, %%mm2 \n\t"
2208 "pand %%mm7, %%mm3 \n\t"
2209 "packuswb %%mm1, %%mm0 \n\t"
2210 "packuswb %%mm3, %%mm2 \n\t"
2211 "movq %%mm0, %%mm1 \n\t"
2212 "movq %%mm2, %%mm3 \n\t"
2213 "psrlw $8, %%mm0 \n\t"
2214 "psrlw $8, %%mm2 \n\t"
2215 "pand %%mm7, %%mm1 \n\t"
2216 "pand %%mm7, %%mm3 \n\t"
2217 "packuswb %%mm2, %%mm0 \n\t"
2218 "packuswb %%mm3, %%mm1 \n\t"
2219 MOVNTQ" %%mm0,- 7(%3, %0) \n\t"
2220 MOVNTQ" %%mm1,- 7(%2, %0) \n\t"
2224 :
"r"(src),
"r"(dst0),
"r"(dst1)
2229 dst0[count]= src[4*count+0];
2230 dst1[count]= src[4*count+2];
2247 "pcmpeqw %%mm7, %%mm7 \n\t"
2248 "psrlw $8, %%mm7 \n\t"
2250 "movq -28(%1, %0, 4), %%mm0 \n\t"
2251 "movq -20(%1, %0, 4), %%mm1 \n\t"
2252 "movq -12(%1, %0, 4), %%mm2 \n\t"
2253 "movq -4(%1, %0, 4), %%mm3 \n\t"
2254 PAVGB
" -28(%2, %0, 4), %%mm0 \n\t"
2255 PAVGB
" -20(%2, %0, 4), %%mm1 \n\t"
2256 PAVGB
" -12(%2, %0, 4), %%mm2 \n\t"
2257 PAVGB
" - 4(%2, %0, 4), %%mm3 \n\t"
2258 "pand %%mm7, %%mm0 \n\t"
2259 "pand %%mm7, %%mm1 \n\t"
2260 "pand %%mm7, %%mm2 \n\t"
2261 "pand %%mm7, %%mm3 \n\t"
2262 "packuswb %%mm1, %%mm0 \n\t"
2263 "packuswb %%mm3, %%mm2 \n\t"
2264 "movq %%mm0, %%mm1 \n\t"
2265 "movq %%mm2, %%mm3 \n\t"
2266 "psrlw $8, %%mm0 \n\t"
2267 "psrlw $8, %%mm2 \n\t"
2268 "pand %%mm7, %%mm1 \n\t"
2269 "pand %%mm7, %%mm3 \n\t"
2270 "packuswb %%mm2, %%mm0 \n\t"
2271 "packuswb %%mm3, %%mm1 \n\t"
2272 MOVNTQ" %%mm0,- 7(%4, %0) \n\t"
2273 MOVNTQ" %%mm1,- 7(%3, %0) \n\t"
2277 :
"r"(src0),
"r"(src1),
"r"(dst0),
"r"(dst1)
2283 dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
2284 dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
2289 #if !COMPILE_TEMPLATE_AMD3DNOW
2299 "pcmpeqw %%mm7, %%mm7 \n\t"
2300 "psrlw $8, %%mm7 \n\t"
2302 "movq -28(%1, %0, 4), %%mm0 \n\t"
2303 "movq -20(%1, %0, 4), %%mm1 \n\t"
2304 "movq -12(%1, %0, 4), %%mm2 \n\t"
2305 "movq -4(%1, %0, 4), %%mm3 \n\t"
2306 "psrlw $8, %%mm0 \n\t"
2307 "psrlw $8, %%mm1 \n\t"
2308 "psrlw $8, %%mm2 \n\t"
2309 "psrlw $8, %%mm3 \n\t"
2310 "packuswb %%mm1, %%mm0 \n\t"
2311 "packuswb %%mm3, %%mm2 \n\t"
2312 "movq %%mm0, %%mm1 \n\t"
2313 "movq %%mm2, %%mm3 \n\t"
2314 "psrlw $8, %%mm0 \n\t"
2315 "psrlw $8, %%mm2 \n\t"
2316 "pand %%mm7, %%mm1 \n\t"
2317 "pand %%mm7, %%mm3 \n\t"
2318 "packuswb %%mm2, %%mm0 \n\t"
2319 "packuswb %%mm3, %%mm1 \n\t"
2320 MOVNTQ" %%mm0,- 7(%3, %0) \n\t"
2321 MOVNTQ" %%mm1,- 7(%2, %0) \n\t"
2325 :
"r"(src),
"r"(dst0),
"r"(dst1)
2331 dst0[count]= src[4*count+0];
2332 dst1[count]= src[4*count+2];
2349 "pcmpeqw %%mm7, %%mm7 \n\t"
2350 "psrlw $8, %%mm7 \n\t"
2352 "movq -28(%1, %0, 4), %%mm0 \n\t"
2353 "movq -20(%1, %0, 4), %%mm1 \n\t"
2354 "movq -12(%1, %0, 4), %%mm2 \n\t"
2355 "movq -4(%1, %0, 4), %%mm3 \n\t"
2356 PAVGB
" -28(%2, %0, 4), %%mm0 \n\t"
2357 PAVGB
" -20(%2, %0, 4), %%mm1 \n\t"
2358 PAVGB
" -12(%2, %0, 4), %%mm2 \n\t"
2359 PAVGB
" - 4(%2, %0, 4), %%mm3 \n\t"
2360 "psrlw $8, %%mm0 \n\t"
2361 "psrlw $8, %%mm1 \n\t"
2362 "psrlw $8, %%mm2 \n\t"
2363 "psrlw $8, %%mm3 \n\t"
2364 "packuswb %%mm1, %%mm0 \n\t"
2365 "packuswb %%mm3, %%mm2 \n\t"
2366 "movq %%mm0, %%mm1 \n\t"
2367 "movq %%mm2, %%mm3 \n\t"
2368 "psrlw $8, %%mm0 \n\t"
2369 "psrlw $8, %%mm2 \n\t"
2370 "pand %%mm7, %%mm1 \n\t"
2371 "pand %%mm7, %%mm3 \n\t"
2372 "packuswb %%mm2, %%mm0 \n\t"
2373 "packuswb %%mm3, %%mm1 \n\t"
2374 MOVNTQ" %%mm0,- 7(%4, %0) \n\t"
2375 MOVNTQ" %%mm1,- 7(%3, %0) \n\t"
2379 :
"r"(src0),
"r"(src1),
"r"(dst0),
"r"(dst1)
2387 dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
2388 dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
2395 int lumStride,
int chromStride,
int srcStride)
2398 const int chromWidth= -((-
width)>>1);
2400 for (y=0; y<
height; y++) {
2418 #if !COMPILE_TEMPLATE_AMD3DNOW
2421 int lumStride,
int chromStride,
int srcStride)
2424 const int chromWidth= -((-
width)>>1);
2426 for (y=0; y<
height; y++) {
2445 int lumStride,
int chromStride,
int srcStride)
2448 const int chromWidth= -((-
width)>>1);
2450 for (y=0; y<
height; y++) {
2468 #if !COMPILE_TEMPLATE_AMD3DNOW
2471 int lumStride,
int chromStride,
int srcStride)
2474 const int chromWidth= -((-
width)>>1);
2476 for (y=0; y<
height; y++) {
2496 #if !COMPILE_TEMPLATE_SSE2
2497 #if !COMPILE_TEMPLATE_AMD3DNOW
2527 #if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
2536 #if !COMPILE_TEMPLATE_AMD3DNOW && !COMPILE_TEMPLATE_AVX
2539 #if !COMPILE_TEMPLATE_AMD3DNOW && (ARCH_X86_32 || COMPILE_TEMPLATE_SSE2) && COMPILE_TEMPLATE_MMXEXT == COMPILE_TEMPLATE_SSE2 && HAVE_YASM
static void RENAME() rgb32tobgr15(const uint8_t *src, uint8_t *dst, int src_size)
static void RENAME() yuyvtoyuv422(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src, int width, int height, int lumStride, int chromStride, int srcStride)
static void RENAME() vu9_to_vu12(const uint8_t *src1, const uint8_t *src2, uint8_t *dst1, uint8_t *dst2, int width, int height, int srcStride1, int srcStride2, int dstStride1, int dstStride2)
static void RENAME() uyvytoyuv422(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src, int width, int height, int lumStride, int chromStride, int srcStride)
static void RENAME() rgb16tobgr24(const uint8_t *src, uint8_t *dst, int src_size)
static void RENAME() rgb24toyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, int width, int height, int lumStride, int chromStride, int srcStride)
Height should be a multiple of 2 and width should be a multiple of 2.
void(* planar2x)(const uint8_t *src, uint8_t *dst, int width, int height, int srcStride, int dstStride)
static void RENAME() rgb32tobgr16(const uint8_t *src, uint8_t *dst, int src_size)
static void RENAME() rgb24tobgr24(const uint8_t *src, uint8_t *dst, int src_size)
static void RENAME() yuv422ptoyuy2(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, int width, int height, int lumStride, int chromStride, int dstStride)
Width should be a multiple of 16.
static void RENAME() extract_even2(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
static void RENAME() rgb15to32(const uint8_t *src, uint8_t *dst, int src_size)
static void RENAME() uyvytoyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, int width, int height, int lumStride, int chromStride, int srcStride)
Height should be a multiple of 2 and width should be a multiple of 16.
Macro definitions for various function/variable attributes.
static void RENAME() yuy2toyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, int width, int height, int lumStride, int chromStride, int srcStride)
Height should be a multiple of 2 and width should be a multiple of 16.
void rgb24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, int width, int height, int lumStride, int chromStride, int srcStride)
Height should be a multiple of 2 and width should be a multiple of 2.
static void RENAME() rgb24tobgr15(const uint8_t *src, uint8_t *dst, int src_size)
static void RENAME() extract_odd2avg(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
static void RENAME() rgb24tobgr16(const uint8_t *src, uint8_t *dst, int src_size)
static void RENAME() uyvytoyuv420(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src, int width, int height, int lumStride, int chromStride, int srcStride)
static void RENAME() shuffle_bytes_2103(const uint8_t *src, uint8_t *dst, int src_size)
static void RENAME() yv12touyvy(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, int width, int height, int lumStride, int chromStride, int dstStride)
Height should be a multiple of 2 and width should be a multiple of 16 (If this is a problem for anyon...
static av_cold void RENAME() rgb2rgb_init(void)
static void RENAME() yuvPlanartouyvy(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, int width, int height, int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
static void RENAME() rgb24to15(const uint8_t *src, uint8_t *dst, int src_size)
static void RENAME() rgb15to16(const uint8_t *src, uint8_t *dst, int src_size)
static void RENAME() rgb16to15(const uint8_t *src, uint8_t *dst, int src_size)
static void RENAME() rgb32to16(const uint8_t *src, uint8_t *dst, int src_size)
static void RENAME() extract_even2avg(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
static void RENAME() yuv422ptouyvy(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, int width, int height, int lumStride, int chromStride, int dstStride)
Width should be a multiple of 16.
static void RENAME() yuvPlanartoyuy2(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, int width, int height, int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
static void RENAME() rgb16to32(const uint8_t *src, uint8_t *dst, int src_size)
static void RENAME() rgb15tobgr24(const uint8_t *src, uint8_t *dst, int src_size)
static void(WINAPI *cond_broadcast)(pthread_cond_t *cond)
static void RENAME() extract_even(const uint8_t *src, uint8_t *dst, x86_reg count)
static void RENAME() rgb32tobgr24(const uint8_t *src, uint8_t *dst, int src_size)
static void RENAME() yuyvtoyuv420(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src, int width, int height, int lumStride, int chromStride, int srcStride)
static void RENAME() yvu9_to_yuy2(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, uint8_t *dst, int width, int height, int srcStride1, int srcStride2, int srcStride3, int dstStride)
static void RENAME() extract_odd2(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
static void RENAME() rgb24tobgr32(const uint8_t *src, uint8_t *dst, int src_size)
static void RENAME() yv12toyuy2(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, int width, int height, int lumStride, int chromStride, int dstStride)
Height should be a multiple of 2 and width should be a multiple of 16.
void(* deinterleaveBytes)(const uint8_t *src, uint8_t *dst1, uint8_t *dst2, int width, int height, int srcStride, int dst1Stride, int dst2Stride)
static void RENAME() rgb24to16(const uint8_t *src, uint8_t *dst, int src_size)
static void RENAME() rgb32to15(const uint8_t *src, uint8_t *dst, int src_size)
static void RENAME() interleaveBytes(const uint8_t *src1, const uint8_t *src2, uint8_t *dest, int width, int height, int src1Stride, int src2Stride, int dstStride)