39 #if COMPILE_TEMPLATE_AMD3DNOW
40 #define PREFETCH "prefetch"
41 #define PAVGB "pavgusb"
42 #elif COMPILE_TEMPLATE_MMXEXT
43 #define PREFETCH "prefetchnta"
46 #define PREFETCH " # nop"
49 #if COMPILE_TEMPLATE_AMD3DNOW
56 #if COMPILE_TEMPLATE_MMXEXT
57 #define MOVNTQ "movntq"
58 #define SFENCE "sfence"
61 #define SFENCE " # nop"
64 #if !COMPILE_TEMPLATE_SSE2
66 #if !COMPILE_TEMPLATE_AMD3DNOW
75 __asm__
volatile(
PREFETCH" %0"::
"m"(*s):
"memory");
77 __asm__
volatile(
"movq %0, %%mm7"::
"m"(mask32a):
"memory");
81 "movd (%1), %%mm0 \n\t"
82 "punpckldq 3(%1), %%mm0 \n\t"
83 "movd 6(%1), %%mm1 \n\t"
84 "punpckldq 9(%1), %%mm1 \n\t"
85 "movd 12(%1), %%mm2 \n\t"
86 "punpckldq 15(%1), %%mm2 \n\t"
87 "movd 18(%1), %%mm3 \n\t"
88 "punpckldq 21(%1), %%mm3 \n\t"
89 "por %%mm7, %%mm0 \n\t"
90 "por %%mm7, %%mm1 \n\t"
91 "por %%mm7, %%mm2 \n\t"
92 "por %%mm7, %%mm3 \n\t"
95 MOVNTQ" %%mm2, 16(%0) \n\t"
102 __asm__
volatile(
SFENCE:::
"memory");
103 __asm__
volatile(
EMMS:::
"memory");
/*
 * Repack four quadwords of 32-bit RGB pixel data held in %%mm0..%%mm7
 * into 24 consecutive bytes of packed 24-bit RGB, stored at (%0) with
 * three MOVNTQ writes.  Expects mask24l/mask24h/mask24hh/mask24hhh/
 * mask24hhhh constants (declared elsewhere in this file) and the MOVNTQ
 * macro selected by the compile template above.
 * NOTE(review): fixed a mangled paste — stray line-number tokens inside
 * the continued macro body made it non-compilable.
 */
#define STORE_BGR24_MMX \
    "psrlq         $8, %%mm2    \n\t" \
    "psrlq         $8, %%mm3    \n\t" \
    "psrlq         $8, %%mm6    \n\t" \
    "psrlq         $8, %%mm7    \n\t" \
    "pand "MANGLE(mask24l)", %%mm0\n\t" \
    "pand "MANGLE(mask24l)", %%mm1\n\t" \
    "pand "MANGLE(mask24l)", %%mm4\n\t" \
    "pand "MANGLE(mask24l)", %%mm5\n\t" \
    "pand "MANGLE(mask24h)", %%mm2\n\t" \
    "pand "MANGLE(mask24h)", %%mm3\n\t" \
    "pand "MANGLE(mask24h)", %%mm6\n\t" \
    "pand "MANGLE(mask24h)", %%mm7\n\t" \
    "por        %%mm2, %%mm0    \n\t" \
    "por        %%mm3, %%mm1    \n\t" \
    "por        %%mm6, %%mm4    \n\t" \
    "por        %%mm7, %%mm5    \n\t" \
    \
    "movq       %%mm1, %%mm2    \n\t" \
    "movq       %%mm4, %%mm3    \n\t" \
    "psllq        $48, %%mm2    \n\t" \
    "psllq        $32, %%mm3    \n\t" \
    "pand "MANGLE(mask24hh)", %%mm2\n\t" \
    "pand "MANGLE(mask24hhh)", %%mm3\n\t" \
    "por        %%mm2, %%mm0    \n\t" \
    "psrlq        $16, %%mm1    \n\t" \
    "psrlq        $32, %%mm4    \n\t" \
    "psllq        $16, %%mm5    \n\t" \
    "por        %%mm3, %%mm1    \n\t" \
    "pand "MANGLE(mask24hhhh)", %%mm5\n\t" \
    "por        %%mm5, %%mm4    \n\t" \
    \
    MOVNTQ"     %%mm0,   (%0)   \n\t" \
    MOVNTQ"     %%mm1,  8(%0)   \n\t" \
    MOVNTQ"     %%mm4, 16(%0)"
156 __asm__
volatile(
PREFETCH" %0"::
"m"(*s):
"memory");
161 "movq (%1), %%mm0 \n\t"
162 "movq 8(%1), %%mm1 \n\t"
163 "movq 16(%1), %%mm4 \n\t"
164 "movq 24(%1), %%mm5 \n\t"
165 "movq %%mm0, %%mm2 \n\t"
166 "movq %%mm1, %%mm3 \n\t"
167 "movq %%mm4, %%mm6 \n\t"
168 "movq %%mm5, %%mm7 \n\t"
175 __asm__
volatile(
SFENCE:::
"memory");
176 __asm__
volatile(
EMMS:::
"memory");
198 __asm__
volatile(
PREFETCH" %0"::
"m"(*s));
199 __asm__
volatile(
"movq %0, %%mm4"::
"m"(mask15s));
204 "movq (%1), %%mm0 \n\t"
205 "movq 8(%1), %%mm2 \n\t"
206 "movq %%mm0, %%mm1 \n\t"
207 "movq %%mm2, %%mm3 \n\t"
208 "pand %%mm4, %%mm0 \n\t"
209 "pand %%mm4, %%mm2 \n\t"
210 "paddw %%mm1, %%mm0 \n\t"
211 "paddw %%mm3, %%mm2 \n\t"
219 __asm__
volatile(
SFENCE:::
"memory");
220 __asm__
volatile(
EMMS:::
"memory");
223 register unsigned x= *((
const uint32_t *)s);
224 *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
229 register unsigned short x= *((
const uint16_t *)s);
230 *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
241 __asm__
volatile(
PREFETCH" %0"::
"m"(*s));
242 __asm__
volatile(
"movq %0, %%mm7"::
"m"(mask15rg));
243 __asm__
volatile(
"movq %0, %%mm6"::
"m"(mask15b));
248 "movq (%1), %%mm0 \n\t"
249 "movq 8(%1), %%mm2 \n\t"
250 "movq %%mm0, %%mm1 \n\t"
251 "movq %%mm2, %%mm3 \n\t"
252 "psrlq $1, %%mm0 \n\t"
253 "psrlq $1, %%mm2 \n\t"
254 "pand %%mm7, %%mm0 \n\t"
255 "pand %%mm7, %%mm2 \n\t"
256 "pand %%mm6, %%mm1 \n\t"
257 "pand %%mm6, %%mm3 \n\t"
258 "por %%mm1, %%mm0 \n\t"
259 "por %%mm3, %%mm2 \n\t"
267 __asm__
volatile(
SFENCE:::
"memory");
268 __asm__
volatile(
EMMS:::
"memory");
271 register uint32_t x= *((
const uint32_t*)s);
272 *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
277 register uint16_t x= *((
const uint16_t*)s);
278 *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
287 uint16_t *d = (uint16_t *)dst;
291 "movq %3, %%mm5 \n\t"
292 "movq %4, %%mm6 \n\t"
293 "movq %5, %%mm7 \n\t"
298 "movd (%1), %%mm0 \n\t"
299 "movd 4(%1), %%mm3 \n\t"
300 "punpckldq 8(%1), %%mm0 \n\t"
301 "punpckldq 12(%1), %%mm3 \n\t"
302 "movq %%mm0, %%mm1 \n\t"
303 "movq %%mm3, %%mm4 \n\t"
304 "pand %%mm6, %%mm0 \n\t"
305 "pand %%mm6, %%mm3 \n\t"
306 "pmaddwd %%mm7, %%mm0 \n\t"
307 "pmaddwd %%mm7, %%mm3 \n\t"
308 "pand %%mm5, %%mm1 \n\t"
309 "pand %%mm5, %%mm4 \n\t"
310 "por %%mm1, %%mm0 \n\t"
311 "por %%mm4, %%mm3 \n\t"
312 "psrld $5, %%mm0 \n\t"
313 "pslld $11, %%mm3 \n\t"
314 "por %%mm3, %%mm0 \n\t"
322 :
"r" (mm_end),
"m" (mask3216g),
"m" (mask3216br),
"m" (mul3216)
324 __asm__
volatile(
SFENCE:::
"memory");
325 __asm__
volatile(
EMMS:::
"memory");
327 register int rgb = *(
const uint32_t*)s; s += 4;
328 *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
337 uint16_t *d = (uint16_t *)dst;
339 __asm__
volatile(
PREFETCH" %0"::
"m"(*src):
"memory");
341 "movq %0, %%mm7 \n\t"
342 "movq %1, %%mm6 \n\t"
343 ::
"m"(red_16mask),
"m"(green_16mask));
348 "movd (%1), %%mm0 \n\t"
349 "movd 4(%1), %%mm3 \n\t"
350 "punpckldq 8(%1), %%mm0 \n\t"
351 "punpckldq 12(%1), %%mm3 \n\t"
352 "movq %%mm0, %%mm1 \n\t"
353 "movq %%mm0, %%mm2 \n\t"
354 "movq %%mm3, %%mm4 \n\t"
355 "movq %%mm3, %%mm5 \n\t"
356 "psllq $8, %%mm0 \n\t"
357 "psllq $8, %%mm3 \n\t"
358 "pand %%mm7, %%mm0 \n\t"
359 "pand %%mm7, %%mm3 \n\t"
360 "psrlq $5, %%mm1 \n\t"
361 "psrlq $5, %%mm4 \n\t"
362 "pand %%mm6, %%mm1 \n\t"
363 "pand %%mm6, %%mm4 \n\t"
364 "psrlq $19, %%mm2 \n\t"
365 "psrlq $19, %%mm5 \n\t"
366 "pand %2, %%mm2 \n\t"
367 "pand %2, %%mm5 \n\t"
368 "por %%mm1, %%mm0 \n\t"
369 "por %%mm4, %%mm3 \n\t"
370 "por %%mm2, %%mm0 \n\t"
371 "por %%mm5, %%mm3 \n\t"
372 "psllq $16, %%mm3 \n\t"
373 "por %%mm3, %%mm0 \n\t"
375 ::
"r"(d),
"r"(s),
"m"(blue_16mask):
"memory");
379 __asm__
volatile(
SFENCE:::
"memory");
380 __asm__
volatile(
EMMS:::
"memory");
382 register int rgb = *(
const uint32_t*)s; s += 4;
383 *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
392 uint16_t *d = (uint16_t *)dst;
396 "movq %3, %%mm5 \n\t"
397 "movq %4, %%mm6 \n\t"
398 "movq %5, %%mm7 \n\t"
403 "movd (%1), %%mm0 \n\t"
404 "movd 4(%1), %%mm3 \n\t"
405 "punpckldq 8(%1), %%mm0 \n\t"
406 "punpckldq 12(%1), %%mm3 \n\t"
407 "movq %%mm0, %%mm1 \n\t"
408 "movq %%mm3, %%mm4 \n\t"
409 "pand %%mm6, %%mm0 \n\t"
410 "pand %%mm6, %%mm3 \n\t"
411 "pmaddwd %%mm7, %%mm0 \n\t"
412 "pmaddwd %%mm7, %%mm3 \n\t"
413 "pand %%mm5, %%mm1 \n\t"
414 "pand %%mm5, %%mm4 \n\t"
415 "por %%mm1, %%mm0 \n\t"
416 "por %%mm4, %%mm3 \n\t"
417 "psrld $6, %%mm0 \n\t"
418 "pslld $10, %%mm3 \n\t"
419 "por %%mm3, %%mm0 \n\t"
427 :
"r" (mm_end),
"m" (mask3215g),
"m" (mask3216br),
"m" (mul3215)
429 __asm__
volatile(
SFENCE:::
"memory");
430 __asm__
volatile(
EMMS:::
"memory");
432 register int rgb = *(
const uint32_t*)s; s += 4;
433 *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
442 uint16_t *d = (uint16_t *)dst;
444 __asm__
volatile(
PREFETCH" %0"::
"m"(*src):
"memory");
446 "movq %0, %%mm7 \n\t"
447 "movq %1, %%mm6 \n\t"
448 ::
"m"(red_15mask),
"m"(green_15mask));
453 "movd (%1), %%mm0 \n\t"
454 "movd 4(%1), %%mm3 \n\t"
455 "punpckldq 8(%1), %%mm0 \n\t"
456 "punpckldq 12(%1), %%mm3 \n\t"
457 "movq %%mm0, %%mm1 \n\t"
458 "movq %%mm0, %%mm2 \n\t"
459 "movq %%mm3, %%mm4 \n\t"
460 "movq %%mm3, %%mm5 \n\t"
461 "psllq $7, %%mm0 \n\t"
462 "psllq $7, %%mm3 \n\t"
463 "pand %%mm7, %%mm0 \n\t"
464 "pand %%mm7, %%mm3 \n\t"
465 "psrlq $6, %%mm1 \n\t"
466 "psrlq $6, %%mm4 \n\t"
467 "pand %%mm6, %%mm1 \n\t"
468 "pand %%mm6, %%mm4 \n\t"
469 "psrlq $19, %%mm2 \n\t"
470 "psrlq $19, %%mm5 \n\t"
471 "pand %2, %%mm2 \n\t"
472 "pand %2, %%mm5 \n\t"
473 "por %%mm1, %%mm0 \n\t"
474 "por %%mm4, %%mm3 \n\t"
475 "por %%mm2, %%mm0 \n\t"
476 "por %%mm5, %%mm3 \n\t"
477 "psllq $16, %%mm3 \n\t"
478 "por %%mm3, %%mm0 \n\t"
480 ::
"r"(d),
"r"(s),
"m"(blue_15mask):
"memory");
484 __asm__
volatile(
SFENCE:::
"memory");
485 __asm__
volatile(
EMMS:::
"memory");
487 register int rgb = *(
const uint32_t*)s; s += 4;
488 *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
497 uint16_t *d = (uint16_t *)dst;
499 __asm__
volatile(
PREFETCH" %0"::
"m"(*src):
"memory");
501 "movq %0, %%mm7 \n\t"
502 "movq %1, %%mm6 \n\t"
503 ::
"m"(red_16mask),
"m"(green_16mask));
508 "movd (%1), %%mm0 \n\t"
509 "movd 3(%1), %%mm3 \n\t"
510 "punpckldq 6(%1), %%mm0 \n\t"
511 "punpckldq 9(%1), %%mm3 \n\t"
512 "movq %%mm0, %%mm1 \n\t"
513 "movq %%mm0, %%mm2 \n\t"
514 "movq %%mm3, %%mm4 \n\t"
515 "movq %%mm3, %%mm5 \n\t"
516 "psrlq $3, %%mm0 \n\t"
517 "psrlq $3, %%mm3 \n\t"
518 "pand %2, %%mm0 \n\t"
519 "pand %2, %%mm3 \n\t"
520 "psrlq $5, %%mm1 \n\t"
521 "psrlq $5, %%mm4 \n\t"
522 "pand %%mm6, %%mm1 \n\t"
523 "pand %%mm6, %%mm4 \n\t"
524 "psrlq $8, %%mm2 \n\t"
525 "psrlq $8, %%mm5 \n\t"
526 "pand %%mm7, %%mm2 \n\t"
527 "pand %%mm7, %%mm5 \n\t"
528 "por %%mm1, %%mm0 \n\t"
529 "por %%mm4, %%mm3 \n\t"
530 "por %%mm2, %%mm0 \n\t"
531 "por %%mm5, %%mm3 \n\t"
532 "psllq $16, %%mm3 \n\t"
533 "por %%mm3, %%mm0 \n\t"
535 ::
"r"(d),
"r"(s),
"m"(blue_16mask):
"memory");
539 __asm__
volatile(
SFENCE:::
"memory");
540 __asm__
volatile(
EMMS:::
"memory");
545 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
554 uint16_t *d = (uint16_t *)dst;
556 __asm__
volatile(
PREFETCH" %0"::
"m"(*src):
"memory");
558 "movq %0, %%mm7 \n\t"
559 "movq %1, %%mm6 \n\t"
560 ::
"m"(red_16mask),
"m"(green_16mask));
565 "movd (%1), %%mm0 \n\t"
566 "movd 3(%1), %%mm3 \n\t"
567 "punpckldq 6(%1), %%mm0 \n\t"
568 "punpckldq 9(%1), %%mm3 \n\t"
569 "movq %%mm0, %%mm1 \n\t"
570 "movq %%mm0, %%mm2 \n\t"
571 "movq %%mm3, %%mm4 \n\t"
572 "movq %%mm3, %%mm5 \n\t"
573 "psllq $8, %%mm0 \n\t"
574 "psllq $8, %%mm3 \n\t"
575 "pand %%mm7, %%mm0 \n\t"
576 "pand %%mm7, %%mm3 \n\t"
577 "psrlq $5, %%mm1 \n\t"
578 "psrlq $5, %%mm4 \n\t"
579 "pand %%mm6, %%mm1 \n\t"
580 "pand %%mm6, %%mm4 \n\t"
581 "psrlq $19, %%mm2 \n\t"
582 "psrlq $19, %%mm5 \n\t"
583 "pand %2, %%mm2 \n\t"
584 "pand %2, %%mm5 \n\t"
585 "por %%mm1, %%mm0 \n\t"
586 "por %%mm4, %%mm3 \n\t"
587 "por %%mm2, %%mm0 \n\t"
588 "por %%mm5, %%mm3 \n\t"
589 "psllq $16, %%mm3 \n\t"
590 "por %%mm3, %%mm0 \n\t"
592 ::
"r"(d),
"r"(s),
"m"(blue_16mask):
"memory");
596 __asm__
volatile(
SFENCE:::
"memory");
597 __asm__
volatile(
EMMS:::
"memory");
602 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
611 uint16_t *d = (uint16_t *)dst;
613 __asm__
volatile(
PREFETCH" %0"::
"m"(*src):
"memory");
615 "movq %0, %%mm7 \n\t"
616 "movq %1, %%mm6 \n\t"
617 ::
"m"(red_15mask),
"m"(green_15mask));
622 "movd (%1), %%mm0 \n\t"
623 "movd 3(%1), %%mm3 \n\t"
624 "punpckldq 6(%1), %%mm0 \n\t"
625 "punpckldq 9(%1), %%mm3 \n\t"
626 "movq %%mm0, %%mm1 \n\t"
627 "movq %%mm0, %%mm2 \n\t"
628 "movq %%mm3, %%mm4 \n\t"
629 "movq %%mm3, %%mm5 \n\t"
630 "psrlq $3, %%mm0 \n\t"
631 "psrlq $3, %%mm3 \n\t"
632 "pand %2, %%mm0 \n\t"
633 "pand %2, %%mm3 \n\t"
634 "psrlq $6, %%mm1 \n\t"
635 "psrlq $6, %%mm4 \n\t"
636 "pand %%mm6, %%mm1 \n\t"
637 "pand %%mm6, %%mm4 \n\t"
638 "psrlq $9, %%mm2 \n\t"
639 "psrlq $9, %%mm5 \n\t"
640 "pand %%mm7, %%mm2 \n\t"
641 "pand %%mm7, %%mm5 \n\t"
642 "por %%mm1, %%mm0 \n\t"
643 "por %%mm4, %%mm3 \n\t"
644 "por %%mm2, %%mm0 \n\t"
645 "por %%mm5, %%mm3 \n\t"
646 "psllq $16, %%mm3 \n\t"
647 "por %%mm3, %%mm0 \n\t"
649 ::
"r"(d),
"r"(s),
"m"(blue_15mask):
"memory");
653 __asm__
volatile(
SFENCE:::
"memory");
654 __asm__
volatile(
EMMS:::
"memory");
659 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
668 uint16_t *d = (uint16_t *)dst;
670 __asm__
volatile(
PREFETCH" %0"::
"m"(*src):
"memory");
672 "movq %0, %%mm7 \n\t"
673 "movq %1, %%mm6 \n\t"
674 ::
"m"(red_15mask),
"m"(green_15mask));
679 "movd (%1), %%mm0 \n\t"
680 "movd 3(%1), %%mm3 \n\t"
681 "punpckldq 6(%1), %%mm0 \n\t"
682 "punpckldq 9(%1), %%mm3 \n\t"
683 "movq %%mm0, %%mm1 \n\t"
684 "movq %%mm0, %%mm2 \n\t"
685 "movq %%mm3, %%mm4 \n\t"
686 "movq %%mm3, %%mm5 \n\t"
687 "psllq $7, %%mm0 \n\t"
688 "psllq $7, %%mm3 \n\t"
689 "pand %%mm7, %%mm0 \n\t"
690 "pand %%mm7, %%mm3 \n\t"
691 "psrlq $6, %%mm1 \n\t"
692 "psrlq $6, %%mm4 \n\t"
693 "pand %%mm6, %%mm1 \n\t"
694 "pand %%mm6, %%mm4 \n\t"
695 "psrlq $19, %%mm2 \n\t"
696 "psrlq $19, %%mm5 \n\t"
697 "pand %2, %%mm2 \n\t"
698 "pand %2, %%mm5 \n\t"
699 "por %%mm1, %%mm0 \n\t"
700 "por %%mm4, %%mm3 \n\t"
701 "por %%mm2, %%mm0 \n\t"
702 "por %%mm5, %%mm3 \n\t"
703 "psllq $16, %%mm3 \n\t"
704 "por %%mm3, %%mm0 \n\t"
706 ::
"r"(d),
"r"(s),
"m"(blue_15mask):
"memory");
710 __asm__
volatile(
SFENCE:::
"memory");
711 __asm__
volatile(
EMMS:::
"memory");
716 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
744 const uint16_t *mm_end;
746 const uint16_t *s = (
const uint16_t*)src;
747 end = s + src_size/2;
748 __asm__
volatile(
PREFETCH" %0"::
"m"(*s):
"memory");
753 "movq (%1), %%mm0 \n\t"
754 "movq (%1), %%mm1 \n\t"
755 "movq (%1), %%mm2 \n\t"
756 "pand %2, %%mm0 \n\t"
757 "pand %3, %%mm1 \n\t"
758 "pand %4, %%mm2 \n\t"
759 "psllq $3, %%mm0 \n\t"
760 "psrlq $2, %%mm1 \n\t"
761 "psrlq $7, %%mm2 \n\t"
762 "movq %%mm0, %%mm3 \n\t"
763 "movq %%mm1, %%mm4 \n\t"
764 "movq %%mm2, %%mm5 \n\t"
765 "punpcklwd %5, %%mm0 \n\t"
766 "punpcklwd %5, %%mm1 \n\t"
767 "punpcklwd %5, %%mm2 \n\t"
768 "punpckhwd %5, %%mm3 \n\t"
769 "punpckhwd %5, %%mm4 \n\t"
770 "punpckhwd %5, %%mm5 \n\t"
771 "psllq $8, %%mm1 \n\t"
772 "psllq $16, %%mm2 \n\t"
773 "por %%mm1, %%mm0 \n\t"
774 "por %%mm2, %%mm0 \n\t"
775 "psllq $8, %%mm4 \n\t"
776 "psllq $16, %%mm5 \n\t"
777 "por %%mm4, %%mm3 \n\t"
778 "por %%mm5, %%mm3 \n\t"
780 "movq %%mm0, %%mm6 \n\t"
781 "movq %%mm3, %%mm7 \n\t"
783 "movq 8(%1), %%mm0 \n\t"
784 "movq 8(%1), %%mm1 \n\t"
785 "movq 8(%1), %%mm2 \n\t"
786 "pand %2, %%mm0 \n\t"
787 "pand %3, %%mm1 \n\t"
788 "pand %4, %%mm2 \n\t"
789 "psllq $3, %%mm0 \n\t"
790 "psrlq $2, %%mm1 \n\t"
791 "psrlq $7, %%mm2 \n\t"
792 "movq %%mm0, %%mm3 \n\t"
793 "movq %%mm1, %%mm4 \n\t"
794 "movq %%mm2, %%mm5 \n\t"
795 "punpcklwd %5, %%mm0 \n\t"
796 "punpcklwd %5, %%mm1 \n\t"
797 "punpcklwd %5, %%mm2 \n\t"
798 "punpckhwd %5, %%mm3 \n\t"
799 "punpckhwd %5, %%mm4 \n\t"
800 "punpckhwd %5, %%mm5 \n\t"
801 "psllq $8, %%mm1 \n\t"
802 "psllq $16, %%mm2 \n\t"
803 "por %%mm1, %%mm0 \n\t"
804 "por %%mm2, %%mm0 \n\t"
805 "psllq $8, %%mm4 \n\t"
806 "psllq $16, %%mm5 \n\t"
807 "por %%mm4, %%mm3 \n\t"
808 "por %%mm5, %%mm3 \n\t"
811 :
"r"(s),
"m"(mask15b),
"m"(mask15g),
"m"(mask15r),
"m"(mmx_null)
815 "movq %%mm0, %%mm4 \n\t"
816 "movq %%mm3, %%mm5 \n\t"
817 "movq %%mm6, %%mm0 \n\t"
818 "movq %%mm7, %%mm1 \n\t"
820 "movq %%mm4, %%mm6 \n\t"
821 "movq %%mm5, %%mm7 \n\t"
822 "movq %%mm0, %%mm2 \n\t"
823 "movq %%mm1, %%mm3 \n\t"
832 __asm__
volatile(
SFENCE:::
"memory");
833 __asm__
volatile(
EMMS:::
"memory");
835 register uint16_t bgr;
837 *d++ = (bgr&0x1F)<<3;
838 *d++ = (bgr&0x3E0)>>2;
839 *d++ = (bgr&0x7C00)>>7;
846 const uint16_t *mm_end;
848 const uint16_t *s = (
const uint16_t *)src;
849 end = s + src_size/2;
850 __asm__
volatile(
PREFETCH" %0"::
"m"(*s):
"memory");
855 "movq (%1), %%mm0 \n\t"
856 "movq (%1), %%mm1 \n\t"
857 "movq (%1), %%mm2 \n\t"
858 "pand %2, %%mm0 \n\t"
859 "pand %3, %%mm1 \n\t"
860 "pand %4, %%mm2 \n\t"
861 "psllq $3, %%mm0 \n\t"
862 "psrlq $3, %%mm1 \n\t"
863 "psrlq $8, %%mm2 \n\t"
864 "movq %%mm0, %%mm3 \n\t"
865 "movq %%mm1, %%mm4 \n\t"
866 "movq %%mm2, %%mm5 \n\t"
867 "punpcklwd %5, %%mm0 \n\t"
868 "punpcklwd %5, %%mm1 \n\t"
869 "punpcklwd %5, %%mm2 \n\t"
870 "punpckhwd %5, %%mm3 \n\t"
871 "punpckhwd %5, %%mm4 \n\t"
872 "punpckhwd %5, %%mm5 \n\t"
873 "psllq $8, %%mm1 \n\t"
874 "psllq $16, %%mm2 \n\t"
875 "por %%mm1, %%mm0 \n\t"
876 "por %%mm2, %%mm0 \n\t"
877 "psllq $8, %%mm4 \n\t"
878 "psllq $16, %%mm5 \n\t"
879 "por %%mm4, %%mm3 \n\t"
880 "por %%mm5, %%mm3 \n\t"
882 "movq %%mm0, %%mm6 \n\t"
883 "movq %%mm3, %%mm7 \n\t"
885 "movq 8(%1), %%mm0 \n\t"
886 "movq 8(%1), %%mm1 \n\t"
887 "movq 8(%1), %%mm2 \n\t"
888 "pand %2, %%mm0 \n\t"
889 "pand %3, %%mm1 \n\t"
890 "pand %4, %%mm2 \n\t"
891 "psllq $3, %%mm0 \n\t"
892 "psrlq $3, %%mm1 \n\t"
893 "psrlq $8, %%mm2 \n\t"
894 "movq %%mm0, %%mm3 \n\t"
895 "movq %%mm1, %%mm4 \n\t"
896 "movq %%mm2, %%mm5 \n\t"
897 "punpcklwd %5, %%mm0 \n\t"
898 "punpcklwd %5, %%mm1 \n\t"
899 "punpcklwd %5, %%mm2 \n\t"
900 "punpckhwd %5, %%mm3 \n\t"
901 "punpckhwd %5, %%mm4 \n\t"
902 "punpckhwd %5, %%mm5 \n\t"
903 "psllq $8, %%mm1 \n\t"
904 "psllq $16, %%mm2 \n\t"
905 "por %%mm1, %%mm0 \n\t"
906 "por %%mm2, %%mm0 \n\t"
907 "psllq $8, %%mm4 \n\t"
908 "psllq $16, %%mm5 \n\t"
909 "por %%mm4, %%mm3 \n\t"
910 "por %%mm5, %%mm3 \n\t"
912 :
"r"(s),
"m"(mask16b),
"m"(mask16g),
"m"(mask16r),
"m"(mmx_null)
916 "movq %%mm0, %%mm4 \n\t"
917 "movq %%mm3, %%mm5 \n\t"
918 "movq %%mm6, %%mm0 \n\t"
919 "movq %%mm7, %%mm1 \n\t"
921 "movq %%mm4, %%mm6 \n\t"
922 "movq %%mm5, %%mm7 \n\t"
923 "movq %%mm0, %%mm2 \n\t"
924 "movq %%mm1, %%mm3 \n\t"
933 __asm__
volatile(
SFENCE:::
"memory");
934 __asm__
volatile(
EMMS:::
"memory");
936 register uint16_t bgr;
938 *d++ = (bgr&0x1F)<<3;
939 *d++ = (bgr&0x7E0)>>3;
940 *d++ = (bgr&0xF800)>>8;
952 "packuswb %%mm7, %%mm0 \n\t" \
953 "packuswb %%mm7, %%mm1 \n\t" \
954 "packuswb %%mm7, %%mm2 \n\t" \
955 "punpcklbw %%mm1, %%mm0 \n\t" \
956 "punpcklbw %%mm6, %%mm2 \n\t" \
957 "movq %%mm0, %%mm3 \n\t" \
958 "punpcklwd %%mm2, %%mm0 \n\t" \
959 "punpckhwd %%mm2, %%mm3 \n\t" \
960 MOVNTQ" %%mm0, (%0) \n\t" \
961 MOVNTQ" %%mm3, 8(%0) \n\t" \
966 const uint16_t *mm_end;
968 const uint16_t *s = (
const uint16_t *)src;
969 end = s + src_size/2;
970 __asm__
volatile(
PREFETCH" %0"::
"m"(*s):
"memory");
971 __asm__
volatile(
"pxor %%mm7,%%mm7 \n\t":::
"memory");
972 __asm__
volatile(
"pcmpeqd %%mm6,%%mm6 \n\t":::
"memory");
977 "movq (%1), %%mm0 \n\t"
978 "movq (%1), %%mm1 \n\t"
979 "movq (%1), %%mm2 \n\t"
980 "pand %2, %%mm0 \n\t"
981 "pand %3, %%mm1 \n\t"
982 "pand %4, %%mm2 \n\t"
983 "psllq $3, %%mm0 \n\t"
984 "psrlq $2, %%mm1 \n\t"
985 "psrlq $7, %%mm2 \n\t"
987 ::
"r"(d),
"r"(s),
"m"(mask15b),
"m"(mask15g),
"m"(mask15r)
992 __asm__
volatile(
SFENCE:::
"memory");
993 __asm__
volatile(
EMMS:::
"memory");
995 register uint16_t bgr;
997 *d++ = (bgr&0x1F)<<3;
998 *d++ = (bgr&0x3E0)>>2;
999 *d++ = (bgr&0x7C00)>>7;
1006 const uint16_t *end;
1007 const uint16_t *mm_end;
1009 const uint16_t *s = (
const uint16_t*)src;
1010 end = s + src_size/2;
1011 __asm__
volatile(
PREFETCH" %0"::
"m"(*s):
"memory");
1012 __asm__
volatile(
"pxor %%mm7,%%mm7 \n\t":::
"memory");
1013 __asm__
volatile(
"pcmpeqd %%mm6,%%mm6 \n\t":::
"memory");
1015 while (s < mm_end) {
1018 "movq (%1), %%mm0 \n\t"
1019 "movq (%1), %%mm1 \n\t"
1020 "movq (%1), %%mm2 \n\t"
1021 "pand %2, %%mm0 \n\t"
1022 "pand %3, %%mm1 \n\t"
1023 "pand %4, %%mm2 \n\t"
1024 "psllq $3, %%mm0 \n\t"
1025 "psrlq $3, %%mm1 \n\t"
1026 "psrlq $8, %%mm2 \n\t"
1028 ::
"r"(d),
"r"(s),
"m"(mask16b),
"m"(mask16g),
"m"(mask16r)
1033 __asm__
volatile(
SFENCE:::
"memory");
1034 __asm__
volatile(
EMMS:::
"memory");
1036 register uint16_t bgr;
1038 *d++ = (bgr&0x1F)<<3;
1039 *d++ = (bgr&0x7E0)>>3;
1040 *d++ = (bgr&0xF800)>>8;
1054 "movq %3, %%mm7 \n\t"
1055 "pxor %4, %%mm7 \n\t"
1056 "movq %%mm7, %%mm6 \n\t"
1057 "pxor %5, %%mm7 \n\t"
1061 "movq (%1, %0), %%mm0 \n\t"
1062 "movq 8(%1, %0), %%mm1 \n\t"
1063 # if COMPILE_TEMPLATE_MMXEXT
1064 "pshufw $177, %%mm0, %%mm3 \n\t"
1065 "pshufw $177, %%mm1, %%mm5 \n\t"
1066 "pand %%mm7, %%mm0 \n\t"
1067 "pand %%mm6, %%mm3 \n\t"
1068 "pand %%mm7, %%mm1 \n\t"
1069 "pand %%mm6, %%mm5 \n\t"
1070 "por %%mm3, %%mm0 \n\t"
1071 "por %%mm5, %%mm1 \n\t"
1073 "movq %%mm0, %%mm2 \n\t"
1074 "movq %%mm1, %%mm4 \n\t"
1075 "pand %%mm7, %%mm0 \n\t"
1076 "pand %%mm6, %%mm2 \n\t"
1077 "pand %%mm7, %%mm1 \n\t"
1078 "pand %%mm6, %%mm4 \n\t"
1079 "movq %%mm2, %%mm3 \n\t"
1080 "movq %%mm4, %%mm5 \n\t"
1081 "pslld $16, %%mm2 \n\t"
1082 "psrld $16, %%mm3 \n\t"
1083 "pslld $16, %%mm4 \n\t"
1084 "psrld $16, %%mm5 \n\t"
1085 "por %%mm2, %%mm0 \n\t"
1086 "por %%mm4, %%mm1 \n\t"
1087 "por %%mm3, %%mm0 \n\t"
1088 "por %%mm5, %%mm1 \n\t"
1090 MOVNTQ" %%mm0, (%2, %0) \n\t"
1091 MOVNTQ" %%mm1, 8(%2, %0) \n\t"
1098 :
"r" (s),
"r" (d),
"m" (mask32b),
"m" (mask32r),
"m" (mmx_one)
1100 for (; idx<15; idx+=4) {
1101 register int v = *(
const uint32_t *)&s[idx],
g = v & 0xff00ff00;
1103 *(uint32_t *)&d[idx] = (v>>16) +
g + (v<<16);
1110 x86_reg mmx_size= 23 - src_size;
1112 "test %%"REG_a
", %%"REG_a
" \n\t"
1114 "movq "MANGLE(mask24r)
", %%mm5 \n\t"
1115 "movq "MANGLE(mask24g)
", %%mm6 \n\t"
1116 "movq "MANGLE(mask24b)
", %%mm7 \n\t"
1120 "movq (%1, %%"REG_a
"), %%mm0 \n\t"
1121 "movq (%1, %%"REG_a
"), %%mm1 \n\t"
1122 "movq 2(%1, %%"REG_a
"), %%mm2 \n\t"
1123 "psllq $16, %%mm0 \n\t"
1124 "pand %%mm5, %%mm0 \n\t"
1125 "pand %%mm6, %%mm1 \n\t"
1126 "pand %%mm7, %%mm2 \n\t"
1127 "por %%mm0, %%mm1 \n\t"
1128 "por %%mm2, %%mm1 \n\t"
1129 "movq 6(%1, %%"REG_a
"), %%mm0 \n\t"
1130 MOVNTQ" %%mm1, (%2, %%"REG_a
") \n\t"
1131 "movq 8(%1, %%"REG_a
"), %%mm1 \n\t"
1132 "movq 10(%1, %%"REG_a
"), %%mm2 \n\t"
1133 "pand %%mm7, %%mm0 \n\t"
1134 "pand %%mm5, %%mm1 \n\t"
1135 "pand %%mm6, %%mm2 \n\t"
1136 "por %%mm0, %%mm1 \n\t"
1137 "por %%mm2, %%mm1 \n\t"
1138 "movq 14(%1, %%"REG_a
"), %%mm0 \n\t"
1139 MOVNTQ" %%mm1, 8(%2, %%"REG_a
") \n\t"
1140 "movq 16(%1, %%"REG_a
"), %%mm1 \n\t"
1141 "movq 18(%1, %%"REG_a
"), %%mm2 \n\t"
1142 "pand %%mm6, %%mm0 \n\t"
1143 "pand %%mm7, %%mm1 \n\t"
1144 "pand %%mm5, %%mm2 \n\t"
1145 "por %%mm0, %%mm1 \n\t"
1146 "por %%mm2, %%mm1 \n\t"
1147 MOVNTQ" %%mm1, 16(%2, %%"REG_a
") \n\t"
1148 "add $24, %%"REG_a
" \n\t"
1152 :
"r" (src-mmx_size),
"r"(dst-mmx_size)
1155 __asm__
volatile(
SFENCE:::
"memory");
1156 __asm__
volatile(
EMMS:::
"memory");
1158 if (mmx_size==23)
return;
1162 src_size= 23-mmx_size;
1165 for (i=0; i<src_size; i+=3) {
1168 dst[i + 1] = src[i + 1];
1169 dst[i + 2] = src[i + 0];
1176 int lumStride,
int chromStride,
int dstStride,
int vertLumPerChroma)
1180 for (y=0; y<
height; y++) {
1183 "xor %%"REG_a
", %%"REG_a
" \n\t"
1186 PREFETCH" 32(%1, %%"REG_a
", 2) \n\t"
1189 "movq (%2, %%"REG_a
"), %%mm0 \n\t"
1190 "movq %%mm0, %%mm2 \n\t"
1191 "movq (%3, %%"REG_a
"), %%mm1 \n\t"
1192 "punpcklbw %%mm1, %%mm0 \n\t"
1193 "punpckhbw %%mm1, %%mm2 \n\t"
1195 "movq (%1, %%"REG_a
",2), %%mm3 \n\t"
1196 "movq 8(%1, %%"REG_a
",2), %%mm5 \n\t"
1197 "movq %%mm3, %%mm4 \n\t"
1198 "movq %%mm5, %%mm6 \n\t"
1199 "punpcklbw %%mm0, %%mm3 \n\t"
1200 "punpckhbw %%mm0, %%mm4 \n\t"
1201 "punpcklbw %%mm2, %%mm5 \n\t"
1202 "punpckhbw %%mm2, %%mm6 \n\t"
1204 MOVNTQ" %%mm3, (%0, %%"REG_a
", 4) \n\t"
1205 MOVNTQ" %%mm4, 8(%0, %%"REG_a
", 4) \n\t"
1206 MOVNTQ" %%mm5, 16(%0, %%"REG_a
", 4) \n\t"
1207 MOVNTQ" %%mm6, 24(%0, %%"REG_a
", 4) \n\t"
1209 "add $8, %%"REG_a
" \n\t"
1210 "cmp %4, %%"REG_a
" \n\t"
1212 ::
"r"(dst),
"r"(ysrc),
"r"(usrc),
"r"(vsrc),
"g" (chromWidth)
1215 if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
1216 usrc += chromStride;
1217 vsrc += chromStride;
1233 int lumStride,
int chromStride,
int dstStride)
1241 int lumStride,
int chromStride,
int dstStride,
int vertLumPerChroma)
1245 for (y=0; y<
height; y++) {
1248 "xor %%"REG_a
", %%"REG_a
" \n\t"
1251 PREFETCH" 32(%1, %%"REG_a
", 2) \n\t"
1254 "movq (%2, %%"REG_a
"), %%mm0 \n\t"
1255 "movq %%mm0, %%mm2 \n\t"
1256 "movq (%3, %%"REG_a
"), %%mm1 \n\t"
1257 "punpcklbw %%mm1, %%mm0 \n\t"
1258 "punpckhbw %%mm1, %%mm2 \n\t"
1260 "movq (%1, %%"REG_a
",2), %%mm3 \n\t"
1261 "movq 8(%1, %%"REG_a
",2), %%mm5 \n\t"
1262 "movq %%mm0, %%mm4 \n\t"
1263 "movq %%mm2, %%mm6 \n\t"
1264 "punpcklbw %%mm3, %%mm0 \n\t"
1265 "punpckhbw %%mm3, %%mm4 \n\t"
1266 "punpcklbw %%mm5, %%mm2 \n\t"
1267 "punpckhbw %%mm5, %%mm6 \n\t"
1269 MOVNTQ" %%mm0, (%0, %%"REG_a
", 4) \n\t"
1270 MOVNTQ" %%mm4, 8(%0, %%"REG_a
", 4) \n\t"
1271 MOVNTQ" %%mm2, 16(%0, %%"REG_a
", 4) \n\t"
1272 MOVNTQ" %%mm6, 24(%0, %%"REG_a
", 4) \n\t"
1274 "add $8, %%"REG_a
" \n\t"
1275 "cmp %4, %%"REG_a
" \n\t"
1277 ::
"r"(dst),
"r"(ysrc),
"r"(usrc),
"r"(vsrc),
"g" (chromWidth)
1280 if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
1281 usrc += chromStride;
1282 vsrc += chromStride;
1298 int lumStride,
int chromStride,
int dstStride)
1309 int lumStride,
int chromStride,
int dstStride)
1319 int lumStride,
int chromStride,
int dstStride)
1330 int lumStride,
int chromStride,
int srcStride)
1334 for (y=0; y<
height; y+=2) {
1336 "xor %%"REG_a
", %%"REG_a
" \n\t"
1337 "pcmpeqw %%mm7, %%mm7 \n\t"
1338 "psrlw $8, %%mm7 \n\t"
1341 PREFETCH" 64(%0, %%"REG_a
", 4) \n\t"
1342 "movq (%0, %%"REG_a
", 4), %%mm0 \n\t"
1343 "movq 8(%0, %%"REG_a
", 4), %%mm1 \n\t"
1344 "movq %%mm0, %%mm2 \n\t"
1345 "movq %%mm1, %%mm3 \n\t"
1346 "psrlw $8, %%mm0 \n\t"
1347 "psrlw $8, %%mm1 \n\t"
1348 "pand %%mm7, %%mm2 \n\t"
1349 "pand %%mm7, %%mm3 \n\t"
1350 "packuswb %%mm1, %%mm0 \n\t"
1351 "packuswb %%mm3, %%mm2 \n\t"
1353 MOVNTQ" %%mm2, (%1, %%"REG_a
", 2) \n\t"
1355 "movq 16(%0, %%"REG_a
", 4), %%mm1 \n\t"
1356 "movq 24(%0, %%"REG_a
", 4), %%mm2 \n\t"
1357 "movq %%mm1, %%mm3 \n\t"
1358 "movq %%mm2, %%mm4 \n\t"
1359 "psrlw $8, %%mm1 \n\t"
1360 "psrlw $8, %%mm2 \n\t"
1361 "pand %%mm7, %%mm3 \n\t"
1362 "pand %%mm7, %%mm4 \n\t"
1363 "packuswb %%mm2, %%mm1 \n\t"
1364 "packuswb %%mm4, %%mm3 \n\t"
1366 MOVNTQ" %%mm3, 8(%1, %%"REG_a
", 2) \n\t"
1368 "movq %%mm0, %%mm2 \n\t"
1369 "movq %%mm1, %%mm3 \n\t"
1370 "psrlw $8, %%mm0 \n\t"
1371 "psrlw $8, %%mm1 \n\t"
1372 "pand %%mm7, %%mm2 \n\t"
1373 "pand %%mm7, %%mm3 \n\t"
1374 "packuswb %%mm1, %%mm0 \n\t"
1375 "packuswb %%mm3, %%mm2 \n\t"
1377 MOVNTQ" %%mm0, (%3, %%"REG_a
") \n\t"
1378 MOVNTQ" %%mm2, (%2, %%"REG_a
") \n\t"
1380 "add $8, %%"REG_a
" \n\t"
1381 "cmp %4, %%"REG_a
" \n\t"
1383 ::
"r"(src),
"r"(ydst),
"r"(udst),
"r"(vdst),
"g" (chromWidth)
1384 :
"memory",
"%"REG_a
1391 "xor %%"REG_a
", %%"REG_a
" \n\t"
1394 PREFETCH" 64(%0, %%"REG_a
", 4) \n\t"
1395 "movq (%0, %%"REG_a
", 4), %%mm0 \n\t"
1396 "movq 8(%0, %%"REG_a
", 4), %%mm1 \n\t"
1397 "movq 16(%0, %%"REG_a
", 4), %%mm2 \n\t"
1398 "movq 24(%0, %%"REG_a
", 4), %%mm3 \n\t"
1399 "pand %%mm7, %%mm0 \n\t"
1400 "pand %%mm7, %%mm1 \n\t"
1401 "pand %%mm7, %%mm2 \n\t"
1402 "pand %%mm7, %%mm3 \n\t"
1403 "packuswb %%mm1, %%mm0 \n\t"
1404 "packuswb %%mm3, %%mm2 \n\t"
1406 MOVNTQ" %%mm0, (%1, %%"REG_a
", 2) \n\t"
1407 MOVNTQ" %%mm2, 8(%1, %%"REG_a
", 2) \n\t"
1409 "add $8, %%"REG_a
" \n\t"
1410 "cmp %4, %%"REG_a
" \n\t"
1413 ::
"r"(src),
"r"(ydst),
"r"(udst),
"r"(vdst),
"g" (chromWidth)
1414 :
"memory",
"%"REG_a
1416 udst += chromStride;
1417 vdst += chromStride;
1421 __asm__
volatile(
EMMS" \n\t"
1427 #if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
1435 for (x=0; x<srcWidth-1; x++) {
1436 dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1437 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1439 dst[2*srcWidth-1]= src[srcWidth-1];
1443 for (y=1; y<srcHeight; y++) {
1444 const x86_reg mmxSize= srcWidth&~15;
1446 "mov %4, %%"REG_a
" \n\t"
1447 "movq "MANGLE(mmx_ff)
", %%mm0 \n\t"
1448 "movq (%0, %%"REG_a
"), %%mm4 \n\t"
1449 "movq %%mm4, %%mm2 \n\t"
1450 "psllq $8, %%mm4 \n\t"
1451 "pand %%mm0, %%mm2 \n\t"
1452 "por %%mm2, %%mm4 \n\t"
1453 "movq (%1, %%"REG_a
"), %%mm5 \n\t"
1454 "movq %%mm5, %%mm3 \n\t"
1455 "psllq $8, %%mm5 \n\t"
1456 "pand %%mm0, %%mm3 \n\t"
1457 "por %%mm3, %%mm5 \n\t"
1459 "movq (%0, %%"REG_a
"), %%mm0 \n\t"
1460 "movq (%1, %%"REG_a
"), %%mm1 \n\t"
1461 "movq 1(%0, %%"REG_a
"), %%mm2 \n\t"
1462 "movq 1(%1, %%"REG_a
"), %%mm3 \n\t"
1463 PAVGB
" %%mm0, %%mm5 \n\t"
1464 PAVGB
" %%mm0, %%mm3 \n\t"
1465 PAVGB
" %%mm0, %%mm5 \n\t"
1466 PAVGB
" %%mm0, %%mm3 \n\t"
1467 PAVGB
" %%mm1, %%mm4 \n\t"
1468 PAVGB
" %%mm1, %%mm2 \n\t"
1469 PAVGB
" %%mm1, %%mm4 \n\t"
1470 PAVGB
" %%mm1, %%mm2 \n\t"
1471 "movq %%mm5, %%mm7 \n\t"
1472 "movq %%mm4, %%mm6 \n\t"
1473 "punpcklbw %%mm3, %%mm5 \n\t"
1474 "punpckhbw %%mm3, %%mm7 \n\t"
1475 "punpcklbw %%mm2, %%mm4 \n\t"
1476 "punpckhbw %%mm2, %%mm6 \n\t"
1477 MOVNTQ" %%mm5, (%2, %%"REG_a
", 2) \n\t"
1478 MOVNTQ" %%mm7, 8(%2, %%"REG_a
", 2) \n\t"
1479 MOVNTQ" %%mm4, (%3, %%"REG_a
", 2) \n\t"
1480 MOVNTQ" %%mm6, 8(%3, %%"REG_a
", 2) \n\t"
1481 "add $8, %%"REG_a
" \n\t"
1482 "movq -1(%0, %%"REG_a
"), %%mm4 \n\t"
1483 "movq -1(%1, %%"REG_a
"), %%mm5 \n\t"
1485 ::
"r" (src + mmxSize ),
"r" (src + srcStride + mmxSize ),
1486 "r" (dst + mmxSize*2),
"r" (dst + dstStride + mmxSize*2),
1491 for (x=mmxSize-1; x<srcWidth-1; x++) {
1492 dst[2*x +1]= (3*src[x+0] + src[x+srcStride+1])>>2;
1493 dst[2*x+dstStride+2]= ( src[x+0] + 3*src[x+srcStride+1])>>2;
1494 dst[2*x+dstStride+1]= ( src[x+1] + 3*src[x+srcStride ])>>2;
1495 dst[2*x +2]= (3*src[x+1] + src[x+srcStride ])>>2;
1497 dst[srcWidth*2 -1 ]= (3*src[srcWidth-1] + src[srcWidth-1 + srcStride])>>2;
1498 dst[srcWidth*2 -1 + dstStride]= ( src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
1507 for (x=0; x<srcWidth-1; x++) {
1508 dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1509 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1511 dst[2*srcWidth-1]= src[srcWidth-1];
1513 __asm__
volatile(
EMMS" \n\t"
1519 #if !COMPILE_TEMPLATE_AMD3DNOW
1528 int lumStride,
int chromStride,
int srcStride)
1531 const x86_reg chromWidth= width>>1;
1532 for (y=0; y<
height; y+=2) {
1534 "xor %%"REG_a
", %%"REG_a
" \n\t"
1535 "pcmpeqw %%mm7, %%mm7 \n\t"
1536 "psrlw $8, %%mm7 \n\t"
1539 PREFETCH" 64(%0, %%"REG_a
", 4) \n\t"
1540 "movq (%0, %%"REG_a
", 4), %%mm0 \n\t"
1541 "movq 8(%0, %%"REG_a
", 4), %%mm1 \n\t"
1542 "movq %%mm0, %%mm2 \n\t"
1543 "movq %%mm1, %%mm3 \n\t"
1544 "pand %%mm7, %%mm0 \n\t"
1545 "pand %%mm7, %%mm1 \n\t"
1546 "psrlw $8, %%mm2 \n\t"
1547 "psrlw $8, %%mm3 \n\t"
1548 "packuswb %%mm1, %%mm0 \n\t"
1549 "packuswb %%mm3, %%mm2 \n\t"
1551 MOVNTQ" %%mm2, (%1, %%"REG_a
", 2) \n\t"
1553 "movq 16(%0, %%"REG_a
", 4), %%mm1 \n\t"
1554 "movq 24(%0, %%"REG_a
", 4), %%mm2 \n\t"
1555 "movq %%mm1, %%mm3 \n\t"
1556 "movq %%mm2, %%mm4 \n\t"
1557 "pand %%mm7, %%mm1 \n\t"
1558 "pand %%mm7, %%mm2 \n\t"
1559 "psrlw $8, %%mm3 \n\t"
1560 "psrlw $8, %%mm4 \n\t"
1561 "packuswb %%mm2, %%mm1 \n\t"
1562 "packuswb %%mm4, %%mm3 \n\t"
1564 MOVNTQ" %%mm3, 8(%1, %%"REG_a
", 2) \n\t"
1566 "movq %%mm0, %%mm2 \n\t"
1567 "movq %%mm1, %%mm3 \n\t"
1568 "psrlw $8, %%mm0 \n\t"
1569 "psrlw $8, %%mm1 \n\t"
1570 "pand %%mm7, %%mm2 \n\t"
1571 "pand %%mm7, %%mm3 \n\t"
1572 "packuswb %%mm1, %%mm0 \n\t"
1573 "packuswb %%mm3, %%mm2 \n\t"
1575 MOVNTQ" %%mm0, (%3, %%"REG_a
") \n\t"
1576 MOVNTQ" %%mm2, (%2, %%"REG_a
") \n\t"
1578 "add $8, %%"REG_a
" \n\t"
1579 "cmp %4, %%"REG_a
" \n\t"
1581 ::
"r"(src),
"r"(ydst),
"r"(udst),
"r"(vdst),
"g" (chromWidth)
1582 :
"memory",
"%"REG_a
1589 "xor %%"REG_a
", %%"REG_a
" \n\t"
1592 PREFETCH" 64(%0, %%"REG_a
", 4) \n\t"
1593 "movq (%0, %%"REG_a
", 4), %%mm0 \n\t"
1594 "movq 8(%0, %%"REG_a
", 4), %%mm1 \n\t"
1595 "movq 16(%0, %%"REG_a
", 4), %%mm2 \n\t"
1596 "movq 24(%0, %%"REG_a
", 4), %%mm3 \n\t"
1597 "psrlw $8, %%mm0 \n\t"
1598 "psrlw $8, %%mm1 \n\t"
1599 "psrlw $8, %%mm2 \n\t"
1600 "psrlw $8, %%mm3 \n\t"
1601 "packuswb %%mm1, %%mm0 \n\t"
1602 "packuswb %%mm3, %%mm2 \n\t"
1604 MOVNTQ" %%mm0, (%1, %%"REG_a
", 2) \n\t"
1605 MOVNTQ" %%mm2, 8(%1, %%"REG_a
", 2) \n\t"
1607 "add $8, %%"REG_a
" \n\t"
1608 "cmp %4, %%"REG_a
" \n\t"
1611 ::
"r"(src),
"r"(ydst),
"r"(udst),
"r"(vdst),
"g" (chromWidth)
1612 :
"memory",
"%"REG_a
1614 udst += chromStride;
1615 vdst += chromStride;
1619 __asm__
volatile(
EMMS" \n\t"
1634 int lumStride,
int chromStride,
int srcStride)
1637 const x86_reg chromWidth= width>>1;
1638 for (y=0; y<height-2; y+=2) {
1640 for (i=0; i<2; i++) {
1642 "mov %2, %%"REG_a
" \n\t"
1643 "movq "MANGLE(ff_bgr2YCoeff)
", %%mm6 \n\t"
1644 "movq "MANGLE(ff_w1111)
", %%mm5 \n\t"
1645 "pxor %%mm7, %%mm7 \n\t"
1646 "lea (%%"REG_a
", %%"REG_a
", 2), %%"REG_d
" \n\t"
1650 "movd (%0, %%"REG_d
"), %%mm0 \n\t"
1651 "movd 3(%0, %%"REG_d
"), %%mm1 \n\t"
1652 "punpcklbw %%mm7, %%mm0 \n\t"
1653 "punpcklbw %%mm7, %%mm1 \n\t"
1654 "movd 6(%0, %%"REG_d
"), %%mm2 \n\t"
1655 "movd 9(%0, %%"REG_d
"), %%mm3 \n\t"
1656 "punpcklbw %%mm7, %%mm2 \n\t"
1657 "punpcklbw %%mm7, %%mm3 \n\t"
1658 "pmaddwd %%mm6, %%mm0 \n\t"
1659 "pmaddwd %%mm6, %%mm1 \n\t"
1660 "pmaddwd %%mm6, %%mm2 \n\t"
1661 "pmaddwd %%mm6, %%mm3 \n\t"
1662 #ifndef FAST_BGR2YV12
1663 "psrad $8, %%mm0 \n\t"
1664 "psrad $8, %%mm1 \n\t"
1665 "psrad $8, %%mm2 \n\t"
1666 "psrad $8, %%mm3 \n\t"
1668 "packssdw %%mm1, %%mm0 \n\t"
1669 "packssdw %%mm3, %%mm2 \n\t"
1670 "pmaddwd %%mm5, %%mm0 \n\t"
1671 "pmaddwd %%mm5, %%mm2 \n\t"
1672 "packssdw %%mm2, %%mm0 \n\t"
1673 "psraw $7, %%mm0 \n\t"
1675 "movd 12(%0, %%"REG_d
"), %%mm4 \n\t"
1676 "movd 15(%0, %%"REG_d
"), %%mm1 \n\t"
1677 "punpcklbw %%mm7, %%mm4 \n\t"
1678 "punpcklbw %%mm7, %%mm1 \n\t"
1679 "movd 18(%0, %%"REG_d
"), %%mm2 \n\t"
1680 "movd 21(%0, %%"REG_d
"), %%mm3 \n\t"
1681 "punpcklbw %%mm7, %%mm2 \n\t"
1682 "punpcklbw %%mm7, %%mm3 \n\t"
1683 "pmaddwd %%mm6, %%mm4 \n\t"
1684 "pmaddwd %%mm6, %%mm1 \n\t"
1685 "pmaddwd %%mm6, %%mm2 \n\t"
1686 "pmaddwd %%mm6, %%mm3 \n\t"
1687 #ifndef FAST_BGR2YV12
1688 "psrad $8, %%mm4 \n\t"
1689 "psrad $8, %%mm1 \n\t"
1690 "psrad $8, %%mm2 \n\t"
1691 "psrad $8, %%mm3 \n\t"
1693 "packssdw %%mm1, %%mm4 \n\t"
1694 "packssdw %%mm3, %%mm2 \n\t"
1695 "pmaddwd %%mm5, %%mm4 \n\t"
1696 "pmaddwd %%mm5, %%mm2 \n\t"
1697 "add $24, %%"REG_d
" \n\t"
1698 "packssdw %%mm2, %%mm4 \n\t"
1699 "psraw $7, %%mm4 \n\t"
1701 "packuswb %%mm4, %%mm0 \n\t"
1702 "paddusb "MANGLE(ff_bgr2YOffset)
", %%mm0 \n\t"
1704 MOVNTQ" %%mm0, (%1, %%"REG_a
") \n\t"
1705 "add $8, %%"REG_a
" \n\t"
1707 : :
"r" (src+width*3),
"r" (ydst+width),
"g" ((
x86_reg)-width)
1708 :
"%"REG_a,
"%"REG_d
1715 "mov %4, %%"REG_a
" \n\t"
1716 "movq "MANGLE(ff_w1111)
", %%mm5 \n\t"
1717 "movq "MANGLE(ff_bgr2UCoeff)
", %%mm6 \n\t"
1718 "pxor %%mm7, %%mm7 \n\t"
1719 "lea (%%"REG_a
", %%"REG_a
", 2), %%"REG_d
" \n\t"
1720 "add %%"REG_d
", %%"REG_d
" \n\t"
1725 #if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
1726 "movq (%0, %%"REG_d
"), %%mm0 \n\t"
1727 "movq (%1, %%"REG_d
"), %%mm1 \n\t"
1728 "movq 6(%0, %%"REG_d
"), %%mm2 \n\t"
1729 "movq 6(%1, %%"REG_d
"), %%mm3 \n\t"
1730 PAVGB
" %%mm1, %%mm0 \n\t"
1731 PAVGB
" %%mm3, %%mm2 \n\t"
1732 "movq %%mm0, %%mm1 \n\t"
1733 "movq %%mm2, %%mm3 \n\t"
1734 "psrlq $24, %%mm0 \n\t"
1735 "psrlq $24, %%mm2 \n\t"
1736 PAVGB
" %%mm1, %%mm0 \n\t"
1737 PAVGB
" %%mm3, %%mm2 \n\t"
1738 "punpcklbw %%mm7, %%mm0 \n\t"
1739 "punpcklbw %%mm7, %%mm2 \n\t"
1741 "movd (%0, %%"REG_d
"), %%mm0 \n\t"
1742 "movd (%1, %%"REG_d
"), %%mm1 \n\t"
1743 "movd 3(%0, %%"REG_d
"), %%mm2 \n\t"
1744 "movd 3(%1, %%"REG_d
"), %%mm3 \n\t"
1745 "punpcklbw %%mm7, %%mm0 \n\t"
1746 "punpcklbw %%mm7, %%mm1 \n\t"
1747 "punpcklbw %%mm7, %%mm2 \n\t"
1748 "punpcklbw %%mm7, %%mm3 \n\t"
1749 "paddw %%mm1, %%mm0 \n\t"
1750 "paddw %%mm3, %%mm2 \n\t"
1751 "paddw %%mm2, %%mm0 \n\t"
1752 "movd 6(%0, %%"REG_d
"), %%mm4 \n\t"
1753 "movd 6(%1, %%"REG_d
"), %%mm1 \n\t"
1754 "movd 9(%0, %%"REG_d
"), %%mm2 \n\t"
1755 "movd 9(%1, %%"REG_d
"), %%mm3 \n\t"
1756 "punpcklbw %%mm7, %%mm4 \n\t"
1757 "punpcklbw %%mm7, %%mm1 \n\t"
1758 "punpcklbw %%mm7, %%mm2 \n\t"
1759 "punpcklbw %%mm7, %%mm3 \n\t"
1760 "paddw %%mm1, %%mm4 \n\t"
1761 "paddw %%mm3, %%mm2 \n\t"
1762 "paddw %%mm4, %%mm2 \n\t"
1763 "psrlw $2, %%mm0 \n\t"
1764 "psrlw $2, %%mm2 \n\t"
1766 "movq "MANGLE(ff_bgr2VCoeff)
", %%mm1 \n\t"
1767 "movq "MANGLE(ff_bgr2VCoeff)
", %%mm3 \n\t"
1769 "pmaddwd %%mm0, %%mm1 \n\t"
1770 "pmaddwd %%mm2, %%mm3 \n\t"
1771 "pmaddwd %%mm6, %%mm0 \n\t"
1772 "pmaddwd %%mm6, %%mm2 \n\t"
1773 #ifndef FAST_BGR2YV12
1774 "psrad $8, %%mm0 \n\t"
1775 "psrad $8, %%mm1 \n\t"
1776 "psrad $8, %%mm2 \n\t"
1777 "psrad $8, %%mm3 \n\t"
1779 "packssdw %%mm2, %%mm0 \n\t"
1780 "packssdw %%mm3, %%mm1 \n\t"
1781 "pmaddwd %%mm5, %%mm0 \n\t"
1782 "pmaddwd %%mm5, %%mm1 \n\t"
1783 "packssdw %%mm1, %%mm0 \n\t"
1784 "psraw $7, %%mm0 \n\t"
1786 #if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
1787 "movq 12(%0, %%"REG_d
"), %%mm4 \n\t"
1788 "movq 12(%1, %%"REG_d
"), %%mm1 \n\t"
1789 "movq 18(%0, %%"REG_d
"), %%mm2 \n\t"
1790 "movq 18(%1, %%"REG_d
"), %%mm3 \n\t"
1791 PAVGB
" %%mm1, %%mm4 \n\t"
1792 PAVGB
" %%mm3, %%mm2 \n\t"
1793 "movq %%mm4, %%mm1 \n\t"
1794 "movq %%mm2, %%mm3 \n\t"
1795 "psrlq $24, %%mm4 \n\t"
1796 "psrlq $24, %%mm2 \n\t"
1797 PAVGB
" %%mm1, %%mm4 \n\t"
1798 PAVGB
" %%mm3, %%mm2 \n\t"
1799 "punpcklbw %%mm7, %%mm4 \n\t"
1800 "punpcklbw %%mm7, %%mm2 \n\t"
1802 "movd 12(%0, %%"REG_d
"), %%mm4 \n\t"
1803 "movd 12(%1, %%"REG_d
"), %%mm1 \n\t"
1804 "movd 15(%0, %%"REG_d
"), %%mm2 \n\t"
1805 "movd 15(%1, %%"REG_d
"), %%mm3 \n\t"
1806 "punpcklbw %%mm7, %%mm4 \n\t"
1807 "punpcklbw %%mm7, %%mm1 \n\t"
1808 "punpcklbw %%mm7, %%mm2 \n\t"
1809 "punpcklbw %%mm7, %%mm3 \n\t"
1810 "paddw %%mm1, %%mm4 \n\t"
1811 "paddw %%mm3, %%mm2 \n\t"
1812 "paddw %%mm2, %%mm4 \n\t"
1813 "movd 18(%0, %%"REG_d
"), %%mm5 \n\t"
1814 "movd 18(%1, %%"REG_d
"), %%mm1 \n\t"
1815 "movd 21(%0, %%"REG_d
"), %%mm2 \n\t"
1816 "movd 21(%1, %%"REG_d
"), %%mm3 \n\t"
1817 "punpcklbw %%mm7, %%mm5 \n\t"
1818 "punpcklbw %%mm7, %%mm1 \n\t"
1819 "punpcklbw %%mm7, %%mm2 \n\t"
1820 "punpcklbw %%mm7, %%mm3 \n\t"
1821 "paddw %%mm1, %%mm5 \n\t"
1822 "paddw %%mm3, %%mm2 \n\t"
1823 "paddw %%mm5, %%mm2 \n\t"
1824 "movq "MANGLE(ff_w1111)
", %%mm5 \n\t"
1825 "psrlw $2, %%mm4 \n\t"
1826 "psrlw $2, %%mm2 \n\t"
1828 "movq "MANGLE(ff_bgr2VCoeff)
", %%mm1 \n\t"
1829 "movq "MANGLE(ff_bgr2VCoeff)
", %%mm3 \n\t"
1831 "pmaddwd %%mm4, %%mm1 \n\t"
1832 "pmaddwd %%mm2, %%mm3 \n\t"
1833 "pmaddwd %%mm6, %%mm4 \n\t"
1834 "pmaddwd %%mm6, %%mm2 \n\t"
1835 #ifndef FAST_BGR2YV12
1836 "psrad $8, %%mm4 \n\t"
1837 "psrad $8, %%mm1 \n\t"
1838 "psrad $8, %%mm2 \n\t"
1839 "psrad $8, %%mm3 \n\t"
1841 "packssdw %%mm2, %%mm4 \n\t"
1842 "packssdw %%mm3, %%mm1 \n\t"
1843 "pmaddwd %%mm5, %%mm4 \n\t"
1844 "pmaddwd %%mm5, %%mm1 \n\t"
1845 "add $24, %%"REG_d
" \n\t"
1846 "packssdw %%mm1, %%mm4 \n\t"
1847 "psraw $7, %%mm4 \n\t"
1849 "movq %%mm0, %%mm1 \n\t"
1850 "punpckldq %%mm4, %%mm0 \n\t"
1851 "punpckhdq %%mm4, %%mm1 \n\t"
1852 "packsswb %%mm1, %%mm0 \n\t"
1853 "paddb "MANGLE(ff_bgr2UVOffset)
", %%mm0 \n\t"
1854 "movd %%mm0, (%2, %%"REG_a
") \n\t"
1855 "punpckhdq %%mm0, %%mm0 \n\t"
1856 "movd %%mm0, (%3, %%"REG_a
") \n\t"
1857 "add $4, %%"REG_a
" \n\t"
1859 : :
"r" (src+chromWidth*6),
"r" (src+srcStride+chromWidth*6),
"r" (udst+chromWidth),
"r" (vdst+chromWidth),
"g" (-chromWidth)
1860 :
"%"REG_a,
"%"REG_d
1863 udst += chromStride;
1864 vdst += chromStride;
1868 __asm__
volatile(
EMMS" \n\t"
1872 rgb24toyv12_c(src, ydst, udst, vdst, width, height-y, lumStride, chromStride, srcStride);
1876 #if !COMPILE_TEMPLATE_AMD3DNOW && !COMPILE_TEMPLATE_AVX
1879 int src2Stride,
int dstStride)
1883 for (h=0; h <
height; h++) {
1886 #if COMPILE_TEMPLATE_SSE2
1888 "xor %%"REG_a
", %%"REG_a
" \n\t"
1892 "movdqa (%1, %%"REG_a
"), %%xmm0 \n\t"
1893 "movdqa (%1, %%"REG_a
"), %%xmm1 \n\t"
1894 "movdqa (%2, %%"REG_a
"), %%xmm2 \n\t"
1895 "punpcklbw %%xmm2, %%xmm0 \n\t"
1896 "punpckhbw %%xmm2, %%xmm1 \n\t"
1897 "movntdq %%xmm0, (%0, %%"REG_a
", 2) \n\t"
1898 "movntdq %%xmm1, 16(%0, %%"REG_a
", 2) \n\t"
1899 "add $16, %%"REG_a
" \n\t"
1900 "cmp %3, %%"REG_a
" \n\t"
1902 ::
"r"(dest),
"r"(src1),
"r"(src2),
"r" ((
x86_reg)width-15)
1903 :
"memory",
"%"REG_a
""
1907 "xor %%"REG_a
", %%"REG_a
" \n\t"
1911 "movq (%1, %%"REG_a
"), %%mm0 \n\t"
1912 "movq 8(%1, %%"REG_a
"), %%mm2 \n\t"
1913 "movq %%mm0, %%mm1 \n\t"
1914 "movq %%mm2, %%mm3 \n\t"
1915 "movq (%2, %%"REG_a
"), %%mm4 \n\t"
1916 "movq 8(%2, %%"REG_a
"), %%mm5 \n\t"
1917 "punpcklbw %%mm4, %%mm0 \n\t"
1918 "punpckhbw %%mm4, %%mm1 \n\t"
1919 "punpcklbw %%mm5, %%mm2 \n\t"
1920 "punpckhbw %%mm5, %%mm3 \n\t"
1921 MOVNTQ" %%mm0, (%0, %%"REG_a
", 2) \n\t"
1922 MOVNTQ" %%mm1, 8(%0, %%"REG_a
", 2) \n\t"
1923 MOVNTQ" %%mm2, 16(%0, %%"REG_a
", 2) \n\t"
1924 MOVNTQ" %%mm3, 24(%0, %%"REG_a
", 2) \n\t"
1925 "add $16, %%"REG_a
" \n\t"
1926 "cmp %3, %%"REG_a
" \n\t"
1928 ::
"r"(dest),
"r"(src1),
"r"(src2),
"r" ((
x86_reg)width-15)
1929 :
"memory",
"%"REG_a
1932 for (w= (width&(~15)); w <
width; w++) {
1933 dest[2*w+0] = src1[w];
1934 dest[2*w+1] = src2[w];
1948 #if !COMPILE_TEMPLATE_AMD3DNOW && (ARCH_X86_32 || COMPILE_TEMPLATE_SSE2) && COMPILE_TEMPLATE_MMXEXT == COMPILE_TEMPLATE_SSE2 && HAVE_YASM
1954 int dst1Stride,
int dst2Stride)
1958 for (h = 0; h <
height; h++) {
1972 #if !COMPILE_TEMPLATE_SSE2
1973 #if !COMPILE_TEMPLATE_AMD3DNOW
1977 int srcStride1,
int srcStride2,
1978 int dstStride1,
int dstStride2)
1982 w=width/2; h=height/2;
1986 ::
"m"(*(src1+srcStride1)),
"m"(*(src2+srcStride2)):
"memory");
1988 const uint8_t* s1=src1+srcStride1*(y>>1);
1991 for (;x<w-31;x+=32) {
1994 "movq (%1,%2), %%mm0 \n\t"
1995 "movq 8(%1,%2), %%mm2 \n\t"
1996 "movq 16(%1,%2), %%mm4 \n\t"
1997 "movq 24(%1,%2), %%mm6 \n\t"
1998 "movq %%mm0, %%mm1 \n\t"
1999 "movq %%mm2, %%mm3 \n\t"
2000 "movq %%mm4, %%mm5 \n\t"
2001 "movq %%mm6, %%mm7 \n\t"
2002 "punpcklbw %%mm0, %%mm0 \n\t"
2003 "punpckhbw %%mm1, %%mm1 \n\t"
2004 "punpcklbw %%mm2, %%mm2 \n\t"
2005 "punpckhbw %%mm3, %%mm3 \n\t"
2006 "punpcklbw %%mm4, %%mm4 \n\t"
2007 "punpckhbw %%mm5, %%mm5 \n\t"
2008 "punpcklbw %%mm6, %%mm6 \n\t"
2009 "punpckhbw %%mm7, %%mm7 \n\t"
2010 MOVNTQ" %%mm0, (%0,%2,2) \n\t"
2011 MOVNTQ" %%mm1, 8(%0,%2,2) \n\t"
2012 MOVNTQ" %%mm2, 16(%0,%2,2) \n\t"
2013 MOVNTQ" %%mm3, 24(%0,%2,2) \n\t"
2014 MOVNTQ" %%mm4, 32(%0,%2,2) \n\t"
2015 MOVNTQ" %%mm5, 40(%0,%2,2) \n\t"
2016 MOVNTQ" %%mm6, 48(%0,%2,2) \n\t"
2017 MOVNTQ" %%mm7, 56(%0,%2,2)"
2018 ::
"r"(d),
"r"(s1),
"r"(x)
2021 for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
2024 const uint8_t* s2=src2+srcStride2*(y>>1);
2027 for (;x<w-31;x+=32) {
2030 "movq (%1,%2), %%mm0 \n\t"
2031 "movq 8(%1,%2), %%mm2 \n\t"
2032 "movq 16(%1,%2), %%mm4 \n\t"
2033 "movq 24(%1,%2), %%mm6 \n\t"
2034 "movq %%mm0, %%mm1 \n\t"
2035 "movq %%mm2, %%mm3 \n\t"
2036 "movq %%mm4, %%mm5 \n\t"
2037 "movq %%mm6, %%mm7 \n\t"
2038 "punpcklbw %%mm0, %%mm0 \n\t"
2039 "punpckhbw %%mm1, %%mm1 \n\t"
2040 "punpcklbw %%mm2, %%mm2 \n\t"
2041 "punpckhbw %%mm3, %%mm3 \n\t"
2042 "punpcklbw %%mm4, %%mm4 \n\t"
2043 "punpckhbw %%mm5, %%mm5 \n\t"
2044 "punpcklbw %%mm6, %%mm6 \n\t"
2045 "punpckhbw %%mm7, %%mm7 \n\t"
2046 MOVNTQ" %%mm0, (%0,%2,2) \n\t"
2047 MOVNTQ" %%mm1, 8(%0,%2,2) \n\t"
2048 MOVNTQ" %%mm2, 16(%0,%2,2) \n\t"
2049 MOVNTQ" %%mm3, 24(%0,%2,2) \n\t"
2050 MOVNTQ" %%mm4, 32(%0,%2,2) \n\t"
2051 MOVNTQ" %%mm5, 40(%0,%2,2) \n\t"
2052 MOVNTQ" %%mm6, 48(%0,%2,2) \n\t"
2053 MOVNTQ" %%mm7, 56(%0,%2,2)"
2054 ::
"r"(d),
"r"(s2),
"r"(x)
2057 for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
2069 int srcStride1,
int srcStride2,
2070 int srcStride3,
int dstStride)
2076 const uint8_t* yp=src1+srcStride1*y;
2077 const uint8_t* up=src2+srcStride2*(y>>2);
2078 const uint8_t* vp=src3+srcStride3*(y>>2);
2086 "movq (%1, %0, 4), %%mm0 \n\t"
2087 "movq (%2, %0), %%mm1 \n\t"
2088 "movq (%3, %0), %%mm2 \n\t"
2089 "movq %%mm0, %%mm3 \n\t"
2090 "movq %%mm1, %%mm4 \n\t"
2091 "movq %%mm2, %%mm5 \n\t"
2092 "punpcklbw %%mm1, %%mm1 \n\t"
2093 "punpcklbw %%mm2, %%mm2 \n\t"
2094 "punpckhbw %%mm4, %%mm4 \n\t"
2095 "punpckhbw %%mm5, %%mm5 \n\t"
2097 "movq %%mm1, %%mm6 \n\t"
2098 "punpcklbw %%mm2, %%mm1 \n\t"
2099 "punpcklbw %%mm1, %%mm0 \n\t"
2100 "punpckhbw %%mm1, %%mm3 \n\t"
2101 MOVNTQ" %%mm0, (%4, %0, 8) \n\t"
2102 MOVNTQ" %%mm3, 8(%4, %0, 8) \n\t"
2104 "punpckhbw %%mm2, %%mm6 \n\t"
2105 "movq 8(%1, %0, 4), %%mm0 \n\t"
2106 "movq %%mm0, %%mm3 \n\t"
2107 "punpcklbw %%mm6, %%mm0 \n\t"
2108 "punpckhbw %%mm6, %%mm3 \n\t"
2109 MOVNTQ" %%mm0, 16(%4, %0, 8) \n\t"
2110 MOVNTQ" %%mm3, 24(%4, %0, 8) \n\t"
2112 "movq %%mm4, %%mm6 \n\t"
2113 "movq 16(%1, %0, 4), %%mm0 \n\t"
2114 "movq %%mm0, %%mm3 \n\t"
2115 "punpcklbw %%mm5, %%mm4 \n\t"
2116 "punpcklbw %%mm4, %%mm0 \n\t"
2117 "punpckhbw %%mm4, %%mm3 \n\t"
2118 MOVNTQ" %%mm0, 32(%4, %0, 8) \n\t"
2119 MOVNTQ" %%mm3, 40(%4, %0, 8) \n\t"
2121 "punpckhbw %%mm5, %%mm6 \n\t"
2122 "movq 24(%1, %0, 4), %%mm0 \n\t"
2123 "movq %%mm0, %%mm3 \n\t"
2124 "punpcklbw %%mm6, %%mm0 \n\t"
2125 "punpckhbw %%mm6, %%mm3 \n\t"
2126 MOVNTQ" %%mm0, 48(%4, %0, 8) \n\t"
2127 MOVNTQ" %%mm3, 56(%4, %0, 8) \n\t"
2130 :
"r"(yp),
"r" (up),
"r"(vp),
"r"(d)
2134 const int x2 = x<<2;
2137 d[8*x+2] = yp[x2+1];
2139 d[8*x+4] = yp[x2+2];
2141 d[8*x+6] = yp[x2+3];
2162 "pcmpeqw %%mm7, %%mm7 \n\t"
2163 "psrlw $8, %%mm7 \n\t"
2165 "movq -30(%1, %0, 2), %%mm0 \n\t"
2166 "movq -22(%1, %0, 2), %%mm1 \n\t"
2167 "movq -14(%1, %0, 2), %%mm2 \n\t"
2168 "movq -6(%1, %0, 2), %%mm3 \n\t"
2169 "pand %%mm7, %%mm0 \n\t"
2170 "pand %%mm7, %%mm1 \n\t"
2171 "pand %%mm7, %%mm2 \n\t"
2172 "pand %%mm7, %%mm3 \n\t"
2173 "packuswb %%mm1, %%mm0 \n\t"
2174 "packuswb %%mm3, %%mm2 \n\t"
2175 MOVNTQ" %%mm0,-15(%2, %0) \n\t"
2176 MOVNTQ" %%mm2,- 7(%2, %0) \n\t"
2180 :
"r"(src),
"r"(dst)
2185 dst[count]= src[2*count];
2190 #if !COMPILE_TEMPLATE_AMD3DNOW
2200 "pcmpeqw %%mm7, %%mm7 \n\t"
2201 "psrlw $8, %%mm7 \n\t"
2203 "movq -28(%1, %0, 4), %%mm0 \n\t"
2204 "movq -20(%1, %0, 4), %%mm1 \n\t"
2205 "movq -12(%1, %0, 4), %%mm2 \n\t"
2206 "movq -4(%1, %0, 4), %%mm3 \n\t"
2207 "pand %%mm7, %%mm0 \n\t"
2208 "pand %%mm7, %%mm1 \n\t"
2209 "pand %%mm7, %%mm2 \n\t"
2210 "pand %%mm7, %%mm3 \n\t"
2211 "packuswb %%mm1, %%mm0 \n\t"
2212 "packuswb %%mm3, %%mm2 \n\t"
2213 "movq %%mm0, %%mm1 \n\t"
2214 "movq %%mm2, %%mm3 \n\t"
2215 "psrlw $8, %%mm0 \n\t"
2216 "psrlw $8, %%mm2 \n\t"
2217 "pand %%mm7, %%mm1 \n\t"
2218 "pand %%mm7, %%mm3 \n\t"
2219 "packuswb %%mm2, %%mm0 \n\t"
2220 "packuswb %%mm3, %%mm1 \n\t"
2221 MOVNTQ" %%mm0,- 7(%3, %0) \n\t"
2222 MOVNTQ" %%mm1,- 7(%2, %0) \n\t"
2226 :
"r"(src),
"r"(dst0),
"r"(dst1)
2231 dst0[count]= src[4*count+0];
2232 dst1[count]= src[4*count+2];
2249 "pcmpeqw %%mm7, %%mm7 \n\t"
2250 "psrlw $8, %%mm7 \n\t"
2252 "movq -28(%1, %0, 4), %%mm0 \n\t"
2253 "movq -20(%1, %0, 4), %%mm1 \n\t"
2254 "movq -12(%1, %0, 4), %%mm2 \n\t"
2255 "movq -4(%1, %0, 4), %%mm3 \n\t"
2256 PAVGB
" -28(%2, %0, 4), %%mm0 \n\t"
2257 PAVGB
" -20(%2, %0, 4), %%mm1 \n\t"
2258 PAVGB
" -12(%2, %0, 4), %%mm2 \n\t"
2259 PAVGB
" - 4(%2, %0, 4), %%mm3 \n\t"
2260 "pand %%mm7, %%mm0 \n\t"
2261 "pand %%mm7, %%mm1 \n\t"
2262 "pand %%mm7, %%mm2 \n\t"
2263 "pand %%mm7, %%mm3 \n\t"
2264 "packuswb %%mm1, %%mm0 \n\t"
2265 "packuswb %%mm3, %%mm2 \n\t"
2266 "movq %%mm0, %%mm1 \n\t"
2267 "movq %%mm2, %%mm3 \n\t"
2268 "psrlw $8, %%mm0 \n\t"
2269 "psrlw $8, %%mm2 \n\t"
2270 "pand %%mm7, %%mm1 \n\t"
2271 "pand %%mm7, %%mm3 \n\t"
2272 "packuswb %%mm2, %%mm0 \n\t"
2273 "packuswb %%mm3, %%mm1 \n\t"
2274 MOVNTQ" %%mm0,- 7(%4, %0) \n\t"
2275 MOVNTQ" %%mm1,- 7(%3, %0) \n\t"
2279 :
"r"(src0),
"r"(src1),
"r"(dst0),
"r"(dst1)
2285 dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
2286 dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
2291 #if !COMPILE_TEMPLATE_AMD3DNOW
2301 "pcmpeqw %%mm7, %%mm7 \n\t"
2302 "psrlw $8, %%mm7 \n\t"
2304 "movq -28(%1, %0, 4), %%mm0 \n\t"
2305 "movq -20(%1, %0, 4), %%mm1 \n\t"
2306 "movq -12(%1, %0, 4), %%mm2 \n\t"
2307 "movq -4(%1, %0, 4), %%mm3 \n\t"
2308 "psrlw $8, %%mm0 \n\t"
2309 "psrlw $8, %%mm1 \n\t"
2310 "psrlw $8, %%mm2 \n\t"
2311 "psrlw $8, %%mm3 \n\t"
2312 "packuswb %%mm1, %%mm0 \n\t"
2313 "packuswb %%mm3, %%mm2 \n\t"
2314 "movq %%mm0, %%mm1 \n\t"
2315 "movq %%mm2, %%mm3 \n\t"
2316 "psrlw $8, %%mm0 \n\t"
2317 "psrlw $8, %%mm2 \n\t"
2318 "pand %%mm7, %%mm1 \n\t"
2319 "pand %%mm7, %%mm3 \n\t"
2320 "packuswb %%mm2, %%mm0 \n\t"
2321 "packuswb %%mm3, %%mm1 \n\t"
2322 MOVNTQ" %%mm0,- 7(%3, %0) \n\t"
2323 MOVNTQ" %%mm1,- 7(%2, %0) \n\t"
2327 :
"r"(src),
"r"(dst0),
"r"(dst1)
2333 dst0[count]= src[4*count+0];
2334 dst1[count]= src[4*count+2];
2351 "pcmpeqw %%mm7, %%mm7 \n\t"
2352 "psrlw $8, %%mm7 \n\t"
2354 "movq -28(%1, %0, 4), %%mm0 \n\t"
2355 "movq -20(%1, %0, 4), %%mm1 \n\t"
2356 "movq -12(%1, %0, 4), %%mm2 \n\t"
2357 "movq -4(%1, %0, 4), %%mm3 \n\t"
2358 PAVGB
" -28(%2, %0, 4), %%mm0 \n\t"
2359 PAVGB
" -20(%2, %0, 4), %%mm1 \n\t"
2360 PAVGB
" -12(%2, %0, 4), %%mm2 \n\t"
2361 PAVGB
" - 4(%2, %0, 4), %%mm3 \n\t"
2362 "psrlw $8, %%mm0 \n\t"
2363 "psrlw $8, %%mm1 \n\t"
2364 "psrlw $8, %%mm2 \n\t"
2365 "psrlw $8, %%mm3 \n\t"
2366 "packuswb %%mm1, %%mm0 \n\t"
2367 "packuswb %%mm3, %%mm2 \n\t"
2368 "movq %%mm0, %%mm1 \n\t"
2369 "movq %%mm2, %%mm3 \n\t"
2370 "psrlw $8, %%mm0 \n\t"
2371 "psrlw $8, %%mm2 \n\t"
2372 "pand %%mm7, %%mm1 \n\t"
2373 "pand %%mm7, %%mm3 \n\t"
2374 "packuswb %%mm2, %%mm0 \n\t"
2375 "packuswb %%mm3, %%mm1 \n\t"
2376 MOVNTQ" %%mm0,- 7(%4, %0) \n\t"
2377 MOVNTQ" %%mm1,- 7(%3, %0) \n\t"
2381 :
"r"(src0),
"r"(src1),
"r"(dst0),
"r"(dst1)
2389 dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
2390 dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
2397 int lumStride,
int chromStride,
int srcStride)
2400 const int chromWidth= -((-
width)>>1);
2402 for (y=0; y<
height; y++) {
2420 #if !COMPILE_TEMPLATE_AMD3DNOW
2423 int lumStride,
int chromStride,
int srcStride)
2426 const int chromWidth= -((-
width)>>1);
2428 for (y=0; y<
height; y++) {
2447 int lumStride,
int chromStride,
int srcStride)
2450 const int chromWidth= -((-
width)>>1);
2452 for (y=0; y<
height; y++) {
2470 #if !COMPILE_TEMPLATE_AMD3DNOW
2473 int lumStride,
int chromStride,
int srcStride)
2476 const int chromWidth= -((-
width)>>1);
2478 for (y=0; y<
height; y++) {
2498 #if !COMPILE_TEMPLATE_SSE2
2499 #if !COMPILE_TEMPLATE_AMD3DNOW
2529 #if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
2538 #if !COMPILE_TEMPLATE_AMD3DNOW && !COMPILE_TEMPLATE_AVX
2541 #if !COMPILE_TEMPLATE_AMD3DNOW && (ARCH_X86_32 || COMPILE_TEMPLATE_SSE2) && COMPILE_TEMPLATE_MMXEXT == COMPILE_TEMPLATE_SSE2 && HAVE_YASM
static void RENAME() rgb32tobgr15(const uint8_t *src, uint8_t *dst, int src_size)
static void RENAME() yuyvtoyuv422(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src, int width, int height, int lumStride, int chromStride, int srcStride)
static void RENAME() vu9_to_vu12(const uint8_t *src1, const uint8_t *src2, uint8_t *dst1, uint8_t *dst2, int width, int height, int srcStride1, int srcStride2, int dstStride1, int dstStride2)
static void RENAME() uyvytoyuv422(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src, int width, int height, int lumStride, int chromStride, int srcStride)
static void RENAME() rgb16tobgr24(const uint8_t *src, uint8_t *dst, int src_size)
static void RENAME() rgb24toyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, int width, int height, int lumStride, int chromStride, int srcStride)
Height should be a multiple of 2 and width should be a multiple of 2.
static void RENAME() rgb32tobgr16(const uint8_t *src, uint8_t *dst, int src_size)
static void RENAME() rgb24tobgr24(const uint8_t *src, uint8_t *dst, int src_size)
static void RENAME() yuv422ptoyuy2(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, int width, int height, int lumStride, int chromStride, int dstStride)
Width should be a multiple of 16.
static void RENAME() extract_even2(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
static void RENAME() rgb15to32(const uint8_t *src, uint8_t *dst, int src_size)
static void RENAME() uyvytoyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, int width, int height, int lumStride, int chromStride, int srcStride)
Height should be a multiple of 2 and width should be a multiple of 16.
Macro definitions for various function/variable attributes.
static void RENAME() yuy2toyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, int width, int height, int lumStride, int chromStride, int srcStride)
Height should be a multiple of 2 and width should be a multiple of 16.
void rgb24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, int width, int height, int lumStride, int chromStride, int srcStride)
Height should be a multiple of 2 and width should be a multiple of 2.
static void RENAME() rgb24tobgr15(const uint8_t *src, uint8_t *dst, int src_size)
static void RENAME() extract_odd2avg(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
static void RENAME() rgb24tobgr16(const uint8_t *src, uint8_t *dst, int src_size)
static void RENAME() uyvytoyuv420(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src, int width, int height, int lumStride, int chromStride, int srcStride)
static void RENAME() shuffle_bytes_2103(const uint8_t *src, uint8_t *dst, int src_size)
static void RENAME() yv12touyvy(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, int width, int height, int lumStride, int chromStride, int dstStride)
Height should be a multiple of 2 and width should be a multiple of 16 (if this is a problem for anyone then tell me, and I will fix it).
static av_cold void RENAME() rgb2rgb_init(void)
static void RENAME() yuvPlanartouyvy(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, int width, int height, int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
static void RENAME() rgb24to15(const uint8_t *src, uint8_t *dst, int src_size)
static void RENAME() rgb15to16(const uint8_t *src, uint8_t *dst, int src_size)
static void RENAME() rgb16to15(const uint8_t *src, uint8_t *dst, int src_size)
static void RENAME() rgb32to16(const uint8_t *src, uint8_t *dst, int src_size)
static void RENAME() extract_even2avg(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
static void RENAME() yuv422ptouyvy(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, int width, int height, int lumStride, int chromStride, int dstStride)
Width should be a multiple of 16.
void(* deinterleaveBytes)(const uint8_t *src, uint8_t *dst1, uint8_t *dst2, int width, int height, int srcStride, int dst1Stride, int dst2Stride)
static void RENAME() yuvPlanartoyuy2(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, int width, int height, int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
static void RENAME() rgb16to32(const uint8_t *src, uint8_t *dst, int src_size)
static void RENAME() rgb15tobgr24(const uint8_t *src, uint8_t *dst, int src_size)
static void(WINAPI *cond_broadcast)(pthread_cond_t *cond)
static void RENAME() extract_even(const uint8_t *src, uint8_t *dst, x86_reg count)
static void RENAME() rgb32tobgr24(const uint8_t *src, uint8_t *dst, int src_size)
static void RENAME() yuyvtoyuv420(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src, int width, int height, int lumStride, int chromStride, int srcStride)
static void RENAME() yvu9_to_yuy2(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, uint8_t *dst, int width, int height, int srcStride1, int srcStride2, int srcStride3, int dstStride)
static void RENAME() extract_odd2(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
static void RENAME() rgb24tobgr32(const uint8_t *src, uint8_t *dst, int src_size)
static void RENAME() yv12toyuy2(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, int width, int height, int lumStride, int chromStride, int dstStride)
Height should be a multiple of 2 and width should be a multiple of 16.
void(* planar2x)(const uint8_t *src, uint8_t *dst, int width, int height, int srcStride, int dstStride)
static void RENAME() rgb24to16(const uint8_t *src, uint8_t *dst, int src_size)
static void RENAME() rgb32to15(const uint8_t *src, uint8_t *dst, int src_size)
static void RENAME() interleaveBytes(const uint8_t *src1, const uint8_t *src2, uint8_t *dest, int width, int height, int src1Stride, int src2Stride, int dstStride)