rgb2rgb_template.c
1 /*
2  * software RGB to RGB converter
3  * plus software PAL8 to RGB converter
4  * software YUV to YUV converter
5  * software YUV to RGB converter
6  * Written by Nick Kurshev.
7  * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
8  * lot of big-endian byte order fixes by Alex Beregszaszi
9  *
10  * This file is part of Libav.
11  *
12  * Libav is free software; you can redistribute it and/or
13  * modify it under the terms of the GNU Lesser General Public
14  * License as published by the Free Software Foundation; either
15  * version 2.1 of the License, or (at your option) any later version.
16  *
17  * Libav is distributed in the hope that it will be useful,
18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20  * Lesser General Public License for more details.
21  *
22  * You should have received a copy of the GNU Lesser General Public
23  * License along with Libav; if not, write to the Free Software
24  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
25  */
26 
27 #include <stddef.h>
28 
29 #include "libavutil/attributes.h"
30 
31 #undef PREFETCH
32 #undef MOVNTQ
33 #undef EMMS
34 #undef SFENCE
35 #undef PAVGB
36 
37 #if COMPILE_TEMPLATE_AMD3DNOW
38 #define PREFETCH "prefetch"
39 #define PAVGB "pavgusb"
40 #elif COMPILE_TEMPLATE_MMXEXT
41 #define PREFETCH "prefetchnta"
42 #define PAVGB "pavgb"
43 #else
44 #define PREFETCH " # nop"
45 #endif
46 
47 #if COMPILE_TEMPLATE_AMD3DNOW
48 /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
49 #define EMMS "femms"
50 #else
51 #define EMMS "emms"
52 #endif
53 
54 #if COMPILE_TEMPLATE_MMXEXT
55 #define MOVNTQ "movntq"
56 #define SFENCE "sfence"
57 #else
58 #define MOVNTQ "movq"
59 #define SFENCE " # nop"
60 #endif
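/*
 * Editor's note, a minimal sketch of the pattern every function below
 * follows, written with the equivalent compiler intrinsics instead of
 * inline asm. MOVNTQ is a weakly ordered non-temporal store, so each loop
 * is followed by SFENCE; EMMS (or femms) must run afterwards because the
 * MMX registers alias the x87 floating-point stack. The intrinsics are the
 * standard ones from mmintrin.h/xmmintrin.h; the function itself is
 * hypothetical and kept out of the build.
 */
#if 0
#include <mmintrin.h>
#include <xmmintrin.h>

static void copy_nt_sketch(__m64 *dst, const __m64 *src, int n)
{
    int i;
    for (i = 0; i < n; i++)
        _mm_stream_pi(dst + i, src[i]); /* MOVNTQ: bypass the cache */
    _mm_sfence();                       /* SFENCE: order the NT stores */
    _mm_empty();                        /* EMMS: release the x87 stack */
}
#endif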
61 
62 #if !COMPILE_TEMPLATE_SSE2
63 
64 #if !COMPILE_TEMPLATE_AMD3DNOW
65 
66 static inline void RENAME(rgb24tobgr32)(const uint8_t *src, uint8_t *dst, int src_size)
67 {
68  uint8_t *dest = dst;
69  const uint8_t *s = src;
70  const uint8_t *end;
71  const uint8_t *mm_end;
72  end = s + src_size;
73  __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
74  mm_end = end - 23;
75  __asm__ volatile("movq %0, %%mm7"::"m"(mask32a):"memory");
76  while (s < mm_end) {
77  __asm__ volatile(
78  PREFETCH" 32(%1) \n\t"
79  "movd (%1), %%mm0 \n\t"
80  "punpckldq 3(%1), %%mm0 \n\t"
81  "movd 6(%1), %%mm1 \n\t"
82  "punpckldq 9(%1), %%mm1 \n\t"
83  "movd 12(%1), %%mm2 \n\t"
84  "punpckldq 15(%1), %%mm2 \n\t"
85  "movd 18(%1), %%mm3 \n\t"
86  "punpckldq 21(%1), %%mm3 \n\t"
87  "por %%mm7, %%mm0 \n\t"
88  "por %%mm7, %%mm1 \n\t"
89  "por %%mm7, %%mm2 \n\t"
90  "por %%mm7, %%mm3 \n\t"
91  MOVNTQ" %%mm0, (%0) \n\t"
92  MOVNTQ" %%mm1, 8(%0) \n\t"
93  MOVNTQ" %%mm2, 16(%0) \n\t"
94  MOVNTQ" %%mm3, 24(%0)"
95  :: "r"(dest), "r"(s)
96  :"memory");
97  dest += 32;
98  s += 24;
99  }
100  __asm__ volatile(SFENCE:::"memory");
101  __asm__ volatile(EMMS:::"memory");
102  while (s < end) {
103  *dest++ = *s++;
104  *dest++ = *s++;
105  *dest++ = *s++;
106  *dest++ = 255;
107  }
108 }
109 
110 #define STORE_BGR24_MMX \
111  "psrlq $8, %%mm2 \n\t" \
112  "psrlq $8, %%mm3 \n\t" \
113  "psrlq $8, %%mm6 \n\t" \
114  "psrlq $8, %%mm7 \n\t" \
115  "pand "MANGLE(mask24l)", %%mm0\n\t" \
116  "pand "MANGLE(mask24l)", %%mm1\n\t" \
117  "pand "MANGLE(mask24l)", %%mm4\n\t" \
118  "pand "MANGLE(mask24l)", %%mm5\n\t" \
119  "pand "MANGLE(mask24h)", %%mm2\n\t" \
120  "pand "MANGLE(mask24h)", %%mm3\n\t" \
121  "pand "MANGLE(mask24h)", %%mm6\n\t" \
122  "pand "MANGLE(mask24h)", %%mm7\n\t" \
123  "por %%mm2, %%mm0 \n\t" \
124  "por %%mm3, %%mm1 \n\t" \
125  "por %%mm6, %%mm4 \n\t" \
126  "por %%mm7, %%mm5 \n\t" \
127  \
128  "movq %%mm1, %%mm2 \n\t" \
129  "movq %%mm4, %%mm3 \n\t" \
130  "psllq $48, %%mm2 \n\t" \
131  "psllq $32, %%mm3 \n\t" \
132  "pand "MANGLE(mask24hh)", %%mm2\n\t" \
133  "pand "MANGLE(mask24hhh)", %%mm3\n\t" \
134  "por %%mm2, %%mm0 \n\t" \
135  "psrlq $16, %%mm1 \n\t" \
136  "psrlq $32, %%mm4 \n\t" \
137  "psllq $16, %%mm5 \n\t" \
138  "por %%mm3, %%mm1 \n\t" \
139  "pand "MANGLE(mask24hhhh)", %%mm5\n\t" \
140  "por %%mm5, %%mm4 \n\t" \
141  \
142  MOVNTQ" %%mm0, (%0) \n\t" \
143  MOVNTQ" %%mm1, 8(%0) \n\t" \
144  MOVNTQ" %%mm4, 16(%0)"
145 
146 
147 static inline void RENAME(rgb32tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
148 {
149  uint8_t *dest = dst;
150  const uint8_t *s = src;
151  const uint8_t *end;
152  const uint8_t *mm_end;
153  end = s + src_size;
154  __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
155  mm_end = end - 31;
156  while (s < mm_end) {
157  __asm__ volatile(
158  PREFETCH" 32(%1) \n\t"
159  "movq (%1), %%mm0 \n\t"
160  "movq 8(%1), %%mm1 \n\t"
161  "movq 16(%1), %%mm4 \n\t"
162  "movq 24(%1), %%mm5 \n\t"
163  "movq %%mm0, %%mm2 \n\t"
164  "movq %%mm1, %%mm3 \n\t"
165  "movq %%mm4, %%mm6 \n\t"
166  "movq %%mm5, %%mm7 \n\t"
168  :: "r"(dest), "r"(s)
169  :"memory");
170  dest += 24;
171  s += 32;
172  }
173  __asm__ volatile(SFENCE:::"memory");
174  __asm__ volatile(EMMS:::"memory");
175  while (s < end) {
176  *dest++ = *s++;
177  *dest++ = *s++;
178  *dest++ = *s++;
179  s++;
180  }
181 }
182 
183 /*
184  original by Strepto/Astral
185  ported to gcc & bugfixed: A'rpi
186  MMXEXT, 3DNOW optimization by Nick Kurshev
187  32-bit C version and the and&add trick by Michael Niedermayer
188 */
189 static inline void RENAME(rgb15to16)(const uint8_t *src, uint8_t *dst, int src_size)
190 {
191  register const uint8_t* s=src;
192  register uint8_t* d=dst;
193  register const uint8_t *end;
194  const uint8_t *mm_end;
195  end = s + src_size;
196  __asm__ volatile(PREFETCH" %0"::"m"(*s));
197  __asm__ volatile("movq %0, %%mm4"::"m"(mask15s));
198  mm_end = end - 15;
199  while (s<mm_end) {
200  __asm__ volatile(
201  PREFETCH" 32(%1) \n\t"
202  "movq (%1), %%mm0 \n\t"
203  "movq 8(%1), %%mm2 \n\t"
204  "movq %%mm0, %%mm1 \n\t"
205  "movq %%mm2, %%mm3 \n\t"
206  "pand %%mm4, %%mm0 \n\t"
207  "pand %%mm4, %%mm2 \n\t"
208  "paddw %%mm1, %%mm0 \n\t"
209  "paddw %%mm3, %%mm2 \n\t"
210  MOVNTQ" %%mm0, (%0) \n\t"
211  MOVNTQ" %%mm2, 8(%0)"
212  :: "r"(d), "r"(s)
213  );
214  d+=16;
215  s+=16;
216  }
217  __asm__ volatile(SFENCE:::"memory");
218  __asm__ volatile(EMMS:::"memory");
219  mm_end = end - 3;
220  while (s < mm_end) {
221  register unsigned x= *((const uint32_t *)s);
222  *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
223  d+=4;
224  s+=4;
225  }
226  if (s < end) {
227  register unsigned short x= *((const uint16_t *)s);
228  *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
229  }
230 }
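/*
 * Editor's note, a worked example of the "and&add" trick credited above,
 * applied to one RGB555 pixel (sketch only, kept out of the build).
 */
#if 0
static uint16_t rgb15to16_pixel(uint16_t x)
{
    /* 0x7FE0 masks the red+green fields (bits 5-14). Adding the masked
     * value back to the pixel doubles those fields, i.e. shifts them up
     * one bit into their RGB565 positions, while blue (bits 0-4) stays
     * put. The freshly exposed green LSB (bit 5) is left zero. */
    return (uint16_t)((x & 0x7FFF) + (x & 0x7FE0));
}
#endif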
231 
232 static inline void RENAME(rgb16to15)(const uint8_t *src, uint8_t *dst, int src_size)
233 {
234  register const uint8_t* s=src;
235  register uint8_t* d=dst;
236  register const uint8_t *end;
237  const uint8_t *mm_end;
238  end = s + src_size;
239  __asm__ volatile(PREFETCH" %0"::"m"(*s));
240  __asm__ volatile("movq %0, %%mm7"::"m"(mask15rg));
241  __asm__ volatile("movq %0, %%mm6"::"m"(mask15b));
242  mm_end = end - 15;
243  while (s<mm_end) {
244  __asm__ volatile(
245  PREFETCH" 32(%1) \n\t"
246  "movq (%1), %%mm0 \n\t"
247  "movq 8(%1), %%mm2 \n\t"
248  "movq %%mm0, %%mm1 \n\t"
249  "movq %%mm2, %%mm3 \n\t"
250  "psrlq $1, %%mm0 \n\t"
251  "psrlq $1, %%mm2 \n\t"
252  "pand %%mm7, %%mm0 \n\t"
253  "pand %%mm7, %%mm2 \n\t"
254  "pand %%mm6, %%mm1 \n\t"
255  "pand %%mm6, %%mm3 \n\t"
256  "por %%mm1, %%mm0 \n\t"
257  "por %%mm3, %%mm2 \n\t"
258  MOVNTQ" %%mm0, (%0) \n\t"
259  MOVNTQ" %%mm2, 8(%0)"
260  :: "r"(d), "r"(s)
261  );
262  d+=16;
263  s+=16;
264  }
265  __asm__ volatile(SFENCE:::"memory");
266  __asm__ volatile(EMMS:::"memory");
267  mm_end = end - 3;
268  while (s < mm_end) {
269  register uint32_t x= *((const uint32_t*)s);
270  *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
271  s+=4;
272  d+=4;
273  }
274  if (s < end) {
275  register uint16_t x= *((const uint16_t*)s);
276  *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
277  }
278 }
279 
280 static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, int src_size)
281 {
282  const uint8_t *s = src;
283  const uint8_t *end;
284  const uint8_t *mm_end;
285  uint16_t *d = (uint16_t *)dst;
286  end = s + src_size;
287  mm_end = end - 15;
288  __asm__ volatile(
289  "movq %3, %%mm5 \n\t"
290  "movq %4, %%mm6 \n\t"
291  "movq %5, %%mm7 \n\t"
292  "jmp 2f \n\t"
293  ".p2align 4 \n\t"
294  "1: \n\t"
295  PREFETCH" 32(%1) \n\t"
296  "movd (%1), %%mm0 \n\t"
297  "movd 4(%1), %%mm3 \n\t"
298  "punpckldq 8(%1), %%mm0 \n\t"
299  "punpckldq 12(%1), %%mm3 \n\t"
300  "movq %%mm0, %%mm1 \n\t"
301  "movq %%mm3, %%mm4 \n\t"
302  "pand %%mm6, %%mm0 \n\t"
303  "pand %%mm6, %%mm3 \n\t"
304  "pmaddwd %%mm7, %%mm0 \n\t"
305  "pmaddwd %%mm7, %%mm3 \n\t"
306  "pand %%mm5, %%mm1 \n\t"
307  "pand %%mm5, %%mm4 \n\t"
308  "por %%mm1, %%mm0 \n\t"
309  "por %%mm4, %%mm3 \n\t"
310  "psrld $5, %%mm0 \n\t"
311  "pslld $11, %%mm3 \n\t"
312  "por %%mm3, %%mm0 \n\t"
313  MOVNTQ" %%mm0, (%0) \n\t"
314  "add $16, %1 \n\t"
315  "add $8, %0 \n\t"
316  "2: \n\t"
317  "cmp %2, %1 \n\t"
318  " jb 1b \n\t"
319  : "+r" (d), "+r"(s)
320  : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
321  );
322  __asm__ volatile(SFENCE:::"memory");
323  __asm__ volatile(EMMS:::"memory");
324  while (s < end) {
325  register int rgb = *(const uint32_t*)s; s += 4;
326  *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
327  }
328 }
329 
330 static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, int src_size)
331 {
332  const uint8_t *s = src;
333  const uint8_t *end;
334  const uint8_t *mm_end;
335  uint16_t *d = (uint16_t *)dst;
336  end = s + src_size;
337  __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
338  __asm__ volatile(
339  "movq %0, %%mm7 \n\t"
340  "movq %1, %%mm6 \n\t"
341  ::"m"(red_16mask),"m"(green_16mask));
342  mm_end = end - 15;
343  while (s < mm_end) {
344  __asm__ volatile(
345  PREFETCH" 32(%1) \n\t"
346  "movd (%1), %%mm0 \n\t"
347  "movd 4(%1), %%mm3 \n\t"
348  "punpckldq 8(%1), %%mm0 \n\t"
349  "punpckldq 12(%1), %%mm3 \n\t"
350  "movq %%mm0, %%mm1 \n\t"
351  "movq %%mm0, %%mm2 \n\t"
352  "movq %%mm3, %%mm4 \n\t"
353  "movq %%mm3, %%mm5 \n\t"
354  "psllq $8, %%mm0 \n\t"
355  "psllq $8, %%mm3 \n\t"
356  "pand %%mm7, %%mm0 \n\t"
357  "pand %%mm7, %%mm3 \n\t"
358  "psrlq $5, %%mm1 \n\t"
359  "psrlq $5, %%mm4 \n\t"
360  "pand %%mm6, %%mm1 \n\t"
361  "pand %%mm6, %%mm4 \n\t"
362  "psrlq $19, %%mm2 \n\t"
363  "psrlq $19, %%mm5 \n\t"
364  "pand %2, %%mm2 \n\t"
365  "pand %2, %%mm5 \n\t"
366  "por %%mm1, %%mm0 \n\t"
367  "por %%mm4, %%mm3 \n\t"
368  "por %%mm2, %%mm0 \n\t"
369  "por %%mm5, %%mm3 \n\t"
370  "psllq $16, %%mm3 \n\t"
371  "por %%mm3, %%mm0 \n\t"
372  MOVNTQ" %%mm0, (%0) \n\t"
373  :: "r"(d),"r"(s),"m"(blue_16mask):"memory");
374  d += 4;
375  s += 16;
376  }
377  __asm__ volatile(SFENCE:::"memory");
378  __asm__ volatile(EMMS:::"memory");
379  while (s < end) {
380  register int rgb = *(const uint32_t*)s; s += 4;
381  *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
382  }
383 }
384 
385 static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, int src_size)
386 {
387  const uint8_t *s = src;
388  const uint8_t *end;
389  const uint8_t *mm_end;
390  uint16_t *d = (uint16_t *)dst;
391  end = s + src_size;
392  mm_end = end - 15;
393  __asm__ volatile(
394  "movq %3, %%mm5 \n\t"
395  "movq %4, %%mm6 \n\t"
396  "movq %5, %%mm7 \n\t"
397  "jmp 2f \n\t"
398  ".p2align 4 \n\t"
399  "1: \n\t"
400  PREFETCH" 32(%1) \n\t"
401  "movd (%1), %%mm0 \n\t"
402  "movd 4(%1), %%mm3 \n\t"
403  "punpckldq 8(%1), %%mm0 \n\t"
404  "punpckldq 12(%1), %%mm3 \n\t"
405  "movq %%mm0, %%mm1 \n\t"
406  "movq %%mm3, %%mm4 \n\t"
407  "pand %%mm6, %%mm0 \n\t"
408  "pand %%mm6, %%mm3 \n\t"
409  "pmaddwd %%mm7, %%mm0 \n\t"
410  "pmaddwd %%mm7, %%mm3 \n\t"
411  "pand %%mm5, %%mm1 \n\t"
412  "pand %%mm5, %%mm4 \n\t"
413  "por %%mm1, %%mm0 \n\t"
414  "por %%mm4, %%mm3 \n\t"
415  "psrld $6, %%mm0 \n\t"
416  "pslld $10, %%mm3 \n\t"
417  "por %%mm3, %%mm0 \n\t"
418  MOVNTQ" %%mm0, (%0) \n\t"
419  "add $16, %1 \n\t"
420  "add $8, %0 \n\t"
421  "2: \n\t"
422  "cmp %2, %1 \n\t"
423  " jb 1b \n\t"
424  : "+r" (d), "+r"(s)
425  : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
426  );
427  __asm__ volatile(SFENCE:::"memory");
428  __asm__ volatile(EMMS:::"memory");
429  while (s < end) {
430  register int rgb = *(const uint32_t*)s; s += 4;
431  *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
432  }
433 }
434 
435 static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, int src_size)
436 {
437  const uint8_t *s = src;
438  const uint8_t *end;
439  const uint8_t *mm_end;
440  uint16_t *d = (uint16_t *)dst;
441  end = s + src_size;
442  __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
443  __asm__ volatile(
444  "movq %0, %%mm7 \n\t"
445  "movq %1, %%mm6 \n\t"
446  ::"m"(red_15mask),"m"(green_15mask));
447  mm_end = end - 15;
448  while (s < mm_end) {
449  __asm__ volatile(
450  PREFETCH" 32(%1) \n\t"
451  "movd (%1), %%mm0 \n\t"
452  "movd 4(%1), %%mm3 \n\t"
453  "punpckldq 8(%1), %%mm0 \n\t"
454  "punpckldq 12(%1), %%mm3 \n\t"
455  "movq %%mm0, %%mm1 \n\t"
456  "movq %%mm0, %%mm2 \n\t"
457  "movq %%mm3, %%mm4 \n\t"
458  "movq %%mm3, %%mm5 \n\t"
459  "psllq $7, %%mm0 \n\t"
460  "psllq $7, %%mm3 \n\t"
461  "pand %%mm7, %%mm0 \n\t"
462  "pand %%mm7, %%mm3 \n\t"
463  "psrlq $6, %%mm1 \n\t"
464  "psrlq $6, %%mm4 \n\t"
465  "pand %%mm6, %%mm1 \n\t"
466  "pand %%mm6, %%mm4 \n\t"
467  "psrlq $19, %%mm2 \n\t"
468  "psrlq $19, %%mm5 \n\t"
469  "pand %2, %%mm2 \n\t"
470  "pand %2, %%mm5 \n\t"
471  "por %%mm1, %%mm0 \n\t"
472  "por %%mm4, %%mm3 \n\t"
473  "por %%mm2, %%mm0 \n\t"
474  "por %%mm5, %%mm3 \n\t"
475  "psllq $16, %%mm3 \n\t"
476  "por %%mm3, %%mm0 \n\t"
477  MOVNTQ" %%mm0, (%0) \n\t"
478  ::"r"(d),"r"(s),"m"(blue_15mask):"memory");
479  d += 4;
480  s += 16;
481  }
482  __asm__ volatile(SFENCE:::"memory");
483  __asm__ volatile(EMMS:::"memory");
484  while (s < end) {
485  register int rgb = *(const uint32_t*)s; s += 4;
486  *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
487  }
488 }
489 
490 static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, int src_size)
491 {
492  const uint8_t *s = src;
493  const uint8_t *end;
494  const uint8_t *mm_end;
495  uint16_t *d = (uint16_t *)dst;
496  end = s + src_size;
497  __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
498  __asm__ volatile(
499  "movq %0, %%mm7 \n\t"
500  "movq %1, %%mm6 \n\t"
501  ::"m"(red_16mask),"m"(green_16mask));
502  mm_end = end - 11;
503  while (s < mm_end) {
504  __asm__ volatile(
505  PREFETCH" 32(%1) \n\t"
506  "movd (%1), %%mm0 \n\t"
507  "movd 3(%1), %%mm3 \n\t"
508  "punpckldq 6(%1), %%mm0 \n\t"
509  "punpckldq 9(%1), %%mm3 \n\t"
510  "movq %%mm0, %%mm1 \n\t"
511  "movq %%mm0, %%mm2 \n\t"
512  "movq %%mm3, %%mm4 \n\t"
513  "movq %%mm3, %%mm5 \n\t"
514  "psrlq $3, %%mm0 \n\t"
515  "psrlq $3, %%mm3 \n\t"
516  "pand %2, %%mm0 \n\t"
517  "pand %2, %%mm3 \n\t"
518  "psrlq $5, %%mm1 \n\t"
519  "psrlq $5, %%mm4 \n\t"
520  "pand %%mm6, %%mm1 \n\t"
521  "pand %%mm6, %%mm4 \n\t"
522  "psrlq $8, %%mm2 \n\t"
523  "psrlq $8, %%mm5 \n\t"
524  "pand %%mm7, %%mm2 \n\t"
525  "pand %%mm7, %%mm5 \n\t"
526  "por %%mm1, %%mm0 \n\t"
527  "por %%mm4, %%mm3 \n\t"
528  "por %%mm2, %%mm0 \n\t"
529  "por %%mm5, %%mm3 \n\t"
530  "psllq $16, %%mm3 \n\t"
531  "por %%mm3, %%mm0 \n\t"
532  MOVNTQ" %%mm0, (%0) \n\t"
533  ::"r"(d),"r"(s),"m"(blue_16mask):"memory");
534  d += 4;
535  s += 12;
536  }
537  __asm__ volatile(SFENCE:::"memory");
538  __asm__ volatile(EMMS:::"memory");
539  while (s < end) {
540  const int b = *s++;
541  const int g = *s++;
542  const int r = *s++;
543  *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
544  }
545 }
546 
547 static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, int src_size)
548 {
549  const uint8_t *s = src;
550  const uint8_t *end;
551  const uint8_t *mm_end;
552  uint16_t *d = (uint16_t *)dst;
553  end = s + src_size;
554  __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
555  __asm__ volatile(
556  "movq %0, %%mm7 \n\t"
557  "movq %1, %%mm6 \n\t"
558  ::"m"(red_16mask),"m"(green_16mask));
559  mm_end = end - 15;
560  while (s < mm_end) {
561  __asm__ volatile(
562  PREFETCH" 32(%1) \n\t"
563  "movd (%1), %%mm0 \n\t"
564  "movd 3(%1), %%mm3 \n\t"
565  "punpckldq 6(%1), %%mm0 \n\t"
566  "punpckldq 9(%1), %%mm3 \n\t"
567  "movq %%mm0, %%mm1 \n\t"
568  "movq %%mm0, %%mm2 \n\t"
569  "movq %%mm3, %%mm4 \n\t"
570  "movq %%mm3, %%mm5 \n\t"
571  "psllq $8, %%mm0 \n\t"
572  "psllq $8, %%mm3 \n\t"
573  "pand %%mm7, %%mm0 \n\t"
574  "pand %%mm7, %%mm3 \n\t"
575  "psrlq $5, %%mm1 \n\t"
576  "psrlq $5, %%mm4 \n\t"
577  "pand %%mm6, %%mm1 \n\t"
578  "pand %%mm6, %%mm4 \n\t"
579  "psrlq $19, %%mm2 \n\t"
580  "psrlq $19, %%mm5 \n\t"
581  "pand %2, %%mm2 \n\t"
582  "pand %2, %%mm5 \n\t"
583  "por %%mm1, %%mm0 \n\t"
584  "por %%mm4, %%mm3 \n\t"
585  "por %%mm2, %%mm0 \n\t"
586  "por %%mm5, %%mm3 \n\t"
587  "psllq $16, %%mm3 \n\t"
588  "por %%mm3, %%mm0 \n\t"
589  MOVNTQ" %%mm0, (%0) \n\t"
590  ::"r"(d),"r"(s),"m"(blue_16mask):"memory");
591  d += 4;
592  s += 12;
593  }
594  __asm__ volatile(SFENCE:::"memory");
595  __asm__ volatile(EMMS:::"memory");
596  while (s < end) {
597  const int r = *s++;
598  const int g = *s++;
599  const int b = *s++;
600  *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
601  }
602 }
603 
604 static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, int src_size)
605 {
606  const uint8_t *s = src;
607  const uint8_t *end;
608  const uint8_t *mm_end;
609  uint16_t *d = (uint16_t *)dst;
610  end = s + src_size;
611  __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
612  __asm__ volatile(
613  "movq %0, %%mm7 \n\t"
614  "movq %1, %%mm6 \n\t"
615  ::"m"(red_15mask),"m"(green_15mask));
616  mm_end = end - 11;
617  while (s < mm_end) {
618  __asm__ volatile(
619  PREFETCH" 32(%1) \n\t"
620  "movd (%1), %%mm0 \n\t"
621  "movd 3(%1), %%mm3 \n\t"
622  "punpckldq 6(%1), %%mm0 \n\t"
623  "punpckldq 9(%1), %%mm3 \n\t"
624  "movq %%mm0, %%mm1 \n\t"
625  "movq %%mm0, %%mm2 \n\t"
626  "movq %%mm3, %%mm4 \n\t"
627  "movq %%mm3, %%mm5 \n\t"
628  "psrlq $3, %%mm0 \n\t"
629  "psrlq $3, %%mm3 \n\t"
630  "pand %2, %%mm0 \n\t"
631  "pand %2, %%mm3 \n\t"
632  "psrlq $6, %%mm1 \n\t"
633  "psrlq $6, %%mm4 \n\t"
634  "pand %%mm6, %%mm1 \n\t"
635  "pand %%mm6, %%mm4 \n\t"
636  "psrlq $9, %%mm2 \n\t"
637  "psrlq $9, %%mm5 \n\t"
638  "pand %%mm7, %%mm2 \n\t"
639  "pand %%mm7, %%mm5 \n\t"
640  "por %%mm1, %%mm0 \n\t"
641  "por %%mm4, %%mm3 \n\t"
642  "por %%mm2, %%mm0 \n\t"
643  "por %%mm5, %%mm3 \n\t"
644  "psllq $16, %%mm3 \n\t"
645  "por %%mm3, %%mm0 \n\t"
646  MOVNTQ" %%mm0, (%0) \n\t"
647  ::"r"(d),"r"(s),"m"(blue_15mask):"memory");
648  d += 4;
649  s += 12;
650  }
651  __asm__ volatile(SFENCE:::"memory");
652  __asm__ volatile(EMMS:::"memory");
653  while (s < end) {
654  const int b = *s++;
655  const int g = *s++;
656  const int r = *s++;
657  *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
658  }
659 }
660 
661 static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, int src_size)
662 {
663  const uint8_t *s = src;
664  const uint8_t *end;
665  const uint8_t *mm_end;
666  uint16_t *d = (uint16_t *)dst;
667  end = s + src_size;
668  __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
669  __asm__ volatile(
670  "movq %0, %%mm7 \n\t"
671  "movq %1, %%mm6 \n\t"
672  ::"m"(red_15mask),"m"(green_15mask));
673  mm_end = end - 15;
674  while (s < mm_end) {
675  __asm__ volatile(
676  PREFETCH" 32(%1) \n\t"
677  "movd (%1), %%mm0 \n\t"
678  "movd 3(%1), %%mm3 \n\t"
679  "punpckldq 6(%1), %%mm0 \n\t"
680  "punpckldq 9(%1), %%mm3 \n\t"
681  "movq %%mm0, %%mm1 \n\t"
682  "movq %%mm0, %%mm2 \n\t"
683  "movq %%mm3, %%mm4 \n\t"
684  "movq %%mm3, %%mm5 \n\t"
685  "psllq $7, %%mm0 \n\t"
686  "psllq $7, %%mm3 \n\t"
687  "pand %%mm7, %%mm0 \n\t"
688  "pand %%mm7, %%mm3 \n\t"
689  "psrlq $6, %%mm1 \n\t"
690  "psrlq $6, %%mm4 \n\t"
691  "pand %%mm6, %%mm1 \n\t"
692  "pand %%mm6, %%mm4 \n\t"
693  "psrlq $19, %%mm2 \n\t"
694  "psrlq $19, %%mm5 \n\t"
695  "pand %2, %%mm2 \n\t"
696  "pand %2, %%mm5 \n\t"
697  "por %%mm1, %%mm0 \n\t"
698  "por %%mm4, %%mm3 \n\t"
699  "por %%mm2, %%mm0 \n\t"
700  "por %%mm5, %%mm3 \n\t"
701  "psllq $16, %%mm3 \n\t"
702  "por %%mm3, %%mm0 \n\t"
703  MOVNTQ" %%mm0, (%0) \n\t"
704  ::"r"(d),"r"(s),"m"(blue_15mask):"memory");
705  d += 4;
706  s += 12;
707  }
708  __asm__ volatile(SFENCE:::"memory");
709  __asm__ volatile(EMMS:::"memory");
710  while (s < end) {
711  const int r = *s++;
712  const int g = *s++;
713  const int b = *s++;
714  *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
715  }
716 }
717 
718 /*
719  I use a less accurate approximation here by simply left-shifting the input
720  value and filling the low order bits with zeroes. This method improves PNG
721  compression but this scheme cannot reproduce white exactly, since it does
722  not generate an all-ones maximum value; the net effect is to darken the
723  image slightly.
724 
725  The better method is "left bit replication":
726 
727  4 3 2 1 0
728  ---------
729  1 1 0 1 1
730 
731  7 6 5 4 3 2 1 0
732  ----------------
733  1 1 0 1 1 1 1 0
734  |=======| |===|
735  | leftmost bits repeated to fill open bits
736  |
737  original bits
738 */
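/*
 * Editor's note: the two expansion schemes described above for one 5-bit
 * channel (sketch only, kept out of the build).
 */
#if 0
static uint8_t expand5_shift(uint8_t c)     /* used below: 0x1F -> 0xF8 */
{
    return (uint8_t)(c << 3);               /* cannot reach 255 */
}

static uint8_t expand5_replicate(uint8_t c) /* "left bit replication" */
{
    return (uint8_t)((c << 3) | (c >> 2));  /* 0x1F -> 0xFF, exact white */
}
#endif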
739 static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
740 {
741  const uint16_t *end;
742  const uint16_t *mm_end;
743  uint8_t *d = dst;
744  const uint16_t *s = (const uint16_t*)src;
745  end = s + src_size/2;
746  __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
747  mm_end = end - 7;
748  while (s < mm_end) {
749  __asm__ volatile(
750  PREFETCH" 32(%1) \n\t"
751  "movq (%1), %%mm0 \n\t"
752  "movq (%1), %%mm1 \n\t"
753  "movq (%1), %%mm2 \n\t"
754  "pand %2, %%mm0 \n\t"
755  "pand %3, %%mm1 \n\t"
756  "pand %4, %%mm2 \n\t"
757  "psllq $3, %%mm0 \n\t"
758  "psrlq $2, %%mm1 \n\t"
759  "psrlq $7, %%mm2 \n\t"
760  "movq %%mm0, %%mm3 \n\t"
761  "movq %%mm1, %%mm4 \n\t"
762  "movq %%mm2, %%mm5 \n\t"
763  "punpcklwd %5, %%mm0 \n\t"
764  "punpcklwd %5, %%mm1 \n\t"
765  "punpcklwd %5, %%mm2 \n\t"
766  "punpckhwd %5, %%mm3 \n\t"
767  "punpckhwd %5, %%mm4 \n\t"
768  "punpckhwd %5, %%mm5 \n\t"
769  "psllq $8, %%mm1 \n\t"
770  "psllq $16, %%mm2 \n\t"
771  "por %%mm1, %%mm0 \n\t"
772  "por %%mm2, %%mm0 \n\t"
773  "psllq $8, %%mm4 \n\t"
774  "psllq $16, %%mm5 \n\t"
775  "por %%mm4, %%mm3 \n\t"
776  "por %%mm5, %%mm3 \n\t"
777 
778  "movq %%mm0, %%mm6 \n\t"
779  "movq %%mm3, %%mm7 \n\t"
780 
781  "movq 8(%1), %%mm0 \n\t"
782  "movq 8(%1), %%mm1 \n\t"
783  "movq 8(%1), %%mm2 \n\t"
784  "pand %2, %%mm0 \n\t"
785  "pand %3, %%mm1 \n\t"
786  "pand %4, %%mm2 \n\t"
787  "psllq $3, %%mm0 \n\t"
788  "psrlq $2, %%mm1 \n\t"
789  "psrlq $7, %%mm2 \n\t"
790  "movq %%mm0, %%mm3 \n\t"
791  "movq %%mm1, %%mm4 \n\t"
792  "movq %%mm2, %%mm5 \n\t"
793  "punpcklwd %5, %%mm0 \n\t"
794  "punpcklwd %5, %%mm1 \n\t"
795  "punpcklwd %5, %%mm2 \n\t"
796  "punpckhwd %5, %%mm3 \n\t"
797  "punpckhwd %5, %%mm4 \n\t"
798  "punpckhwd %5, %%mm5 \n\t"
799  "psllq $8, %%mm1 \n\t"
800  "psllq $16, %%mm2 \n\t"
801  "por %%mm1, %%mm0 \n\t"
802  "por %%mm2, %%mm0 \n\t"
803  "psllq $8, %%mm4 \n\t"
804  "psllq $16, %%mm5 \n\t"
805  "por %%mm4, %%mm3 \n\t"
806  "por %%mm5, %%mm3 \n\t"
807 
808  :"=m"(*d)
809  :"r"(s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
810  :"memory");
811  /* borrowed 32 to 24 */
812  __asm__ volatile(
813  "movq %%mm0, %%mm4 \n\t"
814  "movq %%mm3, %%mm5 \n\t"
815  "movq %%mm6, %%mm0 \n\t"
816  "movq %%mm7, %%mm1 \n\t"
817 
818  "movq %%mm4, %%mm6 \n\t"
819  "movq %%mm5, %%mm7 \n\t"
820  "movq %%mm0, %%mm2 \n\t"
821  "movq %%mm1, %%mm3 \n\t"
822 
823  STORE_BGR24_MMX
824 
825  :: "r"(d), "m"(*s)
826  :"memory");
827  d += 24;
828  s += 8;
829  }
830  __asm__ volatile(SFENCE:::"memory");
831  __asm__ volatile(EMMS:::"memory");
832  while (s < end) {
833  register uint16_t bgr;
834  bgr = *s++;
835  *d++ = (bgr&0x1F)<<3;
836  *d++ = (bgr&0x3E0)>>2;
837  *d++ = (bgr&0x7C00)>>7;
838  }
839 }
840 
841 static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
842 {
843  const uint16_t *end;
844  const uint16_t *mm_end;
845  uint8_t *d = (uint8_t *)dst;
846  const uint16_t *s = (const uint16_t *)src;
847  end = s + src_size/2;
848  __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
849  mm_end = end - 7;
850  while (s < mm_end) {
851  __asm__ volatile(
852  PREFETCH" 32(%1) \n\t"
853  "movq (%1), %%mm0 \n\t"
854  "movq (%1), %%mm1 \n\t"
855  "movq (%1), %%mm2 \n\t"
856  "pand %2, %%mm0 \n\t"
857  "pand %3, %%mm1 \n\t"
858  "pand %4, %%mm2 \n\t"
859  "psllq $3, %%mm0 \n\t"
860  "psrlq $3, %%mm1 \n\t"
861  "psrlq $8, %%mm2 \n\t"
862  "movq %%mm0, %%mm3 \n\t"
863  "movq %%mm1, %%mm4 \n\t"
864  "movq %%mm2, %%mm5 \n\t"
865  "punpcklwd %5, %%mm0 \n\t"
866  "punpcklwd %5, %%mm1 \n\t"
867  "punpcklwd %5, %%mm2 \n\t"
868  "punpckhwd %5, %%mm3 \n\t"
869  "punpckhwd %5, %%mm4 \n\t"
870  "punpckhwd %5, %%mm5 \n\t"
871  "psllq $8, %%mm1 \n\t"
872  "psllq $16, %%mm2 \n\t"
873  "por %%mm1, %%mm0 \n\t"
874  "por %%mm2, %%mm0 \n\t"
875  "psllq $8, %%mm4 \n\t"
876  "psllq $16, %%mm5 \n\t"
877  "por %%mm4, %%mm3 \n\t"
878  "por %%mm5, %%mm3 \n\t"
879 
880  "movq %%mm0, %%mm6 \n\t"
881  "movq %%mm3, %%mm7 \n\t"
882 
883  "movq 8(%1), %%mm0 \n\t"
884  "movq 8(%1), %%mm1 \n\t"
885  "movq 8(%1), %%mm2 \n\t"
886  "pand %2, %%mm0 \n\t"
887  "pand %3, %%mm1 \n\t"
888  "pand %4, %%mm2 \n\t"
889  "psllq $3, %%mm0 \n\t"
890  "psrlq $3, %%mm1 \n\t"
891  "psrlq $8, %%mm2 \n\t"
892  "movq %%mm0, %%mm3 \n\t"
893  "movq %%mm1, %%mm4 \n\t"
894  "movq %%mm2, %%mm5 \n\t"
895  "punpcklwd %5, %%mm0 \n\t"
896  "punpcklwd %5, %%mm1 \n\t"
897  "punpcklwd %5, %%mm2 \n\t"
898  "punpckhwd %5, %%mm3 \n\t"
899  "punpckhwd %5, %%mm4 \n\t"
900  "punpckhwd %5, %%mm5 \n\t"
901  "psllq $8, %%mm1 \n\t"
902  "psllq $16, %%mm2 \n\t"
903  "por %%mm1, %%mm0 \n\t"
904  "por %%mm2, %%mm0 \n\t"
905  "psllq $8, %%mm4 \n\t"
906  "psllq $16, %%mm5 \n\t"
907  "por %%mm4, %%mm3 \n\t"
908  "por %%mm5, %%mm3 \n\t"
909  :"=m"(*d)
910  :"r"(s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
911  :"memory");
912  /* borrowed 32 to 24 */
913  __asm__ volatile(
914  "movq %%mm0, %%mm4 \n\t"
915  "movq %%mm3, %%mm5 \n\t"
916  "movq %%mm6, %%mm0 \n\t"
917  "movq %%mm7, %%mm1 \n\t"
918 
919  "movq %%mm4, %%mm6 \n\t"
920  "movq %%mm5, %%mm7 \n\t"
921  "movq %%mm0, %%mm2 \n\t"
922  "movq %%mm1, %%mm3 \n\t"
923 
924  STORE_BGR24_MMX
925 
926  :: "r"(d), "m"(*s)
927  :"memory");
928  d += 24;
929  s += 8;
930  }
931  __asm__ volatile(SFENCE:::"memory");
932  __asm__ volatile(EMMS:::"memory");
933  while (s < end) {
934  register uint16_t bgr;
935  bgr = *s++;
936  *d++ = (bgr&0x1F)<<3;
937  *d++ = (bgr&0x7E0)>>3;
938  *d++ = (bgr&0xF800)>>8;
939  }
940 }
941 
942 /*
943  * mm0 = 00 B3 00 B2 00 B1 00 B0
944  * mm1 = 00 G3 00 G2 00 G1 00 G0
945  * mm2 = 00 R3 00 R2 00 R1 00 R0
946  * mm6 = FF FF FF FF FF FF FF FF
947  * mm7 = 00 00 00 00 00 00 00 00
948  */
949 #define PACK_RGB32 \
950  "packuswb %%mm7, %%mm0 \n\t" /* 00 00 00 00 B3 B2 B1 B0 */ \
951  "packuswb %%mm7, %%mm1 \n\t" /* 00 00 00 00 G3 G2 G1 G0 */ \
952  "packuswb %%mm7, %%mm2 \n\t" /* 00 00 00 00 R3 R2 R1 R0 */ \
953  "punpcklbw %%mm1, %%mm0 \n\t" /* G3 B3 G2 B2 G1 B1 G0 B0 */ \
954  "punpcklbw %%mm6, %%mm2 \n\t" /* FF R3 FF R2 FF R1 FF R0 */ \
955  "movq %%mm0, %%mm3 \n\t" \
956  "punpcklwd %%mm2, %%mm0 \n\t" /* FF R1 G1 B1 FF R0 G0 B0 */ \
957  "punpckhwd %%mm2, %%mm3 \n\t" /* FF R3 G3 B3 FF R2 G2 B2 */ \
958  MOVNTQ" %%mm0, (%0) \n\t" \
959  MOVNTQ" %%mm3, 8(%0) \n\t" \
960 
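/*
 * Editor's note, scalar equivalent of PACK_RGB32: from the register layout
 * documented above, four little-endian BGRA dwords are produced, with the
 * alpha byte forced to 0xFF by the all-ones mm6 (sketch only, kept out of
 * the build).
 */
#if 0
static void pack_rgb32_sketch(uint8_t *d, const uint8_t *b,
                              const uint8_t *g, const uint8_t *r)
{
    int i;
    for (i = 0; i < 4; i++) {
        *d++ = b[i];
        *d++ = g[i];
        *d++ = r[i];
        *d++ = 0xFF;
    }
}
#endif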
961 static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, int src_size)
962 {
963  const uint16_t *end;
964  const uint16_t *mm_end;
965  uint8_t *d = dst;
966  const uint16_t *s = (const uint16_t *)src;
967  end = s + src_size/2;
968  __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
969  __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
970  __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
971  mm_end = end - 3;
972  while (s < mm_end) {
973  __asm__ volatile(
974  PREFETCH" 32(%1) \n\t"
975  "movq (%1), %%mm0 \n\t"
976  "movq (%1), %%mm1 \n\t"
977  "movq (%1), %%mm2 \n\t"
978  "pand %2, %%mm0 \n\t"
979  "pand %3, %%mm1 \n\t"
980  "pand %4, %%mm2 \n\t"
981  "psllq $3, %%mm0 \n\t"
982  "psrlq $2, %%mm1 \n\t"
983  "psrlq $7, %%mm2 \n\t"
984  PACK_RGB32
985  ::"r"(d),"r"(s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
986  :"memory");
987  d += 16;
988  s += 4;
989  }
990  __asm__ volatile(SFENCE:::"memory");
991  __asm__ volatile(EMMS:::"memory");
992  while (s < end) {
993  register uint16_t bgr;
994  bgr = *s++;
995  *d++ = (bgr&0x1F)<<3;
996  *d++ = (bgr&0x3E0)>>2;
997  *d++ = (bgr&0x7C00)>>7;
998  *d++ = 255;
999  }
1000 }
1001 
1002 static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, int src_size)
1003 {
1004  const uint16_t *end;
1005  const uint16_t *mm_end;
1006  uint8_t *d = dst;
1007  const uint16_t *s = (const uint16_t*)src;
1008  end = s + src_size/2;
1009  __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
1010  __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
1011  __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
1012  mm_end = end - 3;
1013  while (s < mm_end) {
1014  __asm__ volatile(
1015  PREFETCH" 32(%1) \n\t"
1016  "movq (%1), %%mm0 \n\t"
1017  "movq (%1), %%mm1 \n\t"
1018  "movq (%1), %%mm2 \n\t"
1019  "pand %2, %%mm0 \n\t"
1020  "pand %3, %%mm1 \n\t"
1021  "pand %4, %%mm2 \n\t"
1022  "psllq $3, %%mm0 \n\t"
1023  "psrlq $3, %%mm1 \n\t"
1024  "psrlq $8, %%mm2 \n\t"
1025  PACK_RGB32
1026  ::"r"(d),"r"(s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
1027  :"memory");
1028  d += 16;
1029  s += 4;
1030  }
1031  __asm__ volatile(SFENCE:::"memory");
1032  __asm__ volatile(EMMS:::"memory");
1033  while (s < end) {
1034  register uint16_t bgr;
1035  bgr = *s++;
1036  *d++ = (bgr&0x1F)<<3;
1037  *d++ = (bgr&0x7E0)>>3;
1038  *d++ = (bgr&0xF800)>>8;
1039  *d++ = 255;
1040  }
1041 }
1042 
1043 static inline void RENAME(shuffle_bytes_2103)(const uint8_t *src, uint8_t *dst, int src_size)
1044 {
1045  x86_reg idx = 15 - src_size;
1046  const uint8_t *s = src-idx;
1047  uint8_t *d = dst-idx;
1048  __asm__ volatile(
1049  "test %0, %0 \n\t"
1050  "jns 2f \n\t"
1051  PREFETCH" (%1, %0) \n\t"
1052  "movq %3, %%mm7 \n\t"
1053  "pxor %4, %%mm7 \n\t"
1054  "movq %%mm7, %%mm6 \n\t"
1055  "pxor %5, %%mm7 \n\t"
1056  ".p2align 4 \n\t"
1057  "1: \n\t"
1058  PREFETCH" 32(%1, %0) \n\t"
1059  "movq (%1, %0), %%mm0 \n\t"
1060  "movq 8(%1, %0), %%mm1 \n\t"
1061 # if COMPILE_TEMPLATE_MMXEXT
1062  "pshufw $177, %%mm0, %%mm3 \n\t"
1063  "pshufw $177, %%mm1, %%mm5 \n\t"
1064  "pand %%mm7, %%mm0 \n\t"
1065  "pand %%mm6, %%mm3 \n\t"
1066  "pand %%mm7, %%mm1 \n\t"
1067  "pand %%mm6, %%mm5 \n\t"
1068  "por %%mm3, %%mm0 \n\t"
1069  "por %%mm5, %%mm1 \n\t"
1070 # else
1071  "movq %%mm0, %%mm2 \n\t"
1072  "movq %%mm1, %%mm4 \n\t"
1073  "pand %%mm7, %%mm0 \n\t"
1074  "pand %%mm6, %%mm2 \n\t"
1075  "pand %%mm7, %%mm1 \n\t"
1076  "pand %%mm6, %%mm4 \n\t"
1077  "movq %%mm2, %%mm3 \n\t"
1078  "movq %%mm4, %%mm5 \n\t"
1079  "pslld $16, %%mm2 \n\t"
1080  "psrld $16, %%mm3 \n\t"
1081  "pslld $16, %%mm4 \n\t"
1082  "psrld $16, %%mm5 \n\t"
1083  "por %%mm2, %%mm0 \n\t"
1084  "por %%mm4, %%mm1 \n\t"
1085  "por %%mm3, %%mm0 \n\t"
1086  "por %%mm5, %%mm1 \n\t"
1087 # endif
1088  MOVNTQ" %%mm0, (%2, %0) \n\t"
1089  MOVNTQ" %%mm1, 8(%2, %0) \n\t"
1090  "add $16, %0 \n\t"
1091  "js 1b \n\t"
1092  SFENCE" \n\t"
1093  EMMS" \n\t"
1094  "2: \n\t"
1095  : "+&r"(idx)
1096  : "r" (s), "r" (d), "m" (mask32b), "m" (mask32r), "m" (mmx_one)
1097  : "memory");
1098  for (; idx<15; idx+=4) {
1099  register int v = *(const uint32_t *)&s[idx], g = v & 0xff00ff00;
1100  v &= 0xff00ff;
1101  *(uint32_t *)&d[idx] = (v>>16) + g + (v<<16);
1102  }
1103 }
1104 
1105 static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
1106 {
1107  unsigned i;
1108  x86_reg mmx_size= 23 - src_size;
1109  __asm__ volatile (
1110  "test %%"REG_a", %%"REG_a" \n\t"
1111  "jns 2f \n\t"
1112  "movq "MANGLE(mask24r)", %%mm5 \n\t"
1113  "movq "MANGLE(mask24g)", %%mm6 \n\t"
1114  "movq "MANGLE(mask24b)", %%mm7 \n\t"
1115  ".p2align 4 \n\t"
1116  "1: \n\t"
1117  PREFETCH" 32(%1, %%"REG_a") \n\t"
1118  "movq (%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
1119  "movq (%1, %%"REG_a"), %%mm1 \n\t" // BGR BGR BG
1120  "movq 2(%1, %%"REG_a"), %%mm2 \n\t" // R BGR BGR B
1121  "psllq $16, %%mm0 \n\t" // 00 BGR BGR
1122  "pand %%mm5, %%mm0 \n\t"
1123  "pand %%mm6, %%mm1 \n\t"
1124  "pand %%mm7, %%mm2 \n\t"
1125  "por %%mm0, %%mm1 \n\t"
1126  "por %%mm2, %%mm1 \n\t"
1127  "movq 6(%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
1128  MOVNTQ" %%mm1, (%2, %%"REG_a") \n\t" // RGB RGB RG
1129  "movq 8(%1, %%"REG_a"), %%mm1 \n\t" // R BGR BGR B
1130  "movq 10(%1, %%"REG_a"), %%mm2 \n\t" // GR BGR BGR
1131  "pand %%mm7, %%mm0 \n\t"
1132  "pand %%mm5, %%mm1 \n\t"
1133  "pand %%mm6, %%mm2 \n\t"
1134  "por %%mm0, %%mm1 \n\t"
1135  "por %%mm2, %%mm1 \n\t"
1136  "movq 14(%1, %%"REG_a"), %%mm0 \n\t" // R BGR BGR B
1137  MOVNTQ" %%mm1, 8(%2, %%"REG_a") \n\t" // B RGB RGB R
1138  "movq 16(%1, %%"REG_a"), %%mm1 \n\t" // GR BGR BGR
1139  "movq 18(%1, %%"REG_a"), %%mm2 \n\t" // BGR BGR BG
1140  "pand %%mm6, %%mm0 \n\t"
1141  "pand %%mm7, %%mm1 \n\t"
1142  "pand %%mm5, %%mm2 \n\t"
1143  "por %%mm0, %%mm1 \n\t"
1144  "por %%mm2, %%mm1 \n\t"
1145  MOVNTQ" %%mm1, 16(%2, %%"REG_a") \n\t"
1146  "add $24, %%"REG_a" \n\t"
1147  " js 1b \n\t"
1148  "2: \n\t"
1149  : "+a" (mmx_size)
1150  : "r" (src-mmx_size), "r"(dst-mmx_size)
1151  );
1152 
1153  __asm__ volatile(SFENCE:::"memory");
1154  __asm__ volatile(EMMS:::"memory");
1155 
1156  if (mmx_size==23) return; //finished, size was a multiple of 8 pixels (24 bytes)
1157 
1158  src+= src_size;
1159  dst+= src_size;
1160  src_size= 23-mmx_size;
1161  src-= src_size;
1162  dst-= src_size;
1163  for (i=0; i<src_size; i+=3) {
1164  register uint8_t x;
1165  x = src[i + 2];
1166  dst[i + 1] = src[i + 1];
1167  dst[i + 2] = src[i + 0];
1168  dst[i + 0] = x;
1169  }
1170 }
1171 
1172 static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1173  int width, int height,
1174  int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
1175 {
1176  int y;
1177  const x86_reg chromWidth= width>>1;
1178  for (y=0; y<height; y++) {
1179  //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
1180  __asm__ volatile(
1181  "xor %%"REG_a", %%"REG_a" \n\t"
1182  ".p2align 4 \n\t"
1183  "1: \n\t"
1184  PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
1185  PREFETCH" 32(%2, %%"REG_a") \n\t"
1186  PREFETCH" 32(%3, %%"REG_a") \n\t"
1187  "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
1188  "movq %%mm0, %%mm2 \n\t" // U(0)
1189  "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
1190  "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1191  "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1192 
1193  "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
1194  "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
1195  "movq %%mm3, %%mm4 \n\t" // Y(0)
1196  "movq %%mm5, %%mm6 \n\t" // Y(8)
1197  "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
1198  "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
1199  "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
1200  "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)
1201 
1202  MOVNTQ" %%mm3, (%0, %%"REG_a", 4) \n\t"
1203  MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
1204  MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4) \n\t"
1205  MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"
1206 
1207  "add $8, %%"REG_a" \n\t"
1208  "cmp %4, %%"REG_a" \n\t"
1209  " jb 1b \n\t"
1210  ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1211  : "%"REG_a
1212  );
1213  if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
1214  usrc += chromStride;
1215  vsrc += chromStride;
1216  }
1217  ysrc += lumStride;
1218  dst += dstStride;
1219  }
1220  __asm__(EMMS" \n\t"
1221  SFENCE" \n\t"
1222  :::"memory");
1223 }
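/*
 * Editor's note, scalar model of the interleave above: each pair of luma
 * samples shares one U and one V sample, emitted as Y0 U Y1 V (YUY2), and
 * the chroma planes advance only once every vertLumPerChroma luma lines
 * (sketch only, kept out of the build).
 */
#if 0
static void planar_to_yuy2_sketch(const uint8_t *y, const uint8_t *u,
                                  const uint8_t *v, uint8_t *dst,
                                  int width, int height, int lumStride,
                                  int chromStride, int dstStride,
                                  int vertLumPerChroma)
{
    int i, j;
    for (j = 0; j < height; j++) {
        uint8_t *d        = dst + j * dstStride;
        const uint8_t *yl = y + j * lumStride;
        const uint8_t *ul = u + (j / vertLumPerChroma) * chromStride;
        const uint8_t *vl = v + (j / vertLumPerChroma) * chromStride;
        for (i = 0; i < width / 2; i++) {
            *d++ = yl[2 * i];     /* Y0 */
            *d++ = ul[i];         /* U  */
            *d++ = yl[2 * i + 1]; /* Y1 */
            *d++ = vl[i];         /* V  */
        }
    }
}
#endif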
1224 
1225 /**
1226  * Height should be a multiple of 2 and width should be a multiple of 16.
1227  * (If this is a problem for anyone then tell me, and I will fix it.)
1228  */
1229 static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1230  int width, int height,
1231  int lumStride, int chromStride, int dstStride)
1232 {
1233  //FIXME interpolate chroma
1234  RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1235 }
1236 
1237 static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1238  int width, int height,
1239  int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
1240 {
1241  int y;
1242  const x86_reg chromWidth= width>>1;
1243  for (y=0; y<height; y++) {
1244  //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
1245  __asm__ volatile(
1246  "xor %%"REG_a", %%"REG_a" \n\t"
1247  ".p2align 4 \n\t"
1248  "1: \n\t"
1249  PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
1250  PREFETCH" 32(%2, %%"REG_a") \n\t"
1251  PREFETCH" 32(%3, %%"REG_a") \n\t"
1252  "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
1253  "movq %%mm0, %%mm2 \n\t" // U(0)
1254  "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
1255  "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1256  "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1257 
1258  "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
1259  "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
1260  "movq %%mm0, %%mm4 \n\t" // Y(0)
1261  "movq %%mm2, %%mm6 \n\t" // Y(8)
1262  "punpcklbw %%mm3, %%mm0 \n\t" // YUYV YUYV(0)
1263  "punpckhbw %%mm3, %%mm4 \n\t" // YUYV YUYV(4)
1264  "punpcklbw %%mm5, %%mm2 \n\t" // YUYV YUYV(8)
1265  "punpckhbw %%mm5, %%mm6 \n\t" // YUYV YUYV(12)
1266 
1267  MOVNTQ" %%mm0, (%0, %%"REG_a", 4) \n\t"
1268  MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
1269  MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4) \n\t"
1270  MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"
1271 
1272  "add $8, %%"REG_a" \n\t"
1273  "cmp %4, %%"REG_a" \n\t"
1274  " jb 1b \n\t"
1275  ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1276  : "%"REG_a
1277  );
1278  if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
1279  usrc += chromStride;
1280  vsrc += chromStride;
1281  }
1282  ysrc += lumStride;
1283  dst += dstStride;
1284  }
1285  __asm__(EMMS" \n\t"
1286  SFENCE" \n\t"
1287  :::"memory");
1288 }
1289 
1290 /**
1291  * Height should be a multiple of 2 and width should be a multiple of 16.
1292  * (If this is a problem for anyone then tell me, and I will fix it.)
1293  */
1294 static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1295  int width, int height,
1296  int lumStride, int chromStride, int dstStride)
1297 {
1298  //FIXME interpolate chroma
1299  RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1300 }
1301 
1302 /**
1303  * Width should be a multiple of 16.
1304  */
1305 static inline void RENAME(yuv422ptouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1306  int width, int height,
1307  int lumStride, int chromStride, int dstStride)
1308 {
1309  RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1310 }
1311 
1312 /**
1313  * Width should be a multiple of 16.
1314  */
1315 static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1316  int width, int height,
1317  int lumStride, int chromStride, int dstStride)
1318 {
1319  RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1320 }
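/* The four wrappers above differ only in vertLumPerChroma: 2 for the YV12
 * inputs (one chroma line per two luma lines) and 1 for the YUV422P inputs
 * (one chroma line per luma line). */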
1321 
1322 /**
1323  * Height should be a multiple of 2 and width should be a multiple of 16.
1324  * (If this is a problem for anyone then tell me, and I will fix it.)
1325  */
1326 static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1327  int width, int height,
1328  int lumStride, int chromStride, int srcStride)
1329 {
1330  int y;
1331  const x86_reg chromWidth= width>>1;
1332  for (y=0; y<height; y+=2) {
1333  __asm__ volatile(
1334  "xor %%"REG_a", %%"REG_a" \n\t"
1335  "pcmpeqw %%mm7, %%mm7 \n\t"
1336  "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
1337  ".p2align 4 \n\t"
1338  "1: \n\t"
1339  PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1340  "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1341  "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
1342  "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
1343  "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
1344  "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
1345  "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
1346  "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
1347  "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
1348  "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1349  "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
1350 
1351  MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t"
1352 
1353  "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(8)
1354  "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(12)
1355  "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
1356  "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
1357  "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
1358  "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
1359  "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
1360  "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
1361  "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
1362  "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
1363 
1364  MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t"
1365 
1366  "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
1367  "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
1368  "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
1369  "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
1370  "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
1371  "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
1372  "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
1373  "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
1374 
1375  MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
1376  MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"
1377 
1378  "add $8, %%"REG_a" \n\t"
1379  "cmp %4, %%"REG_a" \n\t"
1380  " jb 1b \n\t"
1381  ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1382  : "memory", "%"REG_a
1383  );
1384 
1385  ydst += lumStride;
1386  src += srcStride;
1387 
1388  __asm__ volatile(
1389  "xor %%"REG_a", %%"REG_a" \n\t"
1390  ".p2align 4 \n\t"
1391  "1: \n\t"
1392  PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1393  "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1394  "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
1395  "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8)
1396  "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12)
1397  "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
1398  "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
1399  "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
1400  "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
1401  "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
1402  "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
1403 
1404  MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t"
1405  MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t"
1406 
1407  "add $8, %%"REG_a" \n\t"
1408  "cmp %4, %%"REG_a" \n\t"
1409  " jb 1b \n\t"
1410 
1411  ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1412  : "memory", "%"REG_a
1413  );
1414  udst += chromStride;
1415  vdst += chromStride;
1416  ydst += lumStride;
1417  src += srcStride;
1418  }
1419  __asm__ volatile(EMMS" \n\t"
1420  SFENCE" \n\t"
1421  :::"memory");
1422 }
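/*
 * Editor's note, scalar model of the deinterleave above: luma is taken from
 * every input line, chroma only from the even lines; the odd lines' chroma
 * is dropped rather than averaged (sketch only, kept out of the build).
 */
#if 0
static void yuy2_to_yv12_sketch(const uint8_t *src, uint8_t *ydst,
                                uint8_t *udst, uint8_t *vdst,
                                int width, int height, int lumStride,
                                int chromStride, int srcStride)
{
    int i, j;
    for (j = 0; j < height; j++) {
        const uint8_t *s = src + j * srcStride;
        for (i = 0; i < width / 2; i++) {
            ydst[j * lumStride + 2 * i]     = s[4 * i];     /* Y0 */
            ydst[j * lumStride + 2 * i + 1] = s[4 * i + 2]; /* Y1 */
            if (!(j & 1)) {
                udst[(j / 2) * chromStride + i] = s[4 * i + 1]; /* U */
                vdst[(j / 2) * chromStride + i] = s[4 * i + 3]; /* V */
            }
        }
    }
}
#endif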
1423 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */
1424 
1425 #if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
1426 static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, int srcWidth, int srcHeight, int srcStride, int dstStride)
1427 {
1428  int x,y;
1429 
1430  dst[0]= src[0];
1431 
1432  // first line
1433  for (x=0; x<srcWidth-1; x++) {
1434  dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1435  dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1436  }
1437  dst[2*srcWidth-1]= src[srcWidth-1];
1438 
1439  dst+= dstStride;
1440 
1441  for (y=1; y<srcHeight; y++) {
1442  const x86_reg mmxSize= srcWidth&~15;
1443  __asm__ volatile(
1444  "mov %4, %%"REG_a" \n\t"
1445  "movq "MANGLE(mmx_ff)", %%mm0 \n\t"
1446  "movq (%0, %%"REG_a"), %%mm4 \n\t"
1447  "movq %%mm4, %%mm2 \n\t"
1448  "psllq $8, %%mm4 \n\t"
1449  "pand %%mm0, %%mm2 \n\t"
1450  "por %%mm2, %%mm4 \n\t"
1451  "movq (%1, %%"REG_a"), %%mm5 \n\t"
1452  "movq %%mm5, %%mm3 \n\t"
1453  "psllq $8, %%mm5 \n\t"
1454  "pand %%mm0, %%mm3 \n\t"
1455  "por %%mm3, %%mm5 \n\t"
1456  "1: \n\t"
1457  "movq (%0, %%"REG_a"), %%mm0 \n\t"
1458  "movq (%1, %%"REG_a"), %%mm1 \n\t"
1459  "movq 1(%0, %%"REG_a"), %%mm2 \n\t"
1460  "movq 1(%1, %%"REG_a"), %%mm3 \n\t"
1461  PAVGB" %%mm0, %%mm5 \n\t"
1462  PAVGB" %%mm0, %%mm3 \n\t"
1463  PAVGB" %%mm0, %%mm5 \n\t"
1464  PAVGB" %%mm0, %%mm3 \n\t"
1465  PAVGB" %%mm1, %%mm4 \n\t"
1466  PAVGB" %%mm1, %%mm2 \n\t"
1467  PAVGB" %%mm1, %%mm4 \n\t"
1468  PAVGB" %%mm1, %%mm2 \n\t"
1469  "movq %%mm5, %%mm7 \n\t"
1470  "movq %%mm4, %%mm6 \n\t"
1471  "punpcklbw %%mm3, %%mm5 \n\t"
1472  "punpckhbw %%mm3, %%mm7 \n\t"
1473  "punpcklbw %%mm2, %%mm4 \n\t"
1474  "punpckhbw %%mm2, %%mm6 \n\t"
1475  MOVNTQ" %%mm5, (%2, %%"REG_a", 2) \n\t"
1476  MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2) \n\t"
1477  MOVNTQ" %%mm4, (%3, %%"REG_a", 2) \n\t"
1478  MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2) \n\t"
1479  "add $8, %%"REG_a" \n\t"
1480  "movq -1(%0, %%"REG_a"), %%mm4 \n\t"
1481  "movq -1(%1, %%"REG_a"), %%mm5 \n\t"
1482  " js 1b \n\t"
1483  :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ),
1484  "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
1485  "g" (-mmxSize)
1486  : "%"REG_a
1487  );
1488 
1489  for (x=mmxSize-1; x<srcWidth-1; x++) {
1490  dst[2*x +1]= (3*src[x+0] + src[x+srcStride+1])>>2;
1491  dst[2*x+dstStride+2]= ( src[x+0] + 3*src[x+srcStride+1])>>2;
1492  dst[2*x+dstStride+1]= ( src[x+1] + 3*src[x+srcStride ])>>2;
1493  dst[2*x +2]= (3*src[x+1] + src[x+srcStride ])>>2;
1494  }
1495  dst[srcWidth*2 -1 ]= (3*src[srcWidth-1] + src[srcWidth-1 + srcStride])>>2;
1496  dst[srcWidth*2 -1 + dstStride]= ( src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
1497 
1498  dst+=dstStride*2;
1499  src+=srcStride;
1500  }
1501 
1502  // last line
1503  dst[0]= src[0];
1504 
1505  for (x=0; x<srcWidth-1; x++) {
1506  dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1507  dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1508  }
1509  dst[2*srcWidth-1]= src[srcWidth-1];
1510 
1511  __asm__ volatile(EMMS" \n\t"
1512  SFENCE" \n\t"
1513  :::"memory");
1514 }
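/*
 * Editor's note on the repeated PAVGB pairs above: applying the rounded
 * byte average twice with the same second operand approximates the 3:1
 * weights of the C fallback, avg(avg(a,b),b) ~= (a + 3*b + 3) >> 2, at the
 * cost of a slight rounding-up bias (sketch only, kept out of the build).
 */
#if 0
static uint8_t pavgb_sketch(uint8_t a, uint8_t b) /* rounded average */
{
    return (uint8_t)((a + b + 1) >> 1);
}

static uint8_t weight13_sketch(uint8_t a, uint8_t b) /* ~(a + 3*b) >> 2 */
{
    return pavgb_sketch(pavgb_sketch(a, b), b);
}
#endif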
1515 #endif /* COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW */
1516 
1517 #if !COMPILE_TEMPLATE_AMD3DNOW
1518 
1519 /**
1520  * Height should be a multiple of 2 and width should be a multiple of 16.
1521  * (If this is a problem for anyone then tell me, and I will fix it.)
1522  * Chrominance data is only taken from every second line, others are ignored.
1523  */
1524 static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1525  int width, int height,
1526  int lumStride, int chromStride, int srcStride)
1527 {
1528  int y;
1529  const x86_reg chromWidth= width>>1;
1530  for (y=0; y<height; y+=2) {
1531  __asm__ volatile(
1532  "xor %%"REG_a", %%"REG_a" \n\t"
1533  "pcmpeqw %%mm7, %%mm7 \n\t"
1534  "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
1535  ".p2align 4 \n\t"
1536  "1: \n\t"
1537  PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1538  "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // UYVY UYVY(0)
1539  "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(4)
1540  "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0)
1541  "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4)
1542  "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0)
1543  "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4)
1544  "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
1545  "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
1546  "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1547  "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
1548 
1549  MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t"
1550 
1551  "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(8)
1552  "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // UYVY UYVY(12)
1553  "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8)
1554  "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12)
1555  "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8)
1556  "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12)
1557  "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
1558  "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
1559  "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
1560  "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
1561 
1562  MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t"
1563 
1564  "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
1565  "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
1566  "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
1567  "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
1568  "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
1569  "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
1570  "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
1571  "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
1572 
1573  MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
1574  MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"
1575 
1576  "add $8, %%"REG_a" \n\t"
1577  "cmp %4, %%"REG_a" \n\t"
1578  " jb 1b \n\t"
1579  ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1580  : "memory", "%"REG_a
1581  );
1582 
1583  ydst += lumStride;
1584  src += srcStride;
1585 
1586  __asm__ volatile(
1587  "xor %%"REG_a", %%"REG_a" \n\t"
1588  ".p2align 4 \n\t"
1589  "1: \n\t"
1590  PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1591  "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1592  "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
1593  "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8)
1594  "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12)
1595  "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
1596  "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
1597  "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
1598  "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
1599  "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
1600  "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
1601 
1602  MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t"
1603  MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t"
1604 
1605  "add $8, %%"REG_a" \n\t"
1606  "cmp %4, %%"REG_a" \n\t"
1607  " jb 1b \n\t"
1608 
1609  ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1610  : "memory", "%"REG_a
1611  );
1612  udst += chromStride;
1613  vdst += chromStride;
1614  ydst += lumStride;
1615  src += srcStride;
1616  }
1617  __asm__ volatile(EMMS" \n\t"
1618  SFENCE" \n\t"
1619  :::"memory");
1620 }
1621 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */
1622 
1623 /**
1624  * Height should be a multiple of 2 and width should be a multiple of 2.
1625  * (If this is a problem for anyone then tell me, and I will fix it.)
1626  * Chrominance data is only taken from every second line,
1627  * others are ignored in the C version.
1628  * FIXME: Write HQ version.
1629  */
1630 static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1631  int width, int height,
1632  int lumStride, int chromStride, int srcStride)
1633 {
1634  int y;
1635  const x86_reg chromWidth= width>>1;
1636  for (y=0; y<height-2; y+=2) {
1637  int i;
1638  for (i=0; i<2; i++) {
1639  __asm__ volatile(
1640  "mov %2, %%"REG_a" \n\t"
1641  "movq "MANGLE(ff_bgr2YCoeff)", %%mm6 \n\t"
1642  "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
1643  "pxor %%mm7, %%mm7 \n\t"
1644  "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
1645  ".p2align 4 \n\t"
1646  "1: \n\t"
1647  PREFETCH" 64(%0, %%"REG_d") \n\t"
1648  "movd (%0, %%"REG_d"), %%mm0 \n\t"
1649  "movd 3(%0, %%"REG_d"), %%mm1 \n\t"
1650  "punpcklbw %%mm7, %%mm0 \n\t"
1651  "punpcklbw %%mm7, %%mm1 \n\t"
1652  "movd 6(%0, %%"REG_d"), %%mm2 \n\t"
1653  "movd 9(%0, %%"REG_d"), %%mm3 \n\t"
1654  "punpcklbw %%mm7, %%mm2 \n\t"
1655  "punpcklbw %%mm7, %%mm3 \n\t"
1656  "pmaddwd %%mm6, %%mm0 \n\t"
1657  "pmaddwd %%mm6, %%mm1 \n\t"
1658  "pmaddwd %%mm6, %%mm2 \n\t"
1659  "pmaddwd %%mm6, %%mm3 \n\t"
1660 #ifndef FAST_BGR2YV12
1661  "psrad $8, %%mm0 \n\t"
1662  "psrad $8, %%mm1 \n\t"
1663  "psrad $8, %%mm2 \n\t"
1664  "psrad $8, %%mm3 \n\t"
1665 #endif
1666  "packssdw %%mm1, %%mm0 \n\t"
1667  "packssdw %%mm3, %%mm2 \n\t"
1668  "pmaddwd %%mm5, %%mm0 \n\t"
1669  "pmaddwd %%mm5, %%mm2 \n\t"
1670  "packssdw %%mm2, %%mm0 \n\t"
1671  "psraw $7, %%mm0 \n\t"
1672 
1673  "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
1674  "movd 15(%0, %%"REG_d"), %%mm1 \n\t"
1675  "punpcklbw %%mm7, %%mm4 \n\t"
1676  "punpcklbw %%mm7, %%mm1 \n\t"
1677  "movd 18(%0, %%"REG_d"), %%mm2 \n\t"
1678  "movd 21(%0, %%"REG_d"), %%mm3 \n\t"
1679  "punpcklbw %%mm7, %%mm2 \n\t"
1680  "punpcklbw %%mm7, %%mm3 \n\t"
1681  "pmaddwd %%mm6, %%mm4 \n\t"
1682  "pmaddwd %%mm6, %%mm1 \n\t"
1683  "pmaddwd %%mm6, %%mm2 \n\t"
1684  "pmaddwd %%mm6, %%mm3 \n\t"
1685 #ifndef FAST_BGR2YV12
1686  "psrad $8, %%mm4 \n\t"
1687  "psrad $8, %%mm1 \n\t"
1688  "psrad $8, %%mm2 \n\t"
1689  "psrad $8, %%mm3 \n\t"
1690 #endif
1691  "packssdw %%mm1, %%mm4 \n\t"
1692  "packssdw %%mm3, %%mm2 \n\t"
1693  "pmaddwd %%mm5, %%mm4 \n\t"
1694  "pmaddwd %%mm5, %%mm2 \n\t"
1695  "add $24, %%"REG_d" \n\t"
1696  "packssdw %%mm2, %%mm4 \n\t"
1697  "psraw $7, %%mm4 \n\t"
1698 
1699  "packuswb %%mm4, %%mm0 \n\t"
1700  "paddusb "MANGLE(ff_bgr2YOffset)", %%mm0 \n\t"
1701 
1702  MOVNTQ" %%mm0, (%1, %%"REG_a") \n\t"
1703  "add $8, %%"REG_a" \n\t"
1704  " js 1b \n\t"
1705  : : "r" (src+width*3), "r" (ydst+width), "g" ((x86_reg)-width)
1706  : "%"REG_a, "%"REG_d
1707  );
1708  ydst += lumStride;
1709  src += srcStride;
1710  }
1711  src -= srcStride*2;
1712  __asm__ volatile(
1713  "mov %4, %%"REG_a" \n\t"
1714  "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
1715  "movq "MANGLE(ff_bgr2UCoeff)", %%mm6 \n\t"
1716  "pxor %%mm7, %%mm7 \n\t"
1717  "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
1718  "add %%"REG_d", %%"REG_d" \n\t"
1719  ".p2align 4 \n\t"
1720  "1: \n\t"
1721  PREFETCH" 64(%0, %%"REG_d") \n\t"
1722  PREFETCH" 64(%1, %%"REG_d") \n\t"
1723 #if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
1724  "movq (%0, %%"REG_d"), %%mm0 \n\t"
1725  "movq (%1, %%"REG_d"), %%mm1 \n\t"
1726  "movq 6(%0, %%"REG_d"), %%mm2 \n\t"
1727  "movq 6(%1, %%"REG_d"), %%mm3 \n\t"
1728  PAVGB" %%mm1, %%mm0 \n\t"
1729  PAVGB" %%mm3, %%mm2 \n\t"
1730  "movq %%mm0, %%mm1 \n\t"
1731  "movq %%mm2, %%mm3 \n\t"
1732  "psrlq $24, %%mm0 \n\t"
1733  "psrlq $24, %%mm2 \n\t"
1734  PAVGB" %%mm1, %%mm0 \n\t"
1735  PAVGB" %%mm3, %%mm2 \n\t"
1736  "punpcklbw %%mm7, %%mm0 \n\t"
1737  "punpcklbw %%mm7, %%mm2 \n\t"
1738 #else
1739  "movd (%0, %%"REG_d"), %%mm0 \n\t"
1740  "movd (%1, %%"REG_d"), %%mm1 \n\t"
1741  "movd 3(%0, %%"REG_d"), %%mm2 \n\t"
1742  "movd 3(%1, %%"REG_d"), %%mm3 \n\t"
1743  "punpcklbw %%mm7, %%mm0 \n\t"
1744  "punpcklbw %%mm7, %%mm1 \n\t"
1745  "punpcklbw %%mm7, %%mm2 \n\t"
1746  "punpcklbw %%mm7, %%mm3 \n\t"
1747  "paddw %%mm1, %%mm0 \n\t"
1748  "paddw %%mm3, %%mm2 \n\t"
1749  "paddw %%mm2, %%mm0 \n\t"
1750  "movd 6(%0, %%"REG_d"), %%mm4 \n\t"
1751  "movd 6(%1, %%"REG_d"), %%mm1 \n\t"
1752  "movd 9(%0, %%"REG_d"), %%mm2 \n\t"
1753  "movd 9(%1, %%"REG_d"), %%mm3 \n\t"
1754  "punpcklbw %%mm7, %%mm4 \n\t"
1755  "punpcklbw %%mm7, %%mm1 \n\t"
1756  "punpcklbw %%mm7, %%mm2 \n\t"
1757  "punpcklbw %%mm7, %%mm3 \n\t"
1758  "paddw %%mm1, %%mm4 \n\t"
1759  "paddw %%mm3, %%mm2 \n\t"
1760  "paddw %%mm4, %%mm2 \n\t"
1761  "psrlw $2, %%mm0 \n\t"
1762  "psrlw $2, %%mm2 \n\t"
1763 #endif
1764  "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
1765  "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"
1766 
1767  "pmaddwd %%mm0, %%mm1 \n\t"
1768  "pmaddwd %%mm2, %%mm3 \n\t"
1769  "pmaddwd %%mm6, %%mm0 \n\t"
1770  "pmaddwd %%mm6, %%mm2 \n\t"
1771 #ifndef FAST_BGR2YV12
1772  "psrad $8, %%mm0 \n\t"
1773  "psrad $8, %%mm1 \n\t"
1774  "psrad $8, %%mm2 \n\t"
1775  "psrad $8, %%mm3 \n\t"
1776 #endif
1777  "packssdw %%mm2, %%mm0 \n\t"
1778  "packssdw %%mm3, %%mm1 \n\t"
1779  "pmaddwd %%mm5, %%mm0 \n\t"
1780  "pmaddwd %%mm5, %%mm1 \n\t"
1781  "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
1782  "psraw $7, %%mm0 \n\t"
1783 
1784 #if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
1785  "movq 12(%0, %%"REG_d"), %%mm4 \n\t"
1786  "movq 12(%1, %%"REG_d"), %%mm1 \n\t"
1787  "movq 18(%0, %%"REG_d"), %%mm2 \n\t"
1788  "movq 18(%1, %%"REG_d"), %%mm3 \n\t"
1789  PAVGB" %%mm1, %%mm4 \n\t"
1790  PAVGB" %%mm3, %%mm2 \n\t"
1791  "movq %%mm4, %%mm1 \n\t"
1792  "movq %%mm2, %%mm3 \n\t"
1793  "psrlq $24, %%mm4 \n\t"
1794  "psrlq $24, %%mm2 \n\t"
1795  PAVGB" %%mm1, %%mm4 \n\t"
1796  PAVGB" %%mm3, %%mm2 \n\t"
1797  "punpcklbw %%mm7, %%mm4 \n\t"
1798  "punpcklbw %%mm7, %%mm2 \n\t"
1799 #else
1800  "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
1801  "movd 12(%1, %%"REG_d"), %%mm1 \n\t"
1802  "movd 15(%0, %%"REG_d"), %%mm2 \n\t"
1803  "movd 15(%1, %%"REG_d"), %%mm3 \n\t"
1804  "punpcklbw %%mm7, %%mm4 \n\t"
1805  "punpcklbw %%mm7, %%mm1 \n\t"
1806  "punpcklbw %%mm7, %%mm2 \n\t"
1807  "punpcklbw %%mm7, %%mm3 \n\t"
1808  "paddw %%mm1, %%mm4 \n\t"
1809  "paddw %%mm3, %%mm2 \n\t"
1810  "paddw %%mm2, %%mm4 \n\t"
1811  "movd 18(%0, %%"REG_d"), %%mm5 \n\t"
1812  "movd 18(%1, %%"REG_d"), %%mm1 \n\t"
1813  "movd 21(%0, %%"REG_d"), %%mm2 \n\t"
1814  "movd 21(%1, %%"REG_d"), %%mm3 \n\t"
1815  "punpcklbw %%mm7, %%mm5 \n\t"
1816  "punpcklbw %%mm7, %%mm1 \n\t"
1817  "punpcklbw %%mm7, %%mm2 \n\t"
1818  "punpcklbw %%mm7, %%mm3 \n\t"
1819  "paddw %%mm1, %%mm5 \n\t"
1820  "paddw %%mm3, %%mm2 \n\t"
1821  "paddw %%mm5, %%mm2 \n\t"
1822  "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
1823  "psrlw $2, %%mm4 \n\t"
1824  "psrlw $2, %%mm2 \n\t"
1825 #endif
1826  "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
1827  "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"
1828 
1829  "pmaddwd %%mm4, %%mm1 \n\t"
1830  "pmaddwd %%mm2, %%mm3 \n\t"
1831  "pmaddwd %%mm6, %%mm4 \n\t"
1832  "pmaddwd %%mm6, %%mm2 \n\t"
1833 #ifndef FAST_BGR2YV12
1834  "psrad $8, %%mm4 \n\t"
1835  "psrad $8, %%mm1 \n\t"
1836  "psrad $8, %%mm2 \n\t"
1837  "psrad $8, %%mm3 \n\t"
1838 #endif
1839  "packssdw %%mm2, %%mm4 \n\t"
1840  "packssdw %%mm3, %%mm1 \n\t"
1841  "pmaddwd %%mm5, %%mm4 \n\t"
1842  "pmaddwd %%mm5, %%mm1 \n\t"
1843  "add $24, %%"REG_d" \n\t"
1844  "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
1845  "psraw $7, %%mm4 \n\t"
1846 
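    /* combine the two UV quartets: punpckldq gathers U3 U2 U1 U0,
     * punpckhdq gathers V3 V2 V1 V0; packsswb packs both to signed
     * bytes and paddb re-biases by +128 (ff_bgr2UVOffset) before the
     * two movd stores write 4 U and 4 V samples */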
1847  "movq %%mm0, %%mm1 \n\t"
1848  "punpckldq %%mm4, %%mm0 \n\t"
1849  "punpckhdq %%mm4, %%mm1 \n\t"
1850  "packsswb %%mm1, %%mm0 \n\t"
1851  "paddb "MANGLE(ff_bgr2UVOffset)", %%mm0 \n\t"
1852  "movd %%mm0, (%2, %%"REG_a") \n\t"
1853  "punpckhdq %%mm0, %%mm0 \n\t"
1854  "movd %%mm0, (%3, %%"REG_a") \n\t"
1855  "add $4, %%"REG_a" \n\t"
1856  " js 1b \n\t"
1857  : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
1858  : "%"REG_a, "%"REG_d
1859  );
1860 
1861  udst += chromStride;
1862  vdst += chromStride;
1863  src += srcStride*2;
1864  }
1865 
1866  __asm__ volatile(EMMS" \n\t"
1867  SFENCE" \n\t"
1868  :::"memory");
1869 
1870  rgb24toyv12_c(src, ydst, udst, vdst, width, height-y, lumStride, chromStride, srcStride);
1871 }
1872 #endif /* !COMPILE_TEMPLATE_SSE2 */
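
/*
 * Illustrative scalar reference for the chroma half of rgb24toyv12 above
 * (a hypothetical helper, not part of the rgb2rgb API): average one 2x2
 * block of 24-bit pixels, then apply a BT.601 studio-swing U/V transform.
 * Byte order is assumed B,G,R within each pixel; the coefficients are the
 * common 8-bit integer approximations, whereas the asm uses the 15-bit
 * fixed-point ff_bgr2UCoeff/ff_bgr2VCoeff tables.
 */
static av_unused void RENAME(sketch_bgr24_block_to_uv)(const uint8_t *row0,
                                                       const uint8_t *row1,
                                                       uint8_t *u, uint8_t *v)
{
    /* 2x2 box average (the asm does this with PAVGB or paddw/psrlw $2) */
    int b = (row0[0] + row0[3] + row1[0] + row1[3]) >> 2;
    int g = (row0[1] + row0[4] + row1[1] + row1[4]) >> 2;
    int r = (row0[2] + row0[5] + row1[2] + row1[5]) >> 2;
    *u = (uint8_t)(((-38*r -  74*g + 112*b) >> 8) + 128);
    *v = (uint8_t)(((112*r -  94*g -  18*b) >> 8) + 128);
}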
1873 
1874 #if !COMPILE_TEMPLATE_AMD3DNOW && !COMPILE_TEMPLATE_AVX
1875 static void RENAME(interleaveBytes)(const uint8_t *src1, const uint8_t *src2, uint8_t *dest,
1876  int width, int height, int src1Stride,
1877  int src2Stride, int dstStride)
1878 {
1879  int h;
1880 
1881  for (h=0; h < height; h++) {
1882  int w;
1883 
1884 #if COMPILE_TEMPLATE_SSE2
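        /* movdqa and movntdq below require 16-byte alignment of src1,
         * src2 and dest; the MMX fallback in the #else branch has no
         * such requirement */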
1885  __asm__(
1886  "xor %%"REG_a", %%"REG_a" \n\t"
1887  "1: \n\t"
1888  PREFETCH" 64(%1, %%"REG_a") \n\t"
1889  PREFETCH" 64(%2, %%"REG_a") \n\t"
1890  "movdqa (%1, %%"REG_a"), %%xmm0 \n\t"
1891  "movdqa (%1, %%"REG_a"), %%xmm1 \n\t"
1892  "movdqa (%2, %%"REG_a"), %%xmm2 \n\t"
1893  "punpcklbw %%xmm2, %%xmm0 \n\t"
1894  "punpckhbw %%xmm2, %%xmm1 \n\t"
1895  "movntdq %%xmm0, (%0, %%"REG_a", 2) \n\t"
1896  "movntdq %%xmm1, 16(%0, %%"REG_a", 2) \n\t"
1897  "add $16, %%"REG_a" \n\t"
1898  "cmp %3, %%"REG_a" \n\t"
1899  " jb 1b \n\t"
1900  ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
1901  : "memory", "%"REG_a""
1902  );
1903 #else
1904  __asm__(
1905  "xor %%"REG_a", %%"REG_a" \n\t"
1906  "1: \n\t"
1907  PREFETCH" 64(%1, %%"REG_a") \n\t"
1908  PREFETCH" 64(%2, %%"REG_a") \n\t"
1909  "movq (%1, %%"REG_a"), %%mm0 \n\t"
1910  "movq 8(%1, %%"REG_a"), %%mm2 \n\t"
1911  "movq %%mm0, %%mm1 \n\t"
1912  "movq %%mm2, %%mm3 \n\t"
1913  "movq (%2, %%"REG_a"), %%mm4 \n\t"
1914  "movq 8(%2, %%"REG_a"), %%mm5 \n\t"
1915  "punpcklbw %%mm4, %%mm0 \n\t"
1916  "punpckhbw %%mm4, %%mm1 \n\t"
1917  "punpcklbw %%mm5, %%mm2 \n\t"
1918  "punpckhbw %%mm5, %%mm3 \n\t"
1919  MOVNTQ" %%mm0, (%0, %%"REG_a", 2) \n\t"
1920  MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2) \n\t"
1921  MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2) \n\t"
1922  MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2) \n\t"
1923  "add $16, %%"REG_a" \n\t"
1924  "cmp %3, %%"REG_a" \n\t"
1925  " jb 1b \n\t"
1926  ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
1927  : "memory", "%"REG_a
1928  );
1929 #endif
1930  for (w= (width&(~15)); w < width; w++) {
1931  dest[2*w+0] = src1[w];
1932  dest[2*w+1] = src2[w];
1933  }
1934  dest += dstStride;
1935  src1 += src1Stride;
1936  src2 += src2Stride;
1937  }
1938  __asm__(
1939  EMMS" \n\t"
1940  SFENCE" \n\t"
1941  ::: "memory"
1942  );
1943 }
1944 #endif /* !COMPILE_TEMPLATE_AMD3DNOW && !COMPILE_TEMPLATE_AVX */
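
/*
 * Illustrative scalar reference for interleaveBytes() (a hypothetical
 * helper, not part of the rgb2rgb API): each SIMD iteration above
 * interleaves 16 bytes from src1 and src2 via punpcklbw/punpckhbw; per
 * byte the effect is simply dest[2w] = src1[w], dest[2w+1] = src2[w].
 */
static av_unused void RENAME(sketch_interleave_bytes)(const uint8_t *src1,
                                                      const uint8_t *src2,
                                                      uint8_t *dest,
                                                      int width, int height,
                                                      int src1Stride,
                                                      int src2Stride, int dstStride)
{
    int h, w;
    for (h = 0; h < height; h++) {
        for (w = 0; w < width; w++) {
            dest[2*w + 0] = src1[w];
            dest[2*w + 1] = src2[w];
        }
        dest += dstStride;
        src1 += src1Stride;
        src2 += src2Stride;
    }
}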
1945 
1946 #if !COMPILE_TEMPLATE_AMD3DNOW && (ARCH_X86_32 || COMPILE_TEMPLATE_SSE2) && COMPILE_TEMPLATE_MMXEXT == COMPILE_TEMPLATE_SSE2 && HAVE_YASM
1947 void RENAME(ff_nv12ToUV)(uint8_t *dstU, uint8_t *dstV,
1948  const uint8_t *src, const uint8_t *unused, int w,
1949  uint32_t *unused2);
1950 static void RENAME(deinterleaveBytes)(const uint8_t *src, uint8_t *dst1, uint8_t *dst2,
1951  int width, int height, int srcStride,
1952  int dst1Stride, int dst2Stride)
1953 {
1954  int h;
1955 
1956  for (h = 0; h < height; h++) {
1957  RENAME(ff_nv12ToUV)(dst1, dst2, src, NULL, width, NULL);
1958  src += srcStride;
1959  dst1 += dst1Stride;
1960  dst2 += dst2Stride;
1961  }
1962  __asm__(
1963  EMMS" \n\t"
1964  SFENCE" \n\t"
1965  ::: "memory"
1966  );
1967 }
1968 #endif /* !COMPILE_TEMPLATE_AMD3DNOW && (ARCH_X86_32 || COMPILE_TEMPLATE_SSE2) && COMPILE_TEMPLATE_MMXEXT == COMPILE_TEMPLATE_SSE2 && HAVE_YASM */
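
/*
 * Illustrative scalar reference for deinterleaveBytes() (a hypothetical
 * helper, not part of the rgb2rgb API): per row, ff_nv12ToUV splits the
 * interleaved NV12 chroma plane U0 V0 U1 V1 ... into separate U and V rows.
 */
static av_unused void RENAME(sketch_deinterleave_row)(const uint8_t *src,
                                                      uint8_t *dstU,
                                                      uint8_t *dstV, int w)
{
    int i;
    for (i = 0; i < w; i++) {
        dstU[i] = src[2*i + 0];
        dstV[i] = src[2*i + 1];
    }
}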
1969 
1970 #if !COMPILE_TEMPLATE_SSE2
1971 #if !COMPILE_TEMPLATE_AMD3DNOW
1972 static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
1973  uint8_t *dst1, uint8_t *dst2,
1974  int width, int height,
1975  int srcStride1, int srcStride2,
1976  int dstStride1, int dstStride2)
1977 {
1978  x86_reg x, y;
1979  int w,h;
1980  w=width/2; h=height/2;
1981  __asm__ volatile(
1982  PREFETCH" %0 \n\t"
1983  PREFETCH" %1 \n\t"
1984  ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
1985  for (y=0;y<h;y++) {
1986  const uint8_t* s1=src1+srcStride1*(y>>1);
1987  uint8_t* d=dst1+dstStride1*y;
1988  x=0;
1989  for (;x<w-31;x+=32) {
1990  __asm__ volatile(
1991  PREFETCH" 32(%1,%2) \n\t"
1992  "movq (%1,%2), %%mm0 \n\t"
1993  "movq 8(%1,%2), %%mm2 \n\t"
1994  "movq 16(%1,%2), %%mm4 \n\t"
1995  "movq 24(%1,%2), %%mm6 \n\t"
1996  "movq %%mm0, %%mm1 \n\t"
1997  "movq %%mm2, %%mm3 \n\t"
1998  "movq %%mm4, %%mm5 \n\t"
1999  "movq %%mm6, %%mm7 \n\t"
2000  "punpcklbw %%mm0, %%mm0 \n\t"
2001  "punpckhbw %%mm1, %%mm1 \n\t"
2002  "punpcklbw %%mm2, %%mm2 \n\t"
2003  "punpckhbw %%mm3, %%mm3 \n\t"
2004  "punpcklbw %%mm4, %%mm4 \n\t"
2005  "punpckhbw %%mm5, %%mm5 \n\t"
2006  "punpcklbw %%mm6, %%mm6 \n\t"
2007  "punpckhbw %%mm7, %%mm7 \n\t"
2008  MOVNTQ" %%mm0, (%0,%2,2) \n\t"
2009  MOVNTQ" %%mm1, 8(%0,%2,2) \n\t"
2010  MOVNTQ" %%mm2, 16(%0,%2,2) \n\t"
2011  MOVNTQ" %%mm3, 24(%0,%2,2) \n\t"
2012  MOVNTQ" %%mm4, 32(%0,%2,2) \n\t"
2013  MOVNTQ" %%mm5, 40(%0,%2,2) \n\t"
2014  MOVNTQ" %%mm6, 48(%0,%2,2) \n\t"
2015  MOVNTQ" %%mm7, 56(%0,%2,2)"
2016  :: "r"(d), "r"(s1), "r"(x)
2017  :"memory");
2018  }
2019  for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
2020  }
2021  for (y=0;y<h;y++) {
2022  const uint8_t* s2=src2+srcStride2*(y>>1);
2023  uint8_t* d=dst2+dstStride2*y;
2024  x=0;
2025  for (;x<w-31;x+=32) {
2026  __asm__ volatile(
2027  PREFETCH" 32(%1,%2) \n\t"
2028  "movq (%1,%2), %%mm0 \n\t"
2029  "movq 8(%1,%2), %%mm2 \n\t"
2030  "movq 16(%1,%2), %%mm4 \n\t"
2031  "movq 24(%1,%2), %%mm6 \n\t"
2032  "movq %%mm0, %%mm1 \n\t"
2033  "movq %%mm2, %%mm3 \n\t"
2034  "movq %%mm4, %%mm5 \n\t"
2035  "movq %%mm6, %%mm7 \n\t"
2036  "punpcklbw %%mm0, %%mm0 \n\t"
2037  "punpckhbw %%mm1, %%mm1 \n\t"
2038  "punpcklbw %%mm2, %%mm2 \n\t"
2039  "punpckhbw %%mm3, %%mm3 \n\t"
2040  "punpcklbw %%mm4, %%mm4 \n\t"
2041  "punpckhbw %%mm5, %%mm5 \n\t"
2042  "punpcklbw %%mm6, %%mm6 \n\t"
2043  "punpckhbw %%mm7, %%mm7 \n\t"
2044  MOVNTQ" %%mm0, (%0,%2,2) \n\t"
2045  MOVNTQ" %%mm1, 8(%0,%2,2) \n\t"
2046  MOVNTQ" %%mm2, 16(%0,%2,2) \n\t"
2047  MOVNTQ" %%mm3, 24(%0,%2,2) \n\t"
2048  MOVNTQ" %%mm4, 32(%0,%2,2) \n\t"
2049  MOVNTQ" %%mm5, 40(%0,%2,2) \n\t"
2050  MOVNTQ" %%mm6, 48(%0,%2,2) \n\t"
2051  MOVNTQ" %%mm7, 56(%0,%2,2)"
2052  :: "r"(d), "r"(s2), "r"(x)
2053  :"memory");
2054  }
2055  for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
2056  }
2057  __asm__(
2058  EMMS" \n\t"
2059  SFENCE" \n\t"
2060  ::: "memory"
2061  );
2062 }
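
/*
 * Illustrative scalar reference for one row of vu9_to_vu12 (a hypothetical
 * helper, not part of the rgb2rgb API): punpcklbw/punpckhbw of a register
 * with itself duplicates every byte, i.e. 2x horizontal upsampling; the 2x
 * vertical upsampling comes from reading the source row at y>>1.
 */
static av_unused void RENAME(sketch_double_row)(const uint8_t *s, uint8_t *d, int w)
{
    int x;
    for (x = 0; x < w; x++)
        d[2*x] = d[2*x + 1] = s[x];
}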
2063 
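/*
 * Packs planar YVU9 (chroma subsampled 4x in both directions, hence the
 * y>>2 source indexing and the reuse of each up[x]/vp[x] byte for four
 * luma samples) into YUY2, whose byte order per pixel pair is Y0 U Y1 V.
 */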
2064 static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
2065  uint8_t *dst,
2066  int width, int height,
2067  int srcStride1, int srcStride2,
2068  int srcStride3, int dstStride)
2069 {
2070  x86_reg x;
2071  int y,w,h;
2072  w=width/2; h=height;
2073  for (y=0;y<h;y++) {
2074  const uint8_t* yp=src1+srcStride1*y;
2075  const uint8_t* up=src2+srcStride2*(y>>2);
2076  const uint8_t* vp=src3+srcStride3*(y>>2);
2077  uint8_t* d=dst+dstStride*y;
2078  x=0;
2079  for (;x<w-7;x+=8) {
2080  __asm__ volatile(
2081  PREFETCH" 32(%1, %0) \n\t"
2082  PREFETCH" 32(%2, %0) \n\t"
2083  PREFETCH" 32(%3, %0) \n\t"
2084  "movq (%1, %0, 4), %%mm0 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2085  "movq (%2, %0), %%mm1 \n\t" /* U0U1U2U3U4U5U6U7 */
2086  "movq (%3, %0), %%mm2 \n\t" /* V0V1V2V3V4V5V6V7 */
2087  "movq %%mm0, %%mm3 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2088  "movq %%mm1, %%mm4 \n\t" /* U0U1U2U3U4U5U6U7 */
2089  "movq %%mm2, %%mm5 \n\t" /* V0V1V2V3V4V5V6V7 */
2090  "punpcklbw %%mm1, %%mm1 \n\t" /* U0U0 U1U1 U2U2 U3U3 */
2091  "punpcklbw %%mm2, %%mm2 \n\t" /* V0V0 V1V1 V2V2 V3V3 */
2092  "punpckhbw %%mm4, %%mm4 \n\t" /* U4U4 U5U5 U6U6 U7U7 */
2093  "punpckhbw %%mm5, %%mm5 \n\t" /* V4V4 V5V5 V6V6 V7V7 */
2094 
2095  "movq %%mm1, %%mm6 \n\t"
2096  "punpcklbw %%mm2, %%mm1 \n\t" /* U0V0 U0V0 U1V1 U1V1*/
2097  "punpcklbw %%mm1, %%mm0 \n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
2098  "punpckhbw %%mm1, %%mm3 \n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
2099  MOVNTQ" %%mm0, (%4, %0, 8) \n\t"
2100  MOVNTQ" %%mm3, 8(%4, %0, 8) \n\t"
2101 
2102  "punpckhbw %%mm2, %%mm6 \n\t" /* U2V2 U2V2 U3V3 U3V3*/
2103  "movq 8(%1, %0, 4), %%mm0 \n\t"
2104  "movq %%mm0, %%mm3 \n\t"
2105  "punpcklbw %%mm6, %%mm0 \n\t" /* Y U2 Y V2 Y U2 Y V2*/
2106  "punpckhbw %%mm6, %%mm3 \n\t" /* Y U3 Y V3 Y U3 Y V3*/
2107  MOVNTQ" %%mm0, 16(%4, %0, 8) \n\t"
2108  MOVNTQ" %%mm3, 24(%4, %0, 8) \n\t"
2109 
2110  "movq %%mm4, %%mm6 \n\t"
2111  "movq 16(%1, %0, 4), %%mm0 \n\t"
2112  "movq %%mm0, %%mm3 \n\t"
2113  "punpcklbw %%mm5, %%mm4 \n\t"
2114  "punpcklbw %%mm4, %%mm0 \n\t" /* Y U4 Y V4 Y U4 Y V4*/
2115  "punpckhbw %%mm4, %%mm3 \n\t" /* Y U5 Y V5 Y U5 Y V5*/
2116  MOVNTQ" %%mm0, 32(%4, %0, 8) \n\t"
2117  MOVNTQ" %%mm3, 40(%4, %0, 8) \n\t"
2118 
2119  "punpckhbw %%mm5, %%mm6 \n\t"
2120  "movq 24(%1, %0, 4), %%mm0 \n\t"
2121  "movq %%mm0, %%mm3 \n\t"
2122  "punpcklbw %%mm6, %%mm0 \n\t" /* Y U6 Y V6 Y U6 Y V6*/
2123  "punpckhbw %%mm6, %%mm3 \n\t" /* Y U7 Y V7 Y U7 Y V7*/
2124  MOVNTQ" %%mm0, 48(%4, %0, 8) \n\t"
2125  MOVNTQ" %%mm3, 56(%4, %0, 8) \n\t"
2126 
2127  : "+r" (x)
2128  : "r"(yp), "r" (up), "r"(vp), "r"(d)
2129  :"memory");
2130  }
2131  for (; x<w; x++) {
2132  const int x2 = x<<2;
2133  d[8*x+0] = yp[x2];
2134  d[8*x+1] = up[x];
2135  d[8*x+2] = yp[x2+1];
2136  d[8*x+3] = vp[x];
2137  d[8*x+4] = yp[x2+2];
2138  d[8*x+5] = up[x];
2139  d[8*x+6] = yp[x2+3];
2140  d[8*x+7] = vp[x];
2141  }
2142  }
2143  __asm__(
2144  EMMS" \n\t"
2145  SFENCE" \n\t"
2146  ::: "memory"
2147  );
2148 }
2149 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */
2150 
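/*
 * Copies every second byte of src (even indices) to dst.  The loop builds
 * a 0x00ff word mask in %%mm7 (pcmpeqw + psrlw $8), ANDs away the odd
 * bytes and compacts the result with packuswb; count is biased negative so
 * that "add/js" can serve as the loop test, and the scalar tail finishes
 * the remaining <16 bytes.
 */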
2151 static void RENAME(extract_even)(const uint8_t *src, uint8_t *dst, x86_reg count)
2152 {
2153  dst += count;
2154  src += 2*count;
2155  count= - count;
2156 
2157  if(count <= -16) {
2158  count += 15;
2159  __asm__ volatile(
2160  "pcmpeqw %%mm7, %%mm7 \n\t"
2161  "psrlw $8, %%mm7 \n\t"
2162  "1: \n\t"
2163  "movq -30(%1, %0, 2), %%mm0 \n\t"
2164  "movq -22(%1, %0, 2), %%mm1 \n\t"
2165  "movq -14(%1, %0, 2), %%mm2 \n\t"
2166  "movq -6(%1, %0, 2), %%mm3 \n\t"
2167  "pand %%mm7, %%mm0 \n\t"
2168  "pand %%mm7, %%mm1 \n\t"
2169  "pand %%mm7, %%mm2 \n\t"
2170  "pand %%mm7, %%mm3 \n\t"
2171  "packuswb %%mm1, %%mm0 \n\t"
2172  "packuswb %%mm3, %%mm2 \n\t"
2173  MOVNTQ" %%mm0,-15(%2, %0) \n\t"
2174  MOVNTQ" %%mm2,- 7(%2, %0) \n\t"
2175  "add $16, %0 \n\t"
2176  " js 1b \n\t"
2177  : "+r"(count)
2178  : "r"(src), "r"(dst)
2179  );
2180  count -= 15;
2181  }
2182  while(count<0) {
2183  dst[count]= src[2*count];
2184  count++;
2185  }
2186 }
2187 
2188 #if !COMPILE_TEMPLATE_AMD3DNOW
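/*
 * Splits bytes 0 and 2 of every 4-byte group into dst0 and dst1; for UYVY
 * input this extracts the U and V planes.  Same 0x00ff masking trick as
 * extract_even(), applied twice and re-packed.
 */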
2189 static void RENAME(extract_even2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
2190 {
2191  dst0+= count;
2192  dst1+= count;
2193  src += 4*count;
2194  count= - count;
2195  if(count <= -8) {
2196  count += 7;
2197  __asm__ volatile(
2198  "pcmpeqw %%mm7, %%mm7 \n\t"
2199  "psrlw $8, %%mm7 \n\t"
2200  "1: \n\t"
2201  "movq -28(%1, %0, 4), %%mm0 \n\t"
2202  "movq -20(%1, %0, 4), %%mm1 \n\t"
2203  "movq -12(%1, %0, 4), %%mm2 \n\t"
2204  "movq -4(%1, %0, 4), %%mm3 \n\t"
2205  "pand %%mm7, %%mm0 \n\t"
2206  "pand %%mm7, %%mm1 \n\t"
2207  "pand %%mm7, %%mm2 \n\t"
2208  "pand %%mm7, %%mm3 \n\t"
2209  "packuswb %%mm1, %%mm0 \n\t"
2210  "packuswb %%mm3, %%mm2 \n\t"
2211  "movq %%mm0, %%mm1 \n\t"
2212  "movq %%mm2, %%mm3 \n\t"
2213  "psrlw $8, %%mm0 \n\t"
2214  "psrlw $8, %%mm2 \n\t"
2215  "pand %%mm7, %%mm1 \n\t"
2216  "pand %%mm7, %%mm3 \n\t"
2217  "packuswb %%mm2, %%mm0 \n\t"
2218  "packuswb %%mm3, %%mm1 \n\t"
2219  MOVNTQ" %%mm0,- 7(%3, %0) \n\t"
2220  MOVNTQ" %%mm1,- 7(%2, %0) \n\t"
2221  "add $8, %0 \n\t"
2222  " js 1b \n\t"
2223  : "+r"(count)
2224  : "r"(src), "r"(dst0), "r"(dst1)
2225  );
2226  count -= 7;
2227  }
2228  while(count<0) {
2229  dst0[count]= src[4*count+0];
2230  dst1[count]= src[4*count+2];
2231  count++;
2232  }
2233 }
2234 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */
2235 
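/*
 * Like extract_even2(), but averages two source rows first.  Note the SIMD
 * path uses PAVGB, which rounds up ((a+b+1)>>1), while the scalar tail
 * truncates ((a+b)>>1), so the two may differ by one LSB.
 */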
2236 static void RENAME(extract_even2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
2237 {
2238  dst0 += count;
2239  dst1 += count;
2240  src0 += 4*count;
2241  src1 += 4*count;
2242  count= - count;
2243 #ifdef PAVGB
2244  if(count <= -8) {
2245  count += 7;
2246  __asm__ volatile(
2247  "pcmpeqw %%mm7, %%mm7 \n\t"
2248  "psrlw $8, %%mm7 \n\t"
2249  "1: \n\t"
2250  "movq -28(%1, %0, 4), %%mm0 \n\t"
2251  "movq -20(%1, %0, 4), %%mm1 \n\t"
2252  "movq -12(%1, %0, 4), %%mm2 \n\t"
2253  "movq -4(%1, %0, 4), %%mm3 \n\t"
2254  PAVGB" -28(%2, %0, 4), %%mm0 \n\t"
2255  PAVGB" -20(%2, %0, 4), %%mm1 \n\t"
2256  PAVGB" -12(%2, %0, 4), %%mm2 \n\t"
2257  PAVGB" - 4(%2, %0, 4), %%mm3 \n\t"
2258  "pand %%mm7, %%mm0 \n\t"
2259  "pand %%mm7, %%mm1 \n\t"
2260  "pand %%mm7, %%mm2 \n\t"
2261  "pand %%mm7, %%mm3 \n\t"
2262  "packuswb %%mm1, %%mm0 \n\t"
2263  "packuswb %%mm3, %%mm2 \n\t"
2264  "movq %%mm0, %%mm1 \n\t"
2265  "movq %%mm2, %%mm3 \n\t"
2266  "psrlw $8, %%mm0 \n\t"
2267  "psrlw $8, %%mm2 \n\t"
2268  "pand %%mm7, %%mm1 \n\t"
2269  "pand %%mm7, %%mm3 \n\t"
2270  "packuswb %%mm2, %%mm0 \n\t"
2271  "packuswb %%mm3, %%mm1 \n\t"
2272  MOVNTQ" %%mm0,- 7(%4, %0) \n\t"
2273  MOVNTQ" %%mm1,- 7(%3, %0) \n\t"
2274  "add $8, %0 \n\t"
2275  " js 1b \n\t"
2276  : "+r"(count)
2277  : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
2278  );
2279  count -= 7;
2280  }
2281 #endif
2282  while(count<0) {
2283  dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
2284  dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
2285  count++;
2286  }
2287 }
2288 
2289 #if !COMPILE_TEMPLATE_AMD3DNOW
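/*
 * Splits bytes 1 and 3 of every 4-byte group into dst0 and dst1; for YUYV
 * input this extracts the U and V planes.  The initial psrlw $8 selects
 * the odd bytes, and the scalar tail compensates with src++.
 */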
2290 static void RENAME(extract_odd2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
2291 {
2292  dst0+= count;
2293  dst1+= count;
2294  src += 4*count;
2295  count= - count;
2296  if(count <= -8) {
2297  count += 7;
2298  __asm__ volatile(
2299  "pcmpeqw %%mm7, %%mm7 \n\t"
2300  "psrlw $8, %%mm7 \n\t"
2301  "1: \n\t"
2302  "movq -28(%1, %0, 4), %%mm0 \n\t"
2303  "movq -20(%1, %0, 4), %%mm1 \n\t"
2304  "movq -12(%1, %0, 4), %%mm2 \n\t"
2305  "movq -4(%1, %0, 4), %%mm3 \n\t"
2306  "psrlw $8, %%mm0 \n\t"
2307  "psrlw $8, %%mm1 \n\t"
2308  "psrlw $8, %%mm2 \n\t"
2309  "psrlw $8, %%mm3 \n\t"
2310  "packuswb %%mm1, %%mm0 \n\t"
2311  "packuswb %%mm3, %%mm2 \n\t"
2312  "movq %%mm0, %%mm1 \n\t"
2313  "movq %%mm2, %%mm3 \n\t"
2314  "psrlw $8, %%mm0 \n\t"
2315  "psrlw $8, %%mm2 \n\t"
2316  "pand %%mm7, %%mm1 \n\t"
2317  "pand %%mm7, %%mm3 \n\t"
2318  "packuswb %%mm2, %%mm0 \n\t"
2319  "packuswb %%mm3, %%mm1 \n\t"
2320  MOVNTQ" %%mm0,- 7(%3, %0) \n\t"
2321  MOVNTQ" %%mm1,- 7(%2, %0) \n\t"
2322  "add $8, %0 \n\t"
2323  " js 1b \n\t"
2324  : "+r"(count)
2325  : "r"(src), "r"(dst0), "r"(dst1)
2326  );
2327  count -= 7;
2328  }
2329  src++;
2330  while(count<0) {
2331  dst0[count]= src[4*count+0];
2332  dst1[count]= src[4*count+2];
2333  count++;
2334  }
2335 }
2336 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */
2337 
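/*
 * Like extract_odd2(), but averages two source rows first (PAVGB rounds
 * up, the scalar tail truncates, as in extract_even2avg()).
 */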
2338 static void RENAME(extract_odd2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
2339 {
2340  dst0 += count;
2341  dst1 += count;
2342  src0 += 4*count;
2343  src1 += 4*count;
2344  count= - count;
2345 #ifdef PAVGB
2346  if(count <= -8) {
2347  count += 7;
2348  __asm__ volatile(
2349  "pcmpeqw %%mm7, %%mm7 \n\t"
2350  "psrlw $8, %%mm7 \n\t"
2351  "1: \n\t"
2352  "movq -28(%1, %0, 4), %%mm0 \n\t"
2353  "movq -20(%1, %0, 4), %%mm1 \n\t"
2354  "movq -12(%1, %0, 4), %%mm2 \n\t"
2355  "movq -4(%1, %0, 4), %%mm3 \n\t"
2356  PAVGB" -28(%2, %0, 4), %%mm0 \n\t"
2357  PAVGB" -20(%2, %0, 4), %%mm1 \n\t"
2358  PAVGB" -12(%2, %0, 4), %%mm2 \n\t"
2359  PAVGB" - 4(%2, %0, 4), %%mm3 \n\t"
2360  "psrlw $8, %%mm0 \n\t"
2361  "psrlw $8, %%mm1 \n\t"
2362  "psrlw $8, %%mm2 \n\t"
2363  "psrlw $8, %%mm3 \n\t"
2364  "packuswb %%mm1, %%mm0 \n\t"
2365  "packuswb %%mm3, %%mm2 \n\t"
2366  "movq %%mm0, %%mm1 \n\t"
2367  "movq %%mm2, %%mm3 \n\t"
2368  "psrlw $8, %%mm0 \n\t"
2369  "psrlw $8, %%mm2 \n\t"
2370  "pand %%mm7, %%mm1 \n\t"
2371  "pand %%mm7, %%mm3 \n\t"
2372  "packuswb %%mm2, %%mm0 \n\t"
2373  "packuswb %%mm3, %%mm1 \n\t"
2374  MOVNTQ" %%mm0,- 7(%4, %0) \n\t"
2375  MOVNTQ" %%mm1,- 7(%3, %0) \n\t"
2376  "add $8, %0 \n\t"
2377  " js 1b \n\t"
2378  : "+r"(count)
2379  : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
2380  );
2381  count -= 7;
2382  }
2383 #endif
2384  src0++;
2385  src1++;
2386  while(count<0) {
2387  dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
2388  dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
2389  count++;
2390  }
2391 }
2392 
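/*
 * YUYV (Y0 U Y1 V) to planar 4:2:0.  chromWidth = -((-width)>>1) is a
 * branch-free ceil(width/2): e.g. width 7 gives -((-7)>>1) = 4.  Chroma is
 * written on odd rows only, averaging the current row with the previous one.
 */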
2393 static void RENAME(yuyvtoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2394  int width, int height,
2395  int lumStride, int chromStride, int srcStride)
2396 {
2397  int y;
2398  const int chromWidth= -((-width)>>1);
2399 
2400  for (y=0; y<height; y++) {
2401  RENAME(extract_even)(src, ydst, width);
2402  if(y&1) {
2403  RENAME(extract_odd2avg)(src-srcStride, src, udst, vdst, chromWidth);
2404  udst+= chromStride;
2405  vdst+= chromStride;
2406  }
2407 
2408  src += srcStride;
2409  ydst+= lumStride;
2410  }
2411  __asm__(
2412  EMMS" \n\t"
2413  SFENCE" \n\t"
2414  ::: "memory"
2415  );
2416 }
2417 
2418 #if !COMPILE_TEMPLATE_AMD3DNOW
2419 static void RENAME(yuyvtoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2420  int width, int height,
2421  int lumStride, int chromStride, int srcStride)
2422 {
2423  int y;
2424  const int chromWidth= -((-width)>>1);
2425 
2426  for (y=0; y<height; y++) {
2427  RENAME(extract_even)(src, ydst, width);
2428  RENAME(extract_odd2)(src, udst, vdst, chromWidth);
2429 
2430  src += srcStride;
2431  ydst+= lumStride;
2432  udst+= chromStride;
2433  vdst+= chromStride;
2434  }
2435  __asm__(
2436  EMMS" \n\t"
2437  SFENCE" \n\t"
2438  ::: "memory"
2439  );
2440 }
2441 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */
2442 
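/*
 * UYVY (U Y0 V Y1) to planar 4:2:0: luma sits at odd offsets, hence
 * extract_even(src+1, ...), while chroma comes from the even bytes via
 * extract_even2avg() here and extract_even2() in the 4:2:2 variant below.
 */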
2443 static void RENAME(uyvytoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2444  int width, int height,
2445  int lumStride, int chromStride, int srcStride)
2446 {
2447  int y;
2448  const int chromWidth= -((-width)>>1);
2449 
2450  for (y=0; y<height; y++) {
2451  RENAME(extract_even)(src+1, ydst, width);
2452  if(y&1) {
2453  RENAME(extract_even2avg)(src-srcStride, src, udst, vdst, chromWidth);
2454  udst+= chromStride;
2455  vdst+= chromStride;
2456  }
2457 
2458  src += srcStride;
2459  ydst+= lumStride;
2460  }
2461  __asm__(
2462  EMMS" \n\t"
2463  SFENCE" \n\t"
2464  ::: "memory"
2465  );
2466 }
2467 
2468 #if !COMPILE_TEMPLATE_AMD3DNOW
2469 static void RENAME(uyvytoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2470  int width, int height,
2471  int lumStride, int chromStride, int srcStride)
2472 {
2473  int y;
2474  const int chromWidth= -((-width)>>1);
2475 
2476  for (y=0; y<height; y++) {
2477  RENAME(extract_even)(src+1, ydst, width);
2478  RENAME(extract_even2)(src, udst, vdst, chromWidth);
2479 
2480  src += srcStride;
2481  ydst+= lumStride;
2482  udst+= chromStride;
2483  vdst+= chromStride;
2484  }
2485  __asm__(
2486  EMMS" \n\t"
2487  SFENCE" \n\t"
2488  ::: "memory"
2489  );
2490 }
2491 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */
2492 #endif /* !COMPILE_TEMPLATE_SSE2 */
2493 
2494 static av_cold void RENAME(rgb2rgb_init)(void)
2495 {
2496 #if !COMPILE_TEMPLATE_SSE2
2497 #if !COMPILE_TEMPLATE_AMD3DNOW
2525 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */
2526 
2527 #if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
2529 #endif /* COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW */
2531 
2534 #endif /* !COMPILE_TEMPLATE_SSE2 */
2535 
2536 #if !COMPILE_TEMPLATE_AMD3DNOW && !COMPILE_TEMPLATE_AVX
2538 #endif /* !COMPILE_TEMPLATE_AMD3DNOW && !COMPILE_TEMPLATE_AVX */
2539 #if !COMPILE_TEMPLATE_AMD3DNOW && (ARCH_X86_32 || COMPILE_TEMPLATE_SSE2) && COMPILE_TEMPLATE_MMXEXT == COMPILE_TEMPLATE_SSE2 && HAVE_YASM
2541 #endif
2542 }