SDL  2.0
yuv_rgb_sse_func.h
Go to the documentation of this file.
1 // Copyright 2016 Adrien Descamps
2 // Distributed under BSD 3-Clause License
3 
4 /* You need to define the following macros before including this file:
5  SSE_FUNCTION_NAME
6  STD_FUNCTION_NAME
7  YUV_FORMAT
8  RGB_FORMAT
9 */
10 /* You may define the following macro. It is meant to select aligned loads/stores, but both settings currently expand to the same unaligned intrinsics:
11  SSE_ALIGNED
12 */
13 
/* Select the 128-bit load/store intrinsics used by every macro below.
 * Both branches currently expand to the unaligned variants, so defining
 * SSE_ALIGNED does not change the generated code; the aligned load /
 * streaming store pair is only kept, commented out, for reference. */
14 #ifdef SSE_ALIGNED
15 /* Unaligned instructions seem faster, even on aligned data? */
16 /*
17 #define LOAD_SI128 _mm_load_si128
18 #define SAVE_SI128 _mm_stream_si128
19 */
20 #define LOAD_SI128 _mm_loadu_si128
21 #define SAVE_SI128 _mm_storeu_si128
22 #else
23 #define LOAD_SI128 _mm_loadu_si128
24 #define SAVE_SI128 _mm_storeu_si128
25 #endif
26 
/* UV2RGB_16(U,V,R1,G1,B1,R2,G2,B2): chroma contribution to R, G and B.
 * U and V each hold eight 16-bit chroma values. They are multiplied by the
 * per-channel factors from `param` (only the low 16 bits of each product are
 * kept), then every product is duplicated into two adjacent 16-bit lanes by
 * unpacking the register with itself: the low four products end up in
 * R1/G1/B1 and the high four in R2/G2/B2.
 * Expands to bare statements; `param`, `r_tmp`, `g_tmp` and `b_tmp` must
 * already exist in the scope where the macro is used. */
27 #define UV2RGB_16(U,V,R1,G1,B1,R2,G2,B2) \
28  r_tmp = _mm_mullo_epi16(V, _mm_set1_epi16(param->v_r_factor)); \
29  g_tmp = _mm_add_epi16( \
30  _mm_mullo_epi16(U, _mm_set1_epi16(param->u_g_factor)), \
31  _mm_mullo_epi16(V, _mm_set1_epi16(param->v_g_factor))); \
32  b_tmp = _mm_mullo_epi16(U, _mm_set1_epi16(param->u_b_factor)); \
33  R1 = _mm_unpacklo_epi16(r_tmp, r_tmp); \
34  G1 = _mm_unpacklo_epi16(g_tmp, g_tmp); \
35  B1 = _mm_unpacklo_epi16(b_tmp, b_tmp); \
36  R2 = _mm_unpackhi_epi16(r_tmp, r_tmp); \
37  G2 = _mm_unpackhi_epi16(g_tmp, g_tmp); \
38  B2 = _mm_unpackhi_epi16(b_tmp, b_tmp); \
39 
/* ADD_Y2RGB_16(Y1,Y2,R1,G1,B1,R2,G2,B2): add the luma term and rescale.
 * Y1 and Y2 are overwritten with (Y - param->y_shift) * param->y_factor.
 * Each of R/G/B (which must already hold the chroma contribution) is then
 * replaced by (channel + Y) arithmetically shifted right by PRECISION.
 * PRECISION is not defined in this file and must be provided by the
 * includer. All eight arguments are modified in place; the results are
 * signed 16-bit values that are not yet clamped to 0..255. */
40 #define ADD_Y2RGB_16(Y1,Y2,R1,G1,B1,R2,G2,B2) \
41  Y1 = _mm_mullo_epi16(_mm_sub_epi16(Y1, _mm_set1_epi16(param->y_shift)), _mm_set1_epi16(param->y_factor)); \
42  Y2 = _mm_mullo_epi16(_mm_sub_epi16(Y2, _mm_set1_epi16(param->y_shift)), _mm_set1_epi16(param->y_factor)); \
43  \
44  R1 = _mm_srai_epi16(_mm_add_epi16(R1, Y1), PRECISION); \
45  G1 = _mm_srai_epi16(_mm_add_epi16(G1, Y1), PRECISION); \
46  B1 = _mm_srai_epi16(_mm_add_epi16(B1, Y1), PRECISION); \
47  R2 = _mm_srai_epi16(_mm_add_epi16(R2, Y2), PRECISION); \
48  G2 = _mm_srai_epi16(_mm_add_epi16(G2, Y2), PRECISION); \
49  B2 = _mm_srai_epi16(_mm_add_epi16(B2, Y2), PRECISION); \
50 
/* PACK_RGB565_32: pack 32 pixels of 8-bit R, G, B into four registers of
 * eight 16-bit 5:6:5 pixels each.
 * Note the argument order: (R1, R2, G1, G2, B1, B2, ...). The *1 registers
 * carry the first 16 pixels and produce RGB1/RGB2; the *2 registers carry
 * the next 16 and produce RGB3/RGB4.
 *  - red:   unpacking with zero as the LOW byte places R in bits 8..15;
 *           the 0xF800 mask keeps its top 5 bits in place.
 *  - green: zero-extended, top 6 bits kept (>> 2) and moved to bits 5..10.
 *  - blue:  zero-extended, top 5 bits kept (>> 3) in bits 0..4.
 * The temporaries live in the macro's own brace scope. */
51 #define PACK_RGB565_32(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4) \
52 { \
53  __m128i red_mask, tmp1, tmp2, tmp3, tmp4; \
54 \
55  red_mask = _mm_set1_epi16((short)0xF800); \
56  RGB1 = _mm_and_si128(_mm_unpacklo_epi8(_mm_setzero_si128(), R1), red_mask); \
57  RGB2 = _mm_and_si128(_mm_unpackhi_epi8(_mm_setzero_si128(), R1), red_mask); \
58  RGB3 = _mm_and_si128(_mm_unpacklo_epi8(_mm_setzero_si128(), R2), red_mask); \
59  RGB4 = _mm_and_si128(_mm_unpackhi_epi8(_mm_setzero_si128(), R2), red_mask); \
60  tmp1 = _mm_slli_epi16(_mm_srli_epi16(_mm_unpacklo_epi8(G1, _mm_setzero_si128()), 2), 5); \
61  tmp2 = _mm_slli_epi16(_mm_srli_epi16(_mm_unpackhi_epi8(G1, _mm_setzero_si128()), 2), 5); \
62  tmp3 = _mm_slli_epi16(_mm_srli_epi16(_mm_unpacklo_epi8(G2, _mm_setzero_si128()), 2), 5); \
63  tmp4 = _mm_slli_epi16(_mm_srli_epi16(_mm_unpackhi_epi8(G2, _mm_setzero_si128()), 2), 5); \
64  RGB1 = _mm_or_si128(RGB1, tmp1); \
65  RGB2 = _mm_or_si128(RGB2, tmp2); \
66  RGB3 = _mm_or_si128(RGB3, tmp3); \
67  RGB4 = _mm_or_si128(RGB4, tmp4); \
68  tmp1 = _mm_srli_epi16(_mm_unpacklo_epi8(B1, _mm_setzero_si128()), 3); \
69  tmp2 = _mm_srli_epi16(_mm_unpackhi_epi8(B1, _mm_setzero_si128()), 3); \
70  tmp3 = _mm_srli_epi16(_mm_unpacklo_epi8(B2, _mm_setzero_si128()), 3); \
71  tmp4 = _mm_srli_epi16(_mm_unpackhi_epi8(B2, _mm_setzero_si128()), 3); \
72  RGB1 = _mm_or_si128(RGB1, tmp1); \
73  RGB2 = _mm_or_si128(RGB2, tmp2); \
74  RGB3 = _mm_or_si128(RGB3, tmp3); \
75  RGB4 = _mm_or_si128(RGB4, tmp4); \
76 }
77 
/* PACK_RGB24_32 and its two building blocks.
 *
 * STEP1 reads the six channel registers (R1,R2,G1,G2,B1,B2) and writes
 * RGB1..RGB6: RGB1..RGB3 receive the even-indexed bytes (low byte of each
 * 16-bit lane) of the R, G and B pairs, RGB4..RGB6 the odd-indexed bytes
 * (lane >> 8).
 * STEP2 is the same operation in the opposite direction: it reads
 * RGB1..RGB6 and writes R1,R2,G1,G2,B1,B2.
 *
 * PACK_RGB24_32 alternates STEP1/STEP2 five times and leaves its result in
 * RGB1..RGB6; the channel registers are clobbered along the way.
 * NOTE(review): the five passes are presumably what turns the planar R/G/B
 * bytes into interleaved 3-byte pixels (SAVE_LINE* stores RGB1..RGB6
 * back-to-back) -- confirm against the scalar converter's output. */
78 #define PACK_RGB24_32_STEP1(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \
79 RGB1 = _mm_packus_epi16(_mm_and_si128(R1,_mm_set1_epi16(0xFF)), _mm_and_si128(R2,_mm_set1_epi16(0xFF))); \
80 RGB2 = _mm_packus_epi16(_mm_and_si128(G1,_mm_set1_epi16(0xFF)), _mm_and_si128(G2,_mm_set1_epi16(0xFF))); \
81 RGB3 = _mm_packus_epi16(_mm_and_si128(B1,_mm_set1_epi16(0xFF)), _mm_and_si128(B2,_mm_set1_epi16(0xFF))); \
82 RGB4 = _mm_packus_epi16(_mm_srli_epi16(R1,8), _mm_srli_epi16(R2,8)); \
83 RGB5 = _mm_packus_epi16(_mm_srli_epi16(G1,8), _mm_srli_epi16(G2,8)); \
84 RGB6 = _mm_packus_epi16(_mm_srli_epi16(B1,8), _mm_srli_epi16(B2,8)); \
85 
86 #define PACK_RGB24_32_STEP2(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \
87 R1 = _mm_packus_epi16(_mm_and_si128(RGB1,_mm_set1_epi16(0xFF)), _mm_and_si128(RGB2,_mm_set1_epi16(0xFF))); \
88 R2 = _mm_packus_epi16(_mm_and_si128(RGB3,_mm_set1_epi16(0xFF)), _mm_and_si128(RGB4,_mm_set1_epi16(0xFF))); \
89 G1 = _mm_packus_epi16(_mm_and_si128(RGB5,_mm_set1_epi16(0xFF)), _mm_and_si128(RGB6,_mm_set1_epi16(0xFF))); \
90 G2 = _mm_packus_epi16(_mm_srli_epi16(RGB1,8), _mm_srli_epi16(RGB2,8)); \
91 B1 = _mm_packus_epi16(_mm_srli_epi16(RGB3,8), _mm_srli_epi16(RGB4,8)); \
92 B2 = _mm_packus_epi16(_mm_srli_epi16(RGB5,8), _mm_srli_epi16(RGB6,8)); \
93 
94 #define PACK_RGB24_32(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \
95 PACK_RGB24_32_STEP1(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \
96 PACK_RGB24_32_STEP2(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \
97 PACK_RGB24_32_STEP1(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \
98 PACK_RGB24_32_STEP2(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \
99 PACK_RGB24_32_STEP1(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \
100 
/* PACK_RGBA_32: interleave four 8-bit channels of 32 pixels into eight
 * registers of four 4-byte pixels each.
 * Argument order is (R1, R2, G1, G2, B1, B2, A1, A2, ...): the *1 registers
 * hold the first 16 pixels and fill RGB1..RGB4, the *2 registers hold the
 * next 16 and fill RGB5..RGB8.
 * Within each output pixel the bytes are laid out, from lowest address to
 * highest, as A, B, G, R -- i.e. the reverse of the macro's channel argument
 * order. Callers select the destination format purely by which channel they
 * pass in which argument slot.
 * The temporaries live in the macro's own brace scope. */
101 #define PACK_RGBA_32(R1, R2, G1, G2, B1, B2, A1, A2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6, RGB7, RGB8) \
102 { \
103  __m128i lo_ab, hi_ab, lo_gr, hi_gr; \
104 \
105  lo_ab = _mm_unpacklo_epi8( A1, B1 ); \
106  hi_ab = _mm_unpackhi_epi8( A1, B1 ); \
107  lo_gr = _mm_unpacklo_epi8( G1, R1 ); \
108  hi_gr = _mm_unpackhi_epi8( G1, R1 ); \
109  RGB1 = _mm_unpacklo_epi16( lo_ab, lo_gr ); \
110  RGB2 = _mm_unpackhi_epi16( lo_ab, lo_gr ); \
111  RGB3 = _mm_unpacklo_epi16( hi_ab, hi_gr ); \
112  RGB4 = _mm_unpackhi_epi16( hi_ab, hi_gr ); \
113 \
114  lo_ab = _mm_unpacklo_epi8( A2, B2 ); \
115  hi_ab = _mm_unpackhi_epi8( A2, B2 ); \
116  lo_gr = _mm_unpacklo_epi8( G2, R2 ); \
117  hi_gr = _mm_unpackhi_epi8( G2, R2 ); \
118  RGB5 = _mm_unpacklo_epi16( lo_ab, lo_gr ); \
119  RGB6 = _mm_unpackhi_epi16( lo_ab, lo_gr ); \
120  RGB7 = _mm_unpacklo_epi16( hi_ab, hi_gr ); \
121  RGB8 = _mm_unpackhi_epi16( hi_ab, hi_gr ); \
122 }
123 
/* PACK_PIXEL: turn the twelve 8-bit channel registers produced by
 * YUV2RGB_32 (r/g/b_8_<line><half>) into destination-format registers
 * rgb_1..rgb_N, selected at compile time by RGB_FORMAT.
 * The macro DECLARES rgb_* (and, for the 32-bit formats, an all-0xFF
 * register `a`) in the scope where it is expanded, so it has to appear
 * where declarations are allowed; SAVE_LINE1/SAVE_LINE2 later refer to
 * those names. The first PACK_* call covers line 1, the second line 2. */
124 #if RGB_FORMAT == RGB_FORMAT_RGB565
125 
/* 16-bit 5:6:5 -- rgb_1..rgb_4 = line 1, rgb_5..rgb_8 = line 2. */
126 #define PACK_PIXEL \
127  __m128i rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8; \
128  \
129  PACK_RGB565_32(r_8_11, r_8_12, g_8_11, g_8_12, b_8_11, b_8_12, rgb_1, rgb_2, rgb_3, rgb_4) \
130  \
131  PACK_RGB565_32(r_8_21, r_8_22, g_8_21, g_8_22, b_8_21, b_8_22, rgb_5, rgb_6, rgb_7, rgb_8) \
132 
133 #elif RGB_FORMAT == RGB_FORMAT_RGB24
134 
/* 24-bit -- rgb_1..rgb_6 = line 1, rgb_7..rgb_12 = line 2. */
135 #define PACK_PIXEL \
136  __m128i rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6; \
137  __m128i rgb_7, rgb_8, rgb_9, rgb_10, rgb_11, rgb_12; \
138  \
139  PACK_RGB24_32(r_8_11, r_8_12, g_8_11, g_8_12, b_8_11, b_8_12, rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6) \
140  \
141  PACK_RGB24_32(r_8_21, r_8_22, g_8_21, g_8_22, b_8_21, b_8_22, rgb_7, rgb_8, rgb_9, rgb_10, rgb_11, rgb_12) \
142 
143 #elif RGB_FORMAT == RGB_FORMAT_RGBA
144 
/* 32-bit formats -- rgb_1..rgb_8 = line 1, rgb_9..rgb_16 = line 2.
 * The four variants differ only in which channel is passed in which
 * PACK_RGBA_32 argument slot. */
145 #define PACK_PIXEL \
146  __m128i rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8; \
147  __m128i rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16; \
148  __m128i a = _mm_set1_epi8((char)0xFF); \
149  \
150  PACK_RGBA_32(r_8_11, r_8_12, g_8_11, g_8_12, b_8_11, b_8_12, a, a, rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8) \
151  \
152  PACK_RGBA_32(r_8_21, r_8_22, g_8_21, g_8_22, b_8_21, b_8_22, a, a, rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16) \
153 
154 #elif RGB_FORMAT == RGB_FORMAT_BGRA
155 
156 #define PACK_PIXEL \
157  __m128i rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8; \
158  __m128i rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16; \
159  __m128i a = _mm_set1_epi8((char)0xFF); \
160  \
161  PACK_RGBA_32(b_8_11, b_8_12, g_8_11, g_8_12, r_8_11, r_8_12, a, a, rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8) \
162  \
163  PACK_RGBA_32(b_8_21, b_8_22, g_8_21, g_8_22, r_8_21, r_8_22, a, a, rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16) \
164 
165 #elif RGB_FORMAT == RGB_FORMAT_ARGB
166 
167 #define PACK_PIXEL \
168  __m128i rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8; \
169  __m128i rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16; \
170  __m128i a = _mm_set1_epi8((char)0xFF); \
171  \
172  PACK_RGBA_32(a, a, r_8_11, r_8_12, g_8_11, g_8_12, b_8_11, b_8_12, rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8) \
173  \
174  PACK_RGBA_32(a, a, r_8_21, r_8_22, g_8_21, g_8_22, b_8_21, b_8_22, rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16) \
175 
176 #elif RGB_FORMAT == RGB_FORMAT_ABGR
177 
178 #define PACK_PIXEL \
179  __m128i rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8; \
180  __m128i rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16; \
181  __m128i a = _mm_set1_epi8((char)0xFF); \
182  \
183  PACK_RGBA_32(a, a, b_8_11, b_8_12, g_8_11, g_8_12, r_8_11, r_8_12, rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8) \
184  \
185  PACK_RGBA_32(a, a, b_8_21, b_8_22, g_8_21, g_8_22, r_8_21, r_8_22, rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16) \
186 
187 #else
188 #error PACK_PIXEL unimplemented
189 #endif
190 
/* SAVE_LINE1 / SAVE_LINE2: store the registers declared by PACK_PIXEL to
 * the destination rows at rgb_ptr1 / rgb_ptr2, 16 bytes per register, at
 * consecutive 16-byte offsets. One expansion writes 32 pixels:
 * 64 bytes for RGB565, 96 for RGB24 and 128 for the 32-bit formats.
 * Both macros rely on rgb_ptr1/rgb_ptr2 and rgb_* being in scope. */
191 #if RGB_FORMAT == RGB_FORMAT_RGB565
192 
193 #define SAVE_LINE1 \
194  SAVE_SI128((__m128i*)(rgb_ptr1), rgb_1); \
195  SAVE_SI128((__m128i*)(rgb_ptr1+16), rgb_2); \
196  SAVE_SI128((__m128i*)(rgb_ptr1+32), rgb_3); \
197  SAVE_SI128((__m128i*)(rgb_ptr1+48), rgb_4); \
198 
199 #define SAVE_LINE2 \
200  SAVE_SI128((__m128i*)(rgb_ptr2), rgb_5); \
201  SAVE_SI128((__m128i*)(rgb_ptr2+16), rgb_6); \
202  SAVE_SI128((__m128i*)(rgb_ptr2+32), rgb_7); \
203  SAVE_SI128((__m128i*)(rgb_ptr2+48), rgb_8); \
204 
205 #elif RGB_FORMAT == RGB_FORMAT_RGB24
206 
207 #define SAVE_LINE1 \
208  SAVE_SI128((__m128i*)(rgb_ptr1), rgb_1); \
209  SAVE_SI128((__m128i*)(rgb_ptr1+16), rgb_2); \
210  SAVE_SI128((__m128i*)(rgb_ptr1+32), rgb_3); \
211  SAVE_SI128((__m128i*)(rgb_ptr1+48), rgb_4); \
212  SAVE_SI128((__m128i*)(rgb_ptr1+64), rgb_5); \
213  SAVE_SI128((__m128i*)(rgb_ptr1+80), rgb_6); \
214 
215 #define SAVE_LINE2 \
216  SAVE_SI128((__m128i*)(rgb_ptr2), rgb_7); \
217  SAVE_SI128((__m128i*)(rgb_ptr2+16), rgb_8); \
218  SAVE_SI128((__m128i*)(rgb_ptr2+32), rgb_9); \
219  SAVE_SI128((__m128i*)(rgb_ptr2+48), rgb_10); \
220  SAVE_SI128((__m128i*)(rgb_ptr2+64), rgb_11); \
221  SAVE_SI128((__m128i*)(rgb_ptr2+80), rgb_12); \
222 
223 #elif RGB_FORMAT == RGB_FORMAT_RGBA || RGB_FORMAT == RGB_FORMAT_BGRA || \
224  RGB_FORMAT == RGB_FORMAT_ARGB || RGB_FORMAT == RGB_FORMAT_ABGR
225 
226 #define SAVE_LINE1 \
227  SAVE_SI128((__m128i*)(rgb_ptr1), rgb_1); \
228  SAVE_SI128((__m128i*)(rgb_ptr1+16), rgb_2); \
229  SAVE_SI128((__m128i*)(rgb_ptr1+32), rgb_3); \
230  SAVE_SI128((__m128i*)(rgb_ptr1+48), rgb_4); \
231  SAVE_SI128((__m128i*)(rgb_ptr1+64), rgb_5); \
232  SAVE_SI128((__m128i*)(rgb_ptr1+80), rgb_6); \
233  SAVE_SI128((__m128i*)(rgb_ptr1+96), rgb_7); \
234  SAVE_SI128((__m128i*)(rgb_ptr1+112), rgb_8); \
235 
236 #define SAVE_LINE2 \
237  SAVE_SI128((__m128i*)(rgb_ptr2), rgb_9); \
238  SAVE_SI128((__m128i*)(rgb_ptr2+16), rgb_10); \
239  SAVE_SI128((__m128i*)(rgb_ptr2+32), rgb_11); \
240  SAVE_SI128((__m128i*)(rgb_ptr2+48), rgb_12); \
241  SAVE_SI128((__m128i*)(rgb_ptr2+64), rgb_13); \
242  SAVE_SI128((__m128i*)(rgb_ptr2+80), rgb_14); \
243  SAVE_SI128((__m128i*)(rgb_ptr2+96), rgb_15); \
244  SAVE_SI128((__m128i*)(rgb_ptr2+112), rgb_16); \
245 
246 #else
247 #error SAVE_LINE unimplemented
248 #endif
249 
/* READ_Y(y_ptr) / READ_UV: load 16 luma bytes into `y`, and 16 U plus 16 V
 * bytes into `u` and `v`, from the layout selected by YUV_FORMAT.
 * `y`, `u`, `v`, `u_ptr` and `v_ptr` must exist in the expanding scope.
 *  - 420:  samples are contiguous, so each read is one 16-byte load.
 *  - 422:  luma is the low byte of every 16-bit word (32 bytes read);
 *          U and V are each the low byte of every 32-bit word starting at
 *          u_ptr / v_ptr (64 bytes read per channel).
 *  - NV12: luma is contiguous; U and V are each the low byte of every
 *          16-bit word starting at u_ptr / v_ptr (32 bytes read per
 *          channel). NOTE(review): this presumably relies on the caller
 *          passing U/V pointers one byte apart into the interleaved chroma
 *          plane -- confirm at the call site.
 * The shift-left/shift-right pairs clear everything but the low byte of
 * each word before the saturating packs, so the packs never clamp. */
250 #if YUV_FORMAT == YUV_FORMAT_420
251 
252 #define READ_Y(y_ptr) \
253  y = LOAD_SI128((const __m128i*)(y_ptr)); \
254 
255 #define READ_UV \
256  u = LOAD_SI128((const __m128i*)(u_ptr)); \
257  v = LOAD_SI128((const __m128i*)(v_ptr)); \
258 
259 #elif YUV_FORMAT == YUV_FORMAT_422
260 
261 #define READ_Y(y_ptr) \
262 { \
263  __m128i y1, y2; \
264  y1 = _mm_srli_epi16(_mm_slli_epi16(LOAD_SI128((const __m128i*)(y_ptr)), 8), 8); \
265  y2 = _mm_srli_epi16(_mm_slli_epi16(LOAD_SI128((const __m128i*)(y_ptr+16)), 8), 8); \
266  y = _mm_packus_epi16(y1, y2); \
267 }
268 
269 #define READ_UV \
270 { \
271  __m128i u1, u2, u3, u4, v1, v2, v3, v4; \
272  u1 = _mm_srli_epi32(_mm_slli_epi32(LOAD_SI128((const __m128i*)(u_ptr)), 24), 24); \
273  u2 = _mm_srli_epi32(_mm_slli_epi32(LOAD_SI128((const __m128i*)(u_ptr+16)), 24), 24); \
274  u3 = _mm_srli_epi32(_mm_slli_epi32(LOAD_SI128((const __m128i*)(u_ptr+32)), 24), 24); \
275  u4 = _mm_srli_epi32(_mm_slli_epi32(LOAD_SI128((const __m128i*)(u_ptr+48)), 24), 24); \
276  u = _mm_packus_epi16(_mm_packs_epi32(u1, u2), _mm_packs_epi32(u3, u4)); \
277  v1 = _mm_srli_epi32(_mm_slli_epi32(LOAD_SI128((const __m128i*)(v_ptr)), 24), 24); \
278  v2 = _mm_srli_epi32(_mm_slli_epi32(LOAD_SI128((const __m128i*)(v_ptr+16)), 24), 24); \
279  v3 = _mm_srli_epi32(_mm_slli_epi32(LOAD_SI128((const __m128i*)(v_ptr+32)), 24), 24); \
280  v4 = _mm_srli_epi32(_mm_slli_epi32(LOAD_SI128((const __m128i*)(v_ptr+48)), 24), 24); \
281  v = _mm_packus_epi16(_mm_packs_epi32(v1, v2), _mm_packs_epi32(v3, v4)); \
282 }
283 
284 #elif YUV_FORMAT == YUV_FORMAT_NV12
285 
286 #define READ_Y(y_ptr) \
287  y = LOAD_SI128((const __m128i*)(y_ptr)); \
288 
289 #define READ_UV \
290 { \
291  __m128i u1, u2, v1, v2; \
292  u1 = _mm_srli_epi16(_mm_slli_epi16(LOAD_SI128((const __m128i*)(u_ptr)), 8), 8); \
293  u2 = _mm_srli_epi16(_mm_slli_epi16(LOAD_SI128((const __m128i*)(u_ptr+16)), 8), 8); \
294  u = _mm_packus_epi16(u1, u2); \
295  v1 = _mm_srli_epi16(_mm_slli_epi16(LOAD_SI128((const __m128i*)(v_ptr)), 8), 8); \
296  v2 = _mm_srli_epi16(_mm_slli_epi16(LOAD_SI128((const __m128i*)(v_ptr+16)), 8), 8); \
297  v = _mm_packus_epi16(v1, v2); \
298 }
299 
300 #else
301 #error READ_UV unimplemented
302 #endif
303 
/* YUV2RGB_32: convert a tile of 32 pixels on each of two rows.
 * Declares all of its working registers in the expanding scope, then:
 *   1. reads 16 U and 16 V samples (READ_UV);
 *   2. for the low 8 chroma samples: widens to 16 bit, subtracts 128, and
 *      computes the chroma terms once (UV2RGB_16), saving them in
 *      r/g/b_uv_16_* so they can be reused for the second row;
 *   3. adds luma from y_ptr1 (row 1) and then from y_ptr2 (row 2), and
 *      packs each result to unsigned 8 bit with saturation;
 *   4. repeats steps 2-3 for the high 8 chroma samples and the luma
 *      16 pixels further along each row.
 * Outputs are r/g/b_8_<row><half>: e.g. r_8_12 is red, row 1, second 16
 * pixels. PACK_PIXEL consumes those names.
 * Requires param, y_ptr1, y_ptr2, u_ptr, v_ptr and y_pixel_stride in scope.
 * Both rows are always read, regardless of whether the caller stores the
 * second one. */
304 #define YUV2RGB_32 \
305  __m128i r_tmp, g_tmp, b_tmp; \
306  __m128i r_16_1, g_16_1, b_16_1, r_16_2, g_16_2, b_16_2; \
307  __m128i r_uv_16_1, g_uv_16_1, b_uv_16_1, r_uv_16_2, g_uv_16_2, b_uv_16_2; \
308  __m128i y_16_1, y_16_2; \
309  __m128i y, u, v, u_16, v_16; \
310  __m128i r_8_11, g_8_11, b_8_11, r_8_21, g_8_21, b_8_21; \
311  __m128i r_8_12, g_8_12, b_8_12, r_8_22, g_8_22, b_8_22; \
312  \
313  READ_UV \
314  \
315  /* process first 16 pixels of first line */\
316  u_16 = _mm_unpacklo_epi8(u, _mm_setzero_si128()); \
317  v_16 = _mm_unpacklo_epi8(v, _mm_setzero_si128()); \
318  u_16 = _mm_add_epi16(u_16, _mm_set1_epi16(-128)); \
319  v_16 = _mm_add_epi16(v_16, _mm_set1_epi16(-128)); \
320  \
321  UV2RGB_16(u_16, v_16, r_16_1, g_16_1, b_16_1, r_16_2, g_16_2, b_16_2) \
322  r_uv_16_1=r_16_1; g_uv_16_1=g_16_1; b_uv_16_1=b_16_1; \
323  r_uv_16_2=r_16_2; g_uv_16_2=g_16_2; b_uv_16_2=b_16_2; \
324  \
325  READ_Y(y_ptr1) \
326  y_16_1 = _mm_unpacklo_epi8(y, _mm_setzero_si128()); \
327  y_16_2 = _mm_unpackhi_epi8(y, _mm_setzero_si128()); \
328  \
329  ADD_Y2RGB_16(y_16_1, y_16_2, r_16_1, g_16_1, b_16_1, r_16_2, g_16_2, b_16_2) \
330  \
331  r_8_11 = _mm_packus_epi16(r_16_1, r_16_2); \
332  g_8_11 = _mm_packus_epi16(g_16_1, g_16_2); \
333  b_8_11 = _mm_packus_epi16(b_16_1, b_16_2); \
334  \
335  /* process first 16 pixels of second line */\
336  r_16_1=r_uv_16_1; g_16_1=g_uv_16_1; b_16_1=b_uv_16_1; \
337  r_16_2=r_uv_16_2; g_16_2=g_uv_16_2; b_16_2=b_uv_16_2; \
338  \
339  READ_Y(y_ptr2) \
340  y_16_1 = _mm_unpacklo_epi8(y, _mm_setzero_si128()); \
341  y_16_2 = _mm_unpackhi_epi8(y, _mm_setzero_si128()); \
342  \
343  ADD_Y2RGB_16(y_16_1, y_16_2, r_16_1, g_16_1, b_16_1, r_16_2, g_16_2, b_16_2) \
344  \
345  r_8_21 = _mm_packus_epi16(r_16_1, r_16_2); \
346  g_8_21 = _mm_packus_epi16(g_16_1, g_16_2); \
347  b_8_21 = _mm_packus_epi16(b_16_1, b_16_2); \
348  \
349  /* process last 16 pixels of first line */\
350  u_16 = _mm_unpackhi_epi8(u, _mm_setzero_si128()); \
351  v_16 = _mm_unpackhi_epi8(v, _mm_setzero_si128()); \
352  u_16 = _mm_add_epi16(u_16, _mm_set1_epi16(-128)); \
353  v_16 = _mm_add_epi16(v_16, _mm_set1_epi16(-128)); \
354  \
355  UV2RGB_16(u_16, v_16, r_16_1, g_16_1, b_16_1, r_16_2, g_16_2, b_16_2) \
356  r_uv_16_1=r_16_1; g_uv_16_1=g_16_1; b_uv_16_1=b_16_1; \
357  r_uv_16_2=r_16_2; g_uv_16_2=g_16_2; b_uv_16_2=b_16_2; \
358  \
359  READ_Y(y_ptr1+16*y_pixel_stride) \
360  y_16_1 = _mm_unpacklo_epi8(y, _mm_setzero_si128()); \
361  y_16_2 = _mm_unpackhi_epi8(y, _mm_setzero_si128()); \
362  \
363  ADD_Y2RGB_16(y_16_1, y_16_2, r_16_1, g_16_1, b_16_1, r_16_2, g_16_2, b_16_2) \
364  \
365  r_8_12 = _mm_packus_epi16(r_16_1, r_16_2); \
366  g_8_12 = _mm_packus_epi16(g_16_1, g_16_2); \
367  b_8_12 = _mm_packus_epi16(b_16_1, b_16_2); \
368  \
369  /* process last 16 pixels of second line */\
370  r_16_1=r_uv_16_1; g_16_1=g_uv_16_1; b_16_1=b_uv_16_1; \
371  r_16_2=r_uv_16_2; g_16_2=g_uv_16_2; b_16_2=b_uv_16_2; \
372  \
373  READ_Y(y_ptr2+16*y_pixel_stride) \
374  y_16_1 = _mm_unpacklo_epi8(y, _mm_setzero_si128()); \
375  y_16_2 = _mm_unpackhi_epi8(y, _mm_setzero_si128()); \
376  \
377  ADD_Y2RGB_16(y_16_1, y_16_2, r_16_1, g_16_1, b_16_1, r_16_2, g_16_2, b_16_2) \
378  \
379  r_8_22 = _mm_packus_epi16(r_16_1, r_16_2); \
380  g_8_22 = _mm_packus_epi16(g_16_1, g_16_2); \
381  b_8_22 = _mm_packus_epi16(b_16_1, b_16_2); \
382  \
383 
384 
386  const uint8_t *Y, const uint8_t *U, const uint8_t *V, uint32_t Y_stride, uint32_t UV_stride,
387  uint8_t *RGB, uint32_t RGB_stride,
388  YCbCrType yuv_type)
389 {
390  const YUV2RGBParam *const param = &(YUV2RGB[yuv_type]);
391 #if YUV_FORMAT == YUV_FORMAT_420
392  const int y_pixel_stride = 1;
393  const int uv_pixel_stride = 1;
394  const int uv_x_sample_interval = 2;
395  const int uv_y_sample_interval = 2;
396 #elif YUV_FORMAT == YUV_FORMAT_422
397  const int y_pixel_stride = 2;
398  const int uv_pixel_stride = 4;
399  const int uv_x_sample_interval = 2;
400  const int uv_y_sample_interval = 1;
401 #elif YUV_FORMAT == YUV_FORMAT_NV12
402  const int y_pixel_stride = 1;
403  const int uv_pixel_stride = 2;
404  const int uv_x_sample_interval = 2;
405  const int uv_y_sample_interval = 2;
406 #endif
407 #if RGB_FORMAT == RGB_FORMAT_RGB565
408  const int rgb_pixel_stride = 2;
409 #elif RGB_FORMAT == RGB_FORMAT_RGB24
410  const int rgb_pixel_stride = 3;
411 #elif RGB_FORMAT == RGB_FORMAT_RGBA || RGB_FORMAT == RGB_FORMAT_BGRA || \
412  RGB_FORMAT == RGB_FORMAT_ARGB || RGB_FORMAT == RGB_FORMAT_ABGR
413  const int rgb_pixel_stride = 4;
414 #else
415 #error Unknown RGB pixel size
416 #endif
417 
418  if (width >= 32) {
419  uint32_t xpos, ypos;
420  for(ypos=0; ypos<(height-(uv_y_sample_interval-1)); ypos+=uv_y_sample_interval)
421  {
422  const uint8_t *y_ptr1=Y+ypos*Y_stride,
423  *y_ptr2=Y+(ypos+1)*Y_stride,
424  *u_ptr=U+(ypos/uv_y_sample_interval)*UV_stride,
425  *v_ptr=V+(ypos/uv_y_sample_interval)*UV_stride;
426 
427  uint8_t *rgb_ptr1=RGB+ypos*RGB_stride,
428  *rgb_ptr2=RGB+(ypos+1)*RGB_stride;
429 
430  for(xpos=0; xpos<(width-31); xpos+=32)
431  {
432  YUV2RGB_32
433  {
434  PACK_PIXEL
435  SAVE_LINE1
436  if (uv_y_sample_interval > 1)
437  {
438  SAVE_LINE2
439  }
440  }
441 
442  y_ptr1+=32*y_pixel_stride;
443  y_ptr2+=32*y_pixel_stride;
444  u_ptr+=32*uv_pixel_stride/uv_x_sample_interval;
445  v_ptr+=32*uv_pixel_stride/uv_x_sample_interval;
446  rgb_ptr1+=32*rgb_pixel_stride;
447  rgb_ptr2+=32*rgb_pixel_stride;
448  }
449  }
450 
451  /* Catch the last line, if needed */
452  if (uv_y_sample_interval == 2 && ypos == (height-1))
453  {
454  const uint8_t *y_ptr=Y+ypos*Y_stride,
455  *u_ptr=U+(ypos/uv_y_sample_interval)*UV_stride,
456  *v_ptr=V+(ypos/uv_y_sample_interval)*UV_stride;
457 
458  uint8_t *rgb_ptr=RGB+ypos*RGB_stride;
459 
460  STD_FUNCTION_NAME(width, 1, y_ptr, u_ptr, v_ptr, Y_stride, UV_stride, rgb_ptr, RGB_stride, yuv_type);
461  }
462  }
463 
464  /* Catch the right column, if needed */
465  {
466  int converted = (width & ~31);
467  if (converted != width)
468  {
469  const uint8_t *y_ptr=Y+converted*y_pixel_stride,
470  *u_ptr=U+converted*uv_pixel_stride/uv_x_sample_interval,
471  *v_ptr=V+converted*uv_pixel_stride/uv_x_sample_interval;
472 
473  uint8_t *rgb_ptr=RGB+converted*rgb_pixel_stride;
474 
475  STD_FUNCTION_NAME(width-converted, height, y_ptr, u_ptr, v_ptr, Y_stride, UV_stride, rgb_ptr, RGB_stride, yuv_type);
476  }
477  }
478 }
479 
/* Remove every macro this file consumed or defined, so that it can be
   included again with a different SSE_FUNCTION_NAME / YUV_FORMAT /
   RGB_FORMAT and leaves nothing behind in the including translation unit. */
#undef SSE_FUNCTION_NAME
#undef STD_FUNCTION_NAME
#undef YUV_FORMAT
#undef RGB_FORMAT
#undef SSE_ALIGNED
#undef LOAD_SI128
#undef SAVE_SI128
#undef UV2RGB_16
#undef ADD_Y2RGB_16
#undef PACK_RGB565_32
#undef PACK_RGB24_32_STEP1
#undef PACK_RGB24_32_STEP2
#undef PACK_RGB24_32
#undef PACK_RGBA_32
#undef PACK_PIXEL
#undef SAVE_LINE1
#undef SAVE_LINE2
#undef READ_Y
#undef READ_UV
#undef YUV2RGB_32
Definition: edid.h:20
void SSE_FUNCTION_NAME(uint32_t width, uint32_t height, const uint8_t *Y, const uint8_t *U, const uint8_t *V, uint32_t Y_stride, uint32_t UV_stride, uint8_t *RGB, uint32_t RGB_stride, YCbCrType yuv_type)
#define SAVE_LINE2
GLint GLint GLsizei width
Definition: SDL_opengl.h:1572
#define YUV2RGB_32
#define PACK_PIXEL
void STD_FUNCTION_NAME(uint32_t width, uint32_t height, const uint8_t *Y, const uint8_t *U, const uint8_t *V, uint32_t Y_stride, uint32_t UV_stride, uint8_t *RGB, uint32_t RGB_stride, YCbCrType yuv_type)
static const YUV2RGBParam YUV2RGB[3]
Definition: yuv_rgb.c:42
YCbCrType
Definition: yuv_rgb.h:22
unsigned char uint8_t
unsigned int uint32_t
GLint GLint GLsizei GLsizei height
Definition: SDL_opengl.h:1572
#define V(value)
Definition: yuv_rgb.c:35
GLfloat param
#define SAVE_LINE1