SDL  2.0
SDL_audiotypecvt.c
Go to the documentation of this file.
1 /*
2  Simple DirectMedia Layer
3  Copyright (C) 1997-2017 Sam Lantinga <slouken@libsdl.org>
4 
5  This software is provided 'as-is', without any express or implied
6  warranty. In no event will the authors be held liable for any damages
7  arising from the use of this software.
8 
9  Permission is granted to anyone to use this software for any purpose,
10  including commercial applications, and to alter it and redistribute it
11  freely, subject to the following restrictions:
12 
13  1. The origin of this software must not be misrepresented; you must not
14  claim that you wrote the original software. If you use this software
15  in a product, an acknowledgment in the product documentation would be
16  appreciated but is not required.
17  2. Altered source versions must be plainly marked as such, and must not be
18  misrepresented as being the original software.
19  3. This notice may not be removed or altered from any source distribution.
20 */
21 
22 #include "../SDL_internal.h"
23 #include "SDL_audio.h"
24 #include "SDL_audio_c.h"
25 #include "SDL_cpuinfo.h"
26 #include "SDL_assert.h"
27 
28 /* !!! FIXME: write NEON code. */
/* Hard-wired to 0, so every NEON test further down is currently false. */
29 #define HAVE_NEON_INTRINSICS 0
30 
/* Defined (as 1) only when the compiler targets SSE2. Where it stays undefined,
   the #if tests in this file evaluate it as 0. */
31 #ifdef __SSE2__
32 #define HAVE_SSE2_INTRINSICS 1
33 #endif
34 
/* Platforms on which a SIMD path is always available do not need the scalar
   converters compiled in at all. */
35 #if defined(__x86_64__) && HAVE_SSE2_INTRINSICS
36 #define NEED_SCALAR_CONVERTER_FALLBACKS 0 /* x86_64 guarantees SSE2. */
37 #elif __MACOSX__ && HAVE_SSE2_INTRINSICS
38 #define NEED_SCALAR_CONVERTER_FALLBACKS 0 /* Mac OS X/Intel guarantees SSE2. */
39 #elif defined(__ARM_ARCH) && (__ARM_ARCH >= 8) && HAVE_NEON_INTRINSICS
40 #define NEED_SCALAR_CONVERTER_FALLBACKS 0 /* ARMv8+ promise NEON. */
41 #elif defined(__APPLE__) && defined(__ARM_ARCH) && (__ARM_ARCH >= 7) && HAVE_NEON_INTRINSICS
42 #define NEED_SCALAR_CONVERTER_FALLBACKS 0 /* All Apple ARMv7 chips promise NEON support. */
43 #endif
44 
45 /* Default: build the scalar converters. This stays 1 unless one of the cases
   above already set it to zero because the platform is guaranteed to use a
   SIMD codepath here. */
46 #ifndef NEED_SCALAR_CONVERTER_FALLBACKS
47 #define NEED_SCALAR_CONVERTER_FALLBACKS 1
48 #endif
49 
50 /* Function pointers set to a CPU-specific implementation. */
61 
62 
/* Reciprocal scale factors: multiplying by one of these is the same as
   dividing by 128, 32768 or 2147483648 respectively. */
63 #define DIVBY128 0.0078125f
64 #define DIVBY32768 0.000030517578125f
/* No 'f' suffix on this one: it is a double constant, and the 32-bit
   converters below multiply by it in double precision. */
65 #define DIVBY2147483648 0.00000000046566128730773926
66 
67 
68 #if NEED_SCALAR_CONVERTER_FALLBACKS
69 static void SDLCALL
71 {
72  const Sint8 *src = ((const Sint8 *) (cvt->buf + cvt->len_cvt)) - 1;
73  float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1;
74  int i;
75 
76  LOG_DEBUG_CONVERT("AUDIO_S8", "AUDIO_F32");
77 
78  for (i = cvt->len_cvt; i; --i, --src, --dst) {
79  *dst = ((float) *src) * DIVBY128;
80  }
81 
82  cvt->len_cvt *= 4;
83  if (cvt->filters[++cvt->filter_index]) {
84  cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
85  }
86 }
87 
88 static void SDLCALL
90 {
91  const Uint8 *src = ((const Uint8 *) (cvt->buf + cvt->len_cvt)) - 1;
92  float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1;
93  int i;
94 
95  LOG_DEBUG_CONVERT("AUDIO_U8", "AUDIO_F32");
96 
97  for (i = cvt->len_cvt; i; --i, --src, --dst) {
98  *dst = (((float) *src) * DIVBY128) - 1.0f;
99  }
100 
101  cvt->len_cvt *= 4;
102  if (cvt->filters[++cvt->filter_index]) {
103  cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
104  }
105 }
106 
107 static void SDLCALL
109 {
110  const Sint16 *src = ((const Sint16 *) (cvt->buf + cvt->len_cvt)) - 1;
111  float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1;
112  int i;
113 
114  LOG_DEBUG_CONVERT("AUDIO_S16", "AUDIO_F32");
115 
116  for (i = cvt->len_cvt / sizeof (Sint16); i; --i, --src, --dst) {
117  *dst = ((float) *src) * DIVBY32768;
118  }
119 
120  cvt->len_cvt *= 2;
121  if (cvt->filters[++cvt->filter_index]) {
122  cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
123  }
124 }
125 
126 static void SDLCALL
128 {
129  const Uint16 *src = ((const Uint16 *) (cvt->buf + cvt->len_cvt)) - 1;
130  float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1;
131  int i;
132 
133  LOG_DEBUG_CONVERT("AUDIO_U16", "AUDIO_F32");
134 
135  for (i = cvt->len_cvt / sizeof (Uint16); i; --i, --src, --dst) {
136  *dst = (((float) *src) * DIVBY32768) - 1.0f;
137  }
138 
139  cvt->len_cvt *= 2;
140  if (cvt->filters[++cvt->filter_index]) {
141  cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
142  }
143 }
144 
145 static void SDLCALL
147 {
148  const Sint32 *src = (const Sint32 *) cvt->buf;
149  float *dst = (float *) cvt->buf;
150  int i;
151 
152  LOG_DEBUG_CONVERT("AUDIO_S32", "AUDIO_F32");
153 
154  for (i = cvt->len_cvt / sizeof (Sint32); i; --i, ++src, ++dst) {
155  *dst = (float) (((double) *src) * DIVBY2147483648);
156  }
157 
158  if (cvt->filters[++cvt->filter_index]) {
159  cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
160  }
161 }
162 
163 static void SDLCALL
165 {
166  const float *src = (const float *) cvt->buf;
167  Sint8 *dst = (Sint8 *) cvt->buf;
168  int i;
169 
170  LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S8");
171 
172  for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) {
173  *dst = (Sint8) (*src * 127.0f);
174  }
175 
176  cvt->len_cvt /= 4;
177  if (cvt->filters[++cvt->filter_index]) {
178  cvt->filters[cvt->filter_index](cvt, AUDIO_S8);
179  }
180 }
181 
182 static void SDLCALL
184 {
185  const float *src = (const float *) cvt->buf;
186  Uint8 *dst = (Uint8 *) cvt->buf;
187  int i;
188 
189  LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U8");
190 
191  for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) {
192  *dst = (Uint8) ((*src + 1.0f) * 127.0f);
193  }
194 
195  cvt->len_cvt /= 4;
196  if (cvt->filters[++cvt->filter_index]) {
197  cvt->filters[cvt->filter_index](cvt, AUDIO_U8);
198  }
199 }
200 
201 static void SDLCALL
203 {
204  const float *src = (const float *) cvt->buf;
205  Sint16 *dst = (Sint16 *) cvt->buf;
206  int i;
207 
208  LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S16");
209 
210  for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) {
211  *dst = (Sint16) (*src * 32767.0f);
212  }
213 
214  cvt->len_cvt /= 2;
215  if (cvt->filters[++cvt->filter_index]) {
216  cvt->filters[cvt->filter_index](cvt, AUDIO_S16SYS);
217  }
218 }
219 
220 static void SDLCALL
222 {
223  const float *src = (const float *) cvt->buf;
224  Uint16 *dst = (Uint16 *) cvt->buf;
225  int i;
226 
227  LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U16");
228 
229  for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) {
230  *dst = (Uint16) ((*src + 1.0f) * 32767.0f);
231  }
232 
233  cvt->len_cvt /= 2;
234  if (cvt->filters[++cvt->filter_index]) {
235  cvt->filters[cvt->filter_index](cvt, AUDIO_U16SYS);
236  }
237 }
238 
239 static void SDLCALL
241 {
242  const float *src = (const float *) cvt->buf;
243  Sint32 *dst = (Sint32 *) cvt->buf;
244  int i;
245 
246  LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S32");
247 
248  for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) {
249  *dst = (Sint32) (((double) *src) * 2147483647.0);
250  }
251 
252  if (cvt->filters[++cvt->filter_index]) {
253  cvt->filters[cvt->filter_index](cvt, AUDIO_S32SYS);
254  }
255 }
256 #endif
257 
258 
259 #if HAVE_SSE2_INTRINSICS
260 static void SDLCALL
261 SDL_Convert_S8_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
262 {
263  const Sint8 *src = ((const Sint8 *) (cvt->buf + cvt->len_cvt)) - 1;
264  float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1;
265  int i;
266 
267  LOG_DEBUG_CONVERT("AUDIO_S8", "AUDIO_F32 (using SSE2)");
268 
269  /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
270  for (i = cvt->len_cvt; i && (((size_t) (dst-15)) & 15); --i, --src, --dst) {
271  *dst = ((float) *src) * DIVBY128;
272  }
273 
274  src -= 15; dst -= 15; /* adjust to read SSE blocks from the start. */
275  SDL_assert(!i || ((((size_t) dst) & 15) == 0));
276 
277  /* Make sure src is aligned too. */
278  if ((((size_t) src) & 15) == 0) {
279  /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
280  const __m128i *mmsrc = (const __m128i *) src;
281  const __m128i zero = _mm_setzero_si128();
282  const __m128 divby128 = _mm_set1_ps(DIVBY128);
283  while (i >= 16) { /* 16 * 8-bit */
284  const __m128i bytes = _mm_load_si128(mmsrc); /* get 16 sint8 into an XMM register. */
285  /* treat as int16, shift left to clear every other sint16, then back right with sign-extend. Now sint16. */
286  const __m128i shorts1 = _mm_srai_epi16(_mm_slli_epi16(bytes, 8), 8);
287  /* right-shift-sign-extend gets us sint16 with the other set of values. */
288  const __m128i shorts2 = _mm_srai_epi16(bytes, 8);
289  /* unpack against zero to make these int32, shift to make them sign-extend, convert to float, multiply. Whew! */
290  const __m128 floats1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpacklo_epi16(shorts1, zero), 16), 16)), divby128);
291  const __m128 floats2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpacklo_epi16(shorts2, zero), 16), 16)), divby128);
292  const __m128 floats3 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpackhi_epi16(shorts1, zero), 16), 16)), divby128);
293  const __m128 floats4 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpackhi_epi16(shorts2, zero), 16), 16)), divby128);
294  /* Interleave back into correct order, store. */
295  _mm_store_ps(dst, _mm_unpacklo_ps(floats1, floats2));
296  _mm_store_ps(dst+4, _mm_unpackhi_ps(floats1, floats2));
297  _mm_store_ps(dst+8, _mm_unpacklo_ps(floats3, floats4));
298  _mm_store_ps(dst+12, _mm_unpackhi_ps(floats3, floats4));
299  i -= 16; mmsrc--; dst -= 16;
300  }
301 
302  src = (const Sint8 *) mmsrc;
303  }
304 
305  src += 15; dst += 15; /* adjust for any scalar finishing. */
306 
307  /* Finish off any leftovers with scalar operations. */
308  while (i) {
309  *dst = ((float) *src) * DIVBY128;
310  i--; src--; dst--;
311  }
312 
313  cvt->len_cvt *= 4;
314  if (cvt->filters[++cvt->filter_index]) {
315  cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
316  }
317 }
318 
319 static void SDLCALL
320 SDL_Convert_U8_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
321 {
322  const Uint8 *src = ((const Uint8 *) (cvt->buf + cvt->len_cvt)) - 1;
323  float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1;
324  int i;
325 
326  LOG_DEBUG_CONVERT("AUDIO_U8", "AUDIO_F32 (using SSE2)");
327 
328  /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
329  for (i = cvt->len_cvt; i && (((size_t) (dst-15)) & 15); --i, --src, --dst) {
330  *dst = (((float) *src) * DIVBY128) - 1.0f;
331  }
332 
333  src -= 15; dst -= 15; /* adjust to read SSE blocks from the start. */
334  SDL_assert(!i || ((((size_t) dst) & 15) == 0));
335 
336  /* Make sure src is aligned too. */
337  if ((((size_t) src) & 15) == 0) {
338  /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
339  const __m128i *mmsrc = (const __m128i *) src;
340  const __m128i zero = _mm_setzero_si128();
341  const __m128 divby128 = _mm_set1_ps(DIVBY128);
342  const __m128 minus1 = _mm_set1_ps(-1.0f);
343  while (i >= 16) { /* 16 * 8-bit */
344  const __m128i bytes = _mm_load_si128(mmsrc); /* get 16 uint8 into an XMM register. */
345  /* treat as int16, shift left to clear every other sint16, then back right with zero-extend. Now uint16. */
346  const __m128i shorts1 = _mm_srli_epi16(_mm_slli_epi16(bytes, 8), 8);
347  /* right-shift-zero-extend gets us uint16 with the other set of values. */
348  const __m128i shorts2 = _mm_srli_epi16(bytes, 8);
349  /* unpack against zero to make these int32, convert to float, multiply, add. Whew! */
350  /* Note that AVX2 can do floating point multiply+add in one instruction, fwiw. SSE2 cannot. */
351  const __m128 floats1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(shorts1, zero)), divby128), minus1);
352  const __m128 floats2 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(shorts2, zero)), divby128), minus1);
353  const __m128 floats3 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(shorts1, zero)), divby128), minus1);
354  const __m128 floats4 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(shorts2, zero)), divby128), minus1);
355  /* Interleave back into correct order, store. */
356  _mm_store_ps(dst, _mm_unpacklo_ps(floats1, floats2));
357  _mm_store_ps(dst+4, _mm_unpackhi_ps(floats1, floats2));
358  _mm_store_ps(dst+8, _mm_unpacklo_ps(floats3, floats4));
359  _mm_store_ps(dst+12, _mm_unpackhi_ps(floats3, floats4));
360  i -= 16; mmsrc--; dst -= 16;
361  }
362 
363  src = (const Uint8 *) mmsrc;
364  }
365 
366  src += 15; dst += 15; /* adjust for any scalar finishing. */
367 
368  /* Finish off any leftovers with scalar operations. */
369  while (i) {
370  *dst = (((float) *src) * DIVBY128) - 1.0f;
371  i--; src--; dst--;
372  }
373 
374  cvt->len_cvt *= 4;
375  if (cvt->filters[++cvt->filter_index]) {
376  cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
377  }
378 }
379 
380 static void SDLCALL
381 SDL_Convert_S16_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
382 {
383  const Sint16 *src = ((const Sint16 *) (cvt->buf + cvt->len_cvt)) - 1;
384  float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1;
385  int i;
386 
387  LOG_DEBUG_CONVERT("AUDIO_S16", "AUDIO_F32 (using SSE2)");
388 
389  /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
390  for (i = cvt->len_cvt / sizeof (Sint16); i && (((size_t) (dst-7)) & 15); --i, --src, --dst) {
391  *dst = ((float) *src) * DIVBY32768;
392  }
393 
394  src -= 7; dst -= 7; /* adjust to read SSE blocks from the start. */
395  SDL_assert(!i || ((((size_t) dst) & 15) == 0));
396 
397  /* Make sure src is aligned too. */
398  if ((((size_t) src) & 15) == 0) {
399  /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
400  const __m128 divby32768 = _mm_set1_ps(DIVBY32768);
401  while (i >= 8) { /* 8 * 16-bit */
402  const __m128i ints = _mm_load_si128((__m128i const *) src); /* get 8 sint16 into an XMM register. */
403  /* treat as int32, shift left to clear every other sint16, then back right with sign-extend. Now sint32. */
404  const __m128i a = _mm_srai_epi32(_mm_slli_epi32(ints, 16), 16);
405  /* right-shift-sign-extend gets us sint32 with the other set of values. */
406  const __m128i b = _mm_srai_epi32(ints, 16);
407  /* Interleave these back into the right order, convert to float, multiply, store. */
408  _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi32(a, b)), divby32768));
409  _mm_store_ps(dst+4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi32(a, b)), divby32768));
410  i -= 8; src -= 8; dst -= 8;
411  }
412  }
413 
414  src += 7; dst += 7; /* adjust for any scalar finishing. */
415 
416  /* Finish off any leftovers with scalar operations. */
417  while (i) {
418  *dst = ((float) *src) * DIVBY32768;
419  i--; src--; dst--;
420  }
421 
422  cvt->len_cvt *= 2;
423  if (cvt->filters[++cvt->filter_index]) {
424  cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
425  }
426 }
427 
428 static void SDLCALL
429 SDL_Convert_U16_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
430 {
431  const Uint16 *src = ((const Uint16 *) (cvt->buf + cvt->len_cvt)) - 1;
432  float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1;
433  int i;
434 
435  LOG_DEBUG_CONVERT("AUDIO_U16", "AUDIO_F32 (using SSE2)");
436 
437  /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
438  for (i = cvt->len_cvt / sizeof (Sint16); i && (((size_t) (dst-7)) & 15); --i, --src, --dst) {
439  *dst = (((float) *src) * DIVBY32768) - 1.0f;
440  }
441 
442  src -= 7; dst -= 7; /* adjust to read SSE blocks from the start. */
443  SDL_assert(!i || ((((size_t) dst) & 15) == 0));
444 
445  /* Make sure src is aligned too. */
446  if ((((size_t) src) & 15) == 0) {
447  /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
448  const __m128 divby32768 = _mm_set1_ps(DIVBY32768);
449  const __m128 minus1 = _mm_set1_ps(1.0f);
450  while (i >= 8) { /* 8 * 16-bit */
451  const __m128i ints = _mm_load_si128((__m128i const *) src); /* get 8 sint16 into an XMM register. */
452  /* treat as int32, shift left to clear every other sint16, then back right with zero-extend. Now sint32. */
453  const __m128i a = _mm_srli_epi32(_mm_slli_epi32(ints, 16), 16);
454  /* right-shift-sign-extend gets us sint32 with the other set of values. */
455  const __m128i b = _mm_srli_epi32(ints, 16);
456  /* Interleave these back into the right order, convert to float, multiply, store. */
457  _mm_store_ps(dst, _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi32(a, b)), divby32768), minus1));
458  _mm_store_ps(dst+4, _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi32(a, b)), divby32768), minus1));
459  i -= 8; src -= 8; dst -= 8;
460  }
461  }
462 
463  src += 7; dst += 7; /* adjust for any scalar finishing. */
464 
465  /* Finish off any leftovers with scalar operations. */
466  while (i) {
467  *dst = (((float) *src) * DIVBY32768) - 1.0f;
468  i--; src--; dst--;
469  }
470 
471  cvt->len_cvt *= 2;
472  if (cvt->filters[++cvt->filter_index]) {
473  cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
474  }
475 }
476 
477 static void SDLCALL
478 SDL_Convert_S32_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
479 {
480  const Sint32 *src = (const Sint32 *) cvt->buf;
481  float *dst = (float *) cvt->buf;
482  int i;
483 
484  LOG_DEBUG_CONVERT("AUDIO_S32", "AUDIO_F32 (using SSE2)");
485 
486  /* Get dst aligned to 16 bytes */
487  for (i = cvt->len_cvt / sizeof (Sint32); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
488  *dst = (float) (((double) *src) * DIVBY2147483648);
489  }
490 
491  SDL_assert(!i || ((((size_t) dst) & 15) == 0));
492  SDL_assert(!i || ((((size_t) src) & 15) == 0));
493 
494  {
495  /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
496  const __m128d divby2147483648 = _mm_set1_pd(DIVBY2147483648);
497  const __m128i *mmsrc = (const __m128i *) src;
498  while (i >= 4) { /* 4 * sint32 */
499  const __m128i ints = _mm_load_si128(mmsrc);
500  /* bitshift the whole register over, so _mm_cvtepi32_pd can read the top ints in the bottom of the vector. */
501  const __m128d doubles1 = _mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(ints, 8)), divby2147483648);
502  const __m128d doubles2 = _mm_mul_pd(_mm_cvtepi32_pd(ints), divby2147483648);
503  /* convert to float32, bitshift/or to get these into a vector to store. */
504  _mm_store_ps(dst, _mm_castsi128_ps(_mm_or_si128(_mm_slli_si128(_mm_castps_si128(_mm_cvtpd_ps(doubles1)), 8), _mm_castps_si128(_mm_cvtpd_ps(doubles2)))));
505  i -= 4; mmsrc++; dst += 4;
506  }
507  src = (const Sint32 *) mmsrc;
508  }
509 
510  /* Finish off any leftovers with scalar operations. */
511  while (i) {
512  *dst = (float) (((double) *src) * DIVBY2147483648);
513  i--; src++; dst++;
514  }
515 
516  if (cvt->filters[++cvt->filter_index]) {
517  cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
518  }
519 }
520 
521 static void SDLCALL
522 SDL_Convert_F32_to_S8_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
523 {
524  const float *src = (const float *) cvt->buf;
525  Sint8 *dst = (Sint8 *) cvt->buf;
526  int i;
527 
528  LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S8 (using SSE2)");
529 
530  /* Get dst aligned to 16 bytes */
531  for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
532  *dst = (Sint8) (*src * 127.0f);
533  }
534 
535  SDL_assert(!i || ((((size_t) dst) & 15) == 0));
536 
537  /* Make sure src is aligned too. */
538  if ((((size_t) src) & 15) == 0) {
539  /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
540  const __m128 mulby127 = _mm_set1_ps(127.0f);
541  __m128i *mmdst = (__m128i *) dst;
542  while (i >= 16) { /* 16 * float32 */
543  const __m128i ints1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(src), mulby127)); /* load 4 floats, convert to sint32 */
544  const __m128i ints2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(src+4), mulby127)); /* load 4 floats, convert to sint32 */
545  const __m128i ints3 = _mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(src+8), mulby127)); /* load 4 floats, convert to sint32 */
546  const __m128i ints4 = _mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(src+12), mulby127)); /* load 4 floats, convert to sint32 */
547  _mm_store_si128(mmdst, _mm_packs_epi16(_mm_packs_epi32(ints1, ints2), _mm_packs_epi32(ints3, ints4))); /* pack down, store out. */
548  i -= 16; src += 16; mmdst++;
549  }
550  dst = (Sint8 *) mmdst;
551  }
552 
553  /* Finish off any leftovers with scalar operations. */
554  while (i) {
555  *dst = (Sint8) (*src * 127.0f);
556  i--; src++; dst++;
557  }
558 
559  cvt->len_cvt /= 4;
560  if (cvt->filters[++cvt->filter_index]) {
561  cvt->filters[cvt->filter_index](cvt, AUDIO_S8);
562  }
563 }
564 
565 static void SDLCALL
566 SDL_Convert_F32_to_U8_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
567 {
568  const float *src = (const float *) cvt->buf;
569  Uint8 *dst = (Uint8 *) cvt->buf;
570  int i;
571 
572  LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U8 (using SSE2)");
573 
574  /* Get dst aligned to 16 bytes */
575  for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
576  *dst = (Uint8) ((*src + 1.0f) * 127.0f);
577  }
578 
579  SDL_assert(!i || ((((size_t) dst) & 15) == 0));
580 
581  /* Make sure src is aligned too. */
582  if ((((size_t) src) & 15) == 0) {
583  /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
584  const __m128 add1 = _mm_set1_ps(1.0f);
585  const __m128 mulby127 = _mm_set1_ps(127.0f);
586  __m128i *mmdst = (__m128i *) dst;
587  while (i >= 16) { /* 16 * float32 */
588  const __m128i ints1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_add_ps(_mm_load_ps(src), add1), mulby127)); /* load 4 floats, convert to sint32 */
589  const __m128i ints2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_add_ps(_mm_load_ps(src+4), add1), mulby127)); /* load 4 floats, convert to sint32 */
590  const __m128i ints3 = _mm_cvtps_epi32(_mm_mul_ps(_mm_add_ps(_mm_load_ps(src+8), add1), mulby127)); /* load 4 floats, convert to sint32 */
591  const __m128i ints4 = _mm_cvtps_epi32(_mm_mul_ps(_mm_add_ps(_mm_load_ps(src+12), add1), mulby127)); /* load 4 floats, convert to sint32 */
592  _mm_store_si128(mmdst, _mm_packus_epi16(_mm_packs_epi32(ints1, ints2), _mm_packs_epi32(ints3, ints4))); /* pack down, store out. */
593  i -= 16; src += 16; mmdst++;
594  }
595  dst = (Uint8 *) mmdst;
596  }
597 
598  /* Finish off any leftovers with scalar operations. */
599  while (i) {
600  *dst = (Uint8) ((*src + 1.0f) * 127.0f);
601  i--; src++; dst++;
602  }
603 
604  cvt->len_cvt /= 4;
605  if (cvt->filters[++cvt->filter_index]) {
606  cvt->filters[cvt->filter_index](cvt, AUDIO_U8);
607  }
608 }
609 
610 static void SDLCALL
611 SDL_Convert_F32_to_S16_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
612 {
613  const float *src = (const float *) cvt->buf;
614  Sint16 *dst = (Sint16 *) cvt->buf;
615  int i;
616 
617  LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S16 (using SSE2)");
618 
619  /* Get dst aligned to 16 bytes */
620  for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
621  *dst = (Sint16) (*src * 32767.0f);
622  }
623 
624  SDL_assert(!i || ((((size_t) dst) & 15) == 0));
625 
626  /* Make sure src is aligned too. */
627  if ((((size_t) src) & 15) == 0) {
628  /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
629  const __m128 mulby32767 = _mm_set1_ps(32767.0f);
630  __m128i *mmdst = (__m128i *) dst;
631  while (i >= 8) { /* 8 * float32 */
632  const __m128i ints1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(src), mulby32767)); /* load 4 floats, convert to sint32 */
633  const __m128i ints2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(src+4), mulby32767)); /* load 4 floats, convert to sint32 */
634  _mm_store_si128(mmdst, _mm_packs_epi32(ints1, ints2)); /* pack to sint16, store out. */
635  i -= 8; src += 8; mmdst++;
636  }
637  dst = (Sint16 *) mmdst;
638  }
639 
640  /* Finish off any leftovers with scalar operations. */
641  while (i) {
642  *dst = (Sint16) (*src * 32767.0f);
643  i--; src++; dst++;
644  }
645 
646  cvt->len_cvt /= 2;
647  if (cvt->filters[++cvt->filter_index]) {
648  cvt->filters[cvt->filter_index](cvt, AUDIO_S16SYS);
649  }
650 }
651 
652 static void SDLCALL
653 SDL_Convert_F32_to_U16_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
654 {
655  const float *src = (const float *) cvt->buf;
656  Uint16 *dst = (Uint16 *) cvt->buf;
657  int i;
658 
659  LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U16 (using SSE2)");
660 
661  /* Get dst aligned to 16 bytes */
662  for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
663  *dst = (Uint16) ((*src + 1.0f) * 32767.0f);
664  }
665 
666  SDL_assert(!i || ((((size_t) dst) & 15) == 0));
667 
668  /* Make sure src is aligned too. */
669  if ((((size_t) src) & 15) == 0) {
670  /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
671  /* This calculates differently than the scalar path because SSE2 can't
672  pack int32 data down to unsigned int16. _mm_packs_epi32 does signed
673  saturation, so that would corrupt our data. _mm_packus_epi32 exists,
674  but not before SSE 4.1. So we convert from float to sint16, packing
675  that down with legit signed saturation, and then xor the top bit
676  against 1. This results in the correct unsigned 16-bit value, even
677  though it looks like dark magic. */
678  const __m128 mulby32767 = _mm_set1_ps(32767.0f);
679  const __m128i topbit = _mm_set1_epi16(-32768);
680  __m128i *mmdst = (__m128i *) dst;
681  while (i >= 8) { /* 8 * float32 */
682  const __m128i ints1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(src), mulby32767)); /* load 4 floats, convert to sint32 */
683  const __m128i ints2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(src+4), mulby32767)); /* load 4 floats, convert to sint32 */
684  _mm_store_si128(mmdst, _mm_xor_si128(_mm_packs_epi32(ints1, ints2), topbit)); /* pack to sint16, xor top bit, store out. */
685  i -= 8; src += 8; mmdst++;
686  }
687  dst = (Uint16 *) mmdst;
688  }
689 
690  /* Finish off any leftovers with scalar operations. */
691  while (i) {
692  *dst = (Uint16) ((*src + 1.0f) * 32767.0f);
693  i--; src++; dst++;
694  }
695 
696  cvt->len_cvt /= 2;
697  if (cvt->filters[++cvt->filter_index]) {
698  cvt->filters[cvt->filter_index](cvt, AUDIO_U16SYS);
699  }
700 }
701 
702 static void SDLCALL
703 SDL_Convert_F32_to_S32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
704 {
705  const float *src = (const float *) cvt->buf;
706  Sint32 *dst = (Sint32 *) cvt->buf;
707  int i;
708 
709  LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S32 (using SSE2)");
710 
711  /* Get dst aligned to 16 bytes */
712  for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
713  *dst = (Sint32) (((double) *src) * 2147483647.0);
714  }
715 
716  SDL_assert(!i || ((((size_t) dst) & 15) == 0));
717  SDL_assert(!i || ((((size_t) src) & 15) == 0));
718 
719  {
720  /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
721  const __m128d mulby2147483647 = _mm_set1_pd(2147483647.0);
722  __m128i *mmdst = (__m128i *) dst;
723  while (i >= 4) { /* 4 * float32 */
724  const __m128 floats = _mm_load_ps(src);
725  /* bitshift the whole register over, so _mm_cvtps_pd can read the top floats in the bottom of the vector. */
726  const __m128d doubles1 = _mm_mul_pd(_mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(floats), 8))), mulby2147483647);
727  const __m128d doubles2 = _mm_mul_pd(_mm_cvtps_pd(floats), mulby2147483647);
728  _mm_store_si128(mmdst, _mm_or_si128(_mm_slli_si128(_mm_cvtpd_epi32(doubles1), 8), _mm_cvtpd_epi32(doubles2)));
729  i -= 4; src += 4; mmdst++;
730  }
731  dst = (Sint32 *) mmdst;
732  }
733 
734  /* Finish off any leftovers with scalar operations. */
735  while (i) {
736  *dst = (Sint32) (((double) *src) * 2147483647.0);
737  i--; src++; dst++;
738  }
739 
740  if (cvt->filters[++cvt->filter_index]) {
741  cvt->filters[cvt->filter_index](cvt, AUDIO_S32SYS);
742  }
743 }
744 #endif
745 
746 
748 {
749  static SDL_bool converters_chosen = SDL_FALSE;
750 
751  if (converters_chosen) {
752  return;
753  }
754 
755 #define SET_CONVERTER_FUNCS(fntype) \
756  SDL_Convert_S8_to_F32 = SDL_Convert_S8_to_F32_##fntype; \
757  SDL_Convert_U8_to_F32 = SDL_Convert_U8_to_F32_##fntype; \
758  SDL_Convert_S16_to_F32 = SDL_Convert_S16_to_F32_##fntype; \
759  SDL_Convert_U16_to_F32 = SDL_Convert_U16_to_F32_##fntype; \
760  SDL_Convert_S32_to_F32 = SDL_Convert_S32_to_F32_##fntype; \
761  SDL_Convert_F32_to_S8 = SDL_Convert_F32_to_S8_##fntype; \
762  SDL_Convert_F32_to_U8 = SDL_Convert_F32_to_U8_##fntype; \
763  SDL_Convert_F32_to_S16 = SDL_Convert_F32_to_S16_##fntype; \
764  SDL_Convert_F32_to_U16 = SDL_Convert_F32_to_U16_##fntype; \
765  SDL_Convert_F32_to_S32 = SDL_Convert_F32_to_S32_##fntype; \
766  converters_chosen = SDL_TRUE
767 
768 #if HAVE_SSE2_INTRINSICS
769  if (SDL_HasSSE2()) {
770  SET_CONVERTER_FUNCS(SSE2);
771  return;
772  }
773 #endif
774 
775 #if NEED_SCALAR_CONVERTER_FALLBACKS
776  SET_CONVERTER_FUNCS(Scalar);
777 #endif
778 
779 #undef SET_CONVERTER_FUNCS
780 
781  SDL_assert(converters_chosen == SDL_TRUE);
782 }
783 
784 /* vi: set ts=4 sw=4 expandtab: */
#define LOG_DEBUG_CONVERT(from, to)
Definition: SDL_audio_c.h:34
GLenum GLenum dst
Uint8 * buf
Definition: SDL_audio.h:222
static void SDL_Convert_F32_to_S32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
SDL_AudioFilter SDL_Convert_F32_to_U16
SDL_AudioFilter SDL_Convert_F32_to_S16
void SDL_ChooseAudioConverters(void)
int filter_index
Definition: SDL_audio.h:228
SDL_AudioFilter SDL_Convert_U8_to_F32
static void SDL_Convert_S32_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
GLfloat f
Uint16 SDL_AudioFormat
Audio format flags.
Definition: SDL_audio.h:64
GLenum src
#define AUDIO_S16SYS
Definition: SDL_audio.h:123
SDL_AudioFilter SDL_Convert_F32_to_U8
A structure to hold a set of audio conversion filters and buffers.
Definition: SDL_audio.h:216
static void SDL_Convert_U16_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
SDL_AudioFilter SDL_Convert_S16_to_F32
GLint GLint GLsizei GLsizei GLsizei GLint GLenum format
Definition: SDL_opengl.h:1572
unsigned int size_t
#define AUDIO_U8
Definition: SDL_audio.h:89
SDL_AudioFilter filters[SDL_AUDIOCVT_MAX_FILTERS+1]
Definition: SDL_audio.h:227
#define AUDIO_F32SYS
Definition: SDL_audio.h:125
static void SDL_Convert_F32_to_U16_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
int8_t Sint8
A signed 8-bit integer type.
Definition: SDL_stdinc.h:149
static void SDL_Convert_S16_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
uint8_t Uint8
An unsigned 8-bit integer type.
Definition: SDL_stdinc.h:153
static void SDL_Convert_U8_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
static void SDL_Convert_S8_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
int32_t Sint32
A signed 32-bit integer type.
Definition: SDL_stdinc.h:165
static void SDL_Convert_F32_to_S8_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
void(* SDL_AudioFilter)(struct SDL_AudioCVT *cvt, SDL_AudioFormat format)
Definition: SDL_audio.h:183
return Display return Display Bool Bool int int int return Display XEvent Bool(*) XPointer return Display return Display Drawable _Xconst char unsigned int unsigned int return Display Pixmap Pixmap XColor XColor unsigned int unsigned int return Display _Xconst char char int char return Display Visual unsigned int int int char unsigned int unsigned int in i)
Definition: SDL_x11sym.h:50
#define SDL_assert(condition)
Definition: SDL_assert.h:169
#define NULL
Definition: begin_code.h:164
SDL_bool
Definition: SDL_stdinc.h:139
#define DIVBY2147483648
static void SDL_Convert_F32_to_S16_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
#define SDL_HasSSE2
SDL_AudioFilter SDL_Convert_S8_to_F32
#define AUDIO_S32SYS
Definition: SDL_audio.h:124
uint16_t Uint16
An unsigned 16-bit integer type.
Definition: SDL_stdinc.h:161
SDL_AudioFilter SDL_Convert_F32_to_S32
SDL_AudioFilter SDL_Convert_F32_to_S8
static void SDL_Convert_F32_to_U8_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
#define DIVBY32768
#define DIVBY128
SDL_AudioFilter SDL_Convert_U16_to_F32
zero
Definition: e_pow.c:78
GLboolean GLboolean GLboolean GLboolean a
#define AUDIO_S8
Definition: SDL_audio.h:90
SDL_AudioFilter SDL_Convert_S32_to_F32
#define SDLCALL
Definition: SDL_internal.h:45
GLboolean GLboolean GLboolean b
#define SET_CONVERTER_FUNCS(fntype)
#define AUDIO_U16SYS
Definition: SDL_audio.h:122
int16_t Sint16
A signed 16-bit integer type.
Definition: SDL_stdinc.h:157