71 #ifndef INCLUDED_volk_32f_x2_subtract_32f_a_H
72 #define INCLUDED_volk_32f_x2_subtract_32f_a_H
#ifdef LV_HAVE_AVX512F
#include <immintrin.h>

/*
 * Element-wise single-precision subtraction, AVX-512F, aligned variant:
 *
 *     cVector[i] = aVector[i] - bVector[i]    for 0 <= i < num_points
 *
 * cVector    - output buffer
 * aVector    - minuend input buffer
 * bVector    - subtrahend input buffer
 * num_points - number of floats to process
 *
 * The vector loop uses _mm512_load_ps / _mm512_store_ps, which require
 * 64-byte aligned addresses, so all three buffers must be 64-byte aligned.
 */
static inline void volk_32f_x2_subtract_32f_a_avx512f(float* cVector,
                                                      const float* aVector,
                                                      const float* bVector,
                                                      unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;

    __m512 aVal, bVal, cVal;
    for (; number < sixteenthPoints; number++) {
        aVal = _mm512_load_ps(aPtr);
        bVal = _mm512_load_ps(bPtr);

        cVal = _mm512_sub_ps(aVal, bVal);

        _mm512_store_ps(cPtr, cVal);

        /* Move every cursor to the next group of 16 floats; without this the
         * loop would keep rewriting the first group and the scalar tail below
         * would start at the wrong offset. */
        aPtr += 16;
        bPtr += 16;
        cPtr += 16;
    }

    /* Scalar tail: the last num_points % 16 elements. */
    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        *cPtr++ = (*aPtr++) - (*bPtr++);
    }
}
#endif /* LV_HAVE_AVX512F */
/*
 * NOTE(review): the LV_HAVE_AVX guard, the function name and the cVector
 * parameter are restored here by analogy with the AVX-512F kernels in this
 * header (LV_HAVE_AVX512F / volk_32f_x2_subtract_32f_a_avx512f) - confirm the
 * exact spelling against the kernel dispatcher.
 */
#ifdef LV_HAVE_AVX
#include <immintrin.h>

/*
 * Element-wise single-precision subtraction, AVX, aligned variant:
 *
 *     cVector[i] = aVector[i] - bVector[i]    for 0 <= i < num_points
 *
 * cVector    - output buffer
 * aVector    - minuend input buffer
 * bVector    - subtrahend input buffer
 * num_points - number of floats to process
 *
 * The vector loop uses _mm256_load_ps / _mm256_store_ps, which require
 * 32-byte aligned addresses, so all three buffers must be 32-byte aligned.
 */
static inline void volk_32f_x2_subtract_32f_a_avx(float* cVector,
                                                  const float* aVector,
                                                  const float* bVector,
                                                  unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;

    __m256 aVal, bVal, cVal;
    for (; number < eighthPoints; number++) {
        aVal = _mm256_load_ps(aPtr);
        bVal = _mm256_load_ps(bPtr);

        cVal = _mm256_sub_ps(aVal, bVal);

        _mm256_store_ps(cPtr, cVal);

        /* Move every cursor to the next group of 8 floats so the loop makes
         * progress and the scalar tail starts right after the vector part. */
        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    /* Scalar tail: the last num_points % 8 elements. */
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *cPtr++ = (*aPtr++) - (*bPtr++);
    }
}
#endif /* LV_HAVE_AVX */
/*
 * NOTE(review): the LV_HAVE_SSE guard, the function name and the cVector
 * parameter are restored here by analogy with the AVX-512F kernels in this
 * header (LV_HAVE_AVX512F / volk_32f_x2_subtract_32f_a_avx512f) - confirm the
 * exact spelling against the kernel dispatcher.
 */
#ifdef LV_HAVE_SSE
#include <xmmintrin.h>

/*
 * Element-wise single-precision subtraction, SSE, aligned variant:
 *
 *     cVector[i] = aVector[i] - bVector[i]    for 0 <= i < num_points
 *
 * cVector    - output buffer
 * aVector    - minuend input buffer
 * bVector    - subtrahend input buffer
 * num_points - number of floats to process
 *
 * The vector loop uses _mm_load_ps / _mm_store_ps, which require 16-byte
 * aligned addresses, so all three buffers must be 16-byte aligned.
 */
static inline void volk_32f_x2_subtract_32f_a_sse(float* cVector,
                                                  const float* aVector,
                                                  const float* bVector,
                                                  unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;

    __m128 aVal, bVal, cVal;
    for (; number < quarterPoints; number++) {
        aVal = _mm_load_ps(aPtr);
        bVal = _mm_load_ps(bPtr);

        cVal = _mm_sub_ps(aVal, bVal);

        _mm_store_ps(cPtr, cVal);

        /* Move every cursor to the next group of 4 floats so the loop makes
         * progress and the scalar tail starts right after the vector part. */
        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    /* Scalar tail: the last num_points % 4 elements. */
    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *cPtr++ = (*aPtr++) - (*bPtr++);
    }
}
#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_GENERIC

/*
 * Portable scalar element-wise single-precision subtraction:
 *
 *     cVector[i] = aVector[i] - bVector[i]    for 0 <= i < num_points
 *
 * cVector    - output buffer
 * aVector    - minuend input buffer
 * bVector    - subtrahend input buffer
 * num_points - number of floats to process
 *
 * NOTE(review): the function name and the cVector parameter are restored by
 * analogy with the other kernels in this header
 * (volk_32f_x2_subtract_32f_a_avx512f and friends) - confirm the exact
 * spelling against the kernel dispatcher.
 */
static inline void volk_32f_x2_subtract_32f_generic(float* cVector,
                                                    const float* aVector,
                                                    const float* bVector,
                                                    unsigned int num_points)
{
    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;
    unsigned int number = 0;

    for (number = 0; number < num_points; number++) {
        *cPtr++ = (*aPtr++) - (*bPtr++);
    }
}
#endif /* LV_HAVE_GENERIC */
/*
 * NOTE(review): the LV_HAVE_NEON guard, the function name and the cVector
 * parameter are restored here by analogy with the other kernels in this
 * header (LV_HAVE_AVX512F / volk_32f_x2_subtract_32f_a_avx512f) - confirm the
 * exact spelling against the kernel dispatcher. The guard is needed because
 * <arm_neon.h> does not exist on non-ARM toolchains.
 */
#ifdef LV_HAVE_NEON
#include <arm_neon.h>

/*
 * Element-wise single-precision subtraction using NEON:
 *
 *     cVector[i] = aVector[i] - bVector[i]    for 0 <= i < num_points
 *
 * cVector    - output buffer
 * aVector    - minuend input buffer
 * bVector    - subtrahend input buffer
 * num_points - number of floats to process
 */
static inline void volk_32f_x2_subtract_32f_neon(float* cVector,
                                                 const float* aVector,
                                                 const float* bVector,
                                                 unsigned int num_points)
{
    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;
    unsigned int number = 0;
    unsigned int quarter_points = num_points / 4;

    float32x4_t a_vec, b_vec, c_vec;

    for (number = 0; number < quarter_points; number++) {
        a_vec = vld1q_f32(aPtr);
        b_vec = vld1q_f32(bPtr);
        c_vec = vsubq_f32(a_vec, b_vec);
        vst1q_f32(cPtr, c_vec);

        /* Move every cursor to the next group of 4 floats so the loop makes
         * progress and the scalar tail starts right after the vector part. */
        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    /* Scalar tail: the last num_points % 4 elements. */
    for (number = quarter_points * 4; number < num_points; number++) {
        *cPtr++ = (*aPtr++) - (*bPtr++);
    }
}
#endif /* LV_HAVE_NEON */
/*
 * NOTE(review): the LV_HAVE_ORC guard is restored here by analogy with the
 * LV_HAVE_* guards around the other kernels in this header - confirm the
 * macro name against the build configuration.
 */
#ifdef LV_HAVE_ORC

/* Subtraction kernel defined outside this header; only declared here. */
extern void volk_32f_x2_subtract_32f_a_orc_impl(float* cVector,
                                                const float* aVector,
                                                const float* bVector,
                                                unsigned int num_points);

/*
 * Thin wrapper that forwards all four arguments, unchanged, to
 * volk_32f_x2_subtract_32f_a_orc_impl.
 *
 * cVector    - output buffer
 * aVector    - first input buffer
 * bVector    - second input buffer
 * num_points - number of floats to process
 */
static inline void volk_32f_x2_subtract_32f_u_orc(float* cVector,
                                                  const float* aVector,
                                                  const float* bVector,
                                                  unsigned int num_points)
{
    volk_32f_x2_subtract_32f_a_orc_impl(cVector, aVector, bVector, num_points);
}
#endif /* LV_HAVE_ORC */
260 #ifndef INCLUDED_volk_32f_x2_subtract_32f_u_H
261 #define INCLUDED_volk_32f_x2_subtract_32f_u_H
263 #include <inttypes.h>
#ifdef LV_HAVE_AVX512F
#include <immintrin.h>

/*
 * Element-wise single-precision subtraction, AVX-512F, unaligned variant:
 *
 *     cVector[i] = aVector[i] - bVector[i]    for 0 <= i < num_points
 *
 * cVector    - output buffer
 * aVector    - minuend input buffer
 * bVector    - subtrahend input buffer
 * num_points - number of floats to process
 *
 * The vector loop uses _mm512_loadu_ps / _mm512_storeu_ps, so the buffers do
 * not need any particular alignment.
 */
static inline void volk_32f_x2_subtract_32f_u_avx512f(float* cVector,
                                                      const float* aVector,
                                                      const float* bVector,
                                                      unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;

    __m512 aVal, bVal, cVal;
    for (; number < sixteenthPoints; number++) {
        aVal = _mm512_loadu_ps(aPtr);
        bVal = _mm512_loadu_ps(bPtr);

        cVal = _mm512_sub_ps(aVal, bVal);

        _mm512_storeu_ps(cPtr, cVal);

        /* Move every cursor to the next group of 16 floats; without this the
         * loop would keep rewriting the first group and the scalar tail below
         * would start at the wrong offset. */
        aPtr += 16;
        bPtr += 16;
        cPtr += 16;
    }

    /* Scalar tail: the last num_points % 16 elements. */
    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        *cPtr++ = (*aPtr++) - (*bPtr++);
    }
}
#endif /* LV_HAVE_AVX512F */
305 #include <immintrin.h>
308 const float* aVector,
309 const float* bVector,
310 unsigned int num_points)
312 unsigned int number = 0;
313 const unsigned int eighthPoints = num_points / 8;
315 float* cPtr = cVector;
316 const float* aPtr = aVector;
317 const float* bPtr = bVector;
319 __m256 aVal, bVal, cVal;
320 for (; number < eighthPoints; number++) {
322 aVal = _mm256_loadu_ps(aPtr);
323 bVal = _mm256_loadu_ps(bPtr);
325 cVal = _mm256_sub_ps(aVal, bVal);
327 _mm256_storeu_ps(cPtr, cVal);
334 number = eighthPoints * 8;
335 for (; number < num_points; number++) {
336 *cPtr++ = (*aPtr++) - (*bPtr++);