74 #ifndef INCLUDED_volk_32fc_32f_add_32fc_u_H
75 #define INCLUDED_volk_32fc_32f_add_32fc_u_H
/*
 * Generic (portable C) fragment, compiled only when LV_HAVE_GENERIC is
 * defined.
 *
 * NOTE(review): this excerpt is incomplete - the function name, the leading
 * parameters and the declarations of aPtr and cPtr are not visible here.
 *
 * Visible behaviour: for each of the num_points elements, one value is read
 * through aPtr and one float through bPtr, the two are added, and the sum is
 * written through cPtr.
 */
77 #ifdef LV_HAVE_GENERIC
82 unsigned int num_points)
86 const float* bPtr = bVector;
87 unsigned int number = 0;
/* Element-wise sum; all three pointers are post-incremented once per pass. */
89 for (number = 0; number < num_points; number++) {
90 *cPtr++ = (*aPtr++) + (*bPtr++);
/*
 * AVX fragment that uses the unaligned load/store intrinsics
 * (_mm256_loadu_ps / _mm256_storeu_ps), so no buffer alignment is required
 * by the visible memory accesses.
 *
 * NOTE(review): this excerpt is incomplete - the function name, the leading
 * parameters, the declarations of aPtr, cPtr, zero, tmp1 and tmp2, the value
 * arguments of the two stores and any pointer advances inside the vector
 * loop are not visible here. Confirm them against the full source.
 *
 * Visible behaviour: 8 floats of b are spread out with zeros in between and
 * added to 16 floats read from the a stream, so only every second float of
 * a is changed (presumably the real parts of interleaved complex samples -
 * TODO confirm against the declaration of aPtr).
 */
97 #include <immintrin.h>
101 const float* bVector,
102 unsigned int num_points)
104 unsigned int number = 0;
/* Number of full 8-element groups processed by the vector loop. */
105 const unsigned int eighthPoints = num_points / 8;
109 const float* bPtr = bVector;
111 __m256 aVal1, aVal2, bVal, cVal1, cVal2;
112 __m256 cpx_b1, cpx_b2;
/* All-zero vector; interleaved with b below to pad every second float. */
114 zero = _mm256_setzero_ps();
116 for (; number < eighthPoints; number++) {
/*
 * Two unaligned 8-float loads from the a stream. The "+ 4" is applied in
 * aPtr's own element type before the cast to float*; presumably aPtr points
 * at 2-float complex elements so the second load starts 8 floats later -
 * TODO confirm.
 */
118 aVal1 = _mm256_loadu_ps((
float*)aPtr);
119 aVal2 = _mm256_loadu_ps((
float*)(aPtr + 4));
/* Eight consecutive floats b0..b7. */
120 bVal = _mm256_loadu_ps(bPtr);
/*
 * Interleave b with zeros inside each 128-bit lane:
 *   cpx_b1 = [b0 0 b1 0 | b4 0 b5 0]
 *   cpx_b2 = [b2 0 b3 0 | b6 0 b7 0]
 */
121 cpx_b1 = _mm256_unpacklo_ps(bVal, zero);
122 cpx_b2 = _mm256_unpackhi_ps(bVal, zero);
/*
 * Regroup the 128-bit lanes so the b values are back in order:
 *   imm 0x20 -> low lane of cpx_b1, low lane of cpx_b2:  tmp1 = [b0 0 b1 0 b2 0 b3 0]
 *   imm 0x31 -> high lane of cpx_b1, high lane of cpx_b2: tmp2 = [b4 0 b5 0 b6 0 b7 0]
 */
124 tmp1 = _mm256_permute2f128_ps(cpx_b1, cpx_b2, 0x0 + (0x2 << 4));
125 tmp2 = _mm256_permute2f128_ps(cpx_b1, cpx_b2, 0x1 + (0x3 << 4));
/* Even-index floats get a + b; odd-index floats get a + 0. */
127 cVal1 = _mm256_add_ps(aVal1, tmp1);
128 cVal2 = _mm256_add_ps(aVal2, tmp2);
/*
 * Unaligned stores to cPtr and cPtr + 4; the value arguments are cut off in
 * this excerpt (presumably cVal1 and cVal2 - TODO confirm).
 */
130 _mm256_storeu_ps((
float*)cPtr,
132 _mm256_storeu_ps((
float*)(cPtr + 4),
/* Resume at the first element not covered by a full 8-element group. */
140 number = eighthPoints * 8;
/* Scalar tail: handles the remaining num_points % 8 elements one by one. */
141 for (; number < num_points; number++) {
142 *cPtr++ = (*aPtr++) + (*bPtr++);
/*
 * AVX fragment that uses the aligned load/store intrinsics
 * (_mm256_load_ps / _mm256_store_ps). These intrinsics require 32-byte
 * aligned addresses, so every address passed to them in the vector loop must
 * be 32-byte aligned.
 *
 * NOTE(review): this excerpt is incomplete - the function name, the leading
 * parameters, the declarations of aPtr, cPtr, zero, tmp1 and tmp2, the value
 * arguments of the two stores and any pointer advances inside the vector
 * loop are not visible here. Confirm them against the full source.
 *
 * Visible behaviour: 8 floats of b are spread out with zeros in between and
 * added to 16 floats read from the a stream, so only every second float of
 * a is changed (presumably the real parts of interleaved complex samples -
 * TODO confirm against the declaration of aPtr).
 */
148 #include <immintrin.h>
152 const float* bVector,
153 unsigned int num_points)
155 unsigned int number = 0;
/* Number of full 8-element groups processed by the vector loop. */
156 const unsigned int eighthPoints = num_points / 8;
160 const float* bPtr = bVector;
162 __m256 aVal1, aVal2, bVal, cVal1, cVal2;
163 __m256 cpx_b1, cpx_b2;
/* All-zero vector; interleaved with b below to pad every second float. */
165 zero = _mm256_setzero_ps();
167 for (; number < eighthPoints; number++) {
/*
 * Two aligned 8-float loads from the a stream. The "+ 4" is applied in
 * aPtr's own element type before the cast to float*; presumably aPtr points
 * at 2-float complex elements so the second load starts 8 floats (32 bytes)
 * later - TODO confirm.
 */
169 aVal1 = _mm256_load_ps((
float*)aPtr);
170 aVal2 = _mm256_load_ps((
float*)(aPtr + 4));
/* Eight consecutive floats b0..b7 (aligned load). */
171 bVal = _mm256_load_ps(bPtr);
/*
 * Interleave b with zeros inside each 128-bit lane:
 *   cpx_b1 = [b0 0 b1 0 | b4 0 b5 0]
 *   cpx_b2 = [b2 0 b3 0 | b6 0 b7 0]
 */
172 cpx_b1 = _mm256_unpacklo_ps(bVal, zero);
173 cpx_b2 = _mm256_unpackhi_ps(bVal, zero);
/*
 * Regroup the 128-bit lanes so the b values are back in order:
 *   imm 0x20 -> low lane of cpx_b1, low lane of cpx_b2:  tmp1 = [b0 0 b1 0 b2 0 b3 0]
 *   imm 0x31 -> high lane of cpx_b1, high lane of cpx_b2: tmp2 = [b4 0 b5 0 b6 0 b7 0]
 */
175 tmp1 = _mm256_permute2f128_ps(cpx_b1, cpx_b2, 0x0 + (0x2 << 4));
176 tmp2 = _mm256_permute2f128_ps(cpx_b1, cpx_b2, 0x1 + (0x3 << 4));
/* Even-index floats get a + b; odd-index floats get a + 0. */
178 cVal1 = _mm256_add_ps(aVal1, tmp1);
179 cVal2 = _mm256_add_ps(aVal2, tmp2);
/*
 * Aligned stores to cPtr and cPtr + 4; the value arguments are cut off in
 * this excerpt (presumably cVal1 and cVal2 - TODO confirm).
 */
181 _mm256_store_ps((
float*)cPtr,
183 _mm256_store_ps((
float*)(cPtr + 4),
/* Resume at the first element not covered by a full 8-element group. */
191 number = eighthPoints * 8;
/* Scalar tail: handles the remaining num_points % 8 elements one by one. */
192 for (; number < num_points; number++) {
193 *cPtr++ = (*aPtr++) + (*bPtr++);
/*
 * ARM NEON fragment built on the de-interleaving load/store intrinsics
 * vld4q_f32, vld2q_f32 and vst4q_f32.
 *
 * NOTE(review): this excerpt is incomplete - the function name, the leading
 * parameters, the declarations of aPtr and cPtr, and any pointer advances
 * between the paired loads/stores are not visible here. As shown, both loads
 * (and both stores) of each pair use the same pointer expression; confirm in
 * the full source that the pointers are advanced in between.
 *
 * Visible behaviour: b values are added to the float lanes at offsets 0 and
 * 2 of every group of four a floats; lanes at offsets 1 and 3 are passed
 * through unchanged (presumably real vs. imaginary parts of interleaved
 * complex samples - TODO confirm against the declaration of aPtr).
 */
199 #include <arm_neon.h>
203 const float* bVector,
204 unsigned int num_points)
208 const float* bPtr = bVector;
210 float32x4x4_t aVal0, aVal1;
211 float32x4x2_t bVal0, bVal1;
/* Number of full 16-element groups processed by the vector loop. */
213 const unsigned int sixteenthPoints = num_points / 16;
214 unsigned int number = 0;
215 for (; number < sixteenthPoints; number++) {
/*
 * vld4q_f32 reads 16 floats and de-interleaves them with stride 4:
 * val[k] holds the floats at offsets k, k+4, k+8 and k+12.
 */
216 aVal0 = vld4q_f32((
const float*)aPtr);
218 aVal1 = vld4q_f32((
const float*)aPtr);
/*
 * vld2q_f32 reads 8 floats and de-interleaves them with stride 2:
 * val[0] = b0 b2 b4 b6, val[1] = b1 b3 b5 b7.
 */
222 bVal0 = vld2q_f32((
const float*)bPtr);
224 bVal1 = vld2q_f32((
const float*)bPtr);
/*
 * a.val[0] (float offsets 0,4,8,12) pairs with the even-index b values and
 * a.val[2] (float offsets 2,6,10,14) pairs with the odd-index b values;
 * a.val[1] and a.val[3] are left untouched.
 */
228 aVal0.val[0] = vaddq_f32(aVal0.val[0], bVal0.val[0]);
229 aVal0.val[2] = vaddq_f32(aVal0.val[2], bVal0.val[1]);
231 aVal1.val[2] = vaddq_f32(aVal1.val[2], bVal1.val[1]);
232 aVal1.val[0] = vaddq_f32(aVal1.val[0], bVal1.val[0]);
/* vst4q_f32 re-interleaves the four lanes and writes 16 floats. */
234 vst4q_f32((
float*)(cPtr), aVal0);
236 vst4q_f32((
float*)(cPtr), aVal1);
/* Scalar tail: handles the remaining num_points % 16 elements one by one. */
240 for (number = sixteenthPoints * 16; number < num_points; number++) {
241 *cPtr++ = (*aPtr++) + (*bPtr++);