64 #ifndef INCLUDED_volk_32fc_accumulator_s32fc_a_H
65 #define INCLUDED_volk_32fc_accumulator_s32fc_a_H
70 #ifdef LV_HAVE_GENERIC
/* Body fragment of volk_32fc_accumulator_s32fc_generic (the cross-reference
 * entries at the end of this listing place its definition at line 71).
 * Adds num_points samples together one at a time and writes the total to
 * *result.
 * NOTE(review): the function header, the braces and the declarations of
 * aPtr and returnValue are not present in this listing; aPtr is presumably
 * a read cursor over inputBuffer and returnValue a zero-initialised
 * lv_32fc_t -- confirm against the full header. */
73 unsigned int num_points)
76 unsigned int number = 0;
/* One sample per iteration: add it to the running sum, then advance the
 * read cursor. */
79 for (; number < num_points; number++) {
80 returnValue += (*aPtr++);
/* Publish the accumulated sum through the output pointer. */
82 *result = returnValue;
87 #include <immintrin.h>
/* Body fragment of volk_32fc_accumulator_s32fc_u_avx (the cross-reference
 * entries at the end of this listing place its definition at line 89).
 * Accumulates the input in a 256-bit float register using unaligned loads,
 * folds the eight lanes into one complex value, then adds any leftover
 * samples one by one.
 * NOTE(review): the function header, the braces and the declarations of
 * aPtr, returnValue and tempBuffer are not present in this listing. */
91 unsigned int num_points)
94 unsigned int number = 0;
/* Number of full vector iterations. A 256-bit register holds 8 floats,
 * i.e. presumably 4 interleaved (real, imag) samples -- hence the / 4. */
95 const unsigned int quarterPoints = num_points / 4;
100 __m256 accumulator = _mm256_setzero_ps();
101 __m256 aVal = _mm256_setzero_ps();
/* Vector loop: unaligned load of 8 floats, lane-wise add into accumulator.
 * NOTE(review): no pointer advance is visible inside this loop (the listing
 * jumps from line 105 to 109); presumably aPtr is stepped by 4 samples in
 * an omitted line -- confirm. */
103 for (; number < quarterPoints; number++) {
104 aVal = _mm256_loadu_ps((
float*)aPtr);
105 accumulator = _mm256_add_ps(accumulator, aVal);
/* _mm256_store_ps is the aligned store and needs a 32-byte aligned
 * destination. NOTE(review): tempBuffer's declaration is not shown; verify
 * it is declared with 32-byte alignment. */
109 _mm256_store_ps(tempBuffer, accumulator);
/* Horizontal reduction: lanes (0,1), (2,3), (4,5), (6,7) are combined as
 * (real, imag) pairs and summed into returnValue. */
111 returnValue =
lv_cmake(tempBuffer[0], tempBuffer[1]);
112 returnValue +=
lv_cmake(tempBuffer[2], tempBuffer[3]);
113 returnValue +=
lv_cmake(tempBuffer[4], tempBuffer[5]);
114 returnValue +=
lv_cmake(tempBuffer[6], tempBuffer[7]);
/* Scalar tail: resume at the first sample the vector loop did not cover
 * and add the remaining (num_points % 4) samples individually. */
116 number = quarterPoints * 4;
117 for (; number < num_points; number++) {
118 returnValue += (*aPtr++);
/* Publish the accumulated sum through the output pointer. */
120 *result = returnValue;
125 #include <xmmintrin.h>
/* Body fragment of volk_32fc_accumulator_s32fc_u_sse (the cross-reference
 * entries at the end of this listing place its definition at line 127).
 * Accumulates the input in a 128-bit float register using unaligned loads,
 * folds the four lanes into one complex value, then adds a leftover sample
 * if there is one.
 * NOTE(review): the function header, the braces and the declarations of
 * aPtr, returnValue and tempBuffer are not present in this listing. */
129 unsigned int num_points)
132 unsigned int number = 0;
/* Number of full vector iterations. A 128-bit register holds 4 floats,
 * i.e. presumably 2 interleaved (real, imag) samples -- hence the / 2. */
133 const unsigned int halfPoints = num_points / 2;
138 __m128 accumulator = _mm_setzero_ps();
139 __m128 aVal = _mm_setzero_ps();
/* Vector loop: unaligned load of 4 floats, lane-wise add into accumulator.
 * NOTE(review): no pointer advance is visible inside this loop (the listing
 * jumps from line 143 to 147); presumably aPtr is stepped by 2 samples in
 * an omitted line -- confirm. */
141 for (; number < halfPoints; number++) {
142 aVal = _mm_loadu_ps((
float*)aPtr);
143 accumulator = _mm_add_ps(accumulator, aVal);
/* _mm_store_ps is the aligned store and needs a 16-byte aligned
 * destination. NOTE(review): tempBuffer's declaration is not shown; verify
 * it is declared with 16-byte alignment. */
147 _mm_store_ps(tempBuffer, accumulator);
/* Horizontal reduction: lanes (0,1) and (2,3) are combined as
 * (real, imag) pairs and summed into returnValue. */
149 returnValue =
lv_cmake(tempBuffer[0], tempBuffer[1]);
150 returnValue +=
lv_cmake(tempBuffer[2], tempBuffer[3]);
/* Scalar tail: resume at the first sample the vector loop did not cover
 * and add the remaining (num_points % 2) sample individually. */
152 number = halfPoints * 2;
153 for (; number < num_points; number++) {
154 returnValue += (*aPtr++);
/* Publish the accumulated sum through the output pointer. */
156 *result = returnValue;
161 #include <immintrin.h>
/* Body fragment of volk_32fc_accumulator_s32fc_a_avx (the cross-reference
 * entries at the end of this listing place its definition at line 163).
 * Same structure as the unaligned AVX kernel, but reads the input with the
 * aligned load _mm256_load_ps, so the address loaded on every iteration
 * must be 32-byte aligned.
 * NOTE(review): the function header, the braces and the declarations of
 * aPtr, returnValue and tempBuffer are not present in this listing. */
165 unsigned int num_points)
168 unsigned int number = 0;
/* Number of full vector iterations. A 256-bit register holds 8 floats,
 * i.e. presumably 4 interleaved (real, imag) samples -- hence the / 4. */
169 const unsigned int quarterPoints = num_points / 4;
174 __m256 accumulator = _mm256_setzero_ps();
175 __m256 aVal = _mm256_setzero_ps();
/* Vector loop: aligned load of 8 floats, lane-wise add into accumulator.
 * NOTE(review): no pointer advance is visible inside this loop (the listing
 * jumps from line 179 to 183); presumably aPtr is stepped by 4 samples in
 * an omitted line -- confirm. */
177 for (; number < quarterPoints; number++) {
178 aVal = _mm256_load_ps((
float*)aPtr);
179 accumulator = _mm256_add_ps(accumulator, aVal);
/* Aligned store into the scratch buffer. NOTE(review): tempBuffer's
 * declaration is not shown; verify it is declared with 32-byte alignment. */
183 _mm256_store_ps(tempBuffer, accumulator);
/* Horizontal reduction: lanes (0,1), (2,3), (4,5), (6,7) are combined as
 * (real, imag) pairs and summed into returnValue. */
185 returnValue =
lv_cmake(tempBuffer[0], tempBuffer[1]);
186 returnValue +=
lv_cmake(tempBuffer[2], tempBuffer[3]);
187 returnValue +=
lv_cmake(tempBuffer[4], tempBuffer[5]);
188 returnValue +=
lv_cmake(tempBuffer[6], tempBuffer[7]);
/* Scalar tail: resume at the first sample the vector loop did not cover
 * and add the remaining (num_points % 4) samples individually. */
190 number = quarterPoints * 4;
191 for (; number < num_points; number++) {
192 returnValue += (*aPtr++);
/* Publish the accumulated sum through the output pointer. */
194 *result = returnValue;
199 #include <xmmintrin.h>
/* Body fragment of volk_32fc_accumulator_s32fc_a_sse (the cross-reference
 * entries at the end of this listing place its definition at line 201).
 * Same structure as the unaligned SSE kernel, but reads the input with the
 * aligned load _mm_load_ps, so the address loaded on every iteration must
 * be 16-byte aligned.
 * NOTE(review): the function header, the braces and the declarations of
 * aPtr, returnValue and tempBuffer are not present in this listing. */
203 unsigned int num_points)
206 unsigned int number = 0;
/* Number of full vector iterations. A 128-bit register holds 4 floats,
 * i.e. presumably 2 interleaved (real, imag) samples -- hence the / 2. */
207 const unsigned int halfPoints = num_points / 2;
212 __m128 accumulator = _mm_setzero_ps();
213 __m128 aVal = _mm_setzero_ps();
/* Vector loop: aligned load of 4 floats, lane-wise add into accumulator.
 * NOTE(review): no pointer advance is visible inside this loop (the listing
 * jumps from line 217 to 221); presumably aPtr is stepped by 2 samples in
 * an omitted line -- confirm. */
215 for (; number < halfPoints; number++) {
216 aVal = _mm_load_ps((
float*)aPtr);
217 accumulator = _mm_add_ps(accumulator, aVal);
/* Aligned store into the scratch buffer. NOTE(review): tempBuffer's
 * declaration is not shown; verify it is declared with 16-byte alignment. */
221 _mm_store_ps(tempBuffer, accumulator);
/* Horizontal reduction: lanes (0,1) and (2,3) are combined as
 * (real, imag) pairs and summed into returnValue. */
223 returnValue =
lv_cmake(tempBuffer[0], tempBuffer[1]);
224 returnValue +=
lv_cmake(tempBuffer[2], tempBuffer[3]);
/* Scalar tail: resume at the first sample the vector loop did not cover
 * and add the remaining (num_points % 2) sample individually. */
226 number = halfPoints * 2;
227 for (; number < num_points; number++) {
228 returnValue += (*aPtr++);
/* Publish the accumulated sum through the output pointer. */
230 *result = returnValue;
235 #include <arm_neon.h>
/* Body fragment of volk_32fc_accumulator_s32fc_neon (the cross-reference
 * entries at the end of this listing place its definition at line 236).
 * Keeps four independent 128-bit partial sums, issues four vector loads per
 * loop iteration, folds all four partial sums into one complex value, then
 * adds any leftover samples one by one.
 * NOTE(review): the function header, the braces and the declarations of
 * aPtr, returnValue, in_vec and tempBuffer are not present in this
 * listing. */
238 unsigned int num_points)
241 unsigned int number = 0;
/* Number of full loop iterations. Each iteration performs four loads of
 * 4 floats (16 floats), i.e. presumably 8 interleaved (real, imag)
 * samples -- hence the / 8. */
243 unsigned int eighthPoints = num_points / 8;
/* Four separate accumulators, all starting at zero. */
245 float32x4_t out_vec0 = { 0.f, 0.f, 0.f, 0.f };
246 float32x4_t out_vec1 = { 0.f, 0.f, 0.f, 0.f };
247 float32x4_t out_vec2 = { 0.f, 0.f, 0.f, 0.f };
248 float32x4_t out_vec3 = { 0.f, 0.f, 0.f, 0.f };
/* Main loop: load 4 floats, add them into one accumulator, repeat for each
 * of the four accumulators.
 * NOTE(review): no pointer advance is visible between the four loads (the
 * listing skips lines after each add). As shown, every load would read the
 * same address; presumably aPtr is stepped by 2 samples after each add in
 * the omitted lines -- confirm against the full header. */
251 for (; number < eighthPoints; number++) {
252 in_vec = vld1q_f32((
float*)aPtr);
253 out_vec0 = vaddq_f32(in_vec, out_vec0);
256 in_vec = vld1q_f32((
float*)aPtr);
257 out_vec1 = vaddq_f32(in_vec, out_vec1);
260 in_vec = vld1q_f32((
float*)aPtr);
261 out_vec2 = vaddq_f32(in_vec, out_vec2);
264 in_vec = vld1q_f32((
float*)aPtr);
265 out_vec3 = vaddq_f32(in_vec, out_vec3);
/* Horizontal reduction: each accumulator in turn is spilled to the same
 * scratch buffer and its lanes (0,1) and (2,3) are added to returnValue as
 * (real, imag) pairs. The first pair assigns rather than adds, which
 * initialises returnValue. */
268 vst1q_f32(tempBuffer, out_vec0);
269 returnValue =
lv_cmake(tempBuffer[0], tempBuffer[1]);
270 returnValue +=
lv_cmake(tempBuffer[2], tempBuffer[3]);
272 vst1q_f32(tempBuffer, out_vec1);
273 returnValue +=
lv_cmake(tempBuffer[0], tempBuffer[1]);
274 returnValue +=
lv_cmake(tempBuffer[2], tempBuffer[3]);
276 vst1q_f32(tempBuffer, out_vec2);
277 returnValue +=
lv_cmake(tempBuffer[0], tempBuffer[1]);
278 returnValue +=
lv_cmake(tempBuffer[2], tempBuffer[3]);
280 vst1q_f32(tempBuffer, out_vec3);
281 returnValue +=
lv_cmake(tempBuffer[0], tempBuffer[1]);
282 returnValue +=
lv_cmake(tempBuffer[2], tempBuffer[3]);
/* Scalar tail: resume at the first sample the main loop did not cover and
 * add the remaining (num_points % 8) samples individually. */
284 number = eighthPoints * 8;
285 for (; number < num_points; number++) {
286 returnValue += (*aPtr++);
/* Publish the accumulated sum through the output pointer. */
288 *result = returnValue;
static void volk_32fc_accumulator_s32fc_generic(lv_32fc_t *result, const lv_32fc_t *inputBuffer, unsigned int num_points)
Definition: volk_32fc_accumulator_s32fc.h:71
static void volk_32fc_accumulator_s32fc_a_sse(lv_32fc_t *result, const lv_32fc_t *inputBuffer, unsigned int num_points)
Definition: volk_32fc_accumulator_s32fc.h:201
static void volk_32fc_accumulator_s32fc_u_sse(lv_32fc_t *result, const lv_32fc_t *inputBuffer, unsigned int num_points)
Definition: volk_32fc_accumulator_s32fc.h:127
static void volk_32fc_accumulator_s32fc_a_avx(lv_32fc_t *result, const lv_32fc_t *inputBuffer, unsigned int num_points)
Definition: volk_32fc_accumulator_s32fc.h:163
static void volk_32fc_accumulator_s32fc_neon(lv_32fc_t *result, const lv_32fc_t *inputBuffer, unsigned int num_points)
Definition: volk_32fc_accumulator_s32fc.h:236
static void volk_32fc_accumulator_s32fc_u_avx(lv_32fc_t *result, const lv_32fc_t *inputBuffer, unsigned int num_points)
Definition: volk_32fc_accumulator_s32fc.h:89
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:56
#define lv_cmake(r, i)
Definition: volk_complex.h:68
float complex lv_32fc_t
Definition: volk_complex.h:65