Vector Optimized Library of Kernels  2.5.0
Architecture-tuned implementations of math kernels
volk_32fc_accumulator_s32fc.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2019 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
64 #ifndef INCLUDED_volk_32fc_accumulator_s32fc_a_H
65 #define INCLUDED_volk_32fc_accumulator_s32fc_a_H
66 
67 #include <inttypes.h>
68 #include <volk/volk_common.h>
69 
70 #ifdef LV_HAVE_GENERIC
72  const lv_32fc_t* inputBuffer,
73  unsigned int num_points)
74 {
75  const lv_32fc_t* aPtr = inputBuffer;
76  unsigned int number = 0;
77  lv_32fc_t returnValue = lv_cmake(0.f, 0.f);
78 
79  for (; number < num_points; number++) {
80  returnValue += (*aPtr++);
81  }
82  *result = returnValue;
83 }
84 #endif /* LV_HAVE_GENERIC */
85 
86 #ifdef LV_HAVE_AVX
87 #include <immintrin.h>
88 
89 static inline void volk_32fc_accumulator_s32fc_u_avx(lv_32fc_t* result,
90  const lv_32fc_t* inputBuffer,
91  unsigned int num_points)
92 {
93  lv_32fc_t returnValue = lv_cmake(0.f, 0.f);
94  unsigned int number = 0;
95  const unsigned int quarterPoints = num_points / 4;
96 
97  const lv_32fc_t* aPtr = inputBuffer;
98  __VOLK_ATTR_ALIGNED(32) float tempBuffer[8];
99 
100  __m256 accumulator = _mm256_setzero_ps();
101  __m256 aVal = _mm256_setzero_ps();
102 
103  for (; number < quarterPoints; number++) {
104  aVal = _mm256_loadu_ps((float*)aPtr);
105  accumulator = _mm256_add_ps(accumulator, aVal);
106  aPtr += 4;
107  }
108 
109  _mm256_store_ps(tempBuffer, accumulator);
110 
111  returnValue = lv_cmake(tempBuffer[0], tempBuffer[1]);
112  returnValue += lv_cmake(tempBuffer[2], tempBuffer[3]);
113  returnValue += lv_cmake(tempBuffer[4], tempBuffer[5]);
114  returnValue += lv_cmake(tempBuffer[6], tempBuffer[7]);
115 
116  number = quarterPoints * 4;
117  for (; number < num_points; number++) {
118  returnValue += (*aPtr++);
119  }
120  *result = returnValue;
121 }
122 #endif /* LV_HAVE_AVX */
123 
124 #ifdef LV_HAVE_SSE
125 #include <xmmintrin.h>
126 
127 static inline void volk_32fc_accumulator_s32fc_u_sse(lv_32fc_t* result,
128  const lv_32fc_t* inputBuffer,
129  unsigned int num_points)
130 {
131  lv_32fc_t returnValue = lv_cmake(0.f, 0.f);
132  unsigned int number = 0;
133  const unsigned int halfPoints = num_points / 2;
134 
135  const lv_32fc_t* aPtr = inputBuffer;
136  __VOLK_ATTR_ALIGNED(16) float tempBuffer[4];
137 
138  __m128 accumulator = _mm_setzero_ps();
139  __m128 aVal = _mm_setzero_ps();
140 
141  for (; number < halfPoints; number++) {
142  aVal = _mm_loadu_ps((float*)aPtr);
143  accumulator = _mm_add_ps(accumulator, aVal);
144  aPtr += 2;
145  }
146 
147  _mm_store_ps(tempBuffer, accumulator);
148 
149  returnValue = lv_cmake(tempBuffer[0], tempBuffer[1]);
150  returnValue += lv_cmake(tempBuffer[2], tempBuffer[3]);
151 
152  number = halfPoints * 2;
153  for (; number < num_points; number++) {
154  returnValue += (*aPtr++);
155  }
156  *result = returnValue;
157 }
158 #endif /* LV_HAVE_SSE */
159 
160 #ifdef LV_HAVE_AVX
161 #include <immintrin.h>
162 
163 static inline void volk_32fc_accumulator_s32fc_a_avx(lv_32fc_t* result,
164  const lv_32fc_t* inputBuffer,
165  unsigned int num_points)
166 {
167  lv_32fc_t returnValue = lv_cmake(0.f, 0.f);
168  unsigned int number = 0;
169  const unsigned int quarterPoints = num_points / 4;
170 
171  const lv_32fc_t* aPtr = inputBuffer;
172  __VOLK_ATTR_ALIGNED(32) float tempBuffer[8];
173 
174  __m256 accumulator = _mm256_setzero_ps();
175  __m256 aVal = _mm256_setzero_ps();
176 
177  for (; number < quarterPoints; number++) {
178  aVal = _mm256_load_ps((float*)aPtr);
179  accumulator = _mm256_add_ps(accumulator, aVal);
180  aPtr += 4;
181  }
182 
183  _mm256_store_ps(tempBuffer, accumulator);
184 
185  returnValue = lv_cmake(tempBuffer[0], tempBuffer[1]);
186  returnValue += lv_cmake(tempBuffer[2], tempBuffer[3]);
187  returnValue += lv_cmake(tempBuffer[4], tempBuffer[5]);
188  returnValue += lv_cmake(tempBuffer[6], tempBuffer[7]);
189 
190  number = quarterPoints * 4;
191  for (; number < num_points; number++) {
192  returnValue += (*aPtr++);
193  }
194  *result = returnValue;
195 }
196 #endif /* LV_HAVE_AVX */
197 
198 #ifdef LV_HAVE_SSE
199 #include <xmmintrin.h>
200 
201 static inline void volk_32fc_accumulator_s32fc_a_sse(lv_32fc_t* result,
202  const lv_32fc_t* inputBuffer,
203  unsigned int num_points)
204 {
205  lv_32fc_t returnValue = lv_cmake(0.f, 0.f);
206  unsigned int number = 0;
207  const unsigned int halfPoints = num_points / 2;
208 
209  const lv_32fc_t* aPtr = inputBuffer;
210  __VOLK_ATTR_ALIGNED(16) float tempBuffer[4];
211 
212  __m128 accumulator = _mm_setzero_ps();
213  __m128 aVal = _mm_setzero_ps();
214 
215  for (; number < halfPoints; number++) {
216  aVal = _mm_load_ps((float*)aPtr);
217  accumulator = _mm_add_ps(accumulator, aVal);
218  aPtr += 2;
219  }
220 
221  _mm_store_ps(tempBuffer, accumulator);
222 
223  returnValue = lv_cmake(tempBuffer[0], tempBuffer[1]);
224  returnValue += lv_cmake(tempBuffer[2], tempBuffer[3]);
225 
226  number = halfPoints * 2;
227  for (; number < num_points; number++) {
228  returnValue += (*aPtr++);
229  }
230  *result = returnValue;
231 }
232 #endif /* LV_HAVE_SSE */
233 
234 #ifdef LV_HAVE_NEON
235 #include <arm_neon.h>
/*!
 * \brief Sums num_points complex floats using NEON.
 *
 * Each vector iteration consumes eight complex samples as four 128-bit loads,
 * each feeding its own accumulator. The four accumulators are then spilled
 * and folded one after another, and the remaining num_points % 8 samples are
 * added one at a time.
 *
 * \param result      Receives the complex sum of the input samples.
 * \param inputBuffer Complex samples to accumulate.
 * \param num_points  Number of complex samples to read from inputBuffer.
 */
static inline void volk_32fc_accumulator_s32fc_neon(lv_32fc_t* result,
                                                    const lv_32fc_t* inputBuffer,
                                                    unsigned int num_points)
{
    /* Largest multiple of 8 that does not exceed num_points. */
    const unsigned int vectorEnd = (num_points / 8) * 8;
    unsigned int i = 0;
    __VOLK_ATTR_ALIGNED(32) float lanes[4];
    float32x4_t sumA = vdupq_n_f32(0.f);
    float32x4_t sumB = vdupq_n_f32(0.f);
    float32x4_t sumC = vdupq_n_f32(0.f);
    float32x4_t sumD = vdupq_n_f32(0.f);
    lv_32fc_t total;

    /* Vector part: four loads of two complex samples (four floats) each. */
    for (; i < vectorEnd; i += 8) {
        const float* block = (const float*)(inputBuffer + i);
        sumA = vaddq_f32(vld1q_f32(block), sumA);
        sumB = vaddq_f32(vld1q_f32(block + 4), sumB);
        sumC = vaddq_f32(vld1q_f32(block + 8), sumC);
        sumD = vaddq_f32(vld1q_f32(block + 12), sumD);
    }

    /* Fold the accumulators in order, two (re, im) pairs per accumulator. */
    vst1q_f32(lanes, sumA);
    total = lv_cmake(lanes[0], lanes[1]);
    total += lv_cmake(lanes[2], lanes[3]);

    vst1q_f32(lanes, sumB);
    total += lv_cmake(lanes[0], lanes[1]);
    total += lv_cmake(lanes[2], lanes[3]);

    vst1q_f32(lanes, sumC);
    total += lv_cmake(lanes[0], lanes[1]);
    total += lv_cmake(lanes[2], lanes[3]);

    vst1q_f32(lanes, sumD);
    total += lv_cmake(lanes[0], lanes[1]);
    total += lv_cmake(lanes[2], lanes[3]);

    /* Scalar tail: samples that did not fill a whole eight-sample block. */
    for (; i < num_points; i++) {
        total += inputBuffer[i];
    }

    *result = total;
}
290 #endif /* LV_HAVE_NEON */
291 
292 #endif /* INCLUDED_volk_32fc_accumulator_s32fc_a_H */
static void volk_32fc_accumulator_s32fc_generic(lv_32fc_t *result, const lv_32fc_t *inputBuffer, unsigned int num_points)
Definition: volk_32fc_accumulator_s32fc.h:71
static void volk_32fc_accumulator_s32fc_a_sse(lv_32fc_t *result, const lv_32fc_t *inputBuffer, unsigned int num_points)
Definition: volk_32fc_accumulator_s32fc.h:201
static void volk_32fc_accumulator_s32fc_u_sse(lv_32fc_t *result, const lv_32fc_t *inputBuffer, unsigned int num_points)
Definition: volk_32fc_accumulator_s32fc.h:127
static void volk_32fc_accumulator_s32fc_a_avx(lv_32fc_t *result, const lv_32fc_t *inputBuffer, unsigned int num_points)
Definition: volk_32fc_accumulator_s32fc.h:163
static void volk_32fc_accumulator_s32fc_neon(lv_32fc_t *result, const lv_32fc_t *inputBuffer, unsigned int num_points)
Definition: volk_32fc_accumulator_s32fc.h:236
static void volk_32fc_accumulator_s32fc_u_avx(lv_32fc_t *result, const lv_32fc_t *inputBuffer, unsigned int num_points)
Definition: volk_32fc_accumulator_s32fc.h:89
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:56
#define lv_cmake(r, i)
Definition: volk_complex.h:68
float complex lv_32fc_t
Definition: volk_complex.h:65