71 #ifndef INCLUDED_volk_32f_x2_divide_32f_a_H
72 #define INCLUDED_volk_32f_x2_divide_32f_a_H
77 #ifdef LV_HAVE_AVX512F
78 #include <immintrin.h>
/*!
 * \brief Element-wise division, cVector[i] = aVector[i] / bVector[i], using
 *        AVX-512F with aligned memory access.
 *
 * The vector loop uses _mm512_load_ps / _mm512_store_ps, which are the aligned
 * forms of the intrinsics, so all three buffers must be 64-byte aligned.
 *
 * \param cVector    output buffer receiving num_points quotients
 * \param aVector    dividends (num_points floats)
 * \param bVector    divisors (num_points floats)
 * \param num_points number of elements to process
 */
static inline void volk_32f_x2_divide_32f_a_avx512f(float* cVector,
                                                    const float* aVector,
                                                    const float* bVector,
                                                    unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;

    __m512 aVal, bVal, cVal;
    for (; number < sixteenthPoints; number++) {
        aVal = _mm512_load_ps(aPtr);
        bVal = _mm512_load_ps(bPtr);

        cVal = _mm512_div_ps(aVal, bVal);

        _mm512_store_ps(cPtr, cVal);

        /* Step to the next group of 16 floats; without this the loop would
         * keep reprocessing the first group and the scalar tail below would
         * start from the wrong offset. */
        aPtr += 16;
        bPtr += 16;
        cPtr += 16;
    }

    /* Scalar tail for the remaining (num_points % 16) elements. */
    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        *cPtr++ = (*aPtr++) / (*bPtr++);
    }
}
115 #include <immintrin.h>
/*!
 * \brief Element-wise division, cVector[i] = aVector[i] / bVector[i], using
 *        AVX with aligned memory access.
 *
 * The vector loop uses _mm256_load_ps / _mm256_store_ps, which are the aligned
 * forms of the intrinsics, so all three buffers must be 32-byte aligned.
 *
 * \param cVector    output buffer receiving num_points quotients
 * \param aVector    dividends (num_points floats)
 * \param bVector    divisors (num_points floats)
 * \param num_points number of elements to process
 */
static inline void volk_32f_x2_divide_32f_a_avx(float* cVector,
                                                const float* aVector,
                                                const float* bVector,
                                                unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;

    __m256 aVal, bVal, cVal;
    for (; number < eighthPoints; number++) {
        aVal = _mm256_load_ps(aPtr);
        bVal = _mm256_load_ps(bPtr);

        cVal = _mm256_div_ps(aVal, bVal);

        _mm256_store_ps(cPtr, cVal);

        /* Step to the next group of 8 floats so each iteration handles fresh
         * data and the scalar tail starts where the vector loop stopped. */
        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    /* Scalar tail for the remaining (num_points % 8) elements. */
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *cPtr++ = (*aPtr++) / (*bPtr++);
    }
}
152 #include <xmmintrin.h>
/*!
 * \brief Element-wise division, cVector[i] = aVector[i] / bVector[i], using
 *        SSE with aligned memory access.
 *
 * The vector loop uses _mm_load_ps / _mm_store_ps, which are the aligned forms
 * of the intrinsics, so all three buffers must be 16-byte aligned.
 *
 * \param cVector    output buffer receiving num_points quotients
 * \param aVector    dividends (num_points floats)
 * \param bVector    divisors (num_points floats)
 * \param num_points number of elements to process
 */
static inline void volk_32f_x2_divide_32f_a_sse(float* cVector,
                                                const float* aVector,
                                                const float* bVector,
                                                unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;

    __m128 aVal, bVal, cVal;
    for (; number < quarterPoints; number++) {
        aVal = _mm_load_ps(aPtr);
        bVal = _mm_load_ps(bPtr);

        cVal = _mm_div_ps(aVal, bVal);

        _mm_store_ps(cPtr, cVal);

        /* Step to the next group of 4 floats so each iteration handles fresh
         * data and the scalar tail starts where the vector loop stopped. */
        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    /* Scalar tail for the remaining (num_points % 4) elements. */
    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *cPtr++ = (*aPtr++) / (*bPtr++);
    }
}
189 #include <arm_neon.h>
192 const float* aVector,
193 const float* bVector,
194 unsigned int num_points)
196 float* cPtr = cVector;
197 const float* aPtr = aVector;
198 const float* bPtr = bVector;
200 float32x4x4_t aVal, bVal, bInv, cVal;
202 const unsigned int eighthPoints = num_points / 16;
203 unsigned int number = 0;
204 for (; number < eighthPoints; number++) {
205 aVal = vld4q_f32(aPtr);
207 bVal = vld4q_f32(bPtr);
213 bInv.val[0] = vrecpeq_f32(bVal.val[0]);
214 bInv.val[0] = vmulq_f32(bInv.val[0], vrecpsq_f32(bInv.val[0], bVal.val[0]));
215 bInv.val[0] = vmulq_f32(bInv.val[0], vrecpsq_f32(bInv.val[0], bVal.val[0]));
216 cVal.val[0] = vmulq_f32(aVal.val[0], bInv.val[0]);
218 bInv.val[1] = vrecpeq_f32(bVal.val[1]);
219 bInv.val[1] = vmulq_f32(bInv.val[1], vrecpsq_f32(bInv.val[1], bVal.val[1]));
220 bInv.val[1] = vmulq_f32(bInv.val[1], vrecpsq_f32(bInv.val[1], bVal.val[1]));
221 cVal.val[1] = vmulq_f32(aVal.val[1], bInv.val[1]);
223 bInv.val[2] = vrecpeq_f32(bVal.val[2]);
224 bInv.val[2] = vmulq_f32(bInv.val[2], vrecpsq_f32(bInv.val[2], bVal.val[2]));
225 bInv.val[2] = vmulq_f32(bInv.val[2], vrecpsq_f32(bInv.val[2], bVal.val[2]));
226 cVal.val[2] = vmulq_f32(aVal.val[2], bInv.val[2]);
228 bInv.val[3] = vrecpeq_f32(bVal.val[3]);
229 bInv.val[3] = vmulq_f32(bInv.val[3], vrecpsq_f32(bInv.val[3], bVal.val[3]));
230 bInv.val[3] = vmulq_f32(bInv.val[3], vrecpsq_f32(bInv.val[3], bVal.val[3]));
231 cVal.val[3] = vmulq_f32(aVal.val[3], bInv.val[3]);
233 vst4q_f32(cPtr, cVal);
237 for (number = eighthPoints * 16; number < num_points; number++) {
238 *cPtr++ = (*aPtr++) / (*bPtr++);
245 #ifdef LV_HAVE_GENERIC
/*!
 * \brief Portable scalar element-wise division,
 *        cVector[i] = aVector[i] / bVector[i].
 *
 * \param cVector    output buffer receiving num_points quotients
 * \param aVector    dividends (num_points floats)
 * \param bVector    divisors (num_points floats)
 * \param num_points number of elements to process
 */
static inline void volk_32f_x2_divide_32f_generic(float* cVector,
                                                  const float* aVector,
                                                  const float* bVector,
                                                  unsigned int num_points)
{
    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;
    unsigned int number;

    for (number = 0; number < num_points; number++) {
        *cPtr++ = (*aPtr++) / (*bPtr++);
    }
}
/* Division kernel defined outside this header. */
extern void volk_32f_x2_divide_32f_a_orc_impl(float* cVector,
                                              const float* aVector,
                                              const float* bVector,
                                              unsigned int num_points);

/*!
 * \brief Element-wise division, cVector[i] = aVector[i] / bVector[i], by
 *        forwarding all arguments unchanged to
 *        volk_32f_x2_divide_32f_a_orc_impl.
 *
 * NOTE(review): this entry point is named `_u_` but forwards to the `_a_`
 * implementation symbol - confirm that kernel accepts unaligned buffers.
 *
 * \param cVector    output buffer receiving num_points quotients
 * \param aVector    dividends (num_points floats)
 * \param bVector    divisors (num_points floats)
 * \param num_points number of elements to process
 */
static inline void volk_32f_x2_divide_32f_u_orc(float* cVector,
                                                const float* aVector,
                                                const float* bVector,
                                                unsigned int num_points)
{
    volk_32f_x2_divide_32f_a_orc_impl(cVector, aVector, bVector, num_points);
}
284 #ifndef INCLUDED_volk_32f_x2_divide_32f_u_H
285 #define INCLUDED_volk_32f_x2_divide_32f_u_H
287 #include <inttypes.h>
290 #ifdef LV_HAVE_AVX512F
291 #include <immintrin.h>
/*!
 * \brief Element-wise division, cVector[i] = aVector[i] / bVector[i], using
 *        AVX-512F with unaligned memory access.
 *
 * The vector loop uses _mm512_loadu_ps / _mm512_storeu_ps, so the buffers do
 * not need any particular alignment.
 *
 * \param cVector    output buffer receiving num_points quotients
 * \param aVector    dividends (num_points floats)
 * \param bVector    divisors (num_points floats)
 * \param num_points number of elements to process
 */
static inline void volk_32f_x2_divide_32f_u_avx512f(float* cVector,
                                                    const float* aVector,
                                                    const float* bVector,
                                                    unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;

    __m512 aVal, bVal, cVal;
    for (; number < sixteenthPoints; number++) {
        aVal = _mm512_loadu_ps(aPtr);
        bVal = _mm512_loadu_ps(bPtr);

        cVal = _mm512_div_ps(aVal, bVal);

        _mm512_storeu_ps(cPtr, cVal);

        /* Step to the next group of 16 floats so each iteration handles fresh
         * data and the scalar tail starts where the vector loop stopped. */
        aPtr += 16;
        bPtr += 16;
        cPtr += 16;
    }

    /* Scalar tail for the remaining (num_points % 16) elements. */
    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        *cPtr++ = (*aPtr++) / (*bPtr++);
    }
}
328 #include <immintrin.h>
/*!
 * \brief Element-wise division, cVector[i] = aVector[i] / bVector[i], using
 *        AVX with unaligned memory access.
 *
 * The vector loop uses _mm256_loadu_ps / _mm256_storeu_ps, so the buffers do
 * not need any particular alignment.
 *
 * \param cVector    output buffer receiving num_points quotients
 * \param aVector    dividends (num_points floats)
 * \param bVector    divisors (num_points floats)
 * \param num_points number of elements to process
 */
static inline void volk_32f_x2_divide_32f_u_avx(float* cVector,
                                                const float* aVector,
                                                const float* bVector,
                                                unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;

    __m256 aVal, bVal, cVal;
    for (; number < eighthPoints; number++) {
        aVal = _mm256_loadu_ps(aPtr);
        bVal = _mm256_loadu_ps(bPtr);

        cVal = _mm256_div_ps(aVal, bVal);

        _mm256_storeu_ps(cPtr, cVal);

        /* Step to the next group of 8 floats so each iteration handles fresh
         * data and the scalar tail starts where the vector loop stopped. */
        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    /* Scalar tail for the remaining (num_points % 8) elements. */
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *cPtr++ = (*aPtr++) / (*bPtr++);
    }
}
static void volk_32f_x2_divide_32f_neon(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_divide_32f.h:191
static void volk_32f_x2_divide_32f_u_avx(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_divide_32f.h:330
static void volk_32f_x2_divide_32f_a_sse(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_divide_32f.h:154
static void volk_32f_x2_divide_32f_generic(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_divide_32f.h:247
static void volk_32f_x2_divide_32f_a_avx(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32f_x2_divide_32f.h:117
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:62