Vector Optimized Library of Kernels  2.5.0
Architecture-tuned implementations of math kernels
volk_32fc_32f_add_32fc.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2018 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
74 #ifndef INCLUDED_volk_32fc_32f_add_32fc_u_H
75 #define INCLUDED_volk_32fc_32f_add_32fc_u_H
76 
77 #ifdef LV_HAVE_GENERIC
78 
79 static inline void volk_32fc_32f_add_32fc_generic(lv_32fc_t* cVector,
80  const lv_32fc_t* aVector,
81  const float* bVector,
82  unsigned int num_points)
83 {
84  lv_32fc_t* cPtr = cVector;
85  const lv_32fc_t* aPtr = aVector;
86  const float* bPtr = bVector;
87  unsigned int number = 0;
88 
89  for (number = 0; number < num_points; number++) {
90  *cPtr++ = (*aPtr++) + (*bPtr++);
91  }
92 }
93 #endif /* LV_HAVE_GENERIC */
94 
95 
96 #ifdef LV_HAVE_AVX
97 #include <immintrin.h>
98 
99 static inline void volk_32fc_32f_add_32fc_u_avx(lv_32fc_t* cVector,
100  const lv_32fc_t* aVector,
101  const float* bVector,
102  unsigned int num_points)
103 {
104  unsigned int number = 0;
105  const unsigned int eighthPoints = num_points / 8;
106 
107  lv_32fc_t* cPtr = cVector;
108  const lv_32fc_t* aPtr = aVector;
109  const float* bPtr = bVector;
110 
111  __m256 aVal1, aVal2, bVal, cVal1, cVal2;
112  __m256 cpx_b1, cpx_b2;
113  __m256 zero;
114  zero = _mm256_setzero_ps();
115  __m256 tmp1, tmp2;
116  for (; number < eighthPoints; number++) {
117 
118  aVal1 = _mm256_loadu_ps((float*)aPtr);
119  aVal2 = _mm256_loadu_ps((float*)(aPtr + 4));
120  bVal = _mm256_loadu_ps(bPtr);
121  cpx_b1 = _mm256_unpacklo_ps(bVal, zero); // b0, 0, b1, 0, b4, 0, b5, 0
122  cpx_b2 = _mm256_unpackhi_ps(bVal, zero); // b2, 0, b3, 0, b6, 0, b7, 0
123 
124  tmp1 = _mm256_permute2f128_ps(cpx_b1, cpx_b2, 0x0 + (0x2 << 4));
125  tmp2 = _mm256_permute2f128_ps(cpx_b1, cpx_b2, 0x1 + (0x3 << 4));
126 
127  cVal1 = _mm256_add_ps(aVal1, tmp1);
128  cVal2 = _mm256_add_ps(aVal2, tmp2);
129 
130  _mm256_storeu_ps((float*)cPtr,
131  cVal1); // Store the results back into the C container
132  _mm256_storeu_ps((float*)(cPtr + 4),
133  cVal2); // Store the results back into the C container
134 
135  aPtr += 8;
136  bPtr += 8;
137  cPtr += 8;
138  }
139 
140  number = eighthPoints * 8;
141  for (; number < num_points; number++) {
142  *cPtr++ = (*aPtr++) + (*bPtr++);
143  }
144 }
145 #endif /* LV_HAVE_AVX */
146 
147 #ifdef LV_HAVE_AVX
148 #include <immintrin.h>
149 
150 static inline void volk_32fc_32f_add_32fc_a_avx(lv_32fc_t* cVector,
151  const lv_32fc_t* aVector,
152  const float* bVector,
153  unsigned int num_points)
154 {
155  unsigned int number = 0;
156  const unsigned int eighthPoints = num_points / 8;
157 
158  lv_32fc_t* cPtr = cVector;
159  const lv_32fc_t* aPtr = aVector;
160  const float* bPtr = bVector;
161 
162  __m256 aVal1, aVal2, bVal, cVal1, cVal2;
163  __m256 cpx_b1, cpx_b2;
164  __m256 zero;
165  zero = _mm256_setzero_ps();
166  __m256 tmp1, tmp2;
167  for (; number < eighthPoints; number++) {
168 
169  aVal1 = _mm256_load_ps((float*)aPtr);
170  aVal2 = _mm256_load_ps((float*)(aPtr + 4));
171  bVal = _mm256_load_ps(bPtr);
172  cpx_b1 = _mm256_unpacklo_ps(bVal, zero); // b0, 0, b1, 0, b4, 0, b5, 0
173  cpx_b2 = _mm256_unpackhi_ps(bVal, zero); // b2, 0, b3, 0, b6, 0, b7, 0
174 
175  tmp1 = _mm256_permute2f128_ps(cpx_b1, cpx_b2, 0x0 + (0x2 << 4));
176  tmp2 = _mm256_permute2f128_ps(cpx_b1, cpx_b2, 0x1 + (0x3 << 4));
177 
178  cVal1 = _mm256_add_ps(aVal1, tmp1);
179  cVal2 = _mm256_add_ps(aVal2, tmp2);
180 
181  _mm256_store_ps((float*)cPtr,
182  cVal1); // Store the results back into the C container
183  _mm256_store_ps((float*)(cPtr + 4),
184  cVal2); // Store the results back into the C container
185 
186  aPtr += 8;
187  bPtr += 8;
188  cPtr += 8;
189  }
190 
191  number = eighthPoints * 8;
192  for (; number < num_points; number++) {
193  *cPtr++ = (*aPtr++) + (*bPtr++);
194  }
195 }
196 #endif /* LV_HAVE_AVX */
197 
198 #ifdef LV_HAVE_NEON
199 #include <arm_neon.h>
200 
/*
 * NEON kernel: adds a real-valued float vector to a complex float vector.
 *
 * For every index i below num_points, cVector[i] receives
 * aVector[i] + bVector[i]. Sixteen elements are handled per vector iteration;
 * the remaining num_points % 16 elements are handled by a scalar loop.
 *
 * cVector     output buffer, num_points complex values are written
 * aVector     complex input buffer, num_points values are read
 * bVector     real input buffer, num_points floats are read
 * num_points  number of elements to process
 */
static inline void volk_32fc_32f_add_32fc_neon(lv_32fc_t* cVector,
                                               const lv_32fc_t* aVector,
                                               const float* bVector,
                                               unsigned int num_points)
{
    const unsigned int sixteenthPoints = num_points / 16;
    const lv_32fc_t* cpxIn = aVector;
    const float* realIn = bVector;
    lv_32fc_t* out = cVector;
    unsigned int i;

    for (i = 0; i < sixteenthPoints; i++) {
        /* Each 4-way de-interleaving load covers eight complex samples:
         * val[0]/val[1] hold re/im of the even samples and val[2]/val[3]
         * hold re/im of the odd samples. */
        float32x4x4_t cpxLo = vld4q_f32((const float*)cpxIn);
        float32x4x4_t cpxHi = vld4q_f32((const float*)(cpxIn + 8));
        cpxIn += 16;
        __VOLK_PREFETCH(cpxIn + 16);

        /* Each 2-way de-interleaving load covers eight reals: val[0] holds
         * the even-indexed values and val[1] the odd-indexed ones, which
         * lines up with the real-part lanes above. */
        float32x4x2_t realLo = vld2q_f32((const float*)realIn);
        float32x4x2_t realHi = vld2q_f32((const float*)(realIn + 8));
        realIn += 16;
        __VOLK_PREFETCH(realIn + 16);

        /* Only the real-part lanes change; imaginary lanes pass through. */
        cpxLo.val[0] = vaddq_f32(cpxLo.val[0], realLo.val[0]);
        cpxLo.val[2] = vaddq_f32(cpxLo.val[2], realLo.val[1]);
        cpxHi.val[0] = vaddq_f32(cpxHi.val[0], realHi.val[0]);
        cpxHi.val[2] = vaddq_f32(cpxHi.val[2], realHi.val[1]);

        /* The interleaving stores put the lanes back in re/im sample order. */
        vst4q_f32((float*)out, cpxLo);
        vst4q_f32((float*)(out + 8), cpxHi);
        out += 16;
    }

    /* Scalar tail for the elements that do not fill a full group of sixteen. */
    for (i = sixteenthPoints * 16; i < num_points; i++) {
        *out++ = (*cpxIn++) + (*realIn++);
    }
}
244 #endif /* LV_HAVE_NEON */
245 
246 
247 #endif /* INCLUDED_volk_32fc_32f_add_32fc_u_H */
static void volk_32fc_32f_add_32fc_a_avx(lv_32fc_t *cVector, const lv_32fc_t *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32fc_32f_add_32fc.h:150
static void volk_32fc_32f_add_32fc_generic(lv_32fc_t *cVector, const lv_32fc_t *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32fc_32f_add_32fc.h:79
static void volk_32fc_32f_add_32fc_u_avx(lv_32fc_t *cVector, const lv_32fc_t *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32fc_32f_add_32fc.h:99
static void volk_32fc_32f_add_32fc_neon(lv_32fc_t *cVector, const lv_32fc_t *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32fc_32f_add_32fc.h:201
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:62
float complex lv_32fc_t
Definition: volk_complex.h:65