Vector Optimized Library of Kernels  2.5.0
Architecture-tuned implementations of math kernels
volk_neon_intrinsics.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2015 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
23 /*
24  * Copyright (c) 2016-2019 ARM Limited.
25  *
26  * SPDX-License-Identifier: MIT
27  *
28  * Permission is hereby granted, free of charge, to any person obtaining a copy
29  * of this software and associated documentation files (the "Software"), to
30  * deal in the Software without restriction, including without limitation the
31  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
32  * sell copies of the Software, and to permit persons to whom the Software is
33  * furnished to do so, subject to the following conditions:
34  *
35  * The above copyright notice and this permission notice shall be included in all
36  * copies or substantial portions of the Software.
37  *
38  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
39  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
40  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
41  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
42  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
43  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
44  * SOFTWARE.
45  *
46  * _vtaylor_polyq_f32
47  * _vlogq_f32
48  *
49  */
50 
51 /* Copyright (C) 2011 Julien Pommier
52 
53  This software is provided 'as-is', without any express or implied
54  warranty. In no event will the authors be held liable for any damages
55  arising from the use of this software.
56 
57  Permission is granted to anyone to use this software for any purpose,
58  including commercial applications, and to alter it and redistribute it
59  freely, subject to the following restrictions:
60 
61  1. The origin of this software must not be misrepresented; you must not
62  claim that you wrote the original software. If you use this software
63  in a product, an acknowledgment in the product documentation would be
64  appreciated but is not required.
65  2. Altered source versions must be plainly marked as such, and must not be
66  misrepresented as being the original software.
67  3. This notice may not be removed or altered from any source distribution.
68 
69  (this is the zlib license)
70 
71  _vsincosq_f32
72 
73 */
74 
75 /*
76  * This file is intended to hold NEON intrinsics of intrinsics.
77  * They should be used in VOLK kernels to avoid copy-pasta.
78  */
79 
80 #ifndef INCLUDE_VOLK_VOLK_NEON_INTRINSICS_H_
81 #define INCLUDE_VOLK_VOLK_NEON_INTRINSICS_H_
82 #include <arm_neon.h>
83 
84 
85 /* Magnitude squared for float32x4x2_t */
86 static inline float32x4_t _vmagnitudesquaredq_f32(float32x4x2_t cmplxValue)
87 {
88  float32x4_t iValue, qValue, result;
89  iValue = vmulq_f32(cmplxValue.val[0], cmplxValue.val[0]); // Square the values
90  qValue = vmulq_f32(cmplxValue.val[1], cmplxValue.val[1]); // Square the values
91  result = vaddq_f32(iValue, qValue); // Add the I2 and Q2 values
92  return result;
93 }
94 
95 /* Inverse square root for float32x4_t */
96 static inline float32x4_t _vinvsqrtq_f32(float32x4_t x)
97 {
98  float32x4_t sqrt_reciprocal = vrsqrteq_f32(x);
99  sqrt_reciprocal = vmulq_f32(
100  vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
101  sqrt_reciprocal = vmulq_f32(
102  vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
103 
104  return sqrt_reciprocal;
105 }
106 
107 /* Inverse */
108 static inline float32x4_t _vinvq_f32(float32x4_t x)
109 {
110  // Newton's method
111  float32x4_t recip = vrecpeq_f32(x);
112  recip = vmulq_f32(vrecpsq_f32(x, recip), recip);
113  recip = vmulq_f32(vrecpsq_f32(x, recip), recip);
114  return recip;
115 }
116 
117 /* Complex multiplication for float32x4x2_t */
118 static inline float32x4x2_t _vmultiply_complexq_f32(float32x4x2_t a_val,
119  float32x4x2_t b_val)
120 {
121  float32x4x2_t tmp_real;
122  float32x4x2_t tmp_imag;
123  float32x4x2_t c_val;
124 
125  // multiply the real*real and imag*imag to get real result
126  // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
127  tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
128  // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i
129  tmp_real.val[1] = vmulq_f32(a_val.val[1], b_val.val[1]);
130  // Multiply cross terms to get the imaginary result
131  // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i
132  tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[1]);
133  // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r
134  tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
135  // combine the products
136  c_val.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]);
137  c_val.val[1] = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]);
138  return c_val;
139 }
140 
141 /* From ARM Compute Library, MIT license */
142 static inline float32x4_t _vtaylor_polyq_f32(float32x4_t x, const float32x4_t coeffs[8])
143 {
144  float32x4_t cA = vmlaq_f32(coeffs[0], coeffs[4], x);
145  float32x4_t cB = vmlaq_f32(coeffs[2], coeffs[6], x);
146  float32x4_t cC = vmlaq_f32(coeffs[1], coeffs[5], x);
147  float32x4_t cD = vmlaq_f32(coeffs[3], coeffs[7], x);
148  float32x4_t x2 = vmulq_f32(x, x);
149  float32x4_t x4 = vmulq_f32(x2, x2);
150  float32x4_t res = vmlaq_f32(vmlaq_f32(cA, cB, x2), vmlaq_f32(cC, cD, x2), x4);
151  return res;
152 }
153 
154 /* Natural logarithm.
155  * From ARM Compute Library, MIT license */
156 static inline float32x4_t _vlogq_f32(float32x4_t x)
157 {
158  const float32x4_t log_tab[8] = {
159  vdupq_n_f32(-2.29561495781f), vdupq_n_f32(-2.47071170807f),
160  vdupq_n_f32(-5.68692588806f), vdupq_n_f32(-0.165253549814f),
161  vdupq_n_f32(5.17591238022f), vdupq_n_f32(0.844007015228f),
162  vdupq_n_f32(4.58445882797f), vdupq_n_f32(0.0141278216615f),
163  };
164 
165  const int32x4_t CONST_127 = vdupq_n_s32(127); // 127
166  const float32x4_t CONST_LN2 = vdupq_n_f32(0.6931471805f); // ln(2)
167 
168  // Extract exponent
169  int32x4_t m = vsubq_s32(
170  vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_f32(x), 23)), CONST_127);
171  float32x4_t val =
172  vreinterpretq_f32_s32(vsubq_s32(vreinterpretq_s32_f32(x), vshlq_n_s32(m, 23)));
173 
174  // Polynomial Approximation
175  float32x4_t poly = _vtaylor_polyq_f32(val, log_tab);
176 
177  // Reconstruct
178  poly = vmlaq_f32(poly, vcvtq_f32_s32(m), CONST_LN2);
179 
180  return poly;
181 }
182 
183 /* Evaluation of 4 sines & cosines at once.
184  * Optimized from here (zlib license)
185  * http://gruntthepeon.free.fr/ssemath/ */
186 static inline float32x4x2_t _vsincosq_f32(float32x4_t x)
187 {
188  const float32x4_t c_minus_cephes_DP1 = vdupq_n_f32(-0.78515625);
189  const float32x4_t c_minus_cephes_DP2 = vdupq_n_f32(-2.4187564849853515625e-4);
190  const float32x4_t c_minus_cephes_DP3 = vdupq_n_f32(-3.77489497744594108e-8);
191  const float32x4_t c_sincof_p0 = vdupq_n_f32(-1.9515295891e-4);
192  const float32x4_t c_sincof_p1 = vdupq_n_f32(8.3321608736e-3);
193  const float32x4_t c_sincof_p2 = vdupq_n_f32(-1.6666654611e-1);
194  const float32x4_t c_coscof_p0 = vdupq_n_f32(2.443315711809948e-005);
195  const float32x4_t c_coscof_p1 = vdupq_n_f32(-1.388731625493765e-003);
196  const float32x4_t c_coscof_p2 = vdupq_n_f32(4.166664568298827e-002);
197  const float32x4_t c_cephes_FOPI = vdupq_n_f32(1.27323954473516); // 4 / M_PI
198 
199  const float32x4_t CONST_1 = vdupq_n_f32(1.f);
200  const float32x4_t CONST_1_2 = vdupq_n_f32(0.5f);
201  const float32x4_t CONST_0 = vdupq_n_f32(0.f);
202  const uint32x4_t CONST_2 = vdupq_n_u32(2);
203  const uint32x4_t CONST_4 = vdupq_n_u32(4);
204 
205  uint32x4_t emm2;
206 
207  uint32x4_t sign_mask_sin, sign_mask_cos;
208  sign_mask_sin = vcltq_f32(x, CONST_0);
209  x = vabsq_f32(x);
210  // scale by 4/pi
211  float32x4_t y = vmulq_f32(x, c_cephes_FOPI);
212 
213  // store the integer part of y in mm0
214  emm2 = vcvtq_u32_f32(y);
215  /* j=(j+1) & (~1) (see the cephes sources) */
216  emm2 = vaddq_u32(emm2, vdupq_n_u32(1));
217  emm2 = vandq_u32(emm2, vdupq_n_u32(~1));
218  y = vcvtq_f32_u32(emm2);
219 
220  /* get the polynom selection mask
221  there is one polynom for 0 <= x <= Pi/4
222  and another one for Pi/4<x<=Pi/2
223  Both branches will be computed. */
224  const uint32x4_t poly_mask = vtstq_u32(emm2, CONST_2);
225 
226  // The magic pass: "Extended precision modular arithmetic"
227  x = vmlaq_f32(x, y, c_minus_cephes_DP1);
228  x = vmlaq_f32(x, y, c_minus_cephes_DP2);
229  x = vmlaq_f32(x, y, c_minus_cephes_DP3);
230 
231  sign_mask_sin = veorq_u32(sign_mask_sin, vtstq_u32(emm2, CONST_4));
232  sign_mask_cos = vtstq_u32(vsubq_u32(emm2, CONST_2), CONST_4);
233 
234  /* Evaluate the first polynom (0 <= x <= Pi/4) in y1,
235  and the second polynom (Pi/4 <= x <= 0) in y2 */
236  float32x4_t y1, y2;
237  float32x4_t z = vmulq_f32(x, x);
238 
239  y1 = vmlaq_f32(c_coscof_p1, z, c_coscof_p0);
240  y1 = vmlaq_f32(c_coscof_p2, z, y1);
241  y1 = vmulq_f32(y1, z);
242  y1 = vmulq_f32(y1, z);
243  y1 = vmlsq_f32(y1, z, CONST_1_2);
244  y1 = vaddq_f32(y1, CONST_1);
245 
246  y2 = vmlaq_f32(c_sincof_p1, z, c_sincof_p0);
247  y2 = vmlaq_f32(c_sincof_p2, z, y2);
248  y2 = vmulq_f32(y2, z);
249  y2 = vmlaq_f32(x, x, y2);
250 
251  /* select the correct result from the two polynoms */
252  const float32x4_t ys = vbslq_f32(poly_mask, y1, y2);
253  const float32x4_t yc = vbslq_f32(poly_mask, y2, y1);
254 
255  float32x4x2_t sincos;
256  sincos.val[0] = vbslq_f32(sign_mask_sin, vnegq_f32(ys), ys);
257  sincos.val[1] = vbslq_f32(sign_mask_cos, yc, vnegq_f32(yc));
258 
259  return sincos;
260 }
261 
262 static inline float32x4_t _vsinq_f32(float32x4_t x)
263 {
264  const float32x4x2_t sincos = _vsincosq_f32(x);
265  return sincos.val[0];
266 }
267 
268 static inline float32x4_t _vcosq_f32(float32x4_t x)
269 {
270  const float32x4x2_t sincos = _vsincosq_f32(x);
271  return sincos.val[1];
272 }
273 
274 static inline float32x4_t _vtanq_f32(float32x4_t x)
275 {
276  const float32x4x2_t sincos = _vsincosq_f32(x);
277  return vmulq_f32(sincos.val[0], _vinvq_f32(sincos.val[1]));
278 }
279 
280 static inline float32x4_t _neon_accumulate_square_sum_f32(float32x4_t sq_acc,
281  float32x4_t acc,
282  float32x4_t val,
283  float32x4_t rec,
284  float32x4_t aux)
285 {
286  aux = vmulq_f32(aux, val);
287  aux = vsubq_f32(aux, acc);
288  aux = vmulq_f32(aux, aux);
289 #ifdef LV_HAVE_NEONV8
290  return vfmaq_f32(sq_acc, aux, rec);
291 #else
292  aux = vmulq_f32(aux, rec);
293  return vaddq_f32(sq_acc, aux);
294 #endif
295 }
296 
297 #endif /* INCLUDE_VOLK_VOLK_NEON_INTRINSICS_H_ */
val
Definition: volk_arch_defs.py:66
static float32x4_t _vinvsqrtq_f32(float32x4_t x)
Definition: volk_neon_intrinsics.h:96
static float32x4_t _vinvq_f32(float32x4_t x)
Definition: volk_neon_intrinsics.h:108
static float32x4x2_t _vmultiply_complexq_f32(float32x4x2_t a_val, float32x4x2_t b_val)
Definition: volk_neon_intrinsics.h:118
static float32x4_t _vsinq_f32(float32x4_t x)
Definition: volk_neon_intrinsics.h:262
static float32x4_t _vlogq_f32(float32x4_t x)
Definition: volk_neon_intrinsics.h:156
static float32x4_t _vcosq_f32(float32x4_t x)
Definition: volk_neon_intrinsics.h:268
static float32x4_t _vmagnitudesquaredq_f32(float32x4x2_t cmplxValue)
Definition: volk_neon_intrinsics.h:86
static float32x4_t _neon_accumulate_square_sum_f32(float32x4_t sq_acc, float32x4_t acc, float32x4_t val, float32x4_t rec, float32x4_t aux)
Definition: volk_neon_intrinsics.h:280
static float32x4x2_t _vsincosq_f32(float32x4_t x)
Definition: volk_neon_intrinsics.h:186
static float32x4_t _vtanq_f32(float32x4_t x)
Definition: volk_neon_intrinsics.h:274
static float32x4_t _vtaylor_polyq_f32(float32x4_t x, const float32x4_t coeffs[8])
Definition: volk_neon_intrinsics.h:142