volk_32f_8u_polarbutterfly_32f.h
/* -*- c++ -*- */
/*
 * Copyright 2015 Free Software Foundation, Inc.
 *
 * This file is part of GNU Radio
 *
 * GNU Radio is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3, or (at your option)
 * any later version.
 *
 * GNU Radio is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with GNU Radio; see the file COPYING. If not, write to
 * the Free Software Foundation, Inc., 51 Franklin Street,
 * Boston, MA 02110-1301, USA.
 */

#ifndef VOLK_KERNELS_VOLK_VOLK_32F_8U_POLARBUTTERFLY_32F_H_
#define VOLK_KERNELS_VOLK_VOLK_32F_8U_POLARBUTTERFLY_32F_H_
#include <math.h>
#include <stdbool.h>
#include <string.h>
#include <volk/volk_8u_x2_encodeframepolar_8u.h>

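/*
 * llr_odd() is the hardware-friendly min-sum approximation of the polar
 * 'f' (check-node) update:
 *   f(la, lb) = sign(la) * sign(lb) * min(|la|, |lb|)
 * which approximates the exact 2 * atanh(tanh(la / 2) * tanh(lb / 2)).
 */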
static inline float llr_odd(const float la, const float lb)
{
    const float ala = fabsf(la);
    const float alb = fabsf(lb);
    return copysignf(1.0f, la) * copysignf(1.0f, lb) * (ala > alb ? alb : ala);
}

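/*
 * llr_odd_stages() applies the scalar f-update to all remaining stages
 * [min_stage, depth - 1] for the given row; each iteration halves the
 * number of LLR pairs that must be combined.
 */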
static inline void llr_odd_stages(
    float* llrs, int min_stage, const int depth, const int frame_size, const int row)
{
    int loop_stage = depth - 1;
    float* dst_llr_ptr;
    float* src_llr_ptr;
    int stage_size = 0x01 << loop_stage;

    int el;
    while (min_stage <= loop_stage) {
        dst_llr_ptr = llrs + loop_stage * frame_size + row;
        src_llr_ptr = dst_llr_ptr + frame_size;
        for (el = 0; el < stage_size; el++) {
            *dst_llr_ptr++ = llr_odd(*src_llr_ptr, *(src_llr_ptr + 1));
            src_llr_ptr += 2;
        }

        --loop_stage;
        stage_size >>= 1;
    }
}

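/*
 * llr_even() is the polar 'g' (variable-node) update. The partial-sum bit f
 * selects the sign of the first operand:
 *   g(la, lb, f) = lb + (1 - 2 * f) * la,
 * i.e. lb + la for f == 0 and lb - la otherwise.
 */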
static inline float llr_even(const float la, const float lb, const unsigned char f)
{
    switch (f) {
    case 0:
        return lb + la;
    default:
        return lb - la;
    }
}

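/*
 * The next two helpers derive the partial sums needed by the recursive
 * butterfly: even_u_values() gathers every second decoded bit
 * (u[1], u[3], ...), while odd_xor_even_values() combines adjacent pairs
 * (u[0] ^ u[1], u[2] ^ u[3], ...).
 */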
static inline void
even_u_values(unsigned char* u_even, const unsigned char* u, const int u_num)
{
    u++;
    int i;
    for (i = 1; i < u_num; i += 2) {
        *u_even++ = *u;
        u += 2;
    }
}

static inline void
odd_xor_even_values(unsigned char* u_xor, const unsigned char* u, const int u_num)
{
    int i;
    for (i = 1; i < u_num; i += 2) {
        *u_xor++ = *u ^ *(u + 1);
        u += 2;
    }
}

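/*
 * calculate_max_stage_depth_for_row() returns how many consecutive stages can
 * be activated for this row. This equals the number of trailing zero bits in
 * row, capped at frame_exp - 1 because the last stage holds received values.
 */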
static inline int calculate_max_stage_depth_for_row(const int frame_exp, const int row)
{
    int max_stage_depth = 0;
    int half_stage_size = 0x01;
    int stage_size = half_stage_size << 1;
    while (max_stage_depth < (frame_exp - 1)) { // last stage holds received values.
        if (!(row % stage_size < half_stage_size)) {
            break;
        }
        half_stage_size <<= 1;
        stage_size <<= 1;
        max_stage_depth++;
    }
    return max_stage_depth;
}

#ifdef LV_HAVE_GENERIC

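/*
 * Generic butterfly for successive cancellation decoding in natural bit
 * order. Lower-half rows need only a single g-update with the appropriate
 * partial-sum bit; upper-half rows first recurse into the next stage for both
 * partial-sum branches and then combine the results with an f-update.
 */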
static inline void volk_32f_8u_polarbutterfly_32f_generic(float* llrs,
                                                          unsigned char* u,
                                                          const int frame_exp,
                                                          const int stage,
                                                          const int u_num,
                                                          const int row)
{
    const int frame_size = 0x01 << frame_exp;
    const int next_stage = stage + 1;

    const int half_stage_size = 0x01 << stage;
    const int stage_size = half_stage_size << 1;

    const bool is_upper_stage_half = row % stage_size < half_stage_size;

    // this is a natural bit order implementation.
    float* next_llrs = llrs + frame_size; // LLRs are stored in a consecutive array.
    float* call_row_llr = llrs + row;

    const int section = row - (row % stage_size);
    const int jump_size = ((row % half_stage_size) << 1) % stage_size;

    const int next_upper_row = section + jump_size;
    const int next_lower_row = next_upper_row + 1;

    const float* upper_right_llr_ptr = next_llrs + next_upper_row;
    const float* lower_right_llr_ptr = next_llrs + next_lower_row;

    if (!is_upper_stage_half) {
        const int u_pos = u_num >> stage;
        const unsigned char f = u[u_pos - 1];
        *call_row_llr = llr_even(*upper_right_llr_ptr, *lower_right_llr_ptr, f);
        return;
    }

    if (frame_exp > next_stage) {
        unsigned char* u_half = u + frame_size;
        odd_xor_even_values(u_half, u, u_num);
        volk_32f_8u_polarbutterfly_32f_generic(
            next_llrs, u_half, frame_exp, next_stage, u_num, next_upper_row);

        even_u_values(u_half, u, u_num);
        volk_32f_8u_polarbutterfly_32f_generic(
            next_llrs, u_half, frame_exp, next_stage, u_num, next_lower_row);
    }

    *call_row_llr = llr_odd(*upper_right_llr_ptr, *lower_right_llr_ptr);
}

#endif /* LV_HAVE_GENERIC */


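/*
 * The unrolled AVX variant below avoids recursion: odd rows take the single
 * scalar g-update shortcut, rows with fewer than three available stages fall
 * back to the generic kernel, and otherwise the last stage_size decoded bits
 * are re-encoded once so that a vectorized g-stage followed by vectorized
 * min-sum f-stages can sweep down to the smallest SIMD-friendly stage.
 */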
#ifdef LV_HAVE_AVX
#include <immintrin.h>
#include <volk/volk_avx_intrinsics.h>

static inline void volk_32f_8u_polarbutterfly_32f_u_avx(float* llrs,
                                                        unsigned char* u,
                                                        const int frame_exp,
                                                        const int stage,
                                                        const int u_num,
                                                        const int row)
{
    const int frame_size = 0x01 << frame_exp;
    if (row % 2) { // for odd rows just do the only necessary calculation and return.
        const float* next_llrs = llrs + frame_size + row;
        *(llrs + row) = llr_even(*(next_llrs - 1), *next_llrs, u[u_num - 1]);
        return;
    }

    const int max_stage_depth = calculate_max_stage_depth_for_row(frame_exp, row);
    if (max_stage_depth < 3) { // vectorized version needs larger vectors.
        volk_32f_8u_polarbutterfly_32f_generic(llrs, u, frame_exp, stage, u_num, row);
        return;
    }

    int loop_stage = max_stage_depth;
    int stage_size = 0x01 << loop_stage;

    float* src_llr_ptr;
    float* dst_llr_ptr;

    __m256 src0, src1, dst;

    if (row) { // not necessary for row ZERO, i.e. the first bit to be decoded.
        // First do the bit combination for all stages, which effectively
        // re-encodes some already decoded bits.
        unsigned char* u_target = u + frame_size;
        unsigned char* u_temp = u + 2 * frame_size;
        memcpy(u_temp, u + u_num - stage_size, sizeof(unsigned char) * stage_size);

        if (stage_size > 15) {
            volk_8u_x2_encodeframepolar_8u_u_ssse3(u_target, u_temp, stage_size);
        } else {
            volk_8u_x2_encodeframepolar_8u_generic(u_target, u_temp, stage_size);
        }

        src_llr_ptr = llrs + (max_stage_depth + 1) * frame_size + row - stage_size;
        dst_llr_ptr = llrs + max_stage_depth * frame_size + row;

        __m128i fbits;

        int p;
        for (p = 0; p < stage_size; p += 8) {
            fbits = _mm_loadu_si128((__m128i*)u_target);
            u_target += 8;

            src0 = _mm256_loadu_ps(src_llr_ptr);
            src1 = _mm256_loadu_ps(src_llr_ptr + 8);
            src_llr_ptr += 16;

            dst = _mm256_polar_fsign_add_llrs(src0, src1, fbits);

            _mm256_storeu_ps(dst_llr_ptr, dst);
            dst_llr_ptr += 8;
        }

        --loop_stage;
        stage_size >>= 1;
    }

    const int min_stage = stage > 2 ? stage : 2;

    _mm256_zeroall(); // zero all YMM registers to avoid AVX-SSE transition penalties.

    int el;
    while (min_stage < loop_stage) {
        dst_llr_ptr = llrs + loop_stage * frame_size + row;
        src_llr_ptr = dst_llr_ptr + frame_size;
        for (el = 0; el < stage_size; el += 8) {
            src0 = _mm256_loadu_ps(src_llr_ptr);
            src_llr_ptr += 8;
            src1 = _mm256_loadu_ps(src_llr_ptr);
            src_llr_ptr += 8;

            dst = _mm256_polar_minsum_llrs(src0, src1);

            _mm256_storeu_ps(dst_llr_ptr, dst);
            dst_llr_ptr += 8;
        }

        --loop_stage;
        stage_size >>= 1;
    }

    // For stages < 3 the vectors are too small; finish with the scalar version.
    llr_odd_stages(llrs, stage, loop_stage + 1, frame_size, row);
}

#endif /* LV_HAVE_AVX */

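/*
 * The AVX2 variant is identical to the AVX one except that the vectorized
 * g-update uses the AVX2 helper _mm256_polar_fsign_add_llrs_avx2 for the
 * sign selection.
 */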
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

static inline void volk_32f_8u_polarbutterfly_32f_u_avx2(float* llrs,
                                                         unsigned char* u,
                                                         const int frame_exp,
                                                         const int stage,
                                                         const int u_num,
                                                         const int row)
{
    const int frame_size = 0x01 << frame_exp;
    if (row % 2) { // for odd rows just do the only necessary calculation and return.
        const float* next_llrs = llrs + frame_size + row;
        *(llrs + row) = llr_even(*(next_llrs - 1), *next_llrs, u[u_num - 1]);
        return;
    }

    const int max_stage_depth = calculate_max_stage_depth_for_row(frame_exp, row);
    if (max_stage_depth < 3) { // vectorized version needs larger vectors.
        volk_32f_8u_polarbutterfly_32f_generic(llrs, u, frame_exp, stage, u_num, row);
        return;
    }

    int loop_stage = max_stage_depth;
    int stage_size = 0x01 << loop_stage;

    float* src_llr_ptr;
    float* dst_llr_ptr;

    __m256 src0, src1, dst;

    if (row) { // not necessary for row ZERO, i.e. the first bit to be decoded.
        // First do the bit combination for all stages, which effectively
        // re-encodes some already decoded bits.
        unsigned char* u_target = u + frame_size;
        unsigned char* u_temp = u + 2 * frame_size;
        memcpy(u_temp, u + u_num - stage_size, sizeof(unsigned char) * stage_size);

        if (stage_size > 15) {
            volk_8u_x2_encodeframepolar_8u_u_ssse3(u_target, u_temp, stage_size);
        } else {
            volk_8u_x2_encodeframepolar_8u_generic(u_target, u_temp, stage_size);
        }

        src_llr_ptr = llrs + (max_stage_depth + 1) * frame_size + row - stage_size;
        dst_llr_ptr = llrs + max_stage_depth * frame_size + row;

        __m128i fbits;

        int p;
        for (p = 0; p < stage_size; p += 8) {
            fbits = _mm_loadu_si128((__m128i*)u_target);
            u_target += 8;

            src0 = _mm256_loadu_ps(src_llr_ptr);
            src1 = _mm256_loadu_ps(src_llr_ptr + 8);
            src_llr_ptr += 16;

            dst = _mm256_polar_fsign_add_llrs_avx2(src0, src1, fbits);

            _mm256_storeu_ps(dst_llr_ptr, dst);
            dst_llr_ptr += 8;
        }

        --loop_stage;
        stage_size >>= 1;
    }

    const int min_stage = stage > 2 ? stage : 2;

    _mm256_zeroall(); // zero all YMM registers to avoid AVX-SSE transition penalties.

    int el;
    while (min_stage < loop_stage) {
        dst_llr_ptr = llrs + loop_stage * frame_size + row;
        src_llr_ptr = dst_llr_ptr + frame_size;
        for (el = 0; el < stage_size; el += 8) {
            src0 = _mm256_loadu_ps(src_llr_ptr);
            src_llr_ptr += 8;
            src1 = _mm256_loadu_ps(src_llr_ptr);
            src_llr_ptr += 8;

            dst = _mm256_polar_minsum_llrs(src0, src1);

            _mm256_storeu_ps(dst_llr_ptr, dst);
            dst_llr_ptr += 8;
        }

        --loop_stage;
        stage_size >>= 1;
    }

    // For stages < 3 the vectors are too small; finish with the scalar version.
    llr_odd_stages(llrs, stage, loop_stage + 1, frame_size, row);
}

#endif /* LV_HAVE_AVX2 */

#endif /* VOLK_KERNELS_VOLK_VOLK_32F_8U_POLARBUTTERFLY_32F_H_ */
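A minimal usage sketch for the kernel above, modeled on a successive-cancellation decoding loop. The names sc_decode_sketch, channel_llrs, and frozen_mask are hypothetical, and the buffer sizes are conservative guesses derived from the pointer arithmetic in the kernels: the LLRs occupy (frame_exp + 1) * frame_size floats, with the received channel LLRs in the last stage.

#include <stdlib.h>
#include "volk_32f_8u_polarbutterfly_32f.h"

void sc_decode_sketch(const float* channel_llrs,
                      const unsigned char* frozen_mask, /* 1 = frozen position */
                      const int frame_exp)
{
    const int frame_size = 0x01 << frame_exp;
    float* llrs = (float*)calloc((frame_exp + 1) * frame_size, sizeof(float));
    unsigned char* u = (unsigned char*)calloc((frame_exp + 1) * frame_size, 1);
    int row;

    // the received channel LLRs occupy the last stage of the butterfly buffer.
    for (row = 0; row < frame_size; row++) {
        llrs[frame_exp * frame_size + row] = channel_llrs[row];
    }

    for (row = 0; row < frame_size; row++) {
        // compute the decision LLR for bit 'row'; u holds the bits decoded so far.
        volk_32f_8u_polarbutterfly_32f_generic(llrs, u, frame_exp, 0, row, row);
        // frozen positions are forced to 0; the rest get a hard decision.
        u[row] = frozen_mask[row] ? 0 : (llrs[row] > 0.0f ? 0 : 1);
    }
    // u[0 .. frame_size - 1] now holds the decoded bit vector.

    free(llrs);
    free(u);
}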