Vector Optimized Library of Kernels  2.5.0
Architecture-tuned implementations of math kernels
volk_32u_byteswap.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
66 #ifndef INCLUDED_volk_32u_byteswap_u_H
67 #define INCLUDED_volk_32u_byteswap_u_H
68 
69 #include <inttypes.h>
70 #include <stdio.h>
71 
72 #if LV_HAVE_AVX2
73 #include <immintrin.h>
/*
 * Reverse the byte order of every 32-bit word in intsToSwap, in place.
 *
 * intsToSwap: buffer of num_points 32-bit words, overwritten with the result.
 * num_points: number of 32-bit words to process.
 *
 * Eight words are handled per iteration with unaligned AVX2 loads/stores;
 * the leftover (num_points % 8) words are swapped with scalar code.
 */
static inline void volk_32u_byteswap_u_avx2(uint32_t* intsToSwap, unsigned int num_points)
{
    // Source byte index for every destination byte: each group of four
    // consecutive bytes is reversed.
    const uint8_t byteOrder[32] = { 3,  2,  1,  0,  7,  6,  5,  4,
                                    11, 10, 9,  8,  15, 14, 13, 12,
                                    19, 18, 17, 16, 23, 22, 21, 20,
                                    27, 26, 25, 24, 31, 30, 29, 28 };
    const __m256i reverseMask = _mm256_loadu_si256((__m256i*)&byteOrder);

    const unsigned int wordsPerVector = 8;
    const unsigned int vectorCount = num_points / wordsPerVector;
    uint32_t* cursor = intsToSwap;
    unsigned int i;

    // Vector part: load, shuffle and write back to the same location.
    for (i = 0; i < vectorCount; i++) {
        __m256i* slot = (__m256i*)cursor;
        const __m256i words = _mm256_loadu_si256(slot);
        _mm256_storeu_si256(slot, _mm256_shuffle_epi8(words, reverseMask));
        cursor += wordsPerVector;
    }

    // Scalar part: whatever did not fill a whole vector.
    for (i = vectorCount * wordsPerVector; i < num_points; i++) {
        uint32_t word = *cursor;
        // Swap the bytes inside each 16-bit half, then swap the two halves.
        word = ((word << 8) & 0xff00ff00) | ((word >> 8) & 0x00ff00ff);
        *cursor++ = (word << 16) | (word >> 16);
    }
}
110 #endif /* LV_HAVE_AVX2 */
111 
112 
113 #ifdef LV_HAVE_SSE2
114 #include <emmintrin.h>
115 
/*
 * Reverse the byte order of every 32-bit word in intsToSwap, in place.
 *
 * intsToSwap: buffer of num_points 32-bit words, overwritten with the result.
 * num_points: number of 32-bit words to process.
 *
 * Four words are handled per iteration with unaligned SSE2 loads/stores;
 * the leftover (num_points % 4) words are swapped with scalar code.
 */
static inline void volk_32u_byteswap_u_sse2(uint32_t* intsToSwap, unsigned int num_points)
{
    const __m128i keepByte2 = _mm_set1_epi32(0x00FF0000);
    const __m128i keepByte1 = _mm_set1_epi32(0x0000FF00);
    const unsigned int vectorCount = num_points / 4;
    uint32_t* cursor = intsToSwap;
    unsigned int i;

    for (i = 0; i < vectorCount; i++, cursor += 4) {
        const __m128i words = _mm_loadu_si128((__m128i*)cursor);

        // Outer bytes: a 24-bit shift leaves exactly one byte, no mask needed.
        const __m128i outer =
            _mm_or_si128(_mm_slli_epi32(words, 24), _mm_srli_epi32(words, 24));
        // Inner bytes: an 8-bit shift drags a neighbour along, so mask it off.
        const __m128i upperMid = _mm_and_si128(_mm_slli_epi32(words, 8), keepByte2);
        const __m128i lowerMid = _mm_and_si128(_mm_srli_epi32(words, 8), keepByte1);

        _mm_storeu_si128((__m128i*)cursor,
                         _mm_or_si128(_mm_or_si128(outer, upperMid), lowerMid));
    }

    // Scalar clean-up for the words that did not fill a whole vector.
    for (i = vectorCount * 4; i < num_points; i++) {
        const uint32_t word = *cursor;
        *cursor++ = ((word >> 24) & 0xff) | ((word >> 8) & 0x0000ff00) |
                    ((word << 8) & 0x00ff0000) | ((word << 24) & 0xff000000);
    }
}
155 #endif /* LV_HAVE_SSE2 */
156 
157 
158 #ifdef LV_HAVE_NEON
159 #include <arm_neon.h>
160 
/*
 * Reverse the byte order of every 32-bit word in intsToSwap, in place.
 *
 * intsToSwap: buffer of num_points 32-bit words, overwritten with the result.
 * num_points: number of 32-bit words to process.
 *
 * Eight words are handled per iteration with a NEON table lookup; the
 * leftover (num_points % 8) words are swapped with scalar code.
 */
static inline void volk_32u_byteswap_neon(uint32_t* intsToSwap, unsigned int num_points)
{
    const unsigned int blockCount = num_points / 8;
    uint32_t* cursor = intsToSwap;
    unsigned int i;

    /*
     * Table indices for vtbl4_u8, packed little-endian into 64-bit constants
     * (one index per byte). Each constant produces two byte-swapped words;
     * e.g. pickWords01 holds the indices {24, 16, 8, 0, 25, 17, 9, 1}.
     * The other three constants are the same pattern shifted by 2, 4 and 6.
     */
    const uint8x8_t pickWords01 = vcreate_u8(0x0109111900081018ULL);
    const uint8x8_t pickWords23 = vcreate_u8(0x030B131B020A121AULL);
    const uint8x8_t pickWords45 = vcreate_u8(0x050D151D040C141CULL);
    const uint8x8_t pickWords67 = vcreate_u8(0x070F171F060E161EULL);

    for (i = 0; i < blockCount; ++i) {
        uint8_t* bytes = (uint8_t*)cursor;

        // All four lookups read the table loaded here, so every result is
        // computed before the first store overwrites the input.
        const uint8x8x4_t table = vld4_u8(bytes);
        const uint8x8_t words01 = vtbl4_u8(table, pickWords01);
        const uint8x8_t words23 = vtbl4_u8(table, pickWords23);
        const uint8x8_t words45 = vtbl4_u8(table, pickWords45);
        const uint8x8_t words67 = vtbl4_u8(table, pickWords67);

        // Each 8-byte store covers two 32-bit words.
        vst1_u8(bytes, words01);
        vst1_u8(bytes + 8, words23);
        vst1_u8(bytes + 16, words45);
        vst1_u8(bytes + 24, words67);

        cursor += 8;
    }

    // Scalar clean-up for the words that did not fill a whole block.
    for (i = blockCount * 8; i < num_points; ++i) {
        const uint32_t word = *cursor;
        *cursor++ = ((word >> 24) & 0xff) | ((word >> 8) & 0x0000ff00) |
                    ((word << 8) & 0x00ff0000) | ((word << 24) & 0xff000000);
    }
}
207 #endif /* LV_HAVE_NEON */
208 
209 #ifdef LV_HAVE_NEONV8
210 #include <arm_neon.h>
211 
/*
 * Reverse the byte order of every 32-bit word in intsToSwap, in place.
 *
 * intsToSwap: buffer of num_points 32-bit words, overwritten with the result.
 * num_points: number of 32-bit words to process.
 *
 * Eight words (two 16-byte vectors) are handled per outer iteration using a
 * byte table lookup; the leftover (num_points % 8) words are swapped with
 * scalar code.
 */
static inline void volk_32u_byteswap_neonv8(uint32_t* intsToSwap, unsigned int num_points)
{
    // Source byte index for every destination byte of one 16-byte vector.
    const uint8x16_t reverseIdx = { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 };
    const unsigned int blockCount = num_points / 8;
    uint32_t* cursor = intsToSwap;
    unsigned int block;
    unsigned int half;
    unsigned int i;

    for (block = 0; block < blockCount; ++block) {
        // Hint at the block that follows the one being processed.
        __VOLK_PREFETCH(cursor + 8);

        // Two 4-word vectors make up one 8-word block.
        for (half = 0; half < 2; ++half) {
            uint8_t* bytes = (uint8_t*)cursor;
            vst1q_u8(bytes, vqtbl1q_u8(vld1q_u8(bytes), reverseIdx));
            cursor += 4;
        }
    }

    // Scalar clean-up for the words that did not fill a whole block.
    for (i = blockCount * 8; i < num_points; ++i) {
        uint32_t word = *cursor;
        // Swap the bytes inside each 16-bit half, then swap the two halves.
        word = ((word << 8) & 0xff00ff00) | ((word >> 8) & 0x00ff00ff);
        *cursor++ = (word << 16) | (word >> 16);
    }
}
242 #endif /* LV_HAVE_NEONV8 */
243 
244 
245 #ifdef LV_HAVE_GENERIC
246 
/*
 * Reverse the byte order of every 32-bit word in intsToSwap, in place.
 *
 * intsToSwap: buffer of num_points 32-bit words, overwritten with the result.
 * num_points: number of 32-bit words to process.
 *
 * Portable scalar implementation.
 */
static inline void volk_32u_byteswap_generic(uint32_t* intsToSwap,
                                             unsigned int num_points)
{
    uint32_t* cursor = intsToSwap;
    unsigned int remaining;

    for (remaining = num_points; remaining != 0; --remaining) {
        uint32_t word = *cursor;
        // Swap the bytes inside each 16-bit half, then swap the two halves.
        word = ((word << 8) & 0xff00ff00) | ((word >> 8) & 0x00ff00ff);
        *cursor++ = (word << 16) | (word >> 16);
    }
}
262 #endif /* LV_HAVE_GENERIC */
263 
264 
265 #endif /* INCLUDED_volk_32u_byteswap_u_H */
266 #ifndef INCLUDED_volk_32u_byteswap_a_H
267 #define INCLUDED_volk_32u_byteswap_a_H
268 
269 #include <inttypes.h>
270 #include <stdio.h>
271 
272 
273 #if LV_HAVE_AVX2
274 #include <immintrin.h>
/*
 * Reverse the byte order of every 32-bit word in intsToSwap, in place.
 *
 * intsToSwap: buffer of num_points 32-bit words, overwritten with the result.
 *             The data is accessed with the aligned AVX2 load/store
 *             intrinsics, so the buffer must satisfy their alignment
 *             requirement.
 * num_points: number of 32-bit words to process.
 *
 * Eight words are handled per iteration; the leftover (num_points % 8)
 * words are swapped with scalar code.
 */
static inline void volk_32u_byteswap_a_avx2(uint32_t* intsToSwap, unsigned int num_points)
{
    // Source byte index for every destination byte: each group of four
    // consecutive bytes is reversed. This local table is read with an
    // unaligned load.
    const uint8_t byteOrder[32] = { 3,  2,  1,  0,  7,  6,  5,  4,
                                    11, 10, 9,  8,  15, 14, 13, 12,
                                    19, 18, 17, 16, 23, 22, 21, 20,
                                    27, 26, 25, 24, 31, 30, 29, 28 };
    const __m256i reverseMask = _mm256_loadu_si256((__m256i*)&byteOrder);

    const unsigned int wordsPerVector = 8;
    const unsigned int vectorCount = num_points / wordsPerVector;
    uint32_t* cursor = intsToSwap;
    unsigned int i;

    // Vector part: load, shuffle and write back to the same location.
    for (i = 0; i < vectorCount; i++) {
        __m256i* slot = (__m256i*)cursor;
        const __m256i words = _mm256_load_si256(slot);
        _mm256_store_si256(slot, _mm256_shuffle_epi8(words, reverseMask));
        cursor += wordsPerVector;
    }

    // Scalar part: whatever did not fill a whole vector.
    for (i = vectorCount * wordsPerVector; i < num_points; i++) {
        uint32_t word = *cursor;
        // Swap the bytes inside each 16-bit half, then swap the two halves.
        word = ((word << 8) & 0xff00ff00) | ((word >> 8) & 0x00ff00ff);
        *cursor++ = (word << 16) | (word >> 16);
    }
}
311 #endif /* LV_HAVE_AVX2 */
312 
313 
314 #ifdef LV_HAVE_SSE2
315 #include <emmintrin.h>
316 
317 
/*
 * Reverse the byte order of every 32-bit word in intsToSwap, in place.
 *
 * intsToSwap: buffer of num_points 32-bit words, overwritten with the result.
 *             The data is accessed with the aligned SSE2 load/store
 *             intrinsics, so the buffer must satisfy their alignment
 *             requirement.
 * num_points: number of 32-bit words to process.
 *
 * Four words are handled per iteration; the leftover (num_points % 4)
 * words are swapped with scalar code.
 */
static inline void volk_32u_byteswap_a_sse2(uint32_t* intsToSwap, unsigned int num_points)
{
    const __m128i keepByte2 = _mm_set1_epi32(0x00FF0000);
    const __m128i keepByte1 = _mm_set1_epi32(0x0000FF00);
    const unsigned int vectorCount = num_points / 4;
    uint32_t* cursor = intsToSwap;
    unsigned int i;

    for (i = 0; i < vectorCount; i++, cursor += 4) {
        const __m128i words = _mm_load_si128((__m128i*)cursor);

        // Outer bytes: a 24-bit shift leaves exactly one byte, no mask needed.
        const __m128i outer =
            _mm_or_si128(_mm_slli_epi32(words, 24), _mm_srli_epi32(words, 24));
        // Inner bytes: an 8-bit shift drags a neighbour along, so mask it off.
        const __m128i upperMid = _mm_and_si128(_mm_slli_epi32(words, 8), keepByte2);
        const __m128i lowerMid = _mm_and_si128(_mm_srli_epi32(words, 8), keepByte1);

        _mm_store_si128((__m128i*)cursor,
                        _mm_or_si128(_mm_or_si128(outer, upperMid), lowerMid));
    }

    // Scalar clean-up for the words that did not fill a whole vector.
    for (i = vectorCount * 4; i < num_points; i++) {
        const uint32_t word = *cursor;
        *cursor++ = ((word >> 24) & 0xff) | ((word >> 8) & 0x0000ff00) |
                    ((word << 8) & 0x00ff0000) | ((word << 24) & 0xff000000);
    }
}
357 #endif /* LV_HAVE_SSE2 */
358 
359 
360 #ifdef LV_HAVE_GENERIC
361 
/*
 * Reverse the byte order of every 32-bit word in intsToSwap, in place.
 *
 * intsToSwap: buffer of num_points 32-bit words, overwritten with the result.
 * num_points: number of 32-bit words to process.
 *
 * Portable scalar implementation; it uses plain word accesses only.
 */
static inline void volk_32u_byteswap_a_generic(uint32_t* intsToSwap,
                                               unsigned int num_points)
{
    unsigned int idx = 0;

    while (idx < num_points) {
        const uint32_t word = intsToSwap[idx];
        const uint32_t lowToHigh = (word << 24) & 0xff000000;
        const uint32_t midUp = (word << 8) & 0x00ff0000;
        const uint32_t midDown = (word >> 8) & 0x0000ff00;
        const uint32_t highToLow = (word >> 24) & 0xff;

        intsToSwap[idx] = lowToHigh | midUp | midDown | highToLow;
        idx++;
    }
}
377 #endif /* LV_HAVE_GENERIC */
378 
379 
380 #endif /* INCLUDED_volk_32u_byteswap_a_H */
static void volk_32u_byteswap_neon(uint32_t *intsToSwap, unsigned int num_points)
Definition: volk_32u_byteswap.h:161
static void volk_32u_byteswap_generic(uint32_t *intsToSwap, unsigned int num_points)
Definition: volk_32u_byteswap.h:247
static void volk_32u_byteswap_a_generic(uint32_t *intsToSwap, unsigned int num_points)
Definition: volk_32u_byteswap.h:362
static void volk_32u_byteswap_u_sse2(uint32_t *intsToSwap, unsigned int num_points)
Definition: volk_32u_byteswap.h:116
static void volk_32u_byteswap_a_sse2(uint32_t *intsToSwap, unsigned int num_points)
Definition: volk_32u_byteswap.h:318
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:62