LIBINT  2.6.0
vector_x86.h
/*
 *  Copyright (C) 2004-2019 Edward F. Valeev
 *
 *  This file is part of Libint.
 *
 *  Libint is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Lesser General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  Libint is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with Libint.  If not, see <http://www.gnu.org/licenses/>.
 *
 */

#ifndef _libint2_src_lib_libint_vectorx86_h_
#define _libint2_src_lib_libint_vectorx86_h_

#include <cstring>
#include <cmath>
#include <iostream>

#include <libint2/util/cxxstd.h>
#include <libint2/util/type_traits.h>

#if defined(__SSE2__) || defined(__SSE__) || defined(__AVX__)
# include <x86intrin.h>
#endif

#ifdef __SSE2__

namespace libint2 { namespace simd {

  /// SIMD vector of 2 double-precision floating-point real numbers, operations on which use SSE2 instructions
  struct VectorSSEDouble {

      typedef double T;
      __m128d d;

      /// creates a vector of default-initialized values.
      VectorSSEDouble() {}

      /// Initializes all elements to the same value
      /// @param a the value to which all elements will be set
      VectorSSEDouble(T a) {
        d = _mm_set_pd(a, a);
      }

      /// creates a vector of values initialized by an ordinary static-sized array
      VectorSSEDouble(T (&a)[2]) {
        d = _mm_loadu_pd(&a[0]);
      }

      /// creates a vector with the given elements, a0 in element 0
      VectorSSEDouble(T a0, T a1) {
        d = _mm_set_pd(a1, a0);
      }

      /// converts a 128-bit SSE double vector type to VectorSSEDouble
      VectorSSEDouble(__m128d a) {
        d = a;
      }

      VectorSSEDouble& operator=(T a) {
        d = _mm_set_pd(a, a);
        return *this;
      }

      VectorSSEDouble& operator+=(VectorSSEDouble a) {
        d = _mm_add_pd(d, a.d);
        return *this;
      }

      VectorSSEDouble& operator-=(VectorSSEDouble a) {
        d = _mm_sub_pd(d, a.d);
        return *this;
      }

      VectorSSEDouble operator-() const {
        // TODO improve using bitflips
        const static __m128d minus_one = _mm_set_pd(-1.0, -1.0);
        VectorSSEDouble result;
        result.d = _mm_mul_pd(this->d, minus_one);
        return result;
      }

#if LIBINT2_CPLUSPLUS_STD >= 2011
      explicit
#endif
      operator double() const {
        double d0[2];
        // an unaligned store (_mm_storeu_pd) segfaults on OS X here, hence memcpy;
        // an aligned store would need C++11 alignas, and efficiency does not matter here
        ::memcpy(&(d0[0]), &d, sizeof(__m128d));
        return d0[0];
      }

      /// converts to the underlying 128-bit SSE double vector type
      operator __m128d() const {
        return d;
      }

      /// loads a to this
      void load(T const* a) {
        d = _mm_loadu_pd(a);
      }
      /// loads aligned a to this
      void load_aligned(T const* a) {
        d = _mm_load_pd(a);
      }
      /// writes this to a
      void convert(T* a) const {
        _mm_storeu_pd(&a[0], d);
      }
      /// writes this to aligned a
      void convert_aligned(T* a) const {
        _mm_store_pd(&a[0], d);
      }

  };

  inline VectorSSEDouble operator*(double a, VectorSSEDouble b) {
    VectorSSEDouble c;
    VectorSSEDouble _a(a);
    c.d = _mm_mul_pd(_a.d, b.d);
    return c;
  }

  inline VectorSSEDouble operator*(VectorSSEDouble a, double b) {
    VectorSSEDouble c;
    VectorSSEDouble _b(b);
    c.d = _mm_mul_pd(a.d, _b.d);
    return c;
  }

  inline VectorSSEDouble operator*(int a, VectorSSEDouble b) {
    if (a == 1)
      return b;
    else {
      VectorSSEDouble c;
      VectorSSEDouble _a((double)a);
      c.d = _mm_mul_pd(_a.d, b.d);
      return c;
    }
  }

  inline VectorSSEDouble operator*(VectorSSEDouble a, int b) {
    if (b == 1)
      return a;
    else {
      VectorSSEDouble c;
      VectorSSEDouble _b((double)b);
      c.d = _mm_mul_pd(a.d, _b.d);
      return c;
    }
  }

  inline VectorSSEDouble operator*(VectorSSEDouble a, VectorSSEDouble b) {
    VectorSSEDouble c;
    c.d = _mm_mul_pd(a.d, b.d);
    return c;
  }

  inline VectorSSEDouble operator+(VectorSSEDouble a, VectorSSEDouble b) {
    VectorSSEDouble c;
    c.d = _mm_add_pd(a.d, b.d);
    return c;
  }

  inline VectorSSEDouble operator-(VectorSSEDouble a, VectorSSEDouble b) {
    VectorSSEDouble c;
    c.d = _mm_sub_pd(a.d, b.d);
    return c;
  }

  inline VectorSSEDouble operator/(VectorSSEDouble a, VectorSSEDouble b) {
    VectorSSEDouble c;
    c.d = _mm_div_pd(a.d, b.d);
    return c;
  }

#if defined(__FMA__)
  inline VectorSSEDouble fma_plus(VectorSSEDouble a, VectorSSEDouble b, VectorSSEDouble c) {
    VectorSSEDouble d;
    d.d = _mm_fmadd_pd(a.d, b.d, c.d);
    return d;
  }
  inline VectorSSEDouble fma_minus(VectorSSEDouble a, VectorSSEDouble b, VectorSSEDouble c) {
    VectorSSEDouble d;
    d.d = _mm_fmsub_pd(a.d, b.d, c.d);
    return d;
  }
#elif defined(__FMA4__)
  inline VectorSSEDouble fma_plus(VectorSSEDouble a, VectorSSEDouble b, VectorSSEDouble c) {
    VectorSSEDouble d;
    d.d = _mm_macc_pd(a.d, b.d, c.d);
    return d;
  }
  inline VectorSSEDouble fma_minus(VectorSSEDouble a, VectorSSEDouble b, VectorSSEDouble c) {
    VectorSSEDouble d;
    d.d = _mm_msub_pd(a.d, b.d, c.d);
    return d;
  }
#endif
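
  // Illustrative note (not part of the original header): fma_plus(a, b, c) computes a * b + c
  // and fma_minus(a, b, c) computes a * b - c, element-wise, using a single fused instruction
  // when the build enables FMA3 (__FMA__) or FMA4 (__FMA4__). A minimal sketch, assuming such
  // a build:
  //
  //   VectorSSEDouble a(1.0, 2.0), b(3.0, 4.0), c(0.5, 0.5);
  //   VectorSSEDouble r = fma_plus(a, b, c);   // r = {1*3 + 0.5, 2*4 + 0.5} = {3.5, 8.5}
  //
  // Without either macro these overloads are not defined here; the generic unfused
  // fma_plus/fma_minus in intrinsic_operations.h (returning x * y + z / x * y - z) apply instead.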

  /// Horizontal add
  /// @return a[0] + a[1]
  inline double horizontal_add(VectorSSEDouble const & a) {
// Agner Fog's version
#if defined(__SSE3__)
    __m128d t1 = _mm_hadd_pd(a,a);
    return _mm_cvtsd_f64(t1);
#else // SSE2 only
    __m128 t0 = _mm_castpd_ps(a);
    __m128d t1 = _mm_castps_pd(_mm_movehl_ps(t0,t0));
    __m128d t2 = _mm_add_sd(a,t1);
    return _mm_cvtsd_f64(t2);
#endif
  }

  /// Horizontal add of a pair of vectors
  /// @return {a[0]+a[1], b[0]+b[1]}
  inline VectorSSEDouble horizontal_add(VectorSSEDouble const & a, VectorSSEDouble const & b) {
#if defined(__SSE3__)
    return _mm_hadd_pd(a, b);
#else // will be very inefficient without SSE3
    return VectorSSEDouble(horizontal_add(a), horizontal_add(b));
#endif
  }
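
  // Illustrative note (not part of the original header): horizontal_add reduces across SIMD
  // lanes, not across vectors. A short sketch, assuming SSE2:
  //
  //   VectorSSEDouble a(1.0, 2.0), b(10.0, 20.0);
  //   double s = horizontal_add(a);              // 1.0 + 2.0 = 3.0
  //   VectorSSEDouble t = horizontal_add(a, b);  // {1.0+2.0, 10.0+20.0} = {3.0, 30.0}
  //
  // With SSE3 the pairwise overload is a single _mm_hadd_pd; without it, it degenerates to two
  // scalar reductions.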

  inline VectorSSEDouble exp(VectorSSEDouble a) {
#if HAVE_INTEL_SVML
    VectorSSEDouble result;
    result.d = _mm_exp_pd(a.d);
#else
    double a_d[2]; a.convert(a_d);
    for(int i=0; i<2; ++i) a_d[i] = std::exp(a_d[i]);
    VectorSSEDouble result(a_d);
#endif
    return result;
  }
  inline VectorSSEDouble sqrt(VectorSSEDouble a) {
#if HAVE_INTEL_SVML
    VectorSSEDouble result;
    result.d = _mm_sqrt_pd(a.d);
#else
    double a_d[2]; a.convert(a_d);
    for(int i=0; i<2; ++i) a_d[i] = std::sqrt(a_d[i]);
    VectorSSEDouble result(a_d);
#endif
    return result;
  }
  inline VectorSSEDouble erf(VectorSSEDouble a) {
#if HAVE_INTEL_SVML
    VectorSSEDouble result;
    result.d = _mm_erf_pd(a.d);
#else
    double a_d[2]; a.convert(a_d);
    for(int i=0; i<2; ++i) a_d[i] = ::erf(a_d[i]);
    VectorSSEDouble result(a_d);
#endif
    return result;
  }
  inline VectorSSEDouble erfc(VectorSSEDouble a) {
#if HAVE_INTEL_SVML
    VectorSSEDouble result;
    result.d = _mm_erfc_pd(a.d);
#else
    double a_d[2]; a.convert(a_d);
    for(int i=0; i<2; ++i) a_d[i] = ::erfc(a_d[i]);
    VectorSSEDouble result(a_d);
#endif
    return result;
  }

}} // namespace libint2::simd

inline std::ostream& operator<<(std::ostream& os, libint2::simd::VectorSSEDouble a) {
  double ad[2];
  a.convert(ad);
  os << "{" << ad[0] << "," << ad[1] << "}";
  return os;
}

namespace libint2 {

  template <>
  struct is_vector<simd::VectorSSEDouble> {
      static const bool value = true;
  };

  template <>
  struct vector_traits<simd::VectorSSEDouble> {
      typedef double scalar_type;
      static const size_t extent = 2;
  };

} // namespace libint2

#endif // SSE2-only
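
// Illustrative usage sketch (not part of the original header), assuming an SSE2-enabled build
// and that this header is reachable as <libint2/util/vector_x86.h> (adjust the path to your
// install layout). It shows the intended round trip: load, element-wise arithmetic, reduce,
// store.
//
//   #include <libint2/util/vector_x86.h>
//   #include <iostream>
//
//   int main() {
//     using libint2::simd::VectorSSEDouble;
//     double xs[2] = {1.0, 2.0};
//     VectorSSEDouble x;
//     x.load(xs);                           // unaligned load of 2 doubles
//     VectorSSEDouble y(3.0);               // broadcast
//     VectorSSEDouble z = 2.0 * x + y / x;  // element-wise: {5.0, 7.5}
//     std::cout << z << std::endl;          // prints {5,7.5}
//     double sum = horizontal_add(z);       // 12.5 (found via ADL in libint2::simd)
//     double out[2];
//     z.convert(out);                       // store back to memory
//     return sum > 0.0 ? 0 : 1;
//   }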

#ifdef __SSE__

namespace libint2 { namespace simd {

  /// SIMD vector of 4 single-precision floating-point real numbers, operations on which use SSE instructions
  struct VectorSSEFloat {

      typedef float T;
      __m128 d;

      /// creates a vector of default-initialized values.
      VectorSSEFloat() {}

      /// Initializes all elements to the same value
      /// @param a the value to which all elements will be set
      VectorSSEFloat(T a) {
        d = _mm_set_ps(a, a, a, a);
      }

      /// creates a vector of values initialized by an ordinary static-sized array
      VectorSSEFloat(T (&a)[4]) {
        d = _mm_loadu_ps(&a[0]);
      }

      /// creates a vector with the given elements, a0 in element 0
      VectorSSEFloat(T a0, T a1, T a2, T a3) {
        d = _mm_set_ps(a3, a2, a1, a0);
      }

      VectorSSEFloat& operator=(T a) {
        d = _mm_set_ps(a, a, a, a);
        return *this;
      }

      VectorSSEFloat& operator+=(VectorSSEFloat a) {
        d = _mm_add_ps(d, a.d);
        return *this;
      }

      VectorSSEFloat& operator-=(VectorSSEFloat a) {
        d = _mm_sub_ps(d, a.d);
        return *this;
      }

      VectorSSEFloat operator-() const {
        // TODO improve using bitflips
        const static __m128 minus_one = _mm_set_ps(-1.0, -1.0, -1.0, -1.0);
        VectorSSEFloat result;
        result.d = _mm_mul_ps(this->d, minus_one);
        return result;
      }

#if LIBINT2_CPLUSPLUS_STD >= 2011
      explicit
#endif
      operator float() const {
        float d0[4];
        // an unaligned store (_mm_storeu_ps) segfaults on OS X here, hence memcpy;
        // an aligned store would need C++11 alignas, and efficiency does not matter here
        ::memcpy(&(d0[0]), &d, sizeof(__m128));
        return d0[0];
      }

#if LIBINT2_CPLUSPLUS_STD >= 2011
      explicit
#endif
      operator double() const {
        const float result_flt = this->operator float();
        return result_flt;
      }

      /// converts to the underlying 128-bit SSE float vector type
      operator __m128() const {
        return d;
      }

      /// loads a to this
      void load(T const* a) {
        d = _mm_loadu_ps(a);
      }
      /// loads aligned a to this
      void load_aligned(T const* a) {
        d = _mm_load_ps(a);
      }
      /// writes this to a
      void convert(T* a) const {
        _mm_storeu_ps(&a[0], d);
      }
      /// writes this to aligned a
      void convert_aligned(T* a) const {
        _mm_store_ps(&a[0], d);
      }
  };

  inline VectorSSEFloat operator*(float a, VectorSSEFloat b) {
    VectorSSEFloat c;
    VectorSSEFloat _a(a);
    c.d = _mm_mul_ps(_a.d, b.d);
    return c;
  }

  inline VectorSSEFloat operator*(VectorSSEFloat a, float b) {
    VectorSSEFloat c;
    VectorSSEFloat _b(b);
    c.d = _mm_mul_ps(a.d, _b.d);
    return c;
  }

  // narrows a!
  inline VectorSSEFloat operator*(double a, VectorSSEFloat b) {
    VectorSSEFloat c;
    VectorSSEFloat _a((float)a);
    c.d = _mm_mul_ps(_a.d, b.d);
    return c;
  }

  // narrows b!
  inline VectorSSEFloat operator*(VectorSSEFloat a, double b) {
    VectorSSEFloat c;
    VectorSSEFloat _b((float)b);
    c.d = _mm_mul_ps(a.d, _b.d);
    return c;
  }

  inline VectorSSEFloat operator*(int a, VectorSSEFloat b) {
    if (a == 1)
      return b;
    else {
      VectorSSEFloat c;
      VectorSSEFloat _a((float)a);
      c.d = _mm_mul_ps(_a.d, b.d);
      return c;
    }
  }

  inline VectorSSEFloat operator*(VectorSSEFloat a, int b) {
    if (b == 1)
      return a;
    else {
      VectorSSEFloat c;
      VectorSSEFloat _b((float)b);
      c.d = _mm_mul_ps(a.d, _b.d);
      return c;
    }
  }

  inline VectorSSEFloat operator*(VectorSSEFloat a, VectorSSEFloat b) {
    VectorSSEFloat c;
    c.d = _mm_mul_ps(a.d, b.d);
    return c;
  }

  inline VectorSSEFloat operator+(VectorSSEFloat a, VectorSSEFloat b) {
    VectorSSEFloat c;
    c.d = _mm_add_ps(a.d, b.d);
    return c;
  }

  inline VectorSSEFloat operator-(VectorSSEFloat a, VectorSSEFloat b) {
    VectorSSEFloat c;
    c.d = _mm_sub_ps(a.d, b.d);
    return c;
  }

  inline VectorSSEFloat operator/(VectorSSEFloat a, VectorSSEFloat b) {
    VectorSSEFloat c;
    c.d = _mm_div_ps(a.d, b.d);
    return c;
  }

#if defined(__FMA__)
  inline VectorSSEFloat fma_plus(VectorSSEFloat a, VectorSSEFloat b, VectorSSEFloat c) {
    VectorSSEFloat d;
    d.d = _mm_fmadd_ps(a.d, b.d, c.d);
    return d;
  }
  inline VectorSSEFloat fma_minus(VectorSSEFloat a, VectorSSEFloat b, VectorSSEFloat c) {
    VectorSSEFloat d;
    d.d = _mm_fmsub_ps(a.d, b.d, c.d);
    return d;
  }
#elif defined(__FMA4__)
  inline VectorSSEFloat fma_plus(VectorSSEFloat a, VectorSSEFloat b, VectorSSEFloat c) {
    VectorSSEFloat d;
    d.d = _mm_macc_ps(a.d, b.d, c.d);
    return d;
  }
  inline VectorSSEFloat fma_minus(VectorSSEFloat a, VectorSSEFloat b, VectorSSEFloat c) {
    VectorSSEFloat d;
    d.d = _mm_msub_ps(a.d, b.d, c.d);
    return d;
  }
#endif


  inline VectorSSEFloat exp(VectorSSEFloat a) {
#if HAVE_INTEL_SVML
    VectorSSEFloat result;
    result.d = _mm_exp_ps(a.d);
#else
    float a_d[4]; a.convert(a_d);
    for(int i=0; i<4; ++i) a_d[i] = std::exp(a_d[i]);
    VectorSSEFloat result(a_d);
#endif
    return result;
  }
  inline VectorSSEFloat sqrt(VectorSSEFloat a) {
#if HAVE_INTEL_SVML
    VectorSSEFloat result;
    result.d = _mm_sqrt_ps(a.d);
#else
    float a_d[4]; a.convert(a_d);
    for(int i=0; i<4; ++i) a_d[i] = std::sqrt(a_d[i]);
    VectorSSEFloat result(a_d);
#endif
    return result;
  }
  inline VectorSSEFloat erf(VectorSSEFloat a) {
#if HAVE_INTEL_SVML
    VectorSSEFloat result;
    result.d = _mm_erf_ps(a.d);
#else
    float a_d[4]; a.convert(a_d);
    for(int i=0; i<4; ++i) a_d[i] = ::erf(a_d[i]);
    VectorSSEFloat result(a_d);
#endif
    return result;
  }
  inline VectorSSEFloat erfc(VectorSSEFloat a) {
#if HAVE_INTEL_SVML
    VectorSSEFloat result;
    result.d = _mm_erfc_ps(a.d);
#else
    float a_d[4]; a.convert(a_d);
    for(int i=0; i<4; ++i) a_d[i] = ::erfc(a_d[i]);
    VectorSSEFloat result(a_d);
#endif
    return result;
  }

}} // namespace libint2::simd

inline std::ostream& operator<<(std::ostream& os, libint2::simd::VectorSSEFloat a) {
  float ad[4];
  a.convert(ad);
  os << "{" << ad[0] << "," << ad[1] << "," << ad[2] << "," << ad[3] << "}";
  return os;
}

namespace libint2 {

  template <>
  struct is_vector<simd::VectorSSEFloat> {
      static const bool value = true;
  };

  template <>
  struct vector_traits<simd::VectorSSEFloat> {
      typedef float scalar_type;
      static const size_t extent = 4;
  };

} // namespace libint2

#endif // SSE-only
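
// Illustrative note (not part of the original header): VectorSSEFloat mirrors VectorSSEDouble
// with 4 single-precision lanes; the mixed-precision operator* overloads silently narrow a
// double factor to float. A short sketch, assuming SSE:
//
//   libint2::simd::VectorSSEFloat v(1.0f, 2.0f, 3.0f, 4.0f);
//   auto w = 0.5 * v;                   // 0.5 narrowed to 0.5f, w = {0.5, 1, 1.5, 2}
//   float w0 = static_cast<float>(w);   // explicit conversion returns element 0 (0.5f)
//   std::cout << w << std::endl;        // prints {0.5,1,1.5,2}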

#ifdef __AVX__

namespace libint2 { namespace simd {

  /// SIMD vector of 4 double-precision floating-point real numbers, operations on which use AVX instructions
  struct VectorAVXDouble {

      typedef double T;
      __m256d d;

      /// creates a vector of default-initialized values.
      VectorAVXDouble() {}

      /// Initializes all elements to the same value
      /// @param a the value to which all elements will be set
      VectorAVXDouble(T a) {
        d = _mm256_set_pd(a, a, a, a);
      }

      /// creates a vector of values initialized by an ordinary static-sized array
      VectorAVXDouble(T (&a)[4]) {
        d = _mm256_loadu_pd(&a[0]);
      }

      /// creates a vector with the given elements, a0 in element 0
      VectorAVXDouble(T a0, T a1, T a2, T a3) {
        d = _mm256_set_pd(a3, a2, a1, a0);
      }

      /// converts a 256-bit AVX double vector type to VectorAVXDouble
      VectorAVXDouble(__m256d a) {
        d = a;
      }

      VectorAVXDouble& operator=(T a) {
        d = _mm256_set_pd(a, a, a, a);
        return *this;
      }

      VectorAVXDouble& operator+=(VectorAVXDouble a) {
        d = _mm256_add_pd(d, a.d);
        return *this;
      }

      VectorAVXDouble& operator-=(VectorAVXDouble a) {
        d = _mm256_sub_pd(d, a.d);
        return *this;
      }

      VectorAVXDouble operator-() const {
        // TODO improve using bitflips
        const static __m256d minus_one = _mm256_set_pd(-1.0, -1.0, -1.0, -1.0);
        VectorAVXDouble result;
        result.d = _mm256_mul_pd(this->d, minus_one);
        return result;
      }

#if LIBINT2_CPLUSPLUS_STD >= 2011
      explicit
#endif
      operator double() const {
        double d0[4];
        // an unaligned store (_mm256_storeu_pd) segfaults on OS X here, hence memcpy;
        // an aligned store would need C++11 alignas, and efficiency does not matter here
        ::memcpy(&(d0[0]), &d, sizeof(__m256d));
        return d0[0];
      }

      /// converts to the underlying 256-bit AVX double vector type
      operator __m256d() const {
        return d;
      }

      /// loads a to this
      void load(T const* a) {
        d = _mm256_loadu_pd(a);
      }
      /// loads aligned a to this
      void load_aligned(T const* a) {
        d = _mm256_load_pd(a);
      }
      /// writes this to a
      void convert(T* a) const {
        _mm256_storeu_pd(&a[0], d);
      }
      /// writes this to aligned a
      void convert_aligned(T* a) const {
        _mm256_store_pd(&a[0], d);
      }
  };

  inline VectorAVXDouble operator*(double a, VectorAVXDouble b) {
    VectorAVXDouble c;
    VectorAVXDouble _a(a);
    c.d = _mm256_mul_pd(_a.d, b.d);
    return c;
  }

  inline VectorAVXDouble operator*(VectorAVXDouble a, double b) {
    VectorAVXDouble c;
    VectorAVXDouble _b(b);
    c.d = _mm256_mul_pd(a.d, _b.d);
    return c;
  }

  inline VectorAVXDouble operator*(int a, VectorAVXDouble b) {
    if (a == 1)
      return b;
    else {
      VectorAVXDouble c;
      VectorAVXDouble _a((double)a);
      c.d = _mm256_mul_pd(_a.d, b.d);
      return c;
    }
  }

  inline VectorAVXDouble operator*(VectorAVXDouble a, int b) {
    if (b == 1)
      return a;
    else {
      VectorAVXDouble c;
      VectorAVXDouble _b((double)b);
      c.d = _mm256_mul_pd(a.d, _b.d);
      return c;
    }
  }

  inline VectorAVXDouble operator*(VectorAVXDouble a, VectorAVXDouble b) {
    VectorAVXDouble c;
    c.d = _mm256_mul_pd(a.d, b.d);
    return c;
  }

  inline VectorAVXDouble operator+(VectorAVXDouble a, VectorAVXDouble b) {
    VectorAVXDouble c;
    c.d = _mm256_add_pd(a.d, b.d);
    return c;
  }

  inline VectorAVXDouble operator+(int a, VectorAVXDouble b) {
    if (a == 0) {
      return b;
    }
    else {
      VectorAVXDouble c;
      VectorAVXDouble _a(static_cast<double>(a));
      c.d = _mm256_add_pd(_a.d, b.d);
      return c;
    }
  }

  inline VectorAVXDouble operator+(VectorAVXDouble a, int b) {
    if (b == 0) {
      return a;
    }
    else {
      VectorAVXDouble c;
      VectorAVXDouble _b(static_cast<double>(b));
      c.d = _mm256_add_pd(a.d, _b.d);
      return c;
    }
  }

  inline VectorAVXDouble operator-(VectorAVXDouble a, VectorAVXDouble b) {
    VectorAVXDouble c;
    c.d = _mm256_sub_pd(a.d, b.d);
    return c;
  }

  inline VectorAVXDouble operator/(VectorAVXDouble a, VectorAVXDouble b) {
    VectorAVXDouble c;
    c.d = _mm256_div_pd(a.d, b.d);
    return c;
  }

#if defined(__FMA__)
  inline VectorAVXDouble fma_plus(VectorAVXDouble a, VectorAVXDouble b, VectorAVXDouble c) {
    VectorAVXDouble d;
    d.d = _mm256_fmadd_pd(a.d, b.d, c.d);
    return d;
  }
  inline VectorAVXDouble fma_minus(VectorAVXDouble a, VectorAVXDouble b, VectorAVXDouble c) {
    VectorAVXDouble d;
    d.d = _mm256_fmsub_pd(a.d, b.d, c.d);
    return d;
  }
#elif defined(__FMA4__)
  inline VectorAVXDouble fma_plus(VectorAVXDouble a, VectorAVXDouble b, VectorAVXDouble c) {
    VectorAVXDouble d;
    d.d = _mm256_macc_pd(a.d, b.d, c.d);
    return d;
  }
  inline VectorAVXDouble fma_minus(VectorAVXDouble a, VectorAVXDouble b, VectorAVXDouble c) {
    VectorAVXDouble d;
    d.d = _mm256_msub_pd(a.d, b.d, c.d);
    return d;
  }
#endif

  /// Horizontal add
  /// @return a[0] + a[1] + a[2] + a[3]
  inline double horizontal_add(VectorAVXDouble const & a) {
    __m256d s = _mm256_hadd_pd(a,a);
    return ((double*)&s)[0] + ((double*)&s)[2];
// Agner Fog's version
//    __m256d t1 = _mm256_hadd_pd(a,a);
//    __m128d t2 = _mm256_extractf128_pd(t1,1);
//    __m128d t3 = _mm_add_sd(_mm256_castpd256_pd128(t1),t2);
//    return _mm_cvtsd_f64(t3);
  }

  /// Horizontal add of a pair of vectors
  /// @return {a[0]+a[1]+a[2]+a[3], b[0]+b[1]+b[2]+b[3]}
  inline __m128d horizontal_add(VectorAVXDouble const & a, VectorAVXDouble const & b) {
    // solution from http://stackoverflow.com/questions/9775538/fastest-way-to-do-horizontal-vector-sum-with-avx-instructions
    __m256d sum = _mm256_hadd_pd(a, b);
    // extract upper 128 bits of result
    __m128d sum_high = _mm256_extractf128_pd(sum, 1);
    // add upper 128 bits of sum to its lower 128 bits
    return _mm_add_pd(sum_high, _mm256_castpd256_pd128(sum));
  }

  /// Horizontal add of four vectors
  /// @return {a[0]+a[1]+a[2]+a[3], b[0]+b[1]+b[2]+b[3], c[0]+c[1]+c[2]+c[3], d[0]+d[1]+d[2]+d[3]}
  inline __m256d horizontal_add(VectorAVXDouble const & a,
                                VectorAVXDouble const & b,
                                VectorAVXDouble const & c,
                                VectorAVXDouble const & d) {
    // solution from http://stackoverflow.com/questions/10833234/4-horizontal-double-precision-sums-in-one-go-with-avx?lq=1
    // {a[0]+a[1], b[0]+b[1], a[2]+a[3], b[2]+b[3]}
    __m256d sumab = _mm256_hadd_pd(a, b);
    // {c[0]+c[1], d[0]+d[1], c[2]+c[3], d[2]+d[3]}
    __m256d sumcd = _mm256_hadd_pd(c, d);

    // {a[0]+a[1], b[0]+b[1], c[2]+c[3], d[2]+d[3]}
    __m256d blend = _mm256_blend_pd(sumab, sumcd, 0b1100);
    // {a[2]+a[3], b[2]+b[3], c[0]+c[1], d[0]+d[1]}
    __m256d perm = _mm256_permute2f128_pd(sumab, sumcd, 0x21);

    return _mm256_add_pd(perm, blend);
  }
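
  // Illustrative note (not part of the original header): the blend/permute pair above performs a
  // 2x2 block transpose of the partial sums, so a single _mm256_add_pd completes all four
  // reductions at once. A short sketch, assuming AVX:
  //
  //   VectorAVXDouble a(1,2,3,4), b(10,20,30,40), c(100,200,300,400), d(1000,2000,3000,4000);
  //   VectorAVXDouble r = horizontal_add(a, b, c, d);  // __m256d result converts via the ctor
  //   std::cout << r << std::endl;                     // prints {10,100,1000,10000}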

  inline VectorAVXDouble exp(VectorAVXDouble a) {
#if HAVE_INTEL_SVML
    VectorAVXDouble result;
    result.d = _mm256_exp_pd(a.d);
#else
    double a_d[4]; a.convert(a_d);
    for(int i=0; i<4; ++i) a_d[i] = ::exp(a_d[i]);
    VectorAVXDouble result(a_d);
#endif
    return result;
  }
  inline VectorAVXDouble sqrt(VectorAVXDouble a) {
#if HAVE_INTEL_SVML
    VectorAVXDouble result;
    result.d = _mm256_sqrt_pd(a.d);
#else
    double a_d[4]; a.convert(a_d);
    for(int i=0; i<4; ++i) a_d[i] = ::sqrt(a_d[i]);
    VectorAVXDouble result(a_d);
#endif
    return result;
  }
  inline VectorAVXDouble erf(VectorAVXDouble a) {
#if HAVE_INTEL_SVML
    VectorAVXDouble result;
    result.d = _mm256_erf_pd(a.d);
#else
    double a_d[4]; a.convert(a_d);
    for(int i=0; i<4; ++i) a_d[i] = ::erf(a_d[i]);
    VectorAVXDouble result(a_d);
#endif
    return result;
  }
  inline VectorAVXDouble erfc(VectorAVXDouble a) {
#if HAVE_INTEL_SVML
    VectorAVXDouble result;
    result.d = _mm256_erfc_pd(a.d);
#else
    double a_d[4]; a.convert(a_d);
    for(int i=0; i<4; ++i) a_d[i] = ::erfc(a_d[i]);
    VectorAVXDouble result(a_d);
#endif
    return result;
  }

  /// SIMD vector of 8 single-precision floating-point real numbers, operations on which use AVX instructions
  struct VectorAVXFloat {

      typedef float T;
      __m256 d;

      /// creates a vector of default-initialized values.
      VectorAVXFloat() {}

      /// Initializes all elements to the same value
      /// @param a the value to which all elements will be set
      VectorAVXFloat(T a) {
        d = _mm256_set_ps(a, a, a, a, a, a, a, a);
      }

      /// creates a vector of values initialized by an ordinary static-sized array
      VectorAVXFloat(T (&a)[8]) {
        d = _mm256_loadu_ps(&a[0]);
      }

      /// creates a vector with the given elements, a0 in element 0
      VectorAVXFloat(T a0, T a1, T a2, T a3, T a4, T a5, T a6, T a7) {
        // _mm256_set_ps takes its arguments in high-to-low order
        d = _mm256_set_ps(a7, a6, a5, a4, a3, a2, a1, a0);
      }

      VectorAVXFloat& operator=(T a) {
        d = _mm256_set_ps(a, a, a, a, a, a, a, a);
        return *this;
      }

      VectorAVXFloat& operator+=(VectorAVXFloat a) {
        d = _mm256_add_ps(d, a.d);
        return *this;
      }

      VectorAVXFloat& operator-=(VectorAVXFloat a) {
        d = _mm256_sub_ps(d, a.d);
        return *this;
      }

      VectorAVXFloat operator-() const {
        // TODO improve using bitflips
        const static __m256 minus_one = _mm256_set_ps(-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0);
        VectorAVXFloat result;
        result.d = _mm256_mul_ps(this->d, minus_one);
        return result;
      }

      explicit operator float() const {
        float d0[8];
        // an unaligned store (_mm256_storeu_ps) segfaults on OS X here, hence memcpy;
        // an aligned store would need C++11 alignas, and efficiency does not matter here
        ::memcpy(&(d0[0]), &d, sizeof(__m256));
        return d0[0];
      }

      explicit operator double() const {
        const float result_flt = this->operator float();
        return result_flt;
      }

      /// writes this to a
      void convert(T (&a)[8]) const {
        _mm256_storeu_ps(&a[0], d);
      }
  };

  inline VectorAVXFloat operator*(double a, VectorAVXFloat b) {
    VectorAVXFloat c;
    VectorAVXFloat _a(a);
    c.d = _mm256_mul_ps(_a.d, b.d);
    return c;
  }

  inline VectorAVXFloat operator*(VectorAVXFloat a, double b) {
    VectorAVXFloat c;
    VectorAVXFloat _b(b);
    c.d = _mm256_mul_ps(a.d, _b.d);
    return c;
  }

  inline VectorAVXFloat operator*(int a, VectorAVXFloat b) {
    if (a == 1)
      return b;
    else {
      VectorAVXFloat c;
      VectorAVXFloat _a((float)a);
      c.d = _mm256_mul_ps(_a.d, b.d);
      return c;
    }
  }

  inline VectorAVXFloat operator*(VectorAVXFloat a, int b) {
    if (b == 1)
      return a;
    else {
      VectorAVXFloat c;
      VectorAVXFloat _b((float)b);
      c.d = _mm256_mul_ps(a.d, _b.d);
      return c;
    }
  }

  inline VectorAVXFloat operator*(VectorAVXFloat a, VectorAVXFloat b) {
    VectorAVXFloat c;
    c.d = _mm256_mul_ps(a.d, b.d);
    return c;
  }

  inline VectorAVXFloat operator+(VectorAVXFloat a, VectorAVXFloat b) {
    VectorAVXFloat c;
    c.d = _mm256_add_ps(a.d, b.d);
    return c;
  }

  inline VectorAVXFloat operator-(VectorAVXFloat a, VectorAVXFloat b) {
    VectorAVXFloat c;
    c.d = _mm256_sub_ps(a.d, b.d);
    return c;
  }

  inline VectorAVXFloat operator/(VectorAVXFloat a, VectorAVXFloat b) {
    VectorAVXFloat c;
    c.d = _mm256_div_ps(a.d, b.d);
    return c;
  }

#if defined(__FMA__)
  inline VectorAVXFloat fma_plus(VectorAVXFloat a, VectorAVXFloat b, VectorAVXFloat c) {
    VectorAVXFloat d;
    d.d = _mm256_fmadd_ps(a.d, b.d, c.d);
    return d;
  }
  inline VectorAVXFloat fma_minus(VectorAVXFloat a, VectorAVXFloat b, VectorAVXFloat c) {
    VectorAVXFloat d;
    d.d = _mm256_fmsub_ps(a.d, b.d, c.d);
    return d;
  }
#elif defined(__FMA4__)
  inline VectorAVXFloat fma_plus(VectorAVXFloat a, VectorAVXFloat b, VectorAVXFloat c) {
    VectorAVXFloat d;
    d.d = _mm256_macc_ps(a.d, b.d, c.d);
    return d;
  }
  inline VectorAVXFloat fma_minus(VectorAVXFloat a, VectorAVXFloat b, VectorAVXFloat c) {
    VectorAVXFloat d;
    d.d = _mm256_msub_ps(a.d, b.d, c.d);
    return d;
  }
#endif


  inline VectorAVXFloat exp(VectorAVXFloat a) {
#if HAVE_INTEL_SVML
    VectorAVXFloat result;
    result.d = _mm256_exp_ps(a.d);
#else
    float a_d[8]; a.convert(a_d);
    for(int i=0; i<8; ++i) a_d[i] = ::exp(a_d[i]);
    VectorAVXFloat result(a_d);
#endif
    return result;
  }
  inline VectorAVXFloat sqrt(VectorAVXFloat a) {
#if HAVE_INTEL_SVML
    VectorAVXFloat result;
    result.d = _mm256_sqrt_ps(a.d);
#else
    float a_d[8]; a.convert(a_d);
    for(int i=0; i<8; ++i) a_d[i] = ::sqrt(a_d[i]);
    VectorAVXFloat result(a_d);
#endif
    return result;
  }
  inline VectorAVXFloat erf(VectorAVXFloat a) {
#if HAVE_INTEL_SVML
    VectorAVXFloat result;
    result.d = _mm256_erf_ps(a.d);
#else
    float a_d[8]; a.convert(a_d);
    for(int i=0; i<8; ++i) a_d[i] = ::erf(a_d[i]);
    VectorAVXFloat result(a_d);
#endif
    return result;
  }
  inline VectorAVXFloat erfc(VectorAVXFloat a) {
#if HAVE_INTEL_SVML
    VectorAVXFloat result;
    result.d = _mm256_erfc_ps(a.d);
#else
    float a_d[8]; a.convert(a_d);
    for(int i=0; i<8; ++i) a_d[i] = ::erfc(a_d[i]);
    VectorAVXFloat result(a_d);
#endif
    return result;
  }

}} // namespace libint2::simd

inline std::ostream& operator<<(std::ostream& os, libint2::simd::VectorAVXDouble a) {
  double ad[4];
  a.convert(ad);
  os << "{" << ad[0] << "," << ad[1] << "," << ad[2] << "," << ad[3] << "}";
  return os;
}

inline std::ostream& operator<<(std::ostream& os, libint2::simd::VectorAVXFloat a) {
  float ad[8];
  a.convert(ad);
  os << "{" << ad[0] << "," << ad[1] << "," << ad[2] << "," << ad[3] << ","
     << ad[4] << "," << ad[5] << "," << ad[6] << "," << ad[7] << "}";
  return os;
}

namespace libint2 {

  template <>
  struct is_vector<simd::VectorAVXDouble> {
      static const bool value = true;
  };

  template <>
  struct vector_traits<simd::VectorAVXDouble> {
      typedef double scalar_type;
      static const size_t extent = 4;
  };

} // namespace libint2

#endif // AVX-only
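
// Illustrative usage sketch (not part of the original header), assuming an AVX-enabled build and
// the same include-path caveat as the SSE2 sketch above; it exercises the 4-wide double wrapper
// end to end.
//
//   #include <libint2/util/vector_x86.h>
//   #include <iostream>
//
//   int main() {
//     using libint2::simd::VectorAVXDouble;
//     alignas(32) double xs[4] = {1.0, 2.0, 3.0, 4.0};
//     VectorAVXDouble x;
//     x.load_aligned(xs);                   // 32-byte-aligned load
//     VectorAVXDouble y = exp(-x) + 2 * x;  // element-wise exp, negation, scaling, addition
//     double total = horizontal_add(y);     // reduce the 4 lanes to one double
//     std::cout << y << " sum = " << total << std::endl;
//     return 0;
//   }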

#ifdef LIBINT2_HAVE_AGNER_VECTORCLASS
#include <vectorclass.h>
#endif

#endif // header guard