#ifndef _libint2_src_lib_libint_vectorx86_h_
#define _libint2_src_lib_libint_vectorx86_h_

#include <cstring>
#include <cmath>
#include <iostream>

#include <libint2/util/cxxstd.h>
#include <libint2/util/type_traits.h>

#if defined(__SSE2__) || defined(__SSE__) || defined(__AVX__)
# include <x86intrin.h>
#endif

#ifdef __SSE2__

namespace libint2 {
namespace simd {

/// SIMD vector of 2 double-precision floating-point reals; operations use SSE2 instructions.
struct VectorSSEDouble {
  typedef double T;
  __m128d d;

  /// creates a vector of default-initialized values
  VectorSSEDouble() {}
  /// initializes all elements to the same value
  VectorSSEDouble(T a) { d = _mm_set_pd(a, a); }
  /// creates a vector of values initialized by an ordinary static-sized array
  VectorSSEDouble(T (&a)[2]) { d = _mm_loadu_pd(&a[0]); }
  /// creates a vector from two values
  VectorSSEDouble(T a0, T a1) { d = _mm_set_pd(a1, a0); }
  /// converts a 128-bit SSE double vector type to VectorSSEDouble
  VectorSSEDouble(__m128d a) : d(a) {}

  VectorSSEDouble& operator+=(VectorSSEDouble a) { d = _mm_add_pd(d, a.d); return *this; }
  VectorSSEDouble& operator-=(VectorSSEDouble a) { d = _mm_sub_pd(d, a.d); return *this; }

  VectorSSEDouble operator-() const {
    const static __m128d minus_one = _mm_set_pd(-1.0, -1.0);
    VectorSSEDouble result;
    result.d = _mm_mul_pd(this->d, minus_one);
    return result;
  }

#if LIBINT2_CPLUSPLUS_STD >= 2011
  /// conversion to the first element (C++11 and later)
  explicit operator double() const {
    double d0[2];
    ::memcpy(&(d0[0]), &d, sizeof(__m128d));
    return d0[0];
  }
#endif

  /// conversion to the underlying SSE2 register type
  operator __m128d() const { return d; }

  /// writes this to a
  void convert(T (&a)[2]) const { _mm_storeu_pd(&a[0], d); }
  /// writes this to aligned a
  void convert_aligned(T (&a)[2]) const { _mm_store_pd(&a[0], d); }
};
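// A minimal usage sketch (illustrative values, not part of the library):
//
//   double xy[2] = {1.0, 2.0};
//   VectorSSEDouble v(xy);    // v = {1.0, 2.0}
//   VectorSSEDouble w(3.0);   // w = {3.0, 3.0}
//   v += w;                   // v = {4.0, 5.0}
//   double out[2];
//   v.convert(out);           // out = {4.0, 5.0}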
inline VectorSSEDouble operator*(double a, VectorSSEDouble b) {
  VectorSSEDouble c;
  VectorSSEDouble _a(a);
  c.d = _mm_mul_pd(_a.d, b.d);
  return c;
}

inline VectorSSEDouble operator*(VectorSSEDouble a, double b) {
  VectorSSEDouble c;
  VectorSSEDouble _b(b);
  c.d = _mm_mul_pd(a.d, _b.d);
  return c;
}

inline VectorSSEDouble operator*(int a, VectorSSEDouble b) {
  VectorSSEDouble c;
  VectorSSEDouble _a((double)a);
  c.d = _mm_mul_pd(_a.d, b.d);
  return c;
}

inline VectorSSEDouble operator*(VectorSSEDouble a, int b) {
  VectorSSEDouble c;
  VectorSSEDouble _b((double)b);
  c.d = _mm_mul_pd(a.d, _b.d);
  return c;
}

inline VectorSSEDouble operator*(VectorSSEDouble a, VectorSSEDouble b) {
  VectorSSEDouble c;
  c.d = _mm_mul_pd(a.d, b.d);
  return c;
}

inline VectorSSEDouble operator+(VectorSSEDouble a, VectorSSEDouble b) {
  VectorSSEDouble c;
  c.d = _mm_add_pd(a.d, b.d);
  return c;
}

inline VectorSSEDouble operator-(VectorSSEDouble a, VectorSSEDouble b) {
  VectorSSEDouble c;
  c.d = _mm_sub_pd(a.d, b.d);
  return c;
}

inline VectorSSEDouble operator/(VectorSSEDouble a, VectorSSEDouble b) {
  VectorSSEDouble c;
  c.d = _mm_div_pd(a.d, b.d);
  return c;
}
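// With the operators above, scalar/vector arithmetic composes as with
// built-in types. A short sketch (made-up values):
//
//   VectorSSEDouble a(1.0, 2.0), b(4.0, 8.0);
//   VectorSSEDouble r = 2.0 * a + b / a - 1 * b;  // r = {2.0, 0.0}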
#if defined(__FMA__)
inline VectorSSEDouble fma_plus(VectorSSEDouble a, VectorSSEDouble b, VectorSSEDouble c) {
  VectorSSEDouble d;
  d.d = _mm_fmadd_pd(a.d, b.d, c.d);
  return d;
}
inline VectorSSEDouble fma_minus(VectorSSEDouble a, VectorSSEDouble b, VectorSSEDouble c) {
  VectorSSEDouble d;
  d.d = _mm_fmsub_pd(a.d, b.d, c.d);
  return d;
}
#elif defined(__FMA4__)
inline VectorSSEDouble fma_plus(VectorSSEDouble a, VectorSSEDouble b, VectorSSEDouble c) {
  VectorSSEDouble d;
  d.d = _mm_macc_pd(a.d, b.d, c.d);
  return d;
}
inline VectorSSEDouble fma_minus(VectorSSEDouble a, VectorSSEDouble b, VectorSSEDouble c) {
  VectorSSEDouble d;
  d.d = _mm_msub_pd(a.d, b.d, c.d);
  return d;
}
#endif
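// fma_plus(a, b, c) computes a*b + c (fma_minus computes a*b - c) in a single
// rounded instruction when FMA/FMA4 hardware is available; a generic fallback
// is defined in intrinsic_operations.h. Sketch (illustrative values):
//
//   VectorSSEDouble x(2.0), y(3.0), z(1.0);
//   VectorSSEDouble r = fma_plus(x, y, z);  // r = {7.0, 7.0}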
/// Horizontal add: returns the sum of the two elements of a.
inline double horizontal_add(VectorSSEDouble const& a) {
#if defined(__SSE3__)
  __m128d t1 = _mm_hadd_pd(a, a);
  return _mm_cvtsd_f64(t1);
#else
  __m128 t0 = _mm_castpd_ps(a);
  __m128d t1 = _mm_castps_pd(_mm_movehl_ps(t0, t0));
  __m128d t2 = _mm_add_sd(a, t1);
  return _mm_cvtsd_f64(t2);
#endif
}

/// Pairwise horizontal add: returns {sum of a, sum of b}.
inline VectorSSEDouble horizontal_add(VectorSSEDouble const& a, VectorSSEDouble const& b) {
#if defined(__SSE3__)
  return _mm_hadd_pd(a, b);
#else  // will be very inefficient without SSE3
  return VectorSSEDouble(horizontal_add(a), horizontal_add(b));
#endif
}

inline VectorSSEDouble exp(VectorSSEDouble a) {
#if HAVE_INTEL_SVML  // Intel SVML vector math available
  VectorSSEDouble result;
  result.d = _mm_exp_pd(a.d);
#else
  double a_d[2];
  a.convert(a_d);
  for (int i = 0; i < 2; ++i) a_d[i] = std::exp(a_d[i]);
  VectorSSEDouble result(a_d);
#endif
  return result;
}
inline VectorSSEDouble sqrt(VectorSSEDouble a) {
#if HAVE_INTEL_SVML
  VectorSSEDouble result;
  result.d = _mm_sqrt_pd(a.d);
#else
  double a_d[2];
  a.convert(a_d);
  for (int i = 0; i < 2; ++i) a_d[i] = std::sqrt(a_d[i]);
  VectorSSEDouble result(a_d);
#endif
  return result;
}
inline VectorSSEDouble erf(VectorSSEDouble a) {
#if HAVE_INTEL_SVML
  VectorSSEDouble result;
  result.d = _mm_erf_pd(a.d);
#else
  double a_d[2];
  a.convert(a_d);
  for (int i = 0; i < 2; ++i) a_d[i] = ::erf(a_d[i]);
  VectorSSEDouble result(a_d);
#endif
  return result;
}
inline VectorSSEDouble erfc(VectorSSEDouble a) {
#if HAVE_INTEL_SVML
  VectorSSEDouble result;
  result.d = _mm_erfc_pd(a.d);
#else
  double a_d[2];
  a.convert(a_d);
  for (int i = 0; i < 2; ++i) a_d[i] = ::erfc(a_d[i]);
  VectorSSEDouble result(a_d);
#endif
  return result;
}
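// The transcendental wrappers use SVML intrinsics when available and fall
// back to per-element libm calls otherwise. Sketch (illustrative values):
//
//   VectorSSEDouble x(0.25, 1.0);
//   VectorSSEDouble y = exp(x) * erfc(sqrt(x));  // elementwise throughout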
inline std::ostream& operator<<(std::ostream& os, VectorSSEDouble a) {
  double ad[2];
  a.convert(ad);
  os << "{" << ad[0] << "," << ad[1] << "}";
  return os;
}
}  // namespace simd
}  // namespace libint2

namespace libint2 {

/// VectorSSEDouble is a vector type
template <>
struct is_vector<simd::VectorSSEDouble> {
  static const bool value = true;
};

/// traits of VectorSSEDouble
template <>
struct vector_traits<simd::VectorSSEDouble> {
  typedef double scalar_type;
  static const size_t extent = 2;
};

}  // namespace libint2
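// Generic code can use these traits to stay agnostic of the concrete vector
// type. A minimal sketch (hypothetical helper, not part of the library):
//
//   template <typename Vec>
//   double sum_of_squares(Vec v) {
//     typename libint2::vector_traits<Vec>::scalar_type
//         a[libint2::vector_traits<Vec>::extent];
//     v.convert(a);
//     double s = 0;
//     for (size_t i = 0; i != libint2::vector_traits<Vec>::extent; ++i)
//       s += a[i] * a[i];
//     return s;
//   }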
#endif  // __SSE2__

#ifdef __SSE__

namespace libint2 {
namespace simd {

/// SIMD vector of 4 single-precision floating-point reals; operations use SSE instructions.
struct VectorSSEFloat {
  typedef float T;
  __m128 d;

  /// creates a vector of default-initialized values
  VectorSSEFloat() {}
  /// initializes all elements to the same value
  VectorSSEFloat(T a) { d = _mm_set_ps(a, a, a, a); }
  /// creates a vector of values initialized by an ordinary static-sized array
  VectorSSEFloat(T (&a)[4]) { d = _mm_loadu_ps(&a[0]); }
  /// creates a vector from four values
  VectorSSEFloat(T a0, T a1, T a2, T a3) { d = _mm_set_ps(a3, a2, a1, a0); }

  VectorSSEFloat& operator=(T a) { d = _mm_set_ps(a, a, a, a); return *this; }
  VectorSSEFloat& operator+=(VectorSSEFloat a) { d = _mm_add_ps(d, a.d); return *this; }
  VectorSSEFloat& operator-=(VectorSSEFloat a) { d = _mm_sub_ps(d, a.d); return *this; }

  VectorSSEFloat operator-() const {
    const static __m128 minus_one = _mm_set_ps(-1.0, -1.0, -1.0, -1.0);
    VectorSSEFloat result;
    result.d = _mm_mul_ps(this->d, minus_one);
    return result;
  }

#if LIBINT2_CPLUSPLUS_STD >= 2011
  /// conversion to the first element (C++11 and later)
  explicit operator float() const {
    float d0[4];
    ::memcpy(&(d0[0]), &d, sizeof(__m128));
    return d0[0];
  }
  explicit operator double() const {
    const float result_flt = this->operator float();
    return static_cast<double>(result_flt);
  }
#endif

  /// conversion to the underlying SSE register type
  operator __m128() const { return d; }

  /// writes this to a
  void convert(T (&a)[4]) const { _mm_storeu_ps(&a[0], d); }
  /// writes this to aligned a
  void convert_aligned(T (&a)[4]) const { _mm_store_ps(&a[0], d); }
};
inline VectorSSEFloat operator*(float a, VectorSSEFloat b) {
  VectorSSEFloat c;
  VectorSSEFloat _a(a);
  c.d = _mm_mul_ps(_a.d, b.d);
  return c;
}

inline VectorSSEFloat operator*(VectorSSEFloat a, float b) {
  VectorSSEFloat c;
  VectorSSEFloat _b(b);
  c.d = _mm_mul_ps(a.d, _b.d);
  return c;
}

inline VectorSSEFloat operator*(double a, VectorSSEFloat b) {
  VectorSSEFloat c;
  VectorSSEFloat _a((float)a);
  c.d = _mm_mul_ps(_a.d, b.d);
  return c;
}

inline VectorSSEFloat operator*(VectorSSEFloat a, double b) {
  VectorSSEFloat c;
  VectorSSEFloat _b((float)b);
  c.d = _mm_mul_ps(a.d, _b.d);
  return c;
}

inline VectorSSEFloat operator*(int a, VectorSSEFloat b) {
  VectorSSEFloat c;
  VectorSSEFloat _a((float)a);
  c.d = _mm_mul_ps(_a.d, b.d);
  return c;
}

inline VectorSSEFloat operator*(VectorSSEFloat a, int b) {
  VectorSSEFloat c;
  VectorSSEFloat _b((float)b);
  c.d = _mm_mul_ps(a.d, _b.d);
  return c;
}

inline VectorSSEFloat operator*(VectorSSEFloat a, VectorSSEFloat b) {
  VectorSSEFloat c;
  c.d = _mm_mul_ps(a.d, b.d);
  return c;
}

inline VectorSSEFloat operator+(VectorSSEFloat a, VectorSSEFloat b) {
  VectorSSEFloat c;
  c.d = _mm_add_ps(a.d, b.d);
  return c;
}

inline VectorSSEFloat operator-(VectorSSEFloat a, VectorSSEFloat b) {
  VectorSSEFloat c;
  c.d = _mm_sub_ps(a.d, b.d);
  return c;
}

inline VectorSSEFloat operator/(VectorSSEFloat a, VectorSSEFloat b) {
  VectorSSEFloat c;
  c.d = _mm_div_ps(a.d, b.d);
  return c;
}
#if defined(__FMA__)
inline VectorSSEFloat fma_plus(VectorSSEFloat a, VectorSSEFloat b, VectorSSEFloat c) {
  VectorSSEFloat d;
  d.d = _mm_fmadd_ps(a.d, b.d, c.d);
  return d;
}
inline VectorSSEFloat fma_minus(VectorSSEFloat a, VectorSSEFloat b, VectorSSEFloat c) {
  VectorSSEFloat d;
  d.d = _mm_fmsub_ps(a.d, b.d, c.d);
  return d;
}
#elif defined(__FMA4__)
inline VectorSSEFloat fma_plus(VectorSSEFloat a, VectorSSEFloat b, VectorSSEFloat c) {
  VectorSSEFloat d;
  d.d = _mm_macc_ps(a.d, b.d, c.d);
  return d;
}
inline VectorSSEFloat fma_minus(VectorSSEFloat a, VectorSSEFloat b, VectorSSEFloat c) {
  VectorSSEFloat d;
  d.d = _mm_msub_ps(a.d, b.d, c.d);
  return d;
}
#endif
inline VectorSSEFloat exp(VectorSSEFloat a) {
#if HAVE_INTEL_SVML
  VectorSSEFloat result;
  result.d = _mm_exp_ps(a.d);
#else
  float a_d[4];
  a.convert(a_d);
  for (int i = 0; i < 4; ++i) a_d[i] = std::exp(a_d[i]);
  VectorSSEFloat result(a_d);
#endif
  return result;
}
inline VectorSSEFloat sqrt(VectorSSEFloat a) {
#if HAVE_INTEL_SVML
  VectorSSEFloat result;
  result.d = _mm_sqrt_ps(a.d);
#else
  float a_d[4];
  a.convert(a_d);
  for (int i = 0; i < 4; ++i) a_d[i] = std::sqrt(a_d[i]);
  VectorSSEFloat result(a_d);
#endif
  return result;
}
inline VectorSSEFloat erf(VectorSSEFloat a) {
#if HAVE_INTEL_SVML
  VectorSSEFloat result;
  result.d = _mm_erf_ps(a.d);
#else
  float a_d[4];
  a.convert(a_d);
  for (int i = 0; i < 4; ++i) a_d[i] = ::erf(a_d[i]);
  VectorSSEFloat result(a_d);
#endif
  return result;
}
inline VectorSSEFloat erfc(VectorSSEFloat a) {
#if HAVE_INTEL_SVML
  VectorSSEFloat result;
  result.d = _mm_erfc_ps(a.d);
#else
  float a_d[4];
  a.convert(a_d);
  for (int i = 0; i < 4; ++i) a_d[i] = ::erfc(a_d[i]);
  VectorSSEFloat result(a_d);
#endif
  return result;
}
inline std::ostream& operator<<(std::ostream& os, VectorSSEFloat a) {
  float ad[4];
  a.convert(ad);
  os << "{" << ad[0] << "," << ad[1] << "," << ad[2] << "," << ad[3] << "}";
  return os;
}
}  // namespace simd
}  // namespace libint2

namespace libint2 {

/// VectorSSEFloat is a vector type
template <>
struct is_vector<simd::VectorSSEFloat> {
  static const bool value = true;
};

/// traits of VectorSSEFloat
template <>
struct vector_traits<simd::VectorSSEFloat> {
  typedef float scalar_type;
  static const size_t extent = 4;
};

}  // namespace libint2
#endif  // __SSE__

#ifdef __AVX__

namespace libint2 {
namespace simd {
/// SIMD vector of 4 double-precision floating-point reals; operations use AVX instructions.
struct VectorAVXDouble {
  typedef double T;
  __m256d d;

  /// creates a vector of default-initialized values
  VectorAVXDouble() {}
  /// initializes all elements to the same value
  VectorAVXDouble(T a) { d = _mm256_set_pd(a, a, a, a); }
  /// creates a vector of values initialized by an ordinary static-sized array
  VectorAVXDouble(T (&a)[4]) { d = _mm256_loadu_pd(&a[0]); }
  /// creates a vector from four values
  VectorAVXDouble(T a0, T a1, T a2, T a3) { d = _mm256_set_pd(a3, a2, a1, a0); }
  /// converts a 256-bit AVX double vector type to VectorAVXDouble
  VectorAVXDouble(__m256d a) : d(a) {}

  VectorAVXDouble& operator=(T a) { d = _mm256_set_pd(a, a, a, a); return *this; }
  VectorAVXDouble& operator+=(VectorAVXDouble a) { d = _mm256_add_pd(d, a.d); return *this; }
  VectorAVXDouble& operator-=(VectorAVXDouble a) { d = _mm256_sub_pd(d, a.d); return *this; }

  VectorAVXDouble operator-() const {
    const static __m256d minus_one = _mm256_set_pd(-1.0, -1.0, -1.0, -1.0);
    VectorAVXDouble result;
    result.d = _mm256_mul_pd(this->d, minus_one);
    return result;
  }

#if LIBINT2_CPLUSPLUS_STD >= 2011
  /// conversion to the first element (C++11 and later)
  explicit operator double() const {
    double d0[4];
    ::memcpy(&(d0[0]), &d, sizeof(__m256d));
    return d0[0];
  }
#endif

  /// conversion to the underlying AVX register type
  operator __m256d() const { return d; }

  /// loads a to this
  void load(T const* a) { d = _mm256_loadu_pd(a); }
  /// loads aligned a to this
  void load_aligned(T const* a) { d = _mm256_load_pd(a); }
  /// writes this to a
  void convert(T* a) const { _mm256_storeu_pd(&a[0], d); }
  /// writes this to aligned a
  void convert_aligned(T* a) const { _mm256_store_pd(&a[0], d); }
};
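// load/convert round-trip sketch (illustrative buffer): the *_aligned
// variants require 32-byte alignment, the plain ones do not.
//
//   double buf[4] = {1.0, 2.0, 3.0, 4.0};
//   VectorAVXDouble v;
//   v.load(buf);      // unaligned load is always safe
//   v += v;           // v = {2.0, 4.0, 6.0, 8.0}
//   v.convert(buf);   // store back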
inline VectorAVXDouble operator*(double a, VectorAVXDouble b) {
  VectorAVXDouble c;
  VectorAVXDouble _a(a);
  c.d = _mm256_mul_pd(_a.d, b.d);
  return c;
}

inline VectorAVXDouble operator*(VectorAVXDouble a, double b) {
  VectorAVXDouble c;
  VectorAVXDouble _b(b);
  c.d = _mm256_mul_pd(a.d, _b.d);
  return c;
}

inline VectorAVXDouble operator*(int a, VectorAVXDouble b) {
  VectorAVXDouble c;
  VectorAVXDouble _a((double)a);
  c.d = _mm256_mul_pd(_a.d, b.d);
  return c;
}

inline VectorAVXDouble operator*(VectorAVXDouble a, int b) {
  VectorAVXDouble c;
  VectorAVXDouble _b((double)b);
  c.d = _mm256_mul_pd(a.d, _b.d);
  return c;
}

inline VectorAVXDouble operator*(VectorAVXDouble a, VectorAVXDouble b) {
  VectorAVXDouble c;
  c.d = _mm256_mul_pd(a.d, b.d);
  return c;
}

inline VectorAVXDouble operator+(VectorAVXDouble a, VectorAVXDouble b) {
  VectorAVXDouble c;
  c.d = _mm256_add_pd(a.d, b.d);
  return c;
}

inline VectorAVXDouble operator+(int a, VectorAVXDouble b) {
  VectorAVXDouble c;
  VectorAVXDouble _a(static_cast<double>(a));
  c.d = _mm256_add_pd(_a.d, b.d);
  return c;
}

inline VectorAVXDouble operator+(VectorAVXDouble a, int b) {
  VectorAVXDouble c;
  VectorAVXDouble _b(static_cast<double>(b));
  c.d = _mm256_add_pd(a.d, _b.d);
  return c;
}

inline VectorAVXDouble operator-(VectorAVXDouble a, VectorAVXDouble b) {
  VectorAVXDouble c;
  c.d = _mm256_sub_pd(a.d, b.d);
  return c;
}

inline VectorAVXDouble operator/(VectorAVXDouble a, VectorAVXDouble b) {
  VectorAVXDouble c;
  c.d = _mm256_div_pd(a.d, b.d);
  return c;
}
#if defined(__FMA__)
inline VectorAVXDouble fma_plus(VectorAVXDouble a, VectorAVXDouble b, VectorAVXDouble c) {
  VectorAVXDouble d;
  d.d = _mm256_fmadd_pd(a.d, b.d, c.d);
  return d;
}
inline VectorAVXDouble fma_minus(VectorAVXDouble a, VectorAVXDouble b, VectorAVXDouble c) {
  VectorAVXDouble d;
  d.d = _mm256_fmsub_pd(a.d, b.d, c.d);
  return d;
}
#elif defined(__FMA4__)
inline VectorAVXDouble fma_plus(VectorAVXDouble a, VectorAVXDouble b, VectorAVXDouble c) {
  VectorAVXDouble d;
  d.d = _mm256_macc_pd(a.d, b.d, c.d);
  return d;
}
inline VectorAVXDouble fma_minus(VectorAVXDouble a, VectorAVXDouble b, VectorAVXDouble c) {
  VectorAVXDouble d;
  d.d = _mm256_msub_pd(a.d, b.d, c.d);
  return d;
}
#endif
/// Horizontal add: returns the sum of the four elements of a.
inline double horizontal_add(VectorAVXDouble const& a) {
  __m256d s = _mm256_hadd_pd(a, a);
  return ((double*)&s)[0] + ((double*)&s)[2];
}
/// Pairwise horizontal add: returns {sum of a, sum of b} as an SSE vector.
inline VectorSSEDouble horizontal_add(VectorAVXDouble const& a, VectorAVXDouble const& b) {
  // sum = {a[0]+a[1], b[0]+b[1], a[2]+a[3], b[2]+b[3]}
  __m256d sum = _mm256_hadd_pd(a, b);
  // upper 128 bits: {a[2]+a[3], b[2]+b[3]}
  __m128d sum_high = _mm256_extractf128_pd(sum, 1);
  // add the upper and lower halves
  return _mm_add_pd(sum_high, _mm256_castpd256_pd128(sum));
}
/// Quad horizontal add: returns {sum of a, sum of b, sum of c, sum of d}.
inline VectorAVXDouble horizontal_add(VectorAVXDouble const& a, VectorAVXDouble const& b,
                                      VectorAVXDouble const& c, VectorAVXDouble const& d) {
  // {a[0]+a[1], b[0]+b[1], a[2]+a[3], b[2]+b[3]}
  __m256d sumab = _mm256_hadd_pd(a, b);
  // {c[0]+c[1], d[0]+d[1], c[2]+c[3], d[2]+d[3]}
  __m256d sumcd = _mm256_hadd_pd(c, d);
  // {a[0]+a[1], b[0]+b[1], c[2]+c[3], d[2]+d[3]}
  __m256d blend = _mm256_blend_pd(sumab, sumcd, 0b1100);
  // {a[2]+a[3], b[2]+b[3], c[0]+c[1], d[0]+d[1]}
  __m256d perm = _mm256_permute2f128_pd(sumab, sumcd, 0x21);
  return _mm256_add_pd(perm, blend);
}
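// The batched horizontal adds reduce several accumulators at once, e.g. four
// AVX dot-product accumulators collapse into one vector of four sums. Sketch
// (hypothetical accumulators acc0..acc3, filled by fma_plus loops elsewhere):
//
//   VectorAVXDouble acc0, acc1, acc2, acc3;
//   VectorAVXDouble sums = horizontal_add(acc0, acc1, acc2, acc3);
//   // sums = {sum(acc0), sum(acc1), sum(acc2), sum(acc3)}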
inline VectorAVXDouble exp(VectorAVXDouble a) {
#if HAVE_INTEL_SVML
  VectorAVXDouble result;
  result.d = _mm256_exp_pd(a.d);
#else
  double a_d[4];
  a.convert(a_d);
  for (int i = 0; i < 4; ++i) a_d[i] = ::exp(a_d[i]);
  VectorAVXDouble result(a_d);
#endif
  return result;
}
inline VectorAVXDouble sqrt(VectorAVXDouble a) {
#if HAVE_INTEL_SVML
  VectorAVXDouble result;
  result.d = _mm256_sqrt_pd(a.d);
#else
  double a_d[4];
  a.convert(a_d);
  for (int i = 0; i < 4; ++i) a_d[i] = ::sqrt(a_d[i]);
  VectorAVXDouble result(a_d);
#endif
  return result;
}
inline VectorAVXDouble erf(VectorAVXDouble a) {
#if HAVE_INTEL_SVML
  VectorAVXDouble result;
  result.d = _mm256_erf_pd(a.d);
#else
  double a_d[4];
  a.convert(a_d);
  for (int i = 0; i < 4; ++i) a_d[i] = ::erf(a_d[i]);
  VectorAVXDouble result(a_d);
#endif
  return result;
}
inline VectorAVXDouble erfc(VectorAVXDouble a) {
#if HAVE_INTEL_SVML
  VectorAVXDouble result;
  result.d = _mm256_erfc_pd(a.d);
#else
  double a_d[4];
  a.convert(a_d);
  for (int i = 0; i < 4; ++i) a_d[i] = ::erfc(a_d[i]);
  VectorAVXDouble result(a_d);
#endif
  return result;
}
/// SIMD vector of 8 single-precision floating-point reals; operations use AVX instructions.
struct VectorAVXFloat {
  typedef float T;
  __m256 d;

  /// creates a vector of default-initialized values
  VectorAVXFloat() {}
  /// initializes all elements to the same value
  VectorAVXFloat(T a) { d = _mm256_set_ps(a, a, a, a, a, a, a, a); }
  /// creates a vector of values initialized by an ordinary static-sized array
  VectorAVXFloat(T (&a)[8]) { d = _mm256_loadu_ps(&a[0]); }
  /// creates a vector from eight values; note that _mm256_set_ps takes its
  /// arguments in reverse (highest-element-first) order
  VectorAVXFloat(T a0, T a1, T a2, T a3, T a4, T a5, T a6, T a7) {
    d = _mm256_set_ps(a7, a6, a5, a4, a3, a2, a1, a0);
  }

  VectorAVXFloat& operator=(T a) { d = _mm256_set_ps(a, a, a, a, a, a, a, a); return *this; }
  VectorAVXFloat& operator+=(VectorAVXFloat a) { d = _mm256_add_ps(d, a.d); return *this; }
  VectorAVXFloat& operator-=(VectorAVXFloat a) { d = _mm256_sub_ps(d, a.d); return *this; }

  VectorAVXFloat operator-() const {
    const static __m256 minus_one =
        _mm256_set_ps(-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0);
    VectorAVXFloat result;
    result.d = _mm256_mul_ps(this->d, minus_one);
    return result;
  }

#if LIBINT2_CPLUSPLUS_STD >= 2011
  /// conversion to the first element (C++11 and later)
  explicit operator float() const {
    float d0[8];
    ::memcpy(&(d0[0]), &d, sizeof(__m256));
    return d0[0];
  }
  explicit operator double() const {
    const float result_flt = this->operator float();
    return static_cast<double>(result_flt);
  }
#endif

  /// writes this to a
  void convert(T (&a)[8]) const { _mm256_storeu_ps(&a[0], d); }
};
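// Element-order sketch (illustrative values): the 8-argument constructor
// stores a0 in element 0, which is why _mm256_set_ps above receives the
// arguments reversed.
//
//   VectorAVXFloat v(0, 1, 2, 3, 4, 5, 6, 7);
//   float out[8];
//   v.convert(out);  // out[i] == i for i = 0..7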
inline VectorAVXFloat operator*(double a, VectorAVXFloat b) {
  VectorAVXFloat c;
  VectorAVXFloat _a(a);
  c.d = _mm256_mul_ps(_a.d, b.d);
  return c;
}

inline VectorAVXFloat operator*(VectorAVXFloat a, double b) {
  VectorAVXFloat c;
  VectorAVXFloat _b(b);
  c.d = _mm256_mul_ps(a.d, _b.d);
  return c;
}

inline VectorAVXFloat operator*(int a, VectorAVXFloat b) {
  VectorAVXFloat c;
  VectorAVXFloat _a((float)a);
  c.d = _mm256_mul_ps(_a.d, b.d);
  return c;
}

inline VectorAVXFloat operator*(VectorAVXFloat a, int b) {
  VectorAVXFloat c;
  VectorAVXFloat _b((float)b);
  c.d = _mm256_mul_ps(a.d, _b.d);
  return c;
}

inline VectorAVXFloat operator*(VectorAVXFloat a, VectorAVXFloat b) {
  VectorAVXFloat c;
  c.d = _mm256_mul_ps(a.d, b.d);
  return c;
}

inline VectorAVXFloat operator+(VectorAVXFloat a, VectorAVXFloat b) {
  VectorAVXFloat c;
  c.d = _mm256_add_ps(a.d, b.d);
  return c;
}

inline VectorAVXFloat operator-(VectorAVXFloat a, VectorAVXFloat b) {
  VectorAVXFloat c;
  c.d = _mm256_sub_ps(a.d, b.d);
  return c;
}

inline VectorAVXFloat operator/(VectorAVXFloat a, VectorAVXFloat b) {
  VectorAVXFloat c;
  c.d = _mm256_div_ps(a.d, b.d);
  return c;
}
#if defined(__FMA__)
inline VectorAVXFloat fma_plus(VectorAVXFloat a, VectorAVXFloat b, VectorAVXFloat c) {
  VectorAVXFloat d;
  d.d = _mm256_fmadd_ps(a.d, b.d, c.d);
  return d;
}
inline VectorAVXFloat fma_minus(VectorAVXFloat a, VectorAVXFloat b, VectorAVXFloat c) {
  VectorAVXFloat d;
  d.d = _mm256_fmsub_ps(a.d, b.d, c.d);
  return d;
}
#elif defined(__FMA4__)
inline VectorAVXFloat fma_plus(VectorAVXFloat a, VectorAVXFloat b, VectorAVXFloat c) {
  VectorAVXFloat d;
  d.d = _mm256_macc_ps(a.d, b.d, c.d);
  return d;
}
inline VectorAVXFloat fma_minus(VectorAVXFloat a, VectorAVXFloat b, VectorAVXFloat c) {
  VectorAVXFloat d;
  d.d = _mm256_msub_ps(a.d, b.d, c.d);
  return d;
}
#endif
inline VectorAVXFloat exp(VectorAVXFloat a) {
#if HAVE_INTEL_SVML
  VectorAVXFloat result;
  result.d = _mm256_exp_ps(a.d);
#else
  float a_d[8];
  a.convert(a_d);
  for (int i = 0; i < 8; ++i) a_d[i] = ::exp(a_d[i]);
  VectorAVXFloat result(a_d);
#endif
  return result;
}
inline VectorAVXFloat sqrt(VectorAVXFloat a) {
#if HAVE_INTEL_SVML
  VectorAVXFloat result;
  result.d = _mm256_sqrt_ps(a.d);
#else
  float a_d[8];
  a.convert(a_d);
  for (int i = 0; i < 8; ++i) a_d[i] = ::sqrt(a_d[i]);
  VectorAVXFloat result(a_d);
#endif
  return result;
}
inline VectorAVXFloat erf(VectorAVXFloat a) {
#if HAVE_INTEL_SVML
  VectorAVXFloat result;
  result.d = _mm256_erf_ps(a.d);
#else
  float a_d[8];
  a.convert(a_d);
  for (int i = 0; i < 8; ++i) a_d[i] = ::erf(a_d[i]);
  VectorAVXFloat result(a_d);
#endif
  return result;
}
inline VectorAVXFloat erfc(VectorAVXFloat a) {
#if HAVE_INTEL_SVML
  VectorAVXFloat result;
  result.d = _mm256_erfc_ps(a.d);
#else
  float a_d[8];
  a.convert(a_d);
  for (int i = 0; i < 8; ++i) a_d[i] = ::erfc(a_d[i]);
  VectorAVXFloat result(a_d);
#endif
  return result;
}
inline std::ostream& operator<<(std::ostream& os, VectorAVXDouble a) {
  double ad[4];
  a.convert(ad);
  os << "{" << ad[0] << "," << ad[1] << "," << ad[2] << "," << ad[3] << "}";
  return os;
}

inline std::ostream& operator<<(std::ostream& os, VectorAVXFloat a) {
  float ad[8];
  a.convert(ad);
  os << "{" << ad[0] << "," << ad[1] << "," << ad[2] << "," << ad[3]
     << "," << ad[4] << "," << ad[5] << "," << ad[6] << "," << ad[7] << "}";
  return os;
}
}  // namespace simd
}  // namespace libint2

namespace libint2 {

/// VectorAVXDouble is a vector type
template <>
struct is_vector<simd::VectorAVXDouble> {
  static const bool value = true;
};

/// traits of VectorAVXDouble
template <>
struct vector_traits<simd::VectorAVXDouble> {
  typedef double scalar_type;
  static const size_t extent = 4;
};

}  // namespace libint2

#endif  // __AVX__
#ifdef LIBINT2_HAVE_AGNER_VECTORCLASS
# include <vectorclass.h>
#endif

#endif  // header guard