LIBINT  2.6.0
boys.h
1 /*
2  * Copyright (C) 2004-2019 Edward F. Valeev
3  *
4  * This file is part of Libint.
5  *
6  * Libint is free software: you can redistribute it and/or modify
7  * it under the terms of the GNU Lesser General Public License as published by
8  * the Free Software Foundation, either version 3 of the License, or
9  * (at your option) any later version.
10  *
11  * Libint is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public License
17  * along with Libint. If not, see <http://www.gnu.org/licenses/>.
18  *
19  */
20 
21 #ifndef _libint2_src_lib_libint_boys_h_
22 #define _libint2_src_lib_libint_boys_h_
23 
24 #if defined(__cplusplus)
25 
26 #include <iostream>
27 #include <cstdlib>
28 #include <cmath>
29 #include <stdexcept>
30 #include <libint2/util/vector.h>
31 #include <cassert>
32 #include <vector>
33 #include <algorithm>
34 #include <limits>
35 #include <mutex>
36 #include <type_traits>
37 #include <memory>
38 
39 // from now on at least C++11 is required by default
40 #include <libint2/util/cxxstd.h>
41 #if LIBINT2_CPLUSPLUS_STD < 2011
42 # error "Libint2 C++ API requires C++11 support"
43 #endif
44 
45 #include <libint2/boys_fwd.h>
46 #include <libint2/numeric.h>
47 #include <libint2/initialize.h>
48 
49 #if HAVE_LAPACK // use F77-type interface for now, switch to LAPACKE later
50 extern "C" void dgesv_(const int* n,
51  const int* nrhs, double* A, const int* lda,
52  int* ipiv, double* b, const int* ldb,
53  int* info);
54 #endif
55 
56 namespace libint2 {
57 
59  template<typename Real>
60  class ExpensiveNumbers {
61  public:
62  ExpensiveNumbers(int ifac = -1, int idf = -1, int ibc = -1) {
63  if (ifac >= 0) {
64  fac.resize(ifac + 1);
65  fac[0] = 1.0;
66  for (int i = 1; i <= ifac; i++) {
67  fac[i] = i * fac[i - 1];
68  }
69  }
70 
71  if (idf >= 0) {
72  df.resize(idf + 1);
73  /* df[i] gives (i-1)!!, so that (-1)!! is defined... */
74  df[0] = 1.0;
75  if (idf >= 1)
76  df[1] = 1.0;
77  if (idf >= 2)
78  df[2] = 1.0;
79  for (int i = 3; i <= idf; i++) {
80  df[i] = (i - 1) * df[i - 2];
81  }
82  }
83 
84  if (ibc >= 0) {
85  bc_.resize((ibc+1)*(ibc+1));
86  std::fill(bc_.begin(), bc_.end(), Real(0));
87  bc.resize(ibc+1);
88  bc[0] = &bc_[0];
89  for(int i=1; i<=ibc; ++i)
90  bc[i] = bc[i-1] + (ibc+1);
91 
92  for(int i=0; i<=ibc; i++)
93  bc[i][0] = 1.0;
94  for(int i=0; i<=ibc; i++)
95  for(int j=1; j<=i; ++j)
96  bc[i][j] = bc[i][j-1] * Real(i-j+1) / Real(j);
97  }
98 
99  for (int i = 0; i < 128; i++) {
100  twoi1[i] = 1.0 / (Real(2.0) * i + Real(1.0));
101  ihalf[i] = Real(i) - Real(0.5);
102  }
103 
104  }
105 
106  ~ExpensiveNumbers() {
107  }
108 
109  std::vector<Real> fac;
110  std::vector<Real> df;
111  std::vector<Real*> bc;
112 
113  // these quantities are needed with indices <= mmax
114  // 64 is sufficient to handle up to 4 center integrals with up to L=15 basis functions
115  // but need higher values for Yukawa integrals ...
116  Real twoi1[128]; /* 1/(2 i + 1); needed for downward recursion */
117  Real ihalf[128]; /* i - 0.5, needed for upward recursion */
118 
119  private:
120  std::vector<Real> bc_;
121  };
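 // A quick illustration of what the tables above hold (the values follow directly from the
 // constructor; the instance name and arguments are arbitrary):
 //
 //   ExpensiveNumbers<double> numbers(10, 20, 8);
 //   // numbers.fac[n]    == n!             e.g. numbers.fac[4]   == 24
 //   // numbers.df[n]     == (n-1)!!        e.g. numbers.df[6]    == 5*3*1 == 15
 //   // numbers.bc[n][k]  == binomial(n,k)  e.g. numbers.bc[8][3] == 56
 //   // numbers.twoi1[i]  == 1/(2*i+1),     numbers.ihalf[i] == i - 0.5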
122 
123 #define _local_min_macro(a,b) ((a) > (b) ? (a) : (b)) // N.B. despite its name, this selects the larger of its two arguments
124 
143  template<typename Real>
144  struct FmEval_Reference {
145 
146  static std::shared_ptr<const FmEval_Reference> instance(int /* mmax */, Real /* precision */) {
147 
148  // thread-safe per C++11 standard [6.7.4]
149  static auto instance_ = std::make_shared<const FmEval_Reference>();
150 
151  return instance_;
152  }
153 
155  static Real eval(Real T, size_t m) {
156  assert(m < 100);
157  static const Real T_crit = std::numeric_limits<Real>::is_bounded == true ? -log( std::numeric_limits<Real>::min() * 100.5 / 2. ) : Real(0) ;
158  if (std::numeric_limits<Real>::is_bounded && T > T_crit)
159  throw std::overflow_error("FmEval_Reference<Real>::eval: Real lacks precision for the given value of argument T");
160  static const Real half = Real(1)/2;
161  Real denom = (m + half);
162  using std::exp;
163  Real term = exp(-T) / (2 * denom);
164  Real old_term = 0;
165  Real sum = term;
166  const Real epsilon = get_epsilon(T);
167  const Real epsilon_divided_10 = epsilon / 10;
168  do {
169  denom += 1;
170  old_term = term;
171  term = old_term * T / denom;
172  sum += term;
173  //rel_error = term / sum , hence iterate until rel_error = epsilon
174  // however, must ensure that contributions are decreasing to ensure that omitted contributions are smaller than epsilon
175  } while (term > sum * epsilon_divided_10 || old_term < term);
176 
177  return sum;
178  }
179 
184  static void eval(Real* Fm, Real T, size_t mmax) {
185 
186  // evaluate each F_m(T), m = 0 .. mmax, directly via its Maclaurin series;
187  // the series converges fastest for the largest m
188  // see JPC 94, 5564 (1990).
189  for(size_t m=0; m<=mmax; ++m)
190  Fm[m] = eval(T, m);
191  return;
192  }
193 
194  };
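 // Usage sketch (illustrative values): the struct evaluates the Boys function
 //   F_m(T) = \int_0^1 t^{2m} \exp(-T t^2) \, dt
 // by summing F_m(T) = (exp(-T)/2) \sum_{i>=0} T^i / [ (m+1/2)(m+3/2)...(m+i+1/2) ]
 // until the relative error falls below ~epsilon/10, as coded in eval(T, m) above.
 //
 //   double F[11];
 //   libint2::FmEval_Reference<double>::eval(F, 3.5, 10);  // fills F[0] .. F[10] with F_m(3.5)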
195 
206  template<typename Real>
207  struct FmEval_Reference2 {
208 
209  static std::shared_ptr<const FmEval_Reference2> instance(int /* mmax */, Real /* precision */) {
210 
211  // thread-safe per C++11 standard [6.7.4]
212  static auto instance_ = std::make_shared<const FmEval_Reference2>();
213 
214  return instance_;
215  }
216 
221  static void eval(Real* Fm, Real t, size_t mmax) {
222 
223  if (t < Real(117)) {
224  FmEval_Reference<Real>::eval(Fm,t,mmax);
225  }
226  else {
227  const Real two_over_sqrt_pi{1.128379167095512573896158903121545171688101258657997713688171443421284936882986828973487320404214727};
228  const Real K = 1/two_over_sqrt_pi;
229 
230  auto t2 = 2*t;
231  auto et = exp(-t);
232  auto sqrt_t = sqrt(t);
233  Fm[0] = K*erf(sqrt_t)/sqrt_t;
234  if (mmax > 0)
235  for(size_t m=0; m<=mmax-1; m++) {
236  Fm[m+1] = ((2*m + 1)*Fm[m] - et)/(t2);
237  }
238  }
239  }
240 
241  };
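 // Relative to FmEval_Reference, this variant switches to a closed form plus upward
 // recursion once T >= 117, where the Maclaurin series converges slowly:
 //   F_0(T)     = sqrt(pi)/(2 sqrt(T)) * erf(sqrt(T))
 //   F_{m+1}(T) = ( (2m+1) F_m(T) - exp(-T) ) / (2T)
 // The interface is identical, e.g. libint2::FmEval_Reference2<double>::eval(F, 150.0, 10);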
242 
246  template <typename Real = double>
247  class FmEval_Chebyshev7 {
248 
249 #include <libint2/boys_cheb7.h>
250 
251  static_assert(std::is_same<Real,double>::value, "FmEval_Chebyshev7 only supports double as the real type");
252 
253  static constexpr const int ORDER = interpolation_order;
254  static constexpr const int ORDERp1 = ORDER+1;
255 
256  static constexpr const Real T_crit = cheb_table_tmax;
257  static constexpr Real delta = cheb_table_delta;
258  static constexpr Real one_over_delta = 1/delta;
259 
260  int mmax;
261  ExpensiveNumbers<double> numbers_;
262  Real *c; /* the Chebyshev coefficients table, T_crit*one_over_delta by mmax*ORDERp1 */
263 
264  public:
269  FmEval_Chebyshev7(int m_max, double precision = std::numeric_limits<double>::epsilon()) :
270  mmax(m_max), numbers_(14) {
271 #if defined(__x86_64__) || defined(_M_X64) || defined(__i386) || defined(_M_IX86)
272 # if !defined(__AVX__) && defined(NDEBUG)
273  if (libint2::verbose()) {
274  static bool printed_performance_warning = false;
275  if (!printed_performance_warning) {
276  libint2::verbose_stream()
277  << "libint2::FmEval_Chebyshev7 on x86(-64) platforms needs AVX support for best performance"
278  << std::endl;
279  printed_performance_warning = true;
280  }
281  }
282 # endif
283 #endif
284  if (precision < std::numeric_limits<double>::epsilon())
285  throw std::invalid_argument(std::string("FmEval_Chebyshev7 does not support precision smaller than ") + std::to_string(std::numeric_limits<double>::epsilon()));
286  if (mmax > cheb_table_mmax)
287  throw std::invalid_argument(
288  "FmEval_Chebyshev7::init() : requested mmax exceeds the "
289  "hard-coded mmax");
290  if (m_max >= 0)
291  init_table();
292  }
293  ~FmEval_Chebyshev7() {
294  if (mmax >= 0) {
295  free(c);
296  }
297  }
298 
300  static std::shared_ptr<const FmEval_Chebyshev7> instance(int m_max, double = 0.0) {
301 
302  assert(m_max >= 0);
303  // thread-safe per C++11 standard [6.7.4]
304  static auto instance_ = std::make_shared<const FmEval_Chebyshev7>(m_max);
305 
306  while (instance_->max_m() < m_max) {
307  static std::mutex mtx;
308  std::lock_guard<std::mutex> lck(mtx);
309  if (instance_->max_m() < m_max) {
310  auto new_instance = std::make_shared<const FmEval_Chebyshev7>(m_max);
311  instance_ = new_instance;
312  }
313  }
314 
315  return instance_;
316  }
317 
319  int max_m() const { return mmax; }
320 
325  inline void eval(Real* Fm, Real x, int m_max) const {
326 
327  // large T => use upward recursion
328  // cost = 1 div + 1 sqrt + (1 + 2*(m-1)) muls
329  if (x > T_crit) {
330  const double one_over_x = 1/x;
331  Fm[0] = 0.88622692545275801365 * sqrt(one_over_x); // see Eq. (9.8.9) in Helgaker-Jorgensen-Olsen
332  if (m_max == 0)
333  return;
334  // this upward recursion formula omits - e^(-x)/(2x), which for x>T_crit is small enough to guarantee full double precision
335  for (int i = 1; i <= m_max; i++)
336  Fm[i] = Fm[i - 1] * numbers_.ihalf[i] * one_over_x; // see Eq. (9.8.13)
337  return;
338  }
339 
340  // ---------------------------------------------
341  // small and intermediate arguments => interpolate Fm and (optional) downward recursion
342  // ---------------------------------------------
343  // which interval does this x fall into?
344  const Real x_over_delta = x * one_over_delta;
345  const int iv = int(x_over_delta); // the interval index
346  const Real xd = x_over_delta - (Real)iv - 0.5; // this ranges from -0.5 to 0.5
347  const int m_min = 0;
348 
349 #if defined(__AVX__)
350  const auto x2 = xd*xd;
351  const auto x3 = x2*xd;
352  const auto x4 = x2*x2;
353  const auto x5 = x2*x3;
354  const auto x6 = x3*x3;
355  const auto x7 = x3*x4;
356  libint2::simd::VectorAVXDouble x0vec(1., xd, x2, x3);
357  libint2::simd::VectorAVXDouble x1vec(x4, x5, x6, x7);
358 #endif // AVX
359 
360  const Real *d = c + (ORDERp1) * (iv * (mmax+1) + m_min); // ptr to the interpolation data for m=mmin
361  int m = m_min;
362 #if defined(__AVX__)
363  if (m_max-m >=3) {
364  const int unroll_size = 4;
365  const int m_fence = (m_max + 2 - unroll_size);
366  for(; m<m_fence; m+=unroll_size, d+=ORDERp1*unroll_size) {
367  libint2::simd::VectorAVXDouble d00v, d01v, d10v, d11v,
368  d20v, d21v, d30v, d31v;
369  d00v.load_aligned(d); d01v.load_aligned((d+4));
370  d10v.load_aligned(d+ORDERp1); d11v.load_aligned((d+4)+ORDERp1);
371  d20v.load_aligned(d+2*ORDERp1); d21v.load_aligned((d+4)+2*ORDERp1);
372  d30v.load_aligned(d+3*ORDERp1); d31v.load_aligned((d+4)+3*ORDERp1);
373  libint2::simd::VectorAVXDouble fm0 = d00v * x0vec + d01v * x1vec;
374  libint2::simd::VectorAVXDouble fm1 = d10v * x0vec + d11v * x1vec;
375  libint2::simd::VectorAVXDouble fm2 = d20v * x0vec + d21v * x1vec;
376  libint2::simd::VectorAVXDouble fm3 = d30v * x0vec + d31v * x1vec;
377  libint2::simd::VectorAVXDouble sum0123 = horizontal_add(fm0, fm1, fm2, fm3);
378  sum0123.convert(&Fm[m]);
379  }
380  } // unroll_size=4
381  if (m_max-m >=1) {
382  const int unroll_size = 2;
383  const int m_fence = (m_max + 2 - unroll_size);
384  for(; m<m_fence; m+=unroll_size, d+=ORDERp1*unroll_size) {
385  libint2::simd::VectorAVXDouble d00v, d01v, d10v, d11v;
386  d00v.load_aligned(d);
387  d01v.load_aligned((d+4));
388  d10v.load_aligned(d+ORDERp1);
389  d11v.load_aligned((d+4)+ORDERp1);
390  libint2::simd::VectorAVXDouble fm0 = d00v * x0vec + d01v * x1vec;
391  libint2::simd::VectorAVXDouble fm1 = d10v * x0vec + d11v * x1vec;
392  libint2::simd::VectorSSEDouble sum01 = horizontal_add(fm0, fm1);
393  sum01.convert(&Fm[m]);
394  }
395  } // unroll_size=2
396  { // no unrolling
397  for(; m<=m_max; ++m, d+=ORDERp1) {
398  libint2::simd::VectorAVXDouble d0v, d1v;
399  d0v.load_aligned(d);
400  d1v.load_aligned(d+4);
401  Fm[m] = horizontal_add(d0v * x0vec + d1v * x1vec);
402  }
403  }
404 #else // AVX not available
405  for(m=m_min; m<=m_max; ++m, d+=ORDERp1) {
406  Fm[m] = d[0]
407  + xd * (d[1]
408  + xd * (d[2]
409  + xd * (d[3]
410  + xd * (d[4]
411  + xd * (d[5]
412  + xd * (d[6]
413  + xd * (d[7])))))));
414 
415  // // check against the reference value
416  // if (false) {
417  // double refvalue = FmEval_Reference2<double>::eval(x, m); // compute F(T)
418  // if (abs(refvalue - Fm[m]) > 1e-10) {
419  // std::cout << "T = " << x << " m = " << m << " cheb = "
420  // << Fm[m] << " ref = " << refvalue << std::endl;
421  // }
422  // }
423  }
424 #endif
425 
426 
427  } // eval()
428 
429  private:
430 
431  void init_table() {
432 
433  // get memory
434  void* result;
435  int status = posix_memalign(&result, ORDERp1*sizeof(Real), (mmax + 1) * cheb_table_nintervals * ORDERp1 * sizeof(Real));
436  if (status != 0) {
437  if (status == EINVAL)
438  throw std::logic_error(
439  "FmEval_Chebyshev7::init() : posix_memalign failed, alignment must be a power of 2 at least as large as sizeof(void *)");
440  if (status == ENOMEM)
441  throw std::bad_alloc();
442  abort(); // should be unreachable
443  }
444  c = static_cast<Real*>(result);
445 
446  // copy contents of static table into c
447  // need all intervals
448  for (std::size_t iv = 0; iv < cheb_table_nintervals; ++iv) {
449  // but only values of m up to mmax
450  std::copy(cheb_table[iv], cheb_table[iv]+(mmax+1)*ORDERp1, c+(iv*(mmax+1))*ORDERp1);
451  }
452  }
453 
454  }; // FmEval_Chebyshev7
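 // A minimal usage sketch (assumes a normal libint2 build, where the statically generated
 // boys_cheb7.h table included above is available):
 //
 //   auto fm = libint2::FmEval_Chebyshev7<>::instance(12);  // thread-safe shared instance
 //   double F[13];
 //   fm->eval(F, 7.0, 12);  // F[m] ~ F_m(7.0), m = 0 .. 12, via 7th-order Chebyshev interpolation
 //
 // For x > T_crit eval() skips interpolation entirely and uses the asymptotic value
 // F_0(x) ~ sqrt(pi)/(2 sqrt(x)) followed by the upward recursion coded above.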
455 
456 #if LIBINT2_CONSTEXPR_STATICS
457  template <typename Real>
458  constexpr
459  double FmEval_Chebyshev7<Real>::cheb_table[FmEval_Chebyshev7<Real>::cheb_table_nintervals][(FmEval_Chebyshev7<Real>::cheb_table_mmax+1)*(FmEval_Chebyshev7<Real>::interpolation_order+1)];
460 #else
461  // clang needs an explicit specialization declaration to avoid a warning
462 # ifdef __clang__
463  template <typename Real>
464  double FmEval_Chebyshev7<Real>::cheb_table[FmEval_Chebyshev7<Real>::cheb_table_nintervals][(FmEval_Chebyshev7<Real>::cheb_table_mmax+1)*(FmEval_Chebyshev7<Real>::interpolation_order+1)];
465 # endif
466 #endif
467 
468 #ifndef STATIC_OON
469 #define STATIC_OON
470  namespace {
471  const double oon[] = {0.0, 1.0, 1.0/2.0, 1.0/3.0, 1.0/4.0, 1.0/5.0, 1.0/6.0, 1.0/7.0, 1.0/8.0, 1.0/9.0, 1.0/10.0, 1.0/11.0};
472  }
473 #endif
474 
480  template<typename Real = double, int INTERPOLATION_ORDER = 7>
481  class FmEval_Taylor {
482  public:
483  static_assert(std::is_same<Real,double>::value, "FmEval_Taylor only supports double as the real type");
484 
485  static const int max_interp_order = 8;
486  static const bool INTERPOLATION_AND_RECURSION = false; // compute F_lmax(T) and then iterate down to F_0(T)? Else use interpolation only
487  const double soft_zero_;
488 
490  FmEval_Taylor(unsigned int mmax, Real precision = std::numeric_limits<Real>::epsilon()) :
491  soft_zero_(1e-6), cutoff_(precision), numbers_(
492  INTERPOLATION_ORDER + 1, 2 * (mmax + INTERPOLATION_ORDER - 1)) {
493 
494 #if defined(__x86_64__) || defined(_M_X64) || defined(__i386) || defined(_M_IX86)
495 # if !defined(__AVX__) && defined(NDEBUG)
496  if (libint2::verbose()) {
497  static bool printed_performance_warning = false;
498  if (!printed_performance_warning) {
499  libint2::verbose_stream()
500  << "libint2::FmEval_Taylor on x86(-64) platforms needs AVX support for best performance"
501  << std::endl;
502  printed_performance_warning = true;
503  }
504  }
505 # endif
506 #endif
507 
508  assert(mmax <= 63);
509 
510  const double sqrt_pi = std::sqrt(M_PI);
511 
512  /*---------------------------------------
513  We are doing Taylor interpolation with
514  n=TAYLOR_ORDER terms here:
515  error <= delT^n/(n+1)!
516  ---------------------------------------*/
517  delT_ = 2.0
518  * std::pow(cutoff_ * numbers_.fac[INTERPOLATION_ORDER + 1],
519  1.0 / INTERPOLATION_ORDER);
520  oodelT_ = 1.0 / delT_;
521  max_m_ = mmax + INTERPOLATION_ORDER - 1;
522 
523  T_crit_ = new Real[max_m_ + 1]; /*--- m=0 is included! ---*/
524  max_T_ = 0;
525  /*--- Figure out T_crit for each m and put into the T_crit ---*/
526  for (int m = max_m_; m >= 0; --m) {
527  /*------------------------------------------
528  Damped Newton-Raphson method to solve
529  T^{m-0.5}*exp(-T) = epsilon*Gamma(m+0.5)
530  The solution is the max T for which to do
531  the interpolation
532  ------------------------------------------*/
533  double T = -log(cutoff_);
534  const double egamma = cutoff_ * sqrt_pi * numbers_.df[2 * m]
535  / std::pow(2.0, m);
536  double T_new = T;
537  double func;
538  do {
539  const double damping_factor = 0.2;
540  T = T_new;
541  /* f(T) = the difference between LHS and RHS of the equation above */
542  func = std::pow(T, m - 0.5) * std::exp(-T) - egamma;
543  const double dfuncdT = ((m - 0.5) * std::pow(T, m - 1.5)
544  - std::pow(T, m - 0.5)) * std::exp(-T);
545  /* f(T) has 2 roots and has a maximum in between. If f'(T) > 0 we are to the left of the hump. Make a big step to the right. */
546  if (dfuncdT > 0.0) {
547  T_new *= 2.0;
548  } else {
549  /* damp the step */
550  double deltaT = -func / dfuncdT;
551  const double sign_deltaT = (deltaT > 0.0) ? 1.0 : -1.0;
552  const double max_deltaT = damping_factor * T;
553  if (std::fabs(deltaT) > max_deltaT)
554  deltaT = sign_deltaT * max_deltaT;
555  T_new = T + deltaT;
556  }
557  if (T_new <= 0.0) {
558  T_new = T / 2.0;
559  }
560  } while (std::fabs(func / egamma) >= soft_zero_);
561  T_crit_[m] = T_new;
562  const int T_idx = (int) std::floor(T_new / delT_);
563  max_T_ = std::max(max_T_, T_idx);
564  }
565 
566  // allocate the grid (see the comments below)
567  {
568  const int nrow = max_T_ + 1;
569  const int ncol = max_m_ + 1;
570  grid_ = new Real*[nrow];
571  grid_[0] = new Real[nrow * ncol];
572  //std::cout << "Allocated interpolation table of " << nrow * ncol << " reals" << std::endl;
573  for (int r = 1; r < nrow; ++r)
574  grid_[r] = grid_[r - 1] + ncol;
575  }
576 
577  /*-------------------------------------------------------
578  Tabulate the gamma function from t=delT to T_crit[m]:
579  1) include T=0 though the table is empty for T=0 since
580  Fm(0) is simple to compute
581  -------------------------------------------------------*/
582  /*--- do the mmax first ---*/
583  for (int T_idx = max_T_; T_idx >= 0; --T_idx) {
584  const double T = T_idx * delT_;
585  libint2::FmEval_Reference2<double>::eval(grid_[T_idx], T, max_m_);
586  }
587  }
588 
589  ~FmEval_Taylor() {
590  delete[] T_crit_;
591  delete[] grid_[0];
592  delete[] grid_;
593  }
594 
597  static std::shared_ptr<const FmEval_Taylor> instance(unsigned int mmax, Real precision = std::numeric_limits<Real>::epsilon()) {
598  assert(mmax >= 0);
599  assert(precision >= 0);
600  // thread-safe per C++11 standard [6.7.4]
601  static auto instance_ = std::make_shared<const FmEval_Taylor>(mmax, precision);
602 
603  while (instance_->max_m() < mmax || instance_->precision() > precision) {
604  static std::mutex mtx;
605  std::lock_guard<std::mutex> lck(mtx);
606  if (instance_->max_m() < mmax || instance_->precision() > precision) {
607  auto new_instance = std::make_shared<const FmEval_Taylor>(mmax, precision);
608  instance_ = new_instance;
609  }
610  }
611 
612  return instance_;
613  }
614 
616  int max_m() const { return max_m_ - INTERPOLATION_ORDER + 1; }
618  Real precision() const { return cutoff_; }
619 
625  void eval(Real* Fm, Real T, int mmax) const {
626  const double sqrt_pio2 = 1.2533141373155002512;
627  const double two_T = 2.0 * T;
628 
629  // stop recursion at mmin
630  const int mmin = INTERPOLATION_AND_RECURSION ? mmax : 0;
631  /*-------------------------------------
632  Compute Fm(T) from mmax down to mmin
633  -------------------------------------*/
634  const bool use_upward_recursion = true;
635  if (use_upward_recursion) {
636 // if (T > 30.0) {
637  if (T > T_crit_[0]) {
638  const double one_over_x = 1.0/T;
639  Fm[0] = 0.88622692545275801365 * sqrt(one_over_x); // see Eq. (9.8.9) in Helgaker-Jorgensen-Olsen
640  if (mmax == 0)
641  return;
642  // this upward recursion formula omits - e^(-x)/(2x), which for x>T_crit is <1e-15
643  for (int i = 1; i <= mmax; i++)
644  Fm[i] = Fm[i - 1] * numbers_.ihalf[i] * one_over_x; // see Eq. (9.8.13)
645  return;
646  }
647  }
648 
649  // since Tcrit grows with mmax, this condition only needs to be determined once
650  if (T > T_crit_[mmax]) {
651  double pow_two_T_to_minusjp05 = std::pow(two_T, -mmax - 0.5);
652  for (int m = mmax; m >= mmin; --m) {
653  /*--- Asymptotic formula ---*/
654  Fm[m] = numbers_.df[2 * m] * sqrt_pio2 * pow_two_T_to_minusjp05;
655  pow_two_T_to_minusjp05 *= two_T;
656  }
657  }
658  else
659  {
660  const int T_ind = (int) (0.5 + T * oodelT_);
661  const Real h = T_ind * delT_ - T;
662  const Real* F_row = grid_[T_ind] + mmin;
663 
664 #if defined (__AVX__)
665  libint2::simd::VectorAVXDouble h0123, h4567;
666  if (INTERPOLATION_ORDER == 3 || INTERPOLATION_ORDER == 7) {
667  const double h2 = h*h*oon[2];
668  const double h3 = h2*h*oon[3];
669  h0123 = libint2::simd::VectorAVXDouble (1.0, h, h2, h3);
670  if (INTERPOLATION_ORDER == 7) {
671  const double h4 = h3*h*oon[4];
672  const double h5 = h4*h*oon[5];
673  const double h6 = h5*h*oon[6];
674  const double h7 = h6*h*oon[7];
675  h4567 = libint2::simd::VectorAVXDouble (h4, h5, h6, h7);
676  }
677  }
678  // libint2::simd::VectorAVXDouble h0123(1.0);
679  // libint2::simd::VectorAVXDouble h4567(1.0);
680 #endif
681 
682  int m = mmin;
683  if (mmax-m >=1) {
684  const int unroll_size = 2;
685  const int m_fence = (mmax + 2 - unroll_size);
686  for(; m<m_fence; m+=unroll_size, F_row+=unroll_size) {
687 
688 #if defined(__AVX__)
689  if (INTERPOLATION_ORDER == 3 || INTERPOLATION_ORDER == 7) {
690  libint2::simd::VectorAVXDouble fr0_0123; fr0_0123.load(F_row);
691  libint2::simd::VectorAVXDouble fr1_0123; fr1_0123.load(F_row+1);
692  libint2::simd::VectorSSEDouble fm01 = horizontal_add(fr0_0123*h0123, fr1_0123*h0123);
693  if (INTERPOLATION_ORDER == 7) {
694  libint2::simd::VectorAVXDouble fr0_4567; fr0_4567.load(F_row+4);
695  libint2::simd::VectorAVXDouble fr1_4567; fr1_4567.load(F_row+5);
696  fm01 += horizontal_add(fr0_4567*h4567, fr1_4567*h4567);
697  }
698  fm01.convert(&Fm[m]);
699  }
700  else {
701 #endif
702  Real total0 = 0.0, total1 = 0.0;
703  for(int i=INTERPOLATION_ORDER; i>=1; --i) {
704  total0 = oon[i]*h*(F_row[i] + total0);
705  total1 = oon[i]*h*(F_row[i+1] + total1);
706  }
707  Fm[m] = F_row[0] + total0;
708  Fm[m+1] = F_row[1] + total1;
709 #if defined(__AVX__)
710  }
711 #endif
712  }
713  } // unroll_size = 2
714  if (m<=mmax) { // unroll_size = 1
715 #if defined(__AVX__)
716  if (INTERPOLATION_ORDER == 3 || INTERPOLATION_ORDER == 7) {
717  libint2::simd::VectorAVXDouble fr0123; fr0123.load(F_row);
718  if (INTERPOLATION_ORDER == 7) {
719  libint2::simd::VectorAVXDouble fr4567; fr4567.load(F_row+4);
720 // libint2::simd::VectorSSEDouble fm = horizontal_add(fr0123*h0123, fr4567*h4567);
721 // Fm[m] = horizontal_add(fm);
722  Fm[m] = horizontal_add(fr0123*h0123 + fr4567*h4567);
723  }
724  else { // INTERPOLATION_ORDER == 3
725  Fm[m] = horizontal_add(fr0123*h0123);
726  }
727  }
728  else {
729 #endif
730  Real total = 0.0;
731  for(int i=INTERPOLATION_ORDER; i>=1; --i) {
732  total = oon[i]*h*(F_row[i] + total);
733  }
734  Fm[m] = F_row[0] + total;
735 #if defined(__AVX__)
736  }
737 #endif
738  } // unroll_size = 1
739 
740  // check against the reference value
741 // if (false) {
742 // double refvalue = FmEval_Reference2<double>::eval(T, mmax); // compute F(T) with m=mmax
743 // if (abs(refvalue - Fm[mmax]) > 1e-14) {
744 // std::cout << "T = " << T << " m = " << mmax << " cheb = "
745 // << Fm[mmax] << " ref = " << refvalue << std::endl;
746 // }
747 // }
748 
749  } // if T < T_crit
750 
751  /*------------------------------------
752  And then do downward recursion in j
753  ------------------------------------*/
754  if (INTERPOLATION_AND_RECURSION && mmin > 0) {
755  const Real exp_mT = std::exp(-T);
756  for (int m = mmin - 1; m >= 0; --m) {
757  Fm[m] = (exp_mT + two_T * Fm[m+1]) * numbers_.twoi1[m];
758  }
759  }
760  }
761 
762  private:
763  Real **grid_; /* Table of "exact" Fm(T) values. Row index corresponds to
764  values of T (max_T+1 rows), column index to values
765  of m (max_m+1 columns) */
766  Real delT_; /* The step size for T, depends on cutoff */
767  Real oodelT_; /* 1.0 / delT_, see above */
768  Real cutoff_; /* Tolerance cutoff used in all computations of Fm(T) */
769  int max_m_; /* Maximum value of m in the table, depends on cutoff
770  and the number of terms in Taylor interpolation */
771  int max_T_; /* Maximum index of T in the table, depends on cutoff
772  and m */
773  Real *T_crit_; /* Maximum T for each row, depends on cutoff;
774  for a given m and T_idx <= max_T_idx[m] use Taylor interpolation,
775  for a given m and T_idx > max_T_idx[m] use the asymptotic formula */
776 
777  ExpensiveNumbers<double> numbers_;
778 
787  static double truncation_error(unsigned int m, double T) {
788  const double m2= m * m;
789  const double m3= m2 * m;
790  const double m4= m2 * m2;
791  const double T2= T * T;
792  const double T3= T2 * T;
793  const double T4= T2 * T2;
794  const double T5= T2 * T3;
795 
796  const double result = exp(-T) * (105 + 16*m4 + 16*m3*(T - 8) - 30*T + 12*T2
797  - 8*T3 + 16*T4 + 8*m2*(43 - 9*T + 2*T2) +
798  4*m*(-88 + 23*T - 8*T2 + 4*T3))/(32*T5);
799  return result;
800  }
809  static double truncation_error(double T) {
810  const double result = exp(-T) /(2*T);
811  return result;
812  }
813  };
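 // Usage mirrors FmEval_Chebyshev7; a sketch with the default 7th-order interpolation
 // (the precision argument controls the grid spacing delT_ chosen in the constructor):
 //
 //   auto fm = libint2::FmEval_Taylor<double, 7>::instance(12, 1e-14);
 //   double F[13];
 //   fm->eval(F, 7.0, 12);  // Taylor-interpolates F_0(7.0) .. F_12(7.0) from the tabulated grid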
814 
815 
816  namespace detail {
817  constexpr double pow10(long exp) {
818  return (exp == 0) ? 1. : ((exp > 0) ? 10. * pow10(exp-1) : 0.1 * pow10(exp+1));
819  }
820  }
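 // pow10 is a constexpr helper used below to turn the integer exponents stored in
 // tenno_cheb.h into numeric bounds, e.g. pow10(3) == 1000.0 and pow10(-2) == 0.01
 // (up to the rounding inherent in repeated multiplication by 0.1).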
821 
825 
833  template<typename Real = double>
834  struct TennoGmEval {
835 
836  private:
837 
838  #include <libint2/tenno_cheb.h>
839 
840  static_assert(std::is_same<Real,double>::value, "TennoGmEval only supports double as the real type");
841 
842  static const int mmin_ = -1;
843  static constexpr const Real Tmax = (1 << cheb_table_tmaxlog2);
844  static constexpr const Real Umax = detail::pow10(cheb_table_umaxlog10);
845  static constexpr const Real Umin = detail::pow10(cheb_table_uminlog10);
846  static constexpr const std::size_t ORDERp1 = interpolation_order + 1;
847  static constexpr const Real maxabsprecision = 1.4e-14;
848 
849  public:
854  TennoGmEval(unsigned int mmax, Real precision = -1) :
855  mmax_(mmax), precision_(precision),
856  numbers_() {
857 #if defined(__x86_64__) || defined(_M_X64) || defined(__i386) || defined(_M_IX86)
858 # if !defined(__AVX__) && defined(NDEBUG)
859  if (libint2::verbose()) {
860  static bool printed_performance_warning = false;
861  if (!printed_performance_warning) {
862  libint2::verbose_stream()
863  << "libint2::TennoGmEval on x86(-64) platforms needs AVX support for best performance"
864  << std::endl;
865  printed_performance_warning = true;
866  }
867  }
868 # endif
869 #endif
870 
871 // if (precision_ < maxabsprecision)
872 // throw std::invalid_argument(
873 // std::string(
874 // "TennoGmEval does not support precision smaller than ") +
875 // std::to_string(maxabsprecision));
876 
877  if (mmax > cheb_table_mmax)
878  throw std::invalid_argument(
879  "TennoGmEval::init() : requested mmax exceeds the "
880  "hard-coded mmax");
881  init_table();
882  }
883 
884  ~TennoGmEval() {
885  if (c_ != nullptr)
886  free(c_);
887  }
888 
890  static std::shared_ptr<const TennoGmEval> instance(int m_max, double = 0) {
891 
892  assert(m_max >= 0);
893  // thread-safe per C++11 standard [6.7.4]
894  static auto instance_ = std::make_shared<const TennoGmEval>(m_max);
895 
896  while (instance_->max_m() < m_max) {
897  static std::mutex mtx;
898  std::lock_guard<std::mutex> lck(mtx);
899  if (instance_->max_m() < m_max) {
900  auto new_instance = std::make_shared<const TennoGmEval>(m_max);
901  instance_ = new_instance;
902  }
903  }
904 
905  return instance_;
906  }
907 
908  unsigned int max_m() const { return mmax_; }
910  Real precision() const { return precision_; }
911 
922  void eval_yukawa(Real* Gm, Real one_over_rho, Real T, size_t mmax, Real zeta) const {
923  assert(mmax <= mmax_);
924  assert(T >= 0);
925  const auto U = 0.25 * zeta * zeta * one_over_rho;
926  assert(U >= 0);
927  if (T > Tmax || U < Umin) {
928  eval_Gm_urr(Gm, T, U, mmax, 0); // no need for G_-1
929  } else {
930  interpolate_Gm<false>(Gm, T, U, 0, mmax);
931  }
932  }
937  // \f$ G_{m}(T,U) = \int_0^1 t^{2m} \exp(U(1-t^{-2}) - Tt^2) dt \f$
944  void eval_slater(Real* Gm, Real one_over_rho, Real T, size_t mmax, Real zeta) const {
945  assert(mmax <= mmax_);
946  assert(T >= 0);
947  const auto U = 0.25 * zeta * zeta * one_over_rho;
948  assert(U > 0); // integral factorizes into 2 overlaps for U = 0
949  const auto zeta_over_two_rho = 0.5 * zeta * one_over_rho;
950  if (T > Tmax) {
951  eval_Gm_urr(Gm, T, U, mmax, -1);
952  } else {
953  interpolate_Gm<true>(Gm, T, U, zeta_over_two_rho, mmax);
954  }
955  }
956 
957  private:
958 
964  template <bool compute_Gm1>
965  static inline std::tuple<Real,Real> eval_G0_and_maybe_Gm1(Real T, Real U) {
966  assert(T >= 0);
967  assert(U >= 0);
968  const Real sqrtPi(1.7724538509055160272981674833411451827975494561224);
969 
970  Real G_m1 = 0;
971  Real G_0 = 0;
972  if (U == 0) { // \sqrt{U} G_{-1} is finite, need to handle that case separately
973  assert(compute_Gm1 == false);
974  if (T < std::numeric_limits<Real>::epsilon()) {
975  G_0 = 1;
976  }
977  else {
978  const Real sqrt_T = sqrt(T);
979  const Real sqrtPi_over_2 = sqrtPi * 0.5;
980  G_0 = sqrtPi_over_2 * erf(sqrt_T) / sqrt_T;
981  }
982  }
983  else if (T > std::numeric_limits<Real>::epsilon()) { // U > 0
984  const Real sqrt_U = sqrt(U);
985  const Real sqrt_T = sqrt(T);
986  const Real oo_sqrt_T = 1 / sqrt_T;
987  const Real oo_sqrt_U = compute_Gm1 ? (1 / sqrt_U) : 0;
988  const Real kappa = sqrt_U - sqrt_T;
989  const Real lambda = sqrt_U + sqrt_T;
990  const Real sqrtPi_over_4 = sqrtPi * 0.25;
991  const Real pfac = sqrtPi_over_4;
992  const Real erfc_k = exp(kappa * kappa - T) * erfc(kappa);
993  const Real erfc_l = exp(lambda * lambda - T) * erfc(lambda);
994 
995  G_m1 = compute_Gm1 ? pfac * (erfc_k + erfc_l) * oo_sqrt_U : 0.0;
996  G_0 = pfac * (erfc_k - erfc_l) * oo_sqrt_T;
997  }
998  else { // T = 0, U > 0
999  const Real exp_U = exp(U);
1000  const Real sqrt_U = sqrt(U);
1001  if (compute_Gm1) {
1002  const Real sqrtPi_over_2 = sqrtPi * 0.5;
1003  const Real oo_sqrt_U = compute_Gm1 ? (1 / sqrt_U) : 0;
1004 
1005  G_m1 = exp_U * sqrtPi_over_2 * erfc(sqrt_U) * oo_sqrt_U;
1006  }
1007  G_0 = 1 - exp_U * sqrtPi * erfc(sqrt_U) * sqrt_U;
1008  }
1009 
1010  return std::make_tuple(G_0, G_m1);
1011  }
1012 
1020  static void eval_Gm_urr(Real* Gm, Real T, Real U, size_t mmax, long mmin) {
1021  assert(mmin == 0 || mmin == -1);
1022  assert(T > 0);
1023  assert(U > 0);
1024 
1025  const Real sqrt_U = sqrt(U);
1026  const Real sqrt_T = sqrt(T);
1027  const Real oo_sqrt_T = 1 / sqrt_T;
1028  const Real oo_sqrt_U = 1 / sqrt_U;
1029  const Real kappa = sqrt_U - sqrt_T;
1030  const Real lambda = sqrt_U + sqrt_T;
1031  const Real sqrtPi_over_4(0.44311346272637900682454187083528629569938736403060);
1032  const Real pfac = sqrtPi_over_4;
1033  const Real erfc_k = exp(kappa*kappa - T) * erfc(kappa);
1034  const Real erfc_l = exp(lambda*lambda - T) * erfc(lambda);
1035 
1036  Real G_m1_value;
1037  Real& G_m1 = (mmin == -1) ? Gm[0] : G_m1_value;
1038  G_m1 = pfac * (erfc_k + erfc_l) * oo_sqrt_U;
1039  Real& G_0 = (mmin == -1) ? Gm[1] : Gm[0];
1040  G_0 = pfac * (erfc_k - erfc_l) * oo_sqrt_T;
1041 
1042  if (mmax > 0) {
1043 
1044  // first application of URR
1045  const Real oo_two_T = 0.5 / T;
1046  const Real two_U = 2.0 * U;
1047  const Real exp_mT = exp(-T);
1048 
1049  Real* Gmm1 = &G_m1;
1050  Real* Gm0 = &G_0;
1051  Real* Gmp1 = Gm0 + 1;
1052  for(unsigned int m=1, two_m_minus_1=1; m<=mmax; ++m, two_m_minus_1+=2, ++Gmp1) {
1053  *Gmp1 = oo_two_T * ( two_m_minus_1 * *Gm0 + two_U * *Gmm1 - exp_mT);
1054  Gmm1 = Gm0;
1055  Gm0 = Gmp1;
1056  }
1057  }
1058 
1059  return;
1060  }
1061 
1069  template <bool exp>
1070  void interpolate_Gm(Real* Gm_vec, Real T, Real U, Real zeta_over_two_rho, long mmax) const {
1071  assert(T >= 0);
1072  assert(U >= Umin && U <= Umax);
1073 
1074  // maps x in [0,2] to [-1/2,1/2] in a linear fashion
1075  auto linear_map_02 = [](Real x) {
1076  assert(x >= 0 && x <= 2);
1077  return (x - 1) * 0.5;
1078  };
1079  // maps x in [a, 2*a] to [-1/2,1/2] in a logarithmic fashion
1080  auto log2_map = [](Real x, Real one_over_a) {
1081  return std::log2(x * one_over_a) - 0.5;
1082  };
1083  // maps x in [a, 10*a] to [-1/2,1/2] in a logarithmic fashion
1084  auto log10_map = [](Real x, Real one_over_a) {
1085  return std::log10(x * one_over_a) - 0.5;
1086  };
1087 
1088  // which interval does this T fall into?
1089  const int Tint = T < 2 ? 0 : int(std::floor(std::log2(T))); assert(Tint >= 0 && Tint < 10);
1090  // precomputed 1 / 2^K
1091  const constexpr Real one_over_2K[] = {1, 0.5, 0.25, 0.125, 0.0625, 0.03125, 0.015625, 0.0078125, 0.00390625, .001953125};
1092  // which interval does this U fall into?
1093  const int Uint = int(std::floor(std::log10(U / Umin))); assert(Uint >= 0 && Uint < 10);
1094  // precomputed 1 / 10^(cheb_table_uminlog10 + K)
1095  const constexpr Real one_over_10K[] = {1. / detail::pow10(cheb_table_uminlog10),
1096  1. / detail::pow10(cheb_table_uminlog10+1),
1097  1. / detail::pow10(cheb_table_uminlog10+2),
1098  1. / detail::pow10(cheb_table_uminlog10+3),
1099  1. / detail::pow10(cheb_table_uminlog10+4),
1100  1. / detail::pow10(cheb_table_uminlog10+5),
1101  1. / detail::pow10(cheb_table_uminlog10+6),
1102  1. / detail::pow10(cheb_table_uminlog10+7),
1103  1. / detail::pow10(cheb_table_uminlog10+8),
1104  1. / detail::pow10(cheb_table_uminlog10+9)};
1105  const Real t = Tint == 0 ? linear_map_02(T) : log2_map(T, one_over_2K[Tint]); // this ranges from -0.5 to 0.5
1106  const Real u = log10_map(U, one_over_10K[Uint]); // this ranges from -0.5 to 0.5
1107 
1108  const int interval = Tint * 10 + Uint;
1109 
1110 #if defined(__AVX__)
1111  assert(ORDERp1 == 16);
1112  const auto t2 = t*t;
1113  const auto t3 = t2*t;
1114  const auto t4 = t2*t2;
1115  const auto t5 = t2*t3;
1116  const auto t6 = t3*t3;
1117  const auto t7 = t3*t4;
1118  const auto t8 = t4*t4;
1119  const auto t9 = t4*t5;
1120  const auto t10 = t5*t5;
1121  const auto t11 = t5*t6;
1122  const auto t12 = t6*t6;
1123  const auto t13 = t6*t7;
1124  const auto t14 = t7*t7;
1125  const auto t15 = t7*t8;
1126  const auto u2 = u*u;
1127  const auto u3 = u2*u;
1128  const auto u4 = u2*u2;
1129  const auto u5 = u2*u3;
1130  const auto u6 = u3*u3;
1131  const auto u7 = u3*u4;
1132  const auto u8 = u4*u4;
1133  const auto u9 = u4*u5;
1134  const auto u10 = u5*u5;
1135  const auto u11 = u5*u6;
1136  const auto u12 = u6*u6;
1137  const auto u13 = u6*u7;
1138  const auto u14 = u7*u7;
1139  const auto u15 = u7*u8;
1140  libint2::simd::VectorAVXDouble u0vec(1., u, u2, u3);
1141  libint2::simd::VectorAVXDouble u1vec(u4, u5, u6, u7);
1142  libint2::simd::VectorAVXDouble u2vec(u8, u9, u10, u11);
1143  libint2::simd::VectorAVXDouble u3vec(u12, u13, u14, u15);
1144 #endif // AVX
1145 
1146  Real Gmm1 = 0.0; // will track the previous value, only used for exp==false
1147 
1148  constexpr const bool compute_Gmm10 = true;
1149  long mmin_interp;
1150  if (compute_Gmm10) {
1151  // precision of interpolation for m=-1,0 can be insufficient, just evaluate explicitly
1152  Real G0;
1153  std::tie(exp ? G0 : Gm_vec[0], Gmm1) = eval_G0_and_maybe_Gm1<exp>(T, U);
1154  if (exp) {
1155  Gm_vec[0] = (Gmm1 - G0) * zeta_over_two_rho;
1156  Gmm1 = G0;
1157  }
1158  mmin_interp = 1;
1159  }
1160  else
1161  mmin_interp = (exp == true) ? -1 : 0;
1162 
1163  // now compute the rest
1164  for(long m=mmin_interp; m<=mmax; ++m){
1165  const Real *c_tuint = c_ + (ORDERp1) * (ORDERp1) * (interval * (mmax_ - mmin_ + 1) + (m - mmin_)); // ptr to the interpolation data for m=mmin
1166 #if defined(__AVX__)
1167  libint2::simd::VectorAVXDouble c00v, c01v, c02v, c03v, c10v, c11v, c12v, c13v,
1168  c20v, c21v, c22v, c23v, c30v, c31v, c32v, c33v,
1169  c40v, c41v, c42v, c43v, c50v, c51v, c52v, c53v,
1170  c60v, c61v, c62v, c63v, c70v, c71v, c72v, c73v;
1171  libint2::simd::VectorAVXDouble c80v, c81v, c82v, c83v, c90v, c91v, c92v, c93v,
1172  ca0v, ca1v, ca2v, ca3v, cb0v, cb1v, cb2v, cb3v,
1173  cc0v, cc1v, cc2v, cc3v, cd0v, cd1v, cd2v, cd3v,
1174  ce0v, ce1v, ce2v, ce3v, cf0v, cf1v, cf2v, cf3v;
1175  c00v.load_aligned(c_tuint); c01v.load_aligned((c_tuint+4));
1176  c02v.load_aligned(c_tuint+8); c03v.load_aligned((c_tuint+12));
1177  libint2::simd::VectorAVXDouble t0vec(1, 1, 1, 1);
1178  libint2::simd::VectorAVXDouble t0 = t0vec * (c00v * u0vec + c01v * u1vec + c02v * u2vec + c03v * u3vec);
1179  c10v.load_aligned(c_tuint +ORDERp1); c11v.load_aligned((c_tuint+4) +ORDERp1);
1180  c12v.load_aligned((c_tuint+8) +ORDERp1); c13v.load_aligned((c_tuint+12) +ORDERp1);
1181  libint2::simd::VectorAVXDouble t1vec(t, t, t, t);
1182  libint2::simd::VectorAVXDouble t1 = t1vec * (c10v * u0vec + c11v * u1vec + c12v * u2vec + c13v * u3vec);
1183  c20v.load_aligned(c_tuint +2*ORDERp1); c21v.load_aligned((c_tuint+4) +2*ORDERp1);
1184  c22v.load_aligned((c_tuint+8)+2*ORDERp1); c23v.load_aligned((c_tuint+12)+2*ORDERp1);
1185  libint2::simd::VectorAVXDouble t2vec(t2, t2, t2, t2);
1186  libint2::simd::VectorAVXDouble t2 = t2vec * (c20v * u0vec + c21v * u1vec + c22v * u2vec + c23v * u3vec);
1187  c30v.load_aligned(c_tuint +3*ORDERp1); c31v.load_aligned((c_tuint+4) +3*ORDERp1);
1188  c32v.load_aligned((c_tuint+8)+3*ORDERp1); c33v.load_aligned((c_tuint+12)+3*ORDERp1);
1189  libint2::simd::VectorAVXDouble t3vec(t3, t3, t3, t3);
1190  libint2::simd::VectorAVXDouble t3 = t3vec * (c30v * u0vec + c31v * u1vec + c32v * u2vec + c33v * u3vec);
1191  libint2::simd::VectorAVXDouble t0123 = horizontal_add(t0, t1, t2, t3);
1192 
1193  c40v.load_aligned(c_tuint +4*ORDERp1); c41v.load_aligned((c_tuint+4) +4*ORDERp1);
1194  c42v.load_aligned((c_tuint+8)+4*ORDERp1); c43v.load_aligned((c_tuint+12)+4*ORDERp1);
1195  libint2::simd::VectorAVXDouble t4vec(t4, t4, t4, t4);
1196  libint2::simd::VectorAVXDouble t4 = t4vec * (c40v * u0vec + c41v * u1vec + c42v * u2vec + c43v * u3vec);
1197  c50v.load_aligned(c_tuint +5*ORDERp1); c51v.load_aligned((c_tuint+4) +5*ORDERp1);
1198  c52v.load_aligned((c_tuint+8)+5*ORDERp1); c53v.load_aligned((c_tuint+12)+5*ORDERp1);
1199  libint2::simd::VectorAVXDouble t5vec(t5, t5, t5, t5);
1200  libint2::simd::VectorAVXDouble t5 = t5vec * (c50v * u0vec + c51v * u1vec + c52v * u2vec + c53v * u3vec);
1201  c60v.load_aligned(c_tuint +6*ORDERp1); c61v.load_aligned((c_tuint+4) +6*ORDERp1);
1202  c62v.load_aligned((c_tuint+8)+6*ORDERp1); c63v.load_aligned((c_tuint+12)+6*ORDERp1);
1203  libint2::simd::VectorAVXDouble t6vec(t6, t6, t6, t6);
1204  libint2::simd::VectorAVXDouble t6 = t6vec * (c60v * u0vec + c61v * u1vec + c62v * u2vec + c63v * u3vec);
1205  c70v.load_aligned(c_tuint +7*ORDERp1); c71v.load_aligned((c_tuint+4) +7*ORDERp1);
1206  c72v.load_aligned((c_tuint+8)+7*ORDERp1); c73v.load_aligned((c_tuint+12)+7*ORDERp1);
1207  libint2::simd::VectorAVXDouble t7vec(t7, t7, t7, t7);
1208  libint2::simd::VectorAVXDouble t7 = t7vec * (c70v * u0vec + c71v * u1vec + c72v * u2vec + c73v * u3vec);
1209  libint2::simd::VectorAVXDouble t4567 = horizontal_add(t4, t5, t6, t7);
1210 
1211  c80v.load_aligned(c_tuint +8*ORDERp1); c81v.load_aligned((c_tuint+4) +8*ORDERp1);
1212  c82v.load_aligned((c_tuint+8)+8*ORDERp1); c83v.load_aligned((c_tuint+12)+8*ORDERp1);
1213  libint2::simd::VectorAVXDouble t8vec(t8, t8, t8, t8);
1214  libint2::simd::VectorAVXDouble t8 = t8vec * (c80v * u0vec + c81v * u1vec + c82v * u2vec + c83v * u3vec);
1215  c90v.load_aligned(c_tuint +9*ORDERp1); c91v.load_aligned((c_tuint+4) +9*ORDERp1);
1216  c92v.load_aligned((c_tuint+8)+9*ORDERp1); c93v.load_aligned((c_tuint+12)+9*ORDERp1);
1217  libint2::simd::VectorAVXDouble t9vec(t9, t9, t9, t9);
1218  libint2::simd::VectorAVXDouble t9 = t9vec * (c90v * u0vec + c91v * u1vec + c92v * u2vec + c93v * u3vec);
1219  ca0v.load_aligned(c_tuint +10*ORDERp1); ca1v.load_aligned((c_tuint+4) +10*ORDERp1);
1220  ca2v.load_aligned((c_tuint+8)+10*ORDERp1); ca3v.load_aligned((c_tuint+12)+10*ORDERp1);
1221  libint2::simd::VectorAVXDouble tavec(t10, t10, t10, t10);
1222  libint2::simd::VectorAVXDouble ta = tavec * (ca0v * u0vec + ca1v * u1vec + ca2v * u2vec + ca3v * u3vec);
1223  cb0v.load_aligned(c_tuint +11*ORDERp1); cb1v.load_aligned((c_tuint+4) +11*ORDERp1);
1224  cb2v.load_aligned((c_tuint+8)+11*ORDERp1); cb3v.load_aligned((c_tuint+12)+11*ORDERp1);
1225  libint2::simd::VectorAVXDouble tbvec(t11, t11, t11, t11);
1226  libint2::simd::VectorAVXDouble tb = tbvec * (cb0v * u0vec + cb1v * u1vec + cb2v * u2vec + cb3v * u3vec);
1227  libint2::simd::VectorAVXDouble t89ab = horizontal_add(t8, t9, ta, tb);
1228 
1229  cc0v.load_aligned(c_tuint +12*ORDERp1); cc1v.load_aligned((c_tuint+4) +12*ORDERp1);
1230  cc2v.load_aligned((c_tuint+8)+12*ORDERp1); cc3v.load_aligned((c_tuint+12)+12*ORDERp1);
1231  libint2::simd::VectorAVXDouble tcvec(t12, t12, t12, t12);
1232  libint2::simd::VectorAVXDouble tc = tcvec * (cc0v * u0vec + cc1v * u1vec + cc2v * u2vec + cc3v * u3vec);
1233  cd0v.load_aligned(c_tuint +13*ORDERp1); cd1v.load_aligned((c_tuint+4) +13*ORDERp1);
1234  cd2v.load_aligned((c_tuint+8)+13*ORDERp1); cd3v.load_aligned((c_tuint+12)+13*ORDERp1);
1235  libint2::simd::VectorAVXDouble tdvec(t13, t13, t13, t13);
1236  libint2::simd::VectorAVXDouble td = tdvec * (cd0v * u0vec + cd1v * u1vec + cd2v * u2vec + cd3v * u3vec);
1237  ce0v.load_aligned(c_tuint +14*ORDERp1); ce1v.load_aligned((c_tuint+4) +14*ORDERp1);
1238  ce2v.load_aligned((c_tuint+8)+14*ORDERp1); ce3v.load_aligned((c_tuint+12)+14*ORDERp1);
1239  libint2::simd::VectorAVXDouble tevec(t14, t14, t14, t14);
1240  libint2::simd::VectorAVXDouble te = tevec * (ce0v * u0vec + ce1v * u1vec + ce2v * u2vec + ce3v * u3vec);
1241  cf0v.load_aligned(c_tuint +15*ORDERp1); cf1v.load_aligned((c_tuint+4)+15*ORDERp1);
1242  cf2v.load_aligned((c_tuint+8)+15*ORDERp1); cf3v.load_aligned((c_tuint+4)+15*ORDERp1);
1243  libint2::simd::VectorAVXDouble tfvec(t15, t15, t15, t15);
1244  libint2::simd::VectorAVXDouble tf = tfvec * (cf0v * u0vec + cf1v * u1vec + cf2v * u2vec + cf3v * u3vec);
1245  libint2::simd::VectorAVXDouble tcdef = horizontal_add(tc, td, te, tf);
1246 
1247  auto tall = horizontal_add(t0123, t4567, t89ab, tcdef);
1248  const auto Gm = horizontal_add(tall);
1249 #else // AVX
1250  // no AVX, use explicit loops (probably slow)
1251  double uvec[16];
1252  double tvec[16];
1253  uvec[0] = 1;
1254  tvec[0] = 1;
1255  for(int i=1; i!=16; ++i) {
1256  uvec[i] = uvec[i-1] * u;
1257  tvec[i] = tvec[i-1] * t;
1258  }
1259  double Gm = 0.0;
1260  for(int i=0, ij=0; i!=16; ++i) {
1261  for (int j = 0; j != 16; ++j, ++ij) {
1262  Gm += c_tuint[ij] * tvec[i] * uvec[j];
1263  }
1264  }
1265 #endif // AVX
1266 
1267  if (exp == false) {
1268  Gm_vec[m] = Gm;
1269  }
1270  else {
1271  if (m != -1) {
1272  Gm_vec[m] = (Gmm1 - Gm) * zeta_over_two_rho;
1273  }
1274  Gmm1 = Gm;
1275  }
1276  }
1277 
1278  return;
1279  }
1280 
1281  private:
1282  unsigned int mmax_;
1283  Real precision_;
1284  ExpensiveNumbers<Real> numbers_;
1285  Real* c_ = nullptr;
1286 
1287  void init_table() {
1288 
1289  // get memory
1290  void* result;
1291  int status = posix_memalign(&result, std::max(sizeof(Real), 32ul), (mmax_ - mmin_ + 1) * cheb_table_nintervals * ORDERp1 * ORDERp1 * sizeof(Real));
1292  if (status != 0) {
1293  if (status == EINVAL)
1294  throw std::logic_error(
1295  "TennoGmEval::init() : posix_memalign failed, alignment must be a power of 2 at least as large as sizeof(void *)");
1296  if (status == ENOMEM)
1297  throw std::bad_alloc();
1298  abort(); // should be unreachable
1299  }
1300  c_ = static_cast<Real*>(result);
1301 
1302  // copy contents of static table into c
1303  // need all intervals
1304  for (std::size_t iv = 0; iv < cheb_table_nintervals; ++iv) {
1305  // but only values of m up to mmax
1306  std::copy(cheb_table[iv], cheb_table[iv]+((mmax_-mmin_)+1)*ORDERp1*ORDERp1, c_+(iv*((mmax_-mmin_)+1))*ORDERp1*ORDERp1);
1307  }
1308  }
1309 
1310 
1311  }; // TennoGmEval
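 // Usage sketch for the two public entry points (values are illustrative; Gm must hold
 // mmax+1 elements, and both routines form U = zeta^2/(4 rho) internally):
 //
 //   auto gm = libint2::TennoGmEval<double>::instance(8);
 //   double Gm[9];
 //   gm->eval_yukawa(Gm, /* 1/rho = */ 0.5, /* T = */ 3.0, /* mmax = */ 8, /* zeta = */ 1.0);
 //   gm->eval_slater(Gm, /* 1/rho = */ 0.5, /* T = */ 3.0, /* mmax = */ 8, /* zeta = */ 1.0);
 //
 // Moderate (T, U) values are handled by the 2-d Chebyshev interpolation above; large T
 // (or, for the Yukawa case, very small U) falls back to the upward recursion in eval_Gm_urr.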
1312 
1313 #if LIBINT2_CONSTEXPR_STATICS
1314  template <typename Real>
1315  constexpr
1316  double TennoGmEval<Real>::cheb_table[TennoGmEval<Real>::cheb_table_nintervals][(TennoGmEval<Real>::cheb_table_mmax+2)*(TennoGmEval<Real>::interpolation_order+1)*(TennoGmEval<Real>::interpolation_order+1)];
1317 #else
1318  // clang needs an explicit specialization declaration to avoid a warning
1319 # ifdef __clang__
1320  template <typename Real>
1321  double TennoGmEval<Real>::cheb_table[TennoGmEval<Real>::cheb_table_nintervals][(TennoGmEval<Real>::cheb_table_mmax+2)*(TennoGmEval<Real>::interpolation_order+1)*(TennoGmEval<Real>::interpolation_order+1)];
1322 # endif
1323 #endif
1324 
1325  template<typename Real, int k>
1326  struct GaussianGmEval;
1327 
1328  namespace detail {
1329 
1331  template <typename CoreEval> struct CoreEvalScratch {
1332  CoreEvalScratch(const CoreEvalScratch&) = default;
1333  CoreEvalScratch(CoreEvalScratch&&) = default;
1334  explicit CoreEvalScratch(int) { }
1335  };
1337  template <typename Real>
1338  struct CoreEvalScratch<GaussianGmEval<Real, -1>> {
1339  std::vector<Real> Fm_;
1340  std::vector<Real> g_i;
1341  std::vector<Real> r_i;
1342  std::vector<Real> oorhog_i;
1343  CoreEvalScratch(const CoreEvalScratch&) = default;
1344  CoreEvalScratch(CoreEvalScratch&&) = default;
1345  explicit CoreEvalScratch(int mmax) {
1346  init(mmax);
1347  }
1348  private:
1349  void init(int mmax) {
1350  Fm_.resize(mmax+1);
1351  g_i.resize(mmax+1);
1352  r_i.resize(mmax+1);
1353  oorhog_i.resize(mmax+1);
1354  g_i[0] = 1.0;
1355  r_i[0] = 1.0;
1356  }
1357  };
1358  } // namespace libint2::detail
1359 
1363 
1385  template<typename Real, int k>
1386  struct GaussianGmEval : private detail::CoreEvalScratch<GaussianGmEval<Real,k>> // N.B. empty-base optimization
1387  {
1388 #ifndef LIBINT_USER_DEFINED_REAL
1389  using FmEvalType = libint2::FmEval_Chebyshev7<double>;
1390 #else
1391  using FmEvalType = libint2::FmEval_Reference<scalar_type>;
1392 #endif
1393 
1397  GaussianGmEval(int mmax, Real precision) :
1398  detail::CoreEvalScratch<GaussianGmEval<Real, k>>(mmax), mmax_(mmax),
1399  precision_(precision), fm_eval_(),
1400  numbers_(-1,-1,mmax) {
1401  assert(k == -1 || k == 0 || k == 2);
1402  // for k=-1 need to evaluate the Boys function
1403  if (k == -1) {
1404  fm_eval_ = FmEvalType::instance(mmax_, precision_);
1405  }
1406  }
1407 
1408  ~GaussianGmEval() {
1409  }
1410 
1413  static std::shared_ptr<GaussianGmEval> instance(unsigned int mmax, Real precision = std::numeric_limits<Real>::epsilon()) {
1414  assert(mmax >= 0);
1415  assert(precision >= 0);
1416  // thread-safe per C++11 standard [6.7.4]
1417  static auto instance_ = std::make_shared<GaussianGmEval>(mmax, precision);
1418 
1419  while (instance_->max_m() < mmax || instance_->precision() > precision) {
1420  static std::mutex mtx;
1421  std::lock_guard<std::mutex> lck(mtx);
1422  if (instance_->max_m() < mmax || instance_->precision() > precision) {
1423  auto new_instance = std::make_shared<GaussianGmEval>(mmax, precision);
1424  instance_ = new_instance;
1425  }
1426  }
1427 
1428  return instance_;
1429  }
1430 
1432  int max_m() const { return mmax_; }
1434  Real precision() const { return precision_; }
1435 
1450  template <typename AnyReal>
1451  void eval(Real* Gm, Real rho, Real T, size_t mmax,
1452  const std::vector<std::pair<AnyReal, AnyReal> >& geminal,
1453  void* scr = 0) {
1454 
1455  std::fill(Gm, Gm+mmax+1, Real(0));
1456 
1457  const auto sqrt_rho = sqrt(rho);
1458  const auto oo_sqrt_rho = 1/sqrt_rho;
1459  if (k == -1) {
1460  void* _scr = (scr == 0) ? this : scr;
1461  auto& scratch = *(reinterpret_cast<detail::CoreEvalScratch<GaussianGmEval<Real, -1>>*>(_scr));
1462  for(int i=1; i<=mmax; i++) {
1463  scratch.r_i[i] = scratch.r_i[i-1] * rho;
1464  }
1465  }
1466 
1467  typedef typename std::vector<std::pair<AnyReal, AnyReal> >::const_iterator citer;
1468  const citer gend = geminal.end();
1469  for(citer i=geminal.begin(); i!= gend; ++i) {
1470 
1471  const auto gamma = i->first;
1472  const auto gcoef = i->second;
1473  const auto rhog = rho + gamma;
1474  const auto oorhog = 1/rhog;
1475 
1476  const auto gorg = gamma * oorhog;
1477  const auto rorg = rho * oorhog;
1478  const auto sqrt_rho_org = sqrt_rho * oorhog;
1479  const auto sqrt_rhog = sqrt(rhog);
1480  const auto sqrt_rorg = sqrt_rho_org * sqrt_rhog;
1481 
1483  constexpr Real const_SQRTPI_2(0.88622692545275801364908374167057259139877472806119); /* sqrt(pi)/2 */
1484  const auto SS_K0G12_SS = gcoef * oo_sqrt_rho * const_SQRTPI_2 * rorg * sqrt_rorg * exp(-gorg*T);
1485 
1486  if (k == -1) {
1487  void* _scr = (scr == 0) ? this : scr;
1488  auto& scratch = *(reinterpret_cast<detail::CoreEvalScratch<GaussianGmEval<Real, -1>>*>(_scr));
1489 
1490  const auto rorgT = rorg * T;
1491  fm_eval_->eval(&scratch.Fm_[0], rorgT, mmax);
1492 
1493 #if 1
1494  constexpr Real const_2_SQRTPI(1.12837916709551257389615890312154517); /* 2/sqrt(pi) */
1495  const auto pfac = const_2_SQRTPI * sqrt_rhog * SS_K0G12_SS;
1496  scratch.oorhog_i[0] = pfac;
1497  for(int i=1; i<=mmax; i++) {
1498  scratch.g_i[i] = scratch.g_i[i-1] * gamma;
1499  scratch.oorhog_i[i] = scratch.oorhog_i[i-1] * oorhog;
1500  }
1501  for(int m=0; m<=mmax; m++) {
1502  Real ssss = 0.0;
1503  Real* bcm = numbers_.bc[m];
1504  for(int n=0; n<=m; n++) {
1505  ssss += bcm[n] * scratch.r_i[n] * scratch.g_i[m-n] * scratch.Fm_[n];
1506  }
1507  Gm[m] += ssss * scratch.oorhog_i[m];
1508  }
1509 #endif
1510  }
1511 
1512  if (k == 0) {
1513  auto ss_oper_ss_m = SS_K0G12_SS;
1514  Gm[0] += ss_oper_ss_m;
1515  for(int m=1; m<=mmax; ++m) {
1516  ss_oper_ss_m *= gorg;
1517  Gm[m] += ss_oper_ss_m;
1518  }
1519  }
1520 
1521  if (k == 2) {
1522 
1524  const auto rorgT = rorg * T;
1525  const auto SS_K2G12_SS_0 = (1.5 + rorgT) * (SS_K0G12_SS * oorhog);
1526  const auto SS_K2G12_SS_m1 = rorg * (SS_K0G12_SS * oorhog);
1527 
1528  auto SS_K2G12_SS_gorg_m = SS_K2G12_SS_0 ;
1529  auto SS_K2G12_SS_gorg_m1 = SS_K2G12_SS_m1;
1530  Gm[0] += SS_K2G12_SS_gorg_m;
1531  for(int m=1; m<=mmax; ++m) {
1532  SS_K2G12_SS_gorg_m *= gorg;
1533  Gm[m] += SS_K2G12_SS_gorg_m - m * SS_K2G12_SS_gorg_m1;
1534  SS_K2G12_SS_gorg_m1 *= gorg;
1535  }
1536  }
1537 
1538  }
1539 
1540  }
1541 
1542  private:
1543  int mmax_;
1544  Real precision_; //!< absolute precision
1545  std::shared_ptr<const FmEvalType> fm_eval_;
1546 
1547  ExpensiveNumbers<Real> numbers_;
1548  };
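 // Sketch of driving the contracted-Gaussian-geminal evaluator; the geminal is a list of
 // (exponent, coefficient) pairs, matching how eval() reads i->first / i->second, and k is
 // the power of r_12 multiplying the geminal (k = -1, 0, or 2, per the constructor assert):
 //
 //   std::vector<std::pair<double, double>> geminal{{0.2, 0.5}, {1.0, 0.3}};  // illustrative fit
 //   auto g12 = libint2::GaussianGmEval<double, 0>::instance(8);
 //   double Gm[9];
 //   g12->eval(Gm, /* rho = */ 1.5, /* T = */ 2.0, /* mmax = */ 8, geminal);
 //
 // The k == -1 branch additionally evaluates the Boys function via fm_eval_, and for that
 // case a per-thread scratch (detail::CoreEvalScratch) can be passed through the scr argument.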
1549 
1550  template <typename GmEvalFunction>
1551  struct GenericGmEval : private GmEvalFunction {
1552 
1553  typedef typename GmEvalFunction::value_type Real;
1554 
1555  GenericGmEval(int mmax, Real precision) : GmEvalFunction(mmax, precision),
1556  mmax_(mmax), precision_(precision) {}
1557 
1558  static std::shared_ptr<const GenericGmEval> instance(int mmax, Real precision = std::numeric_limits<Real>::epsilon()) {
1559  return std::make_shared<const GenericGmEval>(mmax, precision);
1560  }
1561 
1562  template <typename Real, typename... ExtraArgs>
1563  void eval(Real* Gm, Real rho, Real T, int mmax, ExtraArgs... args) const {
1564  assert(mmax <= mmax_);
1565  (GmEvalFunction(*this))(Gm, rho, T, mmax, std::forward<ExtraArgs>(args)...);
1566  }
1567 
1569  int max_m() const { return mmax_; }
1571  Real precision() const { return precision_; }
1572 
1573  private:
1574  int mmax_;
1575  Real precision_;
1576  };
1577 
1578  // these Gm engines need extra scratch data
1579  namespace os_core_ints {
1580  template <typename Real, int K> struct r12_xx_K_gm_eval;
1581  template <typename Real> struct erfc_coulomb_gm_eval;
1582  }
1583 
1584  namespace detail {
1586  template <typename Real>
1587  struct CoreEvalScratch<os_core_ints::r12_xx_K_gm_eval<Real, 1>> {
1588  std::vector<Real> Fm_;
1589  CoreEvalScratch(const CoreEvalScratch&) = default;
1590  CoreEvalScratch(CoreEvalScratch&&) = default;
1591  // need to store Fm(T) for m = 0 .. mmax+1
1592  explicit CoreEvalScratch(int mmax) { Fm_.resize(mmax+2); }
1593  };
1595  template <typename Real>
1596  struct CoreEvalScratch<os_core_ints::erfc_coulomb_gm_eval<Real>> {
1597  std::vector<Real> Fm_;
1598  CoreEvalScratch(const CoreEvalScratch&) = default;
1599  CoreEvalScratch(CoreEvalScratch&&) = default;
1600  // need to store Fm(T) for m = 0 .. mmax
1601  explicit CoreEvalScratch(int mmax) { Fm_.resize(mmax+1); }
1602  };
1603  }
1604 
1606  namespace os_core_ints {
1607 
1609  template <typename Real>
1610  struct delta_gm_eval {
1611  typedef Real value_type;
1612 
1613  delta_gm_eval(unsigned int, Real) {}
1614  void operator()(Real* Gm, Real rho, Real T, int mmax) const {
1615  constexpr static auto one_over_two_pi = 1.0 / (2.0 * M_PI);
1616  const auto G0 = exp(-T) * rho * one_over_two_pi;
1617  std::fill(Gm, Gm + mmax + 1, G0);
1618  }
1619  };
1620 
1625 
1626  template <typename Real, int K>
1627  struct r12_xx_K_gm_eval;
1628 
1629  template <typename Real>
1630  struct r12_xx_K_gm_eval<Real, 1>
1631  : private detail::CoreEvalScratch<r12_xx_K_gm_eval<Real, 1>> {
1632  typedef detail::CoreEvalScratch<r12_xx_K_gm_eval<Real, 1>> base_type;
1633  typedef Real value_type;
1634 
1635 #ifndef LIBINT_USER_DEFINED_REAL
1636  using FmEvalType = libint2::FmEval_Chebyshev7<double>;
1637 #else
1638  using FmEvalType = libint2::FmEval_Reference<scalar_type>;
1639 #endif
1640 
1641  r12_xx_K_gm_eval(unsigned int mmax, Real precision)
1642  : base_type(mmax) {
1643  fm_eval_ = FmEvalType::instance(mmax + 1, precision);
1644  }
1645  void operator()(Real* Gm, Real rho, Real T, int mmax) {
1646  fm_eval_->eval(&base_type::Fm_[0], T, mmax + 1);
1647  auto T_plus_m_plus_one = T + 1.0;
1648  Gm[0] = T_plus_m_plus_one * base_type::Fm_[0] - T * base_type::Fm_[1];
1649  auto minus_m = -1.0;
1650  T_plus_m_plus_one += 1.0;
1651  for (auto m = 1; m <= mmax;
1652  ++m, minus_m -= 1.0, T_plus_m_plus_one += 1.0) {
1653  Gm[m] =
1654  minus_m * base_type::Fm_[m - 1] + T_plus_m_plus_one * base_type::Fm_[m] - T * base_type::Fm_[m + 1];
1655  }
1656  }
1657 
1658  private:
1659  std::shared_ptr<const FmEvalType> fm_eval_; // need for odd K
1660  };
1661 
1663  template <typename Real>
1664  struct erf_coulomb_gm_eval {
1665  typedef Real value_type;
1666 
1667 #ifndef LIBINT_USER_DEFINED_REAL
1668  using FmEvalType = libint2::FmEval_Chebyshev7<double>;
1669 #else
1670  using FmEvalType = libint2::FmEval_Reference<scalar_type>;
1671 #endif
1672 
1673  erf_coulomb_gm_eval(unsigned int mmax, Real precision) {
1674  fm_eval_ = FmEvalType::instance(mmax, precision);
1675  }
1676  void operator()(Real* Gm, Real rho, Real T, int mmax, Real omega) const {
1677  if (omega > 0) {
1678  auto omega2 = omega * omega;
1679  auto omega2_over_omega2_plus_rho = omega2 / (omega2 + rho);
1680  fm_eval_->eval(Gm, T * omega2_over_omega2_plus_rho,
1681  mmax);
1682 
1683  using std::sqrt;
1684  auto ooversqrto2prho_exp_2mplus1 =
1685  sqrt(omega2_over_omega2_plus_rho);
1686  for (auto m = 0; m <= mmax;
1687  ++m, ooversqrto2prho_exp_2mplus1 *= omega2_over_omega2_plus_rho) {
1688  Gm[m] *= ooversqrto2prho_exp_2mplus1;
1689  }
1690  }
1691  else {
1692  std::fill(Gm, Gm+mmax+1, Real{0});
1693  }
1694  }
1695 
1696  private:
1697  std::shared_ptr<const FmEvalType> fm_eval_; // need for odd K
1698  };
1699 
1703  template <typename Real>
1704  struct erfc_coulomb_gm_eval : private
1705  detail::CoreEvalScratch<erfc_coulomb_gm_eval<Real>> {
1706  typedef detail::CoreEvalScratch<erfc_coulomb_gm_eval<Real>> base_type;
1707  typedef Real value_type;
1708 
1709  #ifndef LIBINT_USER_DEFINED_REAL
1710  using FmEvalType = libint2::FmEval_Chebyshev7<double>;
1711 #else
1712  using FmEvalType = libint2::FmEval_Reference<scalar_type>;
1713 #endif
1714 
1715  erfc_coulomb_gm_eval(unsigned int mmax, Real precision)
1716  : base_type(mmax) {
1717  fm_eval_ = FmEvalType::instance(mmax, precision);
1718  }
1719  void operator()(Real* Gm, Real rho, Real T, int mmax, Real omega) {
1720  fm_eval_->eval(&base_type::Fm_[0], T, mmax);
1721  std::copy(base_type::Fm_.cbegin(), base_type::Fm_.cbegin() + mmax + 1, Gm);
1722  if (omega > 0) {
1723  auto omega2 = omega * omega;
1724  auto omega2_over_omega2_plus_rho = omega2 / (omega2 + rho);
1725  fm_eval_->eval(&base_type::Fm_[0], T * omega2_over_omega2_plus_rho,
1726  mmax);
1727 
1728  using std::sqrt;
1729  auto ooversqrto2prho_exp_2mplus1 =
1730  sqrt(omega2_over_omega2_plus_rho);
1731  for (auto m = 0; m <= mmax;
1732  ++m, ooversqrto2prho_exp_2mplus1 *= omega2_over_omega2_plus_rho) {
1733  Gm[m] -= ooversqrto2prho_exp_2mplus1 * base_type::Fm_[m];
1734  }
1735  }
1736  }
1737 
1738  private:
1739  std::shared_ptr<const FmEvalType> fm_eval_; // need for odd K
1740  };
1741 
1742  } // namespace os_core_ints
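 // The evaluators in os_core_ints are meant to be wrapped in GenericGmEval, which simply
 // forwards any trailing arguments to the functor's operator(). A sketch for the
 // long-range (erf-attenuated) Coulomb kernel:
 //
 //   using ErfCoulombEval =
 //       libint2::GenericGmEval<libint2::os_core_ints::erf_coulomb_gm_eval<double>>;
 //   auto gm = ErfCoulombEval::instance(10);
 //   double Gm[11];
 //   gm->eval(Gm, /* rho = */ 1.2, /* T = */ 4.0, /* mmax = */ 10, /* omega = */ 0.4);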
1743 
1744  /*
1745  * Slater geminal fitting is available only if LAPACK is available
1746  */
1747 #if HAVE_LAPACK
1748  /*
1749  f[x_] := - Exp[-\[Zeta] x] / \[Zeta];
1750 
1751  ff[cc_, aa_, x_] := Sum[cc[[i]]*Exp[-aa[[i]] x^2], {i, 1, n}];
1752  */
  /// the fit target: Slater-type geminal, \f$ -\exp(-\zeta x)/\zeta \f$
  template <typename Real>
  Real
  fstg(Real zeta,
       Real x) {
    return -std::exp(-zeta*x)/zeta;
  }

  /// the fitting function: a contraction of \c n Gaussians, \f$ \sum_i c_i \exp(-a_i x^2) \f$
  template <typename Real>
  Real
  fngtg(const std::vector<Real>& cc,
        const std::vector<Real>& aa,
        Real x) {
    Real value = 0.0;
    const Real x2 = x * x;
    const unsigned int n = cc.size();
    for(unsigned int i=0; i<n; ++i)
      value += cc[i] * std::exp(- aa[i] * x2);
    return value;
  }

  // --- weighting functions ---
  // the L2 error is weighted by ww(x),
  // hence the pointwise error is weighted by sqrt(ww(x))
  template <typename Real>
  Real
  wwtewklopper(Real x) {
    const Real x2 = x * x;
    return x2 * std::exp(-2 * x2);
  }
  template <typename Real>
  Real
  wwcusp(Real x) {
    const Real x2 = x * x;
    const Real x6 = x2 * x2 * x2;
    return std::exp(-0.005 * x6);
  }
  // the default weight is wwcusp (the Tew-Klopper weight is kept for reference)
  template <typename Real>
  Real
  ww(Real x) {
    //return wwtewklopper(x);
    return wwcusp(x);
  }

  /// N.B. returns the sum of squares of the elements (the squared 2-norm), not the 2-norm itself
  template <typename Real>
  Real
  norm(const std::vector<Real>& vec) {
    Real value = 0.0;
    const unsigned int n = vec.size();
    for(unsigned int i=0; i<n; ++i)
      value += vec[i] * vec[i];
    return value;
  }

  /// solves the damped (Levenberg-Marquardt) normal equations,
  /// \f$ (A + \lambda \, \mathrm{diag}(A)) \, x = b \f$, using LAPACK's dgesv
  template <typename Real>
  void LinearSolveDamped(const std::vector<Real>& A,
                         const std::vector<Real>& b,
                         Real lambda,
                         std::vector<Real>& x) {
    const size_t n = b.size();
    std::vector<Real> Acopy(A);
    for(size_t m=0; m<n; ++m) Acopy[m*n + m] *= (1 + lambda);
    std::vector<Real> e(b);

    //int info = LAPACKE_dgesv( LAPACK_ROW_MAJOR, n, 1, &Acopy[0], n, &ipiv[0], &e[0], n );
    {
      std::vector<int> ipiv(n);
      int n = b.size();  // the F77 interface takes int dimensions; shadows the size_t n above
      int one = 1;
      int info;
      dgesv_(&n, &one, &Acopy[0], &n, &ipiv[0], &e[0], &n, &info);
      assert (info == 0);
    }

    x = e;
  }

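  // A minimal usage sketch of LinearSolveDamped (illustrative only; requires HAVE_LAPACK;
  // the matrix and vector values below are hypothetical placeholders). With lambda = 0 the
  // call reduces to a plain linear solve:
  //
  //   std::vector<double> A{4.0, 1.0,
  //                         1.0, 3.0};          // 2x2, symmetric
  //   std::vector<double> b{1.0, 2.0};
  //   std::vector<double> x;
  //   libint2::LinearSolveDamped(A, b, 0.0, x);  // x now solves A x = b

  /**
   * computes a weighted least-squares fit of the Slater-type geminal
   * \f$ -\exp(-\zeta x)/\zeta \f$ by a linear combination of \c n Gaussians
   * \f$ \sum_i c_i \exp(-a_i x^2) \f$ on \c npts evenly-spaced points in \c [xmin,xmax],
   * with weight ww(x), via Levenberg-Marquardt minimization.
   * On return \c geminal holds the (exponent, coefficient) pairs \f$ (a_i, c_i) \f$.
   */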
  template <typename Real>
  void stg_ng_fit(unsigned int n,
                  Real zeta,
                  std::vector< std::pair<Real, Real> >& geminal,
                  Real xmin = 0.0,
                  Real xmax = 10.0,
                  unsigned int npts = 1001) {

    // initial guess
    std::vector<Real> cc(n, 1.0); // coefficients
    std::vector<Real> aa(n); // exponents: an even-tempered progression with ratio 3
    for(unsigned int i=0; i<n; ++i)
      aa[i] = std::pow(3.0, (i + 2 - (n + 1)/2.0));

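    // The rescaling below follows from the L2 norms of f and ff on [0, infinity):
    //   ||f||^2  = int_0^inf exp(-2 zeta x)/zeta^2 dx = 1/(2 zeta^3)
    //                                =>  1/||f||  = zeta sqrt(2 zeta) = Nf
    //   ||ff||^2 = sum_ij c_i c_j int_0^inf exp(-(a_i+a_j) x^2) dx
    //            = (sqrt(pi)/2) sum_ij c_i c_j / sqrt(a_i+a_j)
    //                                =>  1/||ff|| = sqrt(2)/(sqrt(ffnormfac) pi^(1/4)) = Nff
    // so scaling the coefficients by -Nff/Nf makes ||ff|| = ||f|| (the sign matches f < 0).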
    // first rescale cc for ff[x] to match the norm of f[x]
    Real ffnormfac = 0.0;
    using std::sqrt;
    for(unsigned int i=0; i<n; ++i)
      for(unsigned int j=0; j<n; ++j)
        ffnormfac += cc[i] * cc[j]/sqrt(aa[i] + aa[j]);
    const Real Nf = sqrt(2.0 * zeta) * zeta;
    const Real Nff = sqrt(2.0) / (sqrt(ffnormfac) *
                                  sqrt(sqrt(M_PI)));
    for(unsigned int i=0; i<n; ++i) cc[i] *= -Nff/Nf;

    Real lambda0 = 1000; // damping factor; starts large and should approach 0 as the fit converges
    const Real nu = 3.0; // factor by which the damping is increased or decreased
    const Real epsilon = 1e-15; // convergence threshold on the relative change of the error norm
    const unsigned int maxniter = 200;

    // grid points on which we will fit
    std::vector<Real> xi(npts);
    for(unsigned int i=0; i<npts; ++i) xi[i] = xmin + (xmax - xmin)*i/(npts - 1);

    std::vector<Real> err(npts);

    const size_t nparams = 2*n; // params = expansion coefficients + gaussian exponents
    std::vector<Real> J( npts * nparams );
    std::vector<Real> delta(nparams);

//    std::cout << "iteration 0" << std::endl;
//    for(unsigned int i=0; i<n; ++i)
//      std::cout << cc[i] << " " << aa[i] << std::endl;

    Real errnormI;
    Real errnormIm1 = 1e3;
    bool converged = false;
    unsigned int iter = 0;
    while (!converged && iter < maxniter) {
      ++iter;  // the counter must advance even when the debug output below is disabled
//      std::cout << "Iteration " << iter << ": lambda = " << lambda0/nu << std::endl;

      // evaluate the weighted fit residual on the grid
      for(unsigned int i=0; i<npts; ++i) {
        const Real x = xi[i];
        err[i] = (fstg(zeta, x) - fngtg(cc, aa, x)) * sqrt(ww(x));
      }
      errnormI = norm(err)/sqrt((Real)npts);

//      std::cout << "|err|=" << errnormI << std::endl;
      converged = std::abs((errnormI - errnormIm1)/errnormIm1) <= epsilon;
      if (converged) break;
      errnormIm1 = errnormI;

      // Jacobian of the weighted fitting function w.r.t. the parameters:
      // columns 0..n-1 hold d(ff)/dc_j, columns n..2n-1 hold d(ff)/da_j (times sqrt(ww))
      for(unsigned int i=0; i<npts; ++i) {
        const Real x2 = xi[i] * xi[i];
        const Real sqrt_ww_x = sqrt(ww(xi[i]));
        const unsigned int ioffset = i * nparams;
        for(unsigned int j=0; j<n; ++j)
          J[ioffset+j] = (std::exp(-aa[j] * x2)) * sqrt_ww_x;
        const unsigned int ioffsetn = ioffset+n;
        for(unsigned int j=0; j<n; ++j)
          J[ioffsetn+j] = - sqrt_ww_x * x2 * cc[j] * std::exp(-aa[j] * x2);
      }

      // normal equations: A = J^T J, b = J^T err
      std::vector<Real> A( nparams * nparams );
      for(size_t r=0, rc=0; r<nparams; ++r) {
        for(size_t c=0; c<nparams; ++c, ++rc) {
          Real Arc = 0.0;
          for(size_t i=0, ir=r, ic=c; i<npts; ++i, ir+=nparams, ic+=nparams)
            Arc += J[ir] * J[ic];
          A[rc] = Arc;
        }
      }

      std::vector<Real> b( nparams );
      for(size_t r=0; r<nparams; ++r) {
        Real br = 0.0;
        for(size_t i=0, ir=r; i<npts; ++i, ir+=nparams)
          br += J[ir] * err[i];
        b[r] = br;
      }

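      // Levenberg-Marquardt step: solve (A + lambda diag(A)) delta = b, i.e.
      //   delta = (J^T J + lambda diag(J^T J))^{-1} J^T err,
      // and accept it only if it lowers the error norm (and keeps all exponents positive)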
      // try decreasing the damping first;
      // if not successful, keep increasing the damping until the error decreases
      lambda0 /= nu;
      for(int l=-1; l<1000; ++l) {

        LinearSolveDamped(A, b, lambda0, delta );

        std::vector<Real> cc_0(cc); for(unsigned int i=0; i<n; ++i) cc_0[i] += delta[i];
        std::vector<Real> aa_0(aa); for(unsigned int i=0; i<n; ++i) aa_0[i] += delta[i+n];

        // if any of the exponents went negative, the step is too large and the damping must be increased
        bool step_too_large = false;
        for(unsigned int i=0; i<n; ++i)
          if (aa_0[i] < 0.0) {
            step_too_large = true;
            break;
          }
        if (!step_too_large) {
          std::vector<Real> err_0(npts);
          for(unsigned int i=0; i<npts; ++i) {
            const Real x = xi[i];
            err_0[i] = (fstg(zeta, x) - fngtg(cc_0, aa_0, x)) * sqrt(ww(x));
          }
          const Real errnorm_0 = norm(err_0)/sqrt((Real)npts);
          if (errnorm_0 < errnormI) {
            cc = cc_0;
            aa = aa_0;
            break;
          }
          else // the step increased the error -- increase the damping
            lambda0 *= nu;
        }
        else // the step was too large -- increase the damping
          lambda0 *= nu;
      } // done adjusting the damping factor

    } // end of iterative minimization


    // if the maximum number of iterations was reached, fail (via assert) if the error is still too large
    assert(not (iter == maxniter && errnormI > 1e-10));

    geminal.resize(n);  // ensure the output has room for the n (exponent, coefficient) pairs
    for(unsigned int i=0; i<n; ++i)
      geminal[i] = std::make_pair(aa[i], cc[i]);
  }
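
  // A minimal usage sketch (illustrative only; requires HAVE_LAPACK; zeta = 1 and n = 6 are
  // hypothetical example values):
  //
  //   std::vector<std::pair<double, double>> geminal(6);  // (exponent, coefficient) pairs
  //   libint2::stg_ng_fit(6u, 1.0, geminal);
  //   for (const auto& p : geminal)
  //     std::cout << p.first << " " << p.second << std::endl;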
#endif

} // end of namespace libint2

#endif // C++ only
#endif // header guard