1/*
2 * Single-precision erf(x) function.
3 *
4 * Copyright (c) 2020, Arm Limited.
5 * SPDX-License-Identifier: MIT
6 */
7
8#include <stdint.h>
9#include <math.h>
10#include "math_config.h"
11
12#define TwoOverSqrtPiMinusOne 0x1.06eba8p-3f
13#define A __erff_data.erff_poly_A
14#define B __erff_data.erff_poly_B
15
16/* Top 12 bits of a float.  */
17static inline uint32_t
18top12 (float x)
19{
20  return asuint (x) >> 20;
21}
22
23/* Efficient implementation of erff
24   using either a pure polynomial approximation or
25   the exponential of a polynomial.
26   Worst-case error is 1.09ulps at 0x1.c111acp-1.  */
27float
28erff (float x)
29{
30  float r, x2, u;
31
32  /* Get top word.  */
33  uint32_t ix = asuint (x);
34  uint32_t sign = ix >> 31;
35  uint32_t ia12 = top12 (x) & 0x7ff;
36
37  /* Limit of both intervals is 0.875 for performance reasons but coefficients
38     computed on [0.0, 0.921875] and [0.921875, 4.0], which brought accuracy
39     from 0.94 to 1.1ulps.  */
40  if (ia12 < 0x3f6)
41    { /* a = |x| < 0.875.  */
42
43      /* Tiny and subnormal cases.  */
44      if (unlikely (ia12 < 0x318))
45	{ /* |x| < 2^(-28).  */
46	  if (unlikely (ia12 < 0x040))
47	    { /* |x| < 2^(-119).  */
48	      float y = fmaf (TwoOverSqrtPiMinusOne, x, x);
49	      return check_uflowf (y);
50	    }
51	  return x + TwoOverSqrtPiMinusOne * x;
52	}
53
54      x2 = x * x;
55
56      /* Normalized cases (|x| < 0.921875). Use Horner scheme for x+x*P(x^2).  */
57      r = A[5];
58      r = fmaf (r, x2, A[4]);
59      r = fmaf (r, x2, A[3]);
60      r = fmaf (r, x2, A[2]);
61      r = fmaf (r, x2, A[1]);
62      r = fmaf (r, x2, A[0]);
63      r = fmaf (r, x, x);
64    }
65  else if (ia12 < 0x408)
66    { /* |x| < 4.0 - Use a custom Estrin scheme.  */
67
68      float a = fabsf (x);
69      /* Start with Estrin scheme on high order (small magnitude) coefficients.  */
70      r = fmaf (B[6], a, B[5]);
71      u = fmaf (B[4], a, B[3]);
72      x2 = x * x;
73      r = fmaf (r, x2, u);
74      /* Then switch to pure Horner scheme.  */
75      r = fmaf (r, a, B[2]);
76      r = fmaf (r, a, B[1]);
77      r = fmaf (r, a, B[0]);
78      r = fmaf (r, a, a);
79      /* Single precision exponential with ~0.5ulps,
80	 ensures erff has max. rel. error
81	 < 1ulp on [0.921875, 4.0],
82	 < 1.1ulps on [0.875, 4.0].  */
83      r = expf (-r);
84      /* Explicit copysign (calling copysignf increases latency).  */
85      if (sign)
86	r = -1.0f + r;
87      else
88	r = 1.0f - r;
89    }
90  else
91    { /* |x| >= 4.0.  */
92
93      /* Special cases : erff(nan)=nan, erff(+inf)=+1 and erff(-inf)=-1.  */
94      if (unlikely (ia12 >= 0x7f8))
95	return (1.f - (float) ((ix >> 31) << 1)) + 1.f / x;
96
97      /* Explicit copysign (calling copysignf increases latency).  */
98      if (sign)
99	r = -1.0f;
100      else
101	r = 1.0f;
102    }
103  return r;
104}
105