1bbbf1280Sopenharmony_ci/*
2bbbf1280Sopenharmony_ci * Single-precision vector e^x function.
3bbbf1280Sopenharmony_ci *
4bbbf1280Sopenharmony_ci * Copyright (c) 2019, Arm Limited.
5bbbf1280Sopenharmony_ci * SPDX-License-Identifier: MIT
6bbbf1280Sopenharmony_ci */
7bbbf1280Sopenharmony_ci
8bbbf1280Sopenharmony_ci#include "mathlib.h"
9bbbf1280Sopenharmony_ci#include "v_math.h"
10bbbf1280Sopenharmony_ci#if V_SUPPORTED
11bbbf1280Sopenharmony_ci
12bbbf1280Sopenharmony_cistatic const float Poly[] = {
13bbbf1280Sopenharmony_ci  /* maxerr: 1.45358 +0.5 ulp.  */
14bbbf1280Sopenharmony_ci  0x1.0e4020p-7f,
15bbbf1280Sopenharmony_ci  0x1.573e2ep-5f,
16bbbf1280Sopenharmony_ci  0x1.555e66p-3f,
17bbbf1280Sopenharmony_ci  0x1.fffdb6p-2f,
18bbbf1280Sopenharmony_ci  0x1.ffffecp-1f,
19bbbf1280Sopenharmony_ci};
20bbbf1280Sopenharmony_ci#define C0 v_f32 (Poly[0])
21bbbf1280Sopenharmony_ci#define C1 v_f32 (Poly[1])
22bbbf1280Sopenharmony_ci#define C2 v_f32 (Poly[2])
23bbbf1280Sopenharmony_ci#define C3 v_f32 (Poly[3])
24bbbf1280Sopenharmony_ci#define C4 v_f32 (Poly[4])
25bbbf1280Sopenharmony_ci
26bbbf1280Sopenharmony_ci#define Shift v_f32 (0x1.8p23f)
27bbbf1280Sopenharmony_ci#define InvLn2 v_f32 (0x1.715476p+0f)
28bbbf1280Sopenharmony_ci#define Ln2hi v_f32 (0x1.62e4p-1f)
29bbbf1280Sopenharmony_ci#define Ln2lo v_f32 (0x1.7f7d1cp-20f)
30bbbf1280Sopenharmony_ci
31bbbf1280Sopenharmony_ciVPCS_ATTR
32bbbf1280Sopenharmony_cistatic v_f32_t
33bbbf1280Sopenharmony_cispecialcase (v_f32_t poly, v_f32_t n, v_u32_t e, v_f32_t absn, v_u32_t cmp1, v_f32_t scale)
34bbbf1280Sopenharmony_ci{
35bbbf1280Sopenharmony_ci  /* 2^n may overflow, break it up into s1*s2.  */
36bbbf1280Sopenharmony_ci  v_u32_t b = v_cond_u32 (n <= v_f32 (0.0f)) & v_u32 (0x82000000);
37bbbf1280Sopenharmony_ci  v_f32_t s1 = v_as_f32_u32 (v_u32 (0x7f000000) + b);
38bbbf1280Sopenharmony_ci  v_f32_t s2 = v_as_f32_u32 (e - b);
39bbbf1280Sopenharmony_ci  v_u32_t cmp2 = v_cond_u32 (absn > v_f32 (192.0f));
40bbbf1280Sopenharmony_ci  v_u32_t r2 = v_as_u32_f32 (s1 * s1);
41bbbf1280Sopenharmony_ci  v_u32_t r1 = v_as_u32_f32 (v_fma_f32 (poly, s2, s2) * s1);
42bbbf1280Sopenharmony_ci  /* Similar to r1 but avoids double rounding in the subnormal range.  */
43bbbf1280Sopenharmony_ci  v_u32_t r0 = v_as_u32_f32 (v_fma_f32 (poly, scale, scale));
44bbbf1280Sopenharmony_ci  return v_as_f32_u32 ((cmp2 & r2) | (~cmp2 & cmp1 & r1) | (~cmp1 & r0));
45bbbf1280Sopenharmony_ci}
46bbbf1280Sopenharmony_ci
47bbbf1280Sopenharmony_ciVPCS_ATTR
48bbbf1280Sopenharmony_civ_f32_t
49bbbf1280Sopenharmony_ciV_NAME(expf) (v_f32_t x)
50bbbf1280Sopenharmony_ci{
51bbbf1280Sopenharmony_ci  v_f32_t n, r, r2, scale, p, q, poly, absn, z;
52bbbf1280Sopenharmony_ci  v_u32_t cmp, e;
53bbbf1280Sopenharmony_ci
54bbbf1280Sopenharmony_ci  /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
55bbbf1280Sopenharmony_ci     x = ln2*n + r, with r in [-ln2/2, ln2/2].  */
56bbbf1280Sopenharmony_ci#if 1
57bbbf1280Sopenharmony_ci  z = v_fma_f32 (x, InvLn2, Shift);
58bbbf1280Sopenharmony_ci  n = z - Shift;
59bbbf1280Sopenharmony_ci  r = v_fma_f32 (n, -Ln2hi, x);
60bbbf1280Sopenharmony_ci  r = v_fma_f32 (n, -Ln2lo, r);
61bbbf1280Sopenharmony_ci  e = v_as_u32_f32 (z) << 23;
62bbbf1280Sopenharmony_ci#else
63bbbf1280Sopenharmony_ci  z = x * InvLn2;
64bbbf1280Sopenharmony_ci  n = v_round_f32 (z);
65bbbf1280Sopenharmony_ci  r = v_fma_f32 (n, -Ln2hi, x);
66bbbf1280Sopenharmony_ci  r = v_fma_f32 (n, -Ln2lo, r);
67bbbf1280Sopenharmony_ci  e = v_as_u32_s32 (v_round_s32 (z)) << 23;
68bbbf1280Sopenharmony_ci#endif
69bbbf1280Sopenharmony_ci  scale = v_as_f32_u32 (e + v_u32 (0x3f800000));
70bbbf1280Sopenharmony_ci  absn = v_abs_f32 (n);
71bbbf1280Sopenharmony_ci  cmp = v_cond_u32 (absn > v_f32 (126.0f));
72bbbf1280Sopenharmony_ci  r2 = r * r;
73bbbf1280Sopenharmony_ci  p = v_fma_f32 (C0, r, C1);
74bbbf1280Sopenharmony_ci  q = v_fma_f32 (C2, r, C3);
75bbbf1280Sopenharmony_ci  q = v_fma_f32 (p, r2, q);
76bbbf1280Sopenharmony_ci  p = C4 * r;
77bbbf1280Sopenharmony_ci  poly = v_fma_f32 (q, r2, p);
78bbbf1280Sopenharmony_ci  if (unlikely (v_any_u32 (cmp)))
79bbbf1280Sopenharmony_ci    return specialcase (poly, n, e, absn, cmp, scale);
80bbbf1280Sopenharmony_ci  return v_fma_f32 (poly, scale, scale);
81bbbf1280Sopenharmony_ci}
82bbbf1280Sopenharmony_ciVPCS_ALIAS
83bbbf1280Sopenharmony_ci#endif
84