cosmopolitan/libc/tinymath/erff.c

/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
╚──────────────────────────────────────────────────────────────────────────────╝
│                                                                              │
│  Optimized Routines                                                          │
│  Copyright (c) 2018-2024, Arm Limited.                                       │
│                                                                              │
│  Permission is hereby granted, free of charge, to any person obtaining       │
│  a copy of this software and associated documentation files (the             │
│  "Software"), to deal in the Software without restriction, including         │
│  without limitation the rights to use, copy, modify, merge, publish,         │
│  distribute, sublicense, and/or sell copies of the Software, and to          │
│  permit persons to whom the Software is furnished to do so, subject to       │
│  the following conditions:                                                   │
│                                                                              │
│  The above copyright notice and this permission notice shall be              │
│  included in all copies or substantial portions of the Software.             │
│                                                                              │
│  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,             │
│  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF          │
│  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.      │
│  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY        │
│  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,        │
│  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE           │
│  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │
│                                                                              │
╚─────────────────────────────────────────────────────────────────────────────*/
#include "libc/tinymath/arm.internal.h"
__static_yoink("arm_optimized_routines_notice");

#define TwoOverSqrtPiMinusOne 0x1.06eba8p-3f
#define A __erff_data.erff_poly_A
#define B __erff_data.erff_poly_B

/* Top 12 bits of a float.  */
static inline uint32_t
top12 (float x)
{
  return asuint (x) >> 20;
}

/* Efficient implementation of erff
   using either a pure polynomial approximation or
   the exponential of a polynomial.
   Worst-case error is 1.09ulps at 0x1.c111acp-1.  */
float
erff (float x)
{
  float r, x2, u;

  /* Get top word.  */
  uint32_t ix = asuint (x);
  uint32_t sign = ix >> 31;
  uint32_t ia12 = top12 (x) & 0x7ff;

  /* Limit of both intervals is 0.875 for performance reasons but coefficients
     computed on [0.0, 0.921875] and [0.921875, 4.0], which brought accuracy
     from 0.94 to 1.1ulps.  */
  if (ia12 < 0x3f6)
    { /* a = |x| < 0.875.  */

      /* Tiny and subnormal cases.  */
      if (unlikely (ia12 < 0x318))
	{ /* |x| < 2^(-28).  */
	  if (unlikely (ia12 < 0x040))
	    { /* |x| < 2^(-119).  */
	      float y = fmaf (TwoOverSqrtPiMinusOne, x, x);
	      return check_uflowf (y);
	    }
	  return x + TwoOverSqrtPiMinusOne * x;
	}

      x2 = x * x;

      /* Normalized cases (|x| < 0.921875). Use Horner scheme for x+x*P(x^2).  */
      r = A[5];
      r = fmaf (r, x2, A[4]);
      r = fmaf (r, x2, A[3]);
      r = fmaf (r, x2, A[2]);
      r = fmaf (r, x2, A[1]);
      r = fmaf (r, x2, A[0]);
      r = fmaf (r, x, x);
    }
  else if (ia12 < 0x408)
    { /* |x| < 4.0 - Use a custom Estrin scheme.  */

      float a = fabsf (x);
      /* Start with Estrin scheme on high order (small magnitude) coefficients.  */
      r = fmaf (B[6], a, B[5]);
      u = fmaf (B[4], a, B[3]);
      x2 = x * x;
      r = fmaf (r, x2, u);
      /* Then switch to pure Horner scheme.  */
      r = fmaf (r, a, B[2]);
      r = fmaf (r, a, B[1]);
      r = fmaf (r, a, B[0]);
      r = fmaf (r, a, a);
      /* Single precision exponential with ~0.5ulps,
	 ensures erff has max. rel. error
	 < 1ulp on [0.921875, 4.0],
	 < 1.1ulps on [0.875, 4.0].  */
      r = expf (-r);
      /* Explicit copysign (calling copysignf increases latency).  */
      if (sign)
	r = -1.0f + r;
      else
	r = 1.0f - r;
    }
  else
    { /* |x| >= 4.0.  */

      /* Special cases : erff(nan)=nan, erff(+inf)=+1 and erff(-inf)=-1.  */
      if (unlikely (ia12 >= 0x7f8))
	return (1.f - (float) ((ix >> 31) << 1)) + 1.f / x;

      /* Explicit copysign (calling copysignf increases latency).  */
      if (sign)
	r = -1.0f;
      else
	r = 1.0f;
    }
  return r;
}
Make quality improvements - Write some more unit tests - memcpy() on ARM is now faster - Address the Musl complex math FIXME comments - Some libm funcs like pow() now support setting errno - Import the latest and greatest math functions from ARM - Use more accurate atan2f() and log1pf() implementations - atoi() and atol() will no longer saturate or clobber errno 2024-02-25 22:57:28 +00:00			`/-- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│`
			`│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi │`
Add more math functions (#128) 2021-03-18 03:05:12 +00:00			`╚──────────────────────────────────────────────────────────────────────────────╝`
			`│ │`
Make quality improvements - Write some more unit tests - memcpy() on ARM is now faster - Address the Musl complex math FIXME comments - Some libm funcs like pow() now support setting errno - Import the latest and greatest math functions from ARM - Use more accurate atan2f() and log1pf() implementations - atoi() and atol() will no longer saturate or clobber errno 2024-02-25 22:57:28 +00:00			`│ Optimized Routines │`
			`│ Copyright (c) 2018-2024, Arm Limited. │`
Add more math functions (#128) 2021-03-18 03:05:12 +00:00			`│ │`
			`│ Permission is hereby granted, free of charge, to any person obtaining │`
			`│ a copy of this software and associated documentation files (the │`
			`│ "Software"), to deal in the Software without restriction, including │`
			`│ without limitation the rights to use, copy, modify, merge, publish, │`
			`│ distribute, sublicense, and/or sell copies of the Software, and to │`
			`│ permit persons to whom the Software is furnished to do so, subject to │`
			`│ the following conditions: │`
			`│ │`
			`│ The above copyright notice and this permission notice shall be │`
			`│ included in all copies or substantial portions of the Software. │`
			`│ │`
			`│ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, │`
			`│ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF │`
			`│ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. │`
			`│ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY │`
			`│ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, │`
			`│ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE │`
			`│ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. │`
			`│ │`
			`╚─────────────────────────────────────────────────────────────────────────────*/`
Make quality improvements - Write some more unit tests - memcpy() on ARM is now faster - Address the Musl complex math FIXME comments - Some libm funcs like pow() now support setting errno - Import the latest and greatest math functions from ARM - Use more accurate atan2f() and log1pf() implementations - atoi() and atol() will no longer saturate or clobber errno 2024-02-25 22:57:28 +00:00			`#include "libc/tinymath/arm.internal.h"`
			`__static_yoink("arm_optimized_routines_notice");`
Get more Python tests passing (#141) 2021-08-16 22:26:31 +00:00
Make quality improvements - Write some more unit tests - memcpy() on ARM is now faster - Address the Musl complex math FIXME comments - Some libm funcs like pow() now support setting errno - Import the latest and greatest math functions from ARM - Use more accurate atan2f() and log1pf() implementations - atoi() and atol() will no longer saturate or clobber errno 2024-02-25 22:57:28 +00:00			`#define TwoOverSqrtPiMinusOne 0x1.06eba8p-3f`
			`#define A __erff_data.erff_poly_A`
			`#define B __erff_data.erff_poly_B`
Add more math functions (#128) 2021-03-18 03:05:12 +00:00
Make quality improvements - Write some more unit tests - memcpy() on ARM is now faster - Address the Musl complex math FIXME comments - Some libm funcs like pow() now support setting errno - Import the latest and greatest math functions from ARM - Use more accurate atan2f() and log1pf() implementations - atoi() and atol() will no longer saturate or clobber errno 2024-02-25 22:57:28 +00:00			`/* Top 12 bits of a float. */`
			`static inline uint32_t`
			`top12 (float x)`
Add more math functions (#128) 2021-03-18 03:05:12 +00:00			`{`
Make quality improvements - Write some more unit tests - memcpy() on ARM is now faster - Address the Musl complex math FIXME comments - Some libm funcs like pow() now support setting errno - Import the latest and greatest math functions from ARM - Use more accurate atan2f() and log1pf() implementations - atoi() and atol() will no longer saturate or clobber errno 2024-02-25 22:57:28 +00:00			`return asuint (x) >> 20;`
Add more math functions (#128) 2021-03-18 03:05:12 +00:00			`}`

Make quality improvements - Write some more unit tests - memcpy() on ARM is now faster - Address the Musl complex math FIXME comments - Some libm funcs like pow() now support setting errno - Import the latest and greatest math functions from ARM - Use more accurate atan2f() and log1pf() implementations - atoi() and atol() will no longer saturate or clobber errno 2024-02-25 22:57:28 +00:00			`/* Efficient implementation of erff`
			`using either a pure polynomial approximation or`
			`the exponential of a polynomial.`
			`Worst-case error is 1.09ulps at 0x1.c111acp-1. */`
			`float`
			`erff (float x)`
Add more math functions (#128) 2021-03-18 03:05:12 +00:00			`{`
Make quality improvements - Write some more unit tests - memcpy() on ARM is now faster - Address the Musl complex math FIXME comments - Some libm funcs like pow() now support setting errno - Import the latest and greatest math functions from ARM - Use more accurate atan2f() and log1pf() implementations - atoi() and atol() will no longer saturate or clobber errno 2024-02-25 22:57:28 +00:00			`float r, x2, u;`

			`/* Get top word. */`
			`uint32_t ix = asuint (x);`
			`uint32_t sign = ix >> 31;`
			`uint32_t ia12 = top12 (x) & 0x7ff;`
Add more math functions (#128) 2021-03-18 03:05:12 +00:00
Make quality improvements - Write some more unit tests - memcpy() on ARM is now faster - Address the Musl complex math FIXME comments - Some libm funcs like pow() now support setting errno - Import the latest and greatest math functions from ARM - Use more accurate atan2f() and log1pf() implementations - atoi() and atol() will no longer saturate or clobber errno 2024-02-25 22:57:28 +00:00			`/* Limit of both intervals is 0.875 for performance reasons but coefficients`
			`computed on [0.0, 0.921875] and [0.921875, 4.0], which brought accuracy`
			`from 0.94 to 1.1ulps. */`
			`if (ia12 < 0x3f6)`
			`{ /* a = \|x\| < 0.875. */`
Add more math functions (#128) 2021-03-18 03:05:12 +00:00
Make quality improvements - Write some more unit tests - memcpy() on ARM is now faster - Address the Musl complex math FIXME comments - Some libm funcs like pow() now support setting errno - Import the latest and greatest math functions from ARM - Use more accurate atan2f() and log1pf() implementations - atoi() and atol() will no longer saturate or clobber errno 2024-02-25 22:57:28 +00:00			`/* Tiny and subnormal cases. */`
			`if (unlikely (ia12 < 0x318))`
			`{ /* \|x\| < 2^(-28). */`
			`if (unlikely (ia12 < 0x040))`
			`{ /* \|x\| < 2^(-119). */`
			`float y = fmaf (TwoOverSqrtPiMinusOne, x, x);`
			`return check_uflowf (y);`
			`}`
			`return x + TwoOverSqrtPiMinusOne * x;`
Add more math functions (#128) 2021-03-18 03:05:12 +00:00			`}`

Make quality improvements - Write some more unit tests - memcpy() on ARM is now faster - Address the Musl complex math FIXME comments - Some libm funcs like pow() now support setting errno - Import the latest and greatest math functions from ARM - Use more accurate atan2f() and log1pf() implementations - atoi() and atol() will no longer saturate or clobber errno 2024-02-25 22:57:28 +00:00			`x2 = x * x;`
Add more math functions (#128) 2021-03-18 03:05:12 +00:00
Make quality improvements - Write some more unit tests - memcpy() on ARM is now faster - Address the Musl complex math FIXME comments - Some libm funcs like pow() now support setting errno - Import the latest and greatest math functions from ARM - Use more accurate atan2f() and log1pf() implementations - atoi() and atol() will no longer saturate or clobber errno 2024-02-25 22:57:28 +00:00			`/* Normalized cases (\|x\| < 0.921875). Use Horner scheme for x+xP(x^2). /`
			`r = A[5];`
			`r = fmaf (r, x2, A[4]);`
			`r = fmaf (r, x2, A[3]);`
			`r = fmaf (r, x2, A[2]);`
			`r = fmaf (r, x2, A[1]);`
			`r = fmaf (r, x2, A[0]);`
			`r = fmaf (r, x, x);`
			`}`
			`else if (ia12 < 0x408)`
			`{ /* \|x\| < 4.0 - Use a custom Estrin scheme. */`
Add more math functions (#128) 2021-03-18 03:05:12 +00:00
Make quality improvements - Write some more unit tests - memcpy() on ARM is now faster - Address the Musl complex math FIXME comments - Some libm funcs like pow() now support setting errno - Import the latest and greatest math functions from ARM - Use more accurate atan2f() and log1pf() implementations - atoi() and atol() will no longer saturate or clobber errno 2024-02-25 22:57:28 +00:00			`float a = fabsf (x);`
			`/* Start with Estrin scheme on high order (small magnitude) coefficients. */`
			`r = fmaf (B[6], a, B[5]);`
			`u = fmaf (B[4], a, B[3]);`
			`x2 = x * x;`
			`r = fmaf (r, x2, u);`
			`/* Then switch to pure Horner scheme. */`
			`r = fmaf (r, a, B[2]);`
			`r = fmaf (r, a, B[1]);`
			`r = fmaf (r, a, B[0]);`
			`r = fmaf (r, a, a);`
			`/* Single precision exponential with ~0.5ulps,`
			`ensures erff has max. rel. error`
			`< 1ulp on [0.921875, 4.0],`
			`< 1.1ulps on [0.875, 4.0]. */`
			`r = expf (-r);`
			`/* Explicit copysign (calling copysignf increases latency). */`
			`if (sign)`
			`r = -1.0f + r;`
			`else`
			`r = 1.0f - r;`
			`}`
			`else`
			`{ /* \|x\| >= 4.0. */`
Add more math functions (#128) 2021-03-18 03:05:12 +00:00
Make quality improvements - Write some more unit tests - memcpy() on ARM is now faster - Address the Musl complex math FIXME comments - Some libm funcs like pow() now support setting errno - Import the latest and greatest math functions from ARM - Use more accurate atan2f() and log1pf() implementations - atoi() and atol() will no longer saturate or clobber errno 2024-02-25 22:57:28 +00:00			`/* Special cases : erff(nan)=nan, erff(+inf)=+1 and erff(-inf)=-1. */`
			`if (unlikely (ia12 >= 0x7f8))`
			`return (1.f - (float) ((ix >> 31) << 1)) + 1.f / x;`
Add more math functions (#128) 2021-03-18 03:05:12 +00:00
Make quality improvements - Write some more unit tests - memcpy() on ARM is now faster - Address the Musl complex math FIXME comments - Some libm funcs like pow() now support setting errno - Import the latest and greatest math functions from ARM - Use more accurate atan2f() and log1pf() implementations - atoi() and atol() will no longer saturate or clobber errno 2024-02-25 22:57:28 +00:00			`/* Explicit copysign (calling copysignf increases latency). */`
			`if (sign)`
			`r = -1.0f;`
			`else`
			`r = 1.0f;`
			`}`
			`return r;`
Add more math functions (#128) 2021-03-18 03:05:12 +00:00			`}`