Make more ML improvements

- Fix UX issues with llama.com
- Do housekeeping on libm code
- Add more vectorization to GGML
- Get GGJT quantizer programs working well
- Have the quantizer keep the output layer as f16c
- Prefetching improves performance 15% if you use fewer threads
This commit is contained in:
Justine Tunney 2023-05-16 08:07:23 -07:00
parent 80db9de173
commit e7eb0b3070
No known key found for this signature in database
GPG key ID: BE714B4575D6E328
46 changed files with 340 additions and 289 deletions

View file

@@ -28,6 +28,7 @@
#include "libc/math.h"
#include "libc/tinymath/invtrigl.internal.h"
#include "libc/tinymath/ldshape.internal.h"
#if !(LDBL_MANT_DIG == 53 && LDBL_MAX_EXP == 1024)
asm(".ident\t\"\\n\\n\
fdlibm (fdlibm license)\\n\
@@ -54,22 +55,20 @@ asm(".include \"libc/disclaimer.inc\"");
* Converted to long double by David Schultz <das@FreeBSD.ORG>.
*/
/**
* Returns arc cosine of 𝑥.
*
* @define atan2(fabs(sqrt((1-𝑥)*(1+𝑥))),𝑥)
* @domain -1 ≤ 𝑥 ≤ 1
*/
long double acosl(long double x) {
#if LDBL_MANT_DIG == 53 && LDBL_MAX_EXP == 1024
return acos(x);
#elif (LDBL_MANT_DIG == 64 || LDBL_MANT_DIG == 113) && LDBL_MAX_EXP == 16384
#if LDBL_MANT_DIG == 64
#define CLEARBOTTOM(u) (u.i.m &= -1ULL << 32)
#elif LDBL_MANT_DIG == 113
#define CLEARBOTTOM(u) (u.i.lo = 0)
#endif
/**
* Returns arc cosine of 𝑥.
*
* @define atan2(fabs(sqrt((1-𝑥)*(1+𝑥))),𝑥)
* @domain -1 ≤ 𝑥 ≤ 1
*/
long double acosl(long double x)
{
union ldshape u = {x};
long double z, s, c, f;
uint16_t e = u.i.se & 0x7fff;
@@ -102,8 +101,6 @@ long double acosl(long double x) {
f = u.f;
c = (z - f*f)/(s + f);
return 2*(__invtrigl_R(z)*s + c + f);
#else
#error "architecture unsupported"
#endif
}
#endif /* long double is long */