Make more ML improvements

- Fix UX issues with llama.com
- Do housekeeping on libm code
- Add more vectorization to GGML (see the first sketch after this list)
- Get GGJT quantizer programs working well
- Have the quantizer keep the output layer as f16 (see the second sketch after this list)
- Prefetching improves performance by 15% when fewer threads are used (see the third sketch after this list)
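First, a minimal sketch of the kind of vectorization involved, not the actual GGML kernel: it widens eight f16 weights at a time with the F16C _mm256_cvtph_ps intrinsic documented in the diff below and accumulates with FMA. The function name and the assumption that n is a multiple of 8 are illustrative.

    #include <immintrin.h>
    #include <stdint.h>

    /* hypothetical f16 × f32 dot product, not the GGML code itself;
       n is assumed to be a multiple of 8; compile with -mf16c -mfma */
    float dot_f16_f32(const uint16_t *x, const float *y, int n) {
      __m256 acc = _mm256_setzero_ps();
      for (int i = 0; i < n; i += 8) {
        /* widen eight IEEE half floats to single precision */
        __m256 xf = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(x + i)));
        acc = _mm256_fmadd_ps(xf, _mm256_loadu_ps(y + i), acc);
      }
      float s[8];
      _mm256_storeu_ps(s, acc);
      return s[0] + s[1] + s[2] + s[3] + s[4] + s[5] + s[6] + s[7];
    }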
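Second, a hypothetical sketch of the "keep the output layer as f16" rule; the tensor name and the should_quantize() helper are assumptions for illustration, not the actual GGJT quantizer code.

    #include <stdbool.h>
    #include <string.h>

    /* hypothetical policy: quantize 2-D weight tensors, but leave the
       output projection as f16 to preserve logit quality (names and
       helper shape are assumed, not taken from the quantizer) */
    static bool should_quantize(const char *name) {
      if (!strcmp(name, "output.weight")) return false;  /* stays f16 */
      return strstr(name, ".weight") != NULL;
    }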
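Third, a sketch of the prefetching idea: issue a software prefetch for the next row of weights while reducing the current one, so memory latency hides behind arithmetic when thread count is low. The one-row prefetch distance and the matvec shape are illustrative assumptions.

    #include <stddef.h>

    /* naive f32 matrix-vector product with a prefetch one row ahead;
       an illustrative sketch, not the GGML implementation */
    void matvec(const float *w, const float *x, float *out,
                int rows, int cols) {
      for (int r = 0; r < rows; ++r) {
        const float *row = w + (size_t)r * cols;
        if (r + 1 < rows)
          __builtin_prefetch(row + cols, 0, 3);  /* read, keep in cache */
        float sum = 0;
        for (int c = 0; c < cols; ++c) sum += row[c] * x[c];
        out[r] = sum;
      }
    }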
Justine Tunney 2023-05-16 08:07:23 -07:00
parent 80db9de173
commit e7eb0b3070
46 changed files with 340 additions and 289 deletions

@@ -18,10 +18,18 @@ __funline float _cvtsh_ss(unsigned short __S) {
   return __builtin_ia32_vec_ext_v4sf(__A, 0);
 }
 
+/**
+ * Converts four half-precision (16-bit) floating point values to
+ * single-precision floating point values.
+ */
 __funline __m128 _mm_cvtph_ps(__m128i __A) {
   return (__m128)__builtin_ia32_vcvtph2ps((__v8hi)__A);
 }
 
+/**
+ * Converts eight half-precision (16-bit) floating point values to
+ * single-precision floating point values.
+ */
 __funline __m256 _mm256_cvtph_ps(__m128i __A) {
   return (__m256)__builtin_ia32_vcvtph2ps256((__v8hi)__A);
 }
@@ -37,6 +45,10 @@ __funline __m128i _mm_cvtps_ph(__m128 __A, const int __I) {
   return (__m128i)__builtin_ia32_vcvtps2ph((__v4sf)__A, __I);
 }
 
+/**
+ * Converts eight single-precision floating point values to
+ * half-precision (16-bit) floating point values.
+ */
 __funline __m128i _mm256_cvtps_ph(__m256 __A, const int __I) {
   return (__m128i)__builtin_ia32_vcvtps2ph256((__v8sf)__A, __I);
 }
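For reference, a round trip through the two conversion directions documented above; this usage example is illustrative and not part of the commit (compile with -mf16c).

    #include <immintrin.h>
    #include <stdio.h>

    int main(void) {
      float in[8] = {1, 2, 3, 4, 0.5f, -0.25f, 100, -7};
      /* narrow eight floats to f16 with round-to-nearest-even,
         then widen them back to f32 */
      __m128i h = _mm256_cvtps_ph(_mm256_loadu_ps(in),
                                  _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
      float out[8];
      _mm256_storeu_ps(out, _mm256_cvtph_ps(h));
      for (int i = 0; i < 8; ++i) printf("%g -> %g\n", in[i], out[i]);
      return 0;
    }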