diff --git a/libc/log/log.h b/libc/log/log.h index 9f418da40..ff4e7ebe2 100644 --- a/libc/log/log.h +++ b/libc/log/log.h @@ -43,6 +43,7 @@ const char *commandvenv(const char *, const char *); const char *GetAddr2linePath(void); const char *GetGdbPath(void); void ShowCrashReports(void); +int MakeProcessNice(void); bool32 IsDebuggerPresent(bool); bool IsRunningUnderMake(void); const char *GetSiCodeName(int, int); diff --git a/libc/log/makeprocessnice.c b/libc/log/makeprocessnice.c new file mode 100644 index 000000000..0a3299cce --- /dev/null +++ b/libc/log/makeprocessnice.c @@ -0,0 +1,41 @@ +/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│ +│vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi│ +╞══════════════════════════════════════════════════════════════════════════════╡ +│ Copyright 2023 Justine Alexandra Roberts Tunney │ +│ │ +│ Permission to use, copy, modify, and/or distribute this software for │ +│ any purpose with or without fee is hereby granted, provided that the │ +│ above copyright notice and this permission notice appear in all copies. │ +│ │ +│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │ +│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │ +│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │ +│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │ +│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │ +│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │ +│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │ +│ PERFORMANCE OF THIS SOFTWARE. │ +╚─────────────────────────────────────────────────────────────────────────────*/ +#include "libc/calls/calls.h" +#include "libc/calls/struct/sched_param.h" +#include "libc/errno.h" +#include "libc/log/log.h" +#include "libc/sysv/consts/ioprio.h" +#include "libc/sysv/consts/prio.h" +#include "libc/sysv/consts/sched.h" + +/** + * Makes current process as low-priority as possible. + * + * @return 0 on success, or -1 w/ errno + * @note error reporting currently not implemented + */ +int MakeProcessNice(void) { + int e = errno; + setpriority(PRIO_PROCESS, 0, 10); + ioprio_set(IOPRIO_WHO_PROCESS, 0, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); + struct sched_param param = {sched_get_priority_min(SCHED_IDLE)}; + sched_setscheduler(0, SCHED_IDLE, ¶m); + errno = e; + return 0; +} diff --git a/libc/log/oncrash_arm64.c b/libc/log/oncrash_arm64.c index 6c3473ab4..96d885143 100644 --- a/libc/log/oncrash_arm64.c +++ b/libc/log/oncrash_arm64.c @@ -230,7 +230,7 @@ relegated void __oncrash_arm64(int sig, struct siginfo *si, void *arg) { // simply examining the program counter. pc = ctx->uc_mcontext.pc; Append(b, " %016lx sp %lx pc", ctx->uc_mcontext.sp, pc); - if (pc && (symbol = __get_symbol(st, pc))) { + if (pc && st && (symbol = __get_symbol(st, pc))) { addend = pc - st->addr_base; addend -= st->symbols[symbol].x; Append(b, " "); @@ -251,7 +251,7 @@ relegated void __oncrash_arm64(int sig, struct siginfo *si, void *arg) { fp = (struct StackFrame *)ctx->uc_mcontext.regs[29]; if (IsCode((pc = ctx->uc_mcontext.regs[30]))) { Append(b, " %016lx sp %lx lr", ctx->uc_mcontext.sp, pc); - if (pc && (symbol = __get_symbol(st, pc))) { + if (pc && st && (symbol = __get_symbol(st, pc))) { addend = pc - st->addr_base; addend -= st->symbols[symbol].x; Append(b, " "); @@ -282,7 +282,7 @@ relegated void __oncrash_arm64(int sig, struct siginfo *si, void *arg) { Append(b, " \n"); break; } - if ((pc = fp->addr)) { + if (st && (pc = fp->addr)) { if ((symbol = __get_symbol(st, pc))) { addend = pc - st->addr_base; addend -= st->symbols[symbol].x; @@ -294,7 +294,7 @@ relegated void __oncrash_arm64(int sig, struct siginfo *si, void *arg) { addend = 0; } Append(b, " %016lx fp %lx lr ", fp, pc); - if (!AppendFileLine(b, addr2line, debugbin, pc)) { + if (!AppendFileLine(b, addr2line, debugbin, pc) && st) { Append(b, "%s", __get_symbol_name(st, symbol)); if (addend) Append(b, "%+d", addend); } diff --git a/libc/runtime/finddebugbinary.c b/libc/runtime/finddebugbinary.c index 7e534e9f7..0db821288 100644 --- a/libc/runtime/finddebugbinary.c +++ b/libc/runtime/finddebugbinary.c @@ -16,12 +16,69 @@ │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │ │ PERFORMANCE OF THIS SOFTWARE. │ ╚─────────────────────────────────────────────────────────────────────────────*/ +#include "ape/sections.internal.h" #include "libc/calls/calls.h" +#include "libc/elf/def.h" +#include "libc/elf/elf.h" +#include "libc/elf/struct/ehdr.h" +#include "libc/elf/struct/sym.h" +#include "libc/errno.h" #include "libc/intrin/bits.h" #include "libc/macros.internal.h" #include "libc/runtime/runtime.h" #include "libc/runtime/symbols.internal.h" #include "libc/str/str.h" +#include "libc/sysv/consts/map.h" +#include "libc/sysv/consts/o.h" +#include "libc/sysv/consts/prot.h" + +static bool GetElfSymbolValue(const Elf64_Ehdr *ehdr, size_t esize, + const char *name, uint64_t *res) { + Elf64_Xword i, n; + const char *stab; + const Elf64_Sym *st; + if ((stab = GetElfStringTable(ehdr, esize)) && + (st = GetElfSymbolTable(ehdr, esize, &n))) { + for (i = 0; i < n; ++i) { + if (!strcmp(stab + st[i].st_name, name)) { + *res = st[i].st_value; + return true; + } + } + } + return false; +} + +static bool IsMyDebugBinaryImpl(const char *path) { + int fd; + void *map; + int64_t size; + uintptr_t value; + bool res = false; + if ((fd = open(path, O_RDONLY | O_CLOEXEC, 0)) != -1) { + // sanity test that this .com.dbg file (1) is an elf image, and (2) + // contains the same number of bytes of code as our .com executable + // which is currently running in memory. + if ((size = lseek(fd, 0, SEEK_END)) != -1 && + (map = mmap(0, size, PROT_READ, MAP_SHARED, fd, 0)) != MAP_FAILED) { + if (GetElfSymbolValue(map, size, "_etext", &value)) { + res = !_etext || value == (uintptr_t)_etext; + } + munmap(map, size); + } + close(fd); + } + return res; +} + +static bool IsMyDebugBinary(const char *path) { + int e; + bool res; + e = errno; + res = IsMyDebugBinaryImpl(path); + errno = e; + return res; +} /** * Returns path of binary with the debug information, or null. @@ -42,12 +99,12 @@ const char *FindDebugBinary(void) { } else if (n > 4 && READ32LE(p + n - 4) == READ32LE(".com") && n + 4 < ARRAYLEN(buf)) { mempcpy(mempcpy(buf, p, n), ".dbg", 5); - if (fileexists(buf)) { + if (IsMyDebugBinary(buf)) { res = buf; } } else if (n + 8 < ARRAYLEN(buf)) { mempcpy(mempcpy(buf, p, n), ".com.dbg", 9); - if (fileexists(buf)) { + if (IsMyDebugBinary(buf)) { res = buf; } } diff --git a/third_party/ggml/README.cosmo b/third_party/ggml/README.cosmo index f71191337..13b4ac6a7 100644 --- a/third_party/ggml/README.cosmo +++ b/third_party/ggml/README.cosmo @@ -9,13 +9,11 @@ LICENSE ORIGIN https://github.com/ggerganov/llama.cpp - commit 0b2da20538d01926b77ea237dd1c930c4d20b686 - Author: Stephan Walter - Date: Wed Apr 26 20:26:42 2023 +0000 - ggml : slightly faster AVX2 implementation for Q5 (#1197) + d8bd0013e8768aaa3dc9cfc1ff01499419d5348e LOCAL CHANGES + - Maintaining support for deprecated file formats - Make it possible for loaded prompts to be cached to disk - Introduce -v and --verbose flags - Reduce batch size from 512 to 32 @@ -25,7 +23,6 @@ LOCAL CHANGES - Change --reverse-prompt to no longer imply --interactive - Permit --reverse-prompt specifying custom EOS if non-interactive - Refactor headers per cosmo convention - - Replace code like 'ggjt' with READ32BE("ggjt") - Remove C++ exceptions; use Die() function instead - Removed division from matrix multiplication. - Let quantizer convert between ggmt formats diff --git a/third_party/ggml/common.cc b/third_party/ggml/common.cc index d780d4f9f..3931996df 100644 --- a/third_party/ggml/common.cc +++ b/third_party/ggml/common.cc @@ -34,6 +34,7 @@ #include "libc/stdio/stdio.h" #include "libc/str/str.h" #include "libc/sysv/consts/fileno.h" +#include "third_party/ggml/llama.h" #include "third_party/ggml/llama_util.h" #include "third_party/libcxx/algorithm" #include "third_party/libcxx/cassert" @@ -258,6 +259,17 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { params.use_color = true; } else if (arg == "--mlock") { params.use_mlock = true; + } else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") { + if (++i >= argc) { + invalid_param = true; + break; + } +#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD + params.n_gpu_layers = std::stoi(argv[i]); +#else + fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n"); + fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); +#endif } else if (arg == "--no-mmap") { params.use_mmap = false; } else if (arg == "--mtest") { @@ -425,6 +437,10 @@ void gpt_print_usage(FILE *f, int /*argc*/, char ** argv, const gpt_params & par if (llama_mmap_supported()) { fprintf(f, " --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n"); } +#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD + fprintf(stderr, " -ngl N, --n-gpu-layers N\n"); + fprintf(stderr, " number of layers to store in VRAM\n"); +#endif fprintf(f, " --mtest compute maximum memory usage\n"); fprintf(f, " --verbose-prompt print prompt before generation\n"); fprintf(f, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n"); diff --git a/third_party/ggml/common.h b/third_party/ggml/common.h index 8cd4562ba..8a150d2ad 100644 --- a/third_party/ggml/common.h +++ b/third_party/ggml/common.h @@ -27,6 +27,7 @@ struct gpt_params { int32_t n_ctx = 512; // context size int32_t n_batch = 32; // batch size for prompt processing (must be >=32 to use BLAS) int32_t n_keep = 0; // number of tokens to keep from initial prompt + int32_t n_gpu_layers = 0; // number of layers to store in VRAM // sampling parameters std::unordered_map logit_bias; // logit bias for specific tokens diff --git a/third_party/ggml/ggjt.v2.c b/third_party/ggml/ggjt.v2.c new file mode 100644 index 000000000..3d2ad1da5 --- /dev/null +++ b/third_party/ggml/ggjt.v2.c @@ -0,0 +1,152 @@ +/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8 -*-│ +│vi: set net ft=c ts=4 sts=4 sw=4 fenc=utf-8 :vi│ +╞══════════════════════════════════════════════════════════════════════════════╡ +│ Copyright 2023 Justine Alexandra Roberts Tunney │ +│ │ +│ Permission to use, copy, modify, and/or distribute this software for │ +│ any purpose with or without fee is hereby granted, provided that the │ +│ above copyright notice and this permission notice appear in all copies. │ +│ │ +│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │ +│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │ +│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │ +│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │ +│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │ +│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │ +│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │ +│ PERFORMANCE OF THIS SOFTWARE. │ +╚─────────────────────────────────────────────────────────────────────────────*/ +#include "libc/assert.h" +#include "third_party/ggml/ggjt.v2.q4_0.h" +#include "third_party/ggml/ggjt.v2.q4_1.h" +#include "third_party/ggml/ggjt.v2.q5_0.h" +#include "third_party/ggml/ggjt.v2.q5_1.h" +#include "third_party/ggml/ggjt.v2.q8_0.h" +#include "third_party/ggml/ggjt.v2.q8_1.h" +#include "third_party/ggml/ggml.h" +// clang-format off + +static const int ggjt_v2_blck_size[GGML_TYPE_COUNT] = { + [GGML_TYPE_F32] = 1, + [GGML_TYPE_F16] = 1, + [GGML_TYPE_Q4_0] = V2_QK4_0, + [GGML_TYPE_Q4_1] = V2_QK4_1, + [GGML_TYPE_Q5_0] = V2_QK5_0, + [GGML_TYPE_Q5_1] = V2_QK5_1, + [GGML_TYPE_Q8_0] = V2_QK8_0, + [GGML_TYPE_Q8_1] = V2_QK8_1, + [GGML_TYPE_I8] = 1, + [GGML_TYPE_I16] = 1, + [GGML_TYPE_I32] = 1, +}; + +static const size_t ggjt_v2_type_size[GGML_TYPE_COUNT] = { + [GGML_TYPE_F32] = sizeof(float), + [GGML_TYPE_F16] = sizeof(ggml_fp16_t), + [GGML_TYPE_Q4_0] = sizeof(block_v2_q4_0), + [GGML_TYPE_Q4_1] = sizeof(block_v2_q4_1), + [GGML_TYPE_Q5_0] = sizeof(block_v2_q5_0), + [GGML_TYPE_Q5_1] = sizeof(block_v2_q5_1), + [GGML_TYPE_Q8_0] = sizeof(block_v2_q8_0), + [GGML_TYPE_Q8_1] = sizeof(block_v2_q8_1), + [GGML_TYPE_I8] = sizeof(int8_t), + [GGML_TYPE_I16] = sizeof(int16_t), + [GGML_TYPE_I32] = sizeof(int32_t), +}; + +static const char *const ggjt_v2_type_name[GGML_TYPE_COUNT] = { + [GGML_TYPE_F32] = "f32", + [GGML_TYPE_F16] = "f16", + [GGML_TYPE_Q4_0] = "q4_0", + [GGML_TYPE_Q4_1] = "q4_1", + [GGML_TYPE_Q4_2] = "q4_2", + [GGML_TYPE_Q5_0] = "q5_0", + [GGML_TYPE_Q5_1] = "q5_1", + [GGML_TYPE_Q8_0] = "q8_0", + [GGML_TYPE_Q8_1] = "q8_1", + [GGML_TYPE_I8] = "i8", + [GGML_TYPE_I16] = "i16", + [GGML_TYPE_I32] = "i32", +}; + +static const bool ggjt_v2_is_quantized[GGML_TYPE_COUNT] = { + [GGML_TYPE_F32] = false, + [GGML_TYPE_F16] = false, + [GGML_TYPE_Q4_0] = true, + [GGML_TYPE_Q4_1] = true, + [GGML_TYPE_Q5_0] = true, + [GGML_TYPE_Q5_1] = true, + [GGML_TYPE_Q8_0] = true, + [GGML_TYPE_Q8_1] = true, + [GGML_TYPE_I8] = false, + [GGML_TYPE_I16] = false, + [GGML_TYPE_I32] = false, +}; + +static const quantize_chunk_f *const ggjt_v2_quantize_chunk[GGML_TYPE_COUNT] = { + [GGML_TYPE_Q4_0] = (void *)ggml_quantize_v2_q4_0, + [GGML_TYPE_Q4_1] = (void *)ggml_quantize_v2_q4_1, + [GGML_TYPE_Q5_0] = (void *)ggml_quantize_v2_q5_0, + [GGML_TYPE_Q5_1] = (void *)ggml_quantize_v2_q5_1, + [GGML_TYPE_Q8_0] = (void *)ggml_quantize_v2_q8_0, +}; + +static const quantize_fns_t ggjt_v2_quantize_fns[GGML_TYPE_COUNT] = { + [GGML_TYPE_Q4_0] = { + .dequantize_row_q = dequantize_row_v2_q4_0, + .quantize_row_q = quantize_row_v2_q4_0, + .quantize_row_q_reference = (quantize_row_q_t) quantize_row_v2_q4_0_reference, + .quantize_row_q_dot = quantize_row_v2_q8_0, + .vec_dot_q = ggml_vec_dot_v2_q4_0_q8_0, + .vec_dot_type = GGML_TYPE_Q8_0, + }, + [GGML_TYPE_Q4_1] = { + .dequantize_row_q = dequantize_row_v2_q4_1, + .quantize_row_q = quantize_row_v2_q4_1, + .quantize_row_q_reference = (quantize_row_q_t) quantize_row_v2_q4_1_reference, + .quantize_row_q_dot = quantize_row_v2_q8_1, + .vec_dot_q = ggml_vec_dot_v2_q4_1_q8_1, + .vec_dot_type = GGML_TYPE_Q8_1, + }, + [GGML_TYPE_Q5_0] = { + .dequantize_row_q = dequantize_row_v2_q5_0, + .quantize_row_q = quantize_row_v2_q5_0, + .quantize_row_q_reference = (quantize_row_q_t) quantize_row_v2_q5_0_reference, + .quantize_row_q_dot = quantize_row_v2_q8_0, + .vec_dot_q = ggml_vec_dot_v2_q5_0_q8_0, + .vec_dot_type = GGML_TYPE_Q8_0, + }, + [GGML_TYPE_Q5_1] = { + .dequantize_row_q = dequantize_row_v2_q5_1, + .quantize_row_q = quantize_row_v2_q5_1, + .quantize_row_q_reference = (quantize_row_q_t) quantize_row_v2_q5_1_reference, + .quantize_row_q_dot = quantize_row_v2_q8_1, + .vec_dot_q = ggml_vec_dot_v2_q5_1_q8_1, + .vec_dot_type = GGML_TYPE_Q8_1, + }, + [GGML_TYPE_Q8_0] = { + .dequantize_row_q = dequantize_row_v2_q8_0, + .quantize_row_q = quantize_row_v2_q8_0, + .quantize_row_q_reference = (quantize_row_q_t) quantize_row_v2_q8_0_reference, + .quantize_row_q_dot = quantize_row_v2_q8_0, + .vec_dot_q = ggml_vec_dot_v2_q8_0_q8_0, + .vec_dot_type = GGML_TYPE_Q8_0, + }, + [GGML_TYPE_Q8_1] = { + .dequantize_row_q = NULL, // TODO + .quantize_row_q = quantize_row_v2_q8_1, + .quantize_row_q_reference = (quantize_row_q_t) quantize_row_v2_q8_1_reference, + .quantize_row_q_dot = quantize_row_v2_q8_1, + .vec_dot_q = NULL, // TODO + .vec_dot_type = GGML_TYPE_Q8_1, + }, +}; + +void ggjt_v2(void) { + GGML_BLCK_SIZE = ggjt_v2_blck_size; + GGML_TYPE_SIZE = ggjt_v2_type_size; + GGML_TYPE_NAME = ggjt_v2_type_name; + GGML_IS_QUANTIZED = ggjt_v2_is_quantized; + quantize_fns = ggjt_v2_quantize_fns; + GGML_QUANTIZE_CHUNK = ggjt_v2_quantize_chunk; +} diff --git a/third_party/ggml/ggjt.v2.internal.h b/third_party/ggml/ggjt.v2.internal.h new file mode 100644 index 000000000..6ed4f892b --- /dev/null +++ b/third_party/ggml/ggjt.v2.internal.h @@ -0,0 +1,152 @@ +#ifndef COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V2_INTERNAL_H_ +#define COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V2_INTERNAL_H_ +#include "libc/str/str.h" +#include "third_party/aarch64/arm_neon.h" +#include "third_party/intel/immintrin.internal.h" +#if !(__ASSEMBLER__ + __LINKER__ + 0) +COSMOPOLITAN_C_START_ + +#if __AVX__ || __AVX2__ || __AVX512F__ +// horizontally add 8 floats +static inline float hsum_float_8(const __m256 x) { + __m128 res = _mm256_extractf128_ps(x, 1); + res = _mm_add_ps(res, _mm256_castps256_ps128(x)); + res = _mm_add_ps(res, _mm_movehl_ps(res, res)); + res = _mm_add_ss(res, _mm_movehdup_ps(res)); + return _mm_cvtss_f32(res); +} +#endif + +#if __AVX2__ || __AVX512F__ + +// spread 32 bits to 32 bytes { 0x00, 0xFF } +static inline __m256i bytes_from_bits_32(const uint8_t *x) { + uint32_t x32; + memcpy(&x32, x, sizeof(uint32_t)); + const __m256i shuf_mask = + _mm256_set_epi64x(0x0303030303030303, 0x0202020202020202, + 0x0101010101010101, 0x0000000000000000); + __m256i bytes = _mm256_shuffle_epi8(_mm256_set1_epi32(x32), shuf_mask); + const __m256i bit_mask = _mm256_set1_epi64x(0x7fbfdfeff7fbfdfe); + bytes = _mm256_or_si256(bytes, bit_mask); + return _mm256_cmpeq_epi8(bytes, _mm256_set1_epi64x(-1)); +} + +// add int16_t pairwise and return as float vector +static inline __m256 sum_i16_pairs_float(const __m256i x) { + const __m256i ones = _mm256_set1_epi16(1); + const __m256i summed_pairs = _mm256_madd_epi16(ones, x); + return _mm256_cvtepi32_ps(summed_pairs); +} + +static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, + const __m256i sy) { +#if __AVXVNNI__ + const __m256i zero = _mm256_setzero_si256(); + const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy); + return _mm256_cvtepi32_ps(summed_pairs); +#else + // Perform multiplication and create 16-bit values + const __m256i dot = _mm256_maddubs_epi16(ax, sy); + return sum_i16_pairs_float(dot); +#endif +} + +// multiply int8_t, add results pairwise twice and return as float vector +static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) { +#if __AVXVNNIINT8__ + const __m256i zero = _mm256_setzero_si256(); + const __m256i summed_pairs = _mm256_dpbssd_epi32(zero, x, y); + return _mm256_cvtepi32_ps(summed_pairs); +#else + // Get absolute values of x vectors + const __m256i ax = _mm256_sign_epi8(x, x); + // Sign the values of the y vectors + const __m256i sy = _mm256_sign_epi8(y, x); + return mul_sum_us8_pairs_float(ax, sy); +#endif +} + +static inline __m256i bytes_from_nibbles_32(const uint8_t *rsi) { + const __m128i tmp = _mm_loadu_si128((const __m128i *)rsi); + const __m256i bytes = _mm256_set_m128i(_mm_srli_epi16(tmp, 4), tmp); + const __m256i lowMask = _mm256_set1_epi8(0xF); + return _mm256_and_si256(lowMask, bytes); +} + +#elif defined(__AVX__) + +// spread 32 bits to 32 bytes { 0x00, 0xFF } +static inline __m256i bytes_from_bits_32(const uint8_t *x) { + uint32_t x32; + memcpy(&x32, x, sizeof(uint32_t)); + const __m128i shuf_maskl = + _mm_set_epi64x(0x0101010101010101, 0x0000000000000000); + const __m128i shuf_maskh = + _mm_set_epi64x(0x0303030303030303, 0x0202020202020202); + __m128i bytesl = _mm_shuffle_epi8(_mm_set1_epi32(x32), shuf_maskl); + __m128i bytesh = _mm_shuffle_epi8(_mm_set1_epi32(x32), shuf_maskh); + const __m128i bit_mask = _mm_set1_epi64x(0x7fbfdfeff7fbfdfe); + bytesl = _mm_or_si128(bytesl, bit_mask); + bytesh = _mm_or_si128(bytesh, bit_mask); + bytesl = _mm_cmpeq_epi8(bytesl, _mm_set1_epi64x(-1)); + bytesh = _mm_cmpeq_epi8(bytesh, _mm_set1_epi64x(-1)); + return _mm256_set_m128i(bytesh, bytesl); +} + +// add int16_t pairwise and return as float vector +static inline __m256 sum_i16_pairs_float(const __m128i xh, const __m128i xl) { + const __m128i ones = _mm_set1_epi16(1); + const __m128i summed_pairsl = _mm_madd_epi16(ones, xl); + const __m128i summed_pairsh = _mm_madd_epi16(ones, xh); + const __m256i summed_pairs = _mm256_set_m128i(summed_pairsh, summed_pairsl); + return _mm256_cvtepi32_ps(summed_pairs); +} + +static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, + const __m256i sy) { + const __m128i axl = _mm256_castsi256_si128(ax); + const __m128i axh = _mm256_extractf128_si256(ax, 1); + const __m128i syl = _mm256_castsi256_si128(sy); + const __m128i syh = _mm256_extractf128_si256(sy, 1); + // Perform multiplication and create 16-bit values + const __m128i dotl = _mm_maddubs_epi16(axl, syl); + const __m128i doth = _mm_maddubs_epi16(axh, syh); + return sum_i16_pairs_float(doth, dotl); +} + +// multiply int8_t, add results pairwise twice and return as float vector +static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) { + const __m128i xl = _mm256_castsi256_si128(x); + const __m128i xh = _mm256_extractf128_si256(x, 1); + const __m128i yl = _mm256_castsi256_si128(y); + const __m128i yh = _mm256_extractf128_si256(y, 1); + // Get absolute values of x vectors + const __m128i axl = _mm_sign_epi8(xl, xl); + const __m128i axh = _mm_sign_epi8(xh, xh); + // Sign the values of the y vectors + const __m128i syl = _mm_sign_epi8(yl, xl); + const __m128i syh = _mm_sign_epi8(yh, xh); + // Perform multiplication and create 16-bit values + const __m128i dotl = _mm_maddubs_epi16(axl, syl); + const __m128i doth = _mm_maddubs_epi16(axh, syh); + return sum_i16_pairs_float(doth, dotl); +} + +// Unpack 32 4-bit fields into 32 bytes +// The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval +static inline __m256i bytes_from_nibbles_32(const uint8_t *rsi) { + // Load 16 bytes from memory + __m128i tmpl = _mm_loadu_si128((const __m128i *)rsi); + __m128i tmph = _mm_srli_epi16(tmpl, 4); + const __m128i lowMask = _mm_set1_epi8(0xF); + tmpl = _mm_and_si128(lowMask, tmpl); + tmph = _mm_and_si128(lowMask, tmph); + return _mm256_set_m128i(tmph, tmpl); +} + +#endif /* AVX */ + +COSMOPOLITAN_C_END_ +#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */ +#endif /* COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V2_INTERNAL_H_ */ diff --git a/third_party/ggml/ggjt.v2.q4_0.c b/third_party/ggml/ggjt.v2.q4_0.c new file mode 100644 index 000000000..19f31fc41 --- /dev/null +++ b/third_party/ggml/ggjt.v2.q4_0.c @@ -0,0 +1,396 @@ +/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8 -*-│ +│vi: set net ft=c ts=4 sts=4 sw=4 fenc=utf-8 :vi│ +╚──────────────────────────────────────────────────────────────────────────────╝ +│ │ +│ GGML │ +│ Copyright (c) 2023 Georgi Gerganov │ +│ │ +│ Permission is hereby granted, free of charge, to any person obtaining │ +│ a copy of this software and associated documentation files (the │ +│ "Software"), to deal in the Software without restriction, including │ +│ without limitation the rights to use, copy, modify, merge, publish, │ +│ distribute, sublicense, and/or sell copies of the Software, and to │ +│ permit persons to whom the Software is furnished to do so, subject to │ +│ the following conditions: │ +│ │ +│ The above copyright notice and this permission notice shall be │ +│ included in all copies or substantial portions of the Software. │ +│ │ +│ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, │ +│ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF │ +│ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. │ +│ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY │ +│ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, │ +│ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE │ +│ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. │ +│ │ +╚─────────────────────────────────────────────────────────────────────────────*/ +#include "third_party/ggml/ggjt.v2.q4_0.h" +#include "libc/assert.h" +#include "libc/macros.internal.h" +#include "libc/math.h" +#include "third_party/ggml/ggjt.v2.internal.h" +#include "third_party/ggml/ggjt.v2.q8_0.h" +// clang-format off + +static_assert(sizeof(block_v2_q4_0) == sizeof(float) + V2_QK4_0 / 2, + "wrong q4_0 block size/padding"); + +void dequantize_row_v2_q4_0(const void * restrict x_, float * restrict y, int k) { + const block_v2_q4_0 * restrict x = x_; + static const int qk = V2_QK4_0; + + assert(k % qk == 0); + + const int nb = k / qk; + + for (int i = 0; i < nb; i++) { + const float d = x[i].d; + + for (int j = 0; j < qk/2; ++j) { + const int x0 = (x[i].qs[j] & 0x0F) - 8; + const int x1 = (x[i].qs[j] >> 4) - 8; + + y[i*qk + j + 0 ] = x0*d; + y[i*qk + j + qk/2] = x1*d; + } + } +} + +size_t ggml_quantize_v2_q4_0(const float * src, void * dst, int n, int k, int64_t * hist) { + assert(k % V2_QK4_0 == 0); + const int nb = k / V2_QK4_0; + + for (int b = 0; b < n; b += k) { + block_v2_q4_0 * restrict y = (block_v2_q4_0 *) dst + b/V2_QK4_0; + + quantize_row_v2_q4_0_reference(src + b, y, k); + + for (int i = 0; i < nb; i++) { + for (int j = 0; j < V2_QK4_0; j += 2) { + const uint8_t vi0 = y[i].qs[j/2] & 0x0F; + const uint8_t vi1 = y[i].qs[j/2] >> 4; + + hist[vi0]++; + hist[vi1]++; + } + } + } + + return (n/V2_QK4_0*sizeof(block_v2_q4_0)); +} + +void quantize_row_v2_q4_0(const float * restrict x, void * restrict y, int k) { + quantize_row_v2_q4_0_reference(x, y, k); +} + +// reference implementation for deterministic creation of model files +void quantize_row_v2_q4_0_reference(const float * restrict x, block_v2_q4_0 * restrict y, int k) { + static const int qk = V2_QK4_0; + + assert(k % qk == 0); + + const int nb = k / qk; + + for (int i = 0; i < nb; i++) { + float amax = 0.0f; // absolute max + float max = 0.0f; + + for (int j = 0; j < qk; j++) { + const float v = x[i*qk + j]; + if (amax < fabsf(v)) { + amax = fabsf(v); + max = v; + } + } + + const float d = max / -8; + const float id = d ? 1.0f/d : 0.0f; + + y[i].d = d; + + for (int j = 0; j < qk/2; ++j) { + const float x0 = x[i*qk + 0 + j]*id; + const float x1 = x[i*qk + qk/2 + j]*id; + + const uint8_t xi0 = MIN(15, (int8_t)(x0 + 8.5f)); + const uint8_t xi1 = MIN(15, (int8_t)(x1 + 8.5f)); + + y[i].qs[j] = xi0; + y[i].qs[j] |= xi1 << 4; + } + } +} + +void ggml_vec_dot_v2_q4_0_q8_0(const int n, + float * restrict s, + const void * restrict vx, + const void * restrict vy) { + const int qk = V2_QK8_0; + const int nb = n / qk; + + assert(n % qk == 0); + assert(nb % 2 == 0); + + const block_v2_q4_0 * restrict x = vx; + const block_v2_q8_0 * restrict y = vy; + +#if defined(__ARM_NEON) + float32x4_t sumv0 = vdupq_n_f32(0.0f); + float32x4_t sumv1 = vdupq_n_f32(0.0f); + + for (int i = 0; i < nb; i += 2) { + const block_v2_q4_0 * restrict x0 = &x[i + 0]; + const block_v2_q4_0 * restrict x1 = &x[i + 1]; + const block_v2_q8_0 * restrict y0 = &y[i + 0]; + const block_v2_q8_0 * restrict y1 = &y[i + 1]; + + const uint8x16_t m4b = vdupq_n_u8(0x0F); + const int8x16_t s8b = vdupq_n_s8(0x8); + + const uint8x16_t v0_0 = vld1q_u8(x0->qs); + const uint8x16_t v0_1 = vld1q_u8(x1->qs); + + // 4-bit -> 8-bit + const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); + const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); + const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); + const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); + + // sub 8 + const int8x16_t v0_0ls = vsubq_s8(v0_0l, s8b); + const int8x16_t v0_0hs = vsubq_s8(v0_0h, s8b); + const int8x16_t v0_1ls = vsubq_s8(v0_1l, s8b); + const int8x16_t v0_1hs = vsubq_s8(v0_1h, s8b); + + // load y + const int8x16_t v1_0l = vld1q_s8(y0->qs); + const int8x16_t v1_0h = vld1q_s8(y0->qs + 16); + const int8x16_t v1_1l = vld1q_s8(y1->qs); + const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); + +#if defined(__ARM_FEATURE_DOTPROD) + // dot product into int32x4_t + const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0l), v0_0hs, v1_0h); + const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1l), v0_1hs, v1_1h); + + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), x0->d*y0->d); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), x1->d*y1->d); +#else + const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0ls), vget_low_s8 (v1_0l)); + const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0l)); + const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hs), vget_low_s8 (v1_0h)); + const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hs), vget_high_s8(v1_0h)); + + const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1ls), vget_low_s8 (v1_1l)); + const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1ls), vget_high_s8(v1_1l)); + const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hs), vget_low_s8 (v1_1h)); + const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hs), vget_high_s8(v1_1h)); + + const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h)); + const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h)); + const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h)); + const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h)); + + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), x0->d*y0->d); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), x1->d*y1->d); +#endif + } + + *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1); +#elif defined(__AVX2__) + // Initialize accumulator with zeros + __m256 acc = _mm256_setzero_ps(); + + // Main loop + for (int i = 0; i < nb; ++i) { + /* Compute combined scale for the block */ + const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) ); + + __m256i bx = bytes_from_nibbles_32(x[i].qs); + + // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval. + const __m256i off = _mm256_set1_epi8( 8 ); + bx = _mm256_sub_epi8( bx, off ); + + __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs); + + const __m256 q = mul_sum_i8_pairs_float(bx, by); + + /* Multiply q with scale and accumulate */ + acc = _mm256_fmadd_ps( d, q, acc ); + } + + *s = hsum_float_8(acc); +#elif defined(__AVX__) + // Initialize accumulator with zeros + __m256 acc = _mm256_setzero_ps(); + + // Main loop + for (int i = 0; i < nb; ++i) { + // Compute combined scale for the block + const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) ); + + const __m128i lowMask = _mm_set1_epi8(0xF); + const __m128i off = _mm_set1_epi8(8); + + const __m128i tmp = _mm_loadu_si128((const __m128i *)x[i].qs); + + __m128i bx = _mm_and_si128(lowMask, tmp); + __m128i by = _mm_loadu_si128((const __m128i *)y[i].qs); + bx = _mm_sub_epi8(bx, off); + const __m128i i32_0 = mul_sum_i8_pairs(bx, by); + + bx = _mm_and_si128(lowMask, _mm_srli_epi64(tmp, 4)); + by = _mm_loadu_si128((const __m128i *)(y[i].qs + 16)); + bx = _mm_sub_epi8(bx, off); + const __m128i i32_1 = mul_sum_i8_pairs(bx, by); + + // Convert int32_t to float + __m256 p = _mm256_cvtepi32_ps(_mm256_set_m128i(i32_0, i32_1)); + + // Apply the scale, and accumulate + acc = _mm256_add_ps(_mm256_mul_ps( d, p ), acc); + } + + *s = hsum_float_8(acc); +#elif defined(__SSSE3__) + // set constants + const __m128i lowMask = _mm_set1_epi8(0xF); + const __m128i off = _mm_set1_epi8(8); + + // Initialize accumulator with zeros + __m128 acc_0 = _mm_setzero_ps(); + __m128 acc_1 = _mm_setzero_ps(); + __m128 acc_2 = _mm_setzero_ps(); + __m128 acc_3 = _mm_setzero_ps(); + + // First round without accumulation + { + _mm_prefetch(&x[0] + sizeof(block_v2_q4_0), _MM_HINT_T0); + _mm_prefetch(&y[0] + sizeof(block_v2_q8_0), _MM_HINT_T0); + + // Compute combined scale for the block 0 and 1 + const __m128 d_0_1 = _mm_mul_ps( _mm_set1_ps( x[0].d ), _mm_set1_ps( y[0].d ) ); + + const __m128i tmp_0_1 = _mm_loadu_si128((const __m128i *)x[0].qs); + + __m128i bx_0 = _mm_and_si128(lowMask, tmp_0_1); + __m128i by_0 = _mm_loadu_si128((const __m128i *)y[0].qs); + bx_0 = _mm_sub_epi8(bx_0, off); + const __m128i i32_0 = mul_sum_i8_pairs(bx_0, by_0); + + __m128i bx_1 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_0_1, 4)); + __m128i by_1 = _mm_loadu_si128((const __m128i *)(y[0].qs + 16)); + bx_1 = _mm_sub_epi8(bx_1, off); + const __m128i i32_1 = mul_sum_i8_pairs(bx_1, by_1); + + _mm_prefetch(&x[1] + sizeof(block_v2_q4_0), _MM_HINT_T0); + _mm_prefetch(&y[1] + sizeof(block_v2_q8_0), _MM_HINT_T0); + + // Compute combined scale for the block 2 and 3 + const __m128 d_2_3 = _mm_mul_ps( _mm_set1_ps( x[1].d ), _mm_set1_ps( y[1].d ) ); + + const __m128i tmp_2_3 = _mm_loadu_si128((const __m128i *)x[1].qs); + + __m128i bx_2 = _mm_and_si128(lowMask, tmp_2_3); + __m128i by_2 = _mm_loadu_si128((const __m128i *)y[1].qs); + bx_2 = _mm_sub_epi8(bx_2, off); + const __m128i i32_2 = mul_sum_i8_pairs(bx_2, by_2); + + __m128i bx_3 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_2_3, 4)); + __m128i by_3 = _mm_loadu_si128((const __m128i *)(y[1].qs + 16)); + bx_3 = _mm_sub_epi8(bx_3, off); + const __m128i i32_3 = mul_sum_i8_pairs(bx_3, by_3); + + // Convert int32_t to float + __m128 p0 = _mm_cvtepi32_ps(i32_0); + __m128 p1 = _mm_cvtepi32_ps(i32_1); + __m128 p2 = _mm_cvtepi32_ps(i32_2); + __m128 p3 = _mm_cvtepi32_ps(i32_3); + + // Apply the scale + acc_0 = _mm_mul_ps( d_0_1, p0 ); + acc_1 = _mm_mul_ps( d_0_1, p1 ); + acc_2 = _mm_mul_ps( d_2_3, p2 ); + acc_3 = _mm_mul_ps( d_2_3, p3 ); + } + + // Main loop + for (int i = 2; i < nb; i+=2) { + _mm_prefetch(&x[i] + sizeof(block_v2_q4_0), _MM_HINT_T0); + _mm_prefetch(&y[i] + sizeof(block_v2_q8_0), _MM_HINT_T0); + + // Compute combined scale for the block 0 and 1 + const __m128 d_0_1 = _mm_mul_ps( _mm_set1_ps( x[i].d ), _mm_set1_ps( y[i].d ) ); + + const __m128i tmp_0_1 = _mm_loadu_si128((const __m128i *)x[i].qs); + + __m128i bx_0 = _mm_and_si128(lowMask, tmp_0_1); + __m128i by_0 = _mm_loadu_si128((const __m128i *)y[i].qs); + bx_0 = _mm_sub_epi8(bx_0, off); + const __m128i i32_0 = mul_sum_i8_pairs(bx_0, by_0); + + __m128i bx_1 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_0_1, 4)); + __m128i by_1 = _mm_loadu_si128((const __m128i *)(y[i].qs + 16)); + bx_1 = _mm_sub_epi8(bx_1, off); + const __m128i i32_1 = mul_sum_i8_pairs(bx_1, by_1); + + _mm_prefetch(&x[i] + 2 * sizeof(block_v2_q4_0), _MM_HINT_T0); + _mm_prefetch(&y[i] + 2 * sizeof(block_v2_q8_0), _MM_HINT_T0); + + // Compute combined scale for the block 2 and 3 + const __m128 d_2_3 = _mm_mul_ps( _mm_set1_ps( x[i + 1].d ), _mm_set1_ps( y[i + 1].d ) ); + + const __m128i tmp_2_3 = _mm_loadu_si128((const __m128i *)x[i + 1].qs); + + __m128i bx_2 = _mm_and_si128(lowMask, tmp_2_3); + __m128i by_2 = _mm_loadu_si128((const __m128i *)y[i + 1].qs); + bx_2 = _mm_sub_epi8(bx_2, off); + const __m128i i32_2 = mul_sum_i8_pairs(bx_2, by_2); + + __m128i bx_3 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_2_3, 4)); + __m128i by_3 = _mm_loadu_si128((const __m128i *)(y[i + 1].qs + 16)); + bx_3 = _mm_sub_epi8(bx_3, off); + const __m128i i32_3 = mul_sum_i8_pairs(bx_3, by_3); + + // Convert int32_t to float + __m128 p0 = _mm_cvtepi32_ps(i32_0); + __m128 p1 = _mm_cvtepi32_ps(i32_1); + __m128 p2 = _mm_cvtepi32_ps(i32_2); + __m128 p3 = _mm_cvtepi32_ps(i32_3); + + // Apply the scale + __m128 p0_d = _mm_mul_ps( d_0_1, p0 ); + __m128 p1_d = _mm_mul_ps( d_0_1, p1 ); + __m128 p2_d = _mm_mul_ps( d_2_3, p2 ); + __m128 p3_d = _mm_mul_ps( d_2_3, p3 ); + + // Acummulate + acc_0 = _mm_add_ps(p0_d, acc_0); + acc_1 = _mm_add_ps(p1_d, acc_1); + acc_2 = _mm_add_ps(p2_d, acc_2); + acc_3 = _mm_add_ps(p3_d, acc_3); + } + + *s = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3); +#else + // scalar + float sumf = 0.0; + + for (int i = 0; i < nb; i++) { + int sumi = 0; + + for (int j = 0; j < qk/2; ++j) { + const int v0 = (x[i].qs[j] & 0x0F) - 8; + const int v1 = (x[i].qs[j] >> 4) - 8; + + sumi += (v0 * y[i].qs[j]) + (v1 * y[i].qs[j + qk/2]); + } + + sumf += (x[i].d*y[i].d)*sumi; + } + + *s = sumf; +#endif +} diff --git a/third_party/ggml/ggjt.v2.q4_0.h b/third_party/ggml/ggjt.v2.q4_0.h new file mode 100644 index 000000000..48207bb07 --- /dev/null +++ b/third_party/ggml/ggjt.v2.q4_0.h @@ -0,0 +1,22 @@ +#ifndef COSMOPOLITAN_THIRD_PARTY_GGML_V2_Q4_0_H_ +#define COSMOPOLITAN_THIRD_PARTY_GGML_V2_Q4_0_H_ +#if !(__ASSEMBLER__ + __LINKER__ + 0) +COSMOPOLITAN_C_START_ + +#define V2_QK4_0 32 +typedef struct { + float d; // delta + uint8_t qs[V2_QK4_0 / 2]; // nibbles / quants +} block_v2_q4_0; + +void dequantize_row_v2_q4_0(const void* restrict, float* restrict, int); +size_t ggml_quantize_v2_q4_0(const float*, void*, int, int, int64_t*); +void quantize_row_v2_q4_0(const float* restrict, void* restrict, int); +void quantize_row_v2_q4_0_reference(const float* restrict, + block_v2_q4_0* restrict, int); +void ggml_vec_dot_v2_q4_0_q8_0(const int, float* restrict, const void* restrict, + const void* restrict); + +COSMOPOLITAN_C_END_ +#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */ +#endif /* COSMOPOLITAN_THIRD_PARTY_GGML_V2_Q4_0_H_ */ diff --git a/third_party/ggml/ggjt.v2.q4_1.c b/third_party/ggml/ggjt.v2.q4_1.c new file mode 100644 index 000000000..ddc53fab1 --- /dev/null +++ b/third_party/ggml/ggjt.v2.q4_1.c @@ -0,0 +1,253 @@ +/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8 -*-│ +│vi: set net ft=c ts=4 sts=4 sw=4 fenc=utf-8 :vi│ +╚──────────────────────────────────────────────────────────────────────────────╝ +│ │ +│ GGML │ +│ Copyright (c) 2023 Georgi Gerganov │ +│ │ +│ Permission is hereby granted, free of charge, to any person obtaining │ +│ a copy of this software and associated documentation files (the │ +│ "Software"), to deal in the Software without restriction, including │ +│ without limitation the rights to use, copy, modify, merge, publish, │ +│ distribute, sublicense, and/or sell copies of the Software, and to │ +│ permit persons to whom the Software is furnished to do so, subject to │ +│ the following conditions: │ +│ │ +│ The above copyright notice and this permission notice shall be │ +│ included in all copies or substantial portions of the Software. │ +│ │ +│ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, │ +│ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF │ +│ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. │ +│ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY │ +│ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, │ +│ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE │ +│ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. │ +│ │ +╚─────────────────────────────────────────────────────────────────────────────*/ +#include "third_party/ggml/ggjt.v2.q4_1.h" +#include "libc/assert.h" +#include "libc/macros.internal.h" +#include "libc/math.h" +#include "third_party/ggml/ggjt.v2.internal.h" +#include "third_party/ggml/ggjt.v2.q8_1.h" +// clang-format off + +static_assert(sizeof(block_v2_q4_1) == 2 * sizeof(float) + V2_QK4_1 / 2, + "wrong q4_1 block size/padding"); + +void dequantize_row_v2_q4_1(const void * restrict x_, float * restrict y, int k) { + const block_v2_q4_1 * restrict x = x_; + static const int qk = V2_QK4_1; + + assert(k % qk == 0); + + const int nb = k / qk; + + for (int i = 0; i < nb; i++) { + const float d = x[i].d; + const float m = x[i].m; + + for (int j = 0; j < qk/2; ++j) { + const int x0 = (x[i].qs[j] & 0x0F); + const int x1 = (x[i].qs[j] >> 4); + + y[i*qk + j + 0 ] = x0*d + m; + y[i*qk + j + qk/2] = x1*d + m; + } + } +} + +size_t ggml_quantize_v2_q4_1(const float * src, void * dst, int n, int k, int64_t * hist) { + assert(k % V2_QK4_1 == 0); + const int nb = k / V2_QK4_1; + + for (int b = 0; b < n; b += k) { + block_v2_q4_1 * restrict y = (block_v2_q4_1 *) dst + b/V2_QK4_1; + + quantize_row_v2_q4_1_reference(src + b, y, k); + + for (int i = 0; i < nb; i++) { + for (int j = 0; j < V2_QK4_1; j += 2) { + const uint8_t vi0 = y[i].qs[j/2] & 0x0F; + const uint8_t vi1 = y[i].qs[j/2] >> 4; + + hist[vi0]++; + hist[vi1]++; + } + } + } + + return (n/V2_QK4_1*sizeof(block_v2_q4_1)); +} + +void quantize_row_v2_q4_1(const float * restrict x, void * restrict y, int k) { + quantize_row_v2_q4_1_reference(x, y, k); +} + +void ggml_vec_dot_v2_q4_1_q8_1(const int n, + float * restrict s, + const void * restrict vx, + const void * restrict vy) { + const int qk = V2_QK8_1; + const int nb = n / qk; + + assert(n % qk == 0); + assert(nb % 2 == 0); + + const block_v2_q4_1 * restrict x = vx; + const block_v2_q8_1 * restrict y = vy; + + // TODO: add WASM SIMD +#if defined(__ARM_NEON) + float32x4_t sumv0 = vdupq_n_f32(0.0f); + float32x4_t sumv1 = vdupq_n_f32(0.0f); + + float summs = 0; + + for (int i = 0; i < nb; i += 2) { + const block_v2_q4_1 * restrict x0 = &x[i + 0]; + const block_v2_q4_1 * restrict x1 = &x[i + 1]; + const block_v2_q8_1 * restrict y0 = &y[i + 0]; + const block_v2_q8_1 * restrict y1 = &y[i + 1]; + + summs += x0->m * y0->s + x1->m * y1->s; + + const uint8x16_t m4b = vdupq_n_u8(0x0F); + + const uint8x16_t v0_0 = vld1q_u8(x0->qs); + const uint8x16_t v0_1 = vld1q_u8(x1->qs); + + // 4-bit -> 8-bit + const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); + const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); + const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); + const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); + + // load y + const int8x16_t v1_0l = vld1q_s8(y0->qs); + const int8x16_t v1_0h = vld1q_s8(y0->qs + 16); + const int8x16_t v1_1l = vld1q_s8(y1->qs); + const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); + +#if defined(__ARM_FEATURE_DOTPROD) + // dot product into int32x4_t + const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l), v0_0h, v1_0h); + const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l), v0_1h, v1_1h); + + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), x0->d*y0->d); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), x1->d*y1->d); +#else + const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0l), vget_low_s8 (v1_0l)); + const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0l), vget_high_s8(v1_0l)); + const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0h), vget_low_s8 (v1_0h)); + const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0h), vget_high_s8(v1_0h)); + + const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1l), vget_low_s8 (v1_1l)); + const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1l), vget_high_s8(v1_1l)); + const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1h), vget_low_s8 (v1_1h)); + const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1h), vget_high_s8(v1_1h)); + + const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h)); + const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h)); + const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h)); + const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h)); + + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), x0->d*y0->d); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), x1->d*y1->d); +#endif + } + + *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs; +#elif defined(__AVX2__) || defined(__AVX__) + // Initialize accumulator with zeros + __m256 acc = _mm256_setzero_ps(); + + float summs = 0; + + // Main loop + for (int i = 0; i < nb; ++i) { + const float * d0 = &x[i].d; + const float * d1 = &y[i].d; + + summs += x[i].m * y[i].s; + + const __m256 d0v = _mm256_broadcast_ss( d0 ); + const __m256 d1v = _mm256_broadcast_ss( d1 ); + + // Compute combined scales + const __m256 d0d1 = _mm256_mul_ps( d0v, d1v ); + + // Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes + const __m256i bx = bytes_from_nibbles_32(x[i].qs); + const __m256i by = _mm256_loadu_si256( (const __m256i *)y[i].qs ); + + const __m256 xy = mul_sum_us8_pairs_float(bx, by); + + // Accumulate d0*d1*x*y +#if defined(__AVX2__) + acc = _mm256_fmadd_ps( d0d1, xy, acc ); +#else + acc = _mm256_add_ps( _mm256_mul_ps( d0d1, xy ), acc ); +#endif + } + + *s = hsum_float_8(acc) + summs; +#else + // scalar + float sumf = 0.0; + + for (int i = 0; i < nb; i++) { + int sumi = 0; + + for (int j = 0; j < qk/2; ++j) { + const int v0 = (x[i].qs[j] & 0x0F); + const int v1 = (x[i].qs[j] >> 4); + + sumi += (v0 * y[i].qs[j]) + (v1 * y[i].qs[j + qk/2]); + } + + sumf += (x[i].d*y[i].d)*sumi + x[i].m*y[i].s; + } + + *s = sumf; +#endif +} + +void quantize_row_v2_q4_1_reference(const float * restrict x, void * restrict y_, int k) { + block_v2_q4_1 * restrict y = y_; + const int qk = V2_QK4_1; + + assert(k % qk == 0); + + const int nb = k / qk; + + for (int i = 0; i < nb; i++) { + float min = FLT_MAX; + float max = -FLT_MAX; + + for (int j = 0; j < qk; j++) { + const float v = x[i*qk + j]; + + if (v < min) min = v; + if (v > max) max = v; + } + + const float d = (max - min) / ((1 << 4) - 1); + const float id = d ? 1.0f/d : 0.0f; + + y[i].d = d; + y[i].m = min; + + for (int j = 0; j < qk/2; ++j) { + const float x0 = (x[i*qk + 0 + j] - min)*id; + const float x1 = (x[i*qk + qk/2 + j] - min)*id; + + const uint8_t xi0 = MIN(15, (int8_t)(x0 + 0.5f)); + const uint8_t xi1 = MIN(15, (int8_t)(x1 + 0.5f)); + + y[i].qs[j] = xi0; + y[i].qs[j] |= xi1 << 4; + } + } +} diff --git a/third_party/ggml/ggjt.v2.q4_1.h b/third_party/ggml/ggjt.v2.q4_1.h new file mode 100644 index 000000000..56a984143 --- /dev/null +++ b/third_party/ggml/ggjt.v2.q4_1.h @@ -0,0 +1,22 @@ +#ifndef COSMOPOLITAN_THIRD_PARTY_GGML_V2_Q4_1_H_ +#define COSMOPOLITAN_THIRD_PARTY_GGML_V2_Q4_1_H_ +#if !(__ASSEMBLER__ + __LINKER__ + 0) +COSMOPOLITAN_C_START_ + +#define V2_QK4_1 32 +typedef struct { + float d; // delta + float m; // min + uint8_t qs[V2_QK4_1 / 2]; // nibbles / quants +} block_v2_q4_1; + +void dequantize_row_v2_q4_1(const void* restrict, float* restrict, int); +size_t ggml_quantize_v2_q4_1(const float*, void*, int, int, int64_t*); +void quantize_row_v2_q4_1(const float* restrict, void* restrict, int); +void ggml_vec_dot_v2_q4_1_q8_1(const int, float* restrict, const void* restrict, + const void* restrict); +void quantize_row_v2_q4_1_reference(const float* restrict, void* restrict, int); + +COSMOPOLITAN_C_END_ +#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */ +#endif /* COSMOPOLITAN_THIRD_PARTY_GGML_V2_Q4_1_H_ */ diff --git a/third_party/ggml/ggjt.v2.q5_0.c b/third_party/ggml/ggjt.v2.q5_0.c new file mode 100644 index 000000000..4d80366f2 --- /dev/null +++ b/third_party/ggml/ggjt.v2.q5_0.c @@ -0,0 +1,393 @@ +/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8 -*-│ +│vi: set net ft=c ts=4 sts=4 sw=4 fenc=utf-8 :vi│ +╚──────────────────────────────────────────────────────────────────────────────╝ +│ │ +│ GGML │ +│ Copyright (c) 2023 Georgi Gerganov │ +│ │ +│ Permission is hereby granted, free of charge, to any person obtaining │ +│ a copy of this software and associated documentation files (the │ +│ "Software"), to deal in the Software without restriction, including │ +│ without limitation the rights to use, copy, modify, merge, publish, │ +│ distribute, sublicense, and/or sell copies of the Software, and to │ +│ permit persons to whom the Software is furnished to do so, subject to │ +│ the following conditions: │ +│ │ +│ The above copyright notice and this permission notice shall be │ +│ included in all copies or substantial portions of the Software. │ +│ │ +│ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, │ +│ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF │ +│ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. │ +│ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY │ +│ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, │ +│ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE │ +│ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. │ +│ │ +╚─────────────────────────────────────────────────────────────────────────────*/ +#include "third_party/ggml/ggjt.v2.q5_0.h" +#include "libc/assert.h" +#include "libc/macros.internal.h" +#include "libc/math.h" +#include "libc/str/str.h" +#include "third_party/ggml/fp16.h" +#include "third_party/ggml/fp16.internal.h" +#include "third_party/ggml/ggjt.v2.internal.h" +#include "third_party/ggml/ggjt.v2.q8_0.h" +// clang-format off + +static_assert(sizeof(block_v2_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + V2_QK5_0 / 2, + "wrong q5_0 block size/padding"); + +void dequantize_row_v2_q5_0(const void * restrict x_, float * restrict y, int k) { + const block_v2_q5_0 * restrict x = x_; + static const int qk = V2_QK5_0; + + assert(k % qk == 0); + + const int nb = k / qk; + + for (int i = 0; i < nb; i++) { + const float d = GGML_FP16_TO_FP32(x[i].d); + + uint32_t qh; + memcpy(&qh, x[i].qh, sizeof(qh)); + + for (int j = 0; j < qk/2; ++j) { + const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; + const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10; + + const int32_t x0 = ((x[i].qs[j] & 0x0F) | xh_0) - 16; + const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16; + + y[i*qk + j + 0 ] = x0*d; + y[i*qk + j + qk/2] = x1*d; + } + } +} + +void quantize_row_v2_q5_0(const float * restrict x, void * restrict y, int k) { + quantize_row_v2_q5_0_reference(x, y, k); +} + +size_t ggml_quantize_v2_q5_0(const float * src, void * dst, int n, int k, int64_t * hist) { + assert(k % V2_QK5_0 == 0); + const int nb = k / V2_QK5_0; + + for (int b = 0; b < n; b += k) { + block_v2_q5_0 * restrict y = (block_v2_q5_0 *)dst + b/V2_QK5_0; + + quantize_row_v2_q5_0_reference(src + b, y, k); + + for (int i = 0; i < nb; i++) { + uint32_t qh; + memcpy(&qh, &y[i].qh, sizeof(qh)); + + for (int j = 0; j < V2_QK5_0; j += 2) { + const uint8_t vh0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; + const uint8_t vh1 = ((qh & (1u << (j + 16))) >> (j + 12)); + + // cast to 16 bins + const uint8_t vi0 = ((y[i].qs[j/2] & 0x0F) | vh0) / 2; + const uint8_t vi1 = ((y[i].qs[j/2] >> 4) | vh1) / 2; + + hist[vi0]++; + hist[vi1]++; + } + } + } + + return (n/V2_QK5_0*sizeof(block_v2_q5_0)); +} + +void ggml_vec_dot_v2_q5_0_q8_0(const int n, + float * restrict s, + const void * restrict vx, + const void * restrict vy) { + const int qk = V2_QK8_0; + const int nb = n / qk; + + assert(n % qk == 0); + assert(nb % 2 == 0); + assert(qk == V2_QK5_0); + + const block_v2_q5_0 * restrict x = vx; + const block_v2_q8_0 * restrict y = vy; + +#if defined(__ARM_NEON) + float32x4_t sumv0 = vdupq_n_f32(0.0f); + float32x4_t sumv1 = vdupq_n_f32(0.0f); + + uint32_t qh0; + uint32_t qh1; + + uint64_t tmp0[4]; + uint64_t tmp1[4]; + + for (int i = 0; i < nb; i += 2) { + const block_v2_q5_0 * restrict x0 = &x[i]; + const block_v2_q5_0 * restrict x1 = &x[i + 1]; + const block_v2_q8_0 * restrict y0 = &y[i]; + const block_v2_q8_0 * restrict y1 = &y[i + 1]; + + const uint8x16_t m4b = vdupq_n_u8(0x0F); + + // extract the 5th bit via lookup table ((!b) << 4) + memcpy(&qh0, x0->qh, sizeof(qh0)); + memcpy(&qh1, x1->qh, sizeof(qh1)); + + tmp0[0] = table_b2b_1[(qh0 >> 0) & 0xFF]; + tmp0[1] = table_b2b_1[(qh0 >> 8) & 0xFF]; + tmp0[2] = table_b2b_1[(qh0 >> 16) & 0xFF]; + tmp0[3] = table_b2b_1[(qh0 >> 24) ]; + + tmp1[0] = table_b2b_1[(qh1 >> 0) & 0xFF]; + tmp1[1] = table_b2b_1[(qh1 >> 8) & 0xFF]; + tmp1[2] = table_b2b_1[(qh1 >> 16) & 0xFF]; + tmp1[3] = table_b2b_1[(qh1 >> 24) ]; + + const int8x16_t qhl0 = vld1q_s8((const int8_t *)(tmp0 + 0)); + const int8x16_t qhh0 = vld1q_s8((const int8_t *)(tmp0 + 2)); + const int8x16_t qhl1 = vld1q_s8((const int8_t *)(tmp1 + 0)); + const int8x16_t qhh1 = vld1q_s8((const int8_t *)(tmp1 + 2)); + + const uint8x16_t v0_0 = vld1q_u8(x0->qs); + const uint8x16_t v0_1 = vld1q_u8(x1->qs); + + // 4-bit -> 8-bit + int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); + int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); + int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); + int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); + + // add high bit and sub 16 (equivalent to sub 0x10 when bit is zero) + const int8x16_t v0_0lf = vsubq_s8(v0_0l, qhl0); + const int8x16_t v0_0hf = vsubq_s8(v0_0h, qhh0); + const int8x16_t v0_1lf = vsubq_s8(v0_1l, qhl1); + const int8x16_t v0_1hf = vsubq_s8(v0_1h, qhh1); + + // load y + const int8x16_t v1_0l = vld1q_s8(y0->qs); + const int8x16_t v1_0h = vld1q_s8(y0->qs + 16); + const int8x16_t v1_1l = vld1q_s8(y1->qs); + const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); + + const float x0d = GGML_FP16_TO_FP32(x0->d); + const float x1d = GGML_FP16_TO_FP32(x1->d); + +#if defined(__ARM_FEATURE_DOTPROD) + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32( + vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l), + vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), x0d*y0->d); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32( + vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l), + vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), x1d*y1->d); +#else + const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lf), vget_low_s8 (v1_0l)); + const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lf), vget_high_s8(v1_0l)); + const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hf), vget_low_s8 (v1_0h)); + const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hf), vget_high_s8(v1_0h)); + + const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1lf), vget_low_s8 (v1_1l)); + const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1lf), vget_high_s8(v1_1l)); + const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hf), vget_low_s8 (v1_1h)); + const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hf), vget_high_s8(v1_1h)); + + const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h)); + const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h)); + const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h)); + const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h)); + + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), x0d*y0->d); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), x1d*y1->d); +#endif + } + + *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1); +#elif defined(__wasm_simd128__) + v128_t sumv = wasm_f32x4_splat(0.0f); + + uint32_t qh; + uint64_t tmp[4]; + + // TODO: check if unrolling this is better + for (int i = 0; i < nb; ++i) { + const block_v2_q5_0 * restrict x0 = &x[i]; + const block_v2_q8_0 * restrict y0 = &y[i]; + + const v128_t m4b = wasm_i8x16_splat(0x0F); + const v128_t s16b = wasm_i8x16_splat(0x10); + + // extract the 5th bit + memcpy(&qh, x0->qh, sizeof(qh)); + + tmp[0] = table_b2b_1[(qh >> 0) & 0xFF]; + tmp[1] = table_b2b_1[(qh >> 8) & 0xFF]; + tmp[2] = table_b2b_1[(qh >> 16) & 0xFF]; + tmp[3] = table_b2b_1[(qh >> 24) ]; + + const v128_t qhl = wasm_v128_load(tmp + 0); + const v128_t qhh = wasm_v128_load(tmp + 2); + + const v128_t v0 = wasm_v128_load(x0->qs); + + // 4-bit -> 8-bit + const v128_t v0l = wasm_v128_and (v0, m4b); + const v128_t v0h = wasm_u8x16_shr(v0, 4); + + // add high bit and sub 16 (equivalent to sub 0x10 when bit is zero) + const v128_t v0lf = wasm_i8x16_sub(v0l, qhl); + const v128_t v0hf = wasm_i8x16_sub(v0h, qhh); + + // load y + const v128_t v1l = wasm_v128_load(y0->qs); + const v128_t v1h = wasm_v128_load(y0->qs + 16); + + // int8x16 -> int16x8 + const v128_t v0lfl = wasm_i16x8_extend_low_i8x16 (v0lf); + const v128_t v0lfh = wasm_i16x8_extend_high_i8x16(v0lf); + const v128_t v0hfl = wasm_i16x8_extend_low_i8x16 (v0hf); + const v128_t v0hfh = wasm_i16x8_extend_high_i8x16(v0hf); + + const v128_t v1ll = wasm_i16x8_extend_low_i8x16 (v1l); + const v128_t v1lh = wasm_i16x8_extend_high_i8x16(v1l); + const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h); + const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h); + + const float x0d = GGML_FP16_TO_FP32(x0->d); + + // dot product + sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4( + wasm_i32x4_add( + wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll), + wasm_i32x4_dot_i16x8(v0lfh, v1lh)), + wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl), + wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), wasm_f32x4_splat(x0d*y0->d))); + } + + *s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) + + wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3); +#elif defined(__AVX2__) + // Initialize accumulator with zeros + __m256 acc = _mm256_setzero_ps(); + + // Main loop + for (int i = 0; i < nb; i++) { + /* Compute combined scale for the block */ + const __m256 d = _mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d)), _mm256_broadcast_ss(&y[i].d)); + + __m256i bx = bytes_from_nibbles_32(x[i].qs); + __m256i bxhi = bytes_from_bits_32(x[i].qh); + bxhi = _mm256_andnot_si256(bxhi, _mm256_set1_epi8((char)0xF0)); + bx = _mm256_or_si256(bx, bxhi); + + __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs); + + const __m256 q = mul_sum_i8_pairs_float(bx, by); + + /* Multiply q with scale and accumulate */ + acc = _mm256_fmadd_ps(d, q, acc); + } + + *s = hsum_float_8(acc); +#elif defined(__AVX__) + // Initialize accumulator with zeros + __m256 acc = _mm256_setzero_ps(); + __m128i mask = _mm_set1_epi8((char)0xF0); + + // Main loop + for (int i = 0; i < nb; i++) { + /* Compute combined scale for the block */ + const __m256 d = _mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d)), _mm256_broadcast_ss(&y[i].d)); + + __m256i bx = bytes_from_nibbles_32(x[i].qs); + const __m256i bxhi = bytes_from_bits_32(x[i].qh); + __m128i bxhil = _mm256_castsi256_si128(bxhi); + __m128i bxhih = _mm256_extractf128_si256(bxhi, 1); + bxhil = _mm_andnot_si128(bxhil, mask); + bxhih = _mm_andnot_si128(bxhih, mask); + __m128i bxl = _mm256_castsi256_si128(bx); + __m128i bxh = _mm256_extractf128_si256(bx, 1); + bxl = _mm_or_si128(bxl, bxhil); + bxh = _mm_or_si128(bxh, bxhih); + bx = _mm256_set_m128i(bxh, bxl); + + const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs); + + const __m256 q = mul_sum_i8_pairs_float(bx, by); + + /* Multiply q with scale and accumulate */ + acc = _mm256_add_ps(_mm256_mul_ps(d, q), acc); + } + + *s = hsum_float_8(acc); +#else + // scalar + float sumf = 0.0; + + for (int i = 0; i < nb; i++) { + uint32_t qh; + memcpy(&qh, x[i].qh, sizeof(qh)); + + int sumi = 0; + + for (int j = 0; j < qk/2; ++j) { + const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; + const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12)); + + const int32_t x0 = ((x[i].qs[j] & 0x0F) | xh_0) - 16; + const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16; + + sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]); + } + + sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi; + } + + *s = sumf; +#endif +} + +void quantize_row_v2_q5_0_reference(const float * restrict x, block_v2_q5_0 * restrict y, int k) { + static const int qk = V2_QK5_0; + + assert(k % qk == 0); + + const int nb = k / qk; + + for (int i = 0; i < nb; i++) { + float amax = 0.0f; // absolute max + float max = 0.0f; + + for (int j = 0; j < qk; j++) { + const float v = x[i*qk + j]; + if (amax < fabsf(v)) { + amax = fabsf(v); + max = v; + } + } + + const float d = max / -16; + const float id = d ? 1.0f/d : 0.0f; + + y[i].d = GGML_FP32_TO_FP16(d); + + uint32_t qh = 0; + + for (int j = 0; j < qk/2; ++j) { + const float x0 = x[i*qk + 0 + j]*id; + const float x1 = x[i*qk + qk/2 + j]*id; + + const uint8_t xi0 = MIN(31, (int8_t)(x0 + 16.5f)); + const uint8_t xi1 = MIN(31, (int8_t)(x1 + 16.5f)); + + y[i].qs[j] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4); + + // get the 5-th bit and store it in qh at the right position + qh |= ((xi0 & 0x10) >> 4) << (j + 0); + qh |= ((xi1 & 0x10) >> 4) << (j + qk/2); + } + + memcpy(&y[i].qh, &qh, sizeof(qh)); + } +} diff --git a/third_party/ggml/ggjt.v2.q5_0.h b/third_party/ggml/ggjt.v2.q5_0.h new file mode 100644 index 000000000..54be059a1 --- /dev/null +++ b/third_party/ggml/ggjt.v2.q5_0.h @@ -0,0 +1,24 @@ +#ifndef COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V2_Q5_0_H_ +#define COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V2_Q5_0_H_ +#include "third_party/ggml/fp16.h" +#if !(__ASSEMBLER__ + __LINKER__ + 0) +COSMOPOLITAN_C_START_ + +#define V2_QK5_0 32 +typedef struct { + ggml_fp16_t d; // delta + uint8_t qh[4]; // 5-th bit of quants + uint8_t qs[V2_QK5_0 / 2]; // nibbles / quants +} block_v2_q5_0; + +void dequantize_row_v2_q5_0(const void* restrict, float* restrict, int); +void quantize_row_v2_q5_0(const float* restrict, void* restrict, int); +size_t ggml_quantize_v2_q5_0(const float*, void*, int, int, int64_t*); +void ggml_vec_dot_v2_q5_0_q8_0(const int, float* restrict, const void* restrict, + const void* restrict); +void quantize_row_v2_q5_0_reference(const float* restrict, + block_v2_q5_0* restrict, int); + +COSMOPOLITAN_C_END_ +#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */ +#endif /* COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V2_Q5_0_H_ */ diff --git a/third_party/ggml/ggjt.v2.q5_1.c b/third_party/ggml/ggjt.v2.q5_1.c new file mode 100644 index 000000000..584640eff --- /dev/null +++ b/third_party/ggml/ggjt.v2.q5_1.c @@ -0,0 +1,409 @@ +/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8 -*-│ +│vi: set net ft=c ts=4 sts=4 sw=4 fenc=utf-8 :vi│ +╚──────────────────────────────────────────────────────────────────────────────╝ +│ │ +│ GGML │ +│ Copyright (c) 2023 Georgi Gerganov │ +│ │ +│ Permission is hereby granted, free of charge, to any person obtaining │ +│ a copy of this software and associated documentation files (the │ +│ "Software"), to deal in the Software without restriction, including │ +│ without limitation the rights to use, copy, modify, merge, publish, │ +│ distribute, sublicense, and/or sell copies of the Software, and to │ +│ permit persons to whom the Software is furnished to do so, subject to │ +│ the following conditions: │ +│ │ +│ The above copyright notice and this permission notice shall be │ +│ included in all copies or substantial portions of the Software. │ +│ │ +│ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, │ +│ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF │ +│ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. │ +│ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY │ +│ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, │ +│ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE │ +│ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. │ +│ │ +╚─────────────────────────────────────────────────────────────────────────────*/ +#include "third_party/ggml/ggjt.v2.q5_1.h" +#include "libc/assert.h" +#include "libc/math.h" +#include "libc/str/str.h" +#include "third_party/ggml/fp16.internal.h" +#include "third_party/ggml/ggjt.v2.internal.h" +#include "third_party/ggml/ggjt.v2.q8_1.h" +// clang-format off + +static_assert(sizeof(block_v2_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + V2_QK5_1 / 2, + "wrong q5_1 block size/padding"); + +void dequantize_row_v2_q5_1(const void * restrict x_, float * restrict y, int k) { + const block_v2_q5_1 * restrict x = x_; + static const int qk = V2_QK5_1; + + assert(k % qk == 0); + + const int nb = k / qk; + + for (int i = 0; i < nb; i++) { + const float d = GGML_FP16_TO_FP32(x[i].d); + const float m = GGML_FP16_TO_FP32(x[i].m); + + uint32_t qh; + memcpy(&qh, x[i].qh, sizeof(qh)); + + for (int j = 0; j < qk/2; ++j) { + const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; + const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10; + + const int x0 = (x[i].qs[j] & 0x0F) | xh_0; + const int x1 = (x[i].qs[j] >> 4) | xh_1; + + y[i*qk + j + 0 ] = x0*d + m; + y[i*qk + j + qk/2] = x1*d + m; + } + } +} + +void quantize_row_v2_q5_1(const float * restrict x, void * restrict y, int k) { + quantize_row_v2_q5_1_reference(x, y, k); +} + +size_t ggml_quantize_v2_q5_1(const float * src, void * dst, int n, int k, int64_t * hist) { + assert(k % V2_QK5_1 == 0); + const int nb = k / V2_QK5_1; + + for (int b = 0; b < n; b += k) { + block_v2_q5_1 * restrict y = (block_v2_q5_1 *)dst + b/V2_QK5_1; + + quantize_row_v2_q5_1_reference(src + b, y, k); + + for (int i = 0; i < nb; i++) { + uint32_t qh; + memcpy(&qh, &y[i].qh, sizeof(qh)); + + for (int j = 0; j < V2_QK5_1; j += 2) { + const uint8_t vh0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; + const uint8_t vh1 = ((qh & (1u << (j + 16))) >> (j + 12)); + + // cast to 16 bins + const uint8_t vi0 = ((y[i].qs[j/2] & 0x0F) | vh0) / 2; + const uint8_t vi1 = ((y[i].qs[j/2] >> 4) | vh1) / 2; + + hist[vi0]++; + hist[vi1]++; + } + } + } + + return (n/V2_QK5_1*sizeof(block_v2_q5_1)); +} + +void ggml_vec_dot_v2_q5_1_q8_1(const int n, + float * restrict s, + const void * restrict vx, + const void * restrict vy) { + const int qk = V2_QK8_1; + const int nb = n / qk; + + assert(n % qk == 0); + assert(nb % 2 == 0); + assert(qk == V2_QK5_1); + + const block_v2_q5_1 * restrict x = vx; + const block_v2_q8_1 * restrict y = vy; + +#if defined(__ARM_NEON) + float32x4_t sumv0 = vdupq_n_f32(0.0f); + float32x4_t sumv1 = vdupq_n_f32(0.0f); + + float summs0 = 0.0f; + float summs1 = 0.0f; + + uint32_t qh0; + uint32_t qh1; + + uint64_t tmp0[4]; + uint64_t tmp1[4]; + + for (int i = 0; i < nb; i += 2) { + const block_v2_q5_1 * restrict x0 = &x[i]; + const block_v2_q5_1 * restrict x1 = &x[i + 1]; + const block_v2_q8_1 * restrict y0 = &y[i]; + const block_v2_q8_1 * restrict y1 = &y[i + 1]; + + const uint8x16_t m4b = vdupq_n_u8(0x0F); + + summs0 += GGML_FP16_TO_FP32(x0->m) * y0->s; + summs1 += GGML_FP16_TO_FP32(x1->m) * y1->s; + + // extract the 5th bit via lookup table ((b) << 4) + memcpy(&qh0, x0->qh, sizeof(qh0)); + memcpy(&qh1, x1->qh, sizeof(qh1)); + + tmp0[0] = table_b2b_0[(qh0 >> 0) & 0xFF]; + tmp0[1] = table_b2b_0[(qh0 >> 8) & 0xFF]; + tmp0[2] = table_b2b_0[(qh0 >> 16) & 0xFF]; + tmp0[3] = table_b2b_0[(qh0 >> 24) ]; + + tmp1[0] = table_b2b_0[(qh1 >> 0) & 0xFF]; + tmp1[1] = table_b2b_0[(qh1 >> 8) & 0xFF]; + tmp1[2] = table_b2b_0[(qh1 >> 16) & 0xFF]; + tmp1[3] = table_b2b_0[(qh1 >> 24) ]; + + const int8x16_t qhl0 = vld1q_s8((const int8_t *)(tmp0 + 0)); + const int8x16_t qhh0 = vld1q_s8((const int8_t *)(tmp0 + 2)); + const int8x16_t qhl1 = vld1q_s8((const int8_t *)(tmp1 + 0)); + const int8x16_t qhh1 = vld1q_s8((const int8_t *)(tmp1 + 2)); + + const uint8x16_t v0_0 = vld1q_u8(x0->qs); + const uint8x16_t v0_1 = vld1q_u8(x1->qs); + + // 4-bit -> 8-bit + const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); + const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); + const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); + const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); + + // add high bit + const int8x16_t v0_0lf = vorrq_s8(v0_0l, qhl0); + const int8x16_t v0_0hf = vorrq_s8(v0_0h, qhh0); + const int8x16_t v0_1lf = vorrq_s8(v0_1l, qhl1); + const int8x16_t v0_1hf = vorrq_s8(v0_1h, qhh1); + + // load y + const int8x16_t v1_0l = vld1q_s8(y0->qs); + const int8x16_t v1_0h = vld1q_s8(y0->qs + 16); + const int8x16_t v1_1l = vld1q_s8(y1->qs); + const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); + + const float x0d = GGML_FP16_TO_FP32(x0->d); + const float x1d = GGML_FP16_TO_FP32(x1->d); + +#if defined(__ARM_FEATURE_DOTPROD) + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32( + vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l), + vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), x0d*y0->d); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32( + vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l), + vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), x1d*y1->d); +#else + const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lf), vget_low_s8 (v1_0l)); + const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lf), vget_high_s8(v1_0l)); + const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hf), vget_low_s8 (v1_0h)); + const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hf), vget_high_s8(v1_0h)); + + const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1lf), vget_low_s8 (v1_1l)); + const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1lf), vget_high_s8(v1_1l)); + const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hf), vget_low_s8 (v1_1h)); + const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hf), vget_high_s8(v1_1h)); + + const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h)); + const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h)); + const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h)); + const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h)); + + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), x0d*y0->d); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), x1d*y1->d); +#endif + } + + *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs0 + summs1; +#elif defined(__wasm_simd128__) + v128_t sumv = wasm_f32x4_splat(0.0f); + + float summs = 0.0f; + + uint32_t qh; + uint64_t tmp[4]; + + // TODO: check if unrolling this is better + for (int i = 0; i < nb; ++i) { + const block_v2_q5_1 * restrict x0 = &x[i]; + const block_v2_q8_1 * restrict y0 = &y[i]; + + summs += GGML_FP16_TO_FP32(x0->m) * y0->s; + + const v128_t m4b = wasm_i8x16_splat(0x0F); + + // extract the 5th bit + memcpy(&qh, x0->qh, sizeof(qh)); + + tmp[0] = table_b2b_0[(qh >> 0) & 0xFF]; + tmp[1] = table_b2b_0[(qh >> 8) & 0xFF]; + tmp[2] = table_b2b_0[(qh >> 16) & 0xFF]; + tmp[3] = table_b2b_0[(qh >> 24) ]; + + const v128_t qhl = wasm_v128_load(tmp + 0); + const v128_t qhh = wasm_v128_load(tmp + 2); + + const v128_t v0 = wasm_v128_load(x0->qs); + + // 4-bit -> 8-bit + const v128_t v0l = wasm_v128_and (v0, m4b); + const v128_t v0h = wasm_u8x16_shr(v0, 4); + + static bool x = true; + + // add high bit + const v128_t v0lf = wasm_v128_or(v0l, qhl); + const v128_t v0hf = wasm_v128_or(v0h, qhh); + + // load y + const v128_t v1l = wasm_v128_load(y0->qs); + const v128_t v1h = wasm_v128_load(y0->qs + 16); + + // int8x16 -> int16x8 + const v128_t v0lfl = wasm_i16x8_extend_low_i8x16 (v0lf); + const v128_t v0lfh = wasm_i16x8_extend_high_i8x16(v0lf); + const v128_t v0hfl = wasm_i16x8_extend_low_i8x16 (v0hf); + const v128_t v0hfh = wasm_i16x8_extend_high_i8x16(v0hf); + + const v128_t v1ll = wasm_i16x8_extend_low_i8x16 (v1l); + const v128_t v1lh = wasm_i16x8_extend_high_i8x16(v1l); + const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h); + const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h); + + const float x0d = GGML_FP16_TO_FP32(x0->d); + + // dot product + sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4( + wasm_i32x4_add( + wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll), + wasm_i32x4_dot_i16x8(v0lfh, v1lh)), + wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl), + wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), wasm_f32x4_splat(x0d*y0->d))); + } + + *s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) + + wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3) + summs; +#elif defined(__AVX2__) + // Initialize accumulator with zeros + __m256 acc = _mm256_setzero_ps(); + + float summs = 0.0f; + + // Main loop + for (int i = 0; i < nb; i++) { + const __m256 dx = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d)); + + summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s; + + __m256i bx = bytes_from_nibbles_32(x[i].qs); + __m256i bxhi = bytes_from_bits_32(x[i].qh); + bxhi = _mm256_and_si256(bxhi, _mm256_set1_epi8(0x10)); + bx = _mm256_or_si256(bx, bxhi); + + const __m256 dy = _mm256_broadcast_ss(&y[i].d); + const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs); + + const __m256 q = mul_sum_us8_pairs_float(bx, by); + + acc = _mm256_fmadd_ps(q, _mm256_mul_ps(dx, dy), acc); + } + + *s = hsum_float_8(acc) + summs; +#elif defined(__AVX__) + // Initialize accumulator with zeros + __m256 acc = _mm256_setzero_ps(); + __m128i mask = _mm_set1_epi8(0x10); + + float summs = 0.0f; + + // Main loop + for (int i = 0; i < nb; i++) { + const __m256 dx = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d)); + + summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s; + + __m256i bx = bytes_from_nibbles_32(x[i].qs); + const __m256i bxhi = bytes_from_bits_32(x[i].qh); + __m128i bxhil = _mm256_castsi256_si128(bxhi); + __m128i bxhih = _mm256_extractf128_si256(bxhi, 1); + bxhil = _mm_and_si128(bxhil, mask); + bxhih = _mm_and_si128(bxhih, mask); + __m128i bxl = _mm256_castsi256_si128(bx); + __m128i bxh = _mm256_extractf128_si256(bx, 1); + bxl = _mm_or_si128(bxl, bxhil); + bxh = _mm_or_si128(bxh, bxhih); + bx = _mm256_set_m128i(bxh, bxl); + + const __m256 dy = _mm256_broadcast_ss(&y[i].d); + const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs); + + const __m256 q = mul_sum_us8_pairs_float(bx, by); + + acc = _mm256_add_ps(_mm256_mul_ps(q, _mm256_mul_ps(dx, dy)), acc); + } + + *s = hsum_float_8(acc) + summs; +#else + // scalar + float sumf = 0.0; + + for (int i = 0; i < nb; i++) { + uint32_t qh; + memcpy(&qh, x[i].qh, sizeof(qh)); + + int sumi = 0; + + for (int j = 0; j < qk/2; ++j) { + const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; + const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10; + + const int32_t x0 = (x[i].qs[j] & 0xF) | xh_0; + const int32_t x1 = (x[i].qs[j] >> 4) | xh_1; + + sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]); + } + + sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s; + } + + *s = sumf; +#endif +} + +void quantize_row_v2_q5_1_reference(const float * restrict x, block_v2_q5_1 * restrict y, int k) { + const int qk = V2_QK5_1; + + assert(k % qk == 0); + + const int nb = k / qk; + + for (int i = 0; i < nb; i++) { + float min = FLT_MAX; + float max = -FLT_MAX; + + for (int j = 0; j < qk; j++) { + const float v = x[i*qk + j]; + + if (v < min) min = v; + if (v > max) max = v; + } + + const float d = (max - min) / ((1 << 5) - 1); + const float id = d ? 1.0f/d : 0.0f; + + y[i].d = GGML_FP32_TO_FP16(d); + y[i].m = GGML_FP32_TO_FP16(min); + + uint32_t qh = 0; + + for (int j = 0; j < qk/2; ++j) { + const float x0 = (x[i*qk + 0 + j] - min)*id; + const float x1 = (x[i*qk + qk/2 + j] - min)*id; + + const uint8_t xi0 = (uint8_t)(x0 + 0.5f); + const uint8_t xi1 = (uint8_t)(x1 + 0.5f); + + y[i].qs[j] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4); + + // get the 5-th bit and store it in qh at the right position + qh |= ((xi0 & 0x10) >> 4) << (j + 0); + qh |= ((xi1 & 0x10) >> 4) << (j + qk/2); + } + + memcpy(&y[i].qh, &qh, sizeof(y[i].qh)); + } +} diff --git a/third_party/ggml/ggjt.v2.q5_1.h b/third_party/ggml/ggjt.v2.q5_1.h new file mode 100644 index 000000000..92b4ebe42 --- /dev/null +++ b/third_party/ggml/ggjt.v2.q5_1.h @@ -0,0 +1,25 @@ +#ifndef COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V2_Q5_1_H_ +#define COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V2_Q5_1_H_ +#include "third_party/ggml/fp16.h" +#if !(__ASSEMBLER__ + __LINKER__ + 0) +COSMOPOLITAN_C_START_ + +#define V2_QK5_1 32 +typedef struct { + ggml_fp16_t d; // delta + ggml_fp16_t m; // min + uint8_t qh[4]; // 5-th bit of quants + uint8_t qs[V2_QK5_1 / 2]; // nibbles / quants +} block_v2_q5_1; + +void dequantize_row_v2_q5_1(const void* restrict, float* restrict, int); +void quantize_row_v2_q5_1(const float* restrict, void* restrict, int); +size_t ggml_quantize_v2_q5_1(const float*, void*, int, int, int64_t*); +void ggml_vec_dot_v2_q5_1_q8_1(const int, float* restrict, const void* restrict, + const void* restrict); +void quantize_row_v2_q5_1_reference(const float* restrict, + block_v2_q5_1* restrict, int); + +COSMOPOLITAN_C_END_ +#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */ +#endif /* COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V2_Q5_1_H_ */ diff --git a/third_party/ggml/ggjt.v2.q8_0.c b/third_party/ggml/ggjt.v2.q8_0.c new file mode 100644 index 000000000..3b43c2151 --- /dev/null +++ b/third_party/ggml/ggjt.v2.q8_0.c @@ -0,0 +1,332 @@ +/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8 -*-│ +│vi: set net ft=c ts=4 sts=4 sw=4 fenc=utf-8 :vi│ +╚──────────────────────────────────────────────────────────────────────────────╝ +│ │ +│ GGML │ +│ Copyright (c) 2023 Georgi Gerganov │ +│ │ +│ Permission is hereby granted, free of charge, to any person obtaining │ +│ a copy of this software and associated documentation files (the │ +│ "Software"), to deal in the Software without restriction, including │ +│ without limitation the rights to use, copy, modify, merge, publish, │ +│ distribute, sublicense, and/or sell copies of the Software, and to │ +│ permit persons to whom the Software is furnished to do so, subject to │ +│ the following conditions: │ +│ │ +│ The above copyright notice and this permission notice shall be │ +│ included in all copies or substantial portions of the Software. │ +│ │ +│ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, │ +│ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF │ +│ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. │ +│ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY │ +│ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, │ +│ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE │ +│ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. │ +│ │ +╚─────────────────────────────────────────────────────────────────────────────*/ +#include "third_party/ggml/ggjt.v2.q8_0.h" +#include "libc/assert.h" +#include "libc/macros.internal.h" +#include "libc/math.h" +#include "third_party/ggml/ggjt.v2.internal.h" +// clang-format off + +static_assert(sizeof(block_v2_q8_0) == sizeof(float) + V2_QK8_0, + "wrong q8_0 block size/padding"); + +void dequantize_row_v2_q8_0(const void * restrict vx, float * restrict y, int k) { + static const int qk = V2_QK8_0; + + assert(k % qk == 0); + + const int nb = k / qk; + + const block_v2_q8_0 * restrict x = vx; + + for (int i = 0; i < nb; i++) { + const float d = x[i].d; + + for (int j = 0; j < qk; ++j) { + y[i*qk + j] = x[i].qs[j]*d; + } + } +} + +void quantize_row_v2_q8_0(const float * restrict x, void * restrict vy, int k) { + assert(V2_QK8_0 == 32); + assert(k % V2_QK8_0 == 0); + const int nb = k / V2_QK8_0; + + block_v2_q8_0 * restrict y = vy; + +#if defined(__ARM_NEON) + for (int i = 0; i < nb; i++) { + float32x4_t srcv [8]; + float32x4_t asrcv[8]; + float32x4_t amaxv[8]; + + for (int j = 0; j < 8; j++) srcv[j] = vld1q_f32(x + i*32 + 4*j); + for (int j = 0; j < 8; j++) asrcv[j] = vabsq_f32(srcv[j]); + + for (int j = 0; j < 4; j++) amaxv[2*j] = vmaxq_f32(asrcv[2*j], asrcv[2*j+1]); + for (int j = 0; j < 2; j++) amaxv[4*j] = vmaxq_f32(amaxv[4*j], amaxv[4*j+2]); + for (int j = 0; j < 1; j++) amaxv[8*j] = vmaxq_f32(amaxv[8*j], amaxv[8*j+4]); + + const float amax = vmaxvq_f32(amaxv[0]); + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 1.0f/d : 0.0f; + + y[i].d = d; + + for (int j = 0; j < 8; j++) { + const float32x4_t v = vmulq_n_f32(srcv[j], id); + const int32x4_t vi = vcvtnq_s32_f32(v); + + y[i].qs[4*j + 0] = vgetq_lane_s32(vi, 0); + y[i].qs[4*j + 1] = vgetq_lane_s32(vi, 1); + y[i].qs[4*j + 2] = vgetq_lane_s32(vi, 2); + y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3); + } + } +#elif defined(__AVX2__) || defined(__AVX__) + for (int i = 0; i < nb; i++) { + // Load elements into 4 AVX vectors + __m256 v0 = _mm256_loadu_ps( x ); + __m256 v1 = _mm256_loadu_ps( x + 8 ); + __m256 v2 = _mm256_loadu_ps( x + 16 ); + __m256 v3 = _mm256_loadu_ps( x + 24 ); + x += 32; + + // Compute max(abs(e)) for the block + const __m256 signBit = _mm256_set1_ps( -0.0f ); + __m256 maxAbs = _mm256_andnot_ps( signBit, v0 ); + maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) ); + maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) ); + maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) ); + + __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) ); + max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) ); + max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) ); + const float maxScalar = _mm_cvtss_f32( max4 ); + + // Quantize these floats + const float d = maxScalar / 127.f; + y[i].d = d; + const float id = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f; + const __m256 mul = _mm256_set1_ps( id ); + + // Apply the multiplier + v0 = _mm256_mul_ps( v0, mul ); + v1 = _mm256_mul_ps( v1, mul ); + v2 = _mm256_mul_ps( v2, mul ); + v3 = _mm256_mul_ps( v3, mul ); + + // Round to nearest integer + v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST ); + v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST ); + v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST ); + v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST ); + + // Convert floats to integers + __m256i i0 = _mm256_cvtps_epi32( v0 ); + __m256i i1 = _mm256_cvtps_epi32( v1 ); + __m256i i2 = _mm256_cvtps_epi32( v2 ); + __m256i i3 = _mm256_cvtps_epi32( v3 ); + +#if defined(__AVX2__) + // Convert int32 to int16 + i0 = _mm256_packs_epi32( i0, i1 ); // 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15 + i2 = _mm256_packs_epi32( i2, i3 ); // 16, 17, 18, 19, 24, 25, 26, 27, 20, 21, 22, 23, 28, 29, 30, 31 + // Convert int16 to int8 + i0 = _mm256_packs_epi16( i0, i2 ); // 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27, 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 + + // We got our precious signed bytes, but the order is now wrong + // These AVX2 pack instructions process 16-byte pieces independently + // The following instruction is fixing the order + const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 ); + i0 = _mm256_permutevar8x32_epi32( i0, perm ); + + _mm256_storeu_si256((__m256i *)y[i].qs, i0); +#else + // Since we don't have in AVX some necessary functions, + // we split the registers in half and call AVX2 analogs from SSE + __m128i ni0 = _mm256_castsi256_si128( i0 ); + __m128i ni1 = _mm256_extractf128_si256( i0, 1); + __m128i ni2 = _mm256_castsi256_si128( i1 ); + __m128i ni3 = _mm256_extractf128_si256( i1, 1); + __m128i ni4 = _mm256_castsi256_si128( i2 ); + __m128i ni5 = _mm256_extractf128_si256( i2, 1); + __m128i ni6 = _mm256_castsi256_si128( i3 ); + __m128i ni7 = _mm256_extractf128_si256( i3, 1); + + // Convert int32 to int16 + ni0 = _mm_packs_epi32( ni0, ni1 ); + ni2 = _mm_packs_epi32( ni2, ni3 ); + ni4 = _mm_packs_epi32( ni4, ni5 ); + ni6 = _mm_packs_epi32( ni6, ni7 ); + // Convert int16 to int8 + ni0 = _mm_packs_epi16( ni0, ni2 ); + ni4 = _mm_packs_epi16( ni4, ni6 ); + + _mm_storeu_si128((__m128i *)(y[i].qs + 0), ni0); + _mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4); +#endif + } +#else + // scalar + quantize_row_q8_0_reference(x, y, k); +#endif +} + +size_t ggml_quantize_v2_q8_0(const float * src, void * dst, int n, int k, int64_t * hist) { + assert(k % V2_QK8_0 == 0); + const int nb = k / V2_QK8_0; + + for (int b = 0; b < n; b += k) { + block_v2_q8_0 * restrict y = (block_v2_q8_0 *)dst + b/V2_QK8_0; + + quantize_row_v2_q8_0_reference(src + b, y, k); + + for (int i = 0; i < nb; i++) { + for (int j = 0; j < V2_QK8_0; ++j) { + const int8_t vi = y[i].qs[j]; + + hist[vi/16 + 8]++; + } + } + } + + return (n/V2_QK8_0*sizeof(block_v2_q8_0)); +} + +void ggml_vec_dot_v2_q8_0_q8_0(const int n, + float * restrict s, + const void * restrict vx, + const void * restrict vy) { + const int qk = V2_QK8_0; + const int nb = n / qk; + + assert(n % qk == 0); + assert(nb % 2 == 0); + + const block_v2_q8_0 * restrict x = vx; + const block_v2_q8_0 * restrict y = vy; + +#if defined(__ARM_NEON) + float32x4_t sumv0 = vdupq_n_f32(0.0f); + float32x4_t sumv1 = vdupq_n_f32(0.0f); + + for (int i = 0; i < nb; i += 2) { + const block_v2_q8_0 * restrict x0 = &x[i + 0]; + const block_v2_q8_0 * restrict x1 = &x[i + 1]; + const block_v2_q8_0 * restrict y0 = &y[i + 0]; + const block_v2_q8_0 * restrict y1 = &y[i + 1]; + + const int8x16_t x0_0 = vld1q_s8(x0->qs); + const int8x16_t x0_1 = vld1q_s8(x0->qs + 16); + const int8x16_t x1_0 = vld1q_s8(x1->qs); + const int8x16_t x1_1 = vld1q_s8(x1->qs + 16); + + // load y + const int8x16_t y0_0 = vld1q_s8(y0->qs); + const int8x16_t y0_1 = vld1q_s8(y0->qs + 16); + const int8x16_t y1_0 = vld1q_s8(y1->qs); + const int8x16_t y1_1 = vld1q_s8(y1->qs + 16); + +#if defined(__ARM_FEATURE_DOTPROD) + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32( + vdotq_s32(vdupq_n_s32(0), x0_0, y0_0), + vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), x0->d*y0->d); + + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32( + vdotq_s32(vdupq_n_s32(0), x1_0, y1_0), + vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), x1->d*y1->d); + +#else + const int16x8_t p0_0 = vmull_s8(vget_low_s8 (x0_0), vget_low_s8 (y0_0)); + const int16x8_t p0_1 = vmull_s8(vget_high_s8(x0_0), vget_high_s8(y0_0)); + const int16x8_t p0_2 = vmull_s8(vget_low_s8 (x0_1), vget_low_s8 (y0_1)); + const int16x8_t p0_3 = vmull_s8(vget_high_s8(x0_1), vget_high_s8(y0_1)); + + const int16x8_t p1_0 = vmull_s8(vget_low_s8 (x1_0), vget_low_s8 (y1_0)); + const int16x8_t p1_1 = vmull_s8(vget_high_s8(x1_0), vget_high_s8(y1_0)); + const int16x8_t p1_2 = vmull_s8(vget_low_s8 (x1_1), vget_low_s8 (y1_1)); + const int16x8_t p1_3 = vmull_s8(vget_high_s8(x1_1), vget_high_s8(y1_1)); + + const int32x4_t p0 = vaddq_s32(vpaddlq_s16(p0_0), vpaddlq_s16(p0_1)); + const int32x4_t p1 = vaddq_s32(vpaddlq_s16(p0_2), vpaddlq_s16(p0_3)); + const int32x4_t p2 = vaddq_s32(vpaddlq_s16(p1_0), vpaddlq_s16(p1_1)); + const int32x4_t p3 = vaddq_s32(vpaddlq_s16(p1_2), vpaddlq_s16(p1_3)); + + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(p0, p1)), x0->d*y0->d); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(p2, p3)), x1->d*y1->d); +#endif + } + + *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1); +#elif defined(__AVX2__) || defined(__AVX__) + // Initialize accumulator with zeros + __m256 acc = _mm256_setzero_ps(); + + // Main loop + for (int i = 0; i < nb; ++i) { + // Compute combined scale for the block + const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) ); + __m256i bx = _mm256_loadu_si256((const __m256i *)x[i].qs); + __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs); + + const __m256 q = mul_sum_i8_pairs_float(bx, by); + + // Multiply q with scale and accumulate +#if defined(__AVX2__) + acc = _mm256_fmadd_ps( d, q, acc ); +#else + acc = _mm256_add_ps( _mm256_mul_ps( d, q ), acc ); +#endif + } + + *s = hsum_float_8(acc); +#else + // scalar + float sumf = 0.0; + + for (int i = 0; i < nb; i++) { + int sumi = 0; + + for (int j = 0; j < qk; j++) { + sumi += x[i].qs[j]*y[i].qs[j]; + } + + sumf += (x[i].d*y[i].d)*sumi; + } + + *s = sumf; +#endif +} + +void quantize_row_v2_q8_0_reference(const float * restrict x, block_v2_q8_0 * restrict y, int k) { + assert(k % V2_QK8_0 == 0); + const int nb = k / V2_QK8_0; + + for (int i = 0; i < nb; i++) { + float amax = 0.0f; // absolute max + + for (int j = 0; j < V2_QK8_0; j++) { + const float v = x[i*V2_QK8_0 + j]; + amax = MAX(amax, fabsf(v)); + } + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 1.0f/d : 0.0f; + + y[i].d = d; + + for (int j = 0; j < V2_QK8_0; ++j) { + const float x0 = x[i*V2_QK8_0 + j]*id; + + y[i].qs[j] = roundf(x0); + } + } +} diff --git a/third_party/ggml/ggjt.v2.q8_0.h b/third_party/ggml/ggjt.v2.q8_0.h new file mode 100644 index 000000000..2e5b2107d --- /dev/null +++ b/third_party/ggml/ggjt.v2.q8_0.h @@ -0,0 +1,22 @@ +#ifndef COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V2_Q8_0_H_ +#define COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V2_Q8_0_H_ +#if !(__ASSEMBLER__ + __LINKER__ + 0) +COSMOPOLITAN_C_START_ + +#define V2_QK8_0 32 +typedef struct { + float d; // delta + int8_t qs[V2_QK8_0]; // quants +} block_v2_q8_0; + +void dequantize_row_v2_q8_0(const void* restrict, float* restrict, int); +void quantize_row_v2_q8_0(const float* restrict, void* restrict, int); +size_t ggml_quantize_v2_q8_0(const float*, void*, int, int, int64_t*); +void ggml_vec_dot_v2_q8_0_q8_0(const int, float* restrict, const void* restrict, + const void* restrict); +void quantize_row_v2_q8_0_reference(const float* restrict, + block_v2_q8_0* restrict, int); + +COSMOPOLITAN_C_END_ +#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */ +#endif /* COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V2_Q8_0_H_ */ diff --git a/third_party/ggml/ggjt.v2.q8_1.c b/third_party/ggml/ggjt.v2.q8_1.c new file mode 100644 index 000000000..f52010ec9 --- /dev/null +++ b/third_party/ggml/ggjt.v2.q8_1.c @@ -0,0 +1,221 @@ +/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8 -*-│ +│vi: set net ft=c ts=4 sts=4 sw=4 fenc=utf-8 :vi│ +╚──────────────────────────────────────────────────────────────────────────────╝ +│ │ +│ GGML │ +│ Copyright (c) 2023 Georgi Gerganov │ +│ │ +│ Permission is hereby granted, free of charge, to any person obtaining │ +│ a copy of this software and associated documentation files (the │ +│ "Software"), to deal in the Software without restriction, including │ +│ without limitation the rights to use, copy, modify, merge, publish, │ +│ distribute, sublicense, and/or sell copies of the Software, and to │ +│ permit persons to whom the Software is furnished to do so, subject to │ +│ the following conditions: │ +│ │ +│ The above copyright notice and this permission notice shall be │ +│ included in all copies or substantial portions of the Software. │ +│ │ +│ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, │ +│ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF │ +│ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. │ +│ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY │ +│ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, │ +│ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE │ +│ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. │ +│ │ +╚─────────────────────────────────────────────────────────────────────────────*/ +#include "third_party/ggml/ggjt.v2.q8_1.h" +#include "libc/assert.h" +#include "libc/macros.internal.h" +#include "libc/math.h" +#include "third_party/ggml/ggjt.v2.internal.h" +// clang-format off + +static_assert(sizeof(block_v2_q8_1) == 2*sizeof(float) + V2_QK8_1, + "wrong q8_1 block size/padding"); + +#if __AVX__ || __AVX2__ || __AVX512F__ +static inline int hsum_i32_8(const __m256i a) { + const __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1)); + const __m128i hi64 = _mm_unpackhi_epi64(sum128, sum128); + const __m128i sum64 = _mm_add_epi32(hi64, sum128); + const __m128i hi32 = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1)); + return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32)); +} +#endif + +void quantize_row_v2_q8_1(const float * restrict x, void * restrict vy, int k) { + assert(k % V2_QK8_1 == 0); + const int nb = k / V2_QK8_1; + + block_v2_q8_1 * restrict y = vy; + +#if defined(__ARM_NEON) + for (int i = 0; i < nb; i++) { + float32x4_t srcv [8]; + float32x4_t asrcv[8]; + float32x4_t amaxv[8]; + + for (int j = 0; j < 8; j++) srcv[j] = vld1q_f32(x + i*32 + 4*j); + for (int j = 0; j < 8; j++) asrcv[j] = vabsq_f32(srcv[j]); + + for (int j = 0; j < 4; j++) amaxv[2*j] = vmaxq_f32(asrcv[2*j], asrcv[2*j+1]); + for (int j = 0; j < 2; j++) amaxv[4*j] = vmaxq_f32(amaxv[4*j], amaxv[4*j+2]); + for (int j = 0; j < 1; j++) amaxv[8*j] = vmaxq_f32(amaxv[8*j], amaxv[8*j+4]); + + const float amax = vmaxvq_f32(amaxv[0]); + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 1.0f/d : 0.0f; + + y[i].d = d; + + int32x4_t accv = vdupq_n_s32(0); + + for (int j = 0; j < 8; j++) { + const float32x4_t v = vmulq_n_f32(srcv[j], id); + const int32x4_t vi = vcvtnq_s32_f32(v); + + y[i].qs[4*j + 0] = vgetq_lane_s32(vi, 0); + y[i].qs[4*j + 1] = vgetq_lane_s32(vi, 1); + y[i].qs[4*j + 2] = vgetq_lane_s32(vi, 2); + y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3); + + accv = vaddq_s32(accv, vi); + } + + y[i].s = d * vaddvq_s32(accv); + } +#elif defined(__AVX2__) || defined(__AVX__) + for (int i = 0; i < nb; i++) { + // Load elements into 4 AVX vectors + __m256 v0 = _mm256_loadu_ps( x ); + __m256 v1 = _mm256_loadu_ps( x + 8 ); + __m256 v2 = _mm256_loadu_ps( x + 16 ); + __m256 v3 = _mm256_loadu_ps( x + 24 ); + x += 32; + + // Compute max(abs(e)) for the block + const __m256 signBit = _mm256_set1_ps( -0.0f ); + __m256 maxAbs = _mm256_andnot_ps( signBit, v0 ); + maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) ); + maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) ); + maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) ); + + __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) ); + max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) ); + max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) ); + const float maxScalar = _mm_cvtss_f32( max4 ); + + // Quantize these floats + const float d = maxScalar / 127.f; + y[i].d = d; + const float id = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f; + const __m256 mul = _mm256_set1_ps( id ); + + // Apply the multiplier + v0 = _mm256_mul_ps( v0, mul ); + v1 = _mm256_mul_ps( v1, mul ); + v2 = _mm256_mul_ps( v2, mul ); + v3 = _mm256_mul_ps( v3, mul ); + + // Round to nearest integer + v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST ); + v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST ); + v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST ); + v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST ); + + // Convert floats to integers + __m256i i0 = _mm256_cvtps_epi32( v0 ); + __m256i i1 = _mm256_cvtps_epi32( v1 ); + __m256i i2 = _mm256_cvtps_epi32( v2 ); + __m256i i3 = _mm256_cvtps_epi32( v3 ); + +#if defined(__AVX2__) + // Compute the sum of the quants and set y[i].s + y[i].s = d * hsum_i32_8(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3))); + + // Convert int32 to int16 + i0 = _mm256_packs_epi32( i0, i1 ); // 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15 + i2 = _mm256_packs_epi32( i2, i3 ); // 16, 17, 18, 19, 24, 25, 26, 27, 20, 21, 22, 23, 28, 29, 30, 31 + // Convert int16 to int8 + i0 = _mm256_packs_epi16( i0, i2 ); // 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27, 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 + + // We got our precious signed bytes, but the order is now wrong + // These AVX2 pack instructions process 16-byte pieces independently + // The following instruction is fixing the order + const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 ); + i0 = _mm256_permutevar8x32_epi32( i0, perm ); + + _mm256_storeu_si256((__m256i *)y[i].qs, i0); +#else + // Since we don't have in AVX some necessary functions, + // we split the registers in half and call AVX2 analogs from SSE + __m128i ni0 = _mm256_castsi256_si128( i0 ); + __m128i ni1 = _mm256_extractf128_si256( i0, 1); + __m128i ni2 = _mm256_castsi256_si128( i1 ); + __m128i ni3 = _mm256_extractf128_si256( i1, 1); + __m128i ni4 = _mm256_castsi256_si128( i2 ); + __m128i ni5 = _mm256_extractf128_si256( i2, 1); + __m128i ni6 = _mm256_castsi256_si128( i3 ); + __m128i ni7 = _mm256_extractf128_si256( i3, 1); + + // Compute the sum of the quants and set y[i].s + const __m128i s0 = _mm_add_epi32(_mm_add_epi32(ni0, ni1), _mm_add_epi32(ni2, ni3)); + const __m128i s1 = _mm_add_epi32(_mm_add_epi32(ni4, ni5), _mm_add_epi32(ni6, ni7)); + y[i].s = d * hsum_i32_4(_mm_add_epi32(s0, s1)); + + // Convert int32 to int16 + ni0 = _mm_packs_epi32( ni0, ni1 ); + ni2 = _mm_packs_epi32( ni2, ni3 ); + ni4 = _mm_packs_epi32( ni4, ni5 ); + ni6 = _mm_packs_epi32( ni6, ni7 ); + // Convert int16 to int8 + ni0 = _mm_packs_epi16( ni0, ni2 ); + ni4 = _mm_packs_epi16( ni4, ni6 ); + + _mm_storeu_si128((__m128i *)(y[i].qs + 0), ni0); + _mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4); +#endif + } +#else + // scalar + quantize_row_q8_1_reference(x, y, k); +#endif +} + +void quantize_row_v2_q8_1_reference(const float * restrict x, block_v2_q8_1 * restrict y, int k) { + assert(V2_QK8_1 == 32); + assert(k % V2_QK8_1 == 0); + const int nb = k / V2_QK8_1; + + for (int i = 0; i < nb; i++) { + float amax = 0.0f; // absolute max + + for (int j = 0; j < V2_QK8_1; j++) { + const float v = x[i*V2_QK8_1 + j]; + amax = MAX(amax, fabsf(v)); + } + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 1.0f/d : 0.0f; + + y[i].d = d; + + int sum = 0; + + for (int j = 0; j < V2_QK8_1/2; ++j) { + const float v0 = x[i*V2_QK8_1 + j]*id; + const float v1 = x[i*V2_QK8_1 + V2_QK8_1/2 + j]*id; + + y[i].qs[ j] = roundf(v0); + y[i].qs[V2_QK8_1/2 + j] = roundf(v1); + + sum += y[i].qs[ j]; + sum += y[i].qs[V2_QK8_1/2 + j]; + } + + y[i].s = d * sum; + } +} diff --git a/third_party/ggml/ggjt.v2.q8_1.h b/third_party/ggml/ggjt.v2.q8_1.h new file mode 100644 index 000000000..88b38ea17 --- /dev/null +++ b/third_party/ggml/ggjt.v2.q8_1.h @@ -0,0 +1,19 @@ +#ifndef COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V2_Q8_1_H_ +#define COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V2_Q8_1_H_ +#if !(__ASSEMBLER__ + __LINKER__ + 0) +COSMOPOLITAN_C_START_ + +#define V2_QK8_1 32 +typedef struct { + float d; // delta + float s; // d * sum(qs[i]) + int8_t qs[V2_QK8_1]; // quants +} block_v2_q8_1; + +void quantize_row_v2_q8_1(const float* restrict, void* restrict, int); +void quantize_row_v2_q8_1_reference(const float* restrict, + block_v2_q8_1* restrict, int); + +COSMOPOLITAN_C_END_ +#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */ +#endif /* COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V2_Q8_1_H_ */ diff --git a/third_party/ggml/ggml.c b/third_party/ggml/ggml.c index a9e3bd6f8..7a3d442c5 100644 --- a/third_party/ggml/ggml.c +++ b/third_party/ggml/ggml.c @@ -217,7 +217,7 @@ static inline int hsum_i32_4(const __m128i a) { return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32)); } -#if __AVX2__ || __AVX512F__ +#if defined(__AVX2__) || defined(__AVX512F__) // spread 32 bits to 32 bytes { 0x00, 0xFF } static inline __m256i bytes_from_bits_32(const uint8_t * x) { uint32_t x32; @@ -393,7 +393,7 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128 #endif // __AVX__ || __AVX2__ || __AVX512F__ #endif // defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) -#if __ARM_NEON +#if defined(__ARM_NEON) #if !defined(__aarch64__) @@ -445,19 +445,19 @@ inline static float vaddvq_f32(float32x4_t v) { return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3); } -float vminvq_f32(float32x4_t v) { +inline static float vminvq_f32(float32x4_t v) { return MIN(MIN(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)), MIN(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3))); } -float vmaxvq_f32(float32x4_t v) { +inline static float vmaxvq_f32(float32x4_t v) { return MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)), MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3))); } -int32x4_t vcvtnq_s32_f32(float32x4_t v) { +inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) { int32x4_t res; res[0] = roundf(vgetq_lane_f32(v, 0)); @@ -471,21 +471,20 @@ int32x4_t vcvtnq_s32_f32(float32x4_t v) { #endif #endif - #define QK4_0 32 typedef struct { - float d; // delta + ggml_fp16_t d; // delta uint8_t qs[QK4_0 / 2]; // nibbles / quants } block_q4_0; -static_assert(sizeof(block_q4_0) == sizeof(float) + QK4_0 / 2, "wrong q4_0 block size/padding"); +static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0 block size/padding"); #define QK4_1 32 typedef struct { - float d; // delta - float m; // min + ggml_fp16_t d; // delta + ggml_fp16_t m; // min uint8_t qs[QK4_1 / 2]; // nibbles / quants } block_q4_1; -static_assert(sizeof(block_q4_1) == 2 * sizeof(float) + QK4_1 / 2, "wrong q4_1 block size/padding"); +static_assert(sizeof(block_q4_1) == 2 * sizeof(ggml_fp16_t) + QK4_1 / 2, "wrong q4_1 block size/padding"); #define QK5_0 32 typedef struct { @@ -506,16 +505,16 @@ static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + #define QK8_0 32 typedef struct { - float d; // delta - int8_t qs[QK8_0]; // quants + ggml_fp16_t d; // delta + int8_t qs[QK8_0]; // quants } block_q8_0; -static_assert(sizeof(block_q8_0) == sizeof(float) + QK8_0, "wrong q8_0 block size/padding"); +static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding"); #define QK8_1 32 typedef struct { - float d; // delta - float s; // d * sum(qs[i]) - int8_t qs[QK8_1]; // quants + float d; // delta + float s; // d * sum(qs[i]) + int8_t qs[QK8_1]; // quants } block_q8_1; static_assert(sizeof(block_q8_1) == 2*sizeof(float) + QK8_1, "wrong q8_1 block size/padding"); @@ -542,7 +541,7 @@ static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * r const float d = max / -8; const float id = d ? 1.0f/d : 0.0f; - y[i].d = d; + y[i].d = GGML_FP32_TO_FP16(d); for (int j = 0; j < qk/2; ++j) { const float x0 = x[i*qk + 0 + j]*id; @@ -582,8 +581,8 @@ static void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * r const float d = (max - min) / ((1 << 4) - 1); const float id = d ? 1.0f/d : 0.0f; - y[i].d = d; - y[i].m = min; + y[i].d = GGML_FP32_TO_FP16(d); + y[i].m = GGML_FP32_TO_FP16(min); for (int j = 0; j < qk/2; ++j) { const float x0 = (x[i*qk + 0 + j] - min)*id; @@ -714,7 +713,7 @@ static void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * r const float d = amax / ((1 << 7) - 1); const float id = d ? 1.0f/d : 0.0f; - y[i].d = d; + y[i].d = GGML_FP32_TO_FP16(d); for (int j = 0; j < QK8_0; ++j) { const float x0 = x[i*QK8_0 + j]*id; @@ -749,7 +748,7 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int const float d = amax / ((1 << 7) - 1); const float id = d ? 1.0f/d : 0.0f; - y[i].d = d; + y[i].d = GGML_FP32_TO_FP16(d); for (int j = 0; j < 8; j++) { const float32x4_t v = vmulq_n_f32(srcv[j], id); @@ -761,6 +760,39 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3); } } +#elif defined(__wasm_simd128__) + for (int i = 0; i < nb; i++) { + v128_t srcv [8]; + v128_t asrcv[8]; + v128_t amaxv[8]; + + for (int j = 0; j < 8; j++) srcv[j] = wasm_v128_load(x + i*32 + 4*j); + for (int j = 0; j < 8; j++) asrcv[j] = wasm_f32x4_abs(srcv[j]); + + for (int j = 0; j < 4; j++) amaxv[2*j] = wasm_f32x4_max(asrcv[2*j], asrcv[2*j+1]); + for (int j = 0; j < 2; j++) amaxv[4*j] = wasm_f32x4_max(amaxv[4*j], amaxv[4*j+2]); + for (int j = 0; j < 1; j++) amaxv[8*j] = wasm_f32x4_max(amaxv[8*j], amaxv[8*j+4]); + + const float amax = MAX(MAX(wasm_f32x4_extract_lane(amaxv[0], 0), + wasm_f32x4_extract_lane(amaxv[0], 1)), + MAX(wasm_f32x4_extract_lane(amaxv[0], 2), + wasm_f32x4_extract_lane(amaxv[0], 3))); + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 1.0f/d : 0.0f; + + y[i].d = GGML_FP32_TO_FP16(d); + + for (int j = 0; j < 8; j++) { + const v128_t v = wasm_f32x4_mul(srcv[j], wasm_f32x4_splat(id)); + const v128_t vi = wasm_i32x4_trunc_sat_f32x4(v); + + y[i].qs[4*j + 0] = wasm_i32x4_extract_lane(vi, 0); + y[i].qs[4*j + 1] = wasm_i32x4_extract_lane(vi, 1); + y[i].qs[4*j + 2] = wasm_i32x4_extract_lane(vi, 2); + y[i].qs[4*j + 3] = wasm_i32x4_extract_lane(vi, 3); + } + } #elif defined(__AVX2__) || defined(__AVX__) for (int i = 0; i < nb; i++) { // Load elements into 4 AVX vectors @@ -784,7 +816,7 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int // Quantize these floats const float d = maxScalar / 127.f; - y[i].d = d; + y[i].d = GGML_FP32_TO_FP16(d); const float id = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f; const __m256 mul = _mm256_set1_ps( id ); @@ -883,7 +915,7 @@ static void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * r sum += y[i].qs[QK8_1/2 + j]; } - y[i].s = d * sum; + y[i].s = sum*d; } } @@ -929,6 +961,48 @@ static void quantize_row_q8_1(const float * restrict x, void * restrict vy, int y[i].s = d * vaddvq_s32(accv); } +#elif defined(__wasm_simd128__) + for (int i = 0; i < nb; i++) { + v128_t srcv [8]; + v128_t asrcv[8]; + v128_t amaxv[8]; + + for (int j = 0; j < 8; j++) srcv[j] = wasm_v128_load(x + i*32 + 4*j); + for (int j = 0; j < 8; j++) asrcv[j] = wasm_f32x4_abs(srcv[j]); + + for (int j = 0; j < 4; j++) amaxv[2*j] = wasm_f32x4_max(asrcv[2*j], asrcv[2*j+1]); + for (int j = 0; j < 2; j++) amaxv[4*j] = wasm_f32x4_max(amaxv[4*j], amaxv[4*j+2]); + for (int j = 0; j < 1; j++) amaxv[8*j] = wasm_f32x4_max(amaxv[8*j], amaxv[8*j+4]); + + const float amax = MAX(MAX(wasm_f32x4_extract_lane(amaxv[0], 0), + wasm_f32x4_extract_lane(amaxv[0], 1)), + MAX(wasm_f32x4_extract_lane(amaxv[0], 2), + wasm_f32x4_extract_lane(amaxv[0], 3))); + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 1.0f/d : 0.0f; + + y[i].d = d; + + v128_t accv = wasm_i32x4_splat(0); + + for (int j = 0; j < 8; j++) { + const v128_t v = wasm_f32x4_mul(srcv[j], wasm_f32x4_splat(id)); + const v128_t vi = wasm_i32x4_trunc_sat_f32x4(v); + + y[i].qs[4*j + 0] = wasm_i32x4_extract_lane(vi, 0); + y[i].qs[4*j + 1] = wasm_i32x4_extract_lane(vi, 1); + y[i].qs[4*j + 2] = wasm_i32x4_extract_lane(vi, 2); + y[i].qs[4*j + 3] = wasm_i32x4_extract_lane(vi, 3); + + accv = wasm_i32x4_add(accv, vi); + } + + y[i].s = d * (wasm_i32x4_extract_lane(accv, 0) + + wasm_i32x4_extract_lane(accv, 1) + + wasm_i32x4_extract_lane(accv, 2) + + wasm_i32x4_extract_lane(accv, 3)); + } #elif defined(__AVX2__) || defined(__AVX__) for (int i = 0; i < nb; i++) { // Load elements into 4 AVX vectors @@ -1035,7 +1109,7 @@ static void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict const int nb = k / qk; for (int i = 0; i < nb; i++) { - const float d = x[i].d; + const float d = GGML_FP16_TO_FP32(x[i].d); for (int j = 0; j < qk/2; ++j) { const int x0 = (x[i].qs[j] & 0x0F) - 8; @@ -1055,8 +1129,8 @@ static void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict const int nb = k / qk; for (int i = 0; i < nb; i++) { - const float d = x[i].d; - const float m = x[i].m; + const float d = GGML_FP16_TO_FP32(x[i].d); + const float m = GGML_FP16_TO_FP32(x[i].m); for (int j = 0; j < qk/2; ++j) { const int x0 = (x[i].qs[j] & 0x0F); @@ -1131,7 +1205,7 @@ static void dequantize_row_q8_0(const void * restrict vx, float * restrict y, in const block_q8_0 * restrict x = vx; for (int i = 0; i < nb; i++) { - const float d = x[i].d; + const float d = GGML_FP16_TO_FP32(x[i].d); for (int j = 0; j < qk; ++j) { y[i*qk + j] = x[i].qs[j]*d; @@ -1145,7 +1219,7 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); -const quantize_fns_t ggjt_v2_quantize_fns[GGML_TYPE_COUNT] = { +const quantize_fns_t ggjt_v3_quantize_fns[GGML_TYPE_COUNT] = { [GGML_TYPE_Q4_0] = { .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q4_0, .quantize_row_q = quantize_row_q4_0, @@ -1818,8 +1892,8 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * const block_q8_0 * restrict y0 = &y[i + 0]; const block_q8_0 * restrict y1 = &y[i + 1]; - const uint8x16_t m4b = vdupq_n_u8(0x0F); - const int8x16_t s8b = vdupq_n_s8(0x8); + const uint8x16_t m4b = vdupq_n_u8(0x0F); + const int8x16_t s8b = vdupq_n_s8(0x8); const uint8x16_t v0_0 = vld1q_u8(x0->qs); const uint8x16_t v0_1 = vld1q_u8(x1->qs); @@ -1847,8 +1921,8 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0l), v0_0hs, v1_0h); const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1l), v0_1hs, v1_1h); - sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), x0->d*y0->d); - sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), x1->d*y1->d); + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); #else const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0ls), vget_low_s8 (v1_0l)); const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0l)); @@ -1865,8 +1939,8 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h)); const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h)); - sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), x0->d*y0->d); - sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), x1->d*y1->d); + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); #endif } @@ -1876,37 +1950,23 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * __m256 acc = _mm256_setzero_ps(); // Main loop -#define WORK(I) \ - /* Compute combined scale for the block */ \ - const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[I].d ), _mm256_broadcast_ss( &y[I].d ) ); \ - __m256i bx = bytes_from_nibbles_32(x[I].qs); \ - /* Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval. */ \ - const __m256i off = _mm256_set1_epi8( 8 ); \ - bx = _mm256_sub_epi8( bx, off ); \ - __m256i by = _mm256_loadu_si256((const __m256i *)y[I].qs); \ - const __m256 q = mul_sum_i8_pairs_float(bx, by); \ - /* Multiply q with scale and accumulate */ \ - acc = _mm256_fmadd_ps( d, q, acc ) - int i = 0; - for (; i + 12 < nb; i += 12) { - _mm_prefetch(x+i+12, 3); - _mm_prefetch(x+i+15, 3); - _mm_prefetch(x+i+18, 3); - _mm_prefetch(x+i+21, 3); - _mm_prefetch(y+i+12, 3); - _mm_prefetch(y+i+14, 3); - _mm_prefetch(y+i+16, 3); - _mm_prefetch(y+i+18, 3); - _mm_prefetch(y+i+20, 3); - _mm_prefetch(y+i+22, 3); - for (int j = 0; j < 12; ++j) { - WORK(i+j); - } + for (int i = 0; i < nb; ++i) { + /* Compute combined scale for the block */ + const __m256 d = _mm256_set1_ps( GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d) ); + + __m256i bx = bytes_from_nibbles_32(x[i].qs); + + // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval. + const __m256i off = _mm256_set1_epi8( 8 ); + bx = _mm256_sub_epi8( bx, off ); + + __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs); + + const __m256 q = mul_sum_i8_pairs_float(bx, by); + + /* Multiply q with scale and accumulate */ + acc = _mm256_fmadd_ps( d, q, acc ); } - for (; i < nb; ++i) { - WORK(i); - } -#undef WORK *s = hsum_float_8(acc); #elif defined(__AVX__) @@ -1916,7 +1976,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * // Main loop for (int i = 0; i < nb; ++i) { // Compute combined scale for the block - const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) ); + const __m256 d = _mm256_set1_ps( GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d) ); const __m128i lowMask = _mm_set1_epi8(0xF); const __m128i off = _mm_set1_epi8(8); @@ -1958,7 +2018,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * _mm_prefetch(&y[0] + sizeof(block_q8_0), _MM_HINT_T0); // Compute combined scale for the block 0 and 1 - const __m128 d_0_1 = _mm_mul_ps( _mm_set1_ps( x[0].d ), _mm_set1_ps( y[0].d ) ); + const __m128 d_0_1 = _mm_set1_ps( GGML_FP16_TO_FP32(x[0].d) * GGML_FP16_TO_FP32(y[0].d) ); const __m128i tmp_0_1 = _mm_loadu_si128((const __m128i *)x[0].qs); @@ -1976,7 +2036,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * _mm_prefetch(&y[1] + sizeof(block_q8_0), _MM_HINT_T0); // Compute combined scale for the block 2 and 3 - const __m128 d_2_3 = _mm_mul_ps( _mm_set1_ps( x[1].d ), _mm_set1_ps( y[1].d ) ); + const __m128 d_2_3 = _mm_set1_ps( GGML_FP16_TO_FP32(x[1].d) * GGML_FP16_TO_FP32(y[1].d) ); const __m128i tmp_2_3 = _mm_loadu_si128((const __m128i *)x[1].qs); @@ -2009,7 +2069,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * _mm_prefetch(&y[i] + sizeof(block_q8_0), _MM_HINT_T0); // Compute combined scale for the block 0 and 1 - const __m128 d_0_1 = _mm_mul_ps( _mm_set1_ps( x[i].d ), _mm_set1_ps( y[i].d ) ); + const __m128 d_0_1 = _mm_set1_ps( GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d) ); const __m128i tmp_0_1 = _mm_loadu_si128((const __m128i *)x[i].qs); @@ -2027,7 +2087,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * _mm_prefetch(&y[i] + 2 * sizeof(block_q8_0), _MM_HINT_T0); // Compute combined scale for the block 2 and 3 - const __m128 d_2_3 = _mm_mul_ps( _mm_set1_ps( x[i + 1].d ), _mm_set1_ps( y[i + 1].d ) ); + const __m128 d_2_3 = _mm_set1_ps( GGML_FP16_TO_FP32(x[i + 1].d) * GGML_FP16_TO_FP32(y[i + 1].d) ); const __m128i tmp_2_3 = _mm_loadu_si128((const __m128i *)x[i + 1].qs); @@ -2075,7 +2135,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * sumi += (v0 * y[i].qs[j]) + (v1 * y[i].qs[j + qk/2]); } - sumf += (x[i].d*y[i].d)*sumi; + sumf += sumi*GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d); } *s = sumf; @@ -2105,7 +2165,7 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * const block_q8_1 * restrict y0 = &y[i + 0]; const block_q8_1 * restrict y1 = &y[i + 1]; - summs += x0->m * y0->s + x1->m * y1->s; + summs += GGML_FP16_TO_FP32(x0->m) * y0->s + GGML_FP16_TO_FP32(x1->m) * y1->s; const uint8x16_t m4b = vdupq_n_u8(0x0F); @@ -2129,8 +2189,8 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l), v0_0h, v1_0h); const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l), v0_1h, v1_1h); - sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), x0->d*y0->d); - sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), x1->d*y1->d); + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*y0->d); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*y1->d); #else const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0l), vget_low_s8 (v1_0l)); const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0l), vget_high_s8(v1_0l)); @@ -2147,8 +2207,8 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h)); const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h)); - sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), x0->d*y0->d); - sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), x1->d*y1->d); + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*y0->d); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*y1->d); #endif } @@ -2161,13 +2221,13 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * // Main loop for (int i = 0; i < nb; ++i) { - const float * d0 = &x[i].d; - const float * d1 = &y[i].d; + const float d0 = GGML_FP16_TO_FP32(x[i].d); + const float d1 = y[i].d; - summs += x[i].m * y[i].s; + summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s; - const __m256 d0v = _mm256_broadcast_ss( d0 ); - const __m256 d1v = _mm256_broadcast_ss( d1 ); + const __m256 d0v = _mm256_set1_ps( d0 ); + const __m256 d1v = _mm256_set1_ps( d1 ); // Compute combined scales const __m256 d0d1 = _mm256_mul_ps( d0v, d1v ); @@ -2201,7 +2261,7 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * sumi += (v0 * y[i].qs[j]) + (v1 * y[i].qs[j + qk/2]); } - sumf += (x[i].d*y[i].d)*sumi + x[i].m*y[i].s; + sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s; } *s = sumf; @@ -2277,16 +2337,13 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * const int8x16_t v1_1l = vld1q_s8(y1->qs); const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); - const float x0d = GGML_FP16_TO_FP32(x0->d); - const float x1d = GGML_FP16_TO_FP32(x1->d); - #if defined(__ARM_FEATURE_DOTPROD) sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32( vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l), - vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), x0d*y0->d); + vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32( vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l), - vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), x1d*y1->d); + vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); #else const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lf), vget_low_s8 (v1_0l)); const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lf), vget_high_s8(v1_0l)); @@ -2303,8 +2360,8 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h)); const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h)); - sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), x0d*y0->d); - sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), x1d*y1->d); + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); #endif } @@ -2321,7 +2378,6 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * const block_q8_0 * restrict y0 = &y[i]; const v128_t m4b = wasm_i8x16_splat(0x0F); - const v128_t s16b = wasm_i8x16_splat(0x10); // extract the 5th bit memcpy(&qh, x0->qh, sizeof(qh)); @@ -2359,15 +2415,14 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h); const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h); - const float x0d = GGML_FP16_TO_FP32(x0->d); - // dot product sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4( wasm_i32x4_add( wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll), wasm_i32x4_dot_i16x8(v0lfh, v1lh)), wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl), - wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), wasm_f32x4_splat(x0d*y0->d))); + wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), + wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * GGML_FP16_TO_FP32(y0->d)))); } *s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) + @@ -2379,7 +2434,7 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * // Main loop for (int i = 0; i < nb; i++) { /* Compute combined scale for the block */ - const __m256 d = _mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d)), _mm256_broadcast_ss(&y[i].d)); + const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d)); __m256i bx = bytes_from_nibbles_32(x[i].qs); __m256i bxhi = bytes_from_bits_32(x[i].qh); @@ -2403,7 +2458,7 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * // Main loop for (int i = 0; i < nb; i++) { /* Compute combined scale for the block */ - const __m256 d = _mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d)), _mm256_broadcast_ss(&y[i].d)); + const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d)); __m256i bx = bytes_from_nibbles_32(x[i].qs); const __m256i bxhi = bytes_from_bits_32(x[i].qh); @@ -2446,7 +2501,7 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]); } - sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi; + sumf += (GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d)) * sumi; } *s = sumf; @@ -2528,16 +2583,13 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * const int8x16_t v1_1l = vld1q_s8(y1->qs); const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); - const float x0d = GGML_FP16_TO_FP32(x0->d); - const float x1d = GGML_FP16_TO_FP32(x1->d); - #if defined(__ARM_FEATURE_DOTPROD) sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32( vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l), - vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), x0d*y0->d); + vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*y0->d); sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32( vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l), - vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), x1d*y1->d); + vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*y1->d); #else const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lf), vget_low_s8 (v1_0l)); const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lf), vget_high_s8(v1_0l)); @@ -2554,8 +2606,8 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h)); const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h)); - sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), x0d*y0->d); - sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), x1d*y1->d); + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*y0->d); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*y1->d); #endif } @@ -2594,8 +2646,6 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * const v128_t v0l = wasm_v128_and (v0, m4b); const v128_t v0h = wasm_u8x16_shr(v0, 4); - static bool x = true; - // add high bit const v128_t v0lf = wasm_v128_or(v0l, qhl); const v128_t v0hf = wasm_v128_or(v0h, qhh); @@ -2615,15 +2665,14 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h); const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h); - const float x0d = GGML_FP16_TO_FP32(x0->d); - // dot product - sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4( - wasm_i32x4_add( + sumv = wasm_f32x4_add(sumv, + wasm_f32x4_mul(wasm_f32x4_convert_i32x4(wasm_i32x4_add( wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll), wasm_i32x4_dot_i16x8(v0lfh, v1lh)), wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl), - wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), wasm_f32x4_splat(x0d*y0->d))); + wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), + wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * y0->d))); } *s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) + @@ -2645,7 +2694,7 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * bxhi = _mm256_and_si256(bxhi, _mm256_set1_epi8(0x10)); bx = _mm256_or_si256(bx, bxhi); - const __m256 dy = _mm256_broadcast_ss(&y[i].d); + const __m256 dy = _mm256_set1_ps(y[i].d); const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs); const __m256 q = mul_sum_us8_pairs_float(bx, by); @@ -2679,7 +2728,7 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * bxh = _mm_or_si128(bxh, bxhih); bx = _mm256_set_m128i(bxh, bxl); - const __m256 dy = _mm256_broadcast_ss(&y[i].d); + const __m256 dy = _mm256_set1_ps(y[i].d); const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs); const __m256 q = mul_sum_us8_pairs_float(bx, by); @@ -2749,11 +2798,11 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * #if defined(__ARM_FEATURE_DOTPROD) sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32( vdotq_s32(vdupq_n_s32(0), x0_0, y0_0), - vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), x0->d*y0->d); + vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32( vdotq_s32(vdupq_n_s32(0), x1_0, y1_0), - vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), x1->d*y1->d); + vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); #else const int16x8_t p0_0 = vmull_s8(vget_low_s8 (x0_0), vget_low_s8 (y0_0)); @@ -2771,8 +2820,8 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * const int32x4_t p2 = vaddq_s32(vpaddlq_s16(p1_0), vpaddlq_s16(p1_1)); const int32x4_t p3 = vaddq_s32(vpaddlq_s16(p1_2), vpaddlq_s16(p1_3)); - sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(p0, p1)), x0->d*y0->d); - sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(p2, p3)), x1->d*y1->d); + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(p0, p1)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(p2, p3)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); #endif } @@ -2784,7 +2833,7 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * // Main loop for (int i = 0; i < nb; ++i) { // Compute combined scale for the block - const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) ); + const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d)); __m256i bx = _mm256_loadu_si256((const __m256i *)x[i].qs); __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs); @@ -2810,7 +2859,7 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * sumi += x[i].qs[j]*y[i].qs[j]; } - sumf += (x[i].d*y[i].d)*sumi; + sumf += sumi*(GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d)); } *s = sumf; @@ -2966,11 +3015,9 @@ inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) { } #endif -inline static void ggml_vec_silu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { - const uint16_t * i16 = (const uint16_t *) x; - for (int i = 0; i < n; ++i) { - y[i] = table_silu_f16[i16[i]]; - } +// Sigmoid Linear Unit (SiLU) function +inline static float ggml_silu_f32(float x) { + return x/(1.0f + expf(-x)); } #ifdef GGML_SILU_FP16 @@ -3084,7 +3131,7 @@ inline static void ggml_vec_norm_inv_f32(const int n, float * s, const float * x // data types // -const int ggjt_v2_blck_size[GGML_TYPE_COUNT] = { +const int ggjt_v3_blck_size[GGML_TYPE_COUNT] = { [GGML_TYPE_F32] = 1, [GGML_TYPE_F16] = 1, [GGML_TYPE_Q4_0] = QK4_0, @@ -3100,7 +3147,7 @@ const int ggjt_v2_blck_size[GGML_TYPE_COUNT] = { const int *GGML_BLCK_SIZE; static_assert(GGML_TYPE_COUNT == 13, "GGML_BLCK_SIZE is outdated"); -const size_t ggjt_v2_type_size[GGML_TYPE_COUNT] = { +const size_t ggjt_v3_type_size[GGML_TYPE_COUNT] = { [GGML_TYPE_F32] = sizeof(float), [GGML_TYPE_F16] = sizeof(ggml_fp16_t), [GGML_TYPE_Q4_0] = sizeof(block_q4_0), @@ -3116,7 +3163,7 @@ const size_t ggjt_v2_type_size[GGML_TYPE_COUNT] = { const size_t *GGML_TYPE_SIZE; static_assert(GGML_TYPE_COUNT == 13, "GGML_TYPE_SIZE is outdated"); -const char *const ggjt_v2_type_name[GGML_TYPE_COUNT] = { +const char *const ggjt_v3_type_name[GGML_TYPE_COUNT] = { [GGML_TYPE_F32] = "f32", [GGML_TYPE_F16] = "f16", [GGML_TYPE_Q4_0] = "q4_0", @@ -3133,7 +3180,7 @@ const char *const ggjt_v2_type_name[GGML_TYPE_COUNT] = { const char *const *GGML_TYPE_NAME; static_assert(GGML_TYPE_COUNT == 13, "GGML_TYPE_NAME is outdated"); -const bool ggjt_v2_is_quantized[GGML_TYPE_COUNT] = { +const bool ggjt_v3_is_quantized[GGML_TYPE_COUNT] = { [GGML_TYPE_F32] = false, [GGML_TYPE_F16] = false, [GGML_TYPE_Q4_0] = true, @@ -3149,7 +3196,7 @@ const bool ggjt_v2_is_quantized[GGML_TYPE_COUNT] = { const bool *GGML_IS_QUANTIZED; static_assert(GGML_TYPE_COUNT == 13, "GGML_IS_QUANTIZED is outdated"); -static const char * GGML_OP_LABEL[GGML_OP_COUNT] = { +static const char *const GGML_OP_NAME[GGML_OP_COUNT] = { "NONE", "DUP", @@ -3197,6 +3244,7 @@ static const char * GGML_OP_LABEL[GGML_OP_COUNT] = { "ROPE", "ROPE_BACK", "ALIBI", + "CLAMP", "CONV_1D_1S", "CONV_1D_2S", @@ -3207,9 +3255,10 @@ static const char * GGML_OP_LABEL[GGML_OP_COUNT] = { "MAP_BINARY", }; -static_assert(GGML_OP_COUNT == 50, "GGML_OP_COUNT != 50"); +static_assert(GGML_OP_COUNT == 51, "GGML_OP_COUNT != 51"); -static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { + +static const char *const GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", "x", @@ -3257,6 +3306,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "rope(x)", "rope_back(x)", "alibi(x)", + "clamp(x)", "conv_1d_1s(x)", "conv_1d_2s(x)", @@ -3267,7 +3317,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "f(x,y)", }; -static_assert(GGML_OP_COUNT == 50, "GGML_OP_COUNT != 50"); +static_assert(GGML_OP_COUNT == 51, "GGML_OP_COUNT != 51"); static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN"); static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN"); @@ -3401,6 +3451,9 @@ const char * ggml_type_name(enum ggml_type type) { return GGML_TYPE_NAME[type]; } +const char * ggml_op_name(enum ggml_op op) { + return GGML_OP_NAME[op]; +} size_t ggml_element_size(const struct ggml_tensor * tensor) { return GGML_TYPE_SIZE[tensor->type]; @@ -3458,6 +3511,10 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) { return wtype; } +size_t ggml_tensor_overhead(void) { + return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE + 16; +} + static inline bool ggml_is_transposed(const struct ggml_tensor * tensor) { return tensor->nb[0] > tensor->nb[1]; } @@ -3502,6 +3559,12 @@ static inline bool ggml_can_repeat(const struct ggml_tensor * t0, const struct g (t1->ne[3]%t0->ne[3] == 0); } +static inline bool ggml_can_repeat_rows(const struct ggml_tensor * t0, const struct ggml_tensor * t1) { + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); + + return (t0->ne[0] == t1->ne[0]) && ggml_can_repeat(t0, t1); +} + static inline int ggml_up32(int n) { return (n + 31) & ~31; } @@ -3647,6 +3710,18 @@ size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch) return result; } +void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc) { + ctx->no_alloc = no_alloc; +} + +void * ggml_get_mem_buffer(struct ggml_context * ctx) { + return ctx->mem_buffer; +} + +size_t ggml_get_mem_size(struct ggml_context * ctx) { + return ctx->mem_size; +} + // IMPORTANT: // when creating "opt" tensors, always save and load the scratch buffer // this is an error prone process, but it is necessary to support inplace @@ -3691,7 +3766,7 @@ struct ggml_tensor * ggml_new_tensor_impl( struct ggml_object * const obj_new = (struct ggml_object *)(mem_buffer + cur_end); if (ctx->scratch.data == NULL || data != NULL) { - size_needed += sizeof(struct ggml_tensor); + size_needed += GGML_TENSOR_SIZE; if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) { GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n", @@ -3707,14 +3782,15 @@ struct ggml_tensor * ggml_new_tensor_impl( }; } else { if (ctx->scratch.offs + size_needed > ctx->scratch.size) { - GGML_PRINT("%s: not enough space in the scratch memory\n", __func__); + GGML_PRINT("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n", + __func__, ctx->scratch.offs + size_needed, ctx->scratch.size); assert(false); return NULL; } - if (cur_end + sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE > ctx->mem_size) { + if (cur_end + GGML_TENSOR_SIZE + GGML_OBJECT_SIZE > ctx->mem_size) { GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n", - __func__, cur_end + sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE, ctx->mem_size); + __func__, cur_end + GGML_TENSOR_SIZE + GGML_OBJECT_SIZE, ctx->mem_size); assert(false); return NULL; } @@ -3723,7 +3799,7 @@ struct ggml_tensor * ggml_new_tensor_impl( *obj_new = (struct ggml_object) { .offs = cur_end + GGML_OBJECT_SIZE, - .size = sizeof(struct ggml_tensor), + .size = GGML_TENSOR_SIZE, .next = NULL, }; @@ -4139,6 +4215,23 @@ struct ggml_tensor * ggml_view_tensor( return result; } +struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name) { + struct ggml_object * obj = ctx->objects_begin; + + char * const mem_buffer = ctx->mem_buffer; + + while (obj != NULL) { + struct ggml_tensor * cur = (struct ggml_tensor *)(mem_buffer + obj->offs); + if (strcmp(cur->name, name) == 0) { + return cur; + } + + obj = obj->next; + } + + return NULL; +} + //////////////////////////////////////////////////////////////////////////////// // ggml_dup @@ -4367,11 +4460,15 @@ struct ggml_tensor * ggml_mul_impl( struct ggml_tensor * a, struct ggml_tensor * b, bool inplace) { - GGML_ASSERT(ggml_are_same_shape(a, b)); + // TODO: support less-strict constraint + // GGML_ASSERT(ggml_can_repeat(b, a)); + GGML_ASSERT(ggml_can_repeat_rows(b, a)); bool is_node = false; if (!inplace && (a->grad || b->grad)) { + // TODO: support backward pass for broadcasting + GGML_ASSERT(ggml_are_same_shape(a, b)); is_node = true; } @@ -5912,7 +6009,8 @@ struct ggml_tensor * ggml_alibi( struct ggml_context * ctx, struct ggml_tensor * a, int n_past, - int n_head) { + int n_head, + float bias_max) { GGML_ASSERT(n_past >= 0); bool is_node = false; @@ -5927,10 +6025,12 @@ struct ggml_tensor * ggml_alibi( ggml_scratch_save(ctx); - struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2); + struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3); ((int32_t *) b->data)[0] = n_past; ((int32_t *) b->data)[1] = n_head; + GGML_ASSERT(sizeof(float) == sizeof(int32_t)); + (((float *) b->data)[2]) = bias_max; ggml_scratch_load(ctx); @@ -5942,6 +6042,40 @@ struct ggml_tensor * ggml_alibi( return result; } +// ggml_clamp + +struct ggml_tensor * ggml_clamp( + struct ggml_context * ctx, + struct ggml_tensor * a, + float min, + float max) { + bool is_node = false; + + if (a->grad) { + GGML_ASSERT(false); // TODO: implement backward + is_node = true; + } + + // TODO: when implement backward, fix this: + struct ggml_tensor * result = ggml_view_tensor(ctx, a); + + ggml_scratch_save(ctx); + + struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3); + + ((float *) b->data)[0] = min; + ((float *) b->data)[1] = max; + + ggml_scratch_load(ctx); + + result->op = GGML_OP_CLAMP; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src0 = a; + result->src1 = b; + + return result; +} + // ggml_conv_1d_1s struct ggml_tensor * ggml_conv_1d_1s( @@ -7674,7 +7808,7 @@ static void ggml_compute_forward_mul_f32( const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { - assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); + GGML_ASSERT(ggml_can_repeat_rows(src1, src0) && ggml_are_same_shape(src0, dst)); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; @@ -7682,10 +7816,25 @@ static void ggml_compute_forward_mul_f32( const int ith = params->ith; const int nth = params->nth; - const int nr = ggml_nrows(src0); - const int64_t ne0 = src0->ne[0]; - const int64_t ne1 = src0->ne[1]; - const int64_t ne2 = src0->ne[2]; +#ifdef GGML_USE_CUBLAS + if (src1->backend == GGML_BACKEND_CUDA) { + if (ith == 0) { + ggml_cuda_mul(src0, src1, dst); + } + return; + } +#endif + + const int64_t nr = ggml_nrows(src0); + + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t ne02 = src0->ne[2]; + + const int64_t ne10 = src1->ne[0]; + const int64_t ne11 = src1->ne[1]; + const int64_t ne12 = src1->ne[2]; + const int64_t ne13 = src1->ne[3]; const size_t nb00 = src0->nb[0]; const size_t nb01 = src0->nb[1]; @@ -7704,44 +7853,51 @@ static void ggml_compute_forward_mul_f32( GGML_ASSERT( nb0 == sizeof(float)); GGML_ASSERT(nb00 == sizeof(float)); + GGML_ASSERT(ne00 == ne10); if (nb10 == sizeof(float)) { - for (int ir = ith; ir < nr; ir += nth) { - // src0, src1 and dst are same shape => same indices - const int i3 = ir/(ne2*ne1); - const int i2 = (ir - i3*ne2*ne1)/ne1; - const int i1 = (ir - i3*ne2*ne1 - i2*ne1); + for (int64_t ir = ith; ir < nr; ir += nth) { + // src0 and dst are same shape => same indices + const int64_t i03 = ir/(ne02*ne01); + const int64_t i02 = (ir - i03*ne02*ne01)/ne01; + const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01); + const int64_t i13 = i03 % ne13; + const int64_t i12 = i02 % ne12; + const int64_t i11 = i01 % ne11; + + float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); + float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); + float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11); #ifdef GGML_USE_ACCELERATE UNUSED(ggml_vec_mul_f32); - vDSP_vmul( - (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1, - (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1, - (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ), 1, - ne0); + vDSP_vmul( src0_ptr, 1, src1_ptr, 1, dst_ptr, 1, ne00); #else - ggml_vec_mul_f32(ne0, - (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ), - (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), - (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11)); + ggml_vec_mul_f32(ne00, dst_ptr, src0_ptr, src1_ptr); #endif // } // } } } else { // src1 is not contiguous - for (int ir = ith; ir < nr; ir += nth) { - // src0, src1 and dst are same shape => same indices - const int i3 = ir/(ne2*ne1); - const int i2 = (ir - i3*ne2*ne1)/ne1; - const int i1 = (ir - i3*ne2*ne1 - i2*ne1); + for (int64_t ir = ith; ir < nr; ir += nth) { + // src0 and dst are same shape => same indices + // src1 is broadcastable across src0 and dst in i1, i2, i3 + const int64_t i03 = ir/(ne02*ne01); + const int64_t i02 = (ir - i03*ne02*ne01)/ne01; + const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01); - float * dst_ptr = (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ); - float * src0_ptr = (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); - for (int i0 = 0; i0 < ne0; i0++) { - float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11 + i0*nb10); + const int64_t i13 = i03 % ne13; + const int64_t i12 = i02 % ne12; + const int64_t i11 = i01 % ne11; + + float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); + float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); + + for (int64_t i0 = 0; i0 < ne00; i0++) { + float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i0*nb10); dst_ptr[i0] = src0_ptr[i0] * (*src1_ptr); } @@ -9003,7 +9159,7 @@ static void ggml_compute_forward_rms_norm_back( // ggml_compute_forward_mul_mat -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) // helper function to determine if it is better to use BLAS or not // for large matrices, BLAS is faster static bool ggml_compute_forward_mul_mat_use_blas( @@ -9044,7 +9200,7 @@ static void ggml_compute_forward_mul_mat_f32( const int64_t ne02 = src0->ne[2]; const int64_t ne03 = src0->ne[3]; -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) const int64_t ne10 = src1->ne[0]; #endif const int64_t ne11 = src1->ne[1]; @@ -9108,9 +9264,16 @@ static void ggml_compute_forward_mul_mat_f32( } return; } +#elif defined(GGML_USE_CLBLAST) + if (ggml_cl_can_mul_mat(src0, src1, dst)) { + if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) { + ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize); + } + return; + } #endif -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) { if (params->ith != 0) { return; @@ -9130,21 +9293,11 @@ static void ggml_compute_forward_mul_mat_f32( const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13); float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); -#if defined(GGML_USE_CLBLAST) - // zT = y * xT - ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T, - ne11, ne01, ne10, - 1.0f, y, ne10, - x, ne10, - 0.0f, d, ne01, - GGML_TYPE_F32); -#else cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, ne11, ne01, ne10, 1.0f, y, ne10, x, ne00, 0.0f, d, ne01); -#endif } } //printf("CBLAS F32 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3); @@ -9288,9 +9441,16 @@ static void ggml_compute_forward_mul_mat_f16_f32( } return; } +#elif defined(GGML_USE_CLBLAST) + if (ggml_cl_can_mul_mat(src0, src1, dst)) { + if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) { + ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize); + } + return; + } #endif -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) { GGML_ASSERT(nb10 == sizeof(float)); @@ -9320,20 +9480,6 @@ static void ggml_compute_forward_mul_mat_f16_f32( assert(id*sizeof(float) <= params->wsize); } -#if defined(GGML_USE_CLBLAST) - const float * x = wdata; - const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13); - - float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); - - // zT = y * xT - ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T, - ne11, ne01, ne10, - 1.0f, y, ne10, - x, ne10, - 0.0f, d, ne01, - GGML_TYPE_F32); -#else const float * x = wdata; const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13); @@ -9345,7 +9491,6 @@ static void ggml_compute_forward_mul_mat_f16_f32( 1.0f, y, ne10, x, ne00, 0.0f, d, ne01); -#endif } } @@ -9513,9 +9658,16 @@ static void ggml_compute_forward_mul_mat_q_f32( } return; } +#elif defined(GGML_USE_CLBLAST) + if (ggml_cl_can_mul_mat(src0, src1, dst)) { + if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) { + ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize); + } + return; + } #endif -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) { if (params->ith != 0) { return; @@ -9538,9 +9690,6 @@ static void ggml_compute_forward_mul_mat_q_f32( float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); -#if defined(GGML_USE_CLBLAST) - const void* x = (char *) src0->data + i03*nb03 + i02*nb02; -#else { size_t id = 0; for (int64_t i01 = 0; i01 < ne01; ++i01) { @@ -9552,23 +9701,12 @@ static void ggml_compute_forward_mul_mat_q_f32( } const float * x = wdata; -#endif -#if defined(GGML_USE_CLBLAST) - // zT = y * xT - ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T, - ne11, ne01, ne10, - 1.0f, y, ne10, - x, ne10, - 0.0f, d, ne01, - type); -#else cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, ne11, ne01, ne10, 1.0f, y, ne10, x, ne00, 0.0f, d, ne01); -#endif } } @@ -10252,6 +10390,7 @@ static void ggml_compute_forward_diag_mask_f32( const int n_past = ((int32_t *) src1->data)[0]; const bool inplace = (bool)((int32_t *) src1->data)[1]; + assert(n_past >= 0); if (!inplace && (params->type == GGML_TASK_INIT)) { @@ -10422,14 +10561,15 @@ static void ggml_compute_forward_alibi_f32( struct ggml_tensor * dst) { assert(params->ith == 0); assert(src1->type == GGML_TYPE_I32); - assert(ggml_nelements(src1) == 2); + assert(ggml_nelements(src1) == 3); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } - const int n_past = ((int32_t *) src1->data)[0]; - const int n_head = ((int32_t *) src1->data)[1]; + const int n_past = ((int32_t *) src1->data)[0]; + const int n_head = ((int32_t *) src1->data)[1]; + const float max_bias = ((float *) src1->data)[2]; assert(n_past >= 0); @@ -10452,8 +10592,8 @@ static void ggml_compute_forward_alibi_f32( // add alibi to src0 (KQ_scaled) const int n_heads_log2_floor = 1 << _bsr(n_head); // [jart] - const float m0 = exp2f(-8.0f / n_heads_log2_floor); - const float m1 = exp2f(-4.0f / n_heads_log2_floor); + const float m0 = exp2f(-(max_bias) / n_heads_log2_floor); + const float m1 = exp2f(-(max_bias / 2.0f) / n_heads_log2_floor); for (int i = 0; i < ne0; i++) { for (int j = 0; j < ne1; j++) { @@ -10471,7 +10611,8 @@ static void ggml_compute_forward_alibi_f32( m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1); } - pdst[0] = i * m_k + src[0]; + pdst[0] = (i-ne0+1) * m_k + src[0]; + } } } @@ -10485,14 +10626,15 @@ static void ggml_compute_forward_alibi_f16( struct ggml_tensor * dst) { assert(params->ith == 0); assert(src1->type == GGML_TYPE_I32); - assert(ggml_nelements(src1) == 2); + assert(ggml_nelements(src1) == 3); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } - const int n_past = ((int32_t *) src1->data)[0]; - const int n_head = ((int32_t *) src1->data)[1]; + const int n_past = ((int32_t *) src1->data)[0]; + const int n_head = ((int32_t *) src1->data)[1]; + const float max_bias = ((float *) src1->data)[2]; assert(n_past >= 0); @@ -10514,9 +10656,8 @@ static void ggml_compute_forward_alibi_f16( // add alibi to src0 (KQ_scaled) const int n_heads_log2_floor = 1 << _bsr(n_head); - - const float m0 = exp2f(-8.0f / n_heads_log2_floor); - const float m1 = exp2f(-4.0f / n_heads_log2_floor); + const float m0 = exp2f(-(max_bias) / n_heads_log2_floor); + const float m1 = exp2f(-(max_bias / 2.0f) / n_heads_log2_floor); for (int i = 0; i < ne0; i++) { for (int j = 0; j < ne1; j++) { @@ -10535,7 +10676,7 @@ static void ggml_compute_forward_alibi_f16( } // we return F32 - pdst[0] = i * m_k + GGML_FP16_TO_FP32(src[0]); + pdst[0] = (i-ne0+1) * m_k + GGML_FP16_TO_FP32(src[0]); } } } @@ -10572,6 +10713,78 @@ static void ggml_compute_forward_alibi( } } + +// ggml_compute_forward_clamp + +static void ggml_compute_forward_clamp_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + assert(params->ith == 0); + assert(src1->type == GGML_TYPE_I32); + assert(ggml_nelements(src1) == 2); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int min = ((float *) src1->data)[0]; + const int max = ((float *) src1->data)[1]; + + const int ith = params->ith; + const int nth = params->nth; + + const int n = ggml_nrows(src0); + const int nc = src0->ne[0]; + + const size_t nb00 = src0->nb[0]; + const size_t nb01 = src0->nb[1]; + + const size_t nb0 = dst->nb[0]; + const size_t nb1 = dst->nb[1]; + + GGML_ASSERT( nb0 == sizeof(float)); + GGML_ASSERT(nb00 == sizeof(float)); + + for (int j = ith; j < n; j += nth) { + float * dst_ptr = (float *) ((char *) dst->data + j*nb1); + float * src0_ptr = (float *) ((char *) src0->data + j*nb01); + + for (int i = 0; i < nc; i++) { + dst_ptr[i] = MAX(MIN(src0_ptr[i], max), min); + } + } +} + +static void ggml_compute_forward_clamp( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_clamp_f32(params, src0, src1, dst); + } break; + case GGML_TYPE_F16: + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q4_2: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q8_1: + case GGML_TYPE_I8: + case GGML_TYPE_I16: + case GGML_TYPE_I32: + case GGML_TYPE_COUNT: + { + GGML_ASSERT(false); + } break; + } +} + // ggml_compute_forward_rope static void ggml_compute_forward_rope_f32( @@ -12553,6 +12766,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_alibi(params, tensor->src0, tensor->src1, tensor); } break; + case GGML_OP_CLAMP: + { + ggml_compute_forward_clamp(params, tensor->src0, tensor->src1, tensor); + } break; case GGML_OP_CONV_1D_1S: { ggml_compute_forward_conv_1d_1s(params, tensor->src0, tensor->src1, tensor); @@ -12860,6 +13077,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor { GGML_ASSERT(false); // TODO: not implemented } break; + case GGML_OP_CLAMP: + { + GGML_ASSERT(false); // TODO: not implemented + } break; case GGML_OP_SILU: { // necessary for llama @@ -13318,11 +13539,19 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor * // reached a leaf node, not part of the gradient graph (e.g. a constant) GGML_ASSERT(cgraph->n_leafs < GGML_MAX_NODES); + if (strlen(node->name) == 0) { + snprintf(node->name, sizeof(node->name), "leaf_%d", cgraph->n_leafs); + } + cgraph->leafs[cgraph->n_leafs] = node; cgraph->n_leafs++; } else { GGML_ASSERT(cgraph->n_nodes < GGML_MAX_NODES); + if (strlen(node->name) == 0) { + snprintf(node->name, sizeof(node->name), "node_%d", cgraph->n_nodes); + } + cgraph->nodes[cgraph->n_nodes] = node; cgraph->grads[cgraph->n_nodes] = node->grad; cgraph->n_nodes++; @@ -13675,9 +13904,16 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) cur = ggml_cuda_mul_mat_get_wsize(node->src0, node->src1, node); } else +#elif defined(GGML_USE_CLBLAST) + if (ggml_cl_can_mul_mat(node->src0, node->src1, node)) { + node->n_tasks = 1; // TODO: this actually is doing nothing + // the threads are still spinning + cur = ggml_cl_mul_mat_get_wsize(node->src0, node->src1, node); + } + else #endif if (node->src0->type == GGML_TYPE_F16 && node->src1->type == GGML_TYPE_F32) { -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { node->n_tasks = 1; // TODO: this actually is doing nothing // the threads are still spinning @@ -13691,13 +13927,13 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) #endif } else if (node->src0->type == GGML_TYPE_F32 && node->src1->type == GGML_TYPE_F32) { cur = 0; -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { node->n_tasks = 1; } #endif } else if (ggml_is_quantized(node->src0->type) && node->src1->type == GGML_TYPE_F32) { -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { node->n_tasks = 1; cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]); @@ -13741,6 +13977,10 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) { node->n_tasks = 1; //TODO } break; + case GGML_OP_CLAMP: + { + node->n_tasks = 1; //TODO + } break; case GGML_OP_CONV_1D_1S: case GGML_OP_CONV_1D_2S: { @@ -14027,6 +14267,481 @@ void ggml_graph_reset(struct ggml_cgraph * cgraph) { } } +struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name) { + for (int i = 0; i < cgraph->n_leafs; i++) { + struct ggml_tensor * leaf = cgraph->leafs[i]; + + if (strcmp(leaf->name, name) == 0) { + return leaf; + } + } + + for (int i = 0; i < cgraph->n_nodes; i++) { + struct ggml_tensor * node = cgraph->nodes[i]; + + if (strcmp(node->name, name) == 0) { + return node; + } + } + + return NULL; +} + +static void ggml_graph_export_leaf(const struct ggml_tensor * tensor, FILE * fout) { + const int64_t * ne = tensor->ne; + const size_t * nb = tensor->nb; + + fprintf(fout, "%-6s %-12s %8d %8lld %8lld %8lld %8lld %16zu %16zu %16zu %16zu %16p %16s\n", + ggml_type_name(tensor->type), + ggml_op_name (tensor->op), + tensor->n_dims, + ne[0], ne[1], ne[2], ne[3], + nb[0], nb[1], nb[2], nb[3], + tensor->data, + tensor->name); +} + +static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char * arg, FILE * fout) { + const int64_t * ne = tensor->ne; + const size_t * nb = tensor->nb; + + fprintf(fout, "%-6s %-6s %-12s %8d %8lld %8lld %8lld %8lld %16zu %16zu %16zu %16zu %8d %16p %16s\n", + arg, + ggml_type_name(tensor->type), + ggml_op_name (tensor->op), + tensor->n_dims, + ne[0], ne[1], ne[2], ne[3], + nb[0], nb[1], nb[2], nb[3], + tensor->n_tasks, + tensor->data, + tensor->name); +} + +void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) { + assert(cgraph->work == NULL); + assert(cgraph->work_size == 0); + + uint64_t size_eval = 0; + + // compute size of intermediate results + // TODO: does not take into account scratch buffers !!!! + for (int i = 0; i < cgraph->n_nodes; ++i) { + size_eval += ggml_nbytes(cgraph->nodes[i]); + } + + // print + { + FILE * fout = stdout; + + fprintf(fout, "\n"); + fprintf(fout, "%-16s %8x\n", "magic", GGML_FILE_MAGIC); + fprintf(fout, "%-16s %8d\n", "version", GGML_FILE_VERSION); + fprintf(fout, "%-16s %8d\n", "leafs", cgraph->n_leafs); + fprintf(fout, "%-16s %8d\n", "nodes", cgraph->n_nodes); + fprintf(fout, "%-16s %8llu\n", "eval", size_eval); + + // header + fprintf(fout, "\n"); + fprintf(fout, "%-6s %-12s %8s %8s %8s %8s %8s %16s %16s %16s %16s %16s %16s\n", + "TYPE", "OP", "NDIMS", "NE0", "NE1", "NE2", "NE3", "NB0", "NB1", "NB2", "NB3", "DATA", "NAME"); + + for (int i = 0; i < cgraph->n_leafs; ++i) { + ggml_graph_export_leaf(cgraph->leafs[i], fout); + + GGML_ASSERT(cgraph->leafs[i]->op == GGML_OP_NONE); + GGML_ASSERT(cgraph->leafs[i]->src0 == NULL); + GGML_ASSERT(cgraph->leafs[i]->src1 == NULL); + } + + // header + fprintf(fout, "\n"); + fprintf(fout, "%-6s %-6s %-12s %8s %8s %8s %8s %8s %16s %16s %16s %16s %8s %16s %16s\n", + "ARG", "TYPE", "OP", "NDIMS", "NE0", "NE1", "NE2", "NE3", "NB0", "NB1", "NB2", "NB3", "NTASKS", "DATA", "NAME"); + + for (int i = 0; i < cgraph->n_nodes; ++i) { + ggml_graph_export_node(cgraph->nodes[i], "DST", fout); + + if (cgraph->nodes[i]->src0) { + ggml_graph_export_node(cgraph->nodes[i]->src0, "SRC0", fout); + } + + if (cgraph->nodes[i]->src1) { + ggml_graph_export_node(cgraph->nodes[i]->src1, "SRC1", fout); + } + + for (int j = 0; j < GGML_MAX_OPT; ++j) { + if (cgraph->nodes[i]->opt[j]) { + ggml_graph_export_node(cgraph->nodes[i]->opt[j], "OPT", fout); + } + } + + fprintf(fout, "\n"); + } + + fprintf(fout, "\n"); + } + + // write binary data + { + FILE * fout = fopen(fname, "wb"); + + if (!fout) { + fprintf(stderr, "%s: failed to open %s\n", __func__, fname); + return; + } + + // header + { + const uint32_t magic = GGML_FILE_MAGIC; + const uint32_t version = GGML_FILE_VERSION; + const uint32_t n_leafs = cgraph->n_leafs; + const uint32_t nodes = cgraph->n_nodes; + + fwrite(&magic, sizeof(uint32_t), 1, fout); + fwrite(&version, sizeof(uint32_t), 1, fout); + fwrite(&n_leafs, sizeof(uint32_t), 1, fout); + fwrite(&nodes, sizeof(uint32_t), 1, fout); + fwrite(&size_eval, sizeof(uint64_t), 1, fout); + } + + // leafs + { + for (int i = 0; i < cgraph->n_leafs; ++i) { + const struct ggml_tensor * tensor = cgraph->leafs[i]; + + const uint32_t type = tensor->type; + const uint32_t op = tensor->op; + const uint32_t n_dims = tensor->n_dims; + + fwrite(&type, sizeof(uint32_t), 1, fout); + fwrite(&op, sizeof(uint32_t), 1, fout); + fwrite(&n_dims, sizeof(uint32_t), 1, fout); + + for (int j = 0; j < GGML_MAX_DIMS; ++j) { + const uint64_t ne = tensor->ne[j]; + const uint64_t nb = tensor->nb[j]; + + fwrite(&ne, sizeof(uint64_t), 1, fout); + fwrite(&nb, sizeof(uint64_t), 1, fout); + } + + // store the pointer address + { + const uint64_t ptr = (uint64_t) tensor->data; + + fwrite(&ptr, sizeof(uint64_t), 1, fout); + } + + fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout); + + // dump the data + // TODO: pad this to 32 byte boundary + { + const size_t size = ggml_nbytes(tensor); + + fwrite(tensor->data, sizeof(char), size, fout); + } + } + } + + // nodes + { + for (int i = 0; i < cgraph->n_nodes; ++i) { + const struct ggml_tensor * tensor = cgraph->nodes[i]; + + const uint32_t type = tensor->type; + const uint32_t op = tensor->op; + const uint32_t n_dims = tensor->n_dims; + + fwrite(&type, sizeof(uint32_t), 1, fout); + fwrite(&op, sizeof(uint32_t), 1, fout); + fwrite(&n_dims, sizeof(uint32_t), 1, fout); + + for (int j = 0; j < GGML_MAX_DIMS; ++j) { + const uint64_t ne = tensor->ne[j]; + const uint64_t nb = tensor->nb[j]; + + fwrite(&ne, sizeof(uint64_t), 1, fout); + fwrite(&nb, sizeof(uint64_t), 1, fout); + } + + // store the pointer address + { + const uint64_t ptr = (uint64_t) tensor->data; + + fwrite(&ptr, sizeof(uint64_t), 1, fout); + } + + fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout); + + // output the op arguments + { + struct ggml_tensor * args[2 + GGML_MAX_OPT] = { NULL }; + + args[0] = tensor->src0; + args[1] = tensor->src1; + + for (int j = 0; j < GGML_MAX_OPT; ++j) { + args[2 + j] = tensor->opt[j]; + } + + for (int j = 0; j < 2 + GGML_MAX_OPT; ++j) { + if (args[j]) { + int32_t idx = -1; + + // check if leaf + { + for (int k = 0; k < cgraph->n_leafs; ++k) { + if (args[j] == cgraph->leafs[k]) { + idx = k; + break; + } + } + } + + // check if node + if (idx == -1) { + for (int k = 0; k < cgraph->n_nodes; ++k) { + if (args[j] == cgraph->nodes[k]) { + idx = GGML_MAX_NODES + k; + break; + } + } + } + + if (idx == -1) { + fprintf(stderr, "%s: failed to find tensor, arg = %d, node = %d\n", __func__, j, i); + return; + } + + fwrite(&idx, sizeof(int32_t), 1, fout); + } else { + const int32_t nul = -1; + + fwrite(&nul, sizeof(int32_t), 1, fout); + } + } + } + } + } + + fclose(fout); + } +} + +struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval) { + assert(*ctx_data == NULL); + assert(*ctx_eval == NULL); + + struct ggml_cgraph result = { 0 }; + + struct ggml_tensor * data = NULL; + + // read file into data + { + FILE * fin = fopen(fname, "rb"); + + if (!fin) { + fprintf(stderr, "%s: failed to open %s\n", __func__, fname); + return result; + } + + size_t fsize = 0; + + fseek(fin, 0, SEEK_END); + fsize = ftell(fin); + fseek(fin, 0, SEEK_SET); + + // create the data context + { + const size_t overhead = 1*ggml_tensor_overhead(); + + struct ggml_init_params params = { + .mem_size = fsize + overhead, + .mem_buffer = NULL, + .no_alloc = false, + }; + + *ctx_data = ggml_init(params); + + if (!*ctx_data) { + fprintf(stderr, "%s: failed to create ggml context\n", __func__); + return result; + } + } + + data = ggml_new_tensor_1d(*ctx_data, GGML_TYPE_I8, fsize); + + fread(data->data, sizeof(char), fsize, fin); + + fclose(fin); + } + + // populate result + { + char * ptr = (char *) data->data; + + const uint32_t magic = *(const uint32_t *) ptr; ptr += sizeof(magic); + + if (magic != GGML_FILE_MAGIC) { + fprintf(stderr, "%s: invalid magic number, got %08x\n", __func__, magic); + return result; + } + + const uint32_t version = *(const uint32_t *) ptr; ptr += sizeof(version); + + if (version != GGML_FILE_VERSION) { + fprintf(stderr, "%s: invalid version number\n", __func__); + return result; + } + + const uint32_t n_leafs = *(const uint32_t *) ptr; ptr += sizeof(n_leafs); + const uint32_t n_nodes = *(const uint32_t *) ptr; ptr += sizeof(n_nodes); + const uint64_t size_eval = *(const uint64_t *) ptr; ptr += sizeof(size_eval); + + result.n_leafs = n_leafs; + result.n_nodes = n_nodes; + + // create the data context + { + const size_t overhead = (n_leafs + n_nodes)*ggml_tensor_overhead(); + + struct ggml_init_params params = { + .mem_size = size_eval + overhead, + .mem_buffer = NULL, + .no_alloc = true, + }; + + *ctx_eval = ggml_init(params); + + if (!*ctx_eval) { + fprintf(stderr, "%s: failed to create ggml context\n", __func__); + return result; + } + } + + // leafs + { + uint32_t type; + uint32_t op; + uint32_t n_dims; + + for (uint32_t i = 0; i < n_leafs; ++i) { + type = *(const uint32_t *) ptr; ptr += sizeof(type); + op = *(const uint32_t *) ptr; ptr += sizeof(op); + n_dims = *(const uint32_t *) ptr; ptr += sizeof(n_dims); + + int64_t ne[GGML_MAX_DIMS]; + size_t nb[GGML_MAX_DIMS]; + + for (int j = 0; j < GGML_MAX_DIMS; ++j) { + uint64_t ne_cur; + uint64_t nb_cur; + + ne_cur = *(const uint64_t *) ptr; ptr += sizeof(ne_cur); + nb_cur = *(const uint64_t *) ptr; ptr += sizeof(nb_cur); + + ne[j] = ne_cur; + nb[j] = nb_cur; + } + + struct ggml_tensor * tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne); + + tensor->op = (enum ggml_op) op; + + uint64_t ptr_cur = *(const uint64_t *) ptr; ptr += sizeof(ptr_cur); + + memcpy(tensor->name, ptr, GGML_MAX_NAME); ptr += GGML_MAX_NAME; + + tensor->data = (void *) ptr; + + for (int j = 0; j < GGML_MAX_DIMS; ++j) { + tensor->nb[j] = nb[j]; + } + + result.leafs[i] = tensor; + + ptr += ggml_nbytes(tensor); + + fprintf(stderr, "%s: loaded leaf %d: '%16s', %3d dims, %9zu bytes\n", __func__, i, tensor->name, n_dims, ggml_nbytes(tensor)); + } + } + + ggml_set_no_alloc(*ctx_eval, false); + + // nodes + { + uint32_t type; + uint32_t op; + uint32_t n_dims; + + for (uint32_t i = 0; i < n_nodes; ++i) { + type = *(const uint32_t *) ptr; ptr += sizeof(type); + op = *(const uint32_t *) ptr; ptr += sizeof(op); + n_dims = *(const uint32_t *) ptr; ptr += sizeof(n_dims); + + int64_t ne[GGML_MAX_DIMS]; + size_t nb[GGML_MAX_DIMS]; + + for (int j = 0; j < GGML_MAX_DIMS; ++j) { + uint64_t ne_cur; + uint64_t nb_cur; + + ne_cur = *(const uint64_t *) ptr; ptr += sizeof(ne_cur); + nb_cur = *(const uint64_t *) ptr; ptr += sizeof(nb_cur); + + ne[j] = ne_cur; + nb[j] = nb_cur; + } + + struct ggml_tensor * tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne); + + tensor->op = (enum ggml_op) op; + + uint64_t ptr_cur = *(const uint64_t *) ptr; ptr += sizeof(ptr_cur); + + memcpy(tensor->name, ptr, GGML_MAX_NAME); ptr += GGML_MAX_NAME; + + for (int j = 0; j < GGML_MAX_DIMS; ++j) { + tensor->nb[j] = nb[j]; + } + + // parse args + { + struct ggml_tensor ** args[2 + GGML_MAX_OPT] = { + &tensor->src0, + &tensor->src1, + }; + + for (int j = 0; j < GGML_MAX_OPT; ++j) { + args[2 + j] = &tensor->opt[j]; + } + + for (int j = 0; j < 2 + GGML_MAX_OPT; ++j) { + const int32_t arg_idx = *(const int32_t *) ptr; ptr += sizeof(arg_idx); + + if (arg_idx == -1) { + continue; + } + + if (arg_idx < GGML_MAX_NODES) { + *args[j] = result.leafs[arg_idx]; + } else { + *args[j] = result.nodes[arg_idx - GGML_MAX_NODES]; + } + } + } + + result.nodes[i] = tensor; + + fprintf(stderr, "%s: loaded node %d: '%16s', %3d dims, %9zu bytes\n", __func__, i, tensor->name, n_dims, ggml_nbytes(tensor)); + } + } + } + + return result; +} + void ggml_graph_print(const struct ggml_cgraph * cgraph) { int64_t perf_total_per_op_us[GGML_OP_COUNT] = {0}; @@ -14044,7 +14759,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) { GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 ", %5" PRId64 "] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n", i, node->ne[0], node->ne[1], node->ne[2], - GGML_OP_LABEL[node->op], node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs, + GGML_OP_NAME[node->op], node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs, (double) node->perf_cycles / (double) ggml_cycles_per_ms(), (double) node->perf_cycles / (double) ggml_cycles_per_ms() / (double) node->perf_runs, (double) node->perf_time_us / 1000.0, @@ -14058,7 +14773,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) { GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 "] %8s\n", i, node->ne[0], node->ne[1], - GGML_OP_LABEL[node->op]); + GGML_OP_NAME[node->op]); } for (int i = 0; i < GGML_OP_COUNT; i++) { @@ -14066,7 +14781,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) { continue; } - GGML_PRINT("perf_total_per_op_us[%16s] = %7.3f ms\n", GGML_OP_LABEL[i], (double) perf_total_per_op_us[i] / 1000.0); + GGML_PRINT("perf_total_per_op_us[%16s] = %7.3f ms\n", GGML_OP_NAME[i], (double) perf_total_per_op_us[i] / 1000.0); } GGML_PRINT("========================================\n"); @@ -14137,9 +14852,12 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph fprintf(fp, "%s |", node->name); } - fprintf(fp, "%d [%" PRId64 ", %" PRId64 "] | %s", - i, node->ne[0], node->ne[1], - GGML_OP_SYMBOL[node->op]); + if (node->n_dims == 2) { + fprintf(fp, "%d [%" PRId64 ", %" PRId64 "] | %s", i, node->ne[0], node->ne[1], GGML_OP_SYMBOL[node->op]); + } else { + fprintf(fp, "%d [%" PRId64 ", %" PRId64 ", %" PRId64 "] | %s", i, node->ne[0], node->ne[1], node->ne[2], GGML_OP_SYMBOL[node->op]); + } + if (node->grad) { fprintf(fp, " | %s\"; ]\n", GGML_OP_SYMBOL[node->grad->op]); @@ -15036,7 +15754,7 @@ size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * return (n/QK8_0*sizeof(block_q8_0)); } -static const quantize_chunk_f *const ggjt_v2_quantize_chunk[GGML_TYPE_COUNT] = { +static const quantize_chunk_f *const ggjt_v3_quantize_chunk[GGML_TYPE_COUNT] = { [GGML_TYPE_Q4_0] = ggml_quantize_q4_0, [GGML_TYPE_Q4_1] = ggml_quantize_q4_1, [GGML_TYPE_Q5_0] = ggml_quantize_q5_0, @@ -15063,13 +15781,13 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i //////////////////////////////////////////////////////////////////////////////// -void ggjt_v2(void) { - GGML_BLCK_SIZE = ggjt_v2_blck_size; - GGML_TYPE_SIZE = ggjt_v2_type_size; - GGML_TYPE_NAME = ggjt_v2_type_name; - GGML_IS_QUANTIZED = ggjt_v2_is_quantized; - quantize_fns = ggjt_v2_quantize_fns; - GGML_QUANTIZE_CHUNK = ggjt_v2_quantize_chunk; +void ggjt_v3(void) { + GGML_BLCK_SIZE = ggjt_v3_blck_size; + GGML_TYPE_SIZE = ggjt_v3_type_size; + GGML_TYPE_NAME = ggjt_v3_type_name; + GGML_IS_QUANTIZED = ggjt_v3_is_quantized; + quantize_fns = ggjt_v3_quantize_fns; + GGML_QUANTIZE_CHUNK = ggjt_v3_quantize_chunk; } //////////////////////////////////////////////////////////////////////////////// diff --git a/third_party/ggml/ggml.h b/third_party/ggml/ggml.h index 26e1cc399..a81a4d768 100644 --- a/third_party/ggml/ggml.h +++ b/third_party/ggml/ggml.h @@ -191,11 +191,15 @@ COSMOPOLITAN_C_START_ #define GGML_FILE_MAGIC 0x67676d6c // "ggml" #define GGML_FILE_VERSION 1 +#define GGML_QNT_VERSION 2 // bump this on quantization format changes +#define GGML_QNT_VERSION_FACTOR 1000 // do not change this + #define GGML_MAX_DIMS 4 #define GGML_MAX_NODES 4096 -#define GGML_MAX_PARAMS 16 +#define GGML_MAX_PARAMS 256 #define GGML_MAX_CONTEXTS 64 #define GGML_MAX_OPT 4 +#define GGML_MAX_NAME 32 #define GGML_DEFAULT_N_THREADS 4 #define GGML_ASSERT(x) \ @@ -229,6 +233,7 @@ COSMOPOLITAN_C_START_ enum ggml_backend { GGML_BACKEND_CPU = 0, GGML_BACKEND_CUDA = 1, + GGML_BACKEND_CL = 2, }; // model file types @@ -294,6 +299,7 @@ COSMOPOLITAN_C_START_ GGML_OP_ROPE, GGML_OP_ROPE_BACK, GGML_OP_ALIBI, + GGML_OP_CLAMP, GGML_OP_CONV_1D_1S, GGML_OP_CONV_1D_2S, @@ -351,11 +357,13 @@ COSMOPOLITAN_C_START_ void * data; - char name[32]; + char name[GGML_MAX_NAME]; - char padding[16]; // TODO: remove and add padding to name? + char padding[16]; }; + static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor); + // computation graph struct ggml_cgraph { int n_nodes; @@ -393,6 +401,8 @@ COSMOPOLITAN_C_START_ // compatibilty // + // TODO(jart): These should be associated with each tensor. + GGML_API void ggjt_v3(void); GGML_API void ggjt_v2(void); GGML_API void ggjt_v1(void); @@ -415,6 +425,7 @@ COSMOPOLITAN_C_START_ GGML_API float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float GGML_API const char * ggml_type_name(enum ggml_type type); + GGML_API const char * ggml_op_name (enum ggml_op op); GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor); @@ -423,6 +434,9 @@ COSMOPOLITAN_C_START_ // TODO: temporary until model loading of ggml examples is refactored GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype); + // use this to compute the memory overhead of a tensor + GGML_API size_t ggml_tensor_overhead(void); + // main GGML_API struct ggml_context * ggml_init(struct ggml_init_params params); @@ -430,7 +444,11 @@ COSMOPOLITAN_C_START_ GGML_API size_t ggml_used_mem(const struct ggml_context * ctx); - GGML_API size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch); + GGML_API size_t ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch); + GGML_API void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc); + + GGML_API void * ggml_get_mem_buffer(struct ggml_context * ctx); + GGML_API size_t ggml_get_mem_size (struct ggml_context * ctx); GGML_API struct ggml_tensor * ggml_new_tensor( struct ggml_context * ctx, @@ -470,6 +488,8 @@ COSMOPOLITAN_C_START_ GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src); GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src); + GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name); + GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor); GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value); GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value); @@ -730,8 +750,6 @@ COSMOPOLITAN_C_START_ struct ggml_tensor * a, int64_t ne0); - // return view(a) - // TODO: when we start computing gradient, make a copy instead of view GGML_API struct ggml_tensor * ggml_reshape_2d( struct ggml_context * ctx, struct ggml_tensor * a, @@ -839,7 +857,7 @@ COSMOPOLITAN_C_START_ int n_past); // in-place, returns view(a) - GGML_API struct ggml_tensor * gml_diag_mask_zero_inplace( + GGML_API struct ggml_tensor * ggml_diag_mask_zero_inplace( struct ggml_context * ctx, struct ggml_tensor * a, int n_past); @@ -854,7 +872,6 @@ COSMOPOLITAN_C_START_ struct ggml_tensor * a); // rotary position embedding - // in-place, returns view(a) // if mode & 1 == 1, skip n_past elements // if mode & 2 == 1, GPT-NeoX style // TODO: avoid creating a new tensor every time @@ -888,7 +905,16 @@ COSMOPOLITAN_C_START_ struct ggml_context * ctx, struct ggml_tensor * a, int n_past, - int n_head); + int n_head, + float bias_max); + + // clamp + // in-place, returns view(a) + struct ggml_tensor * ggml_clamp( + struct ggml_context * ctx, + struct ggml_tensor * a, + float min, + float max); // padding = 1 // TODO: we don't support extra parameters for now @@ -950,6 +976,11 @@ COSMOPOLITAN_C_START_ GGML_API void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph); GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); + GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name); + + GGML_API void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname); + GGML_API struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval); + // print info and performance information for the graph GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph); diff --git a/third_party/ggml/ggml.mk b/third_party/ggml/ggml.mk index 3aee03ac9..b334dc145 100644 --- a/third_party/ggml/ggml.mk +++ b/third_party/ggml/ggml.mk @@ -24,8 +24,15 @@ THIRD_PARTY_GGML_A_HDRS = \ third_party/ggml/ggjt.v1.q5_1.h \ third_party/ggml/ggjt.v1.q8_0.h \ third_party/ggml/ggjt.v1.q8_1.h \ + third_party/ggml/ggjt.v2.q4_0.h \ + third_party/ggml/ggjt.v2.q4_1.h \ + third_party/ggml/ggjt.v2.q5_0.h \ + third_party/ggml/ggjt.v2.q5_1.h \ + third_party/ggml/ggjt.v2.q8_0.h \ + third_party/ggml/ggjt.v2.q8_1.h \ third_party/ggml/fp16.internal.h \ - third_party/ggml/ggjt.v1.internal.h + third_party/ggml/ggjt.v1.internal.h \ + third_party/ggml/ggjt.v2.internal.h THIRD_PARTY_GGML_A_SRCS = \ third_party/ggml/fp16.c \ @@ -37,11 +44,19 @@ THIRD_PARTY_GGML_A_SRCS = \ third_party/ggml/ggjt.v1.q5_0.c \ third_party/ggml/ggjt.v1.q5_1.c \ third_party/ggml/ggjt.v1.q8_0.c \ - third_party/ggml/ggjt.v1.q8_1.c + third_party/ggml/ggjt.v1.q8_1.c \ + third_party/ggml/ggjt.v2.c \ + third_party/ggml/ggjt.v2.q4_0.c \ + third_party/ggml/ggjt.v2.q4_1.c \ + third_party/ggml/ggjt.v2.q5_0.c \ + third_party/ggml/ggjt.v2.q5_1.c \ + third_party/ggml/ggjt.v2.q8_0.c \ + third_party/ggml/ggjt.v2.q8_1.c THIRD_PARTY_GGML_A_DIRECTDEPS = \ LIBC_CALLS \ LIBC_INTRIN \ + LIBC_FMT \ LIBC_MEM \ LIBC_NEXGEN32E \ LIBC_RUNTIME \ diff --git a/third_party/ggml/llama.cc b/third_party/ggml/llama.cc index bfd2e8c9c..32048ba7a 100644 --- a/third_party/ggml/llama.cc +++ b/third_party/ggml/llama.cc @@ -68,6 +68,7 @@ asm(".include \"libc/disclaimer.inc\""); // available llama models enum e_model { MODEL_UNKNOWN, + MODEL_3B, MODEL_7B, MODEL_13B, MODEL_30B, @@ -83,6 +84,7 @@ static const size_t MB = 1024*1024; static const std::map & MEM_REQ_SCRATCH0() { static std::map _MEM_REQ_SCRATCH0 = { + { MODEL_3B, 128ull * MB }, { MODEL_7B, 512ull * MB }, { MODEL_13B, 512ull * MB }, { MODEL_30B, 512ull * MB }, @@ -94,6 +96,7 @@ static const std::map & MEM_REQ_SCRATCH0() static const std::map & MEM_REQ_SCRATCH1() { static std::map _MEM_REQ_SCRATCH1 = { + { MODEL_3B, 128ull * MB }, { MODEL_7B, 512ull * MB }, { MODEL_13B, 512ull * MB }, { MODEL_30B, 512ull * MB }, @@ -106,6 +109,7 @@ static const std::map & MEM_REQ_SCRATCH1() static const std::map & MEM_REQ_KV_SELF() { static std::map k_sizes = { + { MODEL_3B, 682ull * MB }, { MODEL_7B, 1026ull * MB }, { MODEL_13B, 1608ull * MB }, { MODEL_30B, 3124ull * MB }, @@ -119,6 +123,7 @@ static const std::map & MEM_REQ_KV_SELF() static const std::map & MEM_REQ_EVAL() { static std::map k_sizes = { + { MODEL_3B, 512ull * MB }, { MODEL_7B, 768ull * MB }, { MODEL_13B, 1024ull * MB }, { MODEL_30B, 1280ull * MB }, @@ -436,6 +441,7 @@ enum llama_file_version { LLAMA_FILE_VERSION_GGMF_V1, // added version field and scores in vocab LLAMA_FILE_VERSION_GGJT_V1, // adopted unified aligned mappable layout LLAMA_FILE_VERSION_GGJT_V2, // changed quantization format + LLAMA_FILE_VERSION_GGJT_V3, // changed Q4 and Q8 quantization format }; struct llama_file_loader { @@ -457,22 +463,25 @@ struct llama_file_loader { uint32_t magic = file.read_u32(); uint32_t version = 0; - if (magic != READ32BE("ggml")) { + if (magic != LLAMA_FILE_MAGIC_GGML) { version = file.read_u32(); } - if (magic == READ32BE("ggml") && version == 0) { + if (magic == LLAMA_FILE_MAGIC_GGML && version == 0) { file_version = LLAMA_FILE_VERSION_GGML; ggjt_v1(); - } else if (magic == READ32BE("ggmf") && version == 1) { + } else if (magic == LLAMA_FILE_MAGIC_GGMF && version == 1) { file_version = LLAMA_FILE_VERSION_GGMF_V1; ggjt_v1(); - } else if (magic == READ32BE("ggjt") && version == 1) { + } else if (magic == LLAMA_FILE_MAGIC_GGJT && version == 1) { file_version = LLAMA_FILE_VERSION_GGJT_V1; ggjt_v1(); - } else if (magic == READ32BE("ggjt") && version == 2) { + } else if (magic == LLAMA_FILE_MAGIC_GGJT && version == 2) { file_version = LLAMA_FILE_VERSION_GGJT_V2; ggjt_v2(); + } else if (magic == LLAMA_FILE_MAGIC_GGJT && version == 3) { + file_version = LLAMA_FILE_VERSION_GGJT_V3; + ggjt_v3(); } else { Die("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?", magic, version); @@ -570,9 +579,9 @@ struct llama_file_saver { write_vocab(); } void write_magic() { - ggjt_v2(); - file.write_u32(READ32BE("ggjt")); // magic - file.write_u32(2); // version + ggjt_v3(); + file.write_u32(LLAMA_FILE_MAGIC); // magic + file.write_u32(LLAMA_FILE_VERSION); // version } void write_hparams(enum llama_ftype new_ftype) { const llama_hparams & hparams = any_file_loader->hparams; @@ -614,7 +623,7 @@ struct llama_file_saver { file.write_u32(new_type); file.write_raw(tensor.ne.data(), sizeof(tensor.ne[0]) * tensor.ne.size()); file.write_raw(tensor.name.data(), tensor.name.size()); - file.seek(-file.tell() & 31, SEEK_CUR); + file.seek(-static_cast(file.tell()) & 31, SEEK_CUR); LLAMA_ASSERT(new_size == llama_calc_tensor_size(tensor.ne, new_type)); file.write_raw(new_data, new_size); } @@ -629,12 +638,12 @@ struct llama_model_loader { std::unique_ptr mapping; llama_model_loader(const std::string & fname_base, bool use_mmap, bool vocab_only) { - auto first_file = new llama_file_loader(fname_base.c_str(), 0, tensors_map); + auto * first_file = new llama_file_loader(fname_base.c_str(), 0, tensors_map); file_loaders.emplace_back(first_file); uint32_t n_parts = vocab_only ? 1 : guess_n_parts(); for (uint32_t i = 1; i < n_parts; i++) { std::string fname = fname_base + "." + std::to_string(i); - auto ith_file = new llama_file_loader(fname.c_str(), i, tensors_map); + auto * ith_file = new llama_file_loader(fname.c_str(), i, tensors_map); file_loaders.emplace_back(ith_file); if (ith_file->hparams != first_file->hparams) { Die("llama.cpp: hparams inconsistent between files"); @@ -681,7 +690,7 @@ struct llama_model_loader { } } - struct ggml_tensor * get_tensor(const std::string & name, const std::vector & ne) { + struct ggml_tensor * get_tensor(const std::string & name, const std::vector & ne, ggml_backend backend) { auto it = tensors_map.name_to_idx.find(name); if (it == tensors_map.name_to_idx.end()) { Die("llama.cpp: tensor '%s' is missing from model", name.c_str()); @@ -692,10 +701,10 @@ struct llama_model_loader { name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str()); } - return get_tensor_for(lt); + return get_tensor_for(lt, backend); } - struct ggml_tensor * get_tensor_for(llama_load_tensor & lt) { + struct ggml_tensor * get_tensor_for(llama_load_tensor & lt, ggml_backend backend) { struct ggml_tensor * tensor; if (lt.ne.size() == 2) { tensor = ggml_new_tensor_2d(ggml_ctx, lt.type, lt.ne.at(0), lt.ne.at(1)); @@ -705,12 +714,13 @@ struct llama_model_loader { } ggml_set_name(tensor, lt.name.c_str()); LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor + tensor->backend = backend; lt.ggml_tensor = tensor; num_ggml_tensors_created++; return tensor; } - void done_getting_tensors() { + void done_getting_tensors() const { if (num_ggml_tensors_created != tensors_map.tensors.size()) { Die("llama.cpp: file contained more tensors than expected"); } @@ -718,12 +728,16 @@ struct llama_model_loader { void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) { size_t data_size = 0; + size_t prefetch_size = 0; for (const llama_load_tensor & lt : tensors_map.tensors) { data_size += lt.size; + if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) { + prefetch_size += lt.size; + } } if (use_mmap) { - mapping.reset(new llama_mmap(&file_loaders.at(0)->file)); + mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size)); if (!lmlock) { // Don't call the callback since the actual loading will be lazy // and we can't measure it. @@ -736,6 +750,9 @@ struct llama_model_loader { size_t done_size = 0; for (llama_load_tensor & lt : tensors_map.tensors) { + if (lt.ggml_tensor->backend != GGML_BACKEND_CPU) { + continue; + } if (progress_callback) { progress_callback((float) done_size / data_size, progress_callback_user_data); } @@ -853,8 +870,9 @@ struct llama_context_params llama_context_default_params() { struct llama_context_params result = { /*.n_ctx =*/ 512, /*.n_parts =*/ -1, + /*.gpu_layers =*/ 0, /*.seed =*/ -1, - /*.f16_kv =*/ false, + /*.f16_kv =*/ true, /*.logits_all =*/ false, /*.vocab_only =*/ false, /*.use_mmap =*/ true, @@ -875,6 +893,21 @@ bool llama_mlock_supported() { return llama_mlock::SUPPORTED; } +void llama_init_backend() { + ggml_time_init(); + + // needed to initialize f16 tables + { + struct ggml_init_params params = { 0, NULL, false }; + struct ggml_context * ctx = ggml_init(params); + ggml_free(ctx); + } +} + +int64_t llama_time_us() { + return ggml_time_us(); +} + // // model loading // @@ -884,7 +917,8 @@ static const char *llama_file_version_name(llama_file_version version) { case LLAMA_FILE_VERSION_GGML: return "'ggml' (old version with low tokenizer quality and no mmap support)"; case LLAMA_FILE_VERSION_GGMF_V1: return "ggmf v1 (pre #613 with sharded files and no mmap support)"; case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 (pre #1405)"; - case LLAMA_FILE_VERSION_GGJT_V2: return "ggjt v2 (latest)"; + case LLAMA_FILE_VERSION_GGJT_V2: return "ggjt v2 (pre #1508)"; + case LLAMA_FILE_VERSION_GGJT_V3: return "ggjt v3 (latest)"; default: LLAMA_ASSERT(false); } } @@ -907,6 +941,7 @@ static const char *llama_ftype_name(enum llama_ftype ftype) { static const char *llama_model_type_name(e_model type) { switch (type) { + case MODEL_3B: return "3B"; case MODEL_7B: return "7B"; case MODEL_13B: return "13B"; case MODEL_30B: return "30B"; @@ -919,6 +954,7 @@ static void llama_model_load_internal( const std::string & fname, llama_context & lctx, int n_ctx, + int n_gpu_layers, ggml_type memory_type, bool use_mmap, bool use_mlock, @@ -940,6 +976,7 @@ static void llama_model_load_internal( { switch (hparams.n_layer) { + case 26: model.type = e_model::MODEL_3B; break; case 32: model.type = e_model::MODEL_7B; break; case 40: model.type = e_model::MODEL_13B; break; case 60: model.type = e_model::MODEL_30B; break; @@ -986,29 +1023,7 @@ static void llama_model_load_internal( size_t ctx_size, mmapped_size; ml->calc_sizes(&ctx_size, &mmapped_size); if (verbose > 0) { - fprintf(stderr, "%s: ggml ctx size = %6.2f KB\n", __func__, ctx_size/1024.0); - } - - // print memory requirements - { - const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1; - - // this is the total memory required to run the inference - const size_t mem_required = - ctx_size + - mmapped_size + - MEM_REQ_SCRATCH0().at(model.type) + - MEM_REQ_SCRATCH1().at(model.type) + - MEM_REQ_EVAL().at(model.type); - - // this is the memory required by one llama_state - const size_t mem_required_state = - scale*MEM_REQ_KV_SELF().at(model.type); - - if (verbose > 0) { - fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__, - mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0); - } + fprintf(stderr, "%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/1024.0/1024.0); } // create the ggml context @@ -1031,7 +1046,14 @@ static void llama_model_load_internal( } } +#ifdef GGML_USE_CUBLAS +#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CUDA +#else +#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CPU +#endif + // prepare memory for the weights + size_t vram_total = 0; { const auto & hparams = model.hparams; @@ -1041,33 +1063,87 @@ static void llama_model_load_internal( ml->ggml_ctx = ctx; - model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab}); - model.norm = ml->get_tensor("norm.weight", {n_embd}); - model.output = ml->get_tensor("output.weight", {n_embd, n_vocab}); + model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab}, GGML_BACKEND_CPU); + model.norm = ml->get_tensor("norm.weight", {n_embd}, GGML_BACKEND_CPU); + + // "output" tensor + { + ggml_backend backend_output; + if (n_gpu_layers > int(n_layer)) { // NOLINT + backend_output = LLAMA_BACKEND_OFFLOAD; + } else { + backend_output = GGML_BACKEND_CPU; + } + + model.output = ml->get_tensor("output.weight", {n_embd, n_vocab}, backend_output); + } + + const int i_gpu_start = n_layer - n_gpu_layers; model.layers.resize(n_layer); for (uint32_t i = 0; i < n_layer; ++i) { + const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; + auto & layer = model.layers[i]; std::string layers_i = "layers." + std::to_string(i); - layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd}); + layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd}, backend); - layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}); - layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd}); - layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd}); - layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}); + layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}, backend); + layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd}, backend); + layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd}, backend); + layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}, backend); - layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}); + layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}, backend); - layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}); - layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}); - layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}); + layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}, backend); + layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}, backend); + layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}, backend); + + if (backend == GGML_BACKEND_CUDA) { + vram_total += + ggml_nbytes(layer.attention_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) + + ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.attention_norm) + + ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3); + } } } ml->done_getting_tensors(); + // print memory requirements + { + const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1; + + // this is the total memory required to run the inference + const size_t mem_required = + ctx_size + + mmapped_size - vram_total + // weights in VRAM not in memory + MEM_REQ_SCRATCH0().at(model.type) + + MEM_REQ_SCRATCH1().at(model.type) + + MEM_REQ_EVAL().at(model.type); + + // this is the memory required by one llama_state + const size_t mem_required_state = + scale*MEM_REQ_KV_SELF().at(model.type); + + fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__, + mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0); + +#ifdef GGML_USE_CUBLAS + const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer)); + + fprintf(stderr, "%s: [cublas] offloading %d layers to GPU\n", __func__, n_gpu); + if (n_gpu_layers > (int) hparams.n_layer) { + fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__); + } + fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024); +#elif !defined(GGML_USE_CLBLAST) + (void) n_gpu_layers; +#endif + } + // populate `tensors_by_name` for (llama_load_tensor & lt : ml->tensors_map.tensors) { model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor); @@ -1075,6 +1151,59 @@ static void llama_model_load_internal( ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL); +#ifdef GGML_USE_CUBLAS + { + size_t done_size = 0; + size_t data_size = 0; + for (llama_load_tensor & lt : ml->tensors_map.tensors) { + data_size += lt.size; + if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) { + done_size += lt.size; + } + } + for (llama_load_tensor & lt : ml->tensors_map.tensors) { + if (lt.ggml_tensor->backend != GGML_BACKEND_CUDA) { + continue; + } + if (progress_callback) { + progress_callback((float) done_size / data_size, progress_callback_user_data); + } + ggml_cuda_load_data(fname.c_str(), lt.ggml_tensor, lt.shards.at(0).file_off); + done_size += lt.size; + } + } +#elif defined(GGML_USE_CLBLAST) + { + const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer)); + + fprintf(stderr, "ggml_opencl: offloading %d layers to GPU\n", n_gpu); + + size_t vram_total = 0; + + for (int i = 0; i < n_gpu; ++i) { + const auto & layer = model.layers[i]; + + ggml_cl_transform_tensor(layer.wq); vram_total += ggml_nbytes(layer.wq); + ggml_cl_transform_tensor(layer.wk); vram_total += ggml_nbytes(layer.wk); + ggml_cl_transform_tensor(layer.wv); vram_total += ggml_nbytes(layer.wv); + ggml_cl_transform_tensor(layer.wo); vram_total += ggml_nbytes(layer.wo); + ggml_cl_transform_tensor(layer.w1); vram_total += ggml_nbytes(layer.w1); + ggml_cl_transform_tensor(layer.w2); vram_total += ggml_nbytes(layer.w2); + ggml_cl_transform_tensor(layer.w3); vram_total += ggml_nbytes(layer.w3); + } + if (n_gpu_layers > (int) hparams.n_layer) { + fprintf(stderr, "ggml_opencl: offloading output layer to GPU\n"); + ggml_cl_transform_tensor(model.output); vram_total += ggml_nbytes(model.output); + } + + fprintf(stderr, "ggml_opencl: total VRAM used: %zu MB\n", vram_total / 1024 / 1024); + } +#endif + + if (progress_callback) { + progress_callback(1.0f, progress_callback_user_data); + } + model.mapping = std::move(ml->mapping); // loading time will be recalculate after the first eval, so @@ -1086,6 +1215,7 @@ static bool llama_model_load( const std::string & fname, llama_context & lctx, int n_ctx, + int n_gpu_layers, ggml_type memory_type, bool use_mmap, bool use_mlock, @@ -1094,7 +1224,7 @@ static bool llama_model_load( void *progress_callback_user_data, int verbose) { // try { - llama_model_load_internal(fname, lctx, n_ctx, memory_type, use_mmap, use_mlock, + llama_model_load_internal(fname, lctx, n_ctx, n_gpu_layers, memory_type, use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data, verbose); return true; @@ -1175,10 +1305,8 @@ static bool llama_eval_internal( { cur = ggml_rms_norm(ctx0, inpL); - // cur = attention_norm*cur - cur = ggml_mul(ctx0, - ggml_repeat(ctx0, model.layers[il].attention_norm, cur), - cur); + // cur = cur*attention_norm(broadcasted) + cur = ggml_mul(ctx0, cur, model.layers[il].attention_norm); } // self-attention @@ -1285,10 +1413,8 @@ static bool llama_eval_internal( { cur = ggml_rms_norm(ctx0, inpFF); - // cur = ffn_norm*cur - cur = ggml_mul(ctx0, - ggml_repeat(ctx0, model.layers[il].ffn_norm, cur), - cur); + // cur = cur*ffn_norm(broadcasted) + cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm); } struct ggml_tensor * tmp = ggml_mul_mat(ctx0, @@ -1325,10 +1451,8 @@ static bool llama_eval_internal( inpL = ggml_rms_norm(ctx0, inpL); - // inpL = norm*inpL - inpL = ggml_mul(ctx0, - ggml_repeat(ctx0, model.norm, inpL), - inpL); + // inpL = inpL*norm(broadcasted) + inpL = ggml_mul(ctx0, inpL, model.norm); embeddings = inpL; } @@ -2149,10 +2273,11 @@ struct llama_context * llama_init_from_file( unsigned percentage = (unsigned) (100 * progress); while (percentage > *cur_percentage_p) { ++*cur_percentage_p; - fprintf(stderr, "."); + // TODO(jart): Fix this. + // fprintf(stderr, "."); fflush(stderr); if (percentage >= 100) { - fprintf(stderr, "\n"); + // fprintf(stderr, "\n"); } } }; @@ -2163,7 +2288,7 @@ struct llama_context * llama_init_from_file( ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32; - if (!llama_model_load(path_model, *ctx, params.n_ctx, memory_type, + if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_gpu_layers, memory_type, params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback, params.progress_callback_user_data, verbose)) { @@ -2242,7 +2367,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * { uint32_t magic; fin.read((char *) &magic, sizeof(magic)); - if (magic != READ32BE("ggla")) { + if (magic != LLAMA_FILE_MAGIC_GGLA) { fprintf(stderr, "%s: bad file magic\n", __func__); return 1; } @@ -2306,7 +2431,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * // maybe this should in llama_model_loader if (model_loader->use_mmap) { - model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ false)); + model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ 0)); } } @@ -2399,7 +2524,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * } size_t idx = model_loader->tensors_map.name_to_idx[base_name]; llama_load_tensor & lt = model_loader->tensors_map.tensors[idx]; - base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }); + base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }, GGML_BACKEND_CPU); lt.data = (uint8_t *) lt.ggml_tensor->data; model_loader->load_data_for(lt); lt.ggml_tensor->data = lt.data; @@ -2453,7 +2578,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * n_tensors++; if (n_tensors % 4 == 0) { - fprintf(stderr, "."); + // fprintf(stderr, "."); } } } @@ -2586,9 +2711,10 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) { if (kv_size) { const size_t elt_size = ggml_element_size(kv_self.k); - char buffer[4096]; + llama_buffer buffer; // [jart] fixed buffer alignment bug + buffer.resize(4096); - ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true }); + ggml_context * cpy_ctx = ggml_init({ buffer.size, buffer.addr, /* no_alloc */ true }); ggml_cgraph gf{}; gf.n_threads = 1; diff --git a/third_party/ggml/llama.h b/third_party/ggml/llama.h index 9c28a60dc..b44d15106 100644 --- a/third_party/ggml/llama.h +++ b/third_party/ggml/llama.h @@ -20,12 +20,23 @@ # define LLAMA_API #endif -#define LLAMA_FILE_VERSION 1 -#define LLAMA_FILE_MAGIC READ32BE("ggjt") -#define LLAMA_FILE_MAGIC_UNVERSIONED READ32BE("ggml") -#define LLAMA_SESSION_MAGIC READ32BE("ggsn") +#define LLAMA_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt' +#define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla' +#define LLAMA_FILE_MAGIC_GGMF 0x67676d66u // 'ggmf' +#define LLAMA_FILE_MAGIC_GGML 0x67676d6cu // 'ggml' +#define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn' + +#define LLAMA_FILE_VERSION 3 +#define LLAMA_FILE_MAGIC LLAMA_FILE_MAGIC_GGJT +#define LLAMA_FILE_MAGIC_UNVERSIONED LLAMA_FILE_MAGIC_GGML +#define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN #define LLAMA_SESSION_VERSION 1 +#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) +// Defined when llama.cpp is compiled with support for offloading model layers to GPU. +#define LLAMA_SUPPORTS_GPU_OFFLOAD +#endif + #ifdef __cplusplus extern "C" { #endif @@ -55,9 +66,10 @@ extern "C" { typedef void (*llama_progress_callback)(float progress, void *ctx); struct llama_context_params { - int n_ctx; // text context - int n_parts; // -1 for default - int seed; // RNG seed, -1 for random + int n_ctx; // text context + int n_parts; // -1 for default + int n_gpu_layers; // number of layers to store in VRAM + int seed; // RNG seed, -1 for random bool f16_kv; // use fp16 for KV cache bool logits_all; // the llama_eval() call computes all logits, not just the last one @@ -74,16 +86,16 @@ extern "C" { // model file types enum llama_ftype { - LLAMA_FTYPE_ALL_F32 = 0, - LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors - LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors - LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors + LLAMA_FTYPE_ALL_F32 = 0, + LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16 - LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors // LLAMA_FTYPE_MOSTLY_Q4_3 (6) support has been removed - LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors - LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors - LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors }; LLAMA_API struct llama_context_params llama_context_default_params(); @@ -91,6 +103,13 @@ extern "C" { LLAMA_API bool llama_mmap_supported(); LLAMA_API bool llama_mlock_supported(); + // TODO: not great API - very likely to change + // Initialize the llama + ggml backend + // Call once at the start of the program + LLAMA_API void llama_init_backend(); + + LLAMA_API int64_t llama_time_us(); + // Various functions for loading a ggml llama model. // Allocate (almost) all memory needed for the model. // Return NULL on failure diff --git a/third_party/ggml/llama_util.h b/third_party/ggml/llama_util.h index ae43af616..74504f7e6 100755 --- a/third_party/ggml/llama_util.h +++ b/third_party/ggml/llama_util.h @@ -387,7 +387,7 @@ struct llama_buffer { void resize(size_t size) { free(addr); - addr = (uint8_t *)memalign(32, size); + addr = (uint8_t *)memalign(32, size); // [jart] always avx align this->size = size; } diff --git a/third_party/ggml/main.cc b/third_party/ggml/main.cc index 8a399f825..b59f9670f 100644 --- a/third_party/ggml/main.cc +++ b/third_party/ggml/main.cc @@ -210,17 +210,11 @@ static int on_missing_feature(const char *name) { return 1; } -void MakeProcessNice(void) { - setpriority(PRIO_PROCESS, 0, 10); - ioprio_set(IOPRIO_WHO_PROCESS, 0, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); - struct sched_param param = {sched_get_priority_min(SCHED_IDLE)}; - sched_setscheduler(0, SCHED_IDLE, ¶m); -} - int main(int argc, char ** argv) { MakeProcessNice(); ShowCrashReports(); + setvbuf(stdin, NULL, _IONBF, 0); setvbuf(stdout, NULL, _IONBF, 0); setvbuf(stderr, NULL, _IONBF, 0); @@ -232,9 +226,7 @@ int main(int argc, char ** argv) { if (!X86_HAVE(AVX)) return on_missing_feature("avx"); if (!X86_HAVE(FMA)) return on_missing_feature("fma"); if (!X86_HAVE(SSE3)) return on_missing_feature("sse3"); - if (!X86_HAVE(F16C)) { - fprintf(stderr, "%s: warning: cpuid f16c not detected; inference might crash\n", __func__); - } + if (!X86_HAVE(F16C)) return on_missing_feature("f16c"); #endif /* __x86_64__ */ if (gpt_params_parse(argc, argv, params) == false) { diff --git a/third_party/ggml/quantize.cc b/third_party/ggml/quantize.cc index eaebc082f..2d179ddce 100644 --- a/third_party/ggml/quantize.cc +++ b/third_party/ggml/quantize.cc @@ -55,9 +55,10 @@ static const std::map LLAMA_FTYPE_MAP = { // ./quantize models/llama/ggml-model.bin models/llama/ggml-model-quant.bin type [nthreads] // int main(int argc, char ** argv) { + MakeProcessNice(); ShowCrashReports(); - ggjt_v2(); + ggjt_v3(); ggml_time_init(); if (argc < 3) { @@ -69,11 +70,7 @@ int main(int argc, char ** argv) { } // needed to initialize f16 tables - { - struct ggml_init_params params = { 0, NULL, false }; - struct ggml_context * ctx = ggml_init(params); - ggml_free(ctx); - } + llama_init_backend(); const std::string fname_inp = argv[1]; const std::string fname_out = argv[2]; @@ -95,7 +92,7 @@ int main(int argc, char ** argv) { ftype = (enum llama_ftype)atoi(argv[3]); } - int nthread = argc > 4 ? atoi(argv[4]) : 0; + int nthread = argc > 4 ? atoi(argv[4]) : std::min(20, std::max(1, _getcpucount() >> 1)); const int64_t t_main_start_us = ggml_time_us(); diff --git a/third_party/radpajama/copy-gptneox.cc b/third_party/radpajama/copy-gptneox.cc index 649e2ae7c..0df494882 100644 --- a/third_party/radpajama/copy-gptneox.cc +++ b/third_party/radpajama/copy-gptneox.cc @@ -48,6 +48,7 @@ static const std::map GPTNEOX_FTYPE_MAP = { // ./quantize models/llama/ggml-model.bin models/llama/ggml-model-quant.bin type // int main(int argc, char ** argv) { + MakeProcessNice(); ShowCrashReports(); ggjt_v1(); diff --git a/third_party/radpajama/main-redpajama-chat.cc b/third_party/radpajama/main-redpajama-chat.cc index 00946f418..dff7b3d6a 100644 --- a/third_party/radpajama/main-redpajama-chat.cc +++ b/third_party/radpajama/main-redpajama-chat.cc @@ -99,6 +99,7 @@ int main(int argc, char ** argv) { params.instruct = true; params.interactive = true; + MakeProcessNice(); ShowCrashReports(); if (gpt_params_parse(argc, argv, params) == false) { return 1; } @@ -136,6 +137,7 @@ int main(int argc, char ** argv) { } } + MakeProcessNice(); ShowCrashReports(); // Always interactive for RedPajama chat model diff --git a/third_party/radpajama/main-redpajama.cc b/third_party/radpajama/main-redpajama.cc index 0aad11063..d6a5af27d 100644 --- a/third_party/radpajama/main-redpajama.cc +++ b/third_party/radpajama/main-redpajama.cc @@ -84,6 +84,7 @@ int main(int argc, char ** argv) { gpt_params params; params.model = "./examples/redpajama/models/pythia/ggml-RedPajama-INCITE-Instruct-3B-v1-f16.bin"; + MakeProcessNice(); ShowCrashReports(); if (gpt_params_parse(argc, argv, params) == false) { diff --git a/third_party/radpajama/quantize-gptneox.cc b/third_party/radpajama/quantize-gptneox.cc index 826117bae..182188cd2 100644 --- a/third_party/radpajama/quantize-gptneox.cc +++ b/third_party/radpajama/quantize-gptneox.cc @@ -50,6 +50,7 @@ static const std::map GPTNEOX_FTYPE_MAP = { // ./quantize models/llama/ggml-model.bin models/llama/ggml-model-quant.bin type // int main(int argc, char ** argv) { + MakeProcessNice(); ShowCrashReports(); ggjt_v2();