From a0cfed1e3052d610998fd9cab6ad5ec859de04e7 Mon Sep 17 00:00:00 2001 From: Concedo <39025047+LostRuins@users.noreply.github.com> Date: Sat, 20 May 2023 15:58:33 +0800 Subject: [PATCH] still merging in process --- ggml-opencl-legacy.h | 15 - ggml-opencl.h | 35 - ggml.c | 4 +- gpttype_adapter.cpp | 2 +- llamaextra.cpp | 86 - llamaextra.h | 18 - .../ggml_v2-opencl-legacy.c | 54 +- otherarch/ggml_v2-opencl-legacy.h | 15 + .../ggml_v2-opencl.cpp | 2 +- otherarch/ggml_v2-opencl.h | 35 + ggml_v2.c => otherarch/ggml_v2.c | 8264 ++++++++--------- otherarch/ggml_v2.h | 1143 +++ otherarch/gpt2_v2.cpp | 322 +- otherarch/gptj_v2.cpp | 270 +- llama.cpp => otherarch/llama_v2.cpp | 1319 +-- llama.h => otherarch/llama_v2.h | 183 +- otherarch/neox_v2.cpp | 298 +- otherarch/otherarch.h | 134 + otherarch/rwkv_v2.cpp | 348 +- 19 files changed, 6903 insertions(+), 5644 deletions(-) delete mode 100644 ggml-opencl-legacy.h delete mode 100644 ggml-opencl.h delete mode 100644 llamaextra.cpp delete mode 100644 llamaextra.h rename ggml-opencl-legacy.c => otherarch/ggml_v2-opencl-legacy.c (89%) create mode 100644 otherarch/ggml_v2-opencl-legacy.h rename ggml-opencl.cpp => otherarch/ggml_v2-opencl.cpp (99%) create mode 100644 otherarch/ggml_v2-opencl.h rename ggml_v2.c => otherarch/ggml_v2.c (65%) create mode 100644 otherarch/ggml_v2.h rename llama.cpp => otherarch/llama_v2.cpp (62%) rename llama.h => otherarch/llama_v2.h (51%) diff --git a/ggml-opencl-legacy.h b/ggml-opencl-legacy.h deleted file mode 100644 index 588a5bab6..000000000 --- a/ggml-opencl-legacy.h +++ /dev/null @@ -1,15 +0,0 @@ -#pragma once - -#include "ggml-opencl.h" - -#ifdef __cplusplus -extern "C" { -#endif - -void ggml_cl_init_legacy(void); - -void ggml_cl_sgemm_wrapper_legacy(const enum ggml_blas_order order, const enum ggml_blas_op trans_a, const enum ggml_blas_op trans_b, const int m, const int n, const int k, const float alpha, const void *host_a, const int lda, const float *host_b, const int ldb, const float beta, float *host_c, const int ldc, const int btype); - -#ifdef __cplusplus -} -#endif diff --git a/ggml-opencl.h b/ggml-opencl.h deleted file mode 100644 index e0c1d6957..000000000 --- a/ggml-opencl.h +++ /dev/null @@ -1,35 +0,0 @@ -#pragma once - -#include "ggml.h" - -#ifdef __cplusplus -extern "C" { -#endif - -enum ggml_blas_order { - GGML_BLAS_ORDER_ROW_MAJOR = 101, - GGML_BLAS_ORDER_COLUMN_MAJOR = 102, -}; - -enum ggml_blas_op { - GGML_BLAS_OP_N = 111, - GGML_BLAS_OP_T = 112, - GGML_BLAS_OP_C = 113, -}; - -void ggml_cl_init(void); - -bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); -size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); -void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize); - -void * ggml_cl_host_malloc(size_t size); -void ggml_cl_host_free(void * ptr); - -void ggml_cl_transform_tensor(struct ggml_tensor * tensor); - -void ggml_cl_sgemm_wrapper(const enum ggml_blas_order order, const enum ggml_blas_op trans_a, const enum ggml_blas_op trans_b, const int m, const int n, const int k, const float alpha, const void *host_a, const int lda, const float *host_b, const int ldb, const float beta, float *host_c, const int ldc, const int btype); - -#ifdef __cplusplus -} -#endif diff --git a/ggml.c b/ggml.c index 1cb89636a..c380eb65e 100644 --- a/ggml.c +++ b/ggml.c @@ -138,14 +138,14 @@ inline static 
void* ggml_aligned_malloc(size_t size) { #if defined(GGML_USE_ACCELERATE) #include #if defined(GGML_USE_CLBLAST) // allow usage of CLBlast alongside Accelerate functions -#include "ggml-opencl.h" +#include "ggml_v2-opencl.h" #endif #elif defined(GGML_USE_OPENBLAS) #include #elif defined(GGML_USE_CUBLAS) #include "ggml-cuda.h" #elif defined(GGML_USE_CLBLAST) -#include "ggml-opencl.h" +#include "ggml_v2-opencl.h" #endif #undef MIN diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp index 9daa971bf..feabb1848 100644 --- a/gpttype_adapter.cpp +++ b/gpttype_adapter.cpp @@ -12,7 +12,7 @@ #include "otherarch.h" //for easier compilation -#include "llamaextra.cpp" +#include "llama_v2.cpp" //concat source files into one file for compilation purposes #include "utils.cpp" diff --git a/llamaextra.cpp b/llamaextra.cpp deleted file mode 100644 index 9b4be8c9a..000000000 --- a/llamaextra.cpp +++ /dev/null @@ -1,86 +0,0 @@ -#include "ggml.h" -#include "llamaextra.h" -#include "llama.cpp" - - -// TODO: Calculate this constant from the vocabulary -#define MAX_TOKEN_LEN 18 -// SentencePiece implementation after https://guillaume-be.github.io/2020-05-30/sentence_piece -std::vector legacy_llama_tokenize(const llama_vocab & vocab, const std::string & text, bool bos) { - std::vector res; - std::vector score; - std::vector prev; - int len = text.length(); - - score.resize(len + 1); - prev.resize(len + 1); - - // Forward pass - for (int i = 0; i < len; i++) { - int max_len = std::min(len - i, MAX_TOKEN_LEN); - for (int sub_len = 1; sub_len <= max_len; sub_len++) { - auto sub = text.substr(i, sub_len); - auto token = vocab.token_to_id.find(sub); - if (token != vocab.token_to_id.end()) { - int token_score = sub.length() * sub.length(); - int local_score = score[i] + token_score; - int next = i + sub_len; - if (score[next] < local_score) { - score[next] = local_score; - prev[next] = (*token).second; - } - } - } - } - - // Backward pass - int i = len; - while (i > 0) { - llama_token token_id = prev[i]; - if (token_id == 0) { - // TODO: Return error or something more meaningful - printf("failed to tokenize string!\n"); - break; - } - res.push_back(token_id); - auto token = vocab.id_to_token[token_id].tok; - i -= token.length(); - } - - if (bos) { - res.push_back(1); // TODO: replace with vocab.bos - } - - // Pieces are in reverse order so correct that - std::reverse(res.begin(), res.end()); - - return res; -} - -int legacy_llama_tokenize( - struct llama_context * ctx, - const char * text, - llama_token * tokens, - int n_max_tokens, - bool add_bos) { - auto res = legacy_llama_tokenize(ctx->vocab, text, add_bos); - - if (n_max_tokens < (int) res.size()) { - fprintf(stderr, "%s: too many tokens\n", __func__); - return -((int) res.size()); - } - - for (size_t i = 0; i < res.size(); i++) { - tokens[i] = res[i]; - } - - return res.size(); -} - -std::vector legacy_llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos) { - std::vector res(8096); - int n = legacy_llama_tokenize(ctx, text.c_str(), res.data(), res.size(), add_bos); - res.resize(n); - - return res; -} \ No newline at end of file diff --git a/llamaextra.h b/llamaextra.h deleted file mode 100644 index ab36c39fd..000000000 --- a/llamaextra.h +++ /dev/null @@ -1,18 +0,0 @@ -#pragma once -#include "common.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "llama.h" -#include "ggml.h" - -std::vector legacy_llama_tokenize(struct llama_context * ctx, const std::string & text, 
bool add_bos); \ No newline at end of file diff --git a/ggml-opencl-legacy.c b/otherarch/ggml_v2-opencl-legacy.c similarity index 89% rename from ggml-opencl-legacy.c rename to otherarch/ggml_v2-opencl-legacy.c index d6ab46f88..b5b8fa3e0 100644 --- a/ggml-opencl-legacy.c +++ b/otherarch/ggml_v2-opencl-legacy.c @@ -1,4 +1,4 @@ -#include "ggml-opencl-legacy.h" +#include "ggml_v2-opencl-legacy.h" #define CL_TARGET_OPENCL_VERSION 110 #include @@ -7,7 +7,7 @@ #include #include -#include "ggml.h" +#include "ggml_v2.h" #define MULTILINE_QUOTE(...) #__VA_ARGS__ const char * clblast_dequant_legacy = MULTILINE_QUOTE( @@ -171,7 +171,7 @@ __kernel void dequantize_row_q8_0(__global struct block_q8_0* blocks, __global f #define QK5_0 32 typedef struct { - ggml_fp16_t d; // delta + ggml_v2_fp16_t d; // delta uint8_t qh[4]; // 5-th bit of quants uint8_t qs[QK5_0 / 2]; // nibbles / quants } block_q5_0; @@ -221,12 +221,12 @@ static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, co return p; } -void ggml_cl_init_legacy(void) { +void ggml_v2_cl_init_legacy(void) { cl_int err = 0; - char * GGML_CLBLAST_PLATFORM = getenv("GGML_CLBLAST_PLATFORM"); - char * GGML_CLBLAST_DEVICE = getenv("GGML_CLBLAST_DEVICE"); - int plat_num = (GGML_CLBLAST_PLATFORM == NULL ? 0 : atoi(GGML_CLBLAST_PLATFORM)); - int dev_num = (GGML_CLBLAST_DEVICE == NULL ? 0 : atoi(GGML_CLBLAST_DEVICE)); + char * GGML_V2_CLBLAST_PLATFORM = getenv("GGML_CLBLAST_PLATFORM"); + char * GGML_V2_CLBLAST_DEVICE = getenv("GGML_CLBLAST_DEVICE"); + int plat_num = (GGML_V2_CLBLAST_PLATFORM == NULL ? 0 : atoi(GGML_V2_CLBLAST_PLATFORM)); + int dev_num = (GGML_V2_CLBLAST_DEVICE == NULL ? 0 : atoi(GGML_V2_CLBLAST_DEVICE)); printf("\nInitializing LEGACY CLBlast (First Run)..."); printf("\nAttempting to use: Platform=%d, Device=%d (If invalid, program will crash)\n",plat_num,dev_num); cl_uint num_platforms; @@ -271,7 +271,7 @@ void ggml_cl_init_legacy(void) { CL_CHECK(err, "clCreateKernel"); } -static void ggml_cl_malloc(size_t req_size, size_t* cur_size, cl_mem_flags flags, cl_mem* buf) { +static void ggml_v2_cl_malloc(size_t req_size, size_t* cur_size, cl_mem_flags flags, cl_mem* buf) { if (req_size <= *cur_size) { return; } @@ -286,8 +286,8 @@ static void ggml_cl_malloc(size_t req_size, size_t* cur_size, cl_mem_flags flags CL_CHECK(err, "clCreateBuffer"); } -void ggml_cl_sgemm_wrapper_legacy( - const enum ggml_blas_order order, const enum ggml_blas_op trans_a, const enum ggml_blas_op trans_b, +void ggml_v2_cl_sgemm_wrapper_legacy( + const enum ggml_v2_blas_order order, const enum ggml_v2_blas_op trans_a, const enum ggml_v2_blas_op trans_b, const int m, const int n, const int k, const float alpha, const void *host_a, const int lda, const float *host_b, const int ldb, const float beta, @@ -300,34 +300,34 @@ void ggml_cl_sgemm_wrapper_legacy( cl_block_q5_0* cl_host_b; switch (btype) { - case GGML_TYPE_F32: + case GGML_V2_TYPE_F32: dequant = false; break; - case GGML_TYPE_Q4_0: + case GGML_V2_TYPE_Q4_0: dequant = true; kernel = kernel_q4_0; local = 16; size_qb = global * (sizeof(float) + local) / 32; break; - case GGML_TYPE_Q4_1: + case GGML_V2_TYPE_Q4_1: dequant = true; kernel = kernel_q4_1; local = 16; size_qb = global * (sizeof(float) * 2 + local) / 32; break; - case GGML_TYPE_Q4_2: + case GGML_V2_TYPE_Q4_2: dequant = true; kernel = kernel_q4_2; local = 8; - size_qb = global * (sizeof(ggml_fp16_t) + local) / 16; + size_qb = global * (sizeof(ggml_v2_fp16_t) + local) / 16; break; - case GGML_TYPE_Q4_3: + case GGML_V2_TYPE_Q4_3: dequant = 
true; kernel = kernel_q4_3; local = 8; size_qb = global * (sizeof(short) * 2 + local) / 16; break; - case GGML_TYPE_Q5_0: + case GGML_V2_TYPE_Q5_0: dequant = true; kernel = kernel_q5_0; local = 16; @@ -337,20 +337,20 @@ void ggml_cl_sgemm_wrapper_legacy( const block_q5_0* b = (const block_q5_0*) host_b; cl_host_b = (cl_block_q5_0*) malloc(sizeof(cl_block_q5_0) * global / 32); for (size_t i = 0; i < global / 32; i++) { - cl_host_b[i].d = ggml_fp16_to_fp32(b[i].d); + cl_host_b[i].d = ggml_v2_fp16_to_fp32(b[i].d); memcpy(&cl_host_b[i].qh, b[i].qh, sizeof(uint32_t)); memcpy(&cl_host_b[i].qs, b[i].qs, QK5_0 / 2); } host_b = (const float*) cl_host_b; size_qb = global * (sizeof(float) + sizeof(uint32_t) + local) / 32; break; - case GGML_TYPE_Q5_1: + case GGML_V2_TYPE_Q5_1: dequant = true; kernel = kernel_q5_1; local = 16; - size_qb = global * (sizeof(ggml_fp16_t) * 2 + sizeof(uint32_t) + local) / 32; + size_qb = global * (sizeof(ggml_v2_fp16_t) * 2 + sizeof(uint32_t) + local) / 32; break; - case GGML_TYPE_Q8_0: + case GGML_V2_TYPE_Q8_0: dequant = true; kernel = kernel_q8_0; local = 32; @@ -366,12 +366,12 @@ void ggml_cl_sgemm_wrapper_legacy( const size_t size_c = m * n * sizeof(float); // Prepare buffers - ggml_cl_malloc(size_a, &cl_size_a, CL_MEM_READ_ONLY, &cl_buffer_a); + ggml_v2_cl_malloc(size_a, &cl_size_a, CL_MEM_READ_ONLY, &cl_buffer_a); if (dequant) { - ggml_cl_malloc(size_qb, &cl_size_qb, CL_MEM_READ_ONLY, &cl_buffer_qb); + ggml_v2_cl_malloc(size_qb, &cl_size_qb, CL_MEM_READ_ONLY, &cl_buffer_qb); } - ggml_cl_malloc(size_b, &cl_size_b, CL_MEM_READ_WRITE, &cl_buffer_b); - ggml_cl_malloc(size_c, &cl_size_c, CL_MEM_WRITE_ONLY, &cl_buffer_c); + ggml_v2_cl_malloc(size_b, &cl_size_b, CL_MEM_READ_WRITE, &cl_buffer_b); + ggml_v2_cl_malloc(size_c, &cl_size_c, CL_MEM_WRITE_ONLY, &cl_buffer_c); cl_event ev_a, ev_qb, ev_b; @@ -421,7 +421,7 @@ void ggml_cl_sgemm_wrapper_legacy( clWaitForEvents(1, &ev_c); clReleaseEvent(ev_sgemm); clReleaseEvent(ev_c); - if (btype == GGML_TYPE_Q5_0) { + if (btype == GGML_V2_TYPE_Q5_0) { free((void*) cl_host_b); } } diff --git a/otherarch/ggml_v2-opencl-legacy.h b/otherarch/ggml_v2-opencl-legacy.h new file mode 100644 index 000000000..bcfe670c9 --- /dev/null +++ b/otherarch/ggml_v2-opencl-legacy.h @@ -0,0 +1,15 @@ +#pragma once + +#include "ggml_v2-opencl.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void ggml_v2_cl_init_legacy(void); + +void ggml_v2_cl_sgemm_wrapper_legacy(const enum ggml_v2_blas_order order, const enum ggml_v2_blas_op trans_a, const enum ggml_v2_blas_op trans_b, const int m, const int n, const int k, const float alpha, const void *host_a, const int lda, const float *host_b, const int ldb, const float beta, float *host_c, const int ldc, const int btype); + +#ifdef __cplusplus +} +#endif diff --git a/ggml-opencl.cpp b/otherarch/ggml_v2-opencl.cpp similarity index 99% rename from ggml-opencl.cpp rename to otherarch/ggml_v2-opencl.cpp index 1791fb2f6..23afdb34c 100644 --- a/ggml-opencl.cpp +++ b/otherarch/ggml_v2-opencl.cpp @@ -1,4 +1,4 @@ -#include "ggml-opencl.h" +#include "ggml_v2-opencl.h" #include #include diff --git a/otherarch/ggml_v2-opencl.h b/otherarch/ggml_v2-opencl.h new file mode 100644 index 000000000..c21de9186 --- /dev/null +++ b/otherarch/ggml_v2-opencl.h @@ -0,0 +1,35 @@ +#pragma once + +#include "ggml_v2.h" + +#ifdef __cplusplus +extern "C" { +#endif + +enum ggml_v2_blas_order { + GGML_V2_BLAS_ORDER_ROW_MAJOR = 101, + GGML_V2_BLAS_ORDER_COLUMN_MAJOR = 102, +}; + +enum ggml_v2_blas_op { + GGML_V2_BLAS_OP_N = 111, + GGML_V2_BLAS_OP_T 
= 112, + GGML_V2_BLAS_OP_C = 113, +}; + +void ggml_v2_cl_init(void); + +bool ggml_v2_cl_can_mul_mat(const struct ggml_v2_tensor * src0, const struct ggml_v2_tensor * src1, struct ggml_v2_tensor * dst); +size_t ggml_v2_cl_mul_mat_get_wsize(const struct ggml_v2_tensor * src0, const struct ggml_v2_tensor * src1, struct ggml_v2_tensor * dst); +void ggml_v2_cl_mul_mat(const struct ggml_v2_tensor * src0, const struct ggml_v2_tensor * src1, struct ggml_v2_tensor * dst, void * wdata, size_t wsize); + +void * ggml_v2_cl_host_malloc(size_t size); +void ggml_v2_cl_host_free(void * ptr); + +void ggml_v2_cl_transform_tensor(struct ggml_v2_tensor * tensor); + +void ggml_v2_cl_sgemm_wrapper(const enum ggml_v2_blas_order order, const enum ggml_v2_blas_op trans_a, const enum ggml_v2_blas_op trans_b, const int m, const int n, const int k, const float alpha, const void *host_a, const int lda, const float *host_b, const int ldb, const float beta, float *host_c, const int ldc, const int btype); + +#ifdef __cplusplus +} +#endif diff --git a/ggml_v2.c b/otherarch/ggml_v2.c similarity index 65% rename from ggml_v2.c rename to otherarch/ggml_v2.c index b521f7932..26c05725d 100644 --- a/ggml_v2.c +++ b/otherarch/ggml_v2.c @@ -1,7 +1,7 @@ // Defines CLOCK_MONOTONIC on Linux #define _GNU_SOURCE -#include "ggml.h" +#include "ggml_v2.h" #if defined(_MSC_VER) || defined(__MINGW32__) #include // using malloc.h with MSC/MINGW @@ -95,41 +95,41 @@ typedef void* thread_ret_t; #define static_assert(cond, msg) _Static_assert(cond, msg) #endif -/*#define GGML_PERF*/ -#define GGML_DEBUG 0 -#define GGML_GELU_FP16 -#define GGML_SILU_FP16 +/*#define GGML_V2_PERF*/ +#define GGML_V2_DEBUG 0 +#define GGML_V2_GELU_FP16 +#define GGML_V2_SILU_FP16 -#define GGML_SOFT_MAX_UNROLL 4 -#define GGML_VEC_DOT_UNROLL 2 +#define GGML_V2_SOFT_MAX_UNROLL 4 +#define GGML_V2_VEC_DOT_UNROLL 2 #ifdef GGML_USE_ACCELERATE // uncomment to use vDSP for soft max computation // note: not sure if it is actually faster -//#define GGML_SOFT_MAX_ACCELERATE +//#define GGML_V2_SOFT_MAX_ACCELERATE #endif #if UINTPTR_MAX == 0xFFFFFFFF - #define GGML_MEM_ALIGN 4 + #define GGML_V2_MEM_ALIGN 4 #else - #define GGML_MEM_ALIGN 16 + #define GGML_V2_MEM_ALIGN 16 #endif #if defined(_MSC_VER) || defined(__MINGW32__) -#define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN) -#define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr) +#define GGML_V2_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_V2_MEM_ALIGN) +#define GGML_V2_ALIGNED_FREE(ptr) _aligned_free(ptr) #else -inline static void* ggml_aligned_malloc(size_t size) { +inline static void* ggml_v2_aligned_malloc(size_t size) { void* aligned_memory = NULL; - int result = posix_memalign(&aligned_memory, GGML_MEM_ALIGN, size); + int result = posix_memalign(&aligned_memory, GGML_V2_MEM_ALIGN, size); if (result != 0) { // Handle allocation failure return NULL; } return aligned_memory; } -#define GGML_ALIGNED_MALLOC(size) ggml_aligned_malloc(size) -#define GGML_ALIGNED_FREE(ptr) free(ptr) +#define GGML_V2_ALIGNED_MALLOC(size) ggml_v2_aligned_malloc(size) +#define GGML_V2_ALIGNED_FREE(ptr) free(ptr) #endif #define UNUSED(x) (void)(x) @@ -143,8 +143,8 @@ inline static void* ggml_aligned_malloc(size_t size) { #include "ggml-cuda.h" #endif #if defined(GGML_USE_CLBLAST) -#include "ggml-opencl.h" -#include "ggml-opencl-legacy.h" +#include "ggml_v2-opencl.h" +#include "ggml_v2-opencl-legacy.h" #endif #undef MIN @@ -153,7 +153,7 @@ inline static void* ggml_aligned_malloc(size_t size) { #define MAX(a, b) ((a) > (b) ? 
(a) : (b)) // floating point type used to accumulate sums -typedef double ggml_float; +typedef double ggml_v2_float; // 16-bit float // on Arm, we use __fp16 @@ -166,11 +166,11 @@ typedef double ggml_float; // #include -#define GGML_COMPUTE_FP16_TO_FP32(x) ((float) (x)) -#define GGML_COMPUTE_FP32_TO_FP16(x) (x) +#define GGML_V2_COMPUTE_FP16_TO_FP32(x) ((float) (x)) +#define GGML_V2_COMPUTE_FP32_TO_FP16(x) (x) -#define GGML_FP16_TO_FP32(x) ((float) (x)) -#define GGML_FP32_TO_FP16(x) (x) +#define GGML_V2_FP16_TO_FP32(x) ((float) (x)) +#define GGML_V2_FP32_TO_FP16(x) (x) #else @@ -193,22 +193,22 @@ typedef double ggml_float; #ifdef __F16C__ #ifdef _MSC_VER -#define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x))) -#define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0) +#define GGML_V2_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x))) +#define GGML_V2_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0) #else -#define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x) -#define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0) +#define GGML_V2_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x) +#define GGML_V2_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0) #endif #elif defined(__POWER9_VECTOR__) -#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) -#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) +#define GGML_V2_COMPUTE_FP16_TO_FP32(x) ggml_v2_compute_fp16_to_fp32(x) +#define GGML_V2_COMPUTE_FP32_TO_FP16(x) ggml_v2_compute_fp32_to_fp16(x) /* the inline asm below is about 12% faster than the lookup method */ -#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x) -#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) +#define GGML_V2_FP16_TO_FP32(x) GGML_V2_COMPUTE_FP16_TO_FP32(x) +#define GGML_V2_FP32_TO_FP16(x) GGML_V2_COMPUTE_FP32_TO_FP16(x) -static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { +static inline float ggml_v2_compute_fp16_to_fp32(ggml_v2_fp16_t h) { register float f; register double d; __asm__( @@ -221,9 +221,9 @@ static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { return f; } -static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { +static inline ggml_v2_fp16_t ggml_v2_compute_fp32_to_fp16(float f) { register double d; - register ggml_fp16_t r; + register ggml_v2_fp16_t r; __asm__( /* xscvdphp can work on double or single precision */ "xscvdphp %0,%2\n" "mffprd %1,%0\n" : @@ -256,7 +256,7 @@ static inline uint32_t fp32_to_bits(float f) { return fp32.as_bits; } -static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { +static inline float ggml_v2_compute_fp16_to_fp32(ggml_v2_fp16_t h) { const uint32_t w = (uint32_t) h << 16; const uint32_t sign = w & UINT32_C(0x80000000); const uint32_t two_w = w + w; @@ -279,7 +279,7 @@ static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { return fp32_from_bits(result); } -static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { +static inline ggml_v2_fp16_t ggml_v2_compute_fp32_to_fp16(float f) { #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__) const float scale_to_inf = 0x1.0p+112f; const float scale_to_zero = 0x1.0p-110f; @@ -305,8 +305,8 @@ static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? 
UINT16_C(0x7E00) : nonsign); } -#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) -#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) +#define GGML_V2_COMPUTE_FP16_TO_FP32(x) ggml_v2_compute_fp16_to_fp32(x) +#define GGML_V2_COMPUTE_FP32_TO_FP16(x) ggml_v2_compute_fp32_to_fp16(x) #endif // __F16C__ @@ -317,13 +317,13 @@ static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { // // precomputed gelu table for f16 (128 KB) -static ggml_fp16_t table_gelu_f16[1 << 16]; +static ggml_v2_fp16_t table_gelu_f16[1 << 16]; // precomputed silu table for f16 (128 KB) -static ggml_fp16_t table_silu_f16[1 << 16]; +static ggml_v2_fp16_t table_silu_f16[1 << 16]; // precomputed exp table for f16 (128 KB) -static ggml_fp16_t table_exp_f16[1 << 16]; +static ggml_v2_fp16_t table_exp_f16[1 << 16]; // precomputed f32 table for f16 (256 KB) static float table_f32_f16[1 << 16]; @@ -343,39 +343,39 @@ static const uint64_t table_b2b_0[1 << 8] = { B8(00, 10) }; // ( b) << 4 static const uint64_t table_b2b_1[1 << 8] = { B8(10, 00) }; // (!b) << 4 #endif -// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32, -// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON. +// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_v2_lookup_fp16_to_fp32, +// so we define GGML_V2_FP16_TO_FP32 and GGML_V2_FP32_TO_FP16 elsewhere for NEON. // This is also true for POWER9. -#if !defined(GGML_FP16_TO_FP32) || !defined(GGML_FP32_TO_FP16) +#if !defined(GGML_V2_FP16_TO_FP32) || !defined(GGML_V2_FP32_TO_FP16) -inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) { +inline static float ggml_v2_lookup_fp16_to_fp32(ggml_v2_fp16_t f) { uint16_t s; memcpy(&s, &f, sizeof(uint16_t)); return table_f32_f16[s]; } -#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x) -#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) +#define GGML_V2_FP16_TO_FP32(x) ggml_v2_lookup_fp16_to_fp32(x) +#define GGML_V2_FP32_TO_FP16(x) GGML_V2_COMPUTE_FP32_TO_FP16(x) #endif // note: do not use these inside ggml.c // these are meant to be used via the ggml.h API -float ggml_fp16_to_fp32(ggml_fp16_t x) { - return (float) GGML_FP16_TO_FP32(x); +float ggml_v2_fp16_to_fp32(ggml_v2_fp16_t x) { + return (float) GGML_V2_FP16_TO_FP32(x); } -ggml_fp16_t ggml_fp32_to_fp16(float x) { - return GGML_FP32_TO_FP16(x); +ggml_v2_fp16_t ggml_v2_fp32_to_fp16(float x) { + return GGML_V2_FP32_TO_FP16(x); } -void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, size_t n) { +void ggml_v2_fp16_to_fp32_row(const ggml_v2_fp16_t * x, float * y, size_t n) { for (size_t i = 0; i < n; i++) { - y[i] = GGML_FP16_TO_FP32(x[i]); + y[i] = GGML_V2_FP16_TO_FP32(x[i]); } } -void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, size_t n) { +void ggml_v2_fp32_to_fp16_row(const float * x, ggml_v2_fp16_t * y, size_t n) { size_t i = 0; #if defined(__F16C__) for (; i + 7 < n; i += 8) { @@ -390,7 +390,7 @@ void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, size_t n) { } #endif for (; i < n; i++) { - y[i] = GGML_FP32_TO_FP16(x[i]); + y[i] = GGML_V2_FP32_TO_FP16(x[i]); } } @@ -400,54 +400,54 @@ void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, size_t n) { #if defined(_MSC_VER) || defined(__MINGW32__) static int64_t timer_freq; -void ggml_time_init(void) { +void ggml_v2_time_init(void) { LARGE_INTEGER frequency; QueryPerformanceFrequency(&frequency); timer_freq = frequency.QuadPart; } -int64_t ggml_time_ms(void) { +int64_t 
ggml_v2_time_ms(void) { LARGE_INTEGER t; QueryPerformanceCounter(&t); return (t.QuadPart * 1000) / timer_freq; } -int64_t ggml_time_us(void) { +int64_t ggml_v2_time_us(void) { LARGE_INTEGER t; QueryPerformanceCounter(&t); return (t.QuadPart * 1000000) / timer_freq; } #else -void ggml_time_init(void) {} -int64_t ggml_time_ms(void) { +void ggml_v2_time_init(void) {} +int64_t ggml_v2_time_ms(void) { struct timespec ts; clock_gettime(CLOCK_MONOTONIC, &ts); return (int64_t)ts.tv_sec*1000 + (int64_t)ts.tv_nsec/1000000; } -int64_t ggml_time_us(void) { +int64_t ggml_v2_time_us(void) { struct timespec ts; clock_gettime(CLOCK_MONOTONIC, &ts); return (int64_t)ts.tv_sec*1000000 + (int64_t)ts.tv_nsec/1000; } #endif -int64_t ggml_cycles(void) { +int64_t ggml_v2_cycles(void) { return clock(); } -int64_t ggml_cycles_per_ms(void) { +int64_t ggml_v2_cycles_per_ms(void) { return CLOCKS_PER_SEC/1000; } -#ifdef GGML_PERF -#define ggml_perf_time_ms() ggml_time_ms() -#define ggml_perf_time_us() ggml_time_us() -#define ggml_perf_cycles() ggml_cycles() -#define ggml_perf_cycles_per_ms() ggml_cycles_per_ms() +#ifdef GGML_V2_PERF +#define ggml_v2_perf_time_ms() ggml_v2_time_ms() +#define ggml_v2_perf_time_us() ggml_v2_time_us() +#define ggml_v2_perf_cycles() ggml_v2_cycles() +#define ggml_v2_perf_cycles_per_ms() ggml_v2_cycles_per_ms() #else -#define ggml_perf_time_ms() 0 -#define ggml_perf_time_us() 0 -#define ggml_perf_cycles() 0 -#define ggml_perf_cycles_per_ms() 0 +#define ggml_v2_perf_time_ms() 0 +#define ggml_v2_perf_time_us() 0 +#define ggml_v2_perf_cycles() 0 +#define ggml_v2_perf_cycles_per_ms() 0 #endif // @@ -782,20 +782,20 @@ static_assert(sizeof(block_q4_1) == 2 * sizeof(float) + QK4_1 / 2, "wrong q4_1 b #define QK5_0 32 typedef struct { - ggml_fp16_t d; // delta + ggml_v2_fp16_t d; // delta uint8_t qh[4]; // 5-th bit of quants uint8_t qs[QK5_0 / 2]; // nibbles / quants } block_q5_0; -static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding"); +static_assert(sizeof(block_q5_0) == sizeof(ggml_v2_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding"); #define QK5_1 32 typedef struct { - ggml_fp16_t d; // delta - ggml_fp16_t m; // min + ggml_v2_fp16_t d; // delta + ggml_v2_fp16_t m; // min uint8_t qh[4]; // 5-th bit of quants uint8_t qs[QK5_1 / 2]; // nibbles / quants } block_q5_1; -static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding"); +static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_v2_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding"); #define QK8_0 32 typedef struct { @@ -814,18 +814,18 @@ static_assert(sizeof(block_q8_1) == 2*sizeof(float) + QK8_1, "wrong q8_1 block s #define QK4_2 16 typedef struct { - ggml_fp16_t d; // delta + ggml_v2_fp16_t d; // delta uint8_t qs[QK4_2 / 2]; // nibbles / quants } block_q4_2; -static_assert(sizeof(block_q4_2) == sizeof(ggml_fp16_t) + QK4_2 / 2, "wrong q4_2 block size/padding"); +static_assert(sizeof(block_q4_2) == sizeof(ggml_v2_fp16_t) + QK4_2 / 2, "wrong q4_2 block size/padding"); #define QK4_3 16 typedef struct { - ggml_fp16_t d; // delta - ggml_fp16_t m; // min + ggml_v2_fp16_t d; // delta + ggml_v2_fp16_t m; // min uint8_t qs[QK4_3 / 2]; // nibbles / quants } block_q4_3; -static_assert(sizeof(block_q4_3) == 2 * sizeof(ggml_fp16_t) + QK4_3 / 2, "wrong q4_3 block size/padding"); +static_assert(sizeof(block_q4_3) == 2 * sizeof(ggml_v2_fp16_t) + QK4_3 / 2, "wrong q4_3 block size/padding"); 
#define QK8_1 32 typedef struct { @@ -942,7 +942,7 @@ static void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * r const float d = max / -16; const float id = d ? 1.0f/d : 0.0f; - y[i].d = GGML_FP32_TO_FP16(d); + y[i].d = GGML_V2_FP32_TO_FP16(d); uint32_t qh = 0; @@ -989,8 +989,8 @@ static void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * r const float d = (max - min) / ((1 << 5) - 1); const float id = d ? 1.0f/d : 0.0f; - y[i].d = GGML_FP32_TO_FP16(d); - y[i].m = GGML_FP32_TO_FP16(min); + y[i].d = GGML_V2_FP32_TO_FP16(d); + y[i].m = GGML_V2_FP32_TO_FP16(min); uint32_t qh = 0; @@ -1394,7 +1394,7 @@ static void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict const int nb = k / qk; for (int i = 0; i < nb; i++) { - const float d = GGML_FP16_TO_FP32(x[i].d); + const float d = GGML_V2_FP16_TO_FP32(x[i].d); uint32_t qh; memcpy(&qh, x[i].qh, sizeof(qh)); @@ -1420,8 +1420,8 @@ static void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict const int nb = k / qk; for (int i = 0; i < nb; i++) { - const float d = GGML_FP16_TO_FP32(x[i].d); - const float m = GGML_FP16_TO_FP32(x[i].m); + const float d = GGML_V2_FP16_TO_FP32(x[i].d); + const float m = GGML_V2_FP16_TO_FP32(x[i].m); uint32_t qh; memcpy(&qh, x[i].qh, sizeof(qh)); @@ -1457,71 +1457,71 @@ static void dequantize_row_q8_0(const void * restrict vx, float * restrict y, in } } -static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); -static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); -static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); -static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); -static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); +static void ggml_v2_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); +static void ggml_v2_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); +static void ggml_v2_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); +static void ggml_v2_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); +static void ggml_v2_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); -static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = { - [GGML_TYPE_Q4_0] = { +static const quantize_fns_t quantize_fns[GGML_V2_TYPE_COUNT] = { + [GGML_V2_TYPE_Q4_0] = { .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q4_0, .quantize_row_q = quantize_row_q4_0, .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_0_reference, .quantize_row_q_dot = quantize_row_q8_0, - .vec_dot_q = ggml_vec_dot_q4_0_q8_0, - .vec_dot_type = GGML_TYPE_Q8_0, + .vec_dot_q = ggml_v2_vec_dot_q4_0_q8_0, + .vec_dot_type = GGML_V2_TYPE_Q8_0, }, - [GGML_TYPE_Q4_1] = { + [GGML_V2_TYPE_Q4_1] = { .dequantize_row_q = (dequantize_row_q_t)dequantize_row_q4_1, .quantize_row_q = quantize_row_q4_1, .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_1_reference, .quantize_row_q_dot = quantize_row_q8_1, - .vec_dot_q = ggml_vec_dot_q4_1_q8_1, - .vec_dot_type = GGML_TYPE_Q8_1, + .vec_dot_q 
= ggml_v2_vec_dot_q4_1_q8_1, + .vec_dot_type = GGML_V2_TYPE_Q8_1, }, - [GGML_TYPE_Q5_0] = { + [GGML_V2_TYPE_Q5_0] = { .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q5_0, .quantize_row_q = quantize_row_q5_0, .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q5_0_reference, .quantize_row_q_dot = quantize_row_q8_0, - .vec_dot_q = ggml_vec_dot_q5_0_q8_0, - .vec_dot_type = GGML_TYPE_Q8_0, + .vec_dot_q = ggml_v2_vec_dot_q5_0_q8_0, + .vec_dot_type = GGML_V2_TYPE_Q8_0, }, - [GGML_TYPE_Q5_1] = { + [GGML_V2_TYPE_Q5_1] = { .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q5_1, .quantize_row_q = quantize_row_q5_1, .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q5_1_reference, .quantize_row_q_dot = quantize_row_q8_1, - .vec_dot_q = ggml_vec_dot_q5_1_q8_1, - .vec_dot_type = GGML_TYPE_Q8_1, + .vec_dot_q = ggml_v2_vec_dot_q5_1_q8_1, + .vec_dot_type = GGML_V2_TYPE_Q8_1, }, - [GGML_TYPE_Q8_0] = { + [GGML_V2_TYPE_Q8_0] = { .dequantize_row_q = dequantize_row_q8_0, .quantize_row_q = quantize_row_q8_0, .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q8_0_reference, .quantize_row_q_dot = quantize_row_q8_0, - .vec_dot_q = ggml_vec_dot_q8_0_q8_0, - .vec_dot_type = GGML_TYPE_Q8_0, + .vec_dot_q = ggml_v2_vec_dot_q8_0_q8_0, + .vec_dot_type = GGML_V2_TYPE_Q8_0, }, - [GGML_TYPE_Q8_1] = { + [GGML_V2_TYPE_Q8_1] = { .dequantize_row_q = NULL, // TODO .quantize_row_q = quantize_row_q8_1, .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q8_1_reference, .quantize_row_q_dot = quantize_row_q8_1, .vec_dot_q = NULL, // TODO - .vec_dot_type = GGML_TYPE_Q8_1, + .vec_dot_type = GGML_V2_TYPE_Q8_1, }, }; // For internal test use -quantize_fns_t ggml_internal_get_quantize_fn(size_t i) { - GGML_ASSERT(i < GGML_TYPE_COUNT); +quantize_fns_t ggml_v2_internal_get_quantize_fn(size_t i) { + GGML_V2_ASSERT(i < GGML_V2_TYPE_COUNT); return quantize_fns[i]; } bool quants_unshuffled = false; //new GGJT_2 is unshuffled, all old ones are shuffled -static const quantize_fns_t quantize_fns_v2[GGML_TYPE_COUNT]; //forward decl +static const quantize_fns_t quantize_fns_v2[GGML_V2_TYPE_COUNT]; //forward decl static inline quantize_fns_t get_quantize_fn(size_t i) { return(quants_unshuffled?quantize_fns[i]:quantize_fns_v2[i]); @@ -1536,152 +1536,152 @@ static inline quantize_fns_t get_quantize_fn(size_t i) // we then implement the fundamental computation operations below using only these macros // adding support for new architectures requires to define the corresponding SIMD macros // -// GGML_F32_STEP / GGML_F16_STEP +// GGML_V2_F32_STEP / GGML_V2_F16_STEP // number of elements to process in a single step // -// GGML_F32_EPR / GGML_F16_EPR +// GGML_V2_F32_EPR / GGML_V2_F16_EPR // number of elements to fit in a single register // #if defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA) -#define GGML_SIMD +#define GGML_V2_SIMD // F32 NEON -#define GGML_F32_STEP 16 -#define GGML_F32_EPR 4 +#define GGML_V2_F32_STEP 16 +#define GGML_V2_F32_EPR 4 -#define GGML_F32x4 float32x4_t -#define GGML_F32x4_ZERO vdupq_n_f32(0.0f) -#define GGML_F32x4_SET1(x) vdupq_n_f32(x) -#define GGML_F32x4_LOAD vld1q_f32 -#define GGML_F32x4_STORE vst1q_f32 -#define GGML_F32x4_FMA(a, b, c) vfmaq_f32(a, b, c) -#define GGML_F32x4_ADD vaddq_f32 -#define GGML_F32x4_MUL vmulq_f32 -#define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x) -#define GGML_F32x4_REDUCE(res, x) \ +#define GGML_V2_F32x4 float32x4_t +#define GGML_V2_F32x4_ZERO vdupq_n_f32(0.0f) +#define GGML_V2_F32x4_SET1(x) vdupq_n_f32(x) +#define GGML_V2_F32x4_LOAD vld1q_f32 +#define 
GGML_V2_F32x4_STORE vst1q_f32 +#define GGML_V2_F32x4_FMA(a, b, c) vfmaq_f32(a, b, c) +#define GGML_V2_F32x4_ADD vaddq_f32 +#define GGML_V2_F32x4_MUL vmulq_f32 +#define GGML_V2_F32x4_REDUCE_ONE(x) vaddvq_f32(x) +#define GGML_V2_F32x4_REDUCE(res, x) \ { \ - for (int i = 0; i < GGML_F32_ARR/2; ++i) { \ + for (int i = 0; i < GGML_V2_F32_ARR/2; ++i) { \ x[2*i] = vaddq_f32(x[2*i], x[2*i+1]); \ } \ - for (int i = 0; i < GGML_F32_ARR/4; ++i) { \ + for (int i = 0; i < GGML_V2_F32_ARR/4; ++i) { \ x[4*i] = vaddq_f32(x[4*i], x[4*i+2]); \ } \ - for (int i = 0; i < GGML_F32_ARR/8; ++i) { \ + for (int i = 0; i < GGML_V2_F32_ARR/8; ++i) { \ x[8*i] = vaddq_f32(x[8*i], x[8*i+4]); \ } \ - res = GGML_F32x4_REDUCE_ONE(x[0]); \ + res = GGML_V2_F32x4_REDUCE_ONE(x[0]); \ } -#define GGML_F32_VEC GGML_F32x4 -#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO -#define GGML_F32_VEC_SET1 GGML_F32x4_SET1 -#define GGML_F32_VEC_LOAD GGML_F32x4_LOAD -#define GGML_F32_VEC_STORE GGML_F32x4_STORE -#define GGML_F32_VEC_FMA GGML_F32x4_FMA -#define GGML_F32_VEC_ADD GGML_F32x4_ADD -#define GGML_F32_VEC_MUL GGML_F32x4_MUL -#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE +#define GGML_V2_F32_VEC GGML_V2_F32x4 +#define GGML_V2_F32_VEC_ZERO GGML_V2_F32x4_ZERO +#define GGML_V2_F32_VEC_SET1 GGML_V2_F32x4_SET1 +#define GGML_V2_F32_VEC_LOAD GGML_V2_F32x4_LOAD +#define GGML_V2_F32_VEC_STORE GGML_V2_F32x4_STORE +#define GGML_V2_F32_VEC_FMA GGML_V2_F32x4_FMA +#define GGML_V2_F32_VEC_ADD GGML_V2_F32x4_ADD +#define GGML_V2_F32_VEC_MUL GGML_V2_F32x4_MUL +#define GGML_V2_F32_VEC_REDUCE GGML_V2_F32x4_REDUCE // F16 NEON #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) - #define GGML_F16_STEP 32 - #define GGML_F16_EPR 8 + #define GGML_V2_F16_STEP 32 + #define GGML_V2_F16_EPR 8 - #define GGML_F16x8 float16x8_t - #define GGML_F16x8_ZERO vdupq_n_f16(0.0f) - #define GGML_F16x8_SET1(x) vdupq_n_f16(x) - #define GGML_F16x8_LOAD vld1q_f16 - #define GGML_F16x8_STORE vst1q_f16 - #define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c) - #define GGML_F16x8_ADD vaddq_f16 - #define GGML_F16x8_MUL vmulq_f16 - #define GGML_F16x8_REDUCE(res, x) \ + #define GGML_V2_F16x8 float16x8_t + #define GGML_V2_F16x8_ZERO vdupq_n_f16(0.0f) + #define GGML_V2_F16x8_SET1(x) vdupq_n_f16(x) + #define GGML_V2_F16x8_LOAD vld1q_f16 + #define GGML_V2_F16x8_STORE vst1q_f16 + #define GGML_V2_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c) + #define GGML_V2_F16x8_ADD vaddq_f16 + #define GGML_V2_F16x8_MUL vmulq_f16 + #define GGML_V2_F16x8_REDUCE(res, x) \ { \ - for (int i = 0; i < GGML_F16_ARR/2; ++i) { \ + for (int i = 0; i < GGML_V2_F16_ARR/2; ++i) { \ x[2*i] = vaddq_f16(x[2*i], x[2*i+1]); \ } \ - for (int i = 0; i < GGML_F16_ARR/4; ++i) { \ + for (int i = 0; i < GGML_V2_F16_ARR/4; ++i) { \ x[4*i] = vaddq_f16(x[4*i], x[4*i+2]); \ } \ - for (int i = 0; i < GGML_F16_ARR/8; ++i) { \ + for (int i = 0; i < GGML_V2_F16_ARR/8; ++i) { \ x[8*i] = vaddq_f16(x[8*i], x[8*i+4]); \ } \ const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 (x[0])); \ const float32x4_t t1 = vcvt_f32_f16(vget_high_f16(x[0])); \ - res = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1)); \ + res = (ggml_v2_float) vaddvq_f32(vaddq_f32(t0, t1)); \ } - #define GGML_F16_VEC GGML_F16x8 - #define GGML_F16_VEC_ZERO GGML_F16x8_ZERO - #define GGML_F16_VEC_SET1 GGML_F16x8_SET1 - #define GGML_F16_VEC_LOAD(p, i) GGML_F16x8_LOAD(p) - #define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE(p, r[i]) - #define GGML_F16_VEC_FMA GGML_F16x8_FMA - #define GGML_F16_VEC_ADD GGML_F16x8_ADD - #define GGML_F16_VEC_MUL GGML_F16x8_MUL - #define GGML_F16_VEC_REDUCE GGML_F16x8_REDUCE + 
#define GGML_V2_F16_VEC GGML_V2_F16x8 + #define GGML_V2_F16_VEC_ZERO GGML_V2_F16x8_ZERO + #define GGML_V2_F16_VEC_SET1 GGML_V2_F16x8_SET1 + #define GGML_V2_F16_VEC_LOAD(p, i) GGML_V2_F16x8_LOAD(p) + #define GGML_V2_F16_VEC_STORE(p, r, i) GGML_V2_F16x8_STORE(p, r[i]) + #define GGML_V2_F16_VEC_FMA GGML_V2_F16x8_FMA + #define GGML_V2_F16_VEC_ADD GGML_V2_F16x8_ADD + #define GGML_V2_F16_VEC_MUL GGML_V2_F16x8_MUL + #define GGML_V2_F16_VEC_REDUCE GGML_V2_F16x8_REDUCE #else // if FP16 vector arithmetic is not supported, we use FP32 instead // and take advantage of the vcvt_ functions to convert to/from FP16 - #define GGML_F16_STEP 16 - #define GGML_F16_EPR 4 + #define GGML_V2_F16_STEP 16 + #define GGML_V2_F16_EPR 4 - #define GGML_F32Cx4 float32x4_t - #define GGML_F32Cx4_ZERO vdupq_n_f32(0.0f) - #define GGML_F32Cx4_SET1(x) vdupq_n_f32(x) - #define GGML_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16(x)) - #define GGML_F32Cx4_STORE(x, y) vst1_f16(x, vcvt_f16_f32(y)) - #define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c) - #define GGML_F32Cx4_ADD vaddq_f32 - #define GGML_F32Cx4_MUL vmulq_f32 - #define GGML_F32Cx4_REDUCE GGML_F32x4_REDUCE + #define GGML_V2_F32Cx4 float32x4_t + #define GGML_V2_F32Cx4_ZERO vdupq_n_f32(0.0f) + #define GGML_V2_F32Cx4_SET1(x) vdupq_n_f32(x) + #define GGML_V2_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16(x)) + #define GGML_V2_F32Cx4_STORE(x, y) vst1_f16(x, vcvt_f16_f32(y)) + #define GGML_V2_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c) + #define GGML_V2_F32Cx4_ADD vaddq_f32 + #define GGML_V2_F32Cx4_MUL vmulq_f32 + #define GGML_V2_F32Cx4_REDUCE GGML_V2_F32x4_REDUCE - #define GGML_F16_VEC GGML_F32Cx4 - #define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO - #define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1 - #define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx4_LOAD(p) - #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE(p, r[i]) - #define GGML_F16_VEC_FMA GGML_F32Cx4_FMA - #define GGML_F16_VEC_ADD GGML_F32Cx4_ADD - #define GGML_F16_VEC_MUL GGML_F32Cx4_MUL - #define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE + #define GGML_V2_F16_VEC GGML_V2_F32Cx4 + #define GGML_V2_F16_VEC_ZERO GGML_V2_F32Cx4_ZERO + #define GGML_V2_F16_VEC_SET1 GGML_V2_F32Cx4_SET1 + #define GGML_V2_F16_VEC_LOAD(p, i) GGML_V2_F32Cx4_LOAD(p) + #define GGML_V2_F16_VEC_STORE(p, r, i) GGML_V2_F32Cx4_STORE(p, r[i]) + #define GGML_V2_F16_VEC_FMA GGML_V2_F32Cx4_FMA + #define GGML_V2_F16_VEC_ADD GGML_V2_F32Cx4_ADD + #define GGML_V2_F16_VEC_MUL GGML_V2_F32Cx4_MUL + #define GGML_V2_F16_VEC_REDUCE GGML_V2_F32Cx4_REDUCE #endif #elif defined(__AVX__) -#define GGML_SIMD +#define GGML_V2_SIMD // F32 AVX -#define GGML_F32_STEP 32 -#define GGML_F32_EPR 8 +#define GGML_V2_F32_STEP 32 +#define GGML_V2_F32_EPR 8 -#define GGML_F32x8 __m256 -#define GGML_F32x8_ZERO _mm256_setzero_ps() -#define GGML_F32x8_SET1(x) _mm256_set1_ps(x) -#define GGML_F32x8_LOAD _mm256_loadu_ps -#define GGML_F32x8_STORE _mm256_storeu_ps +#define GGML_V2_F32x8 __m256 +#define GGML_V2_F32x8_ZERO _mm256_setzero_ps() +#define GGML_V2_F32x8_SET1(x) _mm256_set1_ps(x) +#define GGML_V2_F32x8_LOAD _mm256_loadu_ps +#define GGML_V2_F32x8_STORE _mm256_storeu_ps #if defined(__FMA__) - #define GGML_F32x8_FMA(a, b, c) _mm256_fmadd_ps(b, c, a) + #define GGML_V2_F32x8_FMA(a, b, c) _mm256_fmadd_ps(b, c, a) #else - #define GGML_F32x8_FMA(a, b, c) _mm256_add_ps(_mm256_mul_ps(b, c), a) + #define GGML_V2_F32x8_FMA(a, b, c) _mm256_add_ps(_mm256_mul_ps(b, c), a) #endif -#define GGML_F32x8_ADD _mm256_add_ps -#define GGML_F32x8_MUL _mm256_mul_ps -#define GGML_F32x8_REDUCE(res, x) \ +#define GGML_V2_F32x8_ADD _mm256_add_ps +#define 
GGML_V2_F32x8_MUL _mm256_mul_ps +#define GGML_V2_F32x8_REDUCE(res, x) \ { \ - for (int i = 0; i < GGML_F32_ARR/2; ++i) { \ + for (int i = 0; i < GGML_V2_F32_ARR/2; ++i) { \ x[2*i] = _mm256_add_ps(x[2*i], x[2*i+1]); \ } \ - for (int i = 0; i < GGML_F32_ARR/4; ++i) { \ + for (int i = 0; i < GGML_V2_F32_ARR/4; ++i) { \ x[4*i] = _mm256_add_ps(x[4*i], x[4*i+2]); \ } \ - for (int i = 0; i < GGML_F32_ARR/8; ++i) { \ + for (int i = 0; i < GGML_V2_F32_ARR/8; ++i) { \ x[8*i] = _mm256_add_ps(x[8*i], x[8*i+4]); \ } \ const __m128 t0 = _mm_add_ps(_mm256_castps256_ps128(x[0]), \ @@ -1691,93 +1691,93 @@ static inline quantize_fns_t get_quantize_fn(size_t i) } // TODO: is this optimal ? -#define GGML_F32_VEC GGML_F32x8 -#define GGML_F32_VEC_ZERO GGML_F32x8_ZERO -#define GGML_F32_VEC_SET1 GGML_F32x8_SET1 -#define GGML_F32_VEC_LOAD GGML_F32x8_LOAD -#define GGML_F32_VEC_STORE GGML_F32x8_STORE -#define GGML_F32_VEC_FMA GGML_F32x8_FMA -#define GGML_F32_VEC_ADD GGML_F32x8_ADD -#define GGML_F32_VEC_MUL GGML_F32x8_MUL -#define GGML_F32_VEC_REDUCE GGML_F32x8_REDUCE +#define GGML_V2_F32_VEC GGML_V2_F32x8 +#define GGML_V2_F32_VEC_ZERO GGML_V2_F32x8_ZERO +#define GGML_V2_F32_VEC_SET1 GGML_V2_F32x8_SET1 +#define GGML_V2_F32_VEC_LOAD GGML_V2_F32x8_LOAD +#define GGML_V2_F32_VEC_STORE GGML_V2_F32x8_STORE +#define GGML_V2_F32_VEC_FMA GGML_V2_F32x8_FMA +#define GGML_V2_F32_VEC_ADD GGML_V2_F32x8_ADD +#define GGML_V2_F32_VEC_MUL GGML_V2_F32x8_MUL +#define GGML_V2_F32_VEC_REDUCE GGML_V2_F32x8_REDUCE // F16 AVX -#define GGML_F16_STEP 32 -#define GGML_F16_EPR 8 +#define GGML_V2_F16_STEP 32 +#define GGML_V2_F16_EPR 8 // F16 arithmetic is not supported by AVX, so we use F32 instead -#define GGML_F32Cx8 __m256 -#define GGML_F32Cx8_ZERO _mm256_setzero_ps() -#define GGML_F32Cx8_SET1(x) _mm256_set1_ps(x) +#define GGML_V2_F32Cx8 __m256 +#define GGML_V2_F32Cx8_ZERO _mm256_setzero_ps() +#define GGML_V2_F32Cx8_SET1(x) _mm256_set1_ps(x) #if defined(__F16C__) // the _mm256_cvt intrinsics require F16C -#define GGML_F32Cx8_LOAD(x) _mm256_cvtph_ps(_mm_loadu_si128((__m128i *)(x))) -#define GGML_F32Cx8_STORE(x, y) _mm_storeu_si128((__m128i *)(x), _mm256_cvtps_ph(y, 0)) +#define GGML_V2_F32Cx8_LOAD(x) _mm256_cvtph_ps(_mm_loadu_si128((__m128i *)(x))) +#define GGML_V2_F32Cx8_STORE(x, y) _mm_storeu_si128((__m128i *)(x), _mm256_cvtps_ph(y, 0)) #else -static inline __m256 __avx_f32cx8_load(ggml_fp16_t *x) { +static inline __m256 __avx_f32cx8_load(ggml_v2_fp16_t *x) { float tmp[8]; for (int i = 0; i < 8; i++) - tmp[i] = GGML_FP16_TO_FP32(x[i]); + tmp[i] = GGML_V2_FP16_TO_FP32(x[i]); return _mm256_loadu_ps(tmp); } -static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) { +static inline void __avx_f32cx8_store(ggml_v2_fp16_t *x, __m256 y) { float arr[8]; _mm256_storeu_ps(arr, y); for (int i = 0; i < 8; i++) - x[i] = GGML_FP32_TO_FP16(arr[i]); + x[i] = GGML_V2_FP32_TO_FP16(arr[i]); } -#define GGML_F32Cx8_LOAD(x) __avx_f32cx8_load(x) -#define GGML_F32Cx8_STORE(x, y) __avx_f32cx8_store(x, y) +#define GGML_V2_F32Cx8_LOAD(x) __avx_f32cx8_load(x) +#define GGML_V2_F32Cx8_STORE(x, y) __avx_f32cx8_store(x, y) #endif -#define GGML_F32Cx8_FMA GGML_F32x8_FMA -#define GGML_F32Cx8_ADD _mm256_add_ps -#define GGML_F32Cx8_MUL _mm256_mul_ps -#define GGML_F32Cx8_REDUCE GGML_F32x8_REDUCE +#define GGML_V2_F32Cx8_FMA GGML_V2_F32x8_FMA +#define GGML_V2_F32Cx8_ADD _mm256_add_ps +#define GGML_V2_F32Cx8_MUL _mm256_mul_ps +#define GGML_V2_F32Cx8_REDUCE GGML_V2_F32x8_REDUCE -#define GGML_F16_VEC GGML_F32Cx8 -#define GGML_F16_VEC_ZERO GGML_F32Cx8_ZERO -#define 
GGML_F16_VEC_SET1 GGML_F32Cx8_SET1 -#define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx8_LOAD(p) -#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx8_STORE(p, r[i]) -#define GGML_F16_VEC_FMA GGML_F32Cx8_FMA -#define GGML_F16_VEC_ADD GGML_F32Cx8_ADD -#define GGML_F16_VEC_MUL GGML_F32Cx8_MUL -#define GGML_F16_VEC_REDUCE GGML_F32Cx8_REDUCE +#define GGML_V2_F16_VEC GGML_V2_F32Cx8 +#define GGML_V2_F16_VEC_ZERO GGML_V2_F32Cx8_ZERO +#define GGML_V2_F16_VEC_SET1 GGML_V2_F32Cx8_SET1 +#define GGML_V2_F16_VEC_LOAD(p, i) GGML_V2_F32Cx8_LOAD(p) +#define GGML_V2_F16_VEC_STORE(p, r, i) GGML_V2_F32Cx8_STORE(p, r[i]) +#define GGML_V2_F16_VEC_FMA GGML_V2_F32Cx8_FMA +#define GGML_V2_F16_VEC_ADD GGML_V2_F32Cx8_ADD +#define GGML_V2_F16_VEC_MUL GGML_V2_F32Cx8_MUL +#define GGML_V2_F16_VEC_REDUCE GGML_V2_F32Cx8_REDUCE #elif defined(__POWER9_VECTOR__) -#define GGML_SIMD +#define GGML_V2_SIMD // F32 POWER9 -#define GGML_F32_STEP 32 -#define GGML_F32_EPR 4 +#define GGML_V2_F32_STEP 32 +#define GGML_V2_F32_EPR 4 -#define GGML_F32x4 vector float -#define GGML_F32x4_ZERO 0.0f -#define GGML_F32x4_SET1 vec_splats -#define GGML_F32x4_LOAD(p) vec_xl(0, p) -#define GGML_F32x4_STORE(p, r) vec_xst(r, 0, p) -#define GGML_F32x4_FMA(a, b, c) vec_madd(b, c, a) -#define GGML_F32x4_ADD vec_add -#define GGML_F32x4_MUL vec_mul -#define GGML_F32x4_REDUCE(res, x) \ +#define GGML_V2_F32x4 vector float +#define GGML_V2_F32x4_ZERO 0.0f +#define GGML_V2_F32x4_SET1 vec_splats +#define GGML_V2_F32x4_LOAD(p) vec_xl(0, p) +#define GGML_V2_F32x4_STORE(p, r) vec_xst(r, 0, p) +#define GGML_V2_F32x4_FMA(a, b, c) vec_madd(b, c, a) +#define GGML_V2_F32x4_ADD vec_add +#define GGML_V2_F32x4_MUL vec_mul +#define GGML_V2_F32x4_REDUCE(res, x) \ { \ - for (int i = 0; i < GGML_F32_ARR/2; ++i) { \ + for (int i = 0; i < GGML_V2_F32_ARR/2; ++i) { \ x[2*i] = vec_add(x[2*i], x[2*i+1]); \ } \ - for (int i = 0; i < GGML_F32_ARR/4; ++i) { \ + for (int i = 0; i < GGML_V2_F32_ARR/4; ++i) { \ x[4*i] = vec_add(x[4*i], x[4*i+2]); \ } \ - for (int i = 0; i < GGML_F32_ARR/8; ++i) { \ + for (int i = 0; i < GGML_V2_F32_ARR/8; ++i) { \ x[8*i] = vec_add(x[8*i], x[8*i+4]); \ } \ res = vec_extract(x[0], 0) + \ @@ -1786,61 +1786,61 @@ static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) { vec_extract(x[0], 3); \ } -#define GGML_F32_VEC GGML_F32x4 -#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO -#define GGML_F32_VEC_SET1 GGML_F32x4_SET1 -#define GGML_F32_VEC_LOAD GGML_F32x4_LOAD -#define GGML_F32_VEC_STORE GGML_F32x4_STORE -#define GGML_F32_VEC_FMA GGML_F32x4_FMA -#define GGML_F32_VEC_ADD GGML_F32x4_ADD -#define GGML_F32_VEC_MUL GGML_F32x4_MUL -#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE +#define GGML_V2_F32_VEC GGML_V2_F32x4 +#define GGML_V2_F32_VEC_ZERO GGML_V2_F32x4_ZERO +#define GGML_V2_F32_VEC_SET1 GGML_V2_F32x4_SET1 +#define GGML_V2_F32_VEC_LOAD GGML_V2_F32x4_LOAD +#define GGML_V2_F32_VEC_STORE GGML_V2_F32x4_STORE +#define GGML_V2_F32_VEC_FMA GGML_V2_F32x4_FMA +#define GGML_V2_F32_VEC_ADD GGML_V2_F32x4_ADD +#define GGML_V2_F32_VEC_MUL GGML_V2_F32x4_MUL +#define GGML_V2_F32_VEC_REDUCE GGML_V2_F32x4_REDUCE // F16 POWER9 -#define GGML_F16_STEP GGML_F32_STEP -#define GGML_F16_EPR GGML_F32_EPR -#define GGML_F16_VEC GGML_F32x4 -#define GGML_F16_VEC_ZERO GGML_F32x4_ZERO -#define GGML_F16_VEC_SET1 GGML_F32x4_SET1 -#define GGML_F16_VEC_FMA GGML_F32x4_FMA -#define GGML_F16_VEC_REDUCE GGML_F32x4_REDUCE +#define GGML_V2_F16_STEP GGML_V2_F32_STEP +#define GGML_V2_F16_EPR GGML_V2_F32_EPR +#define GGML_V2_F16_VEC GGML_V2_F32x4 +#define GGML_V2_F16_VEC_ZERO GGML_V2_F32x4_ZERO +#define 
GGML_V2_F16_VEC_SET1 GGML_V2_F32x4_SET1 +#define GGML_V2_F16_VEC_FMA GGML_V2_F32x4_FMA +#define GGML_V2_F16_VEC_REDUCE GGML_V2_F32x4_REDUCE // Use vec_xl, not vec_ld, in case the load address is not aligned. -#define GGML_F16_VEC_LOAD(p, i) (i & 0x1) ? \ - vec_extract_fp32_from_shorth(vec_xl(0, p - GGML_F16_EPR)) : \ +#define GGML_V2_F16_VEC_LOAD(p, i) (i & 0x1) ? \ + vec_extract_fp32_from_shorth(vec_xl(0, p - GGML_V2_F16_EPR)) : \ vec_extract_fp32_from_shortl(vec_xl(0, p)) -#define GGML_ENDIAN_BYTE(i) ((unsigned char *)&(uint16_t){1})[i] -#define GGML_F16_VEC_STORE(p, r, i) \ +#define GGML_V2_ENDIAN_BYTE(i) ((unsigned char *)&(uint16_t){1})[i] +#define GGML_V2_F16_VEC_STORE(p, r, i) \ if (i & 0x1) \ - vec_xst(vec_pack_to_short_fp32(r[i - GGML_ENDIAN_BYTE(1)], \ - r[i - GGML_ENDIAN_BYTE(0)]), \ - 0, p - GGML_F16_EPR) + vec_xst(vec_pack_to_short_fp32(r[i - GGML_V2_ENDIAN_BYTE(1)], \ + r[i - GGML_V2_ENDIAN_BYTE(0)]), \ + 0, p - GGML_V2_F16_EPR) #elif defined(__wasm_simd128__) -#define GGML_SIMD +#define GGML_V2_SIMD // F32 WASM -#define GGML_F32_STEP 16 -#define GGML_F32_EPR 4 +#define GGML_V2_F32_STEP 16 +#define GGML_V2_F32_EPR 4 -#define GGML_F32x4 v128_t -#define GGML_F32x4_ZERO wasm_f32x4_splat(0.0f) -#define GGML_F32x4_SET1(x) wasm_f32x4_splat(x) -#define GGML_F32x4_LOAD wasm_v128_load -#define GGML_F32x4_STORE wasm_v128_store -#define GGML_F32x4_FMA(a, b, c) wasm_f32x4_add(wasm_f32x4_mul(b, c), a) -#define GGML_F32x4_ADD wasm_f32x4_add -#define GGML_F32x4_MUL wasm_f32x4_mul -#define GGML_F32x4_REDUCE(res, x) \ +#define GGML_V2_F32x4 v128_t +#define GGML_V2_F32x4_ZERO wasm_f32x4_splat(0.0f) +#define GGML_V2_F32x4_SET1(x) wasm_f32x4_splat(x) +#define GGML_V2_F32x4_LOAD wasm_v128_load +#define GGML_V2_F32x4_STORE wasm_v128_store +#define GGML_V2_F32x4_FMA(a, b, c) wasm_f32x4_add(wasm_f32x4_mul(b, c), a) +#define GGML_V2_F32x4_ADD wasm_f32x4_add +#define GGML_V2_F32x4_MUL wasm_f32x4_mul +#define GGML_V2_F32x4_REDUCE(res, x) \ { \ - for (int i = 0; i < GGML_F32_ARR/2; ++i) { \ + for (int i = 0; i < GGML_V2_F32_ARR/2; ++i) { \ x[2*i] = wasm_f32x4_add(x[2*i], x[2*i+1]); \ } \ - for (int i = 0; i < GGML_F32_ARR/4; ++i) { \ + for (int i = 0; i < GGML_V2_F32_ARR/4; ++i) { \ x[4*i] = wasm_f32x4_add(x[4*i], x[4*i+2]); \ } \ - for (int i = 0; i < GGML_F32_ARR/8; ++i) { \ + for (int i = 0; i < GGML_V2_F32_ARR/8; ++i) { \ x[8*i] = wasm_f32x4_add(x[8*i], x[8*i+4]); \ } \ res = wasm_f32x4_extract_lane(x[0], 0) + \ @@ -1849,60 +1849,60 @@ static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) { wasm_f32x4_extract_lane(x[0], 3); \ } -#define GGML_F32_VEC GGML_F32x4 -#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO -#define GGML_F32_VEC_SET1 GGML_F32x4_SET1 -#define GGML_F32_VEC_LOAD GGML_F32x4_LOAD -#define GGML_F32_VEC_STORE GGML_F32x4_STORE -#define GGML_F32_VEC_FMA GGML_F32x4_FMA -#define GGML_F32_VEC_ADD GGML_F32x4_ADD -#define GGML_F32_VEC_MUL GGML_F32x4_MUL -#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE +#define GGML_V2_F32_VEC GGML_V2_F32x4 +#define GGML_V2_F32_VEC_ZERO GGML_V2_F32x4_ZERO +#define GGML_V2_F32_VEC_SET1 GGML_V2_F32x4_SET1 +#define GGML_V2_F32_VEC_LOAD GGML_V2_F32x4_LOAD +#define GGML_V2_F32_VEC_STORE GGML_V2_F32x4_STORE +#define GGML_V2_F32_VEC_FMA GGML_V2_F32x4_FMA +#define GGML_V2_F32_VEC_ADD GGML_V2_F32x4_ADD +#define GGML_V2_F32_VEC_MUL GGML_V2_F32x4_MUL +#define GGML_V2_F32_VEC_REDUCE GGML_V2_F32x4_REDUCE // F16 WASM -#define GGML_F16_STEP 16 -#define GGML_F16_EPR 4 +#define GGML_V2_F16_STEP 16 +#define GGML_V2_F16_EPR 4 -inline static v128_t __wasm_f16x4_load(const 
ggml_fp16_t * p) { +inline static v128_t __wasm_f16x4_load(const ggml_v2_fp16_t * p) { float tmp[4]; - tmp[0] = GGML_FP16_TO_FP32(p[0]); - tmp[1] = GGML_FP16_TO_FP32(p[1]); - tmp[2] = GGML_FP16_TO_FP32(p[2]); - tmp[3] = GGML_FP16_TO_FP32(p[3]); + tmp[0] = GGML_V2_FP16_TO_FP32(p[0]); + tmp[1] = GGML_V2_FP16_TO_FP32(p[1]); + tmp[2] = GGML_V2_FP16_TO_FP32(p[2]); + tmp[3] = GGML_V2_FP16_TO_FP32(p[3]); return wasm_v128_load(tmp); } -inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) { +inline static void __wasm_f16x4_store(ggml_v2_fp16_t * p, v128_t x) { float tmp[4]; wasm_v128_store(tmp, x); - p[0] = GGML_FP32_TO_FP16(tmp[0]); - p[1] = GGML_FP32_TO_FP16(tmp[1]); - p[2] = GGML_FP32_TO_FP16(tmp[2]); - p[3] = GGML_FP32_TO_FP16(tmp[3]); + p[0] = GGML_V2_FP32_TO_FP16(tmp[0]); + p[1] = GGML_V2_FP32_TO_FP16(tmp[1]); + p[2] = GGML_V2_FP32_TO_FP16(tmp[2]); + p[3] = GGML_V2_FP32_TO_FP16(tmp[3]); } -#define GGML_F16x4 v128_t -#define GGML_F16x4_ZERO wasm_f32x4_splat(0.0f) -#define GGML_F16x4_SET1(x) wasm_f32x4_splat(x) -#define GGML_F16x4_LOAD(x) __wasm_f16x4_load(x) -#define GGML_F16x4_STORE(x, y) __wasm_f16x4_store(x, y) -#define GGML_F16x4_FMA GGML_F32x4_FMA -#define GGML_F16x4_ADD wasm_f32x4_add -#define GGML_F16x4_MUL wasm_f32x4_mul -#define GGML_F16x4_REDUCE(res, x) \ +#define GGML_V2_F16x4 v128_t +#define GGML_V2_F16x4_ZERO wasm_f32x4_splat(0.0f) +#define GGML_V2_F16x4_SET1(x) wasm_f32x4_splat(x) +#define GGML_V2_F16x4_LOAD(x) __wasm_f16x4_load(x) +#define GGML_V2_F16x4_STORE(x, y) __wasm_f16x4_store(x, y) +#define GGML_V2_F16x4_FMA GGML_V2_F32x4_FMA +#define GGML_V2_F16x4_ADD wasm_f32x4_add +#define GGML_V2_F16x4_MUL wasm_f32x4_mul +#define GGML_V2_F16x4_REDUCE(res, x) \ { \ - for (int i = 0; i < GGML_F16_ARR/2; ++i) { \ + for (int i = 0; i < GGML_V2_F16_ARR/2; ++i) { \ x[2*i] = wasm_f32x4_add(x[2*i], x[2*i+1]); \ } \ - for (int i = 0; i < GGML_F16_ARR/4; ++i) { \ + for (int i = 0; i < GGML_V2_F16_ARR/4; ++i) { \ x[4*i] = wasm_f32x4_add(x[4*i], x[4*i+2]); \ } \ - for (int i = 0; i < GGML_F16_ARR/8; ++i) { \ + for (int i = 0; i < GGML_V2_F16_ARR/8; ++i) { \ x[8*i] = wasm_f32x4_add(x[8*i], x[8*i+4]); \ } \ res = wasm_f32x4_extract_lane(x[0], 0) + \ @@ -1911,47 +1911,47 @@ inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) { wasm_f32x4_extract_lane(x[0], 3); \ } -#define GGML_F16_VEC GGML_F16x4 -#define GGML_F16_VEC_ZERO GGML_F16x4_ZERO -#define GGML_F16_VEC_SET1 GGML_F16x4_SET1 -#define GGML_F16_VEC_LOAD(p, i) GGML_F16x4_LOAD(p) -#define GGML_F16_VEC_STORE(p, r, i) GGML_F16x4_STORE(p, r[i]) -#define GGML_F16_VEC_FMA GGML_F16x4_FMA -#define GGML_F16_VEC_ADD GGML_F16x4_ADD -#define GGML_F16_VEC_MUL GGML_F16x4_MUL -#define GGML_F16_VEC_REDUCE GGML_F16x4_REDUCE +#define GGML_V2_F16_VEC GGML_V2_F16x4 +#define GGML_V2_F16_VEC_ZERO GGML_V2_F16x4_ZERO +#define GGML_V2_F16_VEC_SET1 GGML_V2_F16x4_SET1 +#define GGML_V2_F16_VEC_LOAD(p, i) GGML_V2_F16x4_LOAD(p) +#define GGML_V2_F16_VEC_STORE(p, r, i) GGML_V2_F16x4_STORE(p, r[i]) +#define GGML_V2_F16_VEC_FMA GGML_V2_F16x4_FMA +#define GGML_V2_F16_VEC_ADD GGML_V2_F16x4_ADD +#define GGML_V2_F16_VEC_MUL GGML_V2_F16x4_MUL +#define GGML_V2_F16_VEC_REDUCE GGML_V2_F16x4_REDUCE #elif defined(__SSE3__) -#define GGML_SIMD +#define GGML_V2_SIMD // F32 SSE -#define GGML_F32_STEP 32 -#define GGML_F32_EPR 4 +#define GGML_V2_F32_STEP 32 +#define GGML_V2_F32_EPR 4 -#define GGML_F32x4 __m128 -#define GGML_F32x4_ZERO _mm_setzero_ps() -#define GGML_F32x4_SET1(x) _mm_set1_ps(x) -#define GGML_F32x4_LOAD _mm_loadu_ps -#define GGML_F32x4_STORE _mm_storeu_ps 
+#define GGML_V2_F32x4 __m128 +#define GGML_V2_F32x4_ZERO _mm_setzero_ps() +#define GGML_V2_F32x4_SET1(x) _mm_set1_ps(x) +#define GGML_V2_F32x4_LOAD _mm_loadu_ps +#define GGML_V2_F32x4_STORE _mm_storeu_ps #if defined(__FMA__) // TODO: Does this work? - #define GGML_F32x4_FMA(a, b, c) _mm_fmadd_ps(b, c, a) + #define GGML_V2_F32x4_FMA(a, b, c) _mm_fmadd_ps(b, c, a) #else - #define GGML_F32x4_FMA(a, b, c) _mm_add_ps(_mm_mul_ps(b, c), a) + #define GGML_V2_F32x4_FMA(a, b, c) _mm_add_ps(_mm_mul_ps(b, c), a) #endif -#define GGML_F32x4_ADD _mm_add_ps -#define GGML_F32x4_MUL _mm_mul_ps -#define GGML_F32x4_REDUCE(res, x) \ +#define GGML_V2_F32x4_ADD _mm_add_ps +#define GGML_V2_F32x4_MUL _mm_mul_ps +#define GGML_V2_F32x4_REDUCE(res, x) \ { \ - for (int i = 0; i < GGML_F32_ARR/2; ++i) { \ + for (int i = 0; i < GGML_V2_F32_ARR/2; ++i) { \ x[2*i] = _mm_add_ps(x[2*i], x[2*i+1]); \ } \ - for (int i = 0; i < GGML_F32_ARR/4; ++i) { \ + for (int i = 0; i < GGML_V2_F32_ARR/4; ++i) { \ x[4*i] = _mm_add_ps(x[4*i], x[4*i+2]); \ } \ - for (int i = 0; i < GGML_F32_ARR/8; ++i) { \ + for (int i = 0; i < GGML_V2_F32_ARR/8; ++i) { \ x[8*i] = _mm_add_ps(x[8*i], x[8*i+4]); \ } \ const __m128 t0 = _mm_hadd_ps(x[0], x[0]); \ @@ -1959,116 +1959,116 @@ inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) { } // TODO: is this optimal ? -#define GGML_F32_VEC GGML_F32x4 -#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO -#define GGML_F32_VEC_SET1 GGML_F32x4_SET1 -#define GGML_F32_VEC_LOAD GGML_F32x4_LOAD -#define GGML_F32_VEC_STORE GGML_F32x4_STORE -#define GGML_F32_VEC_FMA GGML_F32x4_FMA -#define GGML_F32_VEC_ADD GGML_F32x4_ADD -#define GGML_F32_VEC_MUL GGML_F32x4_MUL -#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE +#define GGML_V2_F32_VEC GGML_V2_F32x4 +#define GGML_V2_F32_VEC_ZERO GGML_V2_F32x4_ZERO +#define GGML_V2_F32_VEC_SET1 GGML_V2_F32x4_SET1 +#define GGML_V2_F32_VEC_LOAD GGML_V2_F32x4_LOAD +#define GGML_V2_F32_VEC_STORE GGML_V2_F32x4_STORE +#define GGML_V2_F32_VEC_FMA GGML_V2_F32x4_FMA +#define GGML_V2_F32_VEC_ADD GGML_V2_F32x4_ADD +#define GGML_V2_F32_VEC_MUL GGML_V2_F32x4_MUL +#define GGML_V2_F32_VEC_REDUCE GGML_V2_F32x4_REDUCE // F16 SSE -#define GGML_F16_STEP 32 -#define GGML_F16_EPR 4 +#define GGML_V2_F16_STEP 32 +#define GGML_V2_F16_EPR 4 -static inline __m128 __sse_f16x4_load(ggml_fp16_t *x) { +static inline __m128 __sse_f16x4_load(ggml_v2_fp16_t *x) { float tmp[4]; - tmp[0] = GGML_FP16_TO_FP32(x[0]); - tmp[1] = GGML_FP16_TO_FP32(x[1]); - tmp[2] = GGML_FP16_TO_FP32(x[2]); - tmp[3] = GGML_FP16_TO_FP32(x[3]); + tmp[0] = GGML_V2_FP16_TO_FP32(x[0]); + tmp[1] = GGML_V2_FP16_TO_FP32(x[1]); + tmp[2] = GGML_V2_FP16_TO_FP32(x[2]); + tmp[3] = GGML_V2_FP16_TO_FP32(x[3]); return _mm_loadu_ps(tmp); } -static inline void __sse_f16x4_store(ggml_fp16_t *x, __m128 y) { +static inline void __sse_f16x4_store(ggml_v2_fp16_t *x, __m128 y) { float arr[4]; _mm_storeu_ps(arr, y); - x[0] = GGML_FP32_TO_FP16(arr[0]); - x[1] = GGML_FP32_TO_FP16(arr[1]); - x[2] = GGML_FP32_TO_FP16(arr[2]); - x[3] = GGML_FP32_TO_FP16(arr[3]); + x[0] = GGML_V2_FP32_TO_FP16(arr[0]); + x[1] = GGML_V2_FP32_TO_FP16(arr[1]); + x[2] = GGML_V2_FP32_TO_FP16(arr[2]); + x[3] = GGML_V2_FP32_TO_FP16(arr[3]); } -#define GGML_F32Cx4 __m128 -#define GGML_F32Cx4_ZERO _mm_setzero_ps() -#define GGML_F32Cx4_SET1(x) _mm_set1_ps(x) -#define GGML_F32Cx4_LOAD(x) __sse_f16x4_load(x) -#define GGML_F32Cx4_STORE(x, y) __sse_f16x4_store(x, y) -#define GGML_F32Cx4_FMA GGML_F32x4_FMA -#define GGML_F32Cx4_ADD _mm_add_ps -#define GGML_F32Cx4_MUL _mm_mul_ps -#define GGML_F32Cx4_REDUCE 
GGML_F32x4_REDUCE +#define GGML_V2_F32Cx4 __m128 +#define GGML_V2_F32Cx4_ZERO _mm_setzero_ps() +#define GGML_V2_F32Cx4_SET1(x) _mm_set1_ps(x) +#define GGML_V2_F32Cx4_LOAD(x) __sse_f16x4_load(x) +#define GGML_V2_F32Cx4_STORE(x, y) __sse_f16x4_store(x, y) +#define GGML_V2_F32Cx4_FMA GGML_V2_F32x4_FMA +#define GGML_V2_F32Cx4_ADD _mm_add_ps +#define GGML_V2_F32Cx4_MUL _mm_mul_ps +#define GGML_V2_F32Cx4_REDUCE GGML_V2_F32x4_REDUCE -#define GGML_F16_VEC GGML_F32Cx4 -#define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO -#define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1 -#define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx4_LOAD(p) -#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE(p, r[i]) -#define GGML_F16_VEC_FMA GGML_F32Cx4_FMA -#define GGML_F16_VEC_ADD GGML_F32Cx4_ADD -#define GGML_F16_VEC_MUL GGML_F32Cx4_MUL -#define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE +#define GGML_V2_F16_VEC GGML_V2_F32Cx4 +#define GGML_V2_F16_VEC_ZERO GGML_V2_F32Cx4_ZERO +#define GGML_V2_F16_VEC_SET1 GGML_V2_F32Cx4_SET1 +#define GGML_V2_F16_VEC_LOAD(p, i) GGML_V2_F32Cx4_LOAD(p) +#define GGML_V2_F16_VEC_STORE(p, r, i) GGML_V2_F32Cx4_STORE(p, r[i]) +#define GGML_V2_F16_VEC_FMA GGML_V2_F32Cx4_FMA +#define GGML_V2_F16_VEC_ADD GGML_V2_F32Cx4_ADD +#define GGML_V2_F16_VEC_MUL GGML_V2_F32Cx4_MUL +#define GGML_V2_F16_VEC_REDUCE GGML_V2_F32Cx4_REDUCE #endif -// GGML_F32_ARR / GGML_F16_ARR +// GGML_V2_F32_ARR / GGML_V2_F16_ARR // number of registers to use per step -#ifdef GGML_SIMD -#define GGML_F32_ARR (GGML_F32_STEP/GGML_F32_EPR) -#define GGML_F16_ARR (GGML_F16_STEP/GGML_F16_EPR) +#ifdef GGML_V2_SIMD +#define GGML_V2_F32_ARR (GGML_V2_F32_STEP/GGML_V2_F32_EPR) +#define GGML_V2_F16_ARR (GGML_V2_F16_STEP/GGML_V2_F16_EPR) #endif // // fundamental operations // -inline static void ggml_vec_set_i8(const int n, int8_t * x, const int8_t v) { for (int i = 0; i < n; ++i) x[i] = v; } +inline static void ggml_v2_vec_set_i8(const int n, int8_t * x, const int8_t v) { for (int i = 0; i < n; ++i) x[i] = v; } -inline static void ggml_vec_set_i16(const int n, int16_t * x, const int16_t v) { for (int i = 0; i < n; ++i) x[i] = v; } +inline static void ggml_v2_vec_set_i16(const int n, int16_t * x, const int16_t v) { for (int i = 0; i < n; ++i) x[i] = v; } -inline static void ggml_vec_set_i32(const int n, int32_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; } +inline static void ggml_v2_vec_set_i32(const int n, int32_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; } -inline static void ggml_vec_set_f16(const int n, ggml_fp16_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; } +inline static void ggml_v2_vec_set_f16(const int n, ggml_v2_fp16_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; } -inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; } -inline static void ggml_vec_add1_f32(const int n, float * z, const float * x, const float v) { for (int i = 0; i < n; ++i) z[i] = x[i] + v; } -inline static void ggml_vec_acc_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] += x[i]; } -inline static void ggml_vec_acc1_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] += v; } -inline static void ggml_vec_sub_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] - y[i]; } -inline static void ggml_vec_set_f32 (const int n, float * x, const float v) { for (int i = 0; i < n; ++i) x[i] = v; } -inline static void 
ggml_vec_cpy_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]; } -inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = -x[i]; } -inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; } -inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; } +inline static void ggml_v2_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; } +inline static void ggml_v2_vec_add1_f32(const int n, float * z, const float * x, const float v) { for (int i = 0; i < n; ++i) z[i] = x[i] + v; } +inline static void ggml_v2_vec_acc_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] += x[i]; } +inline static void ggml_v2_vec_acc1_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] += v; } +inline static void ggml_v2_vec_sub_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] - y[i]; } +inline static void ggml_v2_vec_set_f32 (const int n, float * x, const float v) { for (int i = 0; i < n; ++i) x[i] = v; } +inline static void ggml_v2_vec_cpy_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]; } +inline static void ggml_v2_vec_neg_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = -x[i]; } +inline static void ggml_v2_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; } +inline static void ggml_v2_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; } -inline static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y) { -#ifdef GGML_SIMD +inline static void ggml_v2_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y) { +#ifdef GGML_V2_SIMD float sumf = 0.0f; - const int np = (n & ~(GGML_F32_STEP - 1)); + const int np = (n & ~(GGML_V2_F32_STEP - 1)); - GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO }; + GGML_V2_F32_VEC sum[GGML_V2_F32_ARR] = { GGML_V2_F32_VEC_ZERO }; - GGML_F32_VEC ax[GGML_F32_ARR]; - GGML_F32_VEC ay[GGML_F32_ARR]; + GGML_V2_F32_VEC ax[GGML_V2_F32_ARR]; + GGML_V2_F32_VEC ay[GGML_V2_F32_ARR]; - for (int i = 0; i < np; i += GGML_F32_STEP) { - for (int j = 0; j < GGML_F32_ARR; j++) { - ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR); - ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR); + for (int i = 0; i < np; i += GGML_V2_F32_STEP) { + for (int j = 0; j < GGML_V2_F32_ARR; j++) { + ax[j] = GGML_V2_F32_VEC_LOAD(x + i + j*GGML_V2_F32_EPR); + ay[j] = GGML_V2_F32_VEC_LOAD(y + i + j*GGML_V2_F32_EPR); - sum[j] = GGML_F32_VEC_FMA(sum[j], ax[j], ay[j]); + sum[j] = GGML_V2_F32_VEC_FMA(sum[j], ax[j], ay[j]); } } // reduce sum0..sum3 to sum0 - GGML_F32_VEC_REDUCE(sumf, sum); + GGML_V2_F32_VEC_REDUCE(sumf, sum); // leftovers for (int i = np; i < n; ++i) { @@ -2076,52 +2076,52 @@ inline static void ggml_vec_dot_f32(const int n, float * restrict s, const float } #else // scalar - ggml_float sumf = 0.0; + ggml_v2_float sumf = 0.0; for (int i = 0; i < n; ++i) { - sumf += (ggml_float)(x[i]*y[i]); + sumf += (ggml_v2_float)(x[i]*y[i]); } #endif *s = sumf; } -inline static void 
ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y) { - ggml_float sumf = 0.0; +inline static void ggml_v2_vec_dot_f16(const int n, float * restrict s, ggml_v2_fp16_t * restrict x, ggml_v2_fp16_t * restrict y) { + ggml_v2_float sumf = 0.0; -#if defined(GGML_SIMD) - const int np = (n & ~(GGML_F16_STEP - 1)); +#if defined(GGML_V2_SIMD) + const int np = (n & ~(GGML_V2_F16_STEP - 1)); - GGML_F16_VEC sum[GGML_F16_ARR] = { GGML_F16_VEC_ZERO }; + GGML_V2_F16_VEC sum[GGML_V2_F16_ARR] = { GGML_V2_F16_VEC_ZERO }; - GGML_F16_VEC ax[GGML_F16_ARR]; - GGML_F16_VEC ay[GGML_F16_ARR]; + GGML_V2_F16_VEC ax[GGML_V2_F16_ARR]; + GGML_V2_F16_VEC ay[GGML_V2_F16_ARR]; - for (int i = 0; i < np; i += GGML_F16_STEP) { - for (int j = 0; j < GGML_F16_ARR; j++) { - ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j); - ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j); + for (int i = 0; i < np; i += GGML_V2_F16_STEP) { + for (int j = 0; j < GGML_V2_F16_ARR; j++) { + ax[j] = GGML_V2_F16_VEC_LOAD(x + i + j*GGML_V2_F16_EPR, j); + ay[j] = GGML_V2_F16_VEC_LOAD(y + i + j*GGML_V2_F16_EPR, j); - sum[j] = GGML_F16_VEC_FMA(sum[j], ax[j], ay[j]); + sum[j] = GGML_V2_F16_VEC_FMA(sum[j], ax[j], ay[j]); } } // reduce sum0..sum3 to sum0 - GGML_F16_VEC_REDUCE(sumf, sum); + GGML_V2_F16_VEC_REDUCE(sumf, sum); // leftovers for (int i = np; i < n; ++i) { - sumf += (ggml_float)(GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i])); + sumf += (ggml_v2_float)(GGML_V2_FP16_TO_FP32(x[i])*GGML_V2_FP16_TO_FP32(y[i])); } #else for (int i = 0; i < n; ++i) { - sumf += (ggml_float)(GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i])); + sumf += (ggml_v2_float)(GGML_V2_FP16_TO_FP32(x[i])*GGML_V2_FP16_TO_FP32(y[i])); } #endif *s = sumf; } -static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +static void ggml_v2_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { const int qk = QK8_0; const int nb = n / qk; @@ -2391,7 +2391,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * #endif } -static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +static void ggml_v2_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { const int qk = QK8_1; const int nb = n / qk; @@ -2517,7 +2517,7 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * #endif } -static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +static void ggml_v2_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { const int qk = QK8_0; const int nb = n / qk; @@ -2586,8 +2586,8 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * const int8x16_t v1_1l = vld1q_s8(y1->qs); const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); - const float x0d = GGML_FP16_TO_FP32(x0->d); - const float x1d = GGML_FP16_TO_FP32(x1->d); + const float x0d = GGML_V2_FP16_TO_FP32(x0->d); + const float x1d = GGML_V2_FP16_TO_FP32(x1->d); #if defined(__ARM_FEATURE_DOTPROD) sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32( @@ -2668,7 +2668,7 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h); const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h); - const float 
x0d = GGML_FP16_TO_FP32(x0->d); + const float x0d = GGML_V2_FP16_TO_FP32(x0->d); // dot product sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4( @@ -2688,7 +2688,7 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * // Main loop for (int i = 0; i < nb; i++) { /* Compute combined scale for the block */ - const __m256 d = _mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d)), _mm256_broadcast_ss(&y[i].d)); + const __m256 d = _mm256_mul_ps(_mm256_set1_ps(GGML_V2_FP16_TO_FP32(x[i].d)), _mm256_broadcast_ss(&y[i].d)); __m256i bx = bytes_from_nibbles_32(x[i].qs); __m256i bxhi = bytes_from_bits_32(x[i].qh); @@ -2712,7 +2712,7 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * // Main loop for (int i = 0; i < nb; i++) { /* Compute combined scale for the block */ - const __m256 d = _mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d)), _mm256_broadcast_ss(&y[i].d)); + const __m256 d = _mm256_mul_ps(_mm256_set1_ps(GGML_V2_FP16_TO_FP32(x[i].d)), _mm256_broadcast_ss(&y[i].d)); __m256i bx = bytes_from_nibbles_32(x[i].qs); const __m256i bxhi = bytes_from_bits_32(x[i].qh); @@ -2755,14 +2755,14 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]); } - sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi; + sumf += (GGML_V2_FP16_TO_FP32(x[i].d)*y[i].d)*sumi; } *s = sumf; #endif } -static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +static void ggml_v2_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { const int qk = QK8_1; const int nb = n / qk; @@ -2794,8 +2794,8 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * const uint8x16_t m4b = vdupq_n_u8(0x0F); - summs0 += GGML_FP16_TO_FP32(x0->m) * y0->s; - summs1 += GGML_FP16_TO_FP32(x1->m) * y1->s; + summs0 += GGML_V2_FP16_TO_FP32(x0->m) * y0->s; + summs1 += GGML_V2_FP16_TO_FP32(x1->m) * y1->s; // extract the 5th bit via lookup table ((b) << 4) memcpy(&qh0, x0->qh, sizeof(qh0)); @@ -2837,8 +2837,8 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * const int8x16_t v1_1l = vld1q_s8(y1->qs); const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); - const float x0d = GGML_FP16_TO_FP32(x0->d); - const float x1d = GGML_FP16_TO_FP32(x1->d); + const float x0d = GGML_V2_FP16_TO_FP32(x0->d); + const float x1d = GGML_V2_FP16_TO_FP32(x1->d); #if defined(__ARM_FEATURE_DOTPROD) sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32( @@ -2882,7 +2882,7 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * const block_q5_1 * restrict x0 = &x[i]; const block_q8_1 * restrict y0 = &y[i]; - summs += GGML_FP16_TO_FP32(x0->m) * y0->s; + summs += GGML_V2_FP16_TO_FP32(x0->m) * y0->s; const v128_t m4b = wasm_i8x16_splat(0x0F); @@ -2924,7 +2924,7 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h); const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h); - const float x0d = GGML_FP16_TO_FP32(x0->d); + const float x0d = GGML_V2_FP16_TO_FP32(x0->d); // dot product sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4( @@ -2945,9 +2945,9 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * // Main loop for (int i = 0; i < nb; i++) { - const __m256 dx = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d)); 
+ const __m256 dx = _mm256_set1_ps(GGML_V2_FP16_TO_FP32(x[i].d)); - summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s; + summs += GGML_V2_FP16_TO_FP32(x[i].m) * y[i].s; __m256i bx = bytes_from_nibbles_32(x[i].qs); __m256i bxhi = bytes_from_bits_32(x[i].qh); @@ -2972,9 +2972,9 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * // Main loop for (int i = 0; i < nb; i++) { - const __m256 dx = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d)); + const __m256 dx = _mm256_set1_ps(GGML_V2_FP16_TO_FP32(x[i].d)); - summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s; + summs += GGML_V2_FP16_TO_FP32(x[i].m) * y[i].s; __m256i bx = bytes_from_nibbles_32(x[i].qs); const __m256i bxhi = bytes_from_bits_32(x[i].qh); @@ -3017,14 +3017,14 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]); } - sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s; + sumf += (GGML_V2_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_V2_FP16_TO_FP32(x[i].m)*y[i].s; } *s = sumf; #endif } -static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +static void ggml_v2_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { const int qk = QK8_0; const int nb = n / qk; @@ -3126,77 +3126,77 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * #endif } -// compute GGML_VEC_DOT_UNROLL dot products at once +// compute GGML_V2_VEC_DOT_UNROLL dot products at once // xs - x row stride in bytes -inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * restrict s, void * restrict xv, ggml_fp16_t * restrict y) { - ggml_float sumf[GGML_VEC_DOT_UNROLL] = { 0.0 }; +inline static void ggml_v2_vec_dot_f16_unroll(const int n, const int xs, float * restrict s, void * restrict xv, ggml_v2_fp16_t * restrict y) { + ggml_v2_float sumf[GGML_V2_VEC_DOT_UNROLL] = { 0.0 }; - ggml_fp16_t * restrict x[GGML_VEC_DOT_UNROLL]; + ggml_v2_fp16_t * restrict x[GGML_V2_VEC_DOT_UNROLL]; - for (int i = 0; i < GGML_VEC_DOT_UNROLL; ++i) { - x[i] = (ggml_fp16_t *) ((char *) xv + i*xs); + for (int i = 0; i < GGML_V2_VEC_DOT_UNROLL; ++i) { + x[i] = (ggml_v2_fp16_t *) ((char *) xv + i*xs); } -#if defined(GGML_SIMD) - const int np = (n & ~(GGML_F16_STEP - 1)); +#if defined(GGML_V2_SIMD) + const int np = (n & ~(GGML_V2_F16_STEP - 1)); - GGML_F16_VEC sum[GGML_VEC_DOT_UNROLL][GGML_F16_ARR] = { { GGML_F16_VEC_ZERO } }; + GGML_V2_F16_VEC sum[GGML_V2_VEC_DOT_UNROLL][GGML_V2_F16_ARR] = { { GGML_V2_F16_VEC_ZERO } }; - GGML_F16_VEC ax[GGML_F16_ARR]; - GGML_F16_VEC ay[GGML_F16_ARR]; + GGML_V2_F16_VEC ax[GGML_V2_F16_ARR]; + GGML_V2_F16_VEC ay[GGML_V2_F16_ARR]; - for (int i = 0; i < np; i += GGML_F16_STEP) { - for (int j = 0; j < GGML_F16_ARR; j++) { - ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j); + for (int i = 0; i < np; i += GGML_V2_F16_STEP) { + for (int j = 0; j < GGML_V2_F16_ARR; j++) { + ay[j] = GGML_V2_F16_VEC_LOAD(y + i + j*GGML_V2_F16_EPR, j); - for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) { - ax[j] = GGML_F16_VEC_LOAD(x[k] + i + j*GGML_F16_EPR, j); + for (int k = 0; k < GGML_V2_VEC_DOT_UNROLL; ++k) { + ax[j] = GGML_V2_F16_VEC_LOAD(x[k] + i + j*GGML_V2_F16_EPR, j); - sum[k][j] = GGML_F16_VEC_FMA(sum[k][j], ax[j], ay[j]); + sum[k][j] = GGML_V2_F16_VEC_FMA(sum[k][j], ax[j], ay[j]); } } } // reduce sum0..sum3 to sum0 - for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) { - 
GGML_F16_VEC_REDUCE(sumf[k], sum[k]); + for (int k = 0; k < GGML_V2_VEC_DOT_UNROLL; ++k) { + GGML_V2_F16_VEC_REDUCE(sumf[k], sum[k]); } // leftovers for (int i = np; i < n; ++i) { - for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) { - sumf[j] += (ggml_float)(GGML_FP16_TO_FP32(x[j][i])*GGML_FP16_TO_FP32(y[i])); + for (int j = 0; j < GGML_V2_VEC_DOT_UNROLL; ++j) { + sumf[j] += (ggml_v2_float)(GGML_V2_FP16_TO_FP32(x[j][i])*GGML_V2_FP16_TO_FP32(y[i])); } } #else for (int i = 0; i < n; ++i) { - for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) { - sumf[j] += (ggml_float)(GGML_FP16_TO_FP32(x[j][i])*GGML_FP16_TO_FP32(y[i])); + for (int j = 0; j < GGML_V2_VEC_DOT_UNROLL; ++j) { + sumf[j] += (ggml_v2_float)(GGML_V2_FP16_TO_FP32(x[j][i])*GGML_V2_FP16_TO_FP32(y[i])); } } #endif - for (int i = 0; i < GGML_VEC_DOT_UNROLL; ++i) { + for (int i = 0; i < GGML_V2_VEC_DOT_UNROLL; ++i) { s[i] = sumf[i]; } } -inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float * restrict x, const float v) { -#if defined(GGML_SIMD) - const int np = (n & ~(GGML_F32_STEP - 1)); +inline static void ggml_v2_vec_mad_f32(const int n, float * restrict y, const float * restrict x, const float v) { +#if defined(GGML_V2_SIMD) + const int np = (n & ~(GGML_V2_F32_STEP - 1)); - GGML_F32_VEC vx = GGML_F32_VEC_SET1(v); + GGML_V2_F32_VEC vx = GGML_V2_F32_VEC_SET1(v); - GGML_F32_VEC ax[GGML_F32_ARR]; - GGML_F32_VEC ay[GGML_F32_ARR]; + GGML_V2_F32_VEC ax[GGML_V2_F32_ARR]; + GGML_V2_F32_VEC ay[GGML_V2_F32_ARR]; - for (int i = 0; i < np; i += GGML_F32_STEP) { - for (int j = 0; j < GGML_F32_ARR; j++) { - ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR); - ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR); - ay[j] = GGML_F32_VEC_FMA(ay[j], ax[j], vx); + for (int i = 0; i < np; i += GGML_V2_F32_STEP) { + for (int j = 0; j < GGML_V2_F32_ARR; j++) { + ax[j] = GGML_V2_F32_VEC_LOAD(x + i + j*GGML_V2_F32_EPR); + ay[j] = GGML_V2_F32_VEC_LOAD(y + i + j*GGML_V2_F32_EPR); + ay[j] = GGML_V2_F32_VEC_FMA(ay[j], ax[j], vx); - GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]); + GGML_V2_F32_VEC_STORE(y + i + j*GGML_V2_F32_EPR, ay[j]); } } @@ -3212,21 +3212,21 @@ inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float #endif } -//inline static void ggml_vec_scale_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] *= v; } -inline static void ggml_vec_scale_f32(const int n, float * y, const float v) { -#if defined(GGML_SIMD) - const int np = (n & ~(GGML_F32_STEP - 1)); +//inline static void ggml_v2_vec_scale_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] *= v; } +inline static void ggml_v2_vec_scale_f32(const int n, float * y, const float v) { +#if defined(GGML_V2_SIMD) + const int np = (n & ~(GGML_V2_F32_STEP - 1)); - GGML_F32_VEC vx = GGML_F32_VEC_SET1(v); + GGML_V2_F32_VEC vx = GGML_V2_F32_VEC_SET1(v); - GGML_F32_VEC ay[GGML_F32_ARR]; + GGML_V2_F32_VEC ay[GGML_V2_F32_ARR]; - for (int i = 0; i < np; i += GGML_F32_STEP) { - for (int j = 0; j < GGML_F32_ARR; j++) { - ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR); - ay[j] = GGML_F32_VEC_MUL(ay[j], vx); + for (int i = 0; i < np; i += GGML_V2_F32_STEP) { + for (int j = 0; j < GGML_V2_F32_ARR; j++) { + ay[j] = GGML_V2_F32_VEC_LOAD(y + i + j*GGML_V2_F32_EPR); + ay[j] = GGML_V2_F32_VEC_MUL(ay[j], vx); - GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]); + GGML_V2_F32_VEC_STORE(y + i + j*GGML_V2_F32_EPR, ay[j]); } } @@ -3242,103 +3242,103 @@ inline static void ggml_vec_scale_f32(const int n, float * y, const float 
v) { #endif } -inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) { ggml_vec_dot_f32(n, s, x, x); *s = sqrtf(*s); } -inline static void ggml_vec_sqr_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]*x[i]; } -inline static void ggml_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrtf(x[i]); } -inline static void ggml_vec_log_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = logf(x[i]); } -inline static void ggml_vec_abs_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fabsf(x[i]); } -inline static void ggml_vec_sgn_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : ((x[i] < 0.f) ? -1.f : 0.f); } -inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : 0.f; } -inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; } +inline static void ggml_v2_vec_norm_f32 (const int n, float * s, const float * x) { ggml_v2_vec_dot_f32(n, s, x, x); *s = sqrtf(*s); } +inline static void ggml_v2_vec_sqr_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]*x[i]; } +inline static void ggml_v2_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrtf(x[i]); } +inline static void ggml_v2_vec_log_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = logf(x[i]); } +inline static void ggml_v2_vec_abs_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fabsf(x[i]); } +inline static void ggml_v2_vec_sgn_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : ((x[i] < 0.f) ? -1.f : 0.f); } +inline static void ggml_v2_vec_step_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : 0.f; } +inline static void ggml_v2_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 
x[i] : 0.f; } static const float GELU_COEF_A = 0.044715f; static const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f; -inline static float ggml_gelu_f32(float x) { +inline static float ggml_v2_gelu_f32(float x) { return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x))); } -inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { +inline static void ggml_v2_vec_gelu_f16(const int n, ggml_v2_fp16_t * y, const ggml_v2_fp16_t * x) { const uint16_t * i16 = (const uint16_t *) x; for (int i = 0; i < n; ++i) { y[i] = table_gelu_f16[i16[i]]; } } -#ifdef GGML_GELU_FP16 -inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) { +#ifdef GGML_V2_GELU_FP16 +inline static void ggml_v2_vec_gelu_f32(const int n, float * y, const float * x) { uint16_t t; for (int i = 0; i < n; ++i) { - ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]); + ggml_v2_fp16_t fp16 = GGML_V2_FP32_TO_FP16(x[i]); memcpy(&t, &fp16, sizeof(uint16_t)); - y[i] = GGML_FP16_TO_FP32(table_gelu_f16[t]); + y[i] = GGML_V2_FP16_TO_FP32(table_gelu_f16[t]); } } #else -inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) { +inline static void ggml_v2_vec_gelu_f32(const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) { - y[i] = ggml_gelu_f32(x[i]); + y[i] = ggml_v2_gelu_f32(x[i]); } } #endif // Sigmoid Linear Unit (SiLU) function -inline static float ggml_silu_f32(float x) { +inline static float ggml_v2_silu_f32(float x) { return x/(1.0f + expf(-x)); } -//inline static void ggml_vec_silu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { +//inline static void ggml_v2_vec_silu_f16(const int n, ggml_v2_fp16_t * y, const ggml_v2_fp16_t * x) { // const uint16_t * i16 = (const uint16_t *) x; // for (int i = 0; i < n; ++i) { // y[i] = table_silu_f16[i16[i]]; // } //} -#ifdef GGML_SILU_FP16 -inline static void ggml_vec_silu_f32(const int n, float * y, const float * x) { +#ifdef GGML_V2_SILU_FP16 +inline static void ggml_v2_vec_silu_f32(const int n, float * y, const float * x) { uint16_t t; for (int i = 0; i < n; ++i) { - ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]); + ggml_v2_fp16_t fp16 = GGML_V2_FP32_TO_FP16(x[i]); memcpy(&t, &fp16, sizeof(uint16_t)); - y[i] = GGML_FP16_TO_FP32(table_silu_f16[t]); + y[i] = GGML_V2_FP16_TO_FP32(table_silu_f16[t]); } } #else -inline static void ggml_vec_silu_f32(const int n, float * y, const float * x) { +inline static void ggml_v2_vec_silu_f32(const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) { - y[i] = ggml_silu_f32(x[i]); + y[i] = ggml_v2_silu_f32(x[i]); } } #endif -inline static float ggml_silu_backward_f32(float x, float dy) { +inline static float ggml_v2_silu_backward_f32(float x, float dy) { const float s = 1.0f/(1.0f + expf(-x)); return dy*s*(1.0f + x*(1.0f - s)); } -#ifdef GGML_SILU_FP16 -inline static void ggml_vec_silu_backward_f32(const int n, float * dx, const float * x, const float * dy) { +#ifdef GGML_V2_SILU_FP16 +inline static void ggml_v2_vec_silu_backward_f32(const int n, float * dx, const float * x, const float * dy) { for (int i = 0; i < n; ++i) { // we did not use x[i] to compute forward silu but its f16 equivalent // take derivative at f16 of x[i]: - ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]); - float usedx = GGML_FP16_TO_FP32(fp16); - dx[i] = ggml_silu_backward_f32(usedx, dy[i]); + ggml_v2_fp16_t fp16 = GGML_V2_FP32_TO_FP16(x[i]); + float usedx = GGML_V2_FP16_TO_FP32(fp16); + dx[i] = ggml_v2_silu_backward_f32(usedx, dy[i]); } } #else -inline static void 
ggml_vec_silu_backward_f32(const int n, float * dx, const float * x, const float * dy) { +inline static void ggml_v2_vec_silu_backward_f32(const int n, float * dx, const float * x, const float * dy) { for (int i = 0; i < n; ++i) { - dx[i] = ggml_silu_backward_f32(x[i], dy[i]); + dx[i] = ggml_v2_silu_backward_f32(x[i], dy[i]); } } #endif -inline static void ggml_vec_sum_f32(const int n, float * s, const float * x) { +inline static void ggml_v2_vec_sum_f32(const int n, float * s, const float * x) { #ifndef GGML_USE_ACCELERATE - ggml_float sum = 0.0; + ggml_v2_float sum = 0.0; for (int i = 0; i < n; ++i) { - sum += (ggml_float)x[i]; + sum += (ggml_v2_float)x[i]; } *s = sum; #else @@ -3346,15 +3346,15 @@ inline static void ggml_vec_sum_f32(const int n, float * s, const float * x) { #endif } -inline static void ggml_vec_sum_ggf(const int n, ggml_float * s, const float * x) { - ggml_float sum = 0.0; +inline static void ggml_v2_vec_sum_ggf(const int n, ggml_v2_float * s, const float * x) { + ggml_v2_float sum = 0.0; for (int i = 0; i < n; ++i) { - sum += (ggml_float)x[i]; + sum += (ggml_v2_float)x[i]; } *s = sum; } -inline static void ggml_vec_max_f32(const int n, float * s, const float * x) { +inline static void ggml_v2_vec_max_f32(const int n, float * s, const float * x) { #ifndef GGML_USE_ACCELERATE float max = -INFINITY; for (int i = 0; i < n; ++i) { @@ -3366,8 +3366,8 @@ inline static void ggml_vec_max_f32(const int n, float * s, const float * x) { #endif } -inline static void ggml_vec_norm_inv_f32(const int n, float * s, const float * x) { - ggml_vec_norm_f32(n, s, x); +inline static void ggml_v2_vec_norm_inv_f32(const int n, float * s, const float * x) { + ggml_v2_vec_norm_f32(n, s, x); *s = 1.f/(*s); } @@ -3375,104 +3375,104 @@ inline static void ggml_vec_norm_inv_f32(const int n, float * s, const float * x // logging // -#if (GGML_DEBUG >= 1) -#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__) +#if (GGML_V2_DEBUG >= 1) +#define GGML_V2_PRINT_DEBUG(...) printf(__VA_ARGS__) #else -#define GGML_PRINT_DEBUG(...) +#define GGML_V2_PRINT_DEBUG(...) #endif -#if (GGML_DEBUG >= 5) -#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__) +#if (GGML_V2_DEBUG >= 5) +#define GGML_V2_PRINT_DEBUG_5(...) printf(__VA_ARGS__) #else -#define GGML_PRINT_DEBUG_5(...) +#define GGML_V2_PRINT_DEBUG_5(...) #endif -#if (GGML_DEBUG >= 10) -#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__) +#if (GGML_V2_DEBUG >= 10) +#define GGML_V2_PRINT_DEBUG_10(...) printf(__VA_ARGS__) #else -#define GGML_PRINT_DEBUG_10(...) +#define GGML_V2_PRINT_DEBUG_10(...) #endif -#define GGML_PRINT(...) printf(__VA_ARGS__) +#define GGML_V2_PRINT(...) 
printf(__VA_ARGS__) // // data types // -static const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = { - [GGML_TYPE_F32] = 1, - [GGML_TYPE_F16] = 1, - [GGML_TYPE_Q4_0] = QK4_0, - [GGML_TYPE_Q4_1] = QK4_1, - [GGML_TYPE_Q4_2] = QK4_2, - [GGML_TYPE_Q4_3] = QK4_3, - [GGML_TYPE_Q5_0] = QK5_0, - [GGML_TYPE_Q5_1] = QK5_1, - [GGML_TYPE_Q8_0] = QK8_0, - [GGML_TYPE_Q8_1] = QK8_1, - [GGML_TYPE_Q8_1B] = QK8_1, - [GGML_TYPE_I8] = 1, - [GGML_TYPE_I16] = 1, - [GGML_TYPE_I32] = 1, +static const int GGML_V2_BLCK_SIZE[GGML_V2_TYPE_COUNT] = { + [GGML_V2_TYPE_F32] = 1, + [GGML_V2_TYPE_F16] = 1, + [GGML_V2_TYPE_Q4_0] = QK4_0, + [GGML_V2_TYPE_Q4_1] = QK4_1, + [GGML_V2_TYPE_Q4_2] = QK4_2, + [GGML_V2_TYPE_Q4_3] = QK4_3, + [GGML_V2_TYPE_Q5_0] = QK5_0, + [GGML_V2_TYPE_Q5_1] = QK5_1, + [GGML_V2_TYPE_Q8_0] = QK8_0, + [GGML_V2_TYPE_Q8_1] = QK8_1, + [GGML_V2_TYPE_Q8_1B] = QK8_1, + [GGML_V2_TYPE_I8] = 1, + [GGML_V2_TYPE_I16] = 1, + [GGML_V2_TYPE_I32] = 1, }; -static_assert(GGML_TYPE_COUNT == 14, "GGML_BLCK_SIZE is outdated"); +static_assert(GGML_V2_TYPE_COUNT == 14, "GGML_V2_BLCK_SIZE is outdated"); -static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = { - [GGML_TYPE_F32] = sizeof(float), - [GGML_TYPE_F16] = sizeof(ggml_fp16_t), - [GGML_TYPE_Q4_0] = sizeof(block_q4_0), - [GGML_TYPE_Q4_1] = sizeof(block_q4_1), - [GGML_TYPE_Q4_2] = sizeof(block_q4_2), - [GGML_TYPE_Q4_3] = sizeof(block_q4_3), - [GGML_TYPE_Q5_0] = sizeof(block_q5_0), - [GGML_TYPE_Q5_1] = sizeof(block_q5_1), - [GGML_TYPE_Q8_0] = sizeof(block_q8_0), - [GGML_TYPE_Q8_1] = sizeof(block_q8_1), - [GGML_TYPE_Q8_1B] = sizeof(block_q8_1_v2), - [GGML_TYPE_I8] = sizeof(int8_t), - [GGML_TYPE_I16] = sizeof(int16_t), - [GGML_TYPE_I32] = sizeof(int32_t), +static const size_t GGML_V2_TYPE_SIZE[GGML_V2_TYPE_COUNT] = { + [GGML_V2_TYPE_F32] = sizeof(float), + [GGML_V2_TYPE_F16] = sizeof(ggml_v2_fp16_t), + [GGML_V2_TYPE_Q4_0] = sizeof(block_q4_0), + [GGML_V2_TYPE_Q4_1] = sizeof(block_q4_1), + [GGML_V2_TYPE_Q4_2] = sizeof(block_q4_2), + [GGML_V2_TYPE_Q4_3] = sizeof(block_q4_3), + [GGML_V2_TYPE_Q5_0] = sizeof(block_q5_0), + [GGML_V2_TYPE_Q5_1] = sizeof(block_q5_1), + [GGML_V2_TYPE_Q8_0] = sizeof(block_q8_0), + [GGML_V2_TYPE_Q8_1] = sizeof(block_q8_1), + [GGML_V2_TYPE_Q8_1B] = sizeof(block_q8_1_v2), + [GGML_V2_TYPE_I8] = sizeof(int8_t), + [GGML_V2_TYPE_I16] = sizeof(int16_t), + [GGML_V2_TYPE_I32] = sizeof(int32_t), }; -static_assert(GGML_TYPE_COUNT == 14, "GGML_TYPE_SIZE is outdated"); +static_assert(GGML_V2_TYPE_COUNT == 14, "GGML_V2_TYPE_SIZE is outdated"); -static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = { - [GGML_TYPE_F32] = "f32", - [GGML_TYPE_F16] = "f16", - [GGML_TYPE_Q4_0] = "q4_0", - [GGML_TYPE_Q4_1] = "q4_1", - [GGML_TYPE_Q4_2] = "q4_2", - [GGML_TYPE_Q4_3] = "q4_3", - [GGML_TYPE_Q5_0] = "q5_0", - [GGML_TYPE_Q5_1] = "q5_1", - [GGML_TYPE_Q8_0] = "q8_0", - [GGML_TYPE_Q8_1] = "q8_1", - [GGML_TYPE_Q8_1B] = "q8_1b", - [GGML_TYPE_I8] = "i8", - [GGML_TYPE_I16] = "i16", - [GGML_TYPE_I32] = "i32", +static const char * GGML_V2_TYPE_NAME[GGML_V2_TYPE_COUNT] = { + [GGML_V2_TYPE_F32] = "f32", + [GGML_V2_TYPE_F16] = "f16", + [GGML_V2_TYPE_Q4_0] = "q4_0", + [GGML_V2_TYPE_Q4_1] = "q4_1", + [GGML_V2_TYPE_Q4_2] = "q4_2", + [GGML_V2_TYPE_Q4_3] = "q4_3", + [GGML_V2_TYPE_Q5_0] = "q5_0", + [GGML_V2_TYPE_Q5_1] = "q5_1", + [GGML_V2_TYPE_Q8_0] = "q8_0", + [GGML_V2_TYPE_Q8_1] = "q8_1", + [GGML_V2_TYPE_Q8_1B] = "q8_1b", + [GGML_V2_TYPE_I8] = "i8", + [GGML_V2_TYPE_I16] = "i16", + [GGML_V2_TYPE_I32] = "i32", }; -static_assert(GGML_TYPE_COUNT == 14, "GGML_TYPE_NAME is outdated"); 
+static_assert(GGML_V2_TYPE_COUNT == 14, "GGML_V2_TYPE_NAME is outdated"); -static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = { - [GGML_TYPE_F32] = false, - [GGML_TYPE_F16] = false, - [GGML_TYPE_Q4_0] = true, - [GGML_TYPE_Q4_1] = true, - [GGML_TYPE_Q4_2] = true, - [GGML_TYPE_Q4_3] = true, - [GGML_TYPE_Q5_0] = true, - [GGML_TYPE_Q5_1] = true, - [GGML_TYPE_Q8_0] = true, - [GGML_TYPE_Q8_1] = true, - [GGML_TYPE_Q8_1B] = true, - [GGML_TYPE_I8] = false, - [GGML_TYPE_I16] = false, - [GGML_TYPE_I32] = false, +static bool GGML_V2_IS_QUANTIZED[GGML_V2_TYPE_COUNT] = { + [GGML_V2_TYPE_F32] = false, + [GGML_V2_TYPE_F16] = false, + [GGML_V2_TYPE_Q4_0] = true, + [GGML_V2_TYPE_Q4_1] = true, + [GGML_V2_TYPE_Q4_2] = true, + [GGML_V2_TYPE_Q4_3] = true, + [GGML_V2_TYPE_Q5_0] = true, + [GGML_V2_TYPE_Q5_1] = true, + [GGML_V2_TYPE_Q8_0] = true, + [GGML_V2_TYPE_Q8_1] = true, + [GGML_V2_TYPE_Q8_1B] = true, + [GGML_V2_TYPE_I8] = false, + [GGML_V2_TYPE_I16] = false, + [GGML_V2_TYPE_I32] = false, }; -static_assert(GGML_TYPE_COUNT == 14, "GGML_IS_QUANTIZED is outdated"); +static_assert(GGML_V2_TYPE_COUNT == 14, "GGML_V2_IS_QUANTIZED is outdated"); -static const char * GGML_OP_LABEL[GGML_OP_COUNT] = { +static const char * GGML_V2_OP_LABEL[GGML_V2_OP_COUNT] = { "NONE", "DUP", @@ -3530,9 +3530,9 @@ static const char * GGML_OP_LABEL[GGML_OP_COUNT] = { "MAP_BINARY", }; -static_assert(GGML_OP_COUNT == 50, "GGML_OP_COUNT != 50"); +static_assert(GGML_V2_OP_COUNT == 50, "GGML_V2_OP_COUNT != 50"); -static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { +static const char * GGML_V2_OP_SYMBOL[GGML_V2_OP_COUNT] = { "none", "x", @@ -3590,16 +3590,16 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "f(x,y)", }; -static_assert(GGML_OP_COUNT == 50, "GGML_OP_COUNT != 50"); +static_assert(GGML_V2_OP_COUNT == 50, "GGML_V2_OP_COUNT != 50"); -static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN"); -static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN"); +static_assert(sizeof(struct ggml_v2_object)%GGML_V2_MEM_ALIGN == 0, "ggml_v2_object size must be a multiple of GGML_V2_MEM_ALIGN"); +static_assert(sizeof(struct ggml_v2_tensor)%GGML_V2_MEM_ALIGN == 0, "ggml_v2_tensor size must be a multiple of GGML_V2_MEM_ALIGN"); // // ggml context // -struct ggml_context { +struct ggml_v2_context { size_t mem_size; void * mem_buffer; bool mem_buffer_owned; @@ -3607,31 +3607,31 @@ struct ggml_context { int n_objects; - struct ggml_object * objects_begin; - struct ggml_object * objects_end; + struct ggml_v2_object * objects_begin; + struct ggml_v2_object * objects_end; - struct ggml_scratch scratch; - struct ggml_scratch scratch_save; + struct ggml_v2_scratch scratch; + struct ggml_v2_scratch scratch_save; }; -struct ggml_context_container { +struct ggml_v2_context_container { bool used; - struct ggml_context context; + struct ggml_v2_context context; }; // // compute types // -enum ggml_task_type { - GGML_TASK_INIT = 0, - GGML_TASK_COMPUTE, - GGML_TASK_FINALIZE, +enum ggml_v2_task_type { + GGML_V2_TASK_INIT = 0, + GGML_V2_TASK_COMPUTE, + GGML_V2_TASK_FINALIZE, }; -struct ggml_compute_params { - enum ggml_task_type type; +struct ggml_v2_compute_params { + enum ggml_v2_task_type type; int ith, nth; @@ -3644,16 +3644,16 @@ struct ggml_compute_params { // ggml state // -struct ggml_state { - struct ggml_context_container contexts[GGML_MAX_CONTEXTS]; +struct ggml_v2_state { + struct ggml_v2_context_container 
contexts[GGML_V2_MAX_CONTEXTS]; }; // global state -static struct ggml_state g_state; +static struct ggml_v2_state g_state; static atomic_int g_state_barrier = 0; // barrier via spin lock -inline static void ggml_critical_section_start(void) { +inline static void ggml_v2_critical_section_start(void) { int processing = atomic_fetch_add(&g_state_barrier, 1); while (processing > 0) { @@ -3666,89 +3666,89 @@ inline static void ggml_critical_section_start(void) { // TODO: make this somehow automatically executed // some sort of "sentry" mechanism -inline static void ggml_critical_section_end(void) { +inline static void ggml_v2_critical_section_end(void) { atomic_fetch_sub(&g_state_barrier, 1); } //////////////////////////////////////////////////////////////////////////////// -void ggml_print_object(const struct ggml_object * obj) { - GGML_PRINT(" - ggml_object: offset = %zu, size = %zu, next = %p\n", +void ggml_v2_print_object(const struct ggml_v2_object * obj) { + GGML_V2_PRINT(" - ggml_v2_object: offset = %zu, size = %zu, next = %p\n", obj->offs, obj->size, (const void *) obj->next); } -void ggml_print_objects(const struct ggml_context * ctx) { - struct ggml_object * obj = ctx->objects_begin; +void ggml_v2_print_objects(const struct ggml_v2_context * ctx) { + struct ggml_v2_object * obj = ctx->objects_begin; - GGML_PRINT("%s: objects in context %p:\n", __func__, (const void *) ctx); + GGML_V2_PRINT("%s: objects in context %p:\n", __func__, (const void *) ctx); while (obj != NULL) { - ggml_print_object(obj); + ggml_v2_print_object(obj); obj = obj->next; } - GGML_PRINT("%s: --- end ---\n", __func__); + GGML_V2_PRINT("%s: --- end ---\n", __func__); } -int64_t ggml_nelements(const struct ggml_tensor * tensor) { - static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); +int64_t ggml_v2_nelements(const struct ggml_v2_tensor * tensor) { + static_assert(GGML_V2_MAX_DIMS == 4, "GGML_V2_MAX_DIMS is not 4 - update this function"); return tensor->ne[0]*tensor->ne[1]*tensor->ne[2]*tensor->ne[3]; } -int ggml_nrows(const struct ggml_tensor * tensor) { - static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); +int ggml_v2_nrows(const struct ggml_v2_tensor * tensor) { + static_assert(GGML_V2_MAX_DIMS == 4, "GGML_V2_MAX_DIMS is not 4 - update this function"); return tensor->ne[1]*tensor->ne[2]*tensor->ne[3]; } -size_t ggml_nbytes(const struct ggml_tensor * tensor) { - static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); +size_t ggml_v2_nbytes(const struct ggml_v2_tensor * tensor) { + static_assert(GGML_V2_MAX_DIMS == 4, "GGML_V2_MAX_DIMS is not 4 - update this function"); - return (ggml_nelements(tensor)*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type]; + return (ggml_v2_nelements(tensor)*GGML_V2_TYPE_SIZE[tensor->type])/GGML_V2_BLCK_SIZE[tensor->type]; } -int ggml_blck_size(enum ggml_type type) { - return GGML_BLCK_SIZE[type]; +int ggml_v2_blck_size(enum ggml_v2_type type) { + return GGML_V2_BLCK_SIZE[type]; } -size_t ggml_type_size(enum ggml_type type) { - return GGML_TYPE_SIZE[type]; +size_t ggml_v2_type_size(enum ggml_v2_type type) { + return GGML_V2_TYPE_SIZE[type]; } -float ggml_type_sizef(enum ggml_type type) { - return ((float)(GGML_TYPE_SIZE[type]))/GGML_BLCK_SIZE[type]; +float ggml_v2_type_sizef(enum ggml_v2_type type) { + return ((float)(GGML_V2_TYPE_SIZE[type]))/GGML_V2_BLCK_SIZE[type]; } -const char * ggml_type_name(enum ggml_type type) { - return GGML_TYPE_NAME[type]; +const char * 
ggml_v2_type_name(enum ggml_v2_type type) { + return GGML_V2_TYPE_NAME[type]; } -size_t ggml_element_size(const struct ggml_tensor * tensor) { - return GGML_TYPE_SIZE[tensor->type]; +size_t ggml_v2_element_size(const struct ggml_v2_tensor * tensor) { + return GGML_V2_TYPE_SIZE[tensor->type]; } -static inline bool ggml_is_scalar(const struct ggml_tensor * tensor) { - static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); +static inline bool ggml_v2_is_scalar(const struct ggml_v2_tensor * tensor) { + static_assert(GGML_V2_MAX_DIMS == 4, "GGML_V2_MAX_DIMS is not 4 - update this function"); return tensor->ne[0] == 1 && tensor->ne[1] == 1 && tensor->ne[2] == 1 && tensor->ne[3] == 1; } -static inline bool ggml_is_vector(const struct ggml_tensor * tensor) { - static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); +static inline bool ggml_v2_is_vector(const struct ggml_v2_tensor * tensor) { + static_assert(GGML_V2_MAX_DIMS == 4, "GGML_V2_MAX_DIMS is not 4 - update this function"); return tensor->ne[1] == 1 && tensor->ne[2] == 1 && tensor->ne[3] == 1; } -static inline bool ggml_is_matrix(const struct ggml_tensor * tensor) { - static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); +static inline bool ggml_v2_is_matrix(const struct ggml_v2_tensor * tensor) { + static_assert(GGML_V2_MAX_DIMS == 4, "GGML_V2_MAX_DIMS is not 4 - update this function"); return tensor->ne[2] == 1 && tensor->ne[3] == 1; } -static inline bool ggml_can_mul_mat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) { - static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); +static inline bool ggml_v2_can_mul_mat(const struct ggml_v2_tensor * t0, const struct ggml_v2_tensor * t1) { + static_assert(GGML_V2_MAX_DIMS == 4, "GGML_V2_MAX_DIMS is not 4 - update this function"); return (t0->ne[0] == t1->ne[0]) && @@ -3756,57 +3756,57 @@ static inline bool ggml_can_mul_mat(const struct ggml_tensor * t0, const struct (t0->ne[3] == t1->ne[3]); } -bool ggml_is_quantized(enum ggml_type type) { - return GGML_IS_QUANTIZED[type]; +bool ggml_v2_is_quantized(enum ggml_v2_type type) { + return GGML_V2_IS_QUANTIZED[type]; } -enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) { - enum ggml_type wtype = GGML_TYPE_COUNT; +enum ggml_v2_type ggml_v2_ftype_to_ggml_v2_type(enum ggml_v2_ftype ftype) { + enum ggml_v2_type wtype = GGML_V2_TYPE_COUNT; switch (ftype) { - case GGML_FTYPE_ALL_F32: wtype = GGML_TYPE_F32; break; - case GGML_FTYPE_MOSTLY_F16: wtype = GGML_TYPE_F16; break; - case GGML_FTYPE_MOSTLY_Q4_0: wtype = GGML_TYPE_Q4_0; break; - case GGML_FTYPE_MOSTLY_Q4_1: wtype = GGML_TYPE_Q4_1; break; - case GGML_FTYPE_MOSTLY_Q4_2: wtype = GGML_TYPE_Q4_2; break; - case GGML_FTYPE_MOSTLY_Q4_3: wtype = GGML_TYPE_Q4_3; break; - case GGML_FTYPE_MOSTLY_Q5_0: wtype = GGML_TYPE_Q5_0; break; - case GGML_FTYPE_MOSTLY_Q5_1: wtype = GGML_TYPE_Q5_1; break; - case GGML_FTYPE_MOSTLY_Q8_0: wtype = GGML_TYPE_Q8_0; break; - case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break; - case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break; + case GGML_V2_FTYPE_ALL_F32: wtype = GGML_V2_TYPE_F32; break; + case GGML_V2_FTYPE_MOSTLY_F16: wtype = GGML_V2_TYPE_F16; break; + case GGML_V2_FTYPE_MOSTLY_Q4_0: wtype = GGML_V2_TYPE_Q4_0; break; + case GGML_V2_FTYPE_MOSTLY_Q4_1: wtype = GGML_V2_TYPE_Q4_1; break; + case GGML_V2_FTYPE_MOSTLY_Q4_2: wtype = GGML_V2_TYPE_Q4_2; break; + case GGML_V2_FTYPE_MOSTLY_Q4_3: wtype = GGML_V2_TYPE_Q4_3; 
break; + case GGML_V2_FTYPE_MOSTLY_Q5_0: wtype = GGML_V2_TYPE_Q5_0; break; + case GGML_V2_FTYPE_MOSTLY_Q5_1: wtype = GGML_V2_TYPE_Q5_1; break; + case GGML_V2_FTYPE_MOSTLY_Q8_0: wtype = GGML_V2_TYPE_Q8_0; break; + case GGML_V2_FTYPE_UNKNOWN: wtype = GGML_V2_TYPE_COUNT; break; + case GGML_V2_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_V2_TYPE_COUNT; break; } - GGML_ASSERT(wtype != GGML_TYPE_COUNT); + GGML_V2_ASSERT(wtype != GGML_V2_TYPE_COUNT); return wtype; } -static inline bool ggml_is_transposed(const struct ggml_tensor * tensor) { +static inline bool ggml_v2_is_transposed(const struct ggml_v2_tensor * tensor) { return tensor->nb[0] > tensor->nb[1]; } -static inline bool ggml_is_contiguous(const struct ggml_tensor * tensor) { - static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); +static inline bool ggml_v2_is_contiguous(const struct ggml_v2_tensor * tensor) { + static_assert(GGML_V2_MAX_DIMS == 4, "GGML_V2_MAX_DIMS is not 4 - update this function"); return - tensor->nb[0] == GGML_TYPE_SIZE[tensor->type] && - tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/GGML_BLCK_SIZE[tensor->type] && + tensor->nb[0] == GGML_V2_TYPE_SIZE[tensor->type] && + tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/GGML_V2_BLCK_SIZE[tensor->type] && tensor->nb[2] == tensor->nb[1]*tensor->ne[1] && tensor->nb[3] == tensor->nb[2]*tensor->ne[2]; } -static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) { - static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); +static inline bool ggml_v2_is_padded_1d(const struct ggml_v2_tensor * tensor) { + static_assert(GGML_V2_MAX_DIMS == 4, "GGML_V2_MAX_DIMS is not 4 - update this function"); return - tensor->nb[0] == GGML_TYPE_SIZE[tensor->type] && + tensor->nb[0] == GGML_V2_TYPE_SIZE[tensor->type] && tensor->nb[2] == tensor->nb[1]*tensor->ne[1] && tensor->nb[3] == tensor->nb[2]*tensor->ne[2]; } -static inline bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1) { - static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); +static inline bool ggml_v2_are_same_shape(const struct ggml_v2_tensor * t0, const struct ggml_v2_tensor * t1) { + static_assert(GGML_V2_MAX_DIMS == 4, "GGML_V2_MAX_DIMS is not 4 - update this function"); return (t0->ne[0] == t1->ne[0] ) && @@ -3816,8 +3816,8 @@ static inline bool ggml_are_same_shape(const struct ggml_tensor * t0, const stru } // check if t1 can be represented as a repeatition of t0 -static inline bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) { - static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); +static inline bool ggml_v2_can_repeat(const struct ggml_v2_tensor * t0, const struct ggml_v2_tensor * t1) { + static_assert(GGML_V2_MAX_DIMS == 4, "GGML_V2_MAX_DIMS is not 4 - update this function"); return (t1->ne[0]%t0->ne[0] == 0) && @@ -3826,82 +3826,82 @@ static inline bool ggml_can_repeat(const struct ggml_tensor * t0, const struct g (t1->ne[3]%t0->ne[3] == 0); } -static inline int ggml_up32(int n) { +static inline int ggml_v2_up32(int n) { return (n + 31) & ~31; } -//static inline int ggml_up64(int n) { +//static inline int ggml_v2_up64(int n) { // return (n + 63) & ~63; //} -static inline int ggml_up(int n, int m) { +static inline int ggml_v2_up(int n, int m) { // assert m is a power of 2 - GGML_ASSERT((m & (m - 1)) == 0); + GGML_V2_ASSERT((m & (m - 1)) == 0); return (n + m - 1) & ~(m - 1); } -// assert that pointer is aligned to GGML_MEM_ALIGN 
-#define ggml_assert_aligned(ptr) \ - GGML_ASSERT(((uintptr_t) (ptr))%GGML_MEM_ALIGN == 0) +// assert that pointer is aligned to GGML_V2_MEM_ALIGN +#define ggml_v2_assert_aligned(ptr) \ + GGML_V2_ASSERT(((uintptr_t) (ptr))%GGML_V2_MEM_ALIGN == 0) //////////////////////////////////////////////////////////////////////////////// -struct ggml_context * ggml_init(struct ggml_init_params params) { +struct ggml_v2_context * ggml_v2_init(struct ggml_v2_init_params params) { // make this function thread safe - ggml_critical_section_start(); + ggml_v2_critical_section_start(); static bool is_first_call = true; if (is_first_call) { // initialize time system (required on Windows) - ggml_time_init(); + ggml_v2_time_init(); // initialize GELU, SILU and EXP F32 tables { - const uint64_t t_start = ggml_time_us(); UNUSED(t_start); + const uint64_t t_start = ggml_v2_time_us(); UNUSED(t_start); - ggml_fp16_t ii; + ggml_v2_fp16_t ii; for (int i = 0; i < (1 << 16); ++i) { uint16_t ui = i; memcpy(&ii, &ui, sizeof(ii)); - const float f = table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(ii); - table_gelu_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_f32(f)); - table_silu_f16[i] = GGML_FP32_TO_FP16(ggml_silu_f32(f)); - table_exp_f16[i] = GGML_FP32_TO_FP16(expf(f)); + const float f = table_f32_f16[i] = GGML_V2_COMPUTE_FP16_TO_FP32(ii); + table_gelu_f16[i] = GGML_V2_FP32_TO_FP16(ggml_v2_gelu_f32(f)); + table_silu_f16[i] = GGML_V2_FP32_TO_FP16(ggml_v2_silu_f32(f)); + table_exp_f16[i] = GGML_V2_FP32_TO_FP16(expf(f)); } - const uint64_t t_end = ggml_time_us(); UNUSED(t_end); + const uint64_t t_end = ggml_v2_time_us(); UNUSED(t_end); - GGML_PRINT_DEBUG("%s: GELU, SILU and EXP tables initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f); + GGML_V2_PRINT_DEBUG("%s: GELU, SILU and EXP tables initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f); } // initialize g_state { - const uint64_t t_start = ggml_time_us(); UNUSED(t_start); + const uint64_t t_start = ggml_v2_time_us(); UNUSED(t_start); - g_state = (struct ggml_state) { + g_state = (struct ggml_v2_state) { /*.contexts =*/ { { 0 } }, }; - for (int i = 0; i < GGML_MAX_CONTEXTS; ++i) { + for (int i = 0; i < GGML_V2_MAX_CONTEXTS; ++i) { g_state.contexts[i].used = false; } - const uint64_t t_end = ggml_time_us(); UNUSED(t_end); + const uint64_t t_end = ggml_v2_time_us(); UNUSED(t_end); - GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f); + GGML_V2_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f); } #if defined(GGML_USE_CUBLAS) - ggml_init_cublas(); + ggml_v2_init_cublas(); #elif defined(GGML_USE_CLBLAST) if(quants_unshuffled) { - ggml_cl_init(); + ggml_v2_cl_init(); } else { - ggml_cl_init_legacy(); + ggml_v2_cl_init_legacy(); } #endif @@ -3909,31 +3909,31 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { } // find non-used context in g_state - struct ggml_context * ctx = NULL; + struct ggml_v2_context * ctx = NULL; - for (int i = 0; i < GGML_MAX_CONTEXTS; i++) { + for (int i = 0; i < GGML_V2_MAX_CONTEXTS; i++) { if (!g_state.contexts[i].used) { g_state.contexts[i].used = true; ctx = &g_state.contexts[i].context; - GGML_PRINT_DEBUG("%s: found unused context %d\n", __func__, i); + GGML_V2_PRINT_DEBUG("%s: found unused context %d\n", __func__, i); break; } } if (ctx == NULL) { - GGML_PRINT_DEBUG("%s: no unused context found\n", __func__); + GGML_V2_PRINT_DEBUG("%s: no unused context found\n", __func__); - ggml_critical_section_end(); + ggml_v2_critical_section_end(); return 
NULL; } - const size_t mem_size = (params.mem_size + GGML_MEM_ALIGN - 1) & ~(GGML_MEM_ALIGN - 1); + const size_t mem_size = (params.mem_size + GGML_V2_MEM_ALIGN - 1) & ~(GGML_V2_MEM_ALIGN - 1); - *ctx = (struct ggml_context) { + *ctx = (struct ggml_v2_context) { /*.mem_size =*/ mem_size, - /*.mem_buffer =*/ params.mem_buffer ? params.mem_buffer : GGML_ALIGNED_MALLOC(mem_size), + /*.mem_buffer =*/ params.mem_buffer ? params.mem_buffer : GGML_V2_ALIGNED_MALLOC(mem_size), /*.mem_buffer_owned =*/ params.mem_buffer ? false : true, /*.no_alloc =*/ params.no_alloc, /*.n_objects =*/ 0, @@ -3943,32 +3943,32 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { /*.scratch_save =*/ { 0, 0, NULL, }, }; - GGML_ASSERT(ctx->mem_buffer != NULL); + GGML_V2_ASSERT(ctx->mem_buffer != NULL); - ggml_assert_aligned(ctx->mem_buffer); + ggml_v2_assert_aligned(ctx->mem_buffer); - GGML_PRINT_DEBUG("%s: context initialized\n", __func__); + GGML_V2_PRINT_DEBUG("%s: context initialized\n", __func__); - ggml_critical_section_end(); + ggml_v2_critical_section_end(); return ctx; } -void ggml_free(struct ggml_context * ctx) { +void ggml_v2_free(struct ggml_v2_context * ctx) { // make this function thread safe - ggml_critical_section_start(); + ggml_v2_critical_section_start(); bool found = false; - for (int i = 0; i < GGML_MAX_CONTEXTS; i++) { + for (int i = 0; i < GGML_V2_MAX_CONTEXTS; i++) { if (&g_state.contexts[i].context == ctx) { g_state.contexts[i].used = false; - GGML_PRINT_DEBUG("%s: context %d with %d objects has been freed. memory used = %zu\n", + GGML_V2_PRINT_DEBUG("%s: context %d with %d objects has been freed. memory used = %zu\n", __func__, i, ctx->n_objects, ctx->objects_end->offs + ctx->objects_end->size); if (ctx->mem_buffer_owned) { - GGML_ALIGNED_FREE(ctx->mem_buffer); + GGML_V2_ALIGNED_FREE(ctx->mem_buffer); } found = true; @@ -3977,17 +3977,17 @@ void ggml_free(struct ggml_context * ctx) { } if (!found) { - GGML_PRINT_DEBUG("%s: context not found\n", __func__); + GGML_V2_PRINT_DEBUG("%s: context not found\n", __func__); } - ggml_critical_section_end(); + ggml_v2_critical_section_end(); } -size_t ggml_used_mem(const struct ggml_context * ctx) { +size_t ggml_v2_used_mem(const struct ggml_v2_context * ctx) { return ctx->objects_end == NULL ? 0 : ctx->objects_end->offs + ctx->objects_end->size; } -size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch) { +size_t ggml_v2_set_scratch(struct ggml_v2_context * ctx, struct ggml_v2_scratch scratch) { const size_t result = ctx->scratch.data ? 
ctx->scratch.offs : 0; ctx->scratch = scratch; @@ -4000,25 +4000,25 @@ size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch) // this is an error prone process, but it is necessary to support inplace // operators when using scratch buffers // TODO: implement a better way -void ggml_scratch_save(struct ggml_context * ctx) { +void ggml_v2_scratch_save(struct ggml_v2_context * ctx) { ctx->scratch_save = ctx->scratch; ctx->scratch.data = NULL; } -void ggml_scratch_load(struct ggml_context * ctx) { +void ggml_v2_scratch_load(struct ggml_v2_context * ctx) { ctx->scratch = ctx->scratch_save; } //////////////////////////////////////////////////////////////////////////////// -struct ggml_tensor * ggml_new_tensor_impl( - struct ggml_context * ctx, - enum ggml_type type, +struct ggml_v2_tensor * ggml_v2_new_tensor_impl( + struct ggml_v2_context * ctx, + enum ggml_v2_type type, int n_dims, const int64_t* ne, void* data) { // always insert objects at the end of the context's memory pool - struct ggml_object * obj_cur = ctx->objects_end; + struct ggml_v2_object * obj_cur = ctx->objects_end; const size_t cur_offs = obj_cur == NULL ? 0 : obj_cur->offs; const size_t cur_size = obj_cur == NULL ? 0 : obj_cur->size; @@ -4027,51 +4027,51 @@ struct ggml_tensor * ggml_new_tensor_impl( size_t size_needed = 0; if (data == NULL && !ctx->no_alloc) { - size_needed += GGML_TYPE_SIZE[type]*(ne[0]/GGML_BLCK_SIZE[type]); + size_needed += GGML_V2_TYPE_SIZE[type]*(ne[0]/GGML_V2_BLCK_SIZE[type]); for (int i = 1; i < n_dims; i++) { size_needed *= ne[i]; } - // align to GGML_MEM_ALIGN - size_needed = ((size_needed + GGML_MEM_ALIGN - 1)/GGML_MEM_ALIGN)*GGML_MEM_ALIGN; + // align to GGML_V2_MEM_ALIGN + size_needed = ((size_needed + GGML_V2_MEM_ALIGN - 1)/GGML_V2_MEM_ALIGN)*GGML_V2_MEM_ALIGN; } char * const mem_buffer = ctx->mem_buffer; - struct ggml_object * const obj_new = (struct ggml_object *)(mem_buffer + cur_end); + struct ggml_v2_object * const obj_new = (struct ggml_v2_object *)(mem_buffer + cur_end); if (ctx->scratch.data == NULL || data != NULL) { - size_needed += sizeof(struct ggml_tensor); + size_needed += sizeof(struct ggml_v2_tensor); - if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) { - GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n", - __func__, cur_end + size_needed + GGML_OBJECT_SIZE, ctx->mem_size); + if (cur_end + size_needed + GGML_V2_OBJECT_SIZE > ctx->mem_size) { + GGML_V2_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n", + __func__, cur_end + size_needed + GGML_V2_OBJECT_SIZE, ctx->mem_size); assert(false); return NULL; } - *obj_new = (struct ggml_object) { - .offs = cur_end + GGML_OBJECT_SIZE, + *obj_new = (struct ggml_v2_object) { + .offs = cur_end + GGML_V2_OBJECT_SIZE, .size = size_needed, .next = NULL, }; } else { if (ctx->scratch.offs + size_needed > ctx->scratch.size) { - GGML_PRINT("%s: not enough space in the scratch memory\n", __func__); + GGML_V2_PRINT("%s: not enough space in the scratch memory\n", __func__); assert(false); return NULL; } - if (cur_end + sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE > ctx->mem_size) { - GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n", - __func__, cur_end + sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE, ctx->mem_size); + if (cur_end + sizeof(struct ggml_v2_tensor) + GGML_V2_OBJECT_SIZE > ctx->mem_size) { + GGML_V2_PRINT("%s: not enough space in the context's memory pool (needed %zu, available 
%zu)\n", + __func__, cur_end + sizeof(struct ggml_v2_tensor) + GGML_V2_OBJECT_SIZE, ctx->mem_size); assert(false); return NULL; } data = (char * const) ctx->scratch.data + ctx->scratch.offs; - *obj_new = (struct ggml_object) { - .offs = cur_end + GGML_OBJECT_SIZE, - .size = sizeof(struct ggml_tensor), + *obj_new = (struct ggml_v2_object) { + .offs = cur_end + GGML_V2_OBJECT_SIZE, + .size = sizeof(struct ggml_v2_tensor), .next = NULL, }; @@ -4091,17 +4091,17 @@ struct ggml_tensor * ggml_new_tensor_impl( //printf("%s: inserted new object at %zu, size = %zu\n", __func__, cur_end, obj_new->size); - struct ggml_tensor * const result = (struct ggml_tensor *)(mem_buffer + obj_new->offs); + struct ggml_v2_tensor * const result = (struct ggml_v2_tensor *)(mem_buffer + obj_new->offs); - ggml_assert_aligned(result); + ggml_v2_assert_aligned(result); - *result = (struct ggml_tensor) { + *result = (struct ggml_v2_tensor) { /*.type =*/ type, - /*.backend =*/ GGML_BACKEND_CPU, + /*.backend =*/ GGML_V2_BACKEND_CPU, /*.n_dims =*/ n_dims, /*.ne =*/ { 1, 1, 1, 1 }, /*.nb =*/ { 0, 0, 0, 0 }, - /*.op =*/ GGML_OP_NONE, + /*.op =*/ GGML_V2_OP_NONE, /*.is_param =*/ false, /*.grad =*/ NULL, /*.src0 =*/ NULL, @@ -4117,15 +4117,15 @@ struct ggml_tensor * ggml_new_tensor_impl( }; // TODO: this should not be needed as long as we don't rely on aligned SIMD loads - //ggml_assert_aligned(result->data); + //ggml_v2_assert_aligned(result->data); for (int i = 0; i < n_dims; i++) { result->ne[i] = ne[i]; } - result->nb[0] = GGML_TYPE_SIZE[type]; - result->nb[1] = result->nb[0]*(result->ne[0]/GGML_BLCK_SIZE[type]); - for (int i = 2; i < GGML_MAX_DIMS; i++) { + result->nb[0] = GGML_V2_TYPE_SIZE[type]; + result->nb[1] = result->nb[0]*(result->ne[0]/GGML_V2_BLCK_SIZE[type]); + for (int i = 2; i < GGML_V2_MAX_DIMS; i++) { result->nb[i] = result->nb[i - 1]*result->ne[i - 1]; } @@ -4134,350 +4134,350 @@ struct ggml_tensor * ggml_new_tensor_impl( return result; } -struct ggml_tensor * ggml_new_tensor( - struct ggml_context * ctx, - enum ggml_type type, +struct ggml_v2_tensor * ggml_v2_new_tensor( + struct ggml_v2_context * ctx, + enum ggml_v2_type type, int n_dims, const int64_t * ne) { - return ggml_new_tensor_impl(ctx, type, n_dims, ne, NULL); + return ggml_v2_new_tensor_impl(ctx, type, n_dims, ne, NULL); } -struct ggml_tensor * ggml_new_tensor_1d( - struct ggml_context * ctx, - enum ggml_type type, +struct ggml_v2_tensor * ggml_v2_new_tensor_1d( + struct ggml_v2_context * ctx, + enum ggml_v2_type type, int64_t ne0) { - return ggml_new_tensor(ctx, type, 1, &ne0); + return ggml_v2_new_tensor(ctx, type, 1, &ne0); } -struct ggml_tensor * ggml_new_tensor_2d( - struct ggml_context * ctx, - enum ggml_type type, +struct ggml_v2_tensor * ggml_v2_new_tensor_2d( + struct ggml_v2_context * ctx, + enum ggml_v2_type type, int64_t ne0, int64_t ne1) { const int64_t ne[2] = { ne0, ne1 }; - return ggml_new_tensor(ctx, type, 2, ne); + return ggml_v2_new_tensor(ctx, type, 2, ne); } -struct ggml_tensor * ggml_new_tensor_3d( - struct ggml_context * ctx, - enum ggml_type type, +struct ggml_v2_tensor * ggml_v2_new_tensor_3d( + struct ggml_v2_context * ctx, + enum ggml_v2_type type, int64_t ne0, int64_t ne1, int64_t ne2) { const int64_t ne[3] = { ne0, ne1, ne2 }; - return ggml_new_tensor(ctx, type, 3, ne); + return ggml_v2_new_tensor(ctx, type, 3, ne); } -struct ggml_tensor * ggml_new_tensor_4d( - struct ggml_context * ctx, - enum ggml_type type, +struct ggml_v2_tensor * ggml_v2_new_tensor_4d( + struct ggml_v2_context * ctx, + enum ggml_v2_type type, 
int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) { const int64_t ne[4] = { ne0, ne1, ne2, ne3 }; - return ggml_new_tensor(ctx, type, 4, ne); + return ggml_v2_new_tensor(ctx, type, 4, ne); } -struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value) { - ggml_scratch_save(ctx); +struct ggml_v2_tensor * ggml_v2_new_i32(struct ggml_v2_context * ctx, int32_t value) { + ggml_v2_scratch_save(ctx); - struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1); + struct ggml_v2_tensor * result = ggml_v2_new_tensor_1d(ctx, GGML_V2_TYPE_I32, 1); - ggml_scratch_load(ctx); + ggml_v2_scratch_load(ctx); - ggml_set_i32(result, value); + ggml_v2_set_i32(result, value); return result; } -struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value) { - ggml_scratch_save(ctx); +struct ggml_v2_tensor * ggml_v2_new_f32(struct ggml_v2_context * ctx, float value) { + ggml_v2_scratch_save(ctx); - struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1); + struct ggml_v2_tensor * result = ggml_v2_new_tensor_1d(ctx, GGML_V2_TYPE_F32, 1); - ggml_scratch_load(ctx); + ggml_v2_scratch_load(ctx); - ggml_set_f32(result, value); + ggml_v2_set_f32(result, value); return result; } -struct ggml_tensor * ggml_dup_tensor(struct ggml_context * ctx, const struct ggml_tensor * src) { - return ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, NULL); +struct ggml_v2_tensor * ggml_v2_dup_tensor(struct ggml_v2_context * ctx, const struct ggml_v2_tensor * src) { + return ggml_v2_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, NULL); } -struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor) { - memset(tensor->data, 0, ggml_nbytes(tensor)); +struct ggml_v2_tensor * ggml_v2_set_zero(struct ggml_v2_tensor * tensor) { + memset(tensor->data, 0, ggml_v2_nbytes(tensor)); return tensor; } -struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value) { - const int n = ggml_nrows(tensor); +struct ggml_v2_tensor * ggml_v2_set_i32 (struct ggml_v2_tensor * tensor, int32_t value) { + const int n = ggml_v2_nrows(tensor); const int nc = tensor->ne[0]; const size_t n1 = tensor->nb[1]; char * const data = tensor->data; switch (tensor->type) { - case GGML_TYPE_I8: + case GGML_V2_TYPE_I8: { assert(tensor->nb[0] == sizeof(int8_t)); for (int i = 0; i < n; i++) { - ggml_vec_set_i8(nc, (int8_t *)(data + i*n1), value); + ggml_v2_vec_set_i8(nc, (int8_t *)(data + i*n1), value); } } break; - case GGML_TYPE_I16: + case GGML_V2_TYPE_I16: { assert(tensor->nb[0] == sizeof(int16_t)); for (int i = 0; i < n; i++) { - ggml_vec_set_i16(nc, (int16_t *)(data + i*n1), value); + ggml_v2_vec_set_i16(nc, (int16_t *)(data + i*n1), value); } } break; - case GGML_TYPE_I32: + case GGML_V2_TYPE_I32: { assert(tensor->nb[0] == sizeof(int32_t)); for (int i = 0; i < n; i++) { - ggml_vec_set_i32(nc, (int32_t *)(data + i*n1), value); + ggml_v2_vec_set_i32(nc, (int32_t *)(data + i*n1), value); } } break; - case GGML_TYPE_F16: + case GGML_V2_TYPE_F16: { - assert(tensor->nb[0] == sizeof(ggml_fp16_t)); + assert(tensor->nb[0] == sizeof(ggml_v2_fp16_t)); for (int i = 0; i < n; i++) { - ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), value); + ggml_v2_vec_set_f16(nc, (ggml_v2_fp16_t *)(data + i*n1), value); } } break; - case GGML_TYPE_F32: + case GGML_V2_TYPE_F32: { assert(tensor->nb[0] == sizeof(float)); for (int i = 0; i < n; i++) { - ggml_vec_set_f32(nc, (float *)(data + i*n1), value); + ggml_v2_vec_set_f32(nc, (float *)(data + i*n1), value); } } break; default: { - GGML_ASSERT(false); + 
GGML_V2_ASSERT(false); } break; } return tensor; } -struct ggml_tensor * ggml_set_f32(struct ggml_tensor * tensor, float value) { - const int n = ggml_nrows(tensor); +struct ggml_v2_tensor * ggml_v2_set_f32(struct ggml_v2_tensor * tensor, float value) { + const int n = ggml_v2_nrows(tensor); const int nc = tensor->ne[0]; const size_t n1 = tensor->nb[1]; char * const data = tensor->data; switch (tensor->type) { - case GGML_TYPE_I8: + case GGML_V2_TYPE_I8: { assert(tensor->nb[0] == sizeof(int8_t)); for (int i = 0; i < n; i++) { - ggml_vec_set_i8(nc, (int8_t *)(data + i*n1), value); + ggml_v2_vec_set_i8(nc, (int8_t *)(data + i*n1), value); } } break; - case GGML_TYPE_I16: + case GGML_V2_TYPE_I16: { assert(tensor->nb[0] == sizeof(int16_t)); for (int i = 0; i < n; i++) { - ggml_vec_set_i16(nc, (int16_t *)(data + i*n1), value); + ggml_v2_vec_set_i16(nc, (int16_t *)(data + i*n1), value); } } break; - case GGML_TYPE_I32: + case GGML_V2_TYPE_I32: { assert(tensor->nb[0] == sizeof(int32_t)); for (int i = 0; i < n; i++) { - ggml_vec_set_i32(nc, (int32_t *)(data + i*n1), value); + ggml_v2_vec_set_i32(nc, (int32_t *)(data + i*n1), value); } } break; - case GGML_TYPE_F16: + case GGML_V2_TYPE_F16: { - assert(tensor->nb[0] == sizeof(ggml_fp16_t)); + assert(tensor->nb[0] == sizeof(ggml_v2_fp16_t)); for (int i = 0; i < n; i++) { - ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), value); + ggml_v2_vec_set_f16(nc, (ggml_v2_fp16_t *)(data + i*n1), value); } } break; - case GGML_TYPE_F32: + case GGML_V2_TYPE_F32: { assert(tensor->nb[0] == sizeof(float)); for (int i = 0; i < n; i++) { - ggml_vec_set_f32(nc, (float *)(data + i*n1), value); + ggml_v2_vec_set_f32(nc, (float *)(data + i*n1), value); } } break; default: { - GGML_ASSERT(false); + GGML_V2_ASSERT(false); } break; } return tensor; } -int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i) { +int32_t ggml_v2_get_i32_1d(const struct ggml_v2_tensor * tensor, int i) { switch (tensor->type) { - case GGML_TYPE_I8: + case GGML_V2_TYPE_I8: { - GGML_ASSERT(tensor->nb[0] == sizeof(int8_t)); + GGML_V2_ASSERT(tensor->nb[0] == sizeof(int8_t)); return ((int8_t *)(tensor->data))[i]; } break; - case GGML_TYPE_I16: + case GGML_V2_TYPE_I16: { - GGML_ASSERT(tensor->nb[0] == sizeof(int16_t)); + GGML_V2_ASSERT(tensor->nb[0] == sizeof(int16_t)); return ((int16_t *)(tensor->data))[i]; } break; - case GGML_TYPE_I32: + case GGML_V2_TYPE_I32: { - GGML_ASSERT(tensor->nb[0] == sizeof(int32_t)); + GGML_V2_ASSERT(tensor->nb[0] == sizeof(int32_t)); return ((int32_t *)(tensor->data))[i]; } break; - case GGML_TYPE_F16: + case GGML_V2_TYPE_F16: { - GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t)); - return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]); + GGML_V2_ASSERT(tensor->nb[0] == sizeof(ggml_v2_fp16_t)); + return GGML_V2_FP16_TO_FP32(((ggml_v2_fp16_t *)(tensor->data))[i]); } break; - case GGML_TYPE_F32: + case GGML_V2_TYPE_F32: { - GGML_ASSERT(tensor->nb[0] == sizeof(float)); + GGML_V2_ASSERT(tensor->nb[0] == sizeof(float)); return ((float *)(tensor->data))[i]; } break; default: { - GGML_ASSERT(false); + GGML_V2_ASSERT(false); } break; } return 0.0f; } -void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value) { +void ggml_v2_set_i32_1d(const struct ggml_v2_tensor * tensor, int i, int32_t value) { switch (tensor->type) { - case GGML_TYPE_I8: + case GGML_V2_TYPE_I8: { - GGML_ASSERT(tensor->nb[0] == sizeof(int8_t)); + GGML_V2_ASSERT(tensor->nb[0] == sizeof(int8_t)); ((int8_t *)(tensor->data))[i] = value; } break; - case GGML_TYPE_I16: + case 
GGML_V2_TYPE_I16: { - GGML_ASSERT(tensor->nb[0] == sizeof(int16_t)); + GGML_V2_ASSERT(tensor->nb[0] == sizeof(int16_t)); ((int16_t *)(tensor->data))[i] = value; } break; - case GGML_TYPE_I32: + case GGML_V2_TYPE_I32: { - GGML_ASSERT(tensor->nb[0] == sizeof(int32_t)); + GGML_V2_ASSERT(tensor->nb[0] == sizeof(int32_t)); ((int32_t *)(tensor->data))[i] = value; } break; - case GGML_TYPE_F16: + case GGML_V2_TYPE_F16: { - GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t)); - ((ggml_fp16_t *)(tensor->data))[i] = GGML_FP32_TO_FP16(value); + GGML_V2_ASSERT(tensor->nb[0] == sizeof(ggml_v2_fp16_t)); + ((ggml_v2_fp16_t *)(tensor->data))[i] = GGML_V2_FP32_TO_FP16(value); } break; - case GGML_TYPE_F32: + case GGML_V2_TYPE_F32: { - GGML_ASSERT(tensor->nb[0] == sizeof(float)); + GGML_V2_ASSERT(tensor->nb[0] == sizeof(float)); ((float *)(tensor->data))[i] = value; } break; default: { - GGML_ASSERT(false); + GGML_V2_ASSERT(false); } break; } } -float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i) { +float ggml_v2_get_f32_1d(const struct ggml_v2_tensor * tensor, int i) { switch (tensor->type) { - case GGML_TYPE_I8: + case GGML_V2_TYPE_I8: { - GGML_ASSERT(tensor->nb[0] == sizeof(int8_t)); + GGML_V2_ASSERT(tensor->nb[0] == sizeof(int8_t)); return ((int8_t *)(tensor->data))[i]; } break; - case GGML_TYPE_I16: + case GGML_V2_TYPE_I16: { - GGML_ASSERT(tensor->nb[0] == sizeof(int16_t)); + GGML_V2_ASSERT(tensor->nb[0] == sizeof(int16_t)); return ((int16_t *)(tensor->data))[i]; } break; - case GGML_TYPE_I32: + case GGML_V2_TYPE_I32: { - GGML_ASSERT(tensor->nb[0] == sizeof(int32_t)); + GGML_V2_ASSERT(tensor->nb[0] == sizeof(int32_t)); return ((int32_t *)(tensor->data))[i]; } break; - case GGML_TYPE_F16: + case GGML_V2_TYPE_F16: { - GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t)); - return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]); + GGML_V2_ASSERT(tensor->nb[0] == sizeof(ggml_v2_fp16_t)); + return GGML_V2_FP16_TO_FP32(((ggml_v2_fp16_t *)(tensor->data))[i]); } break; - case GGML_TYPE_F32: + case GGML_V2_TYPE_F32: { - GGML_ASSERT(tensor->nb[0] == sizeof(float)); + GGML_V2_ASSERT(tensor->nb[0] == sizeof(float)); return ((float *)(tensor->data))[i]; } break; default: { - GGML_ASSERT(false); + GGML_V2_ASSERT(false); } break; } return 0.0f; } -void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value) { +void ggml_v2_set_f32_1d(const struct ggml_v2_tensor * tensor, int i, float value) { switch (tensor->type) { - case GGML_TYPE_I8: + case GGML_V2_TYPE_I8: { - GGML_ASSERT(tensor->nb[0] == sizeof(int8_t)); + GGML_V2_ASSERT(tensor->nb[0] == sizeof(int8_t)); ((int8_t *)(tensor->data))[i] = value; } break; - case GGML_TYPE_I16: + case GGML_V2_TYPE_I16: { - GGML_ASSERT(tensor->nb[0] == sizeof(int16_t)); + GGML_V2_ASSERT(tensor->nb[0] == sizeof(int16_t)); ((int16_t *)(tensor->data))[i] = value; } break; - case GGML_TYPE_I32: + case GGML_V2_TYPE_I32: { - GGML_ASSERT(tensor->nb[0] == sizeof(int32_t)); + GGML_V2_ASSERT(tensor->nb[0] == sizeof(int32_t)); ((int32_t *)(tensor->data))[i] = value; } break; - case GGML_TYPE_F16: + case GGML_V2_TYPE_F16: { - GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t)); - ((ggml_fp16_t *)(tensor->data))[i] = GGML_FP32_TO_FP16(value); + GGML_V2_ASSERT(tensor->nb[0] == sizeof(ggml_v2_fp16_t)); + ((ggml_v2_fp16_t *)(tensor->data))[i] = GGML_V2_FP32_TO_FP16(value); } break; - case GGML_TYPE_F32: + case GGML_V2_TYPE_F32: { - GGML_ASSERT(tensor->nb[0] == sizeof(float)); + GGML_V2_ASSERT(tensor->nb[0] == sizeof(float)); ((float *)(tensor->data))[i] = value; } break; 
default: { - GGML_ASSERT(false); + GGML_V2_ASSERT(false); } break; } } -void * ggml_get_data(const struct ggml_tensor * tensor) { +void * ggml_v2_get_data(const struct ggml_v2_tensor * tensor) { return tensor->data; } -float * ggml_get_data_f32(const struct ggml_tensor * tensor) { - assert(tensor->type == GGML_TYPE_F32); +float * ggml_v2_get_data_f32(const struct ggml_v2_tensor * tensor) { + assert(tensor->type == GGML_V2_TYPE_F32); return (float *)(tensor->data); } -const char * ggml_get_name(const struct ggml_tensor * tensor) { +const char * ggml_v2_get_name(const struct ggml_v2_tensor * tensor) { return tensor->name; } -void ggml_set_name(struct ggml_tensor * tensor, const char * name) { +void ggml_v2_set_name(struct ggml_v2_tensor * tensor, const char * name) { strncpy(tensor->name, name, sizeof(tensor->name)); tensor->name[sizeof(tensor->name) - 1] = '\0'; } -struct ggml_tensor * ggml_view_tensor( - struct ggml_context * ctx, - const struct ggml_tensor * src) { - struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src->data); +struct ggml_v2_tensor * ggml_v2_view_tensor( + struct ggml_v2_context * ctx, + const struct ggml_v2_tensor * src) { + struct ggml_v2_tensor * result = ggml_v2_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src->data); result->nb[0] = src->nb[0]; result->nb[1] = src->nb[1]; @@ -4489,11 +4489,11 @@ struct ggml_tensor * ggml_view_tensor( //////////////////////////////////////////////////////////////////////////////// -// ggml_dup +// ggml_v2_dup -struct ggml_tensor * ggml_dup_impl( - struct ggml_context * ctx, - struct ggml_tensor * a, +struct ggml_v2_tensor * ggml_v2_dup_impl( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, bool inplace) { bool is_node = false; @@ -4501,36 +4501,36 @@ struct ggml_tensor * ggml_dup_impl( is_node = true; } - struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + struct ggml_v2_tensor * result = inplace ? ggml_v2_view_tensor(ctx, a) : ggml_v2_dup_tensor(ctx, a); - result->op = GGML_OP_DUP; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_V2_OP_DUP; + result->grad = is_node ? ggml_v2_dup_tensor(ctx, result) : NULL; result->src0 = a; result->src1 = NULL; return result; } -struct ggml_tensor * ggml_dup( - struct ggml_context * ctx, - struct ggml_tensor * a) { - return ggml_dup_impl(ctx, a, false); +struct ggml_v2_tensor * ggml_v2_dup( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a) { + return ggml_v2_dup_impl(ctx, a, false); } -struct ggml_tensor * ggml_dup_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a) { - return ggml_dup_impl(ctx, a, true); +struct ggml_v2_tensor * ggml_v2_dup_inplace( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a) { + return ggml_v2_dup_impl(ctx, a, true); } -// ggml_add +// ggml_v2_add -struct ggml_tensor * ggml_add_impl( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, +struct ggml_v2_tensor * ggml_v2_add_impl( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + struct ggml_v2_tensor * b, bool inplace) { - GGML_ASSERT(ggml_are_same_shape(a, b)); + GGML_V2_ASSERT(ggml_v2_are_same_shape(a, b)); bool is_node = false; @@ -4538,39 +4538,39 @@ struct ggml_tensor * ggml_add_impl( is_node = true; } - struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + struct ggml_v2_tensor * result = inplace ? 
ggml_v2_view_tensor(ctx, a) : ggml_v2_dup_tensor(ctx, a); - result->op = GGML_OP_ADD; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_V2_OP_ADD; + result->grad = is_node ? ggml_v2_dup_tensor(ctx, result) : NULL; result->src0 = a; result->src1 = b; return result; } -struct ggml_tensor * ggml_add( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b) { - return ggml_add_impl(ctx, a, b, false); +struct ggml_v2_tensor * ggml_v2_add( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + struct ggml_v2_tensor * b) { + return ggml_v2_add_impl(ctx, a, b, false); } -struct ggml_tensor * ggml_add_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b) { - return ggml_add_impl(ctx, a, b, true); +struct ggml_v2_tensor * ggml_v2_add_inplace( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + struct ggml_v2_tensor * b) { + return ggml_v2_add_impl(ctx, a, b, true); } -// ggml_add1 +// ggml_v2_add1 -struct ggml_tensor * ggml_add1_impl( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, +struct ggml_v2_tensor * ggml_v2_add1_impl( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + struct ggml_v2_tensor * b, bool inplace) { - GGML_ASSERT(ggml_is_scalar(b)); - GGML_ASSERT(ggml_is_padded_1d(a)); + GGML_V2_ASSERT(ggml_v2_is_scalar(b)); + GGML_V2_ASSERT(ggml_v2_is_padded_1d(a)); bool is_node = false; @@ -4578,45 +4578,45 @@ struct ggml_tensor * ggml_add1_impl( is_node = true; } - struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + struct ggml_v2_tensor * result = inplace ? ggml_v2_view_tensor(ctx, a) : ggml_v2_dup_tensor(ctx, a); - result->op = GGML_OP_ADD1; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_V2_OP_ADD1; + result->grad = is_node ? ggml_v2_dup_tensor(ctx, result) : NULL; result->src0 = a; result->src1 = b; return result; } -struct ggml_tensor * ggml_add1( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b) { - return ggml_add1_impl(ctx, a, b, false); +struct ggml_v2_tensor * ggml_v2_add1( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + struct ggml_v2_tensor * b) { + return ggml_v2_add1_impl(ctx, a, b, false); } -struct ggml_tensor * ggml_add1_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b) { - return ggml_add1_impl(ctx, a, b, true); +struct ggml_v2_tensor * ggml_v2_add1_inplace( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + struct ggml_v2_tensor * b) { + return ggml_v2_add1_impl(ctx, a, b, true); } -// ggml_acc +// ggml_v2_acc -struct ggml_tensor * ggml_acc_impl( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, +struct ggml_v2_tensor * ggml_v2_acc_impl( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + struct ggml_v2_tensor * b, size_t nb1, size_t nb2, size_t nb3, size_t offset, bool inplace) { - GGML_ASSERT(ggml_nelements(b) <= ggml_nelements(a)); - GGML_ASSERT(ggml_is_contiguous(a)); - GGML_ASSERT(a->type == GGML_TYPE_F32); - GGML_ASSERT(b->type == GGML_TYPE_F32); + GGML_V2_ASSERT(ggml_v2_nelements(b) <= ggml_v2_nelements(a)); + GGML_V2_ASSERT(ggml_v2_is_contiguous(a)); + GGML_V2_ASSERT(a->type == GGML_V2_TYPE_F32); + GGML_V2_ASSERT(b->type == GGML_V2_TYPE_F32); bool is_node = false; @@ -4624,11 +4624,11 @@ struct ggml_tensor * ggml_acc_impl( is_node = true; } - struct ggml_tensor * result = inplace ? 
ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + struct ggml_v2_tensor * result = inplace ? ggml_v2_view_tensor(ctx, a) : ggml_v2_dup_tensor(ctx, a); - ggml_scratch_save(ctx); + ggml_v2_scratch_save(ctx); - struct ggml_tensor * c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 5); + struct ggml_v2_tensor * c = ggml_v2_new_tensor_1d(ctx, GGML_V2_TYPE_I32, 5); ((int32_t *) c->data)[0] = nb1; ((int32_t *) c->data)[1] = nb2; @@ -4636,10 +4636,10 @@ struct ggml_tensor * ggml_acc_impl( ((int32_t *) c->data)[3] = offset; ((int32_t *) c->data)[4] = inplace ? 1 : 0; - ggml_scratch_load(ctx); + ggml_v2_scratch_load(ctx); - result->op = GGML_OP_ACC; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_V2_OP_ACC; + result->grad = is_node ? ggml_v2_dup_tensor(ctx, result) : NULL; result->src0 = a; result->src1 = b; result->opt[0] = c; @@ -4647,36 +4647,36 @@ struct ggml_tensor * ggml_acc_impl( return result; } -struct ggml_tensor * ggml_acc( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, +struct ggml_v2_tensor * ggml_v2_acc( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + struct ggml_v2_tensor * b, size_t nb1, size_t nb2, size_t nb3, size_t offset) { - return ggml_acc_impl(ctx, a, b, nb1, nb2, nb3, offset, false); + return ggml_v2_acc_impl(ctx, a, b, nb1, nb2, nb3, offset, false); } -struct ggml_tensor * ggml_acc_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, +struct ggml_v2_tensor * ggml_v2_acc_inplace( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + struct ggml_v2_tensor * b, size_t nb1, size_t nb2, size_t nb3, size_t offset) { - return ggml_acc_impl(ctx, a, b, nb1, nb2, nb3, offset, true); + return ggml_v2_acc_impl(ctx, a, b, nb1, nb2, nb3, offset, true); } -// ggml_sub +// ggml_v2_sub -struct ggml_tensor * ggml_sub_impl( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, +struct ggml_v2_tensor * ggml_v2_sub_impl( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + struct ggml_v2_tensor * b, bool inplace) { - GGML_ASSERT(ggml_are_same_shape(a, b)); + GGML_V2_ASSERT(ggml_v2_are_same_shape(a, b)); bool is_node = false; @@ -4684,38 +4684,38 @@ struct ggml_tensor * ggml_sub_impl( is_node = true; } - struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + struct ggml_v2_tensor * result = inplace ? ggml_v2_view_tensor(ctx, a) : ggml_v2_dup_tensor(ctx, a); - result->op = GGML_OP_SUB; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_V2_OP_SUB; + result->grad = is_node ? 
ggml_v2_dup_tensor(ctx, result) : NULL; result->src0 = a; result->src1 = b; return result; } -struct ggml_tensor * ggml_sub( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b) { - return ggml_sub_impl(ctx, a, b, false); +struct ggml_v2_tensor * ggml_v2_sub( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + struct ggml_v2_tensor * b) { + return ggml_v2_sub_impl(ctx, a, b, false); } -struct ggml_tensor * ggml_sub_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b) { - return ggml_sub_impl(ctx, a, b, true); +struct ggml_v2_tensor * ggml_v2_sub_inplace( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + struct ggml_v2_tensor * b) { + return ggml_v2_sub_impl(ctx, a, b, true); } -// ggml_mul +// ggml_v2_mul -struct ggml_tensor * ggml_mul_impl( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, +struct ggml_v2_tensor * ggml_v2_mul_impl( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + struct ggml_v2_tensor * b, bool inplace) { - GGML_ASSERT(ggml_are_same_shape(a, b)); + GGML_V2_ASSERT(ggml_v2_are_same_shape(a, b)); bool is_node = false; @@ -4724,41 +4724,41 @@ struct ggml_tensor * ggml_mul_impl( } if (inplace) { - GGML_ASSERT(is_node == false); + GGML_V2_ASSERT(is_node == false); } - struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + struct ggml_v2_tensor * result = inplace ? ggml_v2_view_tensor(ctx, a) : ggml_v2_dup_tensor(ctx, a); - result->op = GGML_OP_MUL; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_V2_OP_MUL; + result->grad = is_node ? ggml_v2_dup_tensor(ctx, result) : NULL; result->src0 = a; result->src1 = b; return result; } -struct ggml_tensor * ggml_mul( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b) { - return ggml_mul_impl(ctx, a, b, false); +struct ggml_v2_tensor * ggml_v2_mul( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + struct ggml_v2_tensor * b) { + return ggml_v2_mul_impl(ctx, a, b, false); } -struct ggml_tensor * ggml_mul_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b) { - return ggml_mul_impl(ctx, a, b, true); +struct ggml_v2_tensor * ggml_v2_mul_inplace( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + struct ggml_v2_tensor * b) { + return ggml_v2_mul_impl(ctx, a, b, true); } -// ggml_div +// ggml_v2_div -struct ggml_tensor * ggml_div_impl( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, +struct ggml_v2_tensor * ggml_v2_div_impl( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + struct ggml_v2_tensor * b, bool inplace) { - GGML_ASSERT(ggml_are_same_shape(a, b)); + GGML_V2_ASSERT(ggml_v2_are_same_shape(a, b)); bool is_node = false; @@ -4767,38 +4767,38 @@ struct ggml_tensor * ggml_div_impl( } if (inplace) { - GGML_ASSERT(is_node == false); + GGML_V2_ASSERT(is_node == false); } - struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + struct ggml_v2_tensor * result = inplace ? ggml_v2_view_tensor(ctx, a) : ggml_v2_dup_tensor(ctx, a); - result->op = GGML_OP_DIV; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_V2_OP_DIV; + result->grad = is_node ? 
ggml_v2_dup_tensor(ctx, result) : NULL; result->src0 = a; result->src1 = b; return result; } -struct ggml_tensor * ggml_div( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b) { - return ggml_div_impl(ctx, a, b, false); +struct ggml_v2_tensor * ggml_v2_div( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + struct ggml_v2_tensor * b) { + return ggml_v2_div_impl(ctx, a, b, false); } -struct ggml_tensor * ggml_div_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b) { - return ggml_div_impl(ctx, a, b, true); +struct ggml_v2_tensor * ggml_v2_div_inplace( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + struct ggml_v2_tensor * b) { + return ggml_v2_div_impl(ctx, a, b, true); } -// ggml_sqr +// ggml_v2_sqr -struct ggml_tensor * ggml_sqr_impl( - struct ggml_context * ctx, - struct ggml_tensor * a, +struct ggml_v2_tensor * ggml_v2_sqr_impl( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, bool inplace) { bool is_node = false; @@ -4806,33 +4806,33 @@ struct ggml_tensor * ggml_sqr_impl( is_node = true; } - struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + struct ggml_v2_tensor * result = inplace ? ggml_v2_view_tensor(ctx, a) : ggml_v2_dup_tensor(ctx, a); - result->op = GGML_OP_SQR; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_V2_OP_SQR; + result->grad = is_node ? ggml_v2_dup_tensor(ctx, result) : NULL; result->src0 = a; result->src1 = NULL; return result; } -struct ggml_tensor * ggml_sqr( - struct ggml_context * ctx, - struct ggml_tensor * a) { - return ggml_sqr_impl(ctx, a, false); +struct ggml_v2_tensor * ggml_v2_sqr( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a) { + return ggml_v2_sqr_impl(ctx, a, false); } -struct ggml_tensor * ggml_sqr_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a) { - return ggml_sqr_impl(ctx, a, true); +struct ggml_v2_tensor * ggml_v2_sqr_inplace( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a) { + return ggml_v2_sqr_impl(ctx, a, true); } -// ggml_sqrt +// ggml_v2_sqrt -struct ggml_tensor * ggml_sqrt_impl( - struct ggml_context * ctx, - struct ggml_tensor * a, +struct ggml_v2_tensor * ggml_v2_sqrt_impl( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, bool inplace) { bool is_node = false; @@ -4840,34 +4840,34 @@ struct ggml_tensor * ggml_sqrt_impl( is_node = true; } - struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + struct ggml_v2_tensor * result = inplace ? ggml_v2_view_tensor(ctx, a) : ggml_v2_dup_tensor(ctx, a); - result->op = GGML_OP_SQRT; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_V2_OP_SQRT; + result->grad = is_node ? 
ggml_v2_dup_tensor(ctx, result) : NULL; result->src0 = a; result->src1 = NULL; return result; } -struct ggml_tensor * ggml_sqrt( - struct ggml_context * ctx, - struct ggml_tensor * a) { - return ggml_sqrt_impl(ctx, a, false); +struct ggml_v2_tensor * ggml_v2_sqrt( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a) { + return ggml_v2_sqrt_impl(ctx, a, false); } -struct ggml_tensor * ggml_sqrt_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a) { - return ggml_sqrt_impl(ctx, a, true); +struct ggml_v2_tensor * ggml_v2_sqrt_inplace( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a) { + return ggml_v2_sqrt_impl(ctx, a, true); } -// ggml_log +// ggml_v2_log -struct ggml_tensor * ggml_log_impl( - struct ggml_context * ctx, - struct ggml_tensor * a, +struct ggml_v2_tensor * ggml_v2_log_impl( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, bool inplace) { bool is_node = false; @@ -4875,43 +4875,43 @@ struct ggml_tensor * ggml_log_impl( is_node = true; } - struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + struct ggml_v2_tensor * result = inplace ? ggml_v2_view_tensor(ctx, a) : ggml_v2_dup_tensor(ctx, a); - result->op = GGML_OP_LOG; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_V2_OP_LOG; + result->grad = is_node ? ggml_v2_dup_tensor(ctx, result) : NULL; result->src0 = a; result->src1 = NULL; return result; } -struct ggml_tensor * ggml_log( - struct ggml_context * ctx, - struct ggml_tensor * a) { - return ggml_log_impl(ctx, a, false); +struct ggml_v2_tensor * ggml_v2_log( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a) { + return ggml_v2_log_impl(ctx, a, false); } -struct ggml_tensor * ggml_log_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a) { - return ggml_log_impl(ctx, a, true); +struct ggml_v2_tensor * ggml_v2_log_inplace( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a) { + return ggml_v2_log_impl(ctx, a, true); } -// ggml_sum +// ggml_v2_sum -struct ggml_tensor * ggml_sum( - struct ggml_context * ctx, - struct ggml_tensor * a) { +struct ggml_v2_tensor * ggml_v2_sum( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a) { bool is_node = false; if (a->grad) { is_node = true; } - struct ggml_tensor * result = ggml_new_tensor_1d(ctx, a->type, 1); + struct ggml_v2_tensor * result = ggml_v2_new_tensor_1d(ctx, a->type, 1); - result->op = GGML_OP_SUM; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_V2_OP_SUM; + result->grad = is_node ? ggml_v2_dup_tensor(ctx, result) : NULL; result->src0 = a; result->src1 = NULL; @@ -4919,11 +4919,11 @@ struct ggml_tensor * ggml_sum( } -// ggml_sum_rows +// ggml_v2_sum_rows -struct ggml_tensor * ggml_sum_rows( - struct ggml_context * ctx, - struct ggml_tensor * a) { +struct ggml_v2_tensor * ggml_v2_sum_rows( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a) { bool is_node = false; if (a->grad) { @@ -4935,46 +4935,46 @@ struct ggml_tensor * ggml_sum_rows( ne[i] = a->ne[i]; } - struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, a->n_dims, ne); + struct ggml_v2_tensor * result = ggml_v2_new_tensor(ctx, a->type, a->n_dims, ne); - result->op = GGML_OP_SUM_ROWS; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_V2_OP_SUM_ROWS; + result->grad = is_node ? 
ggml_v2_dup_tensor(ctx, result) : NULL; result->src0 = a; result->src1 = NULL; return result; } -// ggml_mean +// ggml_v2_mean -struct ggml_tensor * ggml_mean( - struct ggml_context * ctx, - struct ggml_tensor * a) { +struct ggml_v2_tensor * ggml_v2_mean( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a) { bool is_node = false; if (a->grad) { - GGML_ASSERT(false); // TODO: implement + GGML_V2_ASSERT(false); // TODO: implement is_node = true; } - int64_t ne[GGML_MAX_DIMS] = { 1, a->ne[1], a->ne[2], a->ne[3] }; - struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, a->n_dims, ne); + int64_t ne[GGML_V2_MAX_DIMS] = { 1, a->ne[1], a->ne[2], a->ne[3] }; + struct ggml_v2_tensor * result = ggml_v2_new_tensor(ctx, GGML_V2_TYPE_F32, a->n_dims, ne); - result->op = GGML_OP_MEAN; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_V2_OP_MEAN; + result->grad = is_node ? ggml_v2_dup_tensor(ctx, result) : NULL; result->src0 = a; result->src1 = NULL; return result; } -// ggml_repeat +// ggml_v2_repeat -struct ggml_tensor * ggml_repeat( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b) { - GGML_ASSERT(ggml_can_repeat(a, b)); +struct ggml_v2_tensor * ggml_v2_repeat( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + struct ggml_v2_tensor * b) { + GGML_V2_ASSERT(ggml_v2_can_repeat(a, b)); bool is_node = false; @@ -4982,25 +4982,25 @@ struct ggml_tensor * ggml_repeat( is_node = true; } - if (ggml_are_same_shape(a, b) && !is_node) { + if (ggml_v2_are_same_shape(a, b) && !is_node) { return a; } - struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, b->n_dims, b->ne); + struct ggml_v2_tensor * result = ggml_v2_new_tensor(ctx, a->type, b->n_dims, b->ne); - result->op = GGML_OP_REPEAT; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_V2_OP_REPEAT; + result->grad = is_node ? ggml_v2_dup_tensor(ctx, result) : NULL; result->src0 = a; result->src1 = b; return result; } -// ggml_abs +// ggml_v2_abs -struct ggml_tensor * ggml_abs_impl( - struct ggml_context * ctx, - struct ggml_tensor * a, +struct ggml_v2_tensor * ggml_v2_abs_impl( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, bool inplace) { bool is_node = false; @@ -5008,34 +5008,34 @@ struct ggml_tensor * ggml_abs_impl( is_node = true; } - struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + struct ggml_v2_tensor * result = inplace ? ggml_v2_view_tensor(ctx, a) : ggml_v2_dup_tensor(ctx, a); - result->op = GGML_OP_ABS; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_V2_OP_ABS; + result->grad = is_node ? 
ggml_v2_dup_tensor(ctx, result) : NULL; result->src0 = a; result->src1 = NULL; return result; } -struct ggml_tensor * ggml_abs( - struct ggml_context * ctx, - struct ggml_tensor * a) { - return ggml_abs_impl(ctx, a, false); +struct ggml_v2_tensor * ggml_v2_abs( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a) { + return ggml_v2_abs_impl(ctx, a, false); } -struct ggml_tensor * ggml_abs_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a) { - return ggml_abs_impl(ctx, a, true); +struct ggml_v2_tensor * ggml_v2_abs_inplace( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a) { + return ggml_v2_abs_impl(ctx, a, true); } -// ggml_sgn +// ggml_v2_sgn -struct ggml_tensor * ggml_sgn_impl( - struct ggml_context * ctx, - struct ggml_tensor * a, +struct ggml_v2_tensor * ggml_v2_sgn_impl( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, bool inplace) { bool is_node = false; @@ -5043,33 +5043,33 @@ struct ggml_tensor * ggml_sgn_impl( is_node = true; } - struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + struct ggml_v2_tensor * result = inplace ? ggml_v2_view_tensor(ctx, a) : ggml_v2_dup_tensor(ctx, a); - result->op = GGML_OP_SGN; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_V2_OP_SGN; + result->grad = is_node ? ggml_v2_dup_tensor(ctx, result) : NULL; result->src0 = a; result->src1 = NULL; return result; } -struct ggml_tensor * ggml_sgn( - struct ggml_context * ctx, - struct ggml_tensor * a) { - return ggml_sgn_impl(ctx, a, false); +struct ggml_v2_tensor * ggml_v2_sgn( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a) { + return ggml_v2_sgn_impl(ctx, a, false); } -struct ggml_tensor * ggml_sgn_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a) { - return ggml_sgn_impl(ctx, a, true); +struct ggml_v2_tensor * ggml_v2_sgn_inplace( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a) { + return ggml_v2_sgn_impl(ctx, a, true); } -// ggml_neg +// ggml_v2_neg -struct ggml_tensor * ggml_neg_impl( - struct ggml_context * ctx, - struct ggml_tensor * a, +struct ggml_v2_tensor * ggml_v2_neg_impl( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, bool inplace) { bool is_node = false; @@ -5077,33 +5077,33 @@ struct ggml_tensor * ggml_neg_impl( is_node = true; } - struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + struct ggml_v2_tensor * result = inplace ? ggml_v2_view_tensor(ctx, a) : ggml_v2_dup_tensor(ctx, a); - result->op = GGML_OP_NEG; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_V2_OP_NEG; + result->grad = is_node ? 
ggml_v2_dup_tensor(ctx, result) : NULL; result->src0 = a; result->src1 = NULL; return result; } -struct ggml_tensor * ggml_neg( - struct ggml_context * ctx, - struct ggml_tensor * a) { - return ggml_neg_impl(ctx, a, false); +struct ggml_v2_tensor * ggml_v2_neg( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a) { + return ggml_v2_neg_impl(ctx, a, false); } -struct ggml_tensor * ggml_neg_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a) { - return ggml_neg_impl(ctx, a, true); +struct ggml_v2_tensor * ggml_v2_neg_inplace( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a) { + return ggml_v2_neg_impl(ctx, a, true); } -// ggml_step +// ggml_v2_step -struct ggml_tensor * ggml_step_impl( - struct ggml_context * ctx, - struct ggml_tensor * a, +struct ggml_v2_tensor * ggml_v2_step_impl( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, bool inplace) { bool is_node = false; @@ -5111,33 +5111,33 @@ struct ggml_tensor * ggml_step_impl( is_node = true; } - struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + struct ggml_v2_tensor * result = inplace ? ggml_v2_view_tensor(ctx, a) : ggml_v2_dup_tensor(ctx, a); - result->op = GGML_OP_STEP; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_V2_OP_STEP; + result->grad = is_node ? ggml_v2_dup_tensor(ctx, result) : NULL; result->src0 = a; result->src1 = NULL; return result; } -struct ggml_tensor * ggml_step( - struct ggml_context * ctx, - struct ggml_tensor * a) { - return ggml_step_impl(ctx, a, false); +struct ggml_v2_tensor * ggml_v2_step( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a) { + return ggml_v2_step_impl(ctx, a, false); } -struct ggml_tensor * ggml_step_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a) { - return ggml_step_impl(ctx, a, true); +struct ggml_v2_tensor * ggml_v2_step_inplace( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a) { + return ggml_v2_step_impl(ctx, a, true); } -// ggml_relu +// ggml_v2_relu -struct ggml_tensor * ggml_relu_impl( - struct ggml_context * ctx, - struct ggml_tensor * a, +struct ggml_v2_tensor * ggml_v2_relu_impl( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, bool inplace) { bool is_node = false; @@ -5145,33 +5145,33 @@ struct ggml_tensor * ggml_relu_impl( is_node = true; } - struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + struct ggml_v2_tensor * result = inplace ? ggml_v2_view_tensor(ctx, a) : ggml_v2_dup_tensor(ctx, a); - result->op = GGML_OP_RELU; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_V2_OP_RELU; + result->grad = is_node ? 
ggml_v2_dup_tensor(ctx, result) : NULL; result->src0 = a; result->src1 = NULL; return result; } -struct ggml_tensor * ggml_relu( - struct ggml_context * ctx, - struct ggml_tensor * a) { - return ggml_relu_impl(ctx, a, false); +struct ggml_v2_tensor * ggml_v2_relu( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a) { + return ggml_v2_relu_impl(ctx, a, false); } -struct ggml_tensor * ggml_relu_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a) { - return ggml_relu_impl(ctx, a, true); +struct ggml_v2_tensor * ggml_v2_relu_inplace( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a) { + return ggml_v2_relu_impl(ctx, a, true); } -// ggml_gelu +// ggml_v2_gelu -struct ggml_tensor * ggml_gelu_impl( - struct ggml_context * ctx, - struct ggml_tensor * a, +struct ggml_v2_tensor * ggml_v2_gelu_impl( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, bool inplace) { bool is_node = false; @@ -5179,33 +5179,33 @@ struct ggml_tensor * ggml_gelu_impl( is_node = true; } - struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + struct ggml_v2_tensor * result = inplace ? ggml_v2_view_tensor(ctx, a) : ggml_v2_dup_tensor(ctx, a); - result->op = GGML_OP_GELU; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_V2_OP_GELU; + result->grad = is_node ? ggml_v2_dup_tensor(ctx, result) : NULL; result->src0 = a; result->src1 = NULL; return result; } -struct ggml_tensor * ggml_gelu( - struct ggml_context * ctx, - struct ggml_tensor * a) { - return ggml_gelu_impl(ctx, a, false); +struct ggml_v2_tensor * ggml_v2_gelu( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a) { + return ggml_v2_gelu_impl(ctx, a, false); } -struct ggml_tensor * ggml_gelu_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a) { - return ggml_gelu_impl(ctx, a, true); +struct ggml_v2_tensor * ggml_v2_gelu_inplace( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a) { + return ggml_v2_gelu_impl(ctx, a, true); } -// ggml_silu +// ggml_v2_silu -struct ggml_tensor * ggml_silu_impl( - struct ggml_context * ctx, - struct ggml_tensor * a, +struct ggml_v2_tensor * ggml_v2_silu_impl( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, bool inplace) { bool is_node = false; @@ -5213,34 +5213,34 @@ struct ggml_tensor * ggml_silu_impl( is_node = true; } - struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + struct ggml_v2_tensor * result = inplace ? ggml_v2_view_tensor(ctx, a) : ggml_v2_dup_tensor(ctx, a); - result->op = GGML_OP_SILU; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_V2_OP_SILU; + result->grad = is_node ? 
ggml_v2_dup_tensor(ctx, result) : NULL; result->src0 = a; result->src1 = NULL; return result; } -struct ggml_tensor * ggml_silu( - struct ggml_context * ctx, - struct ggml_tensor * a) { - return ggml_silu_impl(ctx, a, false); +struct ggml_v2_tensor * ggml_v2_silu( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a) { + return ggml_v2_silu_impl(ctx, a, false); } -struct ggml_tensor * ggml_silu_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a) { - return ggml_silu_impl(ctx, a, true); +struct ggml_v2_tensor * ggml_v2_silu_inplace( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a) { + return ggml_v2_silu_impl(ctx, a, true); } -// ggml_silu_back +// ggml_v2_silu_back -struct ggml_tensor * ggml_silu_back( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b) { +struct ggml_v2_tensor * ggml_v2_silu_back( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + struct ggml_v2_tensor * b) { bool is_node = false; if (a->grad || b->grad) { @@ -5248,54 +5248,54 @@ struct ggml_tensor * ggml_silu_back( is_node = true; } - struct ggml_tensor * result = ggml_dup_tensor(ctx, a); + struct ggml_v2_tensor * result = ggml_v2_dup_tensor(ctx, a); - result->op = GGML_OP_SILU_BACK; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_V2_OP_SILU_BACK; + result->grad = is_node ? ggml_v2_dup_tensor(ctx, result) : NULL; result->src0 = a; result->src1 = b; return result; } -// ggml_norm +// ggml_v2_norm -struct ggml_tensor * ggml_norm_impl( - struct ggml_context * ctx, - struct ggml_tensor * a, +struct ggml_v2_tensor * ggml_v2_norm_impl( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, bool inplace) { bool is_node = false; if (!inplace && (a->grad)) { - GGML_ASSERT(false); // TODO: implement backward + GGML_V2_ASSERT(false); // TODO: implement backward is_node = true; } - struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + struct ggml_v2_tensor * result = inplace ? ggml_v2_view_tensor(ctx, a) : ggml_v2_dup_tensor(ctx, a); - result->op = GGML_OP_NORM; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_V2_OP_NORM; + result->grad = is_node ? ggml_v2_dup_tensor(ctx, result) : NULL; result->src0 = a; result->src1 = NULL; // TODO: maybe store epsilon here? return result; } -struct ggml_tensor * ggml_norm( - struct ggml_context * ctx, - struct ggml_tensor * a) { - return ggml_norm_impl(ctx, a, false); +struct ggml_v2_tensor * ggml_v2_norm( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a) { + return ggml_v2_norm_impl(ctx, a, false); } -struct ggml_tensor * ggml_norm_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a) { - return ggml_norm_impl(ctx, a, true); +struct ggml_v2_tensor * ggml_v2_norm_inplace( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a) { + return ggml_v2_norm_impl(ctx, a, true); } -struct ggml_tensor * ggml_rms_norm_impl( - struct ggml_context * ctx, - struct ggml_tensor * a, +struct ggml_v2_tensor * ggml_v2_rms_norm_impl( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, bool inplace) { bool is_node = false; @@ -5303,32 +5303,32 @@ struct ggml_tensor * ggml_rms_norm_impl( is_node = true; } - struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + struct ggml_v2_tensor * result = inplace ? ggml_v2_view_tensor(ctx, a) : ggml_v2_dup_tensor(ctx, a); - result->op = GGML_OP_RMS_NORM; - result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_V2_OP_RMS_NORM; + result->grad = is_node ? ggml_v2_dup_tensor(ctx, result) : NULL; result->src0 = a; result->src1 = NULL; // TODO: maybe store epsilon here? return result; } -struct ggml_tensor * ggml_rms_norm( - struct ggml_context * ctx, - struct ggml_tensor * a) { - return ggml_rms_norm_impl(ctx, a, false); +struct ggml_v2_tensor * ggml_v2_rms_norm( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a) { + return ggml_v2_rms_norm_impl(ctx, a, false); } -struct ggml_tensor * ggml_rms_norm_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a) { - return ggml_rms_norm_impl(ctx, a, true); +struct ggml_v2_tensor * ggml_v2_rms_norm_inplace( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a) { + return ggml_v2_rms_norm_impl(ctx, a, true); } -struct ggml_tensor * ggml_rms_norm_back( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b) { +struct ggml_v2_tensor * ggml_v2_rms_norm_back( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + struct ggml_v2_tensor * b) { bool is_node = false; if (a->grad) { @@ -5336,10 +5336,10 @@ struct ggml_tensor * ggml_rms_norm_back( is_node = true; } - struct ggml_tensor * result = ggml_dup_tensor(ctx, a); + struct ggml_v2_tensor * result = ggml_v2_dup_tensor(ctx, a); - result->op = GGML_OP_RMS_NORM_BACK; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_V2_OP_RMS_NORM_BACK; + result->grad = is_node ? ggml_v2_dup_tensor(ctx, result) : NULL; result->src0 = a; result->src1 = b; @@ -5347,14 +5347,14 @@ struct ggml_tensor * ggml_rms_norm_back( } -// ggml_mul_mat +// ggml_v2_mul_mat -struct ggml_tensor * ggml_mul_mat( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b) { - GGML_ASSERT(ggml_can_mul_mat(a, b)); - GGML_ASSERT(!ggml_is_transposed(a)); +struct ggml_v2_tensor * ggml_v2_mul_mat( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + struct ggml_v2_tensor * b) { + GGML_V2_ASSERT(ggml_v2_can_mul_mat(a, b)); + GGML_V2_ASSERT(!ggml_v2_is_transposed(a)); bool is_node = false; @@ -5363,25 +5363,25 @@ struct ggml_tensor * ggml_mul_mat( } const int64_t ne[4] = { a->ne[1], b->ne[1], a->ne[2], b->ne[3] }; - struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MIN(a->n_dims, b->n_dims), ne); + struct ggml_v2_tensor * result = ggml_v2_new_tensor(ctx, GGML_V2_TYPE_F32, MIN(a->n_dims, b->n_dims), ne); - result->op = GGML_OP_MUL_MAT; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_V2_OP_MUL_MAT; + result->grad = is_node ? ggml_v2_dup_tensor(ctx, result) : NULL; result->src0 = a; result->src1 = b; return result; } -// ggml_scale +// ggml_v2_scale -struct ggml_tensor * ggml_scale_impl( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, +struct ggml_v2_tensor * ggml_v2_scale_impl( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + struct ggml_v2_tensor * b, bool inplace) { - GGML_ASSERT(ggml_is_scalar(b)); - GGML_ASSERT(ggml_is_padded_1d(a)); + GGML_V2_ASSERT(ggml_v2_is_scalar(b)); + GGML_V2_ASSERT(ggml_v2_is_padded_1d(a)); bool is_node = false; @@ -5389,42 +5389,42 @@ struct ggml_tensor * ggml_scale_impl( is_node = true; } - struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + struct ggml_v2_tensor * result = inplace ? ggml_v2_view_tensor(ctx, a) : ggml_v2_dup_tensor(ctx, a); - result->op = GGML_OP_SCALE; - result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_V2_OP_SCALE; + result->grad = is_node ? ggml_v2_dup_tensor(ctx, result) : NULL; result->src0 = a; result->src1 = b; return result; } -struct ggml_tensor * ggml_scale( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b) { - return ggml_scale_impl(ctx, a, b, false); +struct ggml_v2_tensor * ggml_v2_scale( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + struct ggml_v2_tensor * b) { + return ggml_v2_scale_impl(ctx, a, b, false); } -struct ggml_tensor * ggml_scale_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b) { - return ggml_scale_impl(ctx, a, b, true); +struct ggml_v2_tensor * ggml_v2_scale_inplace( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + struct ggml_v2_tensor * b) { + return ggml_v2_scale_impl(ctx, a, b, true); } -// ggml_set +// ggml_v2_set -struct ggml_tensor * ggml_set_impl( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, +struct ggml_v2_tensor * ggml_v2_set_impl( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + struct ggml_v2_tensor * b, size_t nb1, size_t nb2, size_t nb3, size_t offset, bool inplace) { - GGML_ASSERT(ggml_nelements(a) >= ggml_nelements(b)); + GGML_V2_ASSERT(ggml_v2_nelements(a) >= ggml_v2_nelements(b)); bool is_node = false; @@ -5433,11 +5433,11 @@ struct ggml_tensor * ggml_set_impl( } // make a view of the destination - struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + struct ggml_v2_tensor * result = inplace ? ggml_v2_view_tensor(ctx, a) : ggml_v2_dup_tensor(ctx, a); - ggml_scratch_save(ctx); + ggml_v2_scratch_save(ctx); - struct ggml_tensor * c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 5); + struct ggml_v2_tensor * c = ggml_v2_new_tensor_1d(ctx, GGML_V2_TYPE_I32, 5); (( int32_t * ) c->data)[0] = nb1; (( int32_t * ) c->data)[1] = nb2; @@ -5445,10 +5445,10 @@ struct ggml_tensor * ggml_set_impl( (( int32_t * ) c->data)[3] = offset; (( int32_t * ) c->data)[4] = inplace ? 1 : 0; - ggml_scratch_load(ctx); + ggml_v2_scratch_load(ctx); - result->op = GGML_OP_SET; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_V2_OP_SET; + result->grad = is_node ? 
ggml_v2_dup_tensor(ctx, result) : NULL; result->src0 = a; result->src1 = b; result->opt[0] = c; @@ -5456,71 +5456,71 @@ struct ggml_tensor * ggml_set_impl( return result; } -struct ggml_tensor * ggml_set( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, +struct ggml_v2_tensor * ggml_v2_set( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + struct ggml_v2_tensor * b, size_t nb1, size_t nb2, size_t nb3, size_t offset) { - return ggml_set_impl(ctx, a, b, nb1, nb2, nb3, offset, false); + return ggml_v2_set_impl(ctx, a, b, nb1, nb2, nb3, offset, false); } -struct ggml_tensor * ggml_set_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, +struct ggml_v2_tensor * ggml_v2_set_inplace( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + struct ggml_v2_tensor * b, size_t nb1, size_t nb2, size_t nb3, size_t offset) { - return ggml_set_impl(ctx, a, b, nb1, nb2, nb3, offset, true); + return ggml_v2_set_impl(ctx, a, b, nb1, nb2, nb3, offset, true); } -struct ggml_tensor * ggml_set_1d( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, +struct ggml_v2_tensor * ggml_v2_set_1d( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + struct ggml_v2_tensor * b, size_t offset) { - return ggml_set_impl(ctx, a, b, a->nb[1], a->nb[2], a->nb[3], offset, false); + return ggml_v2_set_impl(ctx, a, b, a->nb[1], a->nb[2], a->nb[3], offset, false); } -struct ggml_tensor * ggml_set_1d_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, +struct ggml_v2_tensor * ggml_v2_set_1d_inplace( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + struct ggml_v2_tensor * b, size_t offset) { - return ggml_set_impl(ctx, a, b, a->nb[1], a->nb[2], a->nb[3], offset, true); + return ggml_v2_set_impl(ctx, a, b, a->nb[1], a->nb[2], a->nb[3], offset, true); } -struct ggml_tensor * ggml_set_2d( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, +struct ggml_v2_tensor * ggml_v2_set_2d( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + struct ggml_v2_tensor * b, size_t nb1, size_t offset) { - return ggml_set_impl(ctx, a, b, nb1, a->nb[2], a->nb[3], offset, false); + return ggml_v2_set_impl(ctx, a, b, nb1, a->nb[2], a->nb[3], offset, false); } -struct ggml_tensor * ggml_set_2d_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, +struct ggml_v2_tensor * ggml_v2_set_2d_inplace( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + struct ggml_v2_tensor * b, size_t nb1, size_t offset) { - return ggml_set_impl(ctx, a, b, nb1, a->nb[2], a->nb[3], offset, false); + return ggml_v2_set_impl(ctx, a, b, nb1, a->nb[2], a->nb[3], offset, false); } -// ggml_cpy +// ggml_v2_cpy -struct ggml_tensor * ggml_cpy_impl( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, +struct ggml_v2_tensor * ggml_v2_cpy_impl( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + struct ggml_v2_tensor * b, bool inplace) { - GGML_ASSERT(ggml_nelements(a) == ggml_nelements(b)); + GGML_V2_ASSERT(ggml_v2_nelements(a) == ggml_v2_nelements(b)); bool is_node = false; @@ -5529,35 +5529,35 @@ struct ggml_tensor * ggml_cpy_impl( } // make a view of the destination - struct ggml_tensor * result = ggml_view_tensor(ctx, b); + struct ggml_v2_tensor * result = ggml_v2_view_tensor(ctx, b); - result->op = GGML_OP_CPY; - result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_V2_OP_CPY; + result->grad = is_node ? ggml_v2_dup_tensor(ctx, result) : NULL; result->src0 = a; result->src1 = b; return result; } -struct ggml_tensor * ggml_cpy( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b) { - return ggml_cpy_impl(ctx, a, b, false); +struct ggml_v2_tensor * ggml_v2_cpy( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + struct ggml_v2_tensor * b) { + return ggml_v2_cpy_impl(ctx, a, b, false); } -struct ggml_tensor * ggml_cpy_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b) { - return ggml_cpy_impl(ctx, a, b, true); +struct ggml_v2_tensor * ggml_v2_cpy_inplace( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + struct ggml_v2_tensor * b) { + return ggml_v2_cpy_impl(ctx, a, b, true); } -// ggml_cont +// ggml_v2_cont -struct ggml_tensor * ggml_cont_impl( - struct ggml_context * ctx, - struct ggml_tensor * a, +struct ggml_v2_tensor * ggml_v2_cont_impl( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, bool inplace) { bool is_node = false; @@ -5565,37 +5565,37 @@ struct ggml_tensor * ggml_cont_impl( is_node = true; } - struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + struct ggml_v2_tensor * result = inplace ? ggml_v2_view_tensor(ctx, a) : ggml_v2_dup_tensor(ctx, a); - result->op = GGML_OP_CONT; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_V2_OP_CONT; + result->grad = is_node ? ggml_v2_dup_tensor(ctx, result) : NULL; result->src0 = a; result->src1 = NULL; return result; } -struct ggml_tensor * ggml_cont( - struct ggml_context * ctx, - struct ggml_tensor * a) { - return ggml_cont_impl(ctx, a, false); +struct ggml_v2_tensor * ggml_v2_cont( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a) { + return ggml_v2_cont_impl(ctx, a, false); } -struct ggml_tensor * ggml_cont_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a) { - return ggml_cont_impl(ctx, a, true); +struct ggml_v2_tensor * ggml_v2_cont_inplace( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a) { + return ggml_v2_cont_impl(ctx, a, true); } -// ggml_reshape +// ggml_v2_reshape -struct ggml_tensor * ggml_reshape( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b) { - GGML_ASSERT(ggml_is_contiguous(a)); - GGML_ASSERT(ggml_is_contiguous(b)); - GGML_ASSERT(ggml_nelements(a) == ggml_nelements(b)); +struct ggml_v2_tensor * ggml_v2_reshape( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + struct ggml_v2_tensor * b) { + GGML_V2_ASSERT(ggml_v2_is_contiguous(a)); + GGML_V2_ASSERT(ggml_v2_is_contiguous(b)); + GGML_V2_ASSERT(ggml_v2_nelements(a) == ggml_v2_nelements(b)); bool is_node = false; @@ -5605,25 +5605,25 @@ struct ggml_tensor * ggml_reshape( if (b->grad) { // gradient propagation is not supported - //GGML_ASSERT(false); + //GGML_V2_ASSERT(false); } - struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, b->n_dims, b->ne, a->data); + struct ggml_v2_tensor * result = ggml_v2_new_tensor_impl(ctx, a->type, b->n_dims, b->ne, a->data); - result->op = GGML_OP_RESHAPE; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_V2_OP_RESHAPE; + result->grad = is_node ? 
ggml_v2_dup_tensor(ctx, result) : NULL; result->src0 = a; result->src1 = NULL; return result; } -struct ggml_tensor * ggml_reshape_1d( - struct ggml_context * ctx, - struct ggml_tensor * a, +struct ggml_v2_tensor * ggml_v2_reshape_1d( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, int64_t ne0) { - GGML_ASSERT(ggml_is_contiguous(a)); - GGML_ASSERT(ggml_nelements(a) == ne0); + GGML_V2_ASSERT(ggml_v2_is_contiguous(a)); + GGML_V2_ASSERT(ggml_v2_nelements(a) == ne0); bool is_node = false; @@ -5632,23 +5632,23 @@ struct ggml_tensor * ggml_reshape_1d( } const int64_t ne[1] = { ne0 }; - struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, ne, a->data); + struct ggml_v2_tensor * result = ggml_v2_new_tensor_impl(ctx, a->type, 1, ne, a->data); - result->op = GGML_OP_RESHAPE; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_V2_OP_RESHAPE; + result->grad = is_node ? ggml_v2_dup_tensor(ctx, result) : NULL; result->src0 = a; result->src1 = NULL; return result; } -struct ggml_tensor * ggml_reshape_2d( - struct ggml_context * ctx, - struct ggml_tensor * a, +struct ggml_v2_tensor * ggml_v2_reshape_2d( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, int64_t ne0, int64_t ne1) { - GGML_ASSERT(ggml_is_contiguous(a)); - GGML_ASSERT(ggml_nelements(a) == ne0*ne1); + GGML_V2_ASSERT(ggml_v2_is_contiguous(a)); + GGML_V2_ASSERT(ggml_v2_nelements(a) == ne0*ne1); bool is_node = false; @@ -5657,24 +5657,24 @@ struct ggml_tensor * ggml_reshape_2d( } const int64_t ne[2] = { ne0, ne1 }; - struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, a->data); + struct ggml_v2_tensor * result = ggml_v2_new_tensor_impl(ctx, a->type, 2, ne, a->data); - result->op = GGML_OP_RESHAPE; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_V2_OP_RESHAPE; + result->grad = is_node ? ggml_v2_dup_tensor(ctx, result) : NULL; result->src0 = a; result->src1 = NULL; return result; } -struct ggml_tensor * ggml_reshape_3d( - struct ggml_context * ctx, - struct ggml_tensor * a, +struct ggml_v2_tensor * ggml_v2_reshape_3d( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, int64_t ne0, int64_t ne1, int64_t ne2) { - GGML_ASSERT(ggml_is_contiguous(a)); - GGML_ASSERT(ggml_nelements(a) == ne0*ne1*ne2); + GGML_V2_ASSERT(ggml_v2_is_contiguous(a)); + GGML_V2_ASSERT(ggml_v2_nelements(a) == ne0*ne1*ne2); bool is_node = false; @@ -5683,10 +5683,10 @@ struct ggml_tensor * ggml_reshape_3d( } const int64_t ne[3] = { ne0, ne1, ne2 }; - struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, a->data); + struct ggml_v2_tensor * result = ggml_v2_new_tensor_impl(ctx, a->type, 3, ne, a->data); - result->op = GGML_OP_RESHAPE; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_V2_OP_RESHAPE; + result->grad = is_node ? 
ggml_v2_dup_tensor(ctx, result) : NULL; result->src0 = a; result->src1 = NULL; @@ -5694,15 +5694,15 @@ struct ggml_tensor * ggml_reshape_3d( } -struct ggml_tensor * ggml_reshape_4d( - struct ggml_context * ctx, - struct ggml_tensor * a, +struct ggml_v2_tensor * ggml_v2_reshape_4d( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) { - GGML_ASSERT(ggml_is_contiguous(a)); - GGML_ASSERT(ggml_nelements(a) == ne0*ne1*ne2*ne3); + GGML_V2_ASSERT(ggml_v2_is_contiguous(a)); + GGML_V2_ASSERT(ggml_v2_nelements(a) == ne0*ne1*ne2*ne3); bool is_node = false; @@ -5711,21 +5711,21 @@ struct ggml_tensor * ggml_reshape_4d( } const int64_t ne[4] = { ne0, ne1, ne2, ne3 }; - struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, a->data); + struct ggml_v2_tensor * result = ggml_v2_new_tensor_impl(ctx, a->type, 4, ne, a->data); - result->op = GGML_OP_RESHAPE; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_V2_OP_RESHAPE; + result->grad = is_node ? ggml_v2_dup_tensor(ctx, result) : NULL; result->src0 = a; result->src1 = NULL; return result; } -// ggml_view_1d +// ggml_v2_view_1d -struct ggml_tensor * ggml_view_1d( - struct ggml_context * ctx, - struct ggml_tensor * a, +struct ggml_v2_tensor * ggml_v2_view_1d( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, int64_t ne0, size_t offset) { @@ -5735,10 +5735,10 @@ struct ggml_tensor * ggml_view_1d( is_node = true; } - struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, &ne0, (char *) a->data + offset); + struct ggml_v2_tensor * result = ggml_v2_new_tensor_impl(ctx, a->type, 1, &ne0, (char *) a->data + offset); - result->op = GGML_OP_VIEW; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_V2_OP_VIEW; + result->grad = is_node ? ggml_v2_dup_tensor(ctx, result) : NULL; result->src0 = a; result->src1 = NULL; @@ -5749,11 +5749,11 @@ struct ggml_tensor * ggml_view_1d( return result; } -// ggml_view_2d +// ggml_v2_view_2d -struct ggml_tensor * ggml_view_2d( - struct ggml_context * ctx, - struct ggml_tensor * a, +struct ggml_v2_tensor * ggml_v2_view_2d( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, int64_t ne0, int64_t ne1, size_t nb1, @@ -5765,16 +5765,16 @@ struct ggml_tensor * ggml_view_2d( is_node = true; } - const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, 1, 1 }; + const int64_t ne[GGML_V2_MAX_DIMS] = { ne0, ne1, 1, 1 }; - struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, (char *) a->data + offset); + struct ggml_v2_tensor * result = ggml_v2_new_tensor_impl(ctx, a->type, 2, ne, (char *) a->data + offset); result->nb[1] = nb1; result->nb[2] = result->nb[1]*ne1; result->nb[3] = result->nb[2]; - result->op = GGML_OP_VIEW; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_V2_OP_VIEW; + result->grad = is_node ? 
ggml_v2_dup_tensor(ctx, result) : NULL; result->src0 = a; result->src1 = NULL; @@ -5785,11 +5785,11 @@ struct ggml_tensor * ggml_view_2d( return result; } -// ggml_view_3d +// ggml_v2_view_3d -struct ggml_tensor * ggml_view_3d( - struct ggml_context * ctx, - struct ggml_tensor * a, +struct ggml_v2_tensor * ggml_v2_view_3d( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, int64_t ne0, int64_t ne1, int64_t ne2, @@ -5803,16 +5803,16 @@ struct ggml_tensor * ggml_view_3d( is_node = true; } - const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, 1 }; + const int64_t ne[GGML_V2_MAX_DIMS] = { ne0, ne1, ne2, 1 }; - struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, (char *) a->data + offset); + struct ggml_v2_tensor * result = ggml_v2_new_tensor_impl(ctx, a->type, 3, ne, (char *) a->data + offset); result->nb[1] = nb1; result->nb[2] = nb2; result->nb[3] = result->nb[2]*ne2; - result->op = GGML_OP_VIEW; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_V2_OP_VIEW; + result->grad = is_node ? ggml_v2_dup_tensor(ctx, result) : NULL; result->src0 = a; result->src1 = NULL; @@ -5823,11 +5823,11 @@ struct ggml_tensor * ggml_view_3d( return result; } -// ggml_view_4d +// ggml_v2_view_4d -struct ggml_tensor * ggml_view_4d( - struct ggml_context * ctx, - struct ggml_tensor * a, +struct ggml_v2_tensor * ggml_v2_view_4d( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, int64_t ne0, int64_t ne1, int64_t ne2, @@ -5843,16 +5843,16 @@ struct ggml_tensor * ggml_view_4d( is_node = true; } - const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, ne3 }; + const int64_t ne[GGML_V2_MAX_DIMS] = { ne0, ne1, ne2, ne3 }; - struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, (char *) a->data + offset); + struct ggml_v2_tensor * result = ggml_v2_new_tensor_impl(ctx, a->type, 4, ne, (char *) a->data + offset); result->nb[1] = nb1; result->nb[2] = nb2; result->nb[3] = nb3; - result->op = GGML_OP_VIEW; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_V2_OP_VIEW; + result->grad = is_node ? 
ggml_v2_dup_tensor(ctx, result) : NULL; result->src0 = a; result->src1 = NULL; @@ -5863,26 +5863,26 @@ struct ggml_tensor * ggml_view_4d( return result; } -// ggml_permute +// ggml_v2_permute -struct ggml_tensor * ggml_permute( - struct ggml_context * ctx, - struct ggml_tensor * a, +struct ggml_v2_tensor * ggml_v2_permute( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, int axis0, int axis1, int axis2, int axis3) { - GGML_ASSERT(axis0 >= 0 && axis0 < GGML_MAX_DIMS); - GGML_ASSERT(axis1 >= 0 && axis1 < GGML_MAX_DIMS); - GGML_ASSERT(axis2 >= 0 && axis2 < GGML_MAX_DIMS); - GGML_ASSERT(axis3 >= 0 && axis3 < GGML_MAX_DIMS); + GGML_V2_ASSERT(axis0 >= 0 && axis0 < GGML_V2_MAX_DIMS); + GGML_V2_ASSERT(axis1 >= 0 && axis1 < GGML_V2_MAX_DIMS); + GGML_V2_ASSERT(axis2 >= 0 && axis2 < GGML_V2_MAX_DIMS); + GGML_V2_ASSERT(axis3 >= 0 && axis3 < GGML_V2_MAX_DIMS); - GGML_ASSERT(axis0 != axis1); - GGML_ASSERT(axis0 != axis2); - GGML_ASSERT(axis0 != axis3); - GGML_ASSERT(axis1 != axis2); - GGML_ASSERT(axis1 != axis3); - GGML_ASSERT(axis2 != axis3); + GGML_V2_ASSERT(axis0 != axis1); + GGML_V2_ASSERT(axis0 != axis2); + GGML_V2_ASSERT(axis0 != axis3); + GGML_V2_ASSERT(axis1 != axis2); + GGML_V2_ASSERT(axis1 != axis3); + GGML_V2_ASSERT(axis2 != axis3); bool is_node = false; @@ -5890,10 +5890,10 @@ struct ggml_tensor * ggml_permute( is_node = true; } - struct ggml_tensor * result = ggml_view_tensor(ctx, a); + struct ggml_v2_tensor * result = ggml_v2_view_tensor(ctx, a); - int ne[GGML_MAX_DIMS]; - int nb[GGML_MAX_DIMS]; + int ne[GGML_V2_MAX_DIMS]; + int nb[GGML_V2_MAX_DIMS]; ne[axis0] = a->ne[0]; ne[axis1] = a->ne[1]; @@ -5915,8 +5915,8 @@ struct ggml_tensor * ggml_permute( result->nb[2] = nb[2]; result->nb[3] = nb[3]; - result->op = GGML_OP_PERMUTE; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_V2_OP_PERMUTE; + result->grad = is_node ? ggml_v2_dup_tensor(ctx, result) : NULL; result->src0 = a; result->src1 = NULL; @@ -5930,18 +5930,18 @@ struct ggml_tensor * ggml_permute( return result; } -// ggml_transpose +// ggml_v2_transpose -struct ggml_tensor * ggml_transpose( - struct ggml_context * ctx, - struct ggml_tensor * a) { +struct ggml_v2_tensor * ggml_v2_transpose( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a) { bool is_node = false; if (a->grad) { is_node = true; } - struct ggml_tensor * result = ggml_view_tensor(ctx, a); + struct ggml_v2_tensor * result = ggml_v2_view_tensor(ctx, a); result->ne[0] = a->ne[1]; result->ne[1] = a->ne[0]; @@ -5949,21 +5949,21 @@ struct ggml_tensor * ggml_transpose( result->nb[0] = a->nb[1]; result->nb[1] = a->nb[0]; - result->op = GGML_OP_TRANSPOSE; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_V2_OP_TRANSPOSE; + result->grad = is_node ? 
ggml_v2_dup_tensor(ctx, result) : NULL; result->src0 = a; result->src1 = NULL; return result; } -// ggml_get_rows +// ggml_v2_get_rows -struct ggml_tensor * ggml_get_rows( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b) { - GGML_ASSERT(ggml_is_matrix(a) && ggml_is_vector(b) && b->type == GGML_TYPE_I32); +struct ggml_v2_tensor * ggml_v2_get_rows( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + struct ggml_v2_tensor * b) { + GGML_V2_ASSERT(ggml_v2_is_matrix(a) && ggml_v2_is_vector(b) && b->type == GGML_V2_TYPE_I32); bool is_node = false; @@ -5972,26 +5972,26 @@ struct ggml_tensor * ggml_get_rows( } // TODO: implement non F32 return - //struct ggml_tensor * result = ggml_new_tensor_2d(ctx, a->type, a->ne[0], b->ne[0]); - struct ggml_tensor * result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, a->ne[0], b->ne[0]); + //struct ggml_v2_tensor * result = ggml_v2_new_tensor_2d(ctx, a->type, a->ne[0], b->ne[0]); + struct ggml_v2_tensor * result = ggml_v2_new_tensor_2d(ctx, GGML_V2_TYPE_F32, a->ne[0], b->ne[0]); - result->op = GGML_OP_GET_ROWS; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_V2_OP_GET_ROWS; + result->grad = is_node ? ggml_v2_dup_tensor(ctx, result) : NULL; result->src0 = a; result->src1 = b; return result; } -// ggml_get_rows_back +// ggml_v2_get_rows_back -struct ggml_tensor * ggml_get_rows_back( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - struct ggml_tensor * c) { - GGML_ASSERT(ggml_is_matrix(a) && ggml_is_vector(b) && b->type == GGML_TYPE_I32); - GGML_ASSERT(ggml_is_matrix(c) && (a->ne[0] == c->ne[0])); +struct ggml_v2_tensor * ggml_v2_get_rows_back( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + struct ggml_v2_tensor * b, + struct ggml_v2_tensor * c) { + GGML_V2_ASSERT(ggml_v2_is_matrix(a) && ggml_v2_is_vector(b) && b->type == GGML_V2_TYPE_I32); + GGML_V2_ASSERT(ggml_v2_is_matrix(c) && (a->ne[0] == c->ne[0])); bool is_node = false; @@ -6000,11 +6000,11 @@ struct ggml_tensor * ggml_get_rows_back( } // TODO: implement non F32 return - //struct ggml_tensor * result = ggml_new_tensor_2d(ctx, a->type, a->ne[0], b->ne[0]); - struct ggml_tensor * result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, c->ne[0], c->ne[1]); + //struct ggml_v2_tensor * result = ggml_v2_new_tensor_2d(ctx, a->type, a->ne[0], b->ne[0]); + struct ggml_v2_tensor * result = ggml_v2_new_tensor_2d(ctx, GGML_V2_TYPE_F32, c->ne[0], c->ne[1]); - result->op = GGML_OP_GET_ROWS_BACK; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_V2_OP_GET_ROWS_BACK; + result->grad = is_node ? ggml_v2_dup_tensor(ctx, result) : NULL; result->src0 = a; result->src1 = b; result->opt[0] = c; @@ -6012,12 +6012,12 @@ struct ggml_tensor * ggml_get_rows_back( return result; } -// ggml_diag +// ggml_v2_diag -struct ggml_tensor * ggml_diag( - struct ggml_context * ctx, - struct ggml_tensor * a) { - GGML_ASSERT(a->ne[1] == 1); +struct ggml_v2_tensor * ggml_v2_diag( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a) { + GGML_V2_ASSERT(a->ne[1] == 1); bool is_node = false; if (a->grad) { @@ -6025,10 +6025,10 @@ struct ggml_tensor * ggml_diag( } const int64_t ne[4] = { a->ne[0], a->ne[0], a->ne[2], a->ne[3] }; - struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, MAX(a->n_dims, 2), ne); + struct ggml_v2_tensor * result = ggml_v2_new_tensor(ctx, a->type, MAX(a->n_dims, 2), ne); - result->op = GGML_OP_DIAG; - result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_V2_OP_DIAG; + result->grad = is_node ? ggml_v2_dup_tensor(ctx, result) : NULL; result->src0 = a; result->src1 = NULL; @@ -6036,11 +6036,11 @@ struct ggml_tensor * ggml_diag( } -// ggml_diag_mask_inf +// ggml_v2_diag_mask_inf -struct ggml_tensor * ggml_diag_mask_inf_impl( - struct ggml_context * ctx, - struct ggml_tensor * a, +struct ggml_v2_tensor * ggml_v2_diag_mask_inf_impl( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, int n_past, bool inplace) { bool is_node = false; @@ -6049,45 +6049,45 @@ struct ggml_tensor * ggml_diag_mask_inf_impl( is_node = true; } - struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + struct ggml_v2_tensor * result = inplace ? ggml_v2_view_tensor(ctx, a) : ggml_v2_dup_tensor(ctx, a); - ggml_scratch_save(ctx); + ggml_v2_scratch_save(ctx); - struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2); + struct ggml_v2_tensor * b = ggml_v2_new_tensor_1d(ctx, GGML_V2_TYPE_I32, 2); ((int32_t *) b->data)[0] = n_past; ((int32_t *) b->data)[1] = inplace ? 1 : 0; - ggml_scratch_load(ctx); + ggml_v2_scratch_load(ctx); - result->op = GGML_OP_DIAG_MASK_INF; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_V2_OP_DIAG_MASK_INF; + result->grad = is_node ? ggml_v2_dup_tensor(ctx, result) : NULL; result->src0 = a; result->src1 = b; return result; } -struct ggml_tensor * ggml_diag_mask_inf( - struct ggml_context * ctx, - struct ggml_tensor * a, +struct ggml_v2_tensor * ggml_v2_diag_mask_inf( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, int n_past) { - return ggml_diag_mask_inf_impl(ctx, a, n_past, false); + return ggml_v2_diag_mask_inf_impl(ctx, a, n_past, false); } -struct ggml_tensor * ggml_diag_mask_inf_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a, +struct ggml_v2_tensor * ggml_v2_diag_mask_inf_inplace( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, int n_past) { - return ggml_diag_mask_inf_impl(ctx, a, n_past, true); + return ggml_v2_diag_mask_inf_impl(ctx, a, n_past, true); } -// ggml_diag_mask_zero +// ggml_v2_diag_mask_zero -struct ggml_tensor * ggml_diag_mask_zero_impl( - struct ggml_context * ctx, - struct ggml_tensor * a, +struct ggml_v2_tensor * ggml_v2_diag_mask_zero_impl( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, int n_past, bool inplace) { bool is_node = false; @@ -6096,45 +6096,45 @@ struct ggml_tensor * ggml_diag_mask_zero_impl( is_node = true; } - struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + struct ggml_v2_tensor * result = inplace ? ggml_v2_view_tensor(ctx, a) : ggml_v2_dup_tensor(ctx, a); - ggml_scratch_save(ctx); + ggml_v2_scratch_save(ctx); - struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2); - ggml_set_name(b, "n_past, inplace"); + struct ggml_v2_tensor * b = ggml_v2_new_tensor_1d(ctx, GGML_V2_TYPE_I32, 2); + ggml_v2_set_name(b, "n_past, inplace"); ((int32_t *) b->data)[0] = n_past; ((int32_t *) b->data)[1] = inplace ? 1 : 0; - ggml_scratch_load(ctx); + ggml_v2_scratch_load(ctx); - result->op = GGML_OP_DIAG_MASK_ZERO; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_V2_OP_DIAG_MASK_ZERO; + result->grad = is_node ? 
ggml_v2_dup_tensor(ctx, result) : NULL; result->src0 = a; result->src1 = b; return result; } -struct ggml_tensor * ggml_diag_mask_zero( - struct ggml_context * ctx, - struct ggml_tensor * a, +struct ggml_v2_tensor * ggml_v2_diag_mask_zero( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, int n_past) { - return ggml_diag_mask_zero_impl(ctx, a, n_past, false); + return ggml_v2_diag_mask_zero_impl(ctx, a, n_past, false); } -struct ggml_tensor * ggml_diag_mask_zero_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a, +struct ggml_v2_tensor * ggml_v2_diag_mask_zero_inplace( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, int n_past) { - return ggml_diag_mask_zero_impl(ctx, a, n_past, true); + return ggml_v2_diag_mask_zero_impl(ctx, a, n_past, true); } -// ggml_soft_max +// ggml_v2_soft_max -struct ggml_tensor * ggml_soft_max_impl( - struct ggml_context * ctx, - struct ggml_tensor * a, +struct ggml_v2_tensor * ggml_v2_soft_max_impl( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, bool inplace) { bool is_node = false; @@ -6142,264 +6142,264 @@ struct ggml_tensor * ggml_soft_max_impl( is_node = true; } - struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + struct ggml_v2_tensor * result = inplace ? ggml_v2_view_tensor(ctx, a) : ggml_v2_dup_tensor(ctx, a); - result->op = GGML_OP_SOFT_MAX; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_V2_OP_SOFT_MAX; + result->grad = is_node ? ggml_v2_dup_tensor(ctx, result) : NULL; result->src0 = a; result->src1 = NULL; return result; } -struct ggml_tensor * ggml_soft_max( - struct ggml_context * ctx, - struct ggml_tensor * a) { - return ggml_soft_max_impl(ctx, a, false); +struct ggml_v2_tensor * ggml_v2_soft_max( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a) { + return ggml_v2_soft_max_impl(ctx, a, false); } -struct ggml_tensor * ggml_soft_max_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a) { - return ggml_soft_max_impl(ctx, a, true); +struct ggml_v2_tensor * ggml_v2_soft_max_inplace( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a) { + return ggml_v2_soft_max_impl(ctx, a, true); } -// ggml_rope +// ggml_v2_rope -struct ggml_tensor * ggml_rope_impl( - struct ggml_context * ctx, - struct ggml_tensor * a, +struct ggml_v2_tensor * ggml_v2_rope_impl( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, int n_past, int n_dims, int mode, bool inplace) { - GGML_ASSERT(n_past >= 0); + GGML_V2_ASSERT(n_past >= 0); bool is_node = false; if (!inplace && a->grad) { is_node = true; } - struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + struct ggml_v2_tensor * result = inplace ? ggml_v2_view_tensor(ctx, a) : ggml_v2_dup_tensor(ctx, a); - ggml_scratch_save(ctx); + ggml_v2_scratch_save(ctx); - struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3); + struct ggml_v2_tensor * b = ggml_v2_new_tensor_1d(ctx, GGML_V2_TYPE_I32, 3); ((int32_t *) b->data)[0] = n_past; ((int32_t *) b->data)[1] = n_dims; ((int32_t *) b->data)[2] = mode; - ggml_scratch_load(ctx); + ggml_v2_scratch_load(ctx); - result->op = GGML_OP_ROPE; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_V2_OP_ROPE; + result->grad = is_node ? 
ggml_v2_dup_tensor(ctx, result) : NULL; result->src0 = a; result->src1 = b; return result; } -struct ggml_tensor * ggml_rope( - struct ggml_context * ctx, - struct ggml_tensor * a, +struct ggml_v2_tensor * ggml_v2_rope( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, int n_past, int n_dims, int mode) { - return ggml_rope_impl(ctx, a, n_past, n_dims, mode, false); + return ggml_v2_rope_impl(ctx, a, n_past, n_dims, mode, false); } -struct ggml_tensor * ggml_rope_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a, +struct ggml_v2_tensor * ggml_v2_rope_inplace( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, int n_past, int n_dims, int mode) { - return ggml_rope_impl(ctx, a, n_past, n_dims, mode, true); + return ggml_v2_rope_impl(ctx, a, n_past, n_dims, mode, true); } -// ggml_rope_back +// ggml_v2_rope_back -struct ggml_tensor * ggml_rope_back( - struct ggml_context * ctx, - struct ggml_tensor * a, +struct ggml_v2_tensor * ggml_v2_rope_back( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, int n_past, int n_dims, int mode) { - GGML_ASSERT(n_past >= 0); + GGML_V2_ASSERT(n_past >= 0); bool is_node = false; if (a->grad) { - GGML_ASSERT(false); // TODO: implement backward + GGML_V2_ASSERT(false); // TODO: implement backward is_node = true; } - struct ggml_tensor * result = ggml_dup_tensor(ctx, a); + struct ggml_v2_tensor * result = ggml_v2_dup_tensor(ctx, a); - ggml_scratch_save(ctx); + ggml_v2_scratch_save(ctx); - struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3); - ggml_set_name(b, "n_past, n_dims, mode"); + struct ggml_v2_tensor * b = ggml_v2_new_tensor_1d(ctx, GGML_V2_TYPE_I32, 3); + ggml_v2_set_name(b, "n_past, n_dims, mode"); ((int32_t *) b->data)[0] = n_past; ((int32_t *) b->data)[1] = n_dims; ((int32_t *) b->data)[2] = mode; - ggml_scratch_load(ctx); + ggml_v2_scratch_load(ctx); - result->op = GGML_OP_ROPE_BACK; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_V2_OP_ROPE_BACK; + result->grad = is_node ? ggml_v2_dup_tensor(ctx, result) : NULL; result->src0 = a; result->src1 = b; return result; } -// ggml_alibi +// ggml_v2_alibi -struct ggml_tensor * ggml_alibi( - struct ggml_context * ctx, - struct ggml_tensor * a, +struct ggml_v2_tensor * ggml_v2_alibi( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, int n_past, int n_head) { - GGML_ASSERT(n_past >= 0); + GGML_V2_ASSERT(n_past >= 0); bool is_node = false; if (a->grad) { - GGML_ASSERT(false); // TODO: implement backward + GGML_V2_ASSERT(false); // TODO: implement backward is_node = true; } // TODO: when implement backward, fix this: - //struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - struct ggml_tensor * result = ggml_view_tensor(ctx, a); + //struct ggml_v2_tensor * result = inplace ? ggml_v2_view_tensor(ctx, a) : ggml_v2_dup_tensor(ctx, a); + struct ggml_v2_tensor * result = ggml_v2_view_tensor(ctx, a); - ggml_scratch_save(ctx); + ggml_v2_scratch_save(ctx); - struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2); + struct ggml_v2_tensor * b = ggml_v2_new_tensor_1d(ctx, GGML_V2_TYPE_I32, 2); ((int32_t *) b->data)[0] = n_past; ((int32_t *) b->data)[1] = n_head; - ggml_scratch_load(ctx); + ggml_v2_scratch_load(ctx); - result->op = GGML_OP_ALIBI; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_V2_OP_ALIBI; + result->grad = is_node ? 
ggml_v2_dup_tensor(ctx, result) : NULL; result->src0 = a; result->src1 = b; return result; } -// ggml_conv_1d_1s +// ggml_v2_conv_1d_1s -struct ggml_tensor * ggml_conv_1d_1s( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b) { - GGML_ASSERT(ggml_is_matrix(b)); - GGML_ASSERT(a->ne[1] == b->ne[1]); - GGML_ASSERT(a->ne[3] == 1); +struct ggml_v2_tensor * ggml_v2_conv_1d_1s( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + struct ggml_v2_tensor * b) { + GGML_V2_ASSERT(ggml_v2_is_matrix(b)); + GGML_V2_ASSERT(a->ne[1] == b->ne[1]); + GGML_V2_ASSERT(a->ne[3] == 1); bool is_node = false; if (a->grad || b->grad) { - GGML_ASSERT(false); // TODO: implement backward + GGML_V2_ASSERT(false); // TODO: implement backward is_node = true; } const int64_t ne[4] = { b->ne[0], a->ne[2], 1, 1, }; - struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne); + struct ggml_v2_tensor * result = ggml_v2_new_tensor(ctx, GGML_V2_TYPE_F32, 2, ne); - result->op = GGML_OP_CONV_1D_1S; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_V2_OP_CONV_1D_1S; + result->grad = is_node ? ggml_v2_dup_tensor(ctx, result) : NULL; result->src0 = a; result->src1 = b; return result; } -// ggml_conv_1d_2s +// ggml_v2_conv_1d_2s -struct ggml_tensor * ggml_conv_1d_2s( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b) { - GGML_ASSERT(ggml_is_matrix(b)); - GGML_ASSERT(a->ne[1] == b->ne[1]); - GGML_ASSERT(a->ne[3] == 1); +struct ggml_v2_tensor * ggml_v2_conv_1d_2s( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + struct ggml_v2_tensor * b) { + GGML_V2_ASSERT(ggml_v2_is_matrix(b)); + GGML_V2_ASSERT(a->ne[1] == b->ne[1]); + GGML_V2_ASSERT(a->ne[3] == 1); bool is_node = false; if (a->grad || b->grad) { - GGML_ASSERT(false); // TODO: implement backward + GGML_V2_ASSERT(false); // TODO: implement backward is_node = true; } const int64_t ne[4] = { b->ne[0]/2, a->ne[2], 1, 1, }; - struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne); + struct ggml_v2_tensor * result = ggml_v2_new_tensor(ctx, GGML_V2_TYPE_F32, 2, ne); - result->op = GGML_OP_CONV_1D_2S; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_V2_OP_CONV_1D_2S; + result->grad = is_node ? ggml_v2_dup_tensor(ctx, result) : NULL; result->src0 = a; result->src1 = b; return result; } -// ggml_flash_attn +// ggml_v2_flash_attn -struct ggml_tensor * ggml_flash_attn( - struct ggml_context * ctx, - struct ggml_tensor * q, - struct ggml_tensor * k, - struct ggml_tensor * v, +struct ggml_v2_tensor * ggml_v2_flash_attn( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * q, + struct ggml_v2_tensor * k, + struct ggml_v2_tensor * v, bool masked) { - GGML_ASSERT(ggml_can_mul_mat(k, q)); + GGML_V2_ASSERT(ggml_v2_can_mul_mat(k, q)); // TODO: check if vT can be multiplied by (k*qT) bool is_node = false; if (q->grad || k->grad || v->grad) { - GGML_ASSERT(false); // TODO: implement backward + GGML_V2_ASSERT(false); // TODO: implement backward is_node = true; } - //struct ggml_tensor * result = ggml_dup_tensor(ctx, q); - struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, q->ne); + //struct ggml_v2_tensor * result = ggml_v2_dup_tensor(ctx, q); + struct ggml_v2_tensor * result = ggml_v2_new_tensor(ctx, GGML_V2_TYPE_F32, 4, q->ne); - result->op = GGML_OP_FLASH_ATTN; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_V2_OP_FLASH_ATTN; + result->grad = is_node ? 
ggml_v2_dup_tensor(ctx, result) : NULL; result->src0 = q; result->src1 = k; result->opt[0] = v; - result->opt[1] = ggml_new_i32(ctx, masked ? 1 : 0); + result->opt[1] = ggml_v2_new_i32(ctx, masked ? 1 : 0); return result; } -// ggml_flash_ff +// ggml_v2_flash_ff -struct ggml_tensor * ggml_flash_ff( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b0, - struct ggml_tensor * b1, - struct ggml_tensor * c0, - struct ggml_tensor * c1) { - GGML_ASSERT(ggml_can_mul_mat(b0, a)); +struct ggml_v2_tensor * ggml_v2_flash_ff( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + struct ggml_v2_tensor * b0, + struct ggml_v2_tensor * b1, + struct ggml_v2_tensor * c0, + struct ggml_v2_tensor * c1) { + GGML_V2_ASSERT(ggml_v2_can_mul_mat(b0, a)); // TODO: more checks bool is_node = false; if (a->grad || b0->grad || b1->grad || c0->grad || c1->grad) { - GGML_ASSERT(false); // TODO: implement backward + GGML_V2_ASSERT(false); // TODO: implement backward is_node = true; } - //struct ggml_tensor * result = ggml_dup_tensor(ctx, a); - struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, a->ne); + //struct ggml_v2_tensor * result = ggml_v2_dup_tensor(ctx, a); + struct ggml_v2_tensor * result = ggml_v2_new_tensor(ctx, GGML_V2_TYPE_F32, 4, a->ne); - result->op = GGML_OP_FLASH_FF; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_V2_OP_FLASH_FF; + result->grad = is_node ? ggml_v2_dup_tensor(ctx, result) : NULL; result->src0 = a; result->src1 = b0; result->opt[0] = b1; @@ -6409,12 +6409,12 @@ struct ggml_tensor * ggml_flash_ff( return result; } -// ggml_map_unary +// ggml_v2_map_unary -struct ggml_tensor * ggml_map_unary_impl_f32( - struct ggml_context * ctx, - struct ggml_tensor * a, - const ggml_unary_op_f32_t fun, +struct ggml_v2_tensor * ggml_v2_map_unary_impl_f32( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + const ggml_v2_unary_op_f32_t fun, bool inplace) { bool is_node = false; @@ -6422,41 +6422,41 @@ struct ggml_tensor * ggml_map_unary_impl_f32( is_node = true; } - struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t)); + struct ggml_v2_tensor * addr_tensor = ggml_v2_new_tensor_1d(ctx, GGML_V2_TYPE_I32, sizeof(void *) / sizeof(int32_t)); *((void (**)(void))addr_tensor->data) = (void (*)(void))fun; - struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + struct ggml_v2_tensor *result = inplace ? ggml_v2_view_tensor(ctx, a) : ggml_v2_dup_tensor(ctx, a); - result->op = GGML_OP_MAP_UNARY; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_V2_OP_MAP_UNARY; + result->grad = is_node ? 
ggml_v2_dup_tensor(ctx, result) : NULL; result->src0 = a; result->opt[0] = addr_tensor; return result; } -struct ggml_tensor * ggml_map_unary_f32( - struct ggml_context * ctx, - struct ggml_tensor * a, - const ggml_unary_op_f32_t fun) { - return ggml_map_unary_impl_f32(ctx, a, fun, false); +struct ggml_v2_tensor * ggml_v2_map_unary_f32( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + const ggml_v2_unary_op_f32_t fun) { + return ggml_v2_map_unary_impl_f32(ctx, a, fun, false); } -struct ggml_tensor * ggml_map_unary_inplace_f32( - struct ggml_context * ctx, - struct ggml_tensor * a, - const ggml_unary_op_f32_t fun) { - return ggml_map_unary_impl_f32(ctx, a, fun, true); +struct ggml_v2_tensor * ggml_v2_map_unary_inplace_f32( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + const ggml_v2_unary_op_f32_t fun) { + return ggml_v2_map_unary_impl_f32(ctx, a, fun, true); } -// ggml_map_binary +// ggml_v2_map_binary -struct ggml_tensor * ggml_map_binary_impl_f32( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - const ggml_binary_op_f32_t fun, +struct ggml_v2_tensor * ggml_v2_map_binary_impl_f32( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + struct ggml_v2_tensor * b, + const ggml_v2_binary_op_f32_t fun, bool inplace) { - GGML_ASSERT(ggml_are_same_shape(a, b)); + GGML_V2_ASSERT(ggml_v2_are_same_shape(a, b)); bool is_node = false; @@ -6464,12 +6464,12 @@ struct ggml_tensor * ggml_map_binary_impl_f32( is_node = true; } - struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t)); + struct ggml_v2_tensor * addr_tensor = ggml_v2_new_tensor_1d(ctx, GGML_V2_TYPE_I32, sizeof(void *) / sizeof(int32_t)); *((void (**)(void))addr_tensor->data) = (void (*)(void))fun; - struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + struct ggml_v2_tensor *result = inplace ? ggml_v2_view_tensor(ctx, a) : ggml_v2_dup_tensor(ctx, a); - result->op = GGML_OP_MAP_BINARY; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->op = GGML_V2_OP_MAP_BINARY; + result->grad = is_node ? 
ggml_v2_dup_tensor(ctx, result) : NULL; result->src0 = a; result->src1 = b; result->opt[0] = addr_tensor; @@ -6477,44 +6477,44 @@ struct ggml_tensor * ggml_map_binary_impl_f32( return result; } -struct ggml_tensor * ggml_map_binary_f32( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - const ggml_binary_op_f32_t fun) { - return ggml_map_binary_impl_f32(ctx, a, b, fun, false); +struct ggml_v2_tensor * ggml_v2_map_binary_f32( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + struct ggml_v2_tensor * b, + const ggml_v2_binary_op_f32_t fun) { + return ggml_v2_map_binary_impl_f32(ctx, a, b, fun, false); } -struct ggml_tensor * ggml_map_binary_inplace_f32( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - const ggml_binary_op_f32_t fun) { - return ggml_map_binary_impl_f32(ctx, a, b, fun, true); +struct ggml_v2_tensor * ggml_v2_map_binary_inplace_f32( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + struct ggml_v2_tensor * b, + const ggml_v2_binary_op_f32_t fun) { + return ggml_v2_map_binary_impl_f32(ctx, a, b, fun, true); } //////////////////////////////////////////////////////////////////////////////// -void ggml_set_param( - struct ggml_context * ctx, - struct ggml_tensor * tensor) { +void ggml_v2_set_param( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * tensor) { tensor->is_param = true; - GGML_ASSERT(tensor->grad == NULL); - tensor->grad = ggml_dup_tensor(ctx, tensor); + GGML_V2_ASSERT(tensor->grad == NULL); + tensor->grad = ggml_v2_dup_tensor(ctx, tensor); } -// ggml_compute_forward_dup +// ggml_v2_compute_forward_dup -static void ggml_compute_forward_dup_same_cont( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - struct ggml_tensor * dst) { - GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); - GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0)); - GGML_ASSERT(src0->type == dst->type); +static void ggml_v2_compute_forward_dup_same_cont( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + struct ggml_v2_tensor * dst) { + GGML_V2_ASSERT(ggml_v2_nelements(dst) == ggml_v2_nelements(src0)); + GGML_V2_ASSERT(ggml_v2_is_contiguous(dst) && ggml_v2_is_contiguous(src0)); + GGML_V2_ASSERT(src0->type == dst->type); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_V2_TASK_INIT || params->type == GGML_V2_TASK_FINALIZE) { return; } @@ -6525,7 +6525,7 @@ static void ggml_compute_forward_dup_same_cont( const int nth = params->nth; // number of threads // parallelize by elements - const int ne = ggml_nelements(dst); + const int ne = ggml_v2_nelements(dst); const int dr = (ne + nth - 1) / nth; const int ie0 = dr * ith; const int ie1 = MIN(ie0 + dr, ne); @@ -6534,17 +6534,17 @@ static void ggml_compute_forward_dup_same_cont( memcpy( ((char *) dst->data + ie0*nb0), ((char *) src0->data + ie0*nb00), - (ie1 - ie0) * GGML_TYPE_SIZE[src0->type]); + (ie1 - ie0) * GGML_V2_TYPE_SIZE[src0->type]); } } -static void ggml_compute_forward_dup_f16( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - struct ggml_tensor * dst) { - GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); +static void ggml_v2_compute_forward_dup_f16( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + struct ggml_v2_tensor * dst) { + GGML_V2_ASSERT(ggml_v2_nelements(dst) == ggml_v2_nelements(src0)); - if (params->type == 
GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_V2_TASK_INIT || params->type == GGML_V2_TASK_FINALIZE) { return; } @@ -6571,8 +6571,8 @@ static void ggml_compute_forward_dup_f16( const int ith = params->ith; // thread index const int nth = params->nth; // number of threads - if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) { - ggml_compute_forward_dup_same_cont(params, src0, dst); + if (ggml_v2_is_contiguous(src0) && ggml_v2_is_contiguous(dst) && src0->type == dst->type) { + ggml_v2_compute_forward_dup_same_cont(params, src0, dst); return; } @@ -6586,7 +6586,7 @@ static void ggml_compute_forward_dup_f16( if (src0->type == dst->type && ne00 == ne0 && - nb00 == GGML_TYPE_SIZE[src0->type] && nb0 == GGML_TYPE_SIZE[dst->type]) { + nb00 == GGML_V2_TYPE_SIZE[src0->type] && nb0 == GGML_V2_TYPE_SIZE[dst->type]) { // copy by rows const size_t rs = ne00*nb00; for (int64_t i03 = 0; i03 < ne03; i03++) { @@ -6604,9 +6604,9 @@ static void ggml_compute_forward_dup_f16( // TODO: add more special-case implementations for tensor shapes/strides that can benefit from memcpy - if (ggml_is_contiguous(dst)) { - if (nb00 == sizeof(ggml_fp16_t)) { - if (dst->type == GGML_TYPE_F16) { + if (ggml_v2_is_contiguous(dst)) { + if (nb00 == sizeof(ggml_v2_fp16_t)) { + if (dst->type == GGML_V2_TYPE_F16) { size_t id = 0; const size_t rs = ne00 * nb00; char * dst_ptr = (char *) dst->data; @@ -6622,7 +6622,7 @@ static void ggml_compute_forward_dup_f16( id += rs * (ne01 - ir1); } } - } else if (dst->type == GGML_TYPE_F32) { + } else if (dst->type == GGML_V2_TYPE_F32) { size_t id = 0; float * dst_ptr = (float *) dst->data; @@ -6630,31 +6630,31 @@ static void ggml_compute_forward_dup_f16( for (int i02 = 0; i02 < ne02; i02++) { id += ne00 * ir0; for (int i01 = ir0; i01 < ir1; i01++) { - const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); + const ggml_v2_fp16_t * src0_ptr = (ggml_v2_fp16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); for (int i00 = 0; i00 < ne00; i00++) { - dst_ptr[id] = GGML_FP16_TO_FP32(src0_ptr[i00]); + dst_ptr[id] = GGML_V2_FP16_TO_FP32(src0_ptr[i00]); id++; } } id += ne00 * (ne01 - ir1); } } - } else if (ggml_is_quantized(dst->type)) { + } else if (ggml_v2_is_quantized(dst->type)) { quantize_row_q_t const quantize_row_q = get_quantize_fn(dst->type).quantize_row_q; float * src0_f32 = (float *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith; size_t id = 0; - size_t rs = nb0 * (ne00 / GGML_BLCK_SIZE[dst->type]); + size_t rs = nb0 * (ne00 / GGML_V2_BLCK_SIZE[dst->type]); char * dst_ptr = (char *) dst->data; for (int i03 = 0; i03 < ne03; i03++) { for (int i02 = 0; i02 < ne02; i02++) { id += rs * ir0; for (int i01 = ir0; i01 < ir1; i01++) { - const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); + const ggml_v2_fp16_t * src0_ptr = (ggml_v2_fp16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); for (int i00 = 0; i00 < ne00; i00++) { - src0_f32[i00] = GGML_FP16_TO_FP32(src0_ptr[i00]); + src0_f32[i00] = GGML_V2_FP16_TO_FP32(src0_ptr[i00]); } quantize_row_q(src0_f32, dst_ptr + id, ne00); @@ -6664,12 +6664,12 @@ static void ggml_compute_forward_dup_f16( } } } else { - GGML_ASSERT(false); // TODO: implement + GGML_V2_ASSERT(false); // TODO: implement } } else { //printf("%s: this is not optimal - fix me\n", __func__); - if (dst->type == GGML_TYPE_F32) { + if (dst->type == GGML_V2_TYPE_F32) { size_t id = 0; float * dst_ptr = 
(float *) dst->data; @@ -6678,25 +6678,25 @@ static void ggml_compute_forward_dup_f16( id += ne00 * ir0; for (int i01 = ir0; i01 < ir1; i01++) { for (int i00 = 0; i00 < ne00; i00++) { - const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + const ggml_v2_fp16_t * src0_ptr = (ggml_v2_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); - dst_ptr[id] = GGML_FP16_TO_FP32(*src0_ptr); + dst_ptr[id] = GGML_V2_FP16_TO_FP32(*src0_ptr); id++; } } id += ne00 * (ne01 - ir1); } } - } else if (dst->type == GGML_TYPE_F16) { + } else if (dst->type == GGML_V2_TYPE_F16) { size_t id = 0; - ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data; + ggml_v2_fp16_t * dst_ptr = (ggml_v2_fp16_t *) dst->data; for (int i03 = 0; i03 < ne03; i03++) { for (int i02 = 0; i02 < ne02; i02++) { id += ne00 * ir0; for (int i01 = ir0; i01 < ir1; i01++) { for (int i00 = 0; i00 < ne00; i00++) { - const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + const ggml_v2_fp16_t * src0_ptr = (ggml_v2_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); dst_ptr[id] = *src0_ptr; id++; @@ -6706,7 +6706,7 @@ static void ggml_compute_forward_dup_f16( } } } else { - GGML_ASSERT(false); // TODO: implement + GGML_V2_ASSERT(false); // TODO: implement } } return; @@ -6718,7 +6718,7 @@ static void ggml_compute_forward_dup_f16( int64_t i12 = 0; int64_t i13 = 0; - if (dst->type == GGML_TYPE_F16) { + if (dst->type == GGML_V2_TYPE_F16) { for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { i10 += ne00 * ir0; @@ -6739,7 +6739,7 @@ static void ggml_compute_forward_dup_f16( const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); - memcpy(dst_ptr, src0_ptr, sizeof(ggml_fp16_t)); + memcpy(dst_ptr, src0_ptr, sizeof(ggml_v2_fp16_t)); if (++i10 == ne00) { i10 = 0; @@ -6770,7 +6770,7 @@ static void ggml_compute_forward_dup_f16( } } } - } else if (dst->type == GGML_TYPE_F32) { + } else if (dst->type == GGML_V2_TYPE_F32) { for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { i10 += ne00 * ir0; @@ -6791,7 +6791,7 @@ static void ggml_compute_forward_dup_f16( const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); - *(float *) dst_ptr = GGML_FP16_TO_FP32(*(const ggml_fp16_t *) src0_ptr); + *(float *) dst_ptr = GGML_V2_FP16_TO_FP32(*(const ggml_v2_fp16_t *) src0_ptr); if (++i10 == ne0) { i10 = 0; @@ -6823,17 +6823,17 @@ static void ggml_compute_forward_dup_f16( } } } else { - GGML_ASSERT(false); // TODO: implement + GGML_V2_ASSERT(false); // TODO: implement } } -static void ggml_compute_forward_dup_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - struct ggml_tensor * dst) { - GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); +static void ggml_v2_compute_forward_dup_f32( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + struct ggml_v2_tensor * dst) { + GGML_V2_ASSERT(ggml_v2_nelements(dst) == ggml_v2_nelements(src0)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_V2_TASK_INIT || params->type == GGML_V2_TASK_FINALIZE) { return; } @@ -6860,8 +6860,8 @@ static void 
ggml_compute_forward_dup_f32( const int ith = params->ith; // thread index const int nth = params->nth; // number of threads - if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) { - ggml_compute_forward_dup_same_cont(params, src0, dst); + if (ggml_v2_is_contiguous(src0) && ggml_v2_is_contiguous(dst) && src0->type == dst->type) { + ggml_v2_compute_forward_dup_same_cont(params, src0, dst); return; } @@ -6875,7 +6875,7 @@ static void ggml_compute_forward_dup_f32( if (src0->type == dst->type && ne00 == ne0 && - nb00 == GGML_TYPE_SIZE[src0->type] && nb0 == GGML_TYPE_SIZE[dst->type]) { + nb00 == GGML_V2_TYPE_SIZE[src0->type] && nb0 == GGML_V2_TYPE_SIZE[dst->type]) { // copy by rows const size_t rs = ne00*nb00; for (int64_t i03 = 0; i03 < ne03; i03++) { @@ -6891,10 +6891,10 @@ static void ggml_compute_forward_dup_f32( return; } - if (ggml_is_contiguous(dst)) { + if (ggml_v2_is_contiguous(dst)) { // TODO: simplify if (nb00 == sizeof(float)) { - if (dst->type == GGML_TYPE_F32) { + if (dst->type == GGML_V2_TYPE_F32) { size_t id = 0; const size_t rs = ne00 * nb00; char * dst_ptr = (char *) dst->data; @@ -6910,9 +6910,9 @@ static void ggml_compute_forward_dup_f32( id += rs * (ne01 - ir1); } } - } else if (dst->type == GGML_TYPE_F16) { + } else if (dst->type == GGML_V2_TYPE_F16) { size_t id = 0; - ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data; + ggml_v2_fp16_t * dst_ptr = (ggml_v2_fp16_t *) dst->data; for (int i03 = 0; i03 < ne03; i03++) { for (int i02 = 0; i02 < ne02; i02++) { @@ -6921,18 +6921,18 @@ static void ggml_compute_forward_dup_f32( for (int i00 = 0; i00 < ne00; i00++) { const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); - dst_ptr[id] = GGML_FP32_TO_FP16(*src0_ptr); + dst_ptr[id] = GGML_V2_FP32_TO_FP16(*src0_ptr); id++; } } id += ne00 * (ne01 - ir1); } } - } else if (ggml_is_quantized(dst->type)) { + } else if (ggml_v2_is_quantized(dst->type)) { quantize_row_q_t const quantize_row_q = get_quantize_fn(dst->type).quantize_row_q; size_t id = 0; - size_t rs = nb0 * (ne00 / GGML_BLCK_SIZE[dst->type]); + size_t rs = nb0 * (ne00 / GGML_V2_BLCK_SIZE[dst->type]); char * dst_ptr = (char *) dst->data; for (int i03 = 0; i03 < ne03; i03++) { @@ -6947,12 +6947,12 @@ static void ggml_compute_forward_dup_f32( } } } else { - GGML_ASSERT(false); // TODO: implement + GGML_V2_ASSERT(false); // TODO: implement } } else { //printf("%s: this is not optimal - fix me\n", __func__); - if (dst->type == GGML_TYPE_F32) { + if (dst->type == GGML_V2_TYPE_F32) { size_t id = 0; float * dst_ptr = (float *) dst->data; @@ -6970,9 +6970,9 @@ static void ggml_compute_forward_dup_f32( id += ne00 * (ne01 - ir1); } } - } else if (dst->type == GGML_TYPE_F16) { + } else if (dst->type == GGML_V2_TYPE_F16) { size_t id = 0; - ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data; + ggml_v2_fp16_t * dst_ptr = (ggml_v2_fp16_t *) dst->data; for (int i03 = 0; i03 < ne03; i03++) { for (int i02 = 0; i02 < ne02; i02++) { @@ -6981,7 +6981,7 @@ static void ggml_compute_forward_dup_f32( for (int i00 = 0; i00 < ne00; i00++) { const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); - dst_ptr[id] = GGML_FP32_TO_FP16(*src0_ptr); + dst_ptr[id] = GGML_V2_FP32_TO_FP16(*src0_ptr); id++; } } @@ -6989,7 +6989,7 @@ static void ggml_compute_forward_dup_f32( } } } else { - GGML_ASSERT(false); // TODO: implement + GGML_V2_ASSERT(false); // TODO: implement } } @@ -7003,7 +7003,7 @@ static void ggml_compute_forward_dup_f32( int64_t i12 
= 0; int64_t i13 = 0; - if (dst->type == GGML_TYPE_F32) { + if (dst->type == GGML_V2_TYPE_F32) { for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { i10 += ne00 * ir0; @@ -7055,7 +7055,7 @@ static void ggml_compute_forward_dup_f32( } } } - } else if (dst->type == GGML_TYPE_F16) { + } else if (dst->type == GGML_V2_TYPE_F16) { for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { i10 += ne00 * ir0; @@ -7076,7 +7076,7 @@ static void ggml_compute_forward_dup_f32( const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); - *(ggml_fp16_t *) dst_ptr = GGML_FP32_TO_FP16(*(const float *) src0_ptr); + *(ggml_v2_fp16_t *) dst_ptr = GGML_V2_FP32_TO_FP16(*(const float *) src0_ptr); if (++i10 == ne0) { i10 = 0; @@ -7108,51 +7108,51 @@ static void ggml_compute_forward_dup_f32( } } } else { - GGML_ASSERT(false); // TODO: implement + GGML_V2_ASSERT(false); // TODO: implement } } -static void ggml_compute_forward_dup( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - struct ggml_tensor * dst) { - if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) { - ggml_compute_forward_dup_same_cont(params, src0, dst); +static void ggml_v2_compute_forward_dup( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + struct ggml_v2_tensor * dst) { + if (ggml_v2_is_contiguous(src0) && ggml_v2_is_contiguous(dst) && src0->type == dst->type) { + ggml_v2_compute_forward_dup_same_cont(params, src0, dst); return; } switch (src0->type) { - case GGML_TYPE_F16: + case GGML_V2_TYPE_F16: { - ggml_compute_forward_dup_f16(params, src0, dst); + ggml_v2_compute_forward_dup_f16(params, src0, dst); } break; - case GGML_TYPE_F32: + case GGML_V2_TYPE_F32: { - ggml_compute_forward_dup_f32(params, src0, dst); + ggml_v2_compute_forward_dup_f32(params, src0, dst); } break; default: { - GGML_ASSERT(false); + GGML_V2_ASSERT(false); } break; } } -// ggml_compute_forward_add +// ggml_v2_compute_forward_add -static void ggml_compute_forward_add_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); +static void ggml_v2_compute_forward_add_f32( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + const struct ggml_v2_tensor * src1, + struct ggml_v2_tensor * dst) { + GGML_V2_ASSERT(ggml_v2_are_same_shape(src0, src1) && ggml_v2_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_V2_TASK_INIT || params->type == GGML_V2_TASK_FINALIZE) { return; } const int ith = params->ith; const int nth = params->nth; - const int nr = ggml_nrows(src0); + const int nr = ggml_v2_nrows(src0); const int64_t ne0 = src0->ne[0]; const int64_t ne1 = src0->ne[1]; const int64_t ne2 = src0->ne[2]; @@ -7172,8 +7172,8 @@ static void ggml_compute_forward_add_f32( const size_t nb2 = dst->nb[2]; const size_t nb3 = dst->nb[3]; - GGML_ASSERT( nb0 == sizeof(float)); - GGML_ASSERT(nb00 == sizeof(float)); + GGML_V2_ASSERT( nb0 == sizeof(float)); + GGML_V2_ASSERT(nb00 == sizeof(float)); // rows per thread const int dr = (nr + nth - 1)/nth; @@ -7197,7 +7197,7 @@ static void ggml_compute_forward_add_f32( (float *) ((char *) dst->data + 
i3*nb3 + i2*nb2 + i1*nb1 ), 1, ne0); #else - ggml_vec_add_f32(ne0, + ggml_v2_vec_add_f32(ne0, (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ), (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11)); @@ -7224,21 +7224,21 @@ static void ggml_compute_forward_add_f32( } } -static void ggml_compute_forward_add_f16_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); +static void ggml_v2_compute_forward_add_f16_f32( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + const struct ggml_v2_tensor * src1, + struct ggml_v2_tensor * dst) { + GGML_V2_ASSERT(ggml_v2_are_same_shape(src0, src1) && ggml_v2_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_V2_TASK_INIT || params->type == GGML_V2_TASK_FINALIZE) { return; } const int ith = params->ith; const int nth = params->nth; - const int nr = ggml_nrows(src0); + const int nr = ggml_v2_nrows(src0); const int64_t ne0 = src0->ne[0]; const int64_t ne1 = src0->ne[1]; const int64_t ne2 = src0->ne[2]; @@ -7258,12 +7258,12 @@ static void ggml_compute_forward_add_f16_f32( const size_t nb2 = dst->nb[2]; const size_t nb3 = dst->nb[3]; - GGML_ASSERT(src0->type == GGML_TYPE_F16); - GGML_ASSERT(src1->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_F16); + GGML_V2_ASSERT(src0->type == GGML_V2_TYPE_F16); + GGML_V2_ASSERT(src1->type == GGML_V2_TYPE_F32); + GGML_V2_ASSERT(dst->type == GGML_V2_TYPE_F16); - GGML_ASSERT( nb0 == sizeof(ggml_fp16_t)); - GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); + GGML_V2_ASSERT( nb0 == sizeof(ggml_v2_fp16_t)); + GGML_V2_ASSERT(nb00 == sizeof(ggml_v2_fp16_t)); // rows per thread const int dr = (nr + nth - 1)/nth; @@ -7279,36 +7279,36 @@ static void ggml_compute_forward_add_f16_f32( const int i2 = (ir - i3*ne2*ne1)/ne1; const int i1 = (ir - i3*ne2*ne1 - i2*ne1); - ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1); - ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); + ggml_v2_fp16_t * dst_ptr = (ggml_v2_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1); + ggml_v2_fp16_t * src0_ptr = (ggml_v2_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11); for (int i = 0; i < ne0; i++) { - dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + src1_ptr[i]); + dst_ptr[i] = GGML_V2_FP32_TO_FP16(GGML_V2_FP16_TO_FP32(src0_ptr[i]) + src1_ptr[i]); } } } else { // src1 is not contiguous - GGML_ASSERT(false); + GGML_V2_ASSERT(false); } } -static void ggml_compute_forward_add_f16_f16( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); +static void ggml_v2_compute_forward_add_f16_f16( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + const struct ggml_v2_tensor * src1, + struct ggml_v2_tensor * dst) { + GGML_V2_ASSERT(ggml_v2_are_same_shape(src0, src1) && ggml_v2_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type 
== GGML_V2_TASK_INIT || params->type == GGML_V2_TASK_FINALIZE) { return; } const int ith = params->ith; const int nth = params->nth; - const int nr = ggml_nrows(src0); + const int nr = ggml_v2_nrows(src0); const int64_t ne0 = src0->ne[0]; const int64_t ne1 = src0->ne[1]; const int64_t ne2 = src0->ne[2]; @@ -7328,12 +7328,12 @@ static void ggml_compute_forward_add_f16_f16( const size_t nb2 = dst->nb[2]; const size_t nb3 = dst->nb[3]; - GGML_ASSERT(src0->type == GGML_TYPE_F16); - GGML_ASSERT(src1->type == GGML_TYPE_F16); - GGML_ASSERT(dst->type == GGML_TYPE_F16); + GGML_V2_ASSERT(src0->type == GGML_V2_TYPE_F16); + GGML_V2_ASSERT(src1->type == GGML_V2_TYPE_F16); + GGML_V2_ASSERT(dst->type == GGML_V2_TYPE_F16); - GGML_ASSERT( nb0 == sizeof(ggml_fp16_t)); - GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); + GGML_V2_ASSERT( nb0 == sizeof(ggml_v2_fp16_t)); + GGML_V2_ASSERT(nb00 == sizeof(ggml_v2_fp16_t)); // rows per thread const int dr = (nr + nth - 1)/nth; @@ -7342,40 +7342,40 @@ static void ggml_compute_forward_add_f16_f16( const int ir0 = dr*ith; const int ir1 = MIN(ir0 + dr, nr); - if (nb10 == sizeof(ggml_fp16_t)) { + if (nb10 == sizeof(ggml_v2_fp16_t)) { for (int ir = ir0; ir < ir1; ++ir) { // src0, src1 and dst are same shape => same indices const int i3 = ir/(ne2*ne1); const int i2 = (ir - i3*ne2*ne1)/ne1; const int i1 = (ir - i3*ne2*ne1 - i2*ne1); - ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1); - ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); - ggml_fp16_t * src1_ptr = (ggml_fp16_t *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11); + ggml_v2_fp16_t * dst_ptr = (ggml_v2_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1); + ggml_v2_fp16_t * src0_ptr = (ggml_v2_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); + ggml_v2_fp16_t * src1_ptr = (ggml_v2_fp16_t *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11); for (int i = 0; i < ne0; i++) { - dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + GGML_FP16_TO_FP32(src1_ptr[i])); + dst_ptr[i] = GGML_V2_FP32_TO_FP16(GGML_V2_FP16_TO_FP32(src0_ptr[i]) + GGML_V2_FP16_TO_FP32(src1_ptr[i])); } } } else { // src1 is not contiguous - GGML_ASSERT(false); + GGML_V2_ASSERT(false); } } -static void ggml_compute_forward_add_q_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); +static void ggml_v2_compute_forward_add_q_f32( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + const struct ggml_v2_tensor * src1, + struct ggml_v2_tensor * dst) { + GGML_V2_ASSERT(ggml_v2_are_same_shape(src0, src1) && ggml_v2_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_V2_TASK_INIT || params->type == GGML_V2_TASK_FINALIZE) { return; } - const int nr = ggml_nrows(src0); + const int nr = ggml_v2_nrows(src0); const int64_t ne00 = src0->ne[0]; const int64_t ne01 = src0->ne[1]; const int64_t ne02 = src0->ne[2]; @@ -7399,22 +7399,22 @@ static void ggml_compute_forward_add_q_f32( const int ith = params->ith; const int nth = params->nth; - const enum ggml_type type = src0->type; + const enum ggml_v2_type type = src0->type; dequantize_row_q_t const dequantize_row_q = get_quantize_fn(type).dequantize_row_q; quantize_row_q_t const quantize_row_q = 
get_quantize_fn(type).quantize_row_q; // we don't support permuted src0 or src1 - GGML_ASSERT(nb00 == GGML_TYPE_SIZE[type]); - GGML_ASSERT(nb10 == sizeof(float)); + GGML_V2_ASSERT(nb00 == GGML_V2_TYPE_SIZE[type]); + GGML_V2_ASSERT(nb10 == sizeof(float)); // dst cannot be transposed or permuted - GGML_ASSERT(nb0 <= nb1); - GGML_ASSERT(nb1 <= nb2); - GGML_ASSERT(nb2 <= nb3); + GGML_V2_ASSERT(nb0 <= nb1); + GGML_V2_ASSERT(nb1 <= nb2); + GGML_V2_ASSERT(nb2 <= nb3); - GGML_ASSERT(ggml_is_quantized(src0->type)); - GGML_ASSERT(dst->type == src0->type); - GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_V2_ASSERT(ggml_v2_is_quantized(src0->type)); + GGML_V2_ASSERT(dst->type == src0->type); + GGML_V2_ASSERT(src1->type == GGML_V2_TYPE_F32); // rows per thread const int dr = (nr + nth - 1)/nth; @@ -7449,69 +7449,69 @@ static void ggml_compute_forward_add_q_f32( // unquantize row from src0 to temp buffer dequantize_row_q(src0_row, wdata, ne00); // add src1 - ggml_vec_acc_f32(ne00, wdata, src1_row); + ggml_v2_vec_acc_f32(ne00, wdata, src1_row); // quantize row to dst quantize_row_q(wdata, dst_row, ne00); } } -static void ggml_compute_forward_add( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { +static void ggml_v2_compute_forward_add( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + const struct ggml_v2_tensor * src1, + struct ggml_v2_tensor * dst) { switch (src0->type) { - case GGML_TYPE_F32: + case GGML_V2_TYPE_F32: { - ggml_compute_forward_add_f32(params, src0, src1, dst); + ggml_v2_compute_forward_add_f32(params, src0, src1, dst); } break; - case GGML_TYPE_F16: + case GGML_V2_TYPE_F16: { - if (src1->type == GGML_TYPE_F16) { - ggml_compute_forward_add_f16_f16(params, src0, src1, dst); + if (src1->type == GGML_V2_TYPE_F16) { + ggml_v2_compute_forward_add_f16_f16(params, src0, src1, dst); } - else if (src1->type == GGML_TYPE_F32) { - ggml_compute_forward_add_f16_f32(params, src0, src1, dst); + else if (src1->type == GGML_V2_TYPE_F32) { + ggml_v2_compute_forward_add_f16_f32(params, src0, src1, dst); } else { - GGML_ASSERT(false); + GGML_V2_ASSERT(false); } } break; - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q4_2: - case GGML_TYPE_Q4_3: - case GGML_TYPE_Q5_0: - case GGML_TYPE_Q5_1: - case GGML_TYPE_Q8_0: + case GGML_V2_TYPE_Q4_0: + case GGML_V2_TYPE_Q4_1: + case GGML_V2_TYPE_Q4_2: + case GGML_V2_TYPE_Q4_3: + case GGML_V2_TYPE_Q5_0: + case GGML_V2_TYPE_Q5_1: + case GGML_V2_TYPE_Q8_0: { - ggml_compute_forward_add_q_f32(params, src0, src1, dst); + ggml_v2_compute_forward_add_q_f32(params, src0, src1, dst); } break; default: { - GGML_ASSERT(false); + GGML_V2_ASSERT(false); } break; } } -// ggml_compute_forward_add1 +// ggml_v2_compute_forward_add1 -static void ggml_compute_forward_add1_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - GGML_ASSERT(ggml_are_same_shape(src0, dst)); - GGML_ASSERT(ggml_is_scalar(src1)); +static void ggml_v2_compute_forward_add1_f32( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + const struct ggml_v2_tensor * src1, + struct ggml_v2_tensor * dst) { + GGML_V2_ASSERT(ggml_v2_are_same_shape(src0, dst)); + GGML_V2_ASSERT(ggml_v2_is_scalar(src1)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_V2_TASK_INIT || params->type == 
GGML_V2_TASK_FINALIZE) { return; } const int ith = params->ith; const int nth = params->nth; - const int nr = ggml_nrows(src0); + const int nr = ggml_v2_nrows(src0); const int64_t ne0 = src0->ne[0]; const int64_t ne1 = src0->ne[1]; const int64_t ne2 = src0->ne[2]; @@ -7526,8 +7526,8 @@ static void ggml_compute_forward_add1_f32( const size_t nb2 = dst->nb[2]; const size_t nb3 = dst->nb[3]; - GGML_ASSERT( nb0 == sizeof(float)); - GGML_ASSERT(nb00 == sizeof(float)); + GGML_V2_ASSERT( nb0 == sizeof(float)); + GGML_V2_ASSERT(nb00 == sizeof(float)); // rows per thread const int dr = (nr + nth - 1)/nth; @@ -7543,7 +7543,7 @@ static void ggml_compute_forward_add1_f32( const int i1 = (ir - i3*ne2*ne1 - i2*ne1); #ifdef GGML_USE_ACCELERATE - UNUSED(ggml_vec_add1_f32); + UNUSED(ggml_v2_vec_add1_f32); vDSP_vadd( (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1, @@ -7551,7 +7551,7 @@ static void ggml_compute_forward_add1_f32( (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ), 1, ne0); #else - ggml_vec_add1_f32(ne0, + ggml_v2_vec_add1_f32(ne0, (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ), (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), *(float *) src1->data); @@ -7559,15 +7559,15 @@ static void ggml_compute_forward_add1_f32( } } -static void ggml_compute_forward_add1_f16_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - GGML_ASSERT(ggml_are_same_shape(src0, dst)); - GGML_ASSERT(ggml_is_scalar(src1)); +static void ggml_v2_compute_forward_add1_f16_f32( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + const struct ggml_v2_tensor * src1, + struct ggml_v2_tensor * dst) { + GGML_V2_ASSERT(ggml_v2_are_same_shape(src0, dst)); + GGML_V2_ASSERT(ggml_v2_is_scalar(src1)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_V2_TASK_INIT || params->type == GGML_V2_TASK_FINALIZE) { return; } @@ -7577,7 +7577,7 @@ static void ggml_compute_forward_add1_f16_f32( const int ith = params->ith; const int nth = params->nth; - const int nr = ggml_nrows(src0); + const int nr = ggml_v2_nrows(src0); const int64_t ne0 = src0->ne[0]; const int64_t ne1 = src0->ne[1]; const int64_t ne2 = src0->ne[2]; @@ -7592,12 +7592,12 @@ static void ggml_compute_forward_add1_f16_f32( const size_t nb2 = dst->nb[2]; const size_t nb3 = dst->nb[3]; - GGML_ASSERT(src0->type == GGML_TYPE_F16); - GGML_ASSERT(src1->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_F16); + GGML_V2_ASSERT(src0->type == GGML_V2_TYPE_F16); + GGML_V2_ASSERT(src1->type == GGML_V2_TYPE_F32); + GGML_V2_ASSERT(dst->type == GGML_V2_TYPE_F16); - GGML_ASSERT( nb0 == sizeof(ggml_fp16_t)); - GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); + GGML_V2_ASSERT( nb0 == sizeof(ggml_v2_fp16_t)); + GGML_V2_ASSERT(nb00 == sizeof(ggml_v2_fp16_t)); // rows per thread const int dr = (nr + nth - 1)/nth; @@ -7612,33 +7612,33 @@ static void ggml_compute_forward_add1_f16_f32( const int i2 = (ir - i3*ne2*ne1)/ne1; const int i1 = (ir - i3*ne2*ne1 - i2*ne1); - ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ); - ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); + ggml_v2_fp16_t * dst_ptr = (ggml_v2_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ); + ggml_v2_fp16_t * src0_ptr = (ggml_v2_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); for (int i = 0; i < 
ne0; i++) { - dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + v); + dst_ptr[i] = GGML_V2_FP32_TO_FP16(GGML_V2_FP16_TO_FP32(src0_ptr[i]) + v); } } } -static void ggml_compute_forward_add1_f16_f16( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - GGML_ASSERT(ggml_are_same_shape(src0, dst)); - GGML_ASSERT(ggml_is_scalar(src1)); +static void ggml_v2_compute_forward_add1_f16_f16( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + const struct ggml_v2_tensor * src1, + struct ggml_v2_tensor * dst) { + GGML_V2_ASSERT(ggml_v2_are_same_shape(src0, dst)); + GGML_V2_ASSERT(ggml_v2_is_scalar(src1)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_V2_TASK_INIT || params->type == GGML_V2_TASK_FINALIZE) { return; } // scalar to add - const float v = GGML_FP16_TO_FP32(*(ggml_fp16_t *) src1->data); + const float v = GGML_V2_FP16_TO_FP32(*(ggml_v2_fp16_t *) src1->data); const int ith = params->ith; const int nth = params->nth; - const int nr = ggml_nrows(src0); + const int nr = ggml_v2_nrows(src0); const int64_t ne0 = src0->ne[0]; const int64_t ne1 = src0->ne[1]; const int64_t ne2 = src0->ne[2]; @@ -7653,12 +7653,12 @@ static void ggml_compute_forward_add1_f16_f16( const size_t nb2 = dst->nb[2]; const size_t nb3 = dst->nb[3]; - GGML_ASSERT(src0->type == GGML_TYPE_F16); - GGML_ASSERT(src1->type == GGML_TYPE_F16); - GGML_ASSERT(dst->type == GGML_TYPE_F16); + GGML_V2_ASSERT(src0->type == GGML_V2_TYPE_F16); + GGML_V2_ASSERT(src1->type == GGML_V2_TYPE_F16); + GGML_V2_ASSERT(dst->type == GGML_V2_TYPE_F16); - GGML_ASSERT( nb0 == sizeof(ggml_fp16_t)); - GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); + GGML_V2_ASSERT( nb0 == sizeof(ggml_v2_fp16_t)); + GGML_V2_ASSERT(nb00 == sizeof(ggml_v2_fp16_t)); // rows per thread const int dr = (nr + nth - 1)/nth; @@ -7673,23 +7673,23 @@ static void ggml_compute_forward_add1_f16_f16( const int i2 = (ir - i3*ne2*ne1)/ne1; const int i1 = (ir - i3*ne2*ne1 - i2*ne1); - ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ); - ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); + ggml_v2_fp16_t * dst_ptr = (ggml_v2_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ); + ggml_v2_fp16_t * src0_ptr = (ggml_v2_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); for (int i = 0; i < ne0; i++) { - dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + v); + dst_ptr[i] = GGML_V2_FP32_TO_FP16(GGML_V2_FP16_TO_FP32(src0_ptr[i]) + v); } } } -static void ggml_compute_forward_add1_q_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - GGML_ASSERT(ggml_are_same_shape(src0, dst)); - GGML_ASSERT(ggml_is_scalar(src1)); +static void ggml_v2_compute_forward_add1_q_f32( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + const struct ggml_v2_tensor * src1, + struct ggml_v2_tensor * dst) { + GGML_V2_ASSERT(ggml_v2_are_same_shape(src0, dst)); + GGML_V2_ASSERT(ggml_v2_is_scalar(src1)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_V2_TASK_INIT || params->type == GGML_V2_TASK_FINALIZE) { return; } @@ -7699,7 +7699,7 @@ static void ggml_compute_forward_add1_q_f32( const int ith = params->ith; const int nth = params->nth; 
- const int nr = ggml_nrows(src0); + const int nr = ggml_v2_nrows(src0); const int64_t ne0 = src0->ne[0]; const int64_t ne1 = src0->ne[1]; const int64_t ne2 = src0->ne[2]; @@ -7714,21 +7714,21 @@ static void ggml_compute_forward_add1_q_f32( const size_t nb2 = dst->nb[2]; const size_t nb3 = dst->nb[3]; - const enum ggml_type type = src0->type; + const enum ggml_v2_type type = src0->type; dequantize_row_q_t const dequantize_row_q = get_quantize_fn(type).dequantize_row_q; quantize_row_q_t const quantize_row_q = get_quantize_fn(type).quantize_row_q; // we don't support permuted src0 - GGML_ASSERT(nb00 == GGML_TYPE_SIZE[type]); + GGML_V2_ASSERT(nb00 == GGML_V2_TYPE_SIZE[type]); // dst cannot be transposed or permuted - GGML_ASSERT(nb0 <= nb1); - GGML_ASSERT(nb1 <= nb2); - GGML_ASSERT(nb2 <= nb3); + GGML_V2_ASSERT(nb0 <= nb1); + GGML_V2_ASSERT(nb1 <= nb2); + GGML_V2_ASSERT(nb2 <= nb3); - GGML_ASSERT(ggml_is_quantized(src0->type)); - GGML_ASSERT(dst->type == src0->type); - GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_V2_ASSERT(ggml_v2_is_quantized(src0->type)); + GGML_V2_ASSERT(dst->type == src0->type); + GGML_V2_ASSERT(src1->type == GGML_V2_TYPE_F32); // rows per thread const int dr = (nr + nth - 1)/nth; @@ -7753,64 +7753,64 @@ static void ggml_compute_forward_add1_q_f32( // unquantize row from src0 to temp buffer dequantize_row_q(src0_row, wdata, ne0); // add src1 - ggml_vec_acc1_f32(ne0, wdata, v); + ggml_v2_vec_acc1_f32(ne0, wdata, v); // quantize row to dst quantize_row_q(wdata, dst_row, ne0); } } -static void ggml_compute_forward_add1( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { +static void ggml_v2_compute_forward_add1( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + const struct ggml_v2_tensor * src1, + struct ggml_v2_tensor * dst) { switch (src0->type) { - case GGML_TYPE_F32: + case GGML_V2_TYPE_F32: { - ggml_compute_forward_add1_f32(params, src0, src1, dst); + ggml_v2_compute_forward_add1_f32(params, src0, src1, dst); } break; - case GGML_TYPE_F16: + case GGML_V2_TYPE_F16: { - if (src1->type == GGML_TYPE_F16) { - ggml_compute_forward_add1_f16_f16(params, src0, src1, dst); + if (src1->type == GGML_V2_TYPE_F16) { + ggml_v2_compute_forward_add1_f16_f16(params, src0, src1, dst); } - else if (src1->type == GGML_TYPE_F32) { - ggml_compute_forward_add1_f16_f32(params, src0, src1, dst); + else if (src1->type == GGML_V2_TYPE_F32) { + ggml_v2_compute_forward_add1_f16_f32(params, src0, src1, dst); } else { - GGML_ASSERT(false); + GGML_V2_ASSERT(false); } } break; - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q5_0: - case GGML_TYPE_Q5_1: - case GGML_TYPE_Q8_0: - case GGML_TYPE_Q8_1: + case GGML_V2_TYPE_Q4_0: + case GGML_V2_TYPE_Q4_1: + case GGML_V2_TYPE_Q5_0: + case GGML_V2_TYPE_Q5_1: + case GGML_V2_TYPE_Q8_0: + case GGML_V2_TYPE_Q8_1: { - ggml_compute_forward_add1_q_f32(params, src0, src1, dst); + ggml_v2_compute_forward_add1_q_f32(params, src0, src1, dst); } break; default: { - GGML_ASSERT(false); + GGML_V2_ASSERT(false); } break; } } -// ggml_compute_forward_acc +// ggml_v2_compute_forward_acc -static void ggml_compute_forward_acc_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - const struct ggml_tensor * opt0, - struct ggml_tensor * dst) { - GGML_ASSERT(ggml_are_same_shape(src0, dst)); - GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0)); +static 
void ggml_v2_compute_forward_acc_f32( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + const struct ggml_v2_tensor * src1, + const struct ggml_v2_tensor * opt0, + struct ggml_v2_tensor * dst) { + GGML_V2_ASSERT(ggml_v2_are_same_shape(src0, dst)); + GGML_V2_ASSERT(ggml_v2_is_contiguous(dst) && ggml_v2_is_contiguous(src0)); - GGML_ASSERT(opt0->type == GGML_TYPE_I32); - GGML_ASSERT(ggml_nelements(opt0) == 5); + GGML_V2_ASSERT(opt0->type == GGML_V2_TYPE_I32); + GGML_V2_ASSERT(ggml_v2_nelements(opt0) == 5); // view src0 and dst with these strides and data offset inbytes during acc // nb0 is implicitely element_size because src0 and dst are contiguous @@ -7820,23 +7820,23 @@ static void ggml_compute_forward_acc_f32( size_t offset = ((int32_t *) opt0->data)[3]; bool inplace = (bool) ((int32_t *) opt0->data)[4]; - if (!inplace && (params->type == GGML_TASK_INIT)) { + if (!inplace && (params->type == GGML_V2_TASK_INIT)) { // memcpy needs to be synchronized across threads to avoid race conditions. // => do it in INIT phase memcpy( ((char *) dst->data), ((char *) src0->data), - ggml_nbytes(dst)); + ggml_v2_nbytes(dst)); } - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_V2_TASK_INIT || params->type == GGML_V2_TASK_FINALIZE) { return; } const int ith = params->ith; const int nth = params->nth; - const int nr = ggml_nrows(src1); + const int nr = ggml_v2_nrows(src1); const int nc = src1->ne[0]; const int64_t ne10 = src1->ne[0]; @@ -7850,17 +7850,17 @@ static void ggml_compute_forward_acc_f32( const size_t nb13 = src1->nb[3]; // src0 and dst as viewed during acc - const size_t nb0 = ggml_element_size(src0); + const size_t nb0 = ggml_v2_element_size(src0); const size_t nb00 = nb0; const size_t nb01 = nb1; const size_t nb02 = nb2; const size_t nb03 = nb3; - GGML_ASSERT(offset + (ne10 == 0 ? 0 : ne10-1)*nb0 + (ne11 == 0 ? 0 : ne11-1)*nb1 + (ne12 == 0 ? 0 : ne12-1)*nb2 + (ne13 == 0 ? 0 : ne13-1)*nb3 < ggml_nbytes(dst)); - GGML_ASSERT(offset + (ne10 == 0 ? 0 : ne10-1)*nb00 + (ne11 == 0 ? 0 : ne11-1)*nb01 + (ne12 == 0 ? 0 : ne12-1)*nb02 + (ne13 == 0 ? 0 : ne13-1)*nb03 < ggml_nbytes(src0)); + GGML_V2_ASSERT(offset + (ne10 == 0 ? 0 : ne10-1)*nb0 + (ne11 == 0 ? 0 : ne11-1)*nb1 + (ne12 == 0 ? 0 : ne12-1)*nb2 + (ne13 == 0 ? 0 : ne13-1)*nb3 < ggml_v2_nbytes(dst)); + GGML_V2_ASSERT(offset + (ne10 == 0 ? 0 : ne10-1)*nb00 + (ne11 == 0 ? 0 : ne11-1)*nb01 + (ne12 == 0 ? 0 : ne12-1)*nb02 + (ne13 == 0 ? 
0 : ne13-1)*nb03 < ggml_v2_nbytes(src0)); - GGML_ASSERT(nb10 == sizeof(float)); + GGML_V2_ASSERT(nb10 == sizeof(float)); // rows per thread const int dr = (nr + nth - 1)/nth; @@ -7882,7 +7882,7 @@ static void ggml_compute_forward_acc_f32( (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1, (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + offset), 1, nc); #else - ggml_vec_add_f32(nc, + ggml_v2_vec_add_f32(nc, (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + offset), (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + offset), (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11)); @@ -7890,47 +7890,47 @@ static void ggml_compute_forward_acc_f32( } } -static void ggml_compute_forward_acc( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - const struct ggml_tensor * opt0, - struct ggml_tensor * dst) { +static void ggml_v2_compute_forward_acc( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + const struct ggml_v2_tensor * src1, + const struct ggml_v2_tensor * opt0, + struct ggml_v2_tensor * dst) { switch (src0->type) { - case GGML_TYPE_F32: + case GGML_V2_TYPE_F32: { - ggml_compute_forward_acc_f32(params, src0, src1, opt0, dst); + ggml_v2_compute_forward_acc_f32(params, src0, src1, opt0, dst); } break; - case GGML_TYPE_F16: - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q5_0: - case GGML_TYPE_Q5_1: - case GGML_TYPE_Q8_0: - case GGML_TYPE_Q8_1: + case GGML_V2_TYPE_F16: + case GGML_V2_TYPE_Q4_0: + case GGML_V2_TYPE_Q4_1: + case GGML_V2_TYPE_Q5_0: + case GGML_V2_TYPE_Q5_1: + case GGML_V2_TYPE_Q8_0: + case GGML_V2_TYPE_Q8_1: default: { - GGML_ASSERT(false); + GGML_V2_ASSERT(false); } break; } } -// ggml_compute_forward_sub +// ggml_v2_compute_forward_sub -static void ggml_compute_forward_sub_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { +static void ggml_v2_compute_forward_sub_f32( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + const struct ggml_v2_tensor * src1, + struct ggml_v2_tensor * dst) { assert(params->ith == 0); - assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); + assert(ggml_v2_are_same_shape(src0, src1) && ggml_v2_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_V2_TASK_INIT || params->type == GGML_V2_TASK_FINALIZE) { return; } - const int nr = ggml_nrows(src0); + const int nr = ggml_v2_nrows(src0); const int64_t ne0 = src0->ne[0]; const int64_t ne1 = src0->ne[1]; const int64_t ne2 = src0->ne[2]; @@ -7950,8 +7950,8 @@ static void ggml_compute_forward_sub_f32( const size_t nb2 = dst->nb[2]; const size_t nb3 = dst->nb[3]; - GGML_ASSERT( nb0 == sizeof(float)); - GGML_ASSERT(nb00 == sizeof(float)); + GGML_V2_ASSERT( nb0 == sizeof(float)); + GGML_V2_ASSERT(nb00 == sizeof(float)); if (nb10 == sizeof(float)) { for (int ir = 0; ir < nr; ++ir) { @@ -7968,7 +7968,7 @@ static void ggml_compute_forward_sub_f32( (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ), 1, ne0); #else - ggml_vec_sub_f32(ne0, + ggml_v2_vec_sub_f32(ne0, (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ), (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11)); @@ -7995,39 +7995,39 @@ static void 
ggml_compute_forward_sub_f32( } } -static void ggml_compute_forward_sub( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { +static void ggml_v2_compute_forward_sub( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + const struct ggml_v2_tensor * src1, + struct ggml_v2_tensor * dst) { switch (src0->type) { - case GGML_TYPE_F32: + case GGML_V2_TYPE_F32: { - ggml_compute_forward_sub_f32(params, src0, src1, dst); + ggml_v2_compute_forward_sub_f32(params, src0, src1, dst); } break; default: { - GGML_ASSERT(false); + GGML_V2_ASSERT(false); } break; } } -// ggml_compute_forward_mul +// ggml_v2_compute_forward_mul -static void ggml_compute_forward_mul_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); +static void ggml_v2_compute_forward_mul_f32( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + const struct ggml_v2_tensor * src1, + struct ggml_v2_tensor * dst) { + assert(ggml_v2_are_same_shape(src0, src1) && ggml_v2_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_V2_TASK_INIT || params->type == GGML_V2_TASK_FINALIZE) { return; } const int ith = params->ith; const int nth = params->nth; - const int nr = ggml_nrows(src0); + const int nr = ggml_v2_nrows(src0); const int64_t ne0 = src0->ne[0]; const int64_t ne1 = src0->ne[1]; const int64_t ne2 = src0->ne[2]; @@ -8047,8 +8047,8 @@ static void ggml_compute_forward_mul_f32( const size_t nb2 = dst->nb[2]; const size_t nb3 = dst->nb[3]; - GGML_ASSERT( nb0 == sizeof(float)); - GGML_ASSERT(nb00 == sizeof(float)); + GGML_V2_ASSERT( nb0 == sizeof(float)); + GGML_V2_ASSERT(nb00 == sizeof(float)); if (nb10 == sizeof(float)) { for (int ir = ith; ir < nr; ir += nth) { @@ -8059,7 +8059,7 @@ static void ggml_compute_forward_mul_f32( #ifdef GGML_USE_ACCELERATE - UNUSED(ggml_vec_mul_f32); + UNUSED(ggml_v2_vec_mul_f32); vDSP_vmul( (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1, @@ -8067,7 +8067,7 @@ static void ggml_compute_forward_mul_f32( (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ), 1, ne0); #else - ggml_vec_mul_f32(ne0, + ggml_v2_vec_mul_f32(ne0, (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ), (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11)); @@ -8094,38 +8094,38 @@ static void ggml_compute_forward_mul_f32( } } -static void ggml_compute_forward_mul( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { +static void ggml_v2_compute_forward_mul( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + const struct ggml_v2_tensor * src1, + struct ggml_v2_tensor * dst) { switch (src0->type) { - case GGML_TYPE_F32: + case GGML_V2_TYPE_F32: { - ggml_compute_forward_mul_f32(params, src0, src1, dst); + ggml_v2_compute_forward_mul_f32(params, src0, src1, dst); } break; default: { - GGML_ASSERT(false); + GGML_V2_ASSERT(false); } break; } } -// ggml_compute_forward_div +// ggml_v2_compute_forward_div -static void ggml_compute_forward_div_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor 
* src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { +static void ggml_v2_compute_forward_div_f32( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + const struct ggml_v2_tensor * src1, + struct ggml_v2_tensor * dst) { assert(params->ith == 0); - assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); + assert(ggml_v2_are_same_shape(src0, src1) && ggml_v2_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_V2_TASK_INIT || params->type == GGML_V2_TASK_FINALIZE) { return; } - const int nr = ggml_nrows(src0); + const int nr = ggml_v2_nrows(src0); const int64_t ne0 = src0->ne[0]; const int64_t ne1 = src0->ne[1]; const int64_t ne2 = src0->ne[2]; @@ -8145,8 +8145,8 @@ static void ggml_compute_forward_div_f32( const size_t nb2 = dst->nb[2]; const size_t nb3 = dst->nb[3]; - GGML_ASSERT( nb0 == sizeof(float)); - GGML_ASSERT(nb00 == sizeof(float)); + GGML_V2_ASSERT( nb0 == sizeof(float)); + GGML_V2_ASSERT(nb00 == sizeof(float)); if (nb10 == sizeof(float)) { for (int ir = 0; ir < nr; ++ir) { @@ -8163,7 +8163,7 @@ static void ggml_compute_forward_div_f32( (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ), 1, ne0); #else - ggml_vec_div_f32(ne0, + ggml_v2_vec_div_f32(ne0, (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ), (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11)); @@ -8190,164 +8190,164 @@ static void ggml_compute_forward_div_f32( } } -static void ggml_compute_forward_div( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { +static void ggml_v2_compute_forward_div( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + const struct ggml_v2_tensor * src1, + struct ggml_v2_tensor * dst) { switch (src0->type) { - case GGML_TYPE_F32: + case GGML_V2_TYPE_F32: { - ggml_compute_forward_div_f32(params, src0, src1, dst); + ggml_v2_compute_forward_div_f32(params, src0, src1, dst); } break; default: { - GGML_ASSERT(false); + GGML_V2_ASSERT(false); } break; } } -// ggml_compute_forward_sqr +// ggml_v2_compute_forward_sqr -static void ggml_compute_forward_sqr_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - struct ggml_tensor * dst) { +static void ggml_v2_compute_forward_sqr_f32( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + struct ggml_v2_tensor * dst) { assert(params->ith == 0); - assert(ggml_are_same_shape(src0, dst)); + assert(ggml_v2_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_V2_TASK_INIT || params->type == GGML_V2_TASK_FINALIZE) { return; } - const int n = ggml_nrows(src0); + const int n = ggml_v2_nrows(src0); const int nc = src0->ne[0]; assert( dst->nb[0] == sizeof(float)); assert(src0->nb[0] == sizeof(float)); for (int i = 0; i < n; i++) { - ggml_vec_sqr_f32(nc, + ggml_v2_vec_sqr_f32(nc, (float *) ((char *) dst->data + i*( dst->nb[1])), (float *) ((char *) src0->data + i*(src0->nb[1]))); } } -static void ggml_compute_forward_sqr( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - struct ggml_tensor * dst) { +static void ggml_v2_compute_forward_sqr( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, 
+ struct ggml_v2_tensor * dst) { switch (src0->type) { - case GGML_TYPE_F32: + case GGML_V2_TYPE_F32: { - ggml_compute_forward_sqr_f32(params, src0, dst); + ggml_v2_compute_forward_sqr_f32(params, src0, dst); } break; default: { - GGML_ASSERT(false); + GGML_V2_ASSERT(false); } break; } } -// ggml_compute_forward_sqrt +// ggml_v2_compute_forward_sqrt -static void ggml_compute_forward_sqrt_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - struct ggml_tensor * dst) { +static void ggml_v2_compute_forward_sqrt_f32( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + struct ggml_v2_tensor * dst) { assert(params->ith == 0); - assert(ggml_are_same_shape(src0, dst)); + assert(ggml_v2_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_V2_TASK_INIT || params->type == GGML_V2_TASK_FINALIZE) { return; } - const int n = ggml_nrows(src0); + const int n = ggml_v2_nrows(src0); const int nc = src0->ne[0]; assert( dst->nb[0] == sizeof(float)); assert(src0->nb[0] == sizeof(float)); for (int i = 0; i < n; i++) { - ggml_vec_sqrt_f32(nc, + ggml_v2_vec_sqrt_f32(nc, (float *) ((char *) dst->data + i*( dst->nb[1])), (float *) ((char *) src0->data + i*(src0->nb[1]))); } } -static void ggml_compute_forward_sqrt( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - struct ggml_tensor * dst) { +static void ggml_v2_compute_forward_sqrt( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + struct ggml_v2_tensor * dst) { switch (src0->type) { - case GGML_TYPE_F32: + case GGML_V2_TYPE_F32: { - ggml_compute_forward_sqrt_f32(params, src0, dst); + ggml_v2_compute_forward_sqrt_f32(params, src0, dst); } break; default: { - GGML_ASSERT(false); + GGML_V2_ASSERT(false); } break; } } -// ggml_compute_forward_log +// ggml_v2_compute_forward_log -static void ggml_compute_forward_log_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - struct ggml_tensor * dst) { - GGML_ASSERT(params->ith == 0); - GGML_ASSERT(ggml_are_same_shape(src0, dst)); +static void ggml_v2_compute_forward_log_f32( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + struct ggml_v2_tensor * dst) { + GGML_V2_ASSERT(params->ith == 0); + GGML_V2_ASSERT(ggml_v2_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_V2_TASK_INIT || params->type == GGML_V2_TASK_FINALIZE) { return; } - const int n = ggml_nrows(src0); + const int n = ggml_v2_nrows(src0); const int nc = src0->ne[0]; - GGML_ASSERT( dst->nb[0] == sizeof(float)); - GGML_ASSERT(src0->nb[0] == sizeof(float)); + GGML_V2_ASSERT( dst->nb[0] == sizeof(float)); + GGML_V2_ASSERT(src0->nb[0] == sizeof(float)); for (int i = 0; i < n; i++) { - ggml_vec_log_f32(nc, + ggml_v2_vec_log_f32(nc, (float *) ((char *) dst->data + i*( dst->nb[1])), (float *) ((char *) src0->data + i*(src0->nb[1]))); } } -static void ggml_compute_forward_log( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - struct ggml_tensor * dst) { +static void ggml_v2_compute_forward_log( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + struct ggml_v2_tensor * dst) { switch (src0->type) { - case GGML_TYPE_F32: + case GGML_V2_TYPE_F32: { - ggml_compute_forward_log_f32(params, src0, dst); + ggml_v2_compute_forward_log_f32(params, 
src0, dst); } break; default: { - GGML_ASSERT(false); + GGML_V2_ASSERT(false); } break; } } -// ggml_compute_forward_sum +// ggml_v2_compute_forward_sum -static void ggml_compute_forward_sum_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - struct ggml_tensor * dst) { +static void ggml_v2_compute_forward_sum_f32( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + struct ggml_v2_tensor * dst) { assert(params->ith == 0); - assert(ggml_is_scalar(dst)); + assert(ggml_v2_is_scalar(dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_V2_TASK_INIT || params->type == GGML_V2_TASK_FINALIZE) { return; } - assert(ggml_is_scalar(dst)); + assert(ggml_v2_is_scalar(dst)); assert(src0->nb[0] == sizeof(float)); const int64_t ne00 = src0->ne[0]; @@ -8359,13 +8359,13 @@ static void ggml_compute_forward_sum_f32( const size_t nb02 = src0->nb[2]; const size_t nb03 = src0->nb[3]; - ggml_float sum = 0; - ggml_float row_sum = 0; + ggml_v2_float sum = 0; + ggml_v2_float row_sum = 0; for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { for (int64_t i01 = 0; i01 < ne01; i01++) { - ggml_vec_sum_ggf(ne00, + ggml_v2_vec_sum_ggf(ne00, &row_sum, (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03)); sum += row_sum; @@ -8375,36 +8375,36 @@ static void ggml_compute_forward_sum_f32( ((float *) dst->data)[0] = sum; } -static void ggml_compute_forward_sum( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - struct ggml_tensor * dst) { +static void ggml_v2_compute_forward_sum( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + struct ggml_v2_tensor * dst) { switch (src0->type) { - case GGML_TYPE_F32: + case GGML_V2_TYPE_F32: { - ggml_compute_forward_sum_f32(params, src0, dst); + ggml_v2_compute_forward_sum_f32(params, src0, dst); } break; default: { - GGML_ASSERT(false); + GGML_V2_ASSERT(false); } break; } } -// ggml_compute_forward_sum_rows +// ggml_v2_compute_forward_sum_rows -static void ggml_compute_forward_sum_rows_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - struct ggml_tensor * dst) { - GGML_ASSERT(params->ith == 0); +static void ggml_v2_compute_forward_sum_rows_f32( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + struct ggml_v2_tensor * dst) { + GGML_V2_ASSERT(params->ith == 0); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_V2_TASK_INIT || params->type == GGML_V2_TASK_FINALIZE) { return; } - GGML_ASSERT(src0->nb[0] == sizeof(float)); - GGML_ASSERT(dst->nb[0] == sizeof(float)); + GGML_V2_ASSERT(src0->nb[0] == sizeof(float)); + GGML_V2_ASSERT(dst->nb[0] == sizeof(float)); const int64_t ne00 = src0->ne[0]; const int64_t ne01 = src0->ne[1]; @@ -8416,10 +8416,10 @@ static void ggml_compute_forward_sum_rows_f32( const int64_t ne2 = dst->ne[2]; const int64_t ne3 = dst->ne[3]; - GGML_ASSERT(ne0 == 1); - GGML_ASSERT(ne1 == ne01); - GGML_ASSERT(ne2 == ne02); - GGML_ASSERT(ne3 == ne03); + GGML_V2_ASSERT(ne0 == 1); + GGML_V2_ASSERT(ne1 == ne01); + GGML_V2_ASSERT(ne2 == ne02); + GGML_V2_ASSERT(ne3 == ne03); const size_t nb01 = src0->nb[1]; const size_t nb02 = src0->nb[2]; @@ -8435,38 +8435,38 @@ static void ggml_compute_forward_sum_rows_f32( float* src_row = (float *) ((char *) src0->data + i1*nb01 + i2*nb02 + i3*nb03); float* dst_row = (float *) ((char 
*) dst->data + i1*nb1 + i2*nb2 + i3*nb3); float row_sum = 0; - ggml_vec_sum_f32(ne00, &row_sum, src_row); + ggml_v2_vec_sum_f32(ne00, &row_sum, src_row); dst_row[0] = row_sum; } } } } -static void ggml_compute_forward_sum_rows( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - struct ggml_tensor * dst) { +static void ggml_v2_compute_forward_sum_rows( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + struct ggml_v2_tensor * dst) { switch (src0->type) { - case GGML_TYPE_F32: + case GGML_V2_TYPE_F32: { - ggml_compute_forward_sum_rows_f32(params, src0, dst); + ggml_v2_compute_forward_sum_rows_f32(params, src0, dst); } break; default: { - GGML_ASSERT(false); + GGML_V2_ASSERT(false); } break; } } -// ggml_compute_forward_mean +// ggml_v2_compute_forward_mean -static void ggml_compute_forward_mean_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - struct ggml_tensor * dst) { +static void ggml_v2_compute_forward_mean_f32( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + struct ggml_v2_tensor * dst) { assert(params->ith == 0); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_V2_TASK_INIT || params->type == GGML_V2_TASK_FINALIZE) { return; } @@ -8503,7 +8503,7 @@ static void ggml_compute_forward_mean_f32( for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { for (int64_t i01 = 0; i01 < ne01; i01++) { - ggml_vec_sum_f32(ne00, + ggml_v2_vec_sum_f32(ne00, (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3), (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03)); @@ -8513,32 +8513,32 @@ static void ggml_compute_forward_mean_f32( } } -static void ggml_compute_forward_mean( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - struct ggml_tensor * dst) { +static void ggml_v2_compute_forward_mean( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + struct ggml_v2_tensor * dst) { switch (src0->type) { - case GGML_TYPE_F32: + case GGML_V2_TYPE_F32: { - ggml_compute_forward_mean_f32(params, src0, dst); + ggml_v2_compute_forward_mean_f32(params, src0, dst); } break; default: { - GGML_ASSERT(false); + GGML_V2_ASSERT(false); } break; } } -// ggml_compute_forward_repeat +// ggml_v2_compute_forward_repeat -static void ggml_compute_forward_repeat_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - struct ggml_tensor * dst) { - GGML_ASSERT(params->ith == 0); - GGML_ASSERT(ggml_can_repeat(src0, dst)); +static void ggml_v2_compute_forward_repeat_f32( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + struct ggml_v2_tensor * dst) { + GGML_V2_ASSERT(params->ith == 0); + GGML_V2_ASSERT(ggml_v2_can_repeat(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_V2_TASK_INIT || params->type == GGML_V2_TASK_FINALIZE) { return; } @@ -8562,15 +8562,15 @@ static void ggml_compute_forward_repeat_f32( const size_t nb02 = src0->nb[2]; const size_t nb03 = src0->nb[3]; - // guaranteed to be an integer due to the check in ggml_can_repeat + // guaranteed to be an integer due to the check in ggml_v2_can_repeat const int nr0 = (int)(ne0/ne00); const int nr1 = (int)(ne1/ne01); const int nr2 = (int)(ne2/ne02); const int nr3 = (int)(ne3/ne03); // TODO: support for transposed / permuted tensors - 
GGML_ASSERT(nb0 == sizeof(float)); - GGML_ASSERT(nb00 == sizeof(float)); + GGML_V2_ASSERT(nb0 == sizeof(float)); + GGML_V2_ASSERT(nb00 == sizeof(float)); // TODO: maybe this is not optimal? for (int i3 = 0; i3 < nr3; i3++) { @@ -8580,7 +8580,7 @@ static void ggml_compute_forward_repeat_f32( for (int i1 = 0; i1 < nr1; i1++) { for (int k1 = 0; k1 < ne01; k1++) { for (int i0 = 0; i0 < nr0; i0++) { - ggml_vec_cpy_f32(ne00, + ggml_v2_vec_cpy_f32(ne00, (float *) ((char *) dst->data + (i3*ne03 + k3)*nb3 + (i2*ne02 + k2)*nb2 + (i1*ne01 + k1)*nb1 + (i0*ne00)*nb0), (float *) ((char *) src0->data + ( k3)*nb03 + ( k2)*nb02 + ( k1)*nb01)); } @@ -8592,243 +8592,243 @@ static void ggml_compute_forward_repeat_f32( } } -static void ggml_compute_forward_repeat( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - struct ggml_tensor * dst) { +static void ggml_v2_compute_forward_repeat( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + struct ggml_v2_tensor * dst) { switch (src0->type) { - case GGML_TYPE_F32: + case GGML_V2_TYPE_F32: { - ggml_compute_forward_repeat_f32(params, src0, dst); + ggml_v2_compute_forward_repeat_f32(params, src0, dst); } break; default: { - GGML_ASSERT(false); + GGML_V2_ASSERT(false); } break; } } -// ggml_compute_forward_abs +// ggml_v2_compute_forward_abs -static void ggml_compute_forward_abs_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - struct ggml_tensor * dst) { +static void ggml_v2_compute_forward_abs_f32( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + struct ggml_v2_tensor * dst) { assert(params->ith == 0); - assert(ggml_are_same_shape(src0, dst)); + assert(ggml_v2_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_V2_TASK_INIT || params->type == GGML_V2_TASK_FINALIZE) { return; } - const int n = ggml_nrows(src0); + const int n = ggml_v2_nrows(src0); const int nc = src0->ne[0]; assert(dst->nb[0] == sizeof(float)); assert(src0->nb[0] == sizeof(float)); for (int i = 0; i < n; i++) { - ggml_vec_abs_f32(nc, + ggml_v2_vec_abs_f32(nc, (float *) ((char *) dst->data + i*( dst->nb[1])), (float *) ((char *) src0->data + i*(src0->nb[1]))); } } -static void ggml_compute_forward_abs( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - struct ggml_tensor * dst) { +static void ggml_v2_compute_forward_abs( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + struct ggml_v2_tensor * dst) { switch (src0->type) { - case GGML_TYPE_F32: + case GGML_V2_TYPE_F32: { - ggml_compute_forward_abs_f32(params, src0, dst); + ggml_v2_compute_forward_abs_f32(params, src0, dst); } break; default: { - GGML_ASSERT(false); + GGML_V2_ASSERT(false); } break; } } -// ggml_compute_forward_sgn +// ggml_v2_compute_forward_sgn -static void ggml_compute_forward_sgn_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - struct ggml_tensor * dst) { +static void ggml_v2_compute_forward_sgn_f32( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + struct ggml_v2_tensor * dst) { assert(params->ith == 0); - assert(ggml_are_same_shape(src0, dst)); + assert(ggml_v2_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_V2_TASK_INIT || params->type == GGML_V2_TASK_FINALIZE) { return; } - const int 
n = ggml_nrows(src0); + const int n = ggml_v2_nrows(src0); const int nc = src0->ne[0]; assert(dst->nb[0] == sizeof(float)); assert(src0->nb[0] == sizeof(float)); for (int i = 0; i < n; i++) { - ggml_vec_sgn_f32(nc, + ggml_v2_vec_sgn_f32(nc, (float *) ((char *) dst->data + i*( dst->nb[1])), (float *) ((char *) src0->data + i*(src0->nb[1]))); } } -static void ggml_compute_forward_sgn( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - struct ggml_tensor * dst) { +static void ggml_v2_compute_forward_sgn( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + struct ggml_v2_tensor * dst) { switch (src0->type) { - case GGML_TYPE_F32: + case GGML_V2_TYPE_F32: { - ggml_compute_forward_sgn_f32(params, src0, dst); + ggml_v2_compute_forward_sgn_f32(params, src0, dst); } break; default: { - GGML_ASSERT(false); + GGML_V2_ASSERT(false); } break; } } -// ggml_compute_forward_neg +// ggml_v2_compute_forward_neg -static void ggml_compute_forward_neg_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - struct ggml_tensor * dst) { +static void ggml_v2_compute_forward_neg_f32( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + struct ggml_v2_tensor * dst) { assert(params->ith == 0); - assert(ggml_are_same_shape(src0, dst)); + assert(ggml_v2_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_V2_TASK_INIT || params->type == GGML_V2_TASK_FINALIZE) { return; } - const int n = ggml_nrows(src0); + const int n = ggml_v2_nrows(src0); const int nc = src0->ne[0]; assert(dst->nb[0] == sizeof(float)); assert(src0->nb[0] == sizeof(float)); for (int i = 0; i < n; i++) { - ggml_vec_neg_f32(nc, + ggml_v2_vec_neg_f32(nc, (float *) ((char *) dst->data + i*( dst->nb[1])), (float *) ((char *) src0->data + i*(src0->nb[1]))); } } -static void ggml_compute_forward_neg( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - struct ggml_tensor * dst) { +static void ggml_v2_compute_forward_neg( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + struct ggml_v2_tensor * dst) { switch (src0->type) { - case GGML_TYPE_F32: + case GGML_V2_TYPE_F32: { - ggml_compute_forward_neg_f32(params, src0, dst); + ggml_v2_compute_forward_neg_f32(params, src0, dst); } break; default: { - GGML_ASSERT(false); + GGML_V2_ASSERT(false); } break; } } -// ggml_compute_forward_step +// ggml_v2_compute_forward_step -static void ggml_compute_forward_step_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - struct ggml_tensor * dst) { +static void ggml_v2_compute_forward_step_f32( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + struct ggml_v2_tensor * dst) { assert(params->ith == 0); - assert(ggml_are_same_shape(src0, dst)); + assert(ggml_v2_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_V2_TASK_INIT || params->type == GGML_V2_TASK_FINALIZE) { return; } - const int n = ggml_nrows(src0); + const int n = ggml_v2_nrows(src0); const int nc = src0->ne[0]; assert(dst->nb[0] == sizeof(float)); assert(src0->nb[0] == sizeof(float)); for (int i = 0; i < n; i++) { - ggml_vec_step_f32(nc, + ggml_v2_vec_step_f32(nc, (float *) ((char *) dst->data + i*( dst->nb[1])), (float *) ((char *) src0->data + i*(src0->nb[1]))); } } -static void 
ggml_compute_forward_step( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - struct ggml_tensor * dst) { +static void ggml_v2_compute_forward_step( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + struct ggml_v2_tensor * dst) { switch (src0->type) { - case GGML_TYPE_F32: + case GGML_V2_TYPE_F32: { - ggml_compute_forward_step_f32(params, src0, dst); + ggml_v2_compute_forward_step_f32(params, src0, dst); } break; default: { - GGML_ASSERT(false); + GGML_V2_ASSERT(false); } break; } } -// ggml_compute_forward_relu +// ggml_v2_compute_forward_relu -static void ggml_compute_forward_relu_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - struct ggml_tensor * dst) { +static void ggml_v2_compute_forward_relu_f32( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + struct ggml_v2_tensor * dst) { assert(params->ith == 0); - assert(ggml_are_same_shape(src0, dst)); + assert(ggml_v2_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_V2_TASK_INIT || params->type == GGML_V2_TASK_FINALIZE) { return; } - const int n = ggml_nrows(src0); + const int n = ggml_v2_nrows(src0); const int nc = src0->ne[0]; assert(dst->nb[0] == sizeof(float)); assert(src0->nb[0] == sizeof(float)); for (int i = 0; i < n; i++) { - ggml_vec_relu_f32(nc, + ggml_v2_vec_relu_f32(nc, (float *) ((char *) dst->data + i*( dst->nb[1])), (float *) ((char *) src0->data + i*(src0->nb[1]))); } } -static void ggml_compute_forward_relu( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - struct ggml_tensor * dst) { +static void ggml_v2_compute_forward_relu( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + struct ggml_v2_tensor * dst) { switch (src0->type) { - case GGML_TYPE_F32: + case GGML_V2_TYPE_F32: { - ggml_compute_forward_relu_f32(params, src0, dst); + ggml_v2_compute_forward_relu_f32(params, src0, dst); } break; default: { - GGML_ASSERT(false); + GGML_V2_ASSERT(false); } break; } } -// ggml_compute_forward_gelu +// ggml_v2_compute_forward_gelu -static void ggml_compute_forward_gelu_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - struct ggml_tensor * dst) { - GGML_ASSERT(ggml_is_contiguous(src0)); - GGML_ASSERT(ggml_is_contiguous(dst)); - GGML_ASSERT(ggml_are_same_shape(src0, dst)); +static void ggml_v2_compute_forward_gelu_f32( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + struct ggml_v2_tensor * dst) { + GGML_V2_ASSERT(ggml_v2_is_contiguous(src0)); + GGML_V2_ASSERT(ggml_v2_is_contiguous(dst)); + GGML_V2_ASSERT(ggml_v2_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_V2_TASK_INIT || params->type == GGML_V2_TASK_FINALIZE) { return; } @@ -8836,7 +8836,7 @@ static void ggml_compute_forward_gelu_f32( const int nth = params->nth; const int nc = src0->ne[0]; - const int nr = ggml_nrows(src0); + const int nr = ggml_v2_nrows(src0); // rows per thread const int dr = (nr + nth - 1)/nth; @@ -8846,7 +8846,7 @@ static void ggml_compute_forward_gelu_f32( const int ir1 = MIN(ir0 + dr, nr); for (int i1 = ir0; i1 < ir1; i1++) { - ggml_vec_gelu_f32(nc, + ggml_v2_vec_gelu_f32(nc, (float *) ((char *) dst->data + i1*( dst->nb[1])), (float *) ((char *) src0->data + i1*(src0->nb[1]))); @@ -8861,35 +8861,35 @@ 
static void ggml_compute_forward_gelu_f32( } } -static void ggml_compute_forward_gelu( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - struct ggml_tensor * dst) { +static void ggml_v2_compute_forward_gelu( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + struct ggml_v2_tensor * dst) { switch (src0->type) { - case GGML_TYPE_F32: + case GGML_V2_TYPE_F32: { - ggml_compute_forward_gelu_f32(params, src0, dst); + ggml_v2_compute_forward_gelu_f32(params, src0, dst); } break; default: { - GGML_ASSERT(false); + GGML_V2_ASSERT(false); } break; } //printf("XXXXXXXX gelu\n"); } -// ggml_compute_forward_silu +// ggml_v2_compute_forward_silu -static void ggml_compute_forward_silu_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - struct ggml_tensor * dst) { - GGML_ASSERT(ggml_is_contiguous(src0)); - GGML_ASSERT(ggml_is_contiguous(dst)); - GGML_ASSERT(ggml_are_same_shape(src0, dst)); +static void ggml_v2_compute_forward_silu_f32( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + struct ggml_v2_tensor * dst) { + GGML_V2_ASSERT(ggml_v2_is_contiguous(src0)); + GGML_V2_ASSERT(ggml_v2_is_contiguous(dst)); + GGML_V2_ASSERT(ggml_v2_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_V2_TASK_INIT || params->type == GGML_V2_TASK_FINALIZE) { return; } @@ -8897,7 +8897,7 @@ static void ggml_compute_forward_silu_f32( const int nth = params->nth; const int nc = src0->ne[0]; - const int nr = ggml_nrows(src0); + const int nr = ggml_v2_nrows(src0); // rows per thread const int dr = (nr + nth - 1)/nth; @@ -8907,7 +8907,7 @@ static void ggml_compute_forward_silu_f32( const int ir1 = MIN(ir0 + dr, nr); for (int i1 = ir0; i1 < ir1; i1++) { - ggml_vec_silu_f32(nc, + ggml_v2_vec_silu_f32(nc, (float *) ((char *) dst->data + i1*( dst->nb[1])), (float *) ((char *) src0->data + i1*(src0->nb[1]))); @@ -8922,37 +8922,37 @@ static void ggml_compute_forward_silu_f32( } } -static void ggml_compute_forward_silu( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - struct ggml_tensor * dst) { +static void ggml_v2_compute_forward_silu( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + struct ggml_v2_tensor * dst) { switch (src0->type) { - case GGML_TYPE_F32: + case GGML_V2_TYPE_F32: { - ggml_compute_forward_silu_f32(params, src0, dst); + ggml_v2_compute_forward_silu_f32(params, src0, dst); } break; default: { - GGML_ASSERT(false); + GGML_V2_ASSERT(false); } break; } } -// ggml_compute_forward_silu_back +// ggml_v2_compute_forward_silu_back -static void ggml_compute_forward_silu_back_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * grad, - struct ggml_tensor * dst) { - GGML_ASSERT(ggml_is_contiguous(grad)); - GGML_ASSERT(ggml_is_contiguous(src0)); - GGML_ASSERT(ggml_is_contiguous(dst)); - GGML_ASSERT(ggml_are_same_shape(src0, dst)); - GGML_ASSERT(ggml_are_same_shape(src0, grad)); +static void ggml_v2_compute_forward_silu_back_f32( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + const struct ggml_v2_tensor * grad, + struct ggml_v2_tensor * dst) { + GGML_V2_ASSERT(ggml_v2_is_contiguous(grad)); + GGML_V2_ASSERT(ggml_v2_is_contiguous(src0)); + GGML_V2_ASSERT(ggml_v2_is_contiguous(dst)); + GGML_V2_ASSERT(ggml_v2_are_same_shape(src0, dst)); + 
GGML_V2_ASSERT(ggml_v2_are_same_shape(src0, grad)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_V2_TASK_INIT || params->type == GGML_V2_TASK_FINALIZE) { return; } @@ -8960,7 +8960,7 @@ static void ggml_compute_forward_silu_back_f32( const int nth = params->nth; const int nc = src0->ne[0]; - const int nr = ggml_nrows(src0); + const int nr = ggml_v2_nrows(src0); // rows per thread const int dr = (nr + nth - 1)/nth; @@ -8970,7 +8970,7 @@ static void ggml_compute_forward_silu_back_f32( const int ir1 = MIN(ir0 + dr, nr); for (int i1 = ir0; i1 < ir1; i1++) { - ggml_vec_silu_backward_f32(nc, + ggml_v2_vec_silu_backward_f32(nc, (float *) ((char *) dst->data + i1*( dst->nb[1])), (float *) ((char *) src0->data + i1*(src0->nb[1])), (float *) ((char *) grad->data + i1*(grad->nb[1]))); @@ -8986,36 +8986,36 @@ static void ggml_compute_forward_silu_back_f32( } } -static void ggml_compute_forward_silu_back( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * grad, - struct ggml_tensor * dst) { +static void ggml_v2_compute_forward_silu_back( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + const struct ggml_v2_tensor * grad, + struct ggml_v2_tensor * dst) { switch (src0->type) { - case GGML_TYPE_F32: + case GGML_V2_TYPE_F32: { - ggml_compute_forward_silu_back_f32(params, src0, grad, dst); + ggml_v2_compute_forward_silu_back_f32(params, src0, grad, dst); } break; default: { - GGML_ASSERT(false); + GGML_V2_ASSERT(false); } break; } } -// ggml_compute_forward_norm +// ggml_v2_compute_forward_norm -static void ggml_compute_forward_norm_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - struct ggml_tensor * dst) { - GGML_ASSERT(ggml_are_same_shape(src0, dst)); +static void ggml_v2_compute_forward_norm_f32( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + struct ggml_v2_tensor * dst) { + GGML_V2_ASSERT(ggml_v2_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_V2_TASK_INIT || params->type == GGML_V2_TASK_FINALIZE) { return; } - GGML_ASSERT(src0->nb[0] == sizeof(float)); + GGML_V2_ASSERT(src0->nb[0] == sizeof(float)); const int ith = params->ith; const int nth = params->nth; @@ -9041,58 +9041,58 @@ static void ggml_compute_forward_norm_f32( for (int64_t i01 = ith; i01 < ne01; i01 += nth) { const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); - ggml_float sum = 0.0; + ggml_v2_float sum = 0.0; for (int64_t i00 = 0; i00 < ne00; i00++) { - sum += (ggml_float)x[i00]; + sum += (ggml_v2_float)x[i00]; } float mean = sum/ne00; float * y = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3); - ggml_float sum2 = 0.0; + ggml_v2_float sum2 = 0.0; for (int64_t i00 = 0; i00 < ne00; i00++) { float v = x[i00] - mean; y[i00] = v; - sum2 += (ggml_float)(v*v); + sum2 += (ggml_v2_float)(v*v); } float variance = sum2/ne00; const float scale = 1.0f/sqrtf(variance + eps); - ggml_vec_scale_f32(ne00, y, scale); + ggml_v2_vec_scale_f32(ne00, y, scale); } } } } -static void ggml_compute_forward_norm( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - struct ggml_tensor * dst) { +static void ggml_v2_compute_forward_norm( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + struct ggml_v2_tensor * dst) { switch (src0->type) { - 
case GGML_TYPE_F32: + case GGML_V2_TYPE_F32: { - ggml_compute_forward_norm_f32(params, src0, dst); + ggml_v2_compute_forward_norm_f32(params, src0, dst); } break; default: { - GGML_ASSERT(false); + GGML_V2_ASSERT(false); } break; } } -static void ggml_compute_forward_rms_norm_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - struct ggml_tensor * dst) { - GGML_ASSERT(ggml_are_same_shape(src0, dst)); +static void ggml_v2_compute_forward_rms_norm_f32( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + struct ggml_v2_tensor * dst) { + GGML_V2_ASSERT(ggml_v2_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_V2_TASK_INIT || params->type == GGML_V2_TASK_FINALIZE) { return; } - GGML_ASSERT(src0->nb[0] == sizeof(float)); + GGML_V2_ASSERT(src0->nb[0] == sizeof(float)); const int ith = params->ith; const int nth = params->nth; @@ -9118,9 +9118,9 @@ static void ggml_compute_forward_rms_norm_f32( for (int64_t i01 = ith; i01 < ne01; i01 += nth) { const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); - ggml_float sum = 0.0; + ggml_v2_float sum = 0.0; for (int64_t i00 = 0; i00 < ne00; i00++) { - sum += (ggml_float)(x[i00] * x[i00]); + sum += (ggml_v2_float)(x[i00] * x[i00]); } float mean = sum/ne00; @@ -9134,41 +9134,41 @@ static void ggml_compute_forward_rms_norm_f32( const float scale = 1.0f/sqrtf(mean + eps); - ggml_vec_scale_f32(ne00, y, scale); + ggml_v2_vec_scale_f32(ne00, y, scale); } } } } -static void ggml_compute_forward_rms_norm( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - struct ggml_tensor * dst) { +static void ggml_v2_compute_forward_rms_norm( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + struct ggml_v2_tensor * dst) { switch (src0->type) { - case GGML_TYPE_F32: + case GGML_V2_TYPE_F32: { - ggml_compute_forward_rms_norm_f32(params, src0, dst); + ggml_v2_compute_forward_rms_norm_f32(params, src0, dst); } break; default: { - GGML_ASSERT(false); + GGML_V2_ASSERT(false); } break; } } -static void ggml_compute_forward_rms_norm_back_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - GGML_ASSERT(ggml_are_same_shape(src0, dst) && ggml_are_same_shape(src0, src1)); +static void ggml_v2_compute_forward_rms_norm_back_f32( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + const struct ggml_v2_tensor * src1, + struct ggml_v2_tensor * dst) { + GGML_V2_ASSERT(ggml_v2_are_same_shape(src0, dst) && ggml_v2_are_same_shape(src0, src1)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_V2_TASK_INIT || params->type == GGML_V2_TASK_FINALIZE) { return; } - GGML_ASSERT(src0->nb[0] == sizeof(float)); + GGML_V2_ASSERT(src0->nb[0] == sizeof(float)); const int ith = params->ith; const int nth = params->nth; @@ -9204,12 +9204,12 @@ static void ggml_compute_forward_rms_norm_back_f32( const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); const float * dz = (float *) ((char *) src1->data + i11*nb11 + i12*nb12 + i13*nb13); - ggml_float sum_xx = 0.0; - ggml_float sum_xdz = 0.0; + ggml_v2_float sum_xx = 0.0; + ggml_v2_float sum_xdz = 0.0; for (int64_t i00 = 0; i00 < ne00; i00++) { - sum_xx += (ggml_float)(x[i00] * x[i00]); - sum_xdz += 
(ggml_float)(x[i00] * dz[i00]); + sum_xx += (ggml_v2_float)(x[i00] * x[i00]); + sum_xdz += (ggml_v2_float)(x[i00] * dz[i00]); } //const float mean = (float)(sum_xx)/ne00; @@ -9217,7 +9217,7 @@ static void ggml_compute_forward_rms_norm_back_f32( const float sum_eps = (float)(sum_xx) + eps*ne00; //const float mean_xdz = (float)(sum_xdz)/ne00; // we could cache rms from forward pass to improve performance. - // to do this implement ggml_rms and compose ggml_rms_norm using ggml_rms. + // to do this implement ggml_v2_rms and compose ggml_v2_rms_norm using ggml_v2_rms. //const float rms = sqrtf(mean_eps); const float rrms = 1.0f / sqrtf(mean_eps); //const float scale = -rrms/(ne00 * mean_eps); // -1/(n*rms**3) @@ -9318,43 +9318,43 @@ static void ggml_compute_forward_rms_norm_back_f32( // dx := scale(dx, rrms) float * dx = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3); - ggml_vec_cpy_f32 (ne00, dx, x); - // ggml_vec_scale_f32(ne00, dx, -mean_xdz/mean_eps); - ggml_vec_scale_f32(ne00, dx, (float)(-sum_xdz)/sum_eps); - ggml_vec_acc_f32 (ne00, dx, dz); - ggml_vec_scale_f32(ne00, dx, rrms); + ggml_v2_vec_cpy_f32 (ne00, dx, x); + // ggml_v2_vec_scale_f32(ne00, dx, -mean_xdz/mean_eps); + ggml_v2_vec_scale_f32(ne00, dx, (float)(-sum_xdz)/sum_eps); + ggml_v2_vec_acc_f32 (ne00, dx, dz); + ggml_v2_vec_scale_f32(ne00, dx, rrms); } } } } -static void ggml_compute_forward_rms_norm_back( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { +static void ggml_v2_compute_forward_rms_norm_back( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + const struct ggml_v2_tensor * src1, + struct ggml_v2_tensor * dst) { switch (src0->type) { - case GGML_TYPE_F32: + case GGML_V2_TYPE_F32: { - ggml_compute_forward_rms_norm_back_f32(params, src0, src1, dst); + ggml_v2_compute_forward_rms_norm_back_f32(params, src0, src1, dst); } break; default: { - GGML_ASSERT(false); + GGML_V2_ASSERT(false); } break; } } -// ggml_compute_forward_mul_mat +// ggml_v2_compute_forward_mul_mat #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) // helper function to determine if it is better to use BLAS or not // for large matrices, BLAS is faster -static bool ggml_compute_forward_mul_mat_use_blas( - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { +static bool ggml_v2_compute_forward_mul_mat_use_blas( + const struct ggml_v2_tensor * src0, + const struct ggml_v2_tensor * src1, + struct ggml_v2_tensor * dst) { //const int64_t ne00 = src0->ne[0]; //const int64_t ne01 = src0->ne[1]; @@ -9364,8 +9364,8 @@ static bool ggml_compute_forward_mul_mat_use_blas( const int64_t ne1 = dst->ne[1]; // TODO: find the optimal values for these - if (ggml_is_contiguous(src0) && - ggml_is_contiguous(src1) && + if (ggml_v2_is_contiguous(src0) && + ggml_v2_is_contiguous(src1) && (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) { /*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/ @@ -9376,12 +9376,12 @@ static bool ggml_compute_forward_mul_mat_use_blas( } #endif -static void ggml_compute_forward_mul_mat_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - int64_t t0 = ggml_perf_time_us(); +static void ggml_v2_compute_forward_mul_mat_f32( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + const struct 
ggml_v2_tensor * src1, + struct ggml_v2_tensor * dst) { + int64_t t0 = ggml_v2_perf_time_us(); UNUSED(t0); const int64_t ne00 = src0->ne[0]; @@ -9447,32 +9447,32 @@ static void ggml_compute_forward_mul_mat_f32( // compute by src0 rows #if defined(GGML_USE_CUBLAS) - if (ggml_cuda_can_mul_mat(src0, src1, dst)) { - if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) { - ggml_cuda_mul_mat(src0, src1, dst, params->wdata, params->wsize); + if (ggml_v2_cuda_can_mul_mat(src0, src1, dst)) { + if (params->ith == 0 && params->type == GGML_V2_TASK_COMPUTE) { + ggml_v2_cuda_mul_mat(src0, src1, dst, params->wdata, params->wsize); } return; } #elif defined(GGML_USE_CLBLAST) - if (ggml_cl_can_mul_mat(src0, src1, dst)) { - if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) { - ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize); + if (ggml_v2_cl_can_mul_mat(src0, src1, dst)) { + if (params->ith == 0 && params->type == GGML_V2_TASK_COMPUTE) { + ggml_v2_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize); } return; } #endif #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)|| defined(GGML_USE_CLBLAST) - if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) { + if (ggml_v2_compute_forward_mul_mat_use_blas(src0, src1, dst)) { if (params->ith != 0) { return; } - if (params->type == GGML_TASK_INIT) { + if (params->type == GGML_V2_TASK_INIT) { return; } - if (params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_V2_TASK_FINALIZE) { return; } @@ -9486,21 +9486,21 @@ static void ggml_compute_forward_mul_mat_f32( // zT = y * xT if(quants_unshuffled) { - ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T, + ggml_v2_cl_sgemm_wrapper(GGML_V2_BLAS_ORDER_ROW_MAJOR, GGML_V2_BLAS_OP_N, GGML_V2_BLAS_OP_T, ne11, ne01, ne10, 1.0f, y, ne10, x, ne10, 0.0f, d, ne01, - GGML_TYPE_F32); + GGML_V2_TYPE_F32); } else { - ggml_cl_sgemm_wrapper_legacy(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T, + ggml_v2_cl_sgemm_wrapper_legacy(GGML_V2_BLAS_ORDER_ROW_MAJOR, GGML_V2_BLAS_OP_N, GGML_V2_BLAS_OP_T, ne11, ne01, ne10, 1.0f, y, ne10, x, ne10, 0.0f, d, ne01, - GGML_TYPE_F32); + GGML_V2_TYPE_F32); } #else cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, @@ -9511,21 +9511,21 @@ static void ggml_compute_forward_mul_mat_f32( #endif } } - //printf("CBLAS F32 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3); + //printf("CBLAS F32 = %f ms, %d x %d x %d x %d\n", (ggml_v2_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3); return; } #endif - if (params->type == GGML_TASK_INIT) { + if (params->type == GGML_V2_TASK_INIT) { return; } - if (params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_V2_TASK_FINALIZE) { return; } - // parallelize by src0 rows using ggml_vec_dot_f32 + // parallelize by src0 rows using ggml_v2_vec_dot_f32 // total rows in src0 const int nr = ne01*ne02*ne03; @@ -9555,14 +9555,14 @@ static void ggml_compute_forward_mul_mat_f32( const int i2 = i02; const int i3 = i03; - ggml_vec_dot_f32(ne00, + ggml_v2_vec_dot_f32(ne00, (float *) ((char *) dst->data + (i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), (float *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03)), (float *) ((char *) src1->data + (i11*nb11 + i12*nb12 + i13*nb13))); } } - //int64_t t1 = ggml_perf_time_us(); + //int64_t t1 = ggml_v2_perf_time_us(); //static int64_t acc = 0; //acc += t1 - t0; //if (t1 - t0 > 10) { @@ -9576,12 +9576,12 @@ static void ggml_compute_forward_mul_mat_f32( //} } -static void 
ggml_compute_forward_mul_mat_f16_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - int64_t t0 = ggml_perf_time_us(); +static void ggml_v2_compute_forward_mul_mat_f16_f32( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + const struct ggml_v2_tensor * src1, + struct ggml_v2_tensor * dst) { + int64_t t0 = ggml_v2_perf_time_us(); UNUSED(t0); const int64_t ne00 = src0->ne[0]; @@ -9618,57 +9618,57 @@ static void ggml_compute_forward_mul_mat_f16_f32( const int ith = params->ith; const int nth = params->nth; - GGML_ASSERT(ne02 == ne12); - GGML_ASSERT(ne03 == ne13); - GGML_ASSERT(ne2 == ne12); - GGML_ASSERT(ne3 == ne13); + GGML_V2_ASSERT(ne02 == ne12); + GGML_V2_ASSERT(ne03 == ne13); + GGML_V2_ASSERT(ne2 == ne12); + GGML_V2_ASSERT(ne3 == ne13); // TODO: we don't support permuted src0 - GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); + GGML_V2_ASSERT(nb00 == sizeof(ggml_v2_fp16_t)); // dst cannot be transposed or permuted - GGML_ASSERT(nb0 == sizeof(float)); - GGML_ASSERT(nb0 <= nb1); - GGML_ASSERT(nb1 <= nb2); - GGML_ASSERT(nb2 <= nb3); + GGML_V2_ASSERT(nb0 == sizeof(float)); + GGML_V2_ASSERT(nb0 <= nb1); + GGML_V2_ASSERT(nb1 <= nb2); + GGML_V2_ASSERT(nb2 <= nb3); - GGML_ASSERT(ne0 == ne01); - GGML_ASSERT(ne1 == ne11); - GGML_ASSERT(ne2 == ne02); - GGML_ASSERT(ne3 == ne03); + GGML_V2_ASSERT(ne0 == ne01); + GGML_V2_ASSERT(ne1 == ne11); + GGML_V2_ASSERT(ne2 == ne02); + GGML_V2_ASSERT(ne3 == ne03); // nb01 >= nb00 - src0 is not transposed // compute by src0 rows #if defined(GGML_USE_CUBLAS) - if (ggml_cuda_can_mul_mat(src0, src1, dst)) { - if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) { - ggml_cuda_mul_mat(src0, src1, dst, params->wdata, params->wsize); + if (ggml_v2_cuda_can_mul_mat(src0, src1, dst)) { + if (params->ith == 0 && params->type == GGML_V2_TASK_COMPUTE) { + ggml_v2_cuda_mul_mat(src0, src1, dst, params->wdata, params->wsize); } return; } #elif defined(GGML_USE_CLBLAST) - if (ggml_cl_can_mul_mat(src0, src1, dst)) { - if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) { - ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize); + if (ggml_v2_cl_can_mul_mat(src0, src1, dst)) { + if (params->ith == 0 && params->type == GGML_V2_TASK_COMPUTE) { + ggml_v2_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize); } return; } #endif #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)|| defined(GGML_USE_CLBLAST) - if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) { - GGML_ASSERT(nb10 == sizeof(float)); + if (ggml_v2_compute_forward_mul_mat_use_blas(src0, src1, dst)) { + GGML_V2_ASSERT(nb10 == sizeof(float)); if (params->ith != 0) { return; } - if (params->type == GGML_TASK_INIT) { + if (params->type == GGML_V2_TASK_INIT) { return; } - if (params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_V2_TASK_FINALIZE) { return; } @@ -9679,7 +9679,7 @@ static void ggml_compute_forward_mul_mat_f16_f32( size_t id = 0; for (int64_t i01 = 0; i01 < ne01; ++i01) { for (int64_t i00 = 0; i00 < ne00; ++i00) { - wdata[id++] = GGML_FP16_TO_FP32(*(ggml_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00)); + wdata[id++] = GGML_V2_FP16_TO_FP32(*(ggml_v2_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00)); } } @@ -9695,21 +9695,21 @@ static void ggml_compute_forward_mul_mat_f16_f32( // zT = y * xT if(quants_unshuffled) { - ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, 
GGML_BLAS_OP_N, GGML_BLAS_OP_T, + ggml_v2_cl_sgemm_wrapper(GGML_V2_BLAS_ORDER_ROW_MAJOR, GGML_V2_BLAS_OP_N, GGML_V2_BLAS_OP_T, ne11, ne01, ne10, 1.0f, y, ne10, x, ne10, 0.0f, d, ne01, - GGML_TYPE_F32); + GGML_V2_TYPE_F32); } else { - ggml_cl_sgemm_wrapper_legacy(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T, + ggml_v2_cl_sgemm_wrapper_legacy(GGML_V2_BLAS_ORDER_ROW_MAJOR, GGML_V2_BLAS_OP_N, GGML_V2_BLAS_OP_T, ne11, ne01, ne10, 1.0f, y, ne10, x, ne10, 0.0f, d, ne01, - GGML_TYPE_F32); + GGML_V2_TYPE_F32); } #else const float * x = wdata; @@ -9727,40 +9727,40 @@ static void ggml_compute_forward_mul_mat_f16_f32( } } - /*printf("CBLAS F16 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);*/ + /*printf("CBLAS F16 = %f ms, %d x %d x %d x %d\n", (ggml_v2_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);*/ return; } #endif - if (params->type == GGML_TASK_INIT) { - ggml_fp16_t * const wdata = params->wdata; + if (params->type == GGML_V2_TASK_INIT) { + ggml_v2_fp16_t * const wdata = params->wdata; size_t id = 0; for (int64_t i13 = 0; i13 < ne13; ++i13) { for (int64_t i12 = 0; i12 < ne12; ++i12) { for (int64_t i11 = 0; i11 < ne11; ++i11) { for (int64_t i10 = 0; i10 < ne10; ++i10) { - wdata[id++] = GGML_FP32_TO_FP16(*(float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10)); + wdata[id++] = GGML_V2_FP32_TO_FP16(*(float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10)); } } } } - GGML_ASSERT(id*sizeof(ggml_fp16_t) <= params->wsize); + GGML_V2_ASSERT(id*sizeof(ggml_v2_fp16_t) <= params->wsize); return; } - if (params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_V2_TASK_FINALIZE) { return; } // fp16 -> half the size, so divide by 2 // TODO: do not support transposed src1 - assert(nb10/2 == sizeof(ggml_fp16_t)); + assert(nb10/2 == sizeof(ggml_v2_fp16_t)); - // parallelize by src0 rows using ggml_vec_dot_f16 + // parallelize by src0 rows using ggml_v2_vec_dot_f16 // total rows in src0 const int nr = ne01*ne02*ne03; @@ -9772,7 +9772,7 @@ static void ggml_compute_forward_mul_mat_f16_f32( const int ir0 = dr*ith; const int ir1 = MIN(ir0 + dr, nr); - ggml_fp16_t * wdata = params->wdata; + ggml_v2_fp16_t * wdata = params->wdata; for (int ir = ir0; ir < ir1; ++ir) { // src0 indices @@ -9787,17 +9787,17 @@ static void ggml_compute_forward_mul_mat_f16_f32( const int i2 = i02; const int i3 = i03; - ggml_fp16_t * src0_row = (ggml_fp16_t *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03)); - ggml_fp16_t * src1_col = wdata + ( 0 + i12*ne11 + i13*ne12*ne11)*ne00; + ggml_v2_fp16_t * src0_row = (ggml_v2_fp16_t *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03)); + ggml_v2_fp16_t * src1_col = wdata + ( 0 + i12*ne11 + i13*ne12*ne11)*ne00; float * dst_col = (float *) ((char *) dst->data + (i0*nb0 + 0*nb1 + i2*nb2 + i3*nb3)); for (int64_t ic = 0; ic < ne11; ++ic) { - ggml_vec_dot_f16(ne00, &dst_col[ic*ne0], src0_row, src1_col + ic*ne00); + ggml_v2_vec_dot_f16(ne00, &dst_col[ic*ne0], src0_row, src1_col + ic*ne00); } } - //int64_t t1 = ggml_time_us(); + //int64_t t1 = ggml_v2_time_us(); //static int64_t acc = 0; //acc += t1 - t0; //if (t1 - t0 > 10) { @@ -9810,12 +9810,12 @@ static void ggml_compute_forward_mul_mat_f16_f32( //} } -static void ggml_compute_forward_mul_mat_q_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - int64_t t0 = ggml_perf_time_us(); +static void ggml_v2_compute_forward_mul_mat_q_f32( + const 
struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + const struct ggml_v2_tensor * src1, + struct ggml_v2_tensor * dst) { + int64_t t0 = ggml_v2_perf_time_us(); UNUSED(t0); const int64_t ne00 = src0->ne[0]; @@ -9851,61 +9851,61 @@ static void ggml_compute_forward_mul_mat_q_f32( const int ith = params->ith; const int nth = params->nth; - GGML_ASSERT(ne02 == ne12); - GGML_ASSERT(ne03 == ne13); - GGML_ASSERT(ne2 == ne12); - GGML_ASSERT(ne3 == ne13); + GGML_V2_ASSERT(ne02 == ne12); + GGML_V2_ASSERT(ne03 == ne13); + GGML_V2_ASSERT(ne2 == ne12); + GGML_V2_ASSERT(ne3 == ne13); - const enum ggml_type type = src0->type; + const enum ggml_v2_type type = src0->type; quantize_row_q_t const quantize_row_q_dot = get_quantize_fn(type).quantize_row_q_dot; vec_dot_q_t const vec_dot_q = get_quantize_fn(type).vec_dot_q; - enum ggml_type const vec_dot_type = get_quantize_fn(type).vec_dot_type; + enum ggml_v2_type const vec_dot_type = get_quantize_fn(type).vec_dot_type; // we don't support permuted src0 or src1 - GGML_ASSERT(nb00 == (int) GGML_TYPE_SIZE[type]); - GGML_ASSERT(nb10 == sizeof(float)); + GGML_V2_ASSERT(nb00 == (int) GGML_V2_TYPE_SIZE[type]); + GGML_V2_ASSERT(nb10 == sizeof(float)); // dst cannot be transposed or permuted - GGML_ASSERT(nb0 == sizeof(float)); - GGML_ASSERT(nb0 <= nb1); - GGML_ASSERT(nb1 <= nb2); - GGML_ASSERT(nb2 <= nb3); + GGML_V2_ASSERT(nb0 == sizeof(float)); + GGML_V2_ASSERT(nb0 <= nb1); + GGML_V2_ASSERT(nb1 <= nb2); + GGML_V2_ASSERT(nb2 <= nb3); - GGML_ASSERT(ne0 == ne01); - GGML_ASSERT(ne1 == ne11); - GGML_ASSERT(ne2 == ne02); - GGML_ASSERT(ne3 == ne03); + GGML_V2_ASSERT(ne0 == ne01); + GGML_V2_ASSERT(ne1 == ne11); + GGML_V2_ASSERT(ne2 == ne02); + GGML_V2_ASSERT(ne3 == ne03); // nb01 >= nb00 - src0 is not transposed // compute by src0 rows #if defined(GGML_USE_CUBLAS) - if (ggml_cuda_can_mul_mat(src0, src1, dst)) { - if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) { - ggml_cuda_mul_mat(src0, src1, dst, params->wdata, params->wsize); + if (ggml_v2_cuda_can_mul_mat(src0, src1, dst)) { + if (params->ith == 0 && params->type == GGML_V2_TASK_COMPUTE) { + ggml_v2_cuda_mul_mat(src0, src1, dst, params->wdata, params->wsize); } return; } #elif defined(GGML_USE_CLBLAST) - if (ggml_cl_can_mul_mat(src0, src1, dst)) { - if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) { - ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize); + if (ggml_v2_cl_can_mul_mat(src0, src1, dst)) { + if (params->ith == 0 && params->type == GGML_V2_TASK_COMPUTE) { + ggml_v2_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize); } return; } #endif #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)|| defined(GGML_USE_CLBLAST) - if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) { + if (ggml_v2_compute_forward_mul_mat_use_blas(src0, src1, dst)) { if (params->ith != 0) { return; } - if (params->type == GGML_TASK_INIT) { + if (params->type == GGML_V2_TASK_INIT) { return; } - if (params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_V2_TASK_FINALIZE) { return; } @@ -9938,7 +9938,7 @@ static void ggml_compute_forward_mul_mat_q_f32( // zT = y * xT if(quants_unshuffled) { - ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T, + ggml_v2_cl_sgemm_wrapper(GGML_V2_BLAS_ORDER_ROW_MAJOR, GGML_V2_BLAS_OP_N, GGML_V2_BLAS_OP_T, ne11, ne01, ne10, 1.0f, y, ne10, x, ne10, @@ -9947,7 +9947,7 @@ static void ggml_compute_forward_mul_mat_q_f32( } else { - ggml_cl_sgemm_wrapper_legacy(GGML_BLAS_ORDER_ROW_MAJOR, 
GGML_BLAS_OP_N, GGML_BLAS_OP_T, + ggml_v2_cl_sgemm_wrapper_legacy(GGML_V2_BLAS_ORDER_ROW_MAJOR, GGML_V2_BLAS_OP_N, GGML_V2_BLAS_OP_T, ne11, ne01, ne10, 1.0f, y, ne10, x, ne10, @@ -9964,15 +9964,15 @@ static void ggml_compute_forward_mul_mat_q_f32( } } - //printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3); + //printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_v2_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3); return; } #endif - if (params->type == GGML_TASK_INIT) { + if (params->type == GGML_V2_TASK_INIT) { char * wdata = params->wdata; - const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type]; + const size_t row_size = ne10*GGML_V2_TYPE_SIZE[vec_dot_type]/GGML_V2_BLCK_SIZE[vec_dot_type]; for (int64_t i13 = 0; i13 < ne13; ++i13) { for (int64_t i12 = 0; i12 < ne12; ++i12) { @@ -9986,11 +9986,11 @@ static void ggml_compute_forward_mul_mat_q_f32( return; } - if (params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_V2_TASK_FINALIZE) { return; } - // parallelize by src0 rows using ggml_vec_dot_q + // parallelize by src0 rows using ggml_v2_vec_dot_q // total rows in src0 const int nr = ne01*ne02*ne03; @@ -10003,7 +10003,7 @@ static void ggml_compute_forward_mul_mat_q_f32( const int ir1 = MIN(ir0 + dr, nr); void * wdata = params->wdata; - const size_t row_size = ne00*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type]; + const size_t row_size = ne00*GGML_V2_TYPE_SIZE[vec_dot_type]/GGML_V2_BLCK_SIZE[vec_dot_type]; for (int ir = ir0; ir < ir1; ++ir) { // src0 indices @@ -10030,7 +10030,7 @@ static void ggml_compute_forward_mul_mat_q_f32( } } - //int64_t t1 = ggml_time_us(); + //int64_t t1 = ggml_v2_time_us(); //static int64_t acc = 0; //acc += t1 - t0; //if (t1 - t0 > 10) { @@ -10043,52 +10043,52 @@ static void ggml_compute_forward_mul_mat_q_f32( //} } -static void ggml_compute_forward_mul_mat( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { +static void ggml_v2_compute_forward_mul_mat( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + const struct ggml_v2_tensor * src1, + struct ggml_v2_tensor * dst) { switch (src0->type) { - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q4_2: - case GGML_TYPE_Q4_3: - case GGML_TYPE_Q5_0: - case GGML_TYPE_Q5_1: - case GGML_TYPE_Q8_0: - case GGML_TYPE_Q8_1: - case GGML_TYPE_Q8_1B: + case GGML_V2_TYPE_Q4_0: + case GGML_V2_TYPE_Q4_1: + case GGML_V2_TYPE_Q4_2: + case GGML_V2_TYPE_Q4_3: + case GGML_V2_TYPE_Q5_0: + case GGML_V2_TYPE_Q5_1: + case GGML_V2_TYPE_Q8_0: + case GGML_V2_TYPE_Q8_1: + case GGML_V2_TYPE_Q8_1B: { - ggml_compute_forward_mul_mat_q_f32(params, src0, src1, dst); + ggml_v2_compute_forward_mul_mat_q_f32(params, src0, src1, dst); } break; - case GGML_TYPE_F16: + case GGML_V2_TYPE_F16: { - ggml_compute_forward_mul_mat_f16_f32(params, src0, src1, dst); + ggml_v2_compute_forward_mul_mat_f16_f32(params, src0, src1, dst); } break; - case GGML_TYPE_F32: + case GGML_V2_TYPE_F32: { - ggml_compute_forward_mul_mat_f32(params, src0, src1, dst); + ggml_v2_compute_forward_mul_mat_f32(params, src0, src1, dst); } break; default: { - GGML_ASSERT(false); + GGML_V2_ASSERT(false); } break; } } -// ggml_compute_forward_scale +// ggml_v2_compute_forward_scale -static void ggml_compute_forward_scale_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct 
ggml_tensor * dst) { - GGML_ASSERT(ggml_is_contiguous(src0)); - GGML_ASSERT(ggml_is_contiguous(dst)); - GGML_ASSERT(ggml_are_same_shape(src0, dst)); - GGML_ASSERT(ggml_is_scalar(src1)); +static void ggml_v2_compute_forward_scale_f32( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + const struct ggml_v2_tensor * src1, + struct ggml_v2_tensor * dst) { + GGML_V2_ASSERT(ggml_v2_is_contiguous(src0)); + GGML_V2_ASSERT(ggml_v2_is_contiguous(dst)); + GGML_V2_ASSERT(ggml_v2_are_same_shape(src0, dst)); + GGML_V2_ASSERT(ggml_v2_is_scalar(src1)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_V2_TASK_INIT || params->type == GGML_V2_TASK_FINALIZE) { return; } @@ -10099,7 +10099,7 @@ static void ggml_compute_forward_scale_f32( const int nth = params->nth; const int nc = src0->ne[0]; - const int nr = ggml_nrows(src0); + const int nr = ggml_v2_nrows(src0); // rows per thread const int dr = (nr + nth - 1)/nth; @@ -10118,40 +10118,40 @@ static void ggml_compute_forward_scale_f32( // src0 is same shape as dst => same indices memcpy((char *)dst->data + i1*nb1, (char *)src0->data + i1*nb01, nc * sizeof(float)); } - ggml_vec_scale_f32(nc, (float *) ((char *) dst->data + i1*nb1), v); + ggml_v2_vec_scale_f32(nc, (float *) ((char *) dst->data + i1*nb1), v); } } -static void ggml_compute_forward_scale( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { +static void ggml_v2_compute_forward_scale( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + const struct ggml_v2_tensor * src1, + struct ggml_v2_tensor * dst) { switch (src0->type) { - case GGML_TYPE_F32: + case GGML_V2_TYPE_F32: { - ggml_compute_forward_scale_f32(params, src0, src1, dst); + ggml_v2_compute_forward_scale_f32(params, src0, src1, dst); } break; default: { - GGML_ASSERT(false); + GGML_V2_ASSERT(false); } break; } } -// ggml_compute_forward_set +// ggml_v2_compute_forward_set -static void ggml_compute_forward_set_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - const struct ggml_tensor * opt0, - struct ggml_tensor * dst) { - GGML_ASSERT(ggml_are_same_shape(src0, dst)); - GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0)); +static void ggml_v2_compute_forward_set_f32( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + const struct ggml_v2_tensor * src1, + const struct ggml_v2_tensor * opt0, + struct ggml_v2_tensor * dst) { + GGML_V2_ASSERT(ggml_v2_are_same_shape(src0, dst)); + GGML_V2_ASSERT(ggml_v2_is_contiguous(dst) && ggml_v2_is_contiguous(src0)); - GGML_ASSERT(opt0->type == GGML_TYPE_I32); - GGML_ASSERT(ggml_nelements(opt0) == 5); + GGML_V2_ASSERT(opt0->type == GGML_V2_TYPE_I32); + GGML_V2_ASSERT(ggml_v2_nelements(opt0) == 5); // view src0 and dst with these strides and data offset inbytes during set // nb0 is implicitely element_size because src0 and dst are contiguous @@ -10161,23 +10161,23 @@ static void ggml_compute_forward_set_f32( size_t offset = ((int32_t *) opt0->data)[3]; bool inplace = (bool) ((int32_t *) opt0->data)[4]; - if (!inplace && (params->type == GGML_TASK_INIT)) { + if (!inplace && (params->type == GGML_V2_TASK_INIT)) { // memcpy needs to be synchronized across threads to avoid race conditions. 
// => do it in INIT phase memcpy( ((char *) dst->data), ((char *) src0->data), - ggml_nbytes(dst)); + ggml_v2_nbytes(dst)); } - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_V2_TASK_INIT || params->type == GGML_V2_TASK_FINALIZE) { return; } const int ith = params->ith; const int nth = params->nth; - const int nr = ggml_nrows(src1); + const int nr = ggml_v2_nrows(src1); const int nc = src1->ne[0]; const int64_t ne10 = src1->ne[0]; @@ -10191,16 +10191,16 @@ static void ggml_compute_forward_set_f32( const size_t nb13 = src1->nb[3]; // src0 and dst as viewed during set - const size_t nb0 = ggml_element_size(src0); + const size_t nb0 = ggml_v2_element_size(src0); const int im0 = (ne10 == 0 ? 0 : ne10-1); const int im1 = (ne11 == 0 ? 0 : ne11-1); const int im2 = (ne12 == 0 ? 0 : ne12-1); const int im3 = (ne13 == 0 ? 0 : ne13-1); - GGML_ASSERT(offset + im0*nb0 + im1*nb1 + im2*nb2 + im3*nb3 < ggml_nbytes(dst)); + GGML_V2_ASSERT(offset + im0*nb0 + im1*nb1 + im2*nb2 + im3*nb3 < ggml_v2_nbytes(dst)); - GGML_ASSERT(nb10 == sizeof(float)); + GGML_V2_ASSERT(nb10 == sizeof(float)); // rows per thread const int dr = (nr + nth - 1)/nth; @@ -10216,119 +10216,119 @@ static void ggml_compute_forward_set_f32( const int i2 = (ir - i3*ne12*ne11)/ne11; const int i1 = (ir - i3*ne12*ne11 - i2*ne11); - ggml_vec_cpy_f32(nc, + ggml_v2_vec_cpy_f32(nc, (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + offset), (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11)); } } -static void ggml_compute_forward_set( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - const struct ggml_tensor * opt0, - struct ggml_tensor * dst) { +static void ggml_v2_compute_forward_set( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + const struct ggml_v2_tensor * src1, + const struct ggml_v2_tensor * opt0, + struct ggml_v2_tensor * dst) { switch (src0->type) { - case GGML_TYPE_F32: + case GGML_V2_TYPE_F32: { - ggml_compute_forward_set_f32(params, src0, src1, opt0, dst); + ggml_v2_compute_forward_set_f32(params, src0, src1, opt0, dst); } break; - case GGML_TYPE_F16: - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q5_0: - case GGML_TYPE_Q5_1: - case GGML_TYPE_Q8_0: - case GGML_TYPE_Q8_1: + case GGML_V2_TYPE_F16: + case GGML_V2_TYPE_Q4_0: + case GGML_V2_TYPE_Q4_1: + case GGML_V2_TYPE_Q5_0: + case GGML_V2_TYPE_Q5_1: + case GGML_V2_TYPE_Q8_0: + case GGML_V2_TYPE_Q8_1: default: { - GGML_ASSERT(false); + GGML_V2_ASSERT(false); } break; } } -// ggml_compute_forward_cpy +// ggml_v2_compute_forward_cpy -static void ggml_compute_forward_cpy( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - struct ggml_tensor * dst) { - ggml_compute_forward_dup(params, src0, dst); +static void ggml_v2_compute_forward_cpy( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + struct ggml_v2_tensor * dst) { + ggml_v2_compute_forward_dup(params, src0, dst); } -// ggml_compute_forward_cont +// ggml_v2_compute_forward_cont -static void ggml_compute_forward_cont( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - struct ggml_tensor * dst) { - ggml_compute_forward_dup(params, src0, dst); +static void ggml_v2_compute_forward_cont( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + struct ggml_v2_tensor * dst) { + ggml_v2_compute_forward_dup(params, src0, 
dst); } -// ggml_compute_forward_reshape +// ggml_v2_compute_forward_reshape -static void ggml_compute_forward_reshape( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - struct ggml_tensor * dst) { +static void ggml_v2_compute_forward_reshape( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + struct ggml_v2_tensor * dst) { // NOP UNUSED(params); UNUSED(src0); UNUSED(dst); } -// ggml_compute_forward_view +// ggml_v2_compute_forward_view -static void ggml_compute_forward_view( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0) { +static void ggml_v2_compute_forward_view( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0) { // NOP UNUSED(params); UNUSED(src0); } -// ggml_compute_forward_permute +// ggml_v2_compute_forward_permute -static void ggml_compute_forward_permute( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0) { +static void ggml_v2_compute_forward_permute( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0) { // NOP UNUSED(params); UNUSED(src0); } -// ggml_compute_forward_transpose +// ggml_v2_compute_forward_transpose -static void ggml_compute_forward_transpose( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0) { +static void ggml_v2_compute_forward_transpose( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0) { // NOP UNUSED(params); UNUSED(src0); } -// ggml_compute_forward_get_rows +// ggml_v2_compute_forward_get_rows -static void ggml_compute_forward_get_rows_q( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { +static void ggml_v2_compute_forward_get_rows_q( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + const struct ggml_v2_tensor * src1, + struct ggml_v2_tensor * dst) { assert(params->ith == 0); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_V2_TASK_INIT || params->type == GGML_V2_TASK_FINALIZE) { return; } const int nc = src0->ne[0]; - const int nr = ggml_nelements(src1); - const enum ggml_type type = src0->type; + const int nr = ggml_v2_nelements(src1); + const enum ggml_v2_type type = src0->type; dequantize_row_q_t const dequantize_row_q = get_quantize_fn(type).dequantize_row_q; assert( dst->ne[0] == nc); assert( dst->ne[1] == nr); - assert(src0->nb[0] == GGML_TYPE_SIZE[type]); + assert(src0->nb[0] == GGML_V2_TYPE_SIZE[type]); for (int i = 0; i < nr; ++i) { const int r = ((int32_t *) src1->data)[i]; @@ -10339,47 +10339,47 @@ static void ggml_compute_forward_get_rows_q( } } -static void ggml_compute_forward_get_rows_f16( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { +static void ggml_v2_compute_forward_get_rows_f16( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + const struct ggml_v2_tensor * src1, + struct ggml_v2_tensor * dst) { assert(params->ith == 0); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_V2_TASK_INIT || params->type == GGML_V2_TASK_FINALIZE) { return; } const int nc = src0->ne[0]; - const int nr = ggml_nelements(src1); + const int nr = ggml_v2_nelements(src1); assert( dst->ne[0] == nc); assert( dst->ne[1] == nr); - 
assert(src0->nb[0] == sizeof(ggml_fp16_t)); + assert(src0->nb[0] == sizeof(ggml_v2_fp16_t)); for (int i = 0; i < nr; ++i) { const int r = ((int32_t *) src1->data)[i]; for (int j = 0; j < nc; ++j) { - ggml_fp16_t v = ((ggml_fp16_t *) ((char *) src0->data + r*src0->nb[1]))[j]; - ((float *) ((char *) dst->data + i*dst->nb[1]))[j] = GGML_FP16_TO_FP32(v); + ggml_v2_fp16_t v = ((ggml_v2_fp16_t *) ((char *) src0->data + r*src0->nb[1]))[j]; + ((float *) ((char *) dst->data + i*dst->nb[1]))[j] = GGML_V2_FP16_TO_FP32(v); } } } -static void ggml_compute_forward_get_rows_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { +static void ggml_v2_compute_forward_get_rows_f32( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + const struct ggml_v2_tensor * src1, + struct ggml_v2_tensor * dst) { assert(params->ith == 0); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_V2_TASK_INIT || params->type == GGML_V2_TASK_FINALIZE) { return; } const int nc = src0->ne[0]; - const int nr = ggml_nelements(src1); + const int nr = ggml_v2_nelements(src1); assert( dst->ne[0] == nc); assert( dst->ne[1] == nr); @@ -10388,41 +10388,41 @@ static void ggml_compute_forward_get_rows_f32( for (int i = 0; i < nr; ++i) { const int r = ((int32_t *) src1->data)[i]; - ggml_vec_cpy_f32(nc, + ggml_v2_vec_cpy_f32(nc, (float *) ((char *) dst->data + i*dst->nb[1]), (float *) ((char *) src0->data + r*src0->nb[1])); } } -static void ggml_compute_forward_get_rows( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { +static void ggml_v2_compute_forward_get_rows( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + const struct ggml_v2_tensor * src1, + struct ggml_v2_tensor * dst) { switch (src0->type) { - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q4_2: - case GGML_TYPE_Q4_3: - case GGML_TYPE_Q5_0: - case GGML_TYPE_Q5_1: - case GGML_TYPE_Q8_0: - case GGML_TYPE_Q8_1: - case GGML_TYPE_Q8_1B: + case GGML_V2_TYPE_Q4_0: + case GGML_V2_TYPE_Q4_1: + case GGML_V2_TYPE_Q4_2: + case GGML_V2_TYPE_Q4_3: + case GGML_V2_TYPE_Q5_0: + case GGML_V2_TYPE_Q5_1: + case GGML_V2_TYPE_Q8_0: + case GGML_V2_TYPE_Q8_1: + case GGML_V2_TYPE_Q8_1B: { - ggml_compute_forward_get_rows_q(params, src0, src1, dst); + ggml_v2_compute_forward_get_rows_q(params, src0, src1, dst); } break; - case GGML_TYPE_F16: + case GGML_V2_TYPE_F16: { - ggml_compute_forward_get_rows_f16(params, src0, src1, dst); + ggml_v2_compute_forward_get_rows_f16(params, src0, src1, dst); } break; - case GGML_TYPE_F32: + case GGML_V2_TYPE_F32: { - ggml_compute_forward_get_rows_f32(params, src0, src1, dst); + ggml_v2_compute_forward_get_rows_f32(params, src0, src1, dst); } break; default: { - GGML_ASSERT(false); + GGML_V2_ASSERT(false); } break; } @@ -10445,68 +10445,68 @@ static void ggml_compute_forward_get_rows( //} } -// ggml_compute_forward_get_rows_back +// ggml_v2_compute_forward_get_rows_back -static void ggml_compute_forward_get_rows_back_f32_f16( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - const struct ggml_tensor * opt0, - struct ggml_tensor * dst) { - GGML_ASSERT(params->ith == 0); - GGML_ASSERT(ggml_are_same_shape(opt0, dst)); - GGML_ASSERT(ggml_is_contiguous(opt0)); - 
GGML_ASSERT(ggml_is_contiguous(dst)); +static void ggml_v2_compute_forward_get_rows_back_f32_f16( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + const struct ggml_v2_tensor * src1, + const struct ggml_v2_tensor * opt0, + struct ggml_v2_tensor * dst) { + GGML_V2_ASSERT(params->ith == 0); + GGML_V2_ASSERT(ggml_v2_are_same_shape(opt0, dst)); + GGML_V2_ASSERT(ggml_v2_is_contiguous(opt0)); + GGML_V2_ASSERT(ggml_v2_is_contiguous(dst)); - ggml_compute_forward_dup_same_cont(params, opt0, dst); + ggml_v2_compute_forward_dup_same_cont(params, opt0, dst); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_V2_TASK_INIT || params->type == GGML_V2_TASK_FINALIZE) { return; } const int nc = src0->ne[0]; - const int nr = ggml_nelements(src1); + const int nr = ggml_v2_nelements(src1); - GGML_ASSERT( dst->ne[0] == nc); - GGML_ASSERT(src0->nb[0] == sizeof(ggml_fp16_t)); + GGML_V2_ASSERT( dst->ne[0] == nc); + GGML_V2_ASSERT(src0->nb[0] == sizeof(ggml_v2_fp16_t)); for (int i = 0; i < nr; ++i) { const int r = ((int32_t *) src1->data)[i]; for (int j = 0; j < nc; ++j) { - ggml_fp16_t v = ((ggml_fp16_t *) ((char *) src0->data + i*src0->nb[1]))[j]; - ((float *) ((char *) dst->data + r*dst->nb[1]))[j] += GGML_FP16_TO_FP32(v); + ggml_v2_fp16_t v = ((ggml_v2_fp16_t *) ((char *) src0->data + i*src0->nb[1]))[j]; + ((float *) ((char *) dst->data + r*dst->nb[1]))[j] += GGML_V2_FP16_TO_FP32(v); } } } -static void ggml_compute_forward_get_rows_back_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - const struct ggml_tensor * opt0, - struct ggml_tensor * dst) { - GGML_ASSERT(params->ith == 0); - GGML_ASSERT(ggml_are_same_shape(opt0, dst)); - GGML_ASSERT(ggml_is_contiguous(opt0)); - GGML_ASSERT(ggml_is_contiguous(dst)); +static void ggml_v2_compute_forward_get_rows_back_f32( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + const struct ggml_v2_tensor * src1, + const struct ggml_v2_tensor * opt0, + struct ggml_v2_tensor * dst) { + GGML_V2_ASSERT(params->ith == 0); + GGML_V2_ASSERT(ggml_v2_are_same_shape(opt0, dst)); + GGML_V2_ASSERT(ggml_v2_is_contiguous(opt0)); + GGML_V2_ASSERT(ggml_v2_is_contiguous(dst)); - ggml_compute_forward_dup_same_cont(params, opt0, dst); + ggml_v2_compute_forward_dup_same_cont(params, opt0, dst); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_V2_TASK_INIT || params->type == GGML_V2_TASK_FINALIZE) { return; } const int nc = src0->ne[0]; - const int nr = ggml_nelements(src1); + const int nr = ggml_v2_nelements(src1); - GGML_ASSERT( dst->ne[0] == nc); - GGML_ASSERT(src0->nb[0] == sizeof(float)); + GGML_V2_ASSERT( dst->ne[0] == nc); + GGML_V2_ASSERT(src0->nb[0] == sizeof(float)); for (int i = 0; i < nr; ++i) { const int r = ((int32_t *) src1->data)[i]; - ggml_vec_add_f32(nc, + ggml_v2_vec_add_f32(nc, (float *) ((char *) dst->data + r*dst->nb[1]), (float *) ((char *) dst->data + r*dst->nb[1]), (float *) ((char *) src0->data + i*src0->nb[1])); @@ -10514,24 +10514,24 @@ static void ggml_compute_forward_get_rows_back_f32( } -static void ggml_compute_forward_get_rows_back( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - const struct ggml_tensor * opt0, - struct ggml_tensor * dst) { +static void ggml_v2_compute_forward_get_rows_back( + const struct ggml_v2_compute_params * params, + 
const struct ggml_v2_tensor * src0, + const struct ggml_v2_tensor * src1, + const struct ggml_v2_tensor * opt0, + struct ggml_v2_tensor * dst) { switch (src0->type) { - case GGML_TYPE_F16: + case GGML_V2_TYPE_F16: { - ggml_compute_forward_get_rows_back_f32_f16(params, src0, src1, opt0, dst); + ggml_v2_compute_forward_get_rows_back_f32_f16(params, src0, src1, opt0, dst); } break; - case GGML_TYPE_F32: + case GGML_V2_TYPE_F32: { - ggml_compute_forward_get_rows_back_f32(params, src0, src1, opt0, dst); + ggml_v2_compute_forward_get_rows_back_f32(params, src0, src1, opt0, dst); } break; default: { - GGML_ASSERT(false); + GGML_V2_ASSERT(false); } break; } @@ -10554,15 +10554,15 @@ static void ggml_compute_forward_get_rows_back( //} } -// ggml_compute_forward_diag +// ggml_v2_compute_forward_diag -static void ggml_compute_forward_diag_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - struct ggml_tensor * dst) { - GGML_ASSERT(params->ith == 0); +static void ggml_v2_compute_forward_diag_f32( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + struct ggml_v2_tensor * dst) { + GGML_V2_ASSERT(params->ith == 0); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_V2_TASK_INIT || params->type == GGML_V2_TASK_FINALIZE) { return; } @@ -10576,11 +10576,11 @@ static void ggml_compute_forward_diag_f32( const int ne1 = dst->ne[1]; const int ne2 = dst->ne[2]; const int ne3 = dst->ne[3]; - GGML_ASSERT(ne00 == ne0); - GGML_ASSERT(ne00 == ne1); - GGML_ASSERT(ne01 == 1); - GGML_ASSERT(ne02 == ne2); - GGML_ASSERT(ne03 == ne3); + GGML_V2_ASSERT(ne00 == ne0); + GGML_V2_ASSERT(ne00 == ne1); + GGML_V2_ASSERT(ne01 == 1); + GGML_V2_ASSERT(ne02 == ne2); + GGML_V2_ASSERT(ne03 == ne3); const int nb00 = src0->nb[0]; //const int nb01 = src0->nb[1]; @@ -10591,8 +10591,8 @@ static void ggml_compute_forward_diag_f32( const int nb2 = dst->nb[2]; const int nb3 = dst->nb[3]; - GGML_ASSERT(nb00 == sizeof(float)); - GGML_ASSERT(nb0 == sizeof(float)); + GGML_V2_ASSERT(nb00 == sizeof(float)); + GGML_V2_ASSERT(nb0 == sizeof(float)); for (int i3 = 0; i3 < ne3; i3++) { for (int i2 = 0; i2 < ne2; i2++) { @@ -10611,32 +10611,32 @@ static void ggml_compute_forward_diag_f32( } } -static void ggml_compute_forward_diag( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - struct ggml_tensor * dst) { +static void ggml_v2_compute_forward_diag( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + struct ggml_v2_tensor * dst) { switch (src0->type) { - case GGML_TYPE_F32: + case GGML_V2_TYPE_F32: { - ggml_compute_forward_diag_f32(params, src0, dst); + ggml_v2_compute_forward_diag_f32(params, src0, dst); } break; default: { - GGML_ASSERT(false); + GGML_V2_ASSERT(false); } break; } } -// ggml_compute_forward_diag_mask_inf +// ggml_v2_compute_forward_diag_mask_inf -static void ggml_compute_forward_diag_mask_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst, +static void ggml_v2_compute_forward_diag_mask_f32( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + const struct ggml_v2_tensor * src1, + struct ggml_v2_tensor * dst, const float value) { - assert(src1->type == GGML_TYPE_I32); - assert(ggml_nelements(src1) == 2); + assert(src1->type == GGML_V2_TYPE_I32); + assert(ggml_v2_nelements(src1) == 2); const int ith = params->ith; const 
int nth = params->nth; @@ -10645,24 +10645,24 @@ static void ggml_compute_forward_diag_mask_f32( const bool inplace = (bool)((int32_t *) src1->data)[1]; assert(n_past >= 0); - if (!inplace && (params->type == GGML_TASK_INIT)) { + if (!inplace && (params->type == GGML_V2_TASK_INIT)) { // memcpy needs to be synchronized across threads to avoid race conditions. // => do it in INIT phase - GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); - GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0)); + GGML_V2_ASSERT(ggml_v2_nelements(dst) == ggml_v2_nelements(src0)); + GGML_V2_ASSERT(ggml_v2_is_contiguous(dst) && ggml_v2_is_contiguous(src0)); memcpy( ((char *) dst->data), ((char *) src0->data), - ggml_nbytes(dst)); + ggml_v2_nbytes(dst)); } - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_V2_TASK_INIT || params->type == GGML_V2_TASK_FINALIZE) { return; } // TODO: handle transposed/permuted matrices - const int n = ggml_nrows(src0); + const int n = ggml_v2_nrows(src0); const int nc = src0->ne[0]; const int nr = src0->ne[1]; const int nz = n/nr; @@ -10681,51 +10681,51 @@ static void ggml_compute_forward_diag_mask_f32( } } -static void ggml_compute_forward_diag_mask_inf( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { +static void ggml_v2_compute_forward_diag_mask_inf( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + const struct ggml_v2_tensor * src1, + struct ggml_v2_tensor * dst) { switch (src0->type) { - case GGML_TYPE_F32: + case GGML_V2_TYPE_F32: { - ggml_compute_forward_diag_mask_f32(params, src0, src1, dst, -INFINITY); + ggml_v2_compute_forward_diag_mask_f32(params, src0, src1, dst, -INFINITY); } break; default: { - GGML_ASSERT(false); + GGML_V2_ASSERT(false); } break; } } -static void ggml_compute_forward_diag_mask_zero( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { +static void ggml_v2_compute_forward_diag_mask_zero( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + const struct ggml_v2_tensor * src1, + struct ggml_v2_tensor * dst) { switch (src0->type) { - case GGML_TYPE_F32: + case GGML_V2_TYPE_F32: { - ggml_compute_forward_diag_mask_f32(params, src0, src1, dst, 0); + ggml_v2_compute_forward_diag_mask_f32(params, src0, src1, dst, 0); } break; default: { - GGML_ASSERT(false); + GGML_V2_ASSERT(false); } break; } } -// ggml_compute_forward_soft_max +// ggml_v2_compute_forward_soft_max -static void ggml_compute_forward_soft_max_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - struct ggml_tensor * dst) { - GGML_ASSERT(ggml_is_contiguous(src0)); - GGML_ASSERT(ggml_is_contiguous(dst)); - GGML_ASSERT(ggml_are_same_shape(src0, dst)); +static void ggml_v2_compute_forward_soft_max_f32( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + struct ggml_v2_tensor * dst) { + GGML_V2_ASSERT(ggml_v2_is_contiguous(src0)); + GGML_V2_ASSERT(ggml_v2_is_contiguous(dst)); + GGML_V2_ASSERT(ggml_v2_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_V2_TASK_INIT || params->type == GGML_V2_TASK_FINALIZE) { return; } @@ -10735,7 +10735,7 @@ static void ggml_compute_forward_soft_max_f32( const int nth = params->nth; const int 
nc = src0->ne[0]; - const int nr = ggml_nrows(src0); + const int nr = ggml_v2_nrows(src0); // rows per thread const int dr = (nr + nth - 1)/nth; @@ -10756,9 +10756,9 @@ static void ggml_compute_forward_soft_max_f32( #endif float max = -INFINITY; - ggml_vec_max_f32(nc, &max, sp); + ggml_v2_vec_max_f32(nc, &max, sp); - ggml_float sum = 0.0; + ggml_v2_float sum = 0.0; uint16_t scvt; for (int i = 0; i < nc; i++) { @@ -10766,10 +10766,10 @@ static void ggml_compute_forward_soft_max_f32( dp[i] = 0.0f; } else { // const float val = (sp[i] == -INFINITY) ? 0.0 : exp(sp[i] - max); - ggml_fp16_t s = GGML_FP32_TO_FP16(sp[i] - max); + ggml_v2_fp16_t s = GGML_V2_FP32_TO_FP16(sp[i] - max); memcpy(&scvt, &s, sizeof(scvt)); - const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt]); - sum += (ggml_float)val; + const float val = GGML_V2_FP16_TO_FP32(table_exp_f16[scvt]); + sum += (ggml_v2_float)val; dp[i] = val; } } @@ -10777,7 +10777,7 @@ static void ggml_compute_forward_soft_max_f32( assert(sum > 0.0); sum = 1.0/sum; - ggml_vec_scale_f32(nc, dp, sum); + ggml_v2_vec_scale_f32(nc, dp, sum); #ifndef NDEBUG for (int i = 0; i < nc; ++i) { @@ -10788,34 +10788,34 @@ static void ggml_compute_forward_soft_max_f32( } } -static void ggml_compute_forward_soft_max( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - struct ggml_tensor * dst) { +static void ggml_v2_compute_forward_soft_max( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + struct ggml_v2_tensor * dst) { switch (src0->type) { - case GGML_TYPE_F32: + case GGML_V2_TYPE_F32: { - ggml_compute_forward_soft_max_f32(params, src0, dst); + ggml_v2_compute_forward_soft_max_f32(params, src0, dst); } break; default: { - GGML_ASSERT(false); + GGML_V2_ASSERT(false); } break; } } -// ggml_compute_forward_alibi +// ggml_v2_compute_forward_alibi -static void ggml_compute_forward_alibi_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { +static void ggml_v2_compute_forward_alibi_f32( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + const struct ggml_v2_tensor * src1, + struct ggml_v2_tensor * dst) { assert(params->ith == 0); - assert(src1->type == GGML_TYPE_I32); - assert(ggml_nelements(src1) == 2); + assert(src1->type == GGML_V2_TYPE_I32); + assert(ggml_v2_nelements(src1) == 2); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_V2_TASK_INIT || params->type == GGML_V2_TASK_FINALIZE) { return; } @@ -10829,7 +10829,7 @@ static void ggml_compute_forward_alibi_f32( //const int ne2 = src0->ne[2]; // n_head -> this is k //const int ne3 = src0->ne[3]; // 1 -> bsz - const int n = ggml_nrows(src0); + const int n = ggml_v2_nrows(src0); const int ne2_ne3 = n/ne1; // ne2*ne3 const int nb0 = src0->nb[0]; @@ -10869,16 +10869,16 @@ static void ggml_compute_forward_alibi_f32( } -static void ggml_compute_forward_alibi_f16( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { +static void ggml_v2_compute_forward_alibi_f16( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + const struct ggml_v2_tensor * src1, + struct ggml_v2_tensor * dst) { assert(params->ith == 0); - assert(src1->type == GGML_TYPE_I32); - assert(ggml_nelements(src1) == 2); + assert(src1->type == GGML_V2_TYPE_I32); + 
assert(ggml_v2_nelements(src1) == 2); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_V2_TASK_INIT || params->type == GGML_V2_TASK_FINALIZE) { return; } @@ -10892,7 +10892,7 @@ static void ggml_compute_forward_alibi_f16( //const int ne2 = src0->ne[2]; // n_head -> this is k //const int ne3 = src0->ne[3]; // 1 -> bsz - const int n = ggml_nrows(src0); + const int n = ggml_v2_nrows(src0); const int ne2_ne3 = n/ne1; // ne2*ne3 const int nb0 = src0->nb[0]; @@ -10900,7 +10900,7 @@ static void ggml_compute_forward_alibi_f16( const int nb2 = src0->nb[2]; //const int nb3 = src0->nb[3]; - assert(nb0 == sizeof(ggml_fp16_t)); + assert(nb0 == sizeof(ggml_v2_fp16_t)); assert(ne1 + n_past == ne0); (void) n_past; // add alibi to src0 (KQ_scaled) @@ -10912,7 +10912,7 @@ static void ggml_compute_forward_alibi_f16( for (int i = 0; i < ne0; i++) { for (int j = 0; j < ne1; j++) { for (int k = 0; k < ne2_ne3; k++) { - ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2); + ggml_v2_fp16_t * const src = (ggml_v2_fp16_t *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2); float * pdst = (float *)((char *) dst->data + i*nb0 + j*nb1 + k*nb2); // TODO: k*nb2 or k*nb3 @@ -10926,56 +10926,56 @@ static void ggml_compute_forward_alibi_f16( } // we return F32 - pdst[0] = i * m_k + GGML_FP16_TO_FP32(src[0]); + pdst[0] = i * m_k + GGML_V2_FP16_TO_FP32(src[0]); } } } } -static void ggml_compute_forward_alibi( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { +static void ggml_v2_compute_forward_alibi( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + const struct ggml_v2_tensor * src1, + struct ggml_v2_tensor * dst) { switch (src0->type) { - case GGML_TYPE_F16: + case GGML_V2_TYPE_F16: { - ggml_compute_forward_alibi_f16(params, src0, src1, dst); + ggml_v2_compute_forward_alibi_f16(params, src0, src1, dst); } break; - case GGML_TYPE_F32: + case GGML_V2_TYPE_F32: { - ggml_compute_forward_alibi_f32(params, src0, src1, dst); + ggml_v2_compute_forward_alibi_f32(params, src0, src1, dst); } break; - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q4_2: - case GGML_TYPE_Q4_3: - case GGML_TYPE_Q5_0: - case GGML_TYPE_Q5_1: - case GGML_TYPE_Q8_0: - case GGML_TYPE_Q8_1: - case GGML_TYPE_Q8_1B: - case GGML_TYPE_I8: - case GGML_TYPE_I16: - case GGML_TYPE_I32: - case GGML_TYPE_COUNT: + case GGML_V2_TYPE_Q4_0: + case GGML_V2_TYPE_Q4_1: + case GGML_V2_TYPE_Q4_2: + case GGML_V2_TYPE_Q4_3: + case GGML_V2_TYPE_Q5_0: + case GGML_V2_TYPE_Q5_1: + case GGML_V2_TYPE_Q8_0: + case GGML_V2_TYPE_Q8_1: + case GGML_V2_TYPE_Q8_1B: + case GGML_V2_TYPE_I8: + case GGML_V2_TYPE_I16: + case GGML_V2_TYPE_I32: + case GGML_V2_TYPE_COUNT: { - GGML_ASSERT(false); + GGML_V2_ASSERT(false); } break; } } -// ggml_compute_forward_rope +// ggml_v2_compute_forward_rope -static void ggml_compute_forward_rope_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - GGML_ASSERT(src1->type == GGML_TYPE_I32); - GGML_ASSERT(ggml_nelements(src1) == 3); +static void ggml_v2_compute_forward_rope_f32( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + const struct ggml_v2_tensor * src1, + struct ggml_v2_tensor * dst) { + GGML_V2_ASSERT(src1->type == GGML_V2_TYPE_I32); + GGML_V2_ASSERT(ggml_v2_nelements(src1) == 3); - if 
(params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_V2_TASK_INIT || params->type == GGML_V2_TASK_FINALIZE) { return; } @@ -11003,15 +11003,15 @@ static void ggml_compute_forward_rope_f32( //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3); //printf("n_past = %d, ne2 = %d\n", n_past, ne2); - GGML_ASSERT(nb00 == sizeof(float)); + GGML_V2_ASSERT(nb00 == sizeof(float)); const int ith = params->ith; const int nth = params->nth; - const int nr = ggml_nrows(dst); + const int nr = ggml_v2_nrows(dst); - GGML_ASSERT(n_dims <= ne0); - GGML_ASSERT(n_dims % 2 == 0); + GGML_V2_ASSERT(n_dims <= ne0); + GGML_V2_ASSERT(n_dims % 2 == 0); // rows per thread const int dr = (nr + nth - 1)/nth; @@ -11080,15 +11080,15 @@ static void ggml_compute_forward_rope_f32( } } -static void ggml_compute_forward_rope_f16( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - GGML_ASSERT(src1->type == GGML_TYPE_I32); - GGML_ASSERT(ggml_nelements(src1) == 3); +static void ggml_v2_compute_forward_rope_f16( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + const struct ggml_v2_tensor * src1, + struct ggml_v2_tensor * dst) { + GGML_V2_ASSERT(src1->type == GGML_V2_TYPE_I32); + GGML_V2_ASSERT(ggml_v2_nelements(src1) == 3); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_V2_TASK_INIT || params->type == GGML_V2_TASK_FINALIZE) { return; } @@ -11116,15 +11116,15 @@ static void ggml_compute_forward_rope_f16( //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3); //printf("n_past = %d, ne2 = %d\n", n_past, ne2); - GGML_ASSERT(nb0 == sizeof(ggml_fp16_t)); + GGML_V2_ASSERT(nb0 == sizeof(ggml_v2_fp16_t)); const int ith = params->ith; const int nth = params->nth; - const int nr = ggml_nrows(dst); + const int nr = ggml_v2_nrows(dst); - GGML_ASSERT(n_dims <= ne0); - GGML_ASSERT(n_dims % 2 == 0); + GGML_V2_ASSERT(n_dims <= ne0); + GGML_V2_ASSERT(n_dims % 2 == 0); // rows per thread const int dr = (nr + nth - 1)/nth; @@ -11156,14 +11156,14 @@ static void ggml_compute_forward_rope_f16( theta *= theta_scale; - const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); - ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + const ggml_v2_fp16_t * const src = (ggml_v2_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); + ggml_v2_fp16_t * dst_data = (ggml_v2_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); - const float x0 = GGML_FP16_TO_FP32(src[0]); - const float x1 = GGML_FP16_TO_FP32(src[1]); + const float x0 = GGML_V2_FP16_TO_FP32(src[0]); + const float x1 = GGML_V2_FP16_TO_FP32(src[1]); - dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); - dst_data[1] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); + dst_data[0] = GGML_V2_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); + dst_data[1] = GGML_V2_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); } } else { // TODO: this is probably wrong, but I can't figure it out .. 
@@ -11177,14 +11177,14 @@ static void ggml_compute_forward_rope_f16( const int64_t i0 = ib*n_dims + ic/2; - const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); - ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + const ggml_v2_fp16_t * const src = (ggml_v2_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); + ggml_v2_fp16_t * dst_data = (ggml_v2_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); - const float x0 = GGML_FP16_TO_FP32(src[0]); - const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]); + const float x0 = GGML_V2_FP16_TO_FP32(src[0]); + const float x1 = GGML_V2_FP16_TO_FP32(src[n_dims/2]); - dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); - dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); + dst_data[0] = GGML_V2_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); + dst_data[n_dims/2] = GGML_V2_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); } } } @@ -11193,38 +11193,38 @@ static void ggml_compute_forward_rope_f16( } } -static void ggml_compute_forward_rope( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { +static void ggml_v2_compute_forward_rope( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + const struct ggml_v2_tensor * src1, + struct ggml_v2_tensor * dst) { switch (src0->type) { - case GGML_TYPE_F16: + case GGML_V2_TYPE_F16: { - ggml_compute_forward_rope_f16(params, src0, src1, dst); + ggml_v2_compute_forward_rope_f16(params, src0, src1, dst); } break; - case GGML_TYPE_F32: + case GGML_V2_TYPE_F32: { - ggml_compute_forward_rope_f32(params, src0, src1, dst); + ggml_v2_compute_forward_rope_f32(params, src0, src1, dst); } break; default: { - GGML_ASSERT(false); + GGML_V2_ASSERT(false); } break; } } -// ggml_compute_forward_rope_back +// ggml_v2_compute_forward_rope_back -static void ggml_compute_forward_rope_back_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - assert(src1->type == GGML_TYPE_I32); - assert(ggml_nelements(src1) == 3); +static void ggml_v2_compute_forward_rope_back_f32( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + const struct ggml_v2_tensor * src1, + struct ggml_v2_tensor * dst) { + assert(src1->type == GGML_V2_TYPE_I32); + assert(ggml_v2_nelements(src1) == 3); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_V2_TASK_INIT || params->type == GGML_V2_TASK_FINALIZE) { return; } @@ -11262,7 +11262,7 @@ static void ggml_compute_forward_rope_back_f32( const int ith = params->ith; const int nth = params->nth; - const int nr = ggml_nrows(dst); + const int nr = ggml_v2_nrows(dst); // rows per thread const int dr = (nr + nth - 1)/nth; @@ -11329,15 +11329,15 @@ static void ggml_compute_forward_rope_back_f32( } } -static void ggml_compute_forward_rope_back_f16( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - assert(src1->type == GGML_TYPE_I32); - assert(ggml_nelements(src1) == 3); +static void ggml_v2_compute_forward_rope_back_f16( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + const struct ggml_v2_tensor * src1, + struct ggml_v2_tensor * dst) { + 
assert(src1->type == GGML_V2_TYPE_I32); + assert(ggml_v2_nelements(src1) == 3); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_V2_TASK_INIT || params->type == GGML_V2_TASK_FINALIZE) { return; } @@ -11370,12 +11370,12 @@ static void ggml_compute_forward_rope_back_f16( //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3); //printf("n_past = %d, ne2 = %d\n", n_past, ne2); - assert(nb0 == sizeof(ggml_fp16_t)); + assert(nb0 == sizeof(ggml_v2_fp16_t)); const int ith = params->ith; const int nth = params->nth; - const int nr = ggml_nrows(dst); + const int nr = ggml_v2_nrows(dst); // rows per thread const int dr = (nr + nth - 1)/nth; @@ -11407,14 +11407,14 @@ static void ggml_compute_forward_rope_back_f16( theta *= theta_scale; - const ggml_fp16_t * const dy = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); - ggml_fp16_t * dx = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + const ggml_v2_fp16_t * const dy = (ggml_v2_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); + ggml_v2_fp16_t * dx = (ggml_v2_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); - const float dy0 = GGML_FP16_TO_FP32(dy[0]); - const float dy1 = GGML_FP16_TO_FP32(dy[1]); + const float dy0 = GGML_V2_FP16_TO_FP32(dy[0]); + const float dy1 = GGML_V2_FP16_TO_FP32(dy[1]); - dx[0] = GGML_FP32_TO_FP16( dy0*cos_theta + dy1*sin_theta); - dx[1] = GGML_FP32_TO_FP16(-dy0*sin_theta + dy1*cos_theta); + dx[0] = GGML_V2_FP32_TO_FP16( dy0*cos_theta + dy1*sin_theta); + dx[1] = GGML_V2_FP32_TO_FP16(-dy0*sin_theta + dy1*cos_theta); } } else { for (int64_t ib = 0; ib < ne0/n_dims; ++ib) { @@ -11426,14 +11426,14 @@ static void ggml_compute_forward_rope_back_f16( const int64_t i0 = ib*n_dims + ic/2; - const ggml_fp16_t * const dy = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); - ggml_fp16_t * dx = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + const ggml_v2_fp16_t * const dy = (ggml_v2_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); + ggml_v2_fp16_t * dx = (ggml_v2_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); - const float dy0 = GGML_FP16_TO_FP32(dy[0]); - const float dy1 = GGML_FP16_TO_FP32(dy[n_dims/2]); + const float dy0 = GGML_V2_FP16_TO_FP32(dy[0]); + const float dy1 = GGML_V2_FP16_TO_FP32(dy[n_dims/2]); - dx[0] = GGML_FP32_TO_FP16( dy0*cos_theta + dy1*sin_theta); - dx[n_dims/2] = GGML_FP32_TO_FP16(-dy0*sin_theta + dy1*cos_theta); + dx[0] = GGML_V2_FP32_TO_FP16( dy0*cos_theta + dy1*sin_theta); + dx[n_dims/2] = GGML_V2_FP32_TO_FP16(-dy0*sin_theta + dy1*cos_theta); } } } @@ -11442,39 +11442,39 @@ static void ggml_compute_forward_rope_back_f16( } } -static void ggml_compute_forward_rope_back( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { +static void ggml_v2_compute_forward_rope_back( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + const struct ggml_v2_tensor * src1, + struct ggml_v2_tensor * dst) { switch (src0->type) { - case GGML_TYPE_F16: + case GGML_V2_TYPE_F16: { - ggml_compute_forward_rope_back_f16(params, src0, src1, dst); + ggml_v2_compute_forward_rope_back_f16(params, src0, src1, dst); } break; - case GGML_TYPE_F32: + case GGML_V2_TYPE_F32: { - ggml_compute_forward_rope_back_f32(params, src0, src1, dst); + 
ggml_v2_compute_forward_rope_back_f32(params, src0, src1, dst); } break; default: { - GGML_ASSERT(false); + GGML_V2_ASSERT(false); } break; } } -// ggml_compute_forward_conv_1d_1s +// ggml_v2_compute_forward_conv_1d_1s -static void ggml_compute_forward_conv_1d_1s_f16_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - GGML_ASSERT(src0->type == GGML_TYPE_F16); - GGML_ASSERT(src1->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); +static void ggml_v2_compute_forward_conv_1d_1s_f16_f32( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + const struct ggml_v2_tensor * src1, + struct ggml_v2_tensor * dst) { + GGML_V2_ASSERT(src0->type == GGML_V2_TYPE_F16); + GGML_V2_ASSERT(src1->type == GGML_V2_TYPE_F32); + GGML_V2_ASSERT( dst->type == GGML_V2_TYPE_F32); - int64_t t0 = ggml_perf_time_us(); + int64_t t0 = ggml_v2_perf_time_us(); UNUSED(t0); const int64_t ne00 = src0->ne[0]; @@ -11514,24 +11514,24 @@ static void ggml_compute_forward_conv_1d_1s_f16_f32( const int nk = ne00; const int nh = nk/2; - const int ew0 = ggml_up32(ne01); + const int ew0 = ggml_v2_up32(ne01); - GGML_ASSERT(ne00 % 2 == 1); // TODO: support even kernel sizes - GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); - GGML_ASSERT(nb10 == sizeof(float)); + GGML_V2_ASSERT(ne00 % 2 == 1); // TODO: support even kernel sizes + GGML_V2_ASSERT(nb00 == sizeof(ggml_v2_fp16_t)); + GGML_V2_ASSERT(nb10 == sizeof(float)); - if (params->type == GGML_TASK_INIT) { + if (params->type == GGML_V2_TASK_INIT) { // TODO: fix this memset (wsize is overestimated) memset(params->wdata, 0, params->wsize); // prepare kernel data (src0) { - ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0; + ggml_v2_fp16_t * const wdata = (ggml_v2_fp16_t *) params->wdata + 0; for (int64_t i02 = 0; i02 < ne02; i02++) { for (int64_t i01 = 0; i01 < ne01; i01++) { - const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i02*nb02 + i01*nb01); - ggml_fp16_t * dst_data = wdata + i02*ew0*ne00; + const ggml_v2_fp16_t * const src = (ggml_v2_fp16_t *)((char *) src0->data + i02*nb02 + i01*nb01); + ggml_v2_fp16_t * dst_data = wdata + i02*ew0*ne00; for (int64_t i00 = 0; i00 < ne00; i00++) { dst_data[i00*ew0 + i01] = src[i00]; } @@ -11541,13 +11541,13 @@ static void ggml_compute_forward_conv_1d_1s_f16_f32( // prepare source data (src1) { - ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + ne02*ew0*ne00; + ggml_v2_fp16_t * const wdata = (ggml_v2_fp16_t *) params->wdata + ne02*ew0*ne00; for (int64_t i11 = 0; i11 < ne11; i11++) { const float * const src = (float *)((char *) src1->data + i11*nb11); - ggml_fp16_t * dst_data = wdata; + ggml_v2_fp16_t * dst_data = wdata; for (int64_t i10 = 0; i10 < ne10; i10++) { - dst_data[(i10 + nh)*ew0 + i11] = GGML_FP32_TO_FP16(src[i10]); + dst_data[(i10 + nh)*ew0 + i11] = GGML_V2_FP32_TO_FP16(src[i10]); } } } @@ -11555,7 +11555,7 @@ static void ggml_compute_forward_conv_1d_1s_f16_f32( return; } - if (params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_V2_TASK_FINALIZE) { return; } @@ -11575,9 +11575,9 @@ static void ggml_compute_forward_conv_1d_1s_f16_f32( dst_data[i0] = 0; for (int k = -nh; k <= nh; k++) { float v = 0.0f; - ggml_vec_dot_f16(ew0, &v, - (ggml_fp16_t *) params->wdata + i1*ew0*ne00 + (nh + k)*ew0, - (ggml_fp16_t *) params->wdata + ne02*ew0*ne00 + (i0 + nh + k)*ew0); + ggml_v2_vec_dot_f16(ew0, &v, + (ggml_v2_fp16_t *) params->wdata + i1*ew0*ne00 + (nh + 
k)*ew0, + (ggml_v2_fp16_t *) params->wdata + ne02*ew0*ne00 + (i0 + nh + k)*ew0); dst_data[i0] += v; } @@ -11585,16 +11585,16 @@ static void ggml_compute_forward_conv_1d_1s_f16_f32( } } -static void ggml_compute_forward_conv_1d_1s_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT(src1->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); +static void ggml_v2_compute_forward_conv_1d_1s_f32( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + const struct ggml_v2_tensor * src1, + struct ggml_v2_tensor * dst) { + GGML_V2_ASSERT(src0->type == GGML_V2_TYPE_F32); + GGML_V2_ASSERT(src1->type == GGML_V2_TYPE_F32); + GGML_V2_ASSERT( dst->type == GGML_V2_TYPE_F32); - int64_t t0 = ggml_perf_time_us(); + int64_t t0 = ggml_v2_perf_time_us(); UNUSED(t0); const int64_t ne00 = src0->ne[0]; @@ -11634,13 +11634,13 @@ static void ggml_compute_forward_conv_1d_1s_f32( const int nk = ne00; const int nh = nk/2; - const int ew0 = ggml_up32(ne01); + const int ew0 = ggml_v2_up32(ne01); - GGML_ASSERT(ne00 % 2 == 1); // TODO: support even kernel sizes - GGML_ASSERT(nb00 == sizeof(float)); - GGML_ASSERT(nb10 == sizeof(float)); + GGML_V2_ASSERT(ne00 % 2 == 1); // TODO: support even kernel sizes + GGML_V2_ASSERT(nb00 == sizeof(float)); + GGML_V2_ASSERT(nb10 == sizeof(float)); - if (params->type == GGML_TASK_INIT) { + if (params->type == GGML_V2_TASK_INIT) { // TODO: fix this memset (wsize is overestimated) memset(params->wdata, 0, params->wsize); @@ -11675,7 +11675,7 @@ static void ggml_compute_forward_conv_1d_1s_f32( return; } - if (params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_V2_TASK_FINALIZE) { return; } @@ -11695,7 +11695,7 @@ static void ggml_compute_forward_conv_1d_1s_f32( dst_data[i0] = 0; for (int k = -nh; k <= nh; k++) { float v = 0.0f; - ggml_vec_dot_f32(ew0, &v, + ggml_v2_vec_dot_f32(ew0, &v, (float *) params->wdata + i1*ew0*ne00 + (nh + k)*ew0, (float *) params->wdata + ne02*ew0*ne00 + (i0 + nh + k)*ew0); @@ -11705,39 +11705,39 @@ static void ggml_compute_forward_conv_1d_1s_f32( } } -static void ggml_compute_forward_conv_1d_1s( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { +static void ggml_v2_compute_forward_conv_1d_1s( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + const struct ggml_v2_tensor * src1, + struct ggml_v2_tensor * dst) { switch (src0->type) { - case GGML_TYPE_F16: + case GGML_V2_TYPE_F16: { - ggml_compute_forward_conv_1d_1s_f16_f32(params, src0, src1, dst); + ggml_v2_compute_forward_conv_1d_1s_f16_f32(params, src0, src1, dst); } break; - case GGML_TYPE_F32: + case GGML_V2_TYPE_F32: { - ggml_compute_forward_conv_1d_1s_f32(params, src0, src1, dst); + ggml_v2_compute_forward_conv_1d_1s_f32(params, src0, src1, dst); } break; default: { - GGML_ASSERT(false); + GGML_V2_ASSERT(false); } break; } } -// ggml_compute_forward_conv_1d_2s +// ggml_v2_compute_forward_conv_1d_2s -static void ggml_compute_forward_conv_1d_2s_f16_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - GGML_ASSERT(src0->type == GGML_TYPE_F16); - GGML_ASSERT(src1->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); +static void 
ggml_v2_compute_forward_conv_1d_2s_f16_f32( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + const struct ggml_v2_tensor * src1, + struct ggml_v2_tensor * dst) { + GGML_V2_ASSERT(src0->type == GGML_V2_TYPE_F16); + GGML_V2_ASSERT(src1->type == GGML_V2_TYPE_F32); + GGML_V2_ASSERT( dst->type == GGML_V2_TYPE_F32); - int64_t t0 = ggml_perf_time_us(); + int64_t t0 = ggml_v2_perf_time_us(); UNUSED(t0); const int64_t ne00 = src0->ne[0]; @@ -11777,24 +11777,24 @@ static void ggml_compute_forward_conv_1d_2s_f16_f32( const int nk = ne00; const int nh = nk/2; - const int ew0 = ggml_up32(ne01); + const int ew0 = ggml_v2_up32(ne01); - GGML_ASSERT(ne00 % 2 == 1); // TODO: support even kernel sizes - GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); - GGML_ASSERT(nb10 == sizeof(float)); + GGML_V2_ASSERT(ne00 % 2 == 1); // TODO: support even kernel sizes + GGML_V2_ASSERT(nb00 == sizeof(ggml_v2_fp16_t)); + GGML_V2_ASSERT(nb10 == sizeof(float)); - if (params->type == GGML_TASK_INIT) { + if (params->type == GGML_V2_TASK_INIT) { // TODO: fix this memset (wsize is overestimated) memset(params->wdata, 0, params->wsize); // prepare kernel data (src0) { - ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0; + ggml_v2_fp16_t * const wdata = (ggml_v2_fp16_t *) params->wdata + 0; for (int64_t i02 = 0; i02 < ne02; i02++) { for (int64_t i01 = 0; i01 < ne01; i01++) { - const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i02*nb02 + i01*nb01); - ggml_fp16_t * dst_data = wdata + i02*ew0*ne00; + const ggml_v2_fp16_t * const src = (ggml_v2_fp16_t *)((char *) src0->data + i02*nb02 + i01*nb01); + ggml_v2_fp16_t * dst_data = wdata + i02*ew0*ne00; for (int64_t i00 = 0; i00 < ne00; i00++) { dst_data[i00*ew0 + i01] = src[i00]; } @@ -11804,13 +11804,13 @@ static void ggml_compute_forward_conv_1d_2s_f16_f32( // prepare source data (src1) { - ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + ne02*ew0*ne00; + ggml_v2_fp16_t * const wdata = (ggml_v2_fp16_t *) params->wdata + ne02*ew0*ne00; for (int64_t i11 = 0; i11 < ne11; i11++) { const float * const src = (float *)((char *) src1->data + i11*nb11); - ggml_fp16_t * dst_data = wdata; + ggml_v2_fp16_t * dst_data = wdata; for (int64_t i10 = 0; i10 < ne10; i10++) { - dst_data[(i10 + nh)*ew0 + i11] = GGML_FP32_TO_FP16(src[i10]); + dst_data[(i10 + nh)*ew0 + i11] = GGML_V2_FP32_TO_FP16(src[i10]); } } } @@ -11818,7 +11818,7 @@ static void ggml_compute_forward_conv_1d_2s_f16_f32( return; } - if (params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_V2_TASK_FINALIZE) { return; } @@ -11838,9 +11838,9 @@ static void ggml_compute_forward_conv_1d_2s_f16_f32( dst_data[i0/2] = 0; for (int k = -nh; k <= nh; k++) { float v = 0.0f; - ggml_vec_dot_f16(ew0, &v, - (ggml_fp16_t *) params->wdata + i1*ew0*ne00 + (nh + k)*ew0, - (ggml_fp16_t *) params->wdata + ne02*ew0*ne00 + (i0 + nh + k)*ew0); + ggml_v2_vec_dot_f16(ew0, &v, + (ggml_v2_fp16_t *) params->wdata + i1*ew0*ne00 + (nh + k)*ew0, + (ggml_v2_fp16_t *) params->wdata + ne02*ew0*ne00 + (i0 + nh + k)*ew0); dst_data[i0/2] += v; } @@ -11848,16 +11848,16 @@ static void ggml_compute_forward_conv_1d_2s_f16_f32( } } -static void ggml_compute_forward_conv_1d_2s_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT(src1->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); +static void 
ggml_v2_compute_forward_conv_1d_2s_f32( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + const struct ggml_v2_tensor * src1, + struct ggml_v2_tensor * dst) { + GGML_V2_ASSERT(src0->type == GGML_V2_TYPE_F32); + GGML_V2_ASSERT(src1->type == GGML_V2_TYPE_F32); + GGML_V2_ASSERT( dst->type == GGML_V2_TYPE_F32); - int64_t t0 = ggml_perf_time_us(); + int64_t t0 = ggml_v2_perf_time_us(); UNUSED(t0); const int64_t ne00 = src0->ne[0]; @@ -11897,13 +11897,13 @@ static void ggml_compute_forward_conv_1d_2s_f32( const int nk = ne00; const int nh = nk/2; - const int ew0 = ggml_up32(ne01); + const int ew0 = ggml_v2_up32(ne01); - GGML_ASSERT(ne00 % 2 == 1); // TODO: support even kernel sizes - GGML_ASSERT(nb00 == sizeof(float)); - GGML_ASSERT(nb10 == sizeof(float)); + GGML_V2_ASSERT(ne00 % 2 == 1); // TODO: support even kernel sizes + GGML_V2_ASSERT(nb00 == sizeof(float)); + GGML_V2_ASSERT(nb10 == sizeof(float)); - if (params->type == GGML_TASK_INIT) { + if (params->type == GGML_V2_TASK_INIT) { // TODO: fix this memset (wsize is overestimated) memset(params->wdata, 0, params->wsize); @@ -11938,7 +11938,7 @@ static void ggml_compute_forward_conv_1d_2s_f32( return; } - if (params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_V2_TASK_FINALIZE) { return; } @@ -11958,7 +11958,7 @@ static void ggml_compute_forward_conv_1d_2s_f32( dst_data[i0/2] = 0; for (int k = -nh; k <= nh; k++) { float v = 0.0f; - ggml_vec_dot_f32(ew0, &v, + ggml_v2_vec_dot_f32(ew0, &v, (float *) params->wdata + i1*ew0*ne00 + (nh + k)*ew0, (float *) params->wdata + ne02*ew0*ne00 + (i0 + nh + k)*ew0); @@ -11968,37 +11968,37 @@ static void ggml_compute_forward_conv_1d_2s_f32( } } -static void ggml_compute_forward_conv_1d_2s( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { +static void ggml_v2_compute_forward_conv_1d_2s( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + const struct ggml_v2_tensor * src1, + struct ggml_v2_tensor * dst) { switch (src0->type) { - case GGML_TYPE_F16: + case GGML_V2_TYPE_F16: { - ggml_compute_forward_conv_1d_2s_f16_f32(params, src0, src1, dst); + ggml_v2_compute_forward_conv_1d_2s_f16_f32(params, src0, src1, dst); } break; - case GGML_TYPE_F32: + case GGML_V2_TYPE_F32: { - ggml_compute_forward_conv_1d_2s_f32(params, src0, src1, dst); + ggml_v2_compute_forward_conv_1d_2s_f32(params, src0, src1, dst); } break; default: { - GGML_ASSERT(false); + GGML_V2_ASSERT(false); } break; } } -// ggml_compute_forward_flash_attn +// ggml_v2_compute_forward_flash_attn -static void ggml_compute_forward_flash_attn_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * q, - const struct ggml_tensor * k, - const struct ggml_tensor * v, +static void ggml_v2_compute_forward_flash_attn_f32( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * q, + const struct ggml_v2_tensor * k, + const struct ggml_v2_tensor * v, const bool masked, - struct ggml_tensor * dst) { - int64_t t0 = ggml_perf_time_us(); + struct ggml_v2_tensor * dst) { + int64_t t0 = ggml_v2_perf_time_us(); UNUSED(t0); const int64_t neq0 = q->ne[0]; @@ -12049,39 +12049,39 @@ static void ggml_compute_forward_flash_attn_f32( const int64_t P = nek1 - N; const int64_t M = P + N; - const int Mup = ggml_up(M, GGML_SOFT_MAX_UNROLL); + const int Mup = ggml_v2_up(M, GGML_V2_SOFT_MAX_UNROLL); - GGML_ASSERT(ne0 == D); - GGML_ASSERT(ne1 == N); - 
GGML_ASSERT(P >= 0); + GGML_V2_ASSERT(ne0 == D); + GGML_V2_ASSERT(ne1 == N); + GGML_V2_ASSERT(P >= 0); - GGML_ASSERT(nbq0 == sizeof(float)); - GGML_ASSERT(nbk0 == sizeof(float)); - GGML_ASSERT(nbv0 == sizeof(float)); + GGML_V2_ASSERT(nbq0 == sizeof(float)); + GGML_V2_ASSERT(nbk0 == sizeof(float)); + GGML_V2_ASSERT(nbv0 == sizeof(float)); - GGML_ASSERT(neq0 == D); - GGML_ASSERT(nek0 == D); - GGML_ASSERT(nev1 == D); + GGML_V2_ASSERT(neq0 == D); + GGML_V2_ASSERT(nek0 == D); + GGML_V2_ASSERT(nev1 == D); - GGML_ASSERT(neq1 == N); - GGML_ASSERT(nek1 == N + P); - GGML_ASSERT(nev1 == D); + GGML_V2_ASSERT(neq1 == N); + GGML_V2_ASSERT(nek1 == N + P); + GGML_V2_ASSERT(nev1 == D); // dst cannot be transposed or permuted - GGML_ASSERT(nb0 == sizeof(float)); - GGML_ASSERT(nb0 <= nb1); - GGML_ASSERT(nb1 <= nb2); - GGML_ASSERT(nb2 <= nb3); + GGML_V2_ASSERT(nb0 == sizeof(float)); + GGML_V2_ASSERT(nb0 <= nb1); + GGML_V2_ASSERT(nb1 <= nb2); + GGML_V2_ASSERT(nb2 <= nb3); - if (params->type == GGML_TASK_INIT) { + if (params->type == GGML_V2_TASK_INIT) { return; } - if (params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_V2_TASK_FINALIZE) { return; } - // parallelize by q rows using ggml_vec_dot_f32 + // parallelize by q rows using ggml_v2_vec_dot_f32 // total rows in q const int nr = neq1*neq2*neq3; @@ -12118,14 +12118,14 @@ static void ggml_compute_forward_flash_attn_f32( // S indices const int i1 = ik1; - ggml_vec_dot_f32(neq0, + ggml_v2_vec_dot_f32(neq0, S + i1, (float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3))); } // scale - ggml_vec_scale_f32(nek1, S, scale); + ggml_v2_vec_scale_f32(nek1, S, scale); if (masked) { for (int64_t i = P; i < M; i++) { @@ -12138,36 +12138,36 @@ static void ggml_compute_forward_flash_attn_f32( // softmax { float max = -INFINITY; - ggml_vec_max_f32(M, &max, S); + ggml_v2_vec_max_f32(M, &max, S); - ggml_float sum = 0.0; + ggml_v2_float sum = 0.0; { -#ifdef GGML_SOFT_MAX_ACCELERATE +#ifdef GGML_V2_SOFT_MAX_ACCELERATE max = -max; vDSP_vsadd(S, 1, &max, S, 1, Mup); vvexpf(S, S, &Mup); - ggml_vec_sum_f32(Mup, &sum, S); + ggml_v2_vec_sum_f32(Mup, &sum, S); #else - uint16_t scvt[GGML_SOFT_MAX_UNROLL]; - ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 }; + uint16_t scvt[GGML_V2_SOFT_MAX_UNROLL]; + ggml_v2_float sump[GGML_V2_SOFT_MAX_UNROLL] = { 0.0 }; - for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) { + for (int i = 0; i < Mup; i += GGML_V2_SOFT_MAX_UNROLL) { float * SS = S + i; - for (int j = 0; j < GGML_SOFT_MAX_UNROLL; ++j) { + for (int j = 0; j < GGML_V2_SOFT_MAX_UNROLL; ++j) { if (SS[j] == -INFINITY) { SS[j] = 0.0f; } else { - ggml_fp16_t s = GGML_FP32_TO_FP16(SS[j] - max); + ggml_v2_fp16_t s = GGML_V2_FP32_TO_FP16(SS[j] - max); memcpy(&scvt[j], &s, sizeof(uint16_t)); - const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]); - sump[j] += (ggml_float)val; + const float val = GGML_V2_FP16_TO_FP32(table_exp_f16[scvt[j]]); + sump[j] += (ggml_v2_float)val; SS[j] = val; } } } - for (int i = 0; i < GGML_SOFT_MAX_UNROLL; i++) { + for (int i = 0; i < GGML_V2_SOFT_MAX_UNROLL; i++) { sum += sump[i]; } #endif @@ -12176,7 +12176,7 @@ static void ggml_compute_forward_flash_attn_f32( assert(sum > 0.0); sum = 1.0/sum; - ggml_vec_scale_f32(M, S, sum); + ggml_v2_vec_scale_f32(M, S, sum); #ifndef NDEBUG for (int i = 0; i < M; ++i) { @@ -12192,7 +12192,7 @@ static void ggml_compute_forward_flash_attn_f32( const int i2 = iq2; const int i3 = iq3; - ggml_vec_dot_f32(nek1, + ggml_v2_vec_dot_f32(nek1, (float 
*) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), (float *) ((char *) v->data + ( ic*nbv1 + i2*nbv2 + i3*nbv3)), S); @@ -12200,14 +12200,14 @@ static void ggml_compute_forward_flash_attn_f32( } } -static void ggml_compute_forward_flash_attn_f16( - const struct ggml_compute_params * params, - const struct ggml_tensor * q, - const struct ggml_tensor * k, - const struct ggml_tensor * v, +static void ggml_v2_compute_forward_flash_attn_f16( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * q, + const struct ggml_v2_tensor * k, + const struct ggml_v2_tensor * v, const bool masked, - struct ggml_tensor * dst) { - int64_t t0 = ggml_perf_time_us(); + struct ggml_v2_tensor * dst) { + int64_t t0 = ggml_v2_perf_time_us(); UNUSED(t0); const int64_t neq0 = q->ne[0]; @@ -12258,39 +12258,39 @@ static void ggml_compute_forward_flash_attn_f16( const int64_t P = nek1 - N; const int64_t M = P + N; - const int Mup = ggml_up(M, GGML_SOFT_MAX_UNROLL); + const int Mup = ggml_v2_up(M, GGML_V2_SOFT_MAX_UNROLL); - GGML_ASSERT(ne0 == D); - GGML_ASSERT(ne1 == N); - GGML_ASSERT(P >= 0); + GGML_V2_ASSERT(ne0 == D); + GGML_V2_ASSERT(ne1 == N); + GGML_V2_ASSERT(P >= 0); - GGML_ASSERT(nbq0 == sizeof(ggml_fp16_t)); - GGML_ASSERT(nbk0 == sizeof(ggml_fp16_t)); - GGML_ASSERT(nbv0 == sizeof(ggml_fp16_t)); + GGML_V2_ASSERT(nbq0 == sizeof(ggml_v2_fp16_t)); + GGML_V2_ASSERT(nbk0 == sizeof(ggml_v2_fp16_t)); + GGML_V2_ASSERT(nbv0 == sizeof(ggml_v2_fp16_t)); - GGML_ASSERT(neq0 == D); - GGML_ASSERT(nek0 == D); - GGML_ASSERT(nev1 == D); + GGML_V2_ASSERT(neq0 == D); + GGML_V2_ASSERT(nek0 == D); + GGML_V2_ASSERT(nev1 == D); - GGML_ASSERT(neq1 == N); - GGML_ASSERT(nek1 == N + P); - GGML_ASSERT(nev1 == D); + GGML_V2_ASSERT(neq1 == N); + GGML_V2_ASSERT(nek1 == N + P); + GGML_V2_ASSERT(nev1 == D); // dst cannot be transposed or permuted - GGML_ASSERT(nb0 == sizeof(float)); - GGML_ASSERT(nb0 <= nb1); - GGML_ASSERT(nb1 <= nb2); - GGML_ASSERT(nb2 <= nb3); + GGML_V2_ASSERT(nb0 == sizeof(float)); + GGML_V2_ASSERT(nb0 <= nb1); + GGML_V2_ASSERT(nb1 <= nb2); + GGML_V2_ASSERT(nb2 <= nb3); - if (params->type == GGML_TASK_INIT) { + if (params->type == GGML_V2_TASK_INIT) { return; } - if (params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_V2_TASK_FINALIZE) { return; } - // parallelize by q rows using ggml_vec_dot_f32 + // parallelize by q rows using ggml_v2_vec_dot_f32 // total rows in q const int nr = neq1*neq2*neq3; @@ -12318,7 +12318,7 @@ static void ggml_compute_forward_flash_attn_f16( S[i] = -INFINITY; } - if (GGML_VEC_DOT_UNROLL > 2 || nek1 % GGML_VEC_DOT_UNROLL != 0) { + if (GGML_V2_VEC_DOT_UNROLL > 2 || nek1 % GGML_V2_VEC_DOT_UNROLL != 0) { for (int64_t ic = 0; ic < nek1; ++ic) { // k indices const int ik3 = iq3; @@ -12328,13 +12328,13 @@ static void ggml_compute_forward_flash_attn_f16( // S indices const int i1 = ik1; - ggml_vec_dot_f16(neq0, + ggml_v2_vec_dot_f16(neq0, S + i1, - (ggml_fp16_t *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), - (ggml_fp16_t *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3))); + (ggml_v2_fp16_t *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), + (ggml_v2_fp16_t *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3))); } } else { - for (int64_t ic = 0; ic < nek1; ic += GGML_VEC_DOT_UNROLL) { + for (int64_t ic = 0; ic < nek1; ic += GGML_V2_VEC_DOT_UNROLL) { // k indices const int ik3 = iq3; const int ik2 = iq2; @@ -12343,15 +12343,15 @@ static void ggml_compute_forward_flash_attn_f16( // S indices const int i1 = ik1; - 
ggml_vec_dot_f16_unroll(neq0, nbk1, + ggml_v2_vec_dot_f16_unroll(neq0, nbk1, S + i1, ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), - (ggml_fp16_t *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3))); + (ggml_v2_fp16_t *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3))); } } // scale - ggml_vec_scale_f32(nek1, S, scale); + ggml_v2_vec_scale_f32(nek1, S, scale); if (masked) { for (int64_t i = P; i < M; i++) { @@ -12364,36 +12364,36 @@ static void ggml_compute_forward_flash_attn_f16( // softmax { float max = -INFINITY; - ggml_vec_max_f32(M, &max, S); + ggml_v2_vec_max_f32(M, &max, S); - ggml_float sum = 0.0; + ggml_v2_float sum = 0.0; { -#ifdef GGML_SOFT_MAX_ACCELERATE +#ifdef GGML_V2_SOFT_MAX_ACCELERATE max = -max; vDSP_vsadd(S, 1, &max, S, 1, Mup); vvexpf(S, S, &Mup); - ggml_vec_sum_f32(Mup, &sum, S); + ggml_v2_vec_sum_f32(Mup, &sum, S); #else - uint16_t scvt[GGML_SOFT_MAX_UNROLL]; - ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 }; + uint16_t scvt[GGML_V2_SOFT_MAX_UNROLL]; + ggml_v2_float sump[GGML_V2_SOFT_MAX_UNROLL] = { 0.0 }; - for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) { + for (int i = 0; i < Mup; i += GGML_V2_SOFT_MAX_UNROLL) { float * SS = S + i; - for (int j = 0; j < GGML_SOFT_MAX_UNROLL; ++j) { + for (int j = 0; j < GGML_V2_SOFT_MAX_UNROLL; ++j) { if (SS[j] == -INFINITY) { SS[j] = 0.0f; } else { - ggml_fp16_t s = GGML_FP32_TO_FP16(SS[j] - max); + ggml_v2_fp16_t s = GGML_V2_FP32_TO_FP16(SS[j] - max); memcpy(&scvt[j], &s, sizeof(uint16_t)); - const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]); - sump[j] += (ggml_float)val; + const float val = GGML_V2_FP16_TO_FP32(table_exp_f16[scvt[j]]); + sump[j] += (ggml_v2_float)val; SS[j] = val; } } } - for (int i = 0; i < GGML_SOFT_MAX_UNROLL; i++) { + for (int i = 0; i < GGML_V2_SOFT_MAX_UNROLL; i++) { sum += sump[i]; } #endif @@ -12402,7 +12402,7 @@ static void ggml_compute_forward_flash_attn_f16( assert(sum > 0.0); sum = 1.0/sum; - ggml_vec_scale_f32(M, S, sum); + ggml_v2_vec_scale_f32(M, S, sum); #ifndef NDEBUG for (int i = 0; i < M; ++i) { @@ -12412,32 +12412,32 @@ static void ggml_compute_forward_flash_attn_f16( #endif } - ggml_fp16_t * S16 = (ggml_fp16_t *) ((float *) params->wdata + ith*(2*Mup + CACHE_LINE_SIZE_F32) + Mup); + ggml_v2_fp16_t * S16 = (ggml_v2_fp16_t *) ((float *) params->wdata + ith*(2*Mup + CACHE_LINE_SIZE_F32) + Mup); for (int64_t i = 0; i < M; i++) { - S16[i] = GGML_FP32_TO_FP16(S[i]); + S16[i] = GGML_V2_FP32_TO_FP16(S[i]); } - if (GGML_VEC_DOT_UNROLL == 1 || (nev1 % GGML_VEC_DOT_UNROLL != 0)) { + if (GGML_V2_VEC_DOT_UNROLL == 1 || (nev1 % GGML_V2_VEC_DOT_UNROLL != 0)) { for (int64_t ic = 0; ic < nev1; ++ic) { // dst indices const int i1 = iq1; const int i2 = iq2; const int i3 = iq3; - ggml_vec_dot_f16(nek1, + ggml_v2_vec_dot_f16(nek1, (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), - (ggml_fp16_t *) ((char *) v->data + ( ic*nbv1 + i2*nbv2 + i3*nbv3)), + (ggml_v2_fp16_t *) ((char *) v->data + ( ic*nbv1 + i2*nbv2 + i3*nbv3)), S16); } } else { - for (int64_t ic = 0; ic < nev1; ic += GGML_VEC_DOT_UNROLL) { + for (int64_t ic = 0; ic < nev1; ic += GGML_V2_VEC_DOT_UNROLL) { // dst indices const int i1 = iq1; const int i2 = iq2; const int i3 = iq3; - ggml_vec_dot_f16_unroll(nek1, nbv1, + ggml_v2_vec_dot_f16_unroll(nek1, nbv1, (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), ((char *) v->data + ( ic*nbv1 + i2*nbv2 + i3*nbv3)), S16); @@ -12446,40 +12446,40 @@ static void ggml_compute_forward_flash_attn_f16( } } -static void 
ggml_compute_forward_flash_attn( - const struct ggml_compute_params * params, - const struct ggml_tensor * q, - const struct ggml_tensor * k, - const struct ggml_tensor * v, +static void ggml_v2_compute_forward_flash_attn( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * q, + const struct ggml_v2_tensor * k, + const struct ggml_v2_tensor * v, const bool masked, - struct ggml_tensor * dst) { + struct ggml_v2_tensor * dst) { switch (q->type) { - case GGML_TYPE_F16: + case GGML_V2_TYPE_F16: { - ggml_compute_forward_flash_attn_f16(params, q, k, v, masked, dst); + ggml_v2_compute_forward_flash_attn_f16(params, q, k, v, masked, dst); } break; - case GGML_TYPE_F32: + case GGML_V2_TYPE_F32: { - ggml_compute_forward_flash_attn_f32(params, q, k, v, masked, dst); + ggml_v2_compute_forward_flash_attn_f32(params, q, k, v, masked, dst); } break; default: { - GGML_ASSERT(false); + GGML_V2_ASSERT(false); } break; } } -// ggml_compute_forward_flash_ff +// ggml_v2_compute_forward_flash_ff -static void ggml_compute_forward_flash_ff_f16( - const struct ggml_compute_params * params, - const struct ggml_tensor * a, // F16 - const struct ggml_tensor * b0, // F16 fc_w - const struct ggml_tensor * b1, // F32 fc_b - const struct ggml_tensor * c0, // F16 proj_w - const struct ggml_tensor * c1, // F32 proj_b - struct ggml_tensor * dst) { - int64_t t0 = ggml_perf_time_us(); +static void ggml_v2_compute_forward_flash_ff_f16( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * a, // F16 + const struct ggml_v2_tensor * b0, // F16 fc_w + const struct ggml_v2_tensor * b1, // F32 fc_b + const struct ggml_v2_tensor * c0, // F16 proj_w + const struct ggml_v2_tensor * c1, // F32 proj_b + struct ggml_v2_tensor * dst) { + int64_t t0 = ggml_v2_perf_time_us(); UNUSED(t0); const int64_t nea0 = a->ne[0]; @@ -12549,41 +12549,41 @@ static void ggml_compute_forward_flash_ff_f16( //const int64_t N = nea1; const int64_t M = neb01; - GGML_ASSERT(ne0 == nea0); - GGML_ASSERT(ne1 == nea1); - GGML_ASSERT(ne2 == nea2); + GGML_V2_ASSERT(ne0 == nea0); + GGML_V2_ASSERT(ne1 == nea1); + GGML_V2_ASSERT(ne2 == nea2); - GGML_ASSERT(nba0 == sizeof(ggml_fp16_t)); - GGML_ASSERT(nbb00 == sizeof(ggml_fp16_t)); - GGML_ASSERT(nbb10 == sizeof(float)); - GGML_ASSERT(nbc00 == sizeof(ggml_fp16_t)); - GGML_ASSERT(nbc10 == sizeof(float)); + GGML_V2_ASSERT(nba0 == sizeof(ggml_v2_fp16_t)); + GGML_V2_ASSERT(nbb00 == sizeof(ggml_v2_fp16_t)); + GGML_V2_ASSERT(nbb10 == sizeof(float)); + GGML_V2_ASSERT(nbc00 == sizeof(ggml_v2_fp16_t)); + GGML_V2_ASSERT(nbc10 == sizeof(float)); - GGML_ASSERT(neb00 == D); - GGML_ASSERT(neb01 == M); - GGML_ASSERT(neb10 == M); - GGML_ASSERT(neb11 == 1); + GGML_V2_ASSERT(neb00 == D); + GGML_V2_ASSERT(neb01 == M); + GGML_V2_ASSERT(neb10 == M); + GGML_V2_ASSERT(neb11 == 1); - GGML_ASSERT(nec00 == M); - GGML_ASSERT(nec01 == D); - GGML_ASSERT(nec10 == D); - GGML_ASSERT(nec11 == 1); + GGML_V2_ASSERT(nec00 == M); + GGML_V2_ASSERT(nec01 == D); + GGML_V2_ASSERT(nec10 == D); + GGML_V2_ASSERT(nec11 == 1); // dst cannot be transposed or permuted - GGML_ASSERT(nb0 == sizeof(float)); - GGML_ASSERT(nb0 <= nb1); - GGML_ASSERT(nb1 <= nb2); - GGML_ASSERT(nb2 <= nb3); + GGML_V2_ASSERT(nb0 == sizeof(float)); + GGML_V2_ASSERT(nb0 <= nb1); + GGML_V2_ASSERT(nb1 <= nb2); + GGML_V2_ASSERT(nb2 <= nb3); - if (params->type == GGML_TASK_INIT) { + if (params->type == GGML_V2_TASK_INIT) { return; } - if (params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_V2_TASK_FINALIZE) { return; } - // parallelize 
by a rows using ggml_vec_dot_f32 + // parallelize by a rows using ggml_v2_vec_dot_f32 // total rows in a const int nr = nea1*nea2*nea3; @@ -12612,22 +12612,22 @@ static void ggml_compute_forward_flash_ff_f16( // S indices const int i1 = ib01; - ggml_vec_dot_f16(nea0, + ggml_v2_vec_dot_f16(nea0, S + i1, - (ggml_fp16_t *) ((char *) b0->data + (ib01*nbb01 + ib02*nbb02 + ib03*nbb03)), - (ggml_fp16_t *) ((char *) a->data + ( ia1*nba1 + ia2*nba2 + ia3*nba3))); + (ggml_v2_fp16_t *) ((char *) b0->data + (ib01*nbb01 + ib02*nbb02 + ib03*nbb03)), + (ggml_v2_fp16_t *) ((char *) a->data + ( ia1*nba1 + ia2*nba2 + ia3*nba3))); } - ggml_vec_add_f32(neb01, S, S, (float *) b1->data); - //ggml_vec_gelu_f32(neb01, S, S); + ggml_v2_vec_add_f32(neb01, S, S, (float *) b1->data); + //ggml_v2_vec_gelu_f32(neb01, S, S); - ggml_fp16_t * S16 = (ggml_fp16_t *) ((float *) params->wdata + ith*(2*M + CACHE_LINE_SIZE_F32) + M); + ggml_v2_fp16_t * S16 = (ggml_v2_fp16_t *) ((float *) params->wdata + ith*(2*M + CACHE_LINE_SIZE_F32) + M); for (int64_t i = 0; i < M; i++) { - S16[i] = GGML_FP32_TO_FP16(S[i]); + S16[i] = GGML_V2_FP32_TO_FP16(S[i]); } - ggml_vec_gelu_f16(neb01, S16, S16); + ggml_v2_vec_gelu_f16(neb01, S16, S16); { // dst indices @@ -12637,13 +12637,13 @@ static void ggml_compute_forward_flash_ff_f16( for (int64_t ic = 0; ic < nec01; ++ic) { - ggml_vec_dot_f16(neb01, + ggml_v2_vec_dot_f16(neb01, (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), - (ggml_fp16_t *) ((char *) c0->data + ( ic*nbc01 + i2*nbc02 + i3*nbc03)), + (ggml_v2_fp16_t *) ((char *) c0->data + ( ic*nbc01 + i2*nbc02 + i3*nbc03)), S16); } - ggml_vec_add_f32(nec01, + ggml_v2_vec_add_f32(nec01, (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3)), (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3)), (float *) c1->data); @@ -12651,44 +12651,44 @@ static void ggml_compute_forward_flash_ff_f16( } } -static void ggml_compute_forward_flash_ff( - const struct ggml_compute_params * params, - const struct ggml_tensor * a, - const struct ggml_tensor * b0, - const struct ggml_tensor * b1, - const struct ggml_tensor * c0, - const struct ggml_tensor * c1, - struct ggml_tensor * dst) { +static void ggml_v2_compute_forward_flash_ff( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * a, + const struct ggml_v2_tensor * b0, + const struct ggml_v2_tensor * b1, + const struct ggml_v2_tensor * c0, + const struct ggml_v2_tensor * c1, + struct ggml_v2_tensor * dst) { switch (b0->type) { - case GGML_TYPE_F16: + case GGML_V2_TYPE_F16: { - ggml_compute_forward_flash_ff_f16(params, a, b0, b1, c0, c1, dst); + ggml_v2_compute_forward_flash_ff_f16(params, a, b0, b1, c0, c1, dst); } break; - case GGML_TYPE_F32: + case GGML_V2_TYPE_F32: { - GGML_ASSERT(false); // TODO + GGML_V2_ASSERT(false); // TODO } break; default: { - GGML_ASSERT(false); + GGML_V2_ASSERT(false); } break; } } -// ggml_compute_forward_map_unary +// ggml_v2_compute_forward_map_unary -static void ggml_compute_forward_map_unary_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - struct ggml_tensor * dst, - const ggml_unary_op_f32_t fun) { - GGML_ASSERT(ggml_are_same_shape(src0, dst)); +static void ggml_v2_compute_forward_map_unary_f32( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + struct ggml_v2_tensor * dst, + const ggml_v2_unary_op_f32_t fun) { + GGML_V2_ASSERT(ggml_v2_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + 
if (params->type == GGML_V2_TASK_INIT || params->type == GGML_V2_TASK_FINALIZE) { return; } - const int n = ggml_nrows(src0); + const int n = ggml_v2_nrows(src0); const int nc = src0->ne[0]; assert( dst->nb[0] == sizeof(float)); @@ -12702,39 +12702,39 @@ static void ggml_compute_forward_map_unary_f32( } -static void ggml_compute_forward_map_unary( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - struct ggml_tensor * dst, - const ggml_unary_op_f32_t fun) { +static void ggml_v2_compute_forward_map_unary( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + struct ggml_v2_tensor * dst, + const ggml_v2_unary_op_f32_t fun) { switch (src0->type) { - case GGML_TYPE_F32: + case GGML_V2_TYPE_F32: { - ggml_compute_forward_map_unary_f32(params, src0, dst, fun); + ggml_v2_compute_forward_map_unary_f32(params, src0, dst, fun); } break; default: { - GGML_ASSERT(false); + GGML_V2_ASSERT(false); } break; } } -// ggml_compute_forward_map_binary +// ggml_v2_compute_forward_map_binary -static void ggml_compute_forward_map_binary_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst, - const ggml_binary_op_f32_t fun) { +static void ggml_v2_compute_forward_map_binary_f32( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + const struct ggml_v2_tensor * src1, + struct ggml_v2_tensor * dst, + const ggml_v2_binary_op_f32_t fun) { assert(params->ith == 0); - assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); + assert(ggml_v2_are_same_shape(src0, src1) && ggml_v2_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_V2_TASK_INIT || params->type == GGML_V2_TASK_FINALIZE) { return; } - const int n = ggml_nrows(src0); + const int n = ggml_v2_nrows(src0); const int nc = src0->ne[0]; assert( dst->nb[0] == sizeof(float)); @@ -12750,292 +12750,292 @@ static void ggml_compute_forward_map_binary_f32( } -static void ggml_compute_forward_map_binary( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst, - const ggml_binary_op_f32_t fun) { +static void ggml_v2_compute_forward_map_binary( + const struct ggml_v2_compute_params * params, + const struct ggml_v2_tensor * src0, + const struct ggml_v2_tensor * src1, + struct ggml_v2_tensor * dst, + const ggml_v2_binary_op_f32_t fun) { switch (src0->type) { - case GGML_TYPE_F32: + case GGML_V2_TYPE_F32: { - ggml_compute_forward_map_binary_f32(params, src0, src1, dst, fun); + ggml_v2_compute_forward_map_binary_f32(params, src0, src1, dst, fun); } break; default: { - GGML_ASSERT(false); + GGML_V2_ASSERT(false); } break; } } ///////////////////////////////// -static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) { - GGML_ASSERT(params); +static void ggml_v2_compute_forward(struct ggml_v2_compute_params * params, struct ggml_v2_tensor * tensor) { + GGML_V2_ASSERT(params); switch (tensor->op) { - case GGML_OP_DUP: + case GGML_V2_OP_DUP: { - ggml_compute_forward_dup(params, tensor->src0, tensor); + ggml_v2_compute_forward_dup(params, tensor->src0, tensor); } break; - case GGML_OP_ADD: + case GGML_V2_OP_ADD: { - ggml_compute_forward_add(params, tensor->src0, tensor->src1, tensor); + ggml_v2_compute_forward_add(params, tensor->src0, tensor->src1, tensor); } break; - 
case GGML_OP_ADD1: + case GGML_V2_OP_ADD1: { - ggml_compute_forward_add1(params, tensor->src0, tensor->src1, tensor); + ggml_v2_compute_forward_add1(params, tensor->src0, tensor->src1, tensor); } break; - case GGML_OP_ACC: + case GGML_V2_OP_ACC: { - ggml_compute_forward_acc(params, tensor->src0, tensor->src1, tensor->opt[0], tensor); + ggml_v2_compute_forward_acc(params, tensor->src0, tensor->src1, tensor->opt[0], tensor); } break; - case GGML_OP_SUB: + case GGML_V2_OP_SUB: { - ggml_compute_forward_sub(params, tensor->src0, tensor->src1, tensor); + ggml_v2_compute_forward_sub(params, tensor->src0, tensor->src1, tensor); } break; - case GGML_OP_MUL: + case GGML_V2_OP_MUL: { - ggml_compute_forward_mul(params, tensor->src0, tensor->src1, tensor); + ggml_v2_compute_forward_mul(params, tensor->src0, tensor->src1, tensor); } break; - case GGML_OP_DIV: + case GGML_V2_OP_DIV: { - ggml_compute_forward_div(params, tensor->src0, tensor->src1, tensor); + ggml_v2_compute_forward_div(params, tensor->src0, tensor->src1, tensor); } break; - case GGML_OP_SQR: + case GGML_V2_OP_SQR: { - ggml_compute_forward_sqr(params, tensor->src0, tensor); + ggml_v2_compute_forward_sqr(params, tensor->src0, tensor); } break; - case GGML_OP_SQRT: + case GGML_V2_OP_SQRT: { - ggml_compute_forward_sqrt(params, tensor->src0, tensor); + ggml_v2_compute_forward_sqrt(params, tensor->src0, tensor); } break; - case GGML_OP_LOG: + case GGML_V2_OP_LOG: { - ggml_compute_forward_log(params, tensor->src0, tensor); + ggml_v2_compute_forward_log(params, tensor->src0, tensor); } break; - case GGML_OP_SUM: + case GGML_V2_OP_SUM: { - ggml_compute_forward_sum(params, tensor->src0, tensor); + ggml_v2_compute_forward_sum(params, tensor->src0, tensor); } break; - case GGML_OP_SUM_ROWS: + case GGML_V2_OP_SUM_ROWS: { - ggml_compute_forward_sum_rows(params, tensor->src0, tensor); + ggml_v2_compute_forward_sum_rows(params, tensor->src0, tensor); } break; - case GGML_OP_MEAN: + case GGML_V2_OP_MEAN: { - ggml_compute_forward_mean(params, tensor->src0, tensor); + ggml_v2_compute_forward_mean(params, tensor->src0, tensor); } break; - case GGML_OP_REPEAT: + case GGML_V2_OP_REPEAT: { - ggml_compute_forward_repeat(params, tensor->src0, tensor); + ggml_v2_compute_forward_repeat(params, tensor->src0, tensor); } break; - case GGML_OP_ABS: + case GGML_V2_OP_ABS: { - ggml_compute_forward_abs(params, tensor->src0, tensor); + ggml_v2_compute_forward_abs(params, tensor->src0, tensor); } break; - case GGML_OP_SGN: + case GGML_V2_OP_SGN: { - ggml_compute_forward_sgn(params, tensor->src0, tensor); + ggml_v2_compute_forward_sgn(params, tensor->src0, tensor); } break; - case GGML_OP_NEG: + case GGML_V2_OP_NEG: { - ggml_compute_forward_neg(params, tensor->src0, tensor); + ggml_v2_compute_forward_neg(params, tensor->src0, tensor); } break; - case GGML_OP_STEP: + case GGML_V2_OP_STEP: { - ggml_compute_forward_step(params, tensor->src0, tensor); + ggml_v2_compute_forward_step(params, tensor->src0, tensor); } break; - case GGML_OP_RELU: + case GGML_V2_OP_RELU: { - ggml_compute_forward_relu(params, tensor->src0, tensor); + ggml_v2_compute_forward_relu(params, tensor->src0, tensor); } break; - case GGML_OP_GELU: + case GGML_V2_OP_GELU: { - ggml_compute_forward_gelu(params, tensor->src0, tensor); + ggml_v2_compute_forward_gelu(params, tensor->src0, tensor); } break; - case GGML_OP_SILU: + case GGML_V2_OP_SILU: { - ggml_compute_forward_silu(params, tensor->src0, tensor); + ggml_v2_compute_forward_silu(params, tensor->src0, tensor); } break; - case GGML_OP_SILU_BACK: + case 
GGML_V2_OP_SILU_BACK: { - ggml_compute_forward_silu_back(params, tensor->src0, tensor->src1, tensor); + ggml_v2_compute_forward_silu_back(params, tensor->src0, tensor->src1, tensor); } break; - case GGML_OP_NORM: + case GGML_V2_OP_NORM: { - ggml_compute_forward_norm(params, tensor->src0, tensor); + ggml_v2_compute_forward_norm(params, tensor->src0, tensor); } break; - case GGML_OP_RMS_NORM: + case GGML_V2_OP_RMS_NORM: { - ggml_compute_forward_rms_norm(params, tensor->src0, tensor); + ggml_v2_compute_forward_rms_norm(params, tensor->src0, tensor); } break; - case GGML_OP_RMS_NORM_BACK: + case GGML_V2_OP_RMS_NORM_BACK: { - ggml_compute_forward_rms_norm_back(params, tensor->src0, tensor->src1, tensor); + ggml_v2_compute_forward_rms_norm_back(params, tensor->src0, tensor->src1, tensor); } break; - case GGML_OP_MUL_MAT: + case GGML_V2_OP_MUL_MAT: { - ggml_compute_forward_mul_mat(params, tensor->src0, tensor->src1, tensor); + ggml_v2_compute_forward_mul_mat(params, tensor->src0, tensor->src1, tensor); } break; - case GGML_OP_SCALE: + case GGML_V2_OP_SCALE: { - ggml_compute_forward_scale(params, tensor->src0, tensor->src1, tensor); + ggml_v2_compute_forward_scale(params, tensor->src0, tensor->src1, tensor); } break; - case GGML_OP_SET: + case GGML_V2_OP_SET: { - ggml_compute_forward_set(params, tensor->src0, tensor->src1, tensor->opt[0], tensor); + ggml_v2_compute_forward_set(params, tensor->src0, tensor->src1, tensor->opt[0], tensor); } break; - case GGML_OP_CPY: + case GGML_V2_OP_CPY: { - ggml_compute_forward_cpy(params, tensor->src0, tensor); + ggml_v2_compute_forward_cpy(params, tensor->src0, tensor); } break; - case GGML_OP_CONT: + case GGML_V2_OP_CONT: { - ggml_compute_forward_cont(params, tensor->src0, tensor); + ggml_v2_compute_forward_cont(params, tensor->src0, tensor); } break; - case GGML_OP_RESHAPE: + case GGML_V2_OP_RESHAPE: { - ggml_compute_forward_reshape(params, tensor->src0, tensor); + ggml_v2_compute_forward_reshape(params, tensor->src0, tensor); } break; - case GGML_OP_VIEW: + case GGML_V2_OP_VIEW: { - ggml_compute_forward_view(params, tensor->src0); + ggml_v2_compute_forward_view(params, tensor->src0); } break; - case GGML_OP_PERMUTE: + case GGML_V2_OP_PERMUTE: { - ggml_compute_forward_permute(params, tensor->src0); + ggml_v2_compute_forward_permute(params, tensor->src0); } break; - case GGML_OP_TRANSPOSE: + case GGML_V2_OP_TRANSPOSE: { - ggml_compute_forward_transpose(params, tensor->src0); + ggml_v2_compute_forward_transpose(params, tensor->src0); } break; - case GGML_OP_GET_ROWS: + case GGML_V2_OP_GET_ROWS: { - ggml_compute_forward_get_rows(params, tensor->src0, tensor->src1, tensor); + ggml_v2_compute_forward_get_rows(params, tensor->src0, tensor->src1, tensor); } break; - case GGML_OP_GET_ROWS_BACK: + case GGML_V2_OP_GET_ROWS_BACK: { - ggml_compute_forward_get_rows_back(params, tensor->src0, tensor->src1, tensor->opt[0], tensor); + ggml_v2_compute_forward_get_rows_back(params, tensor->src0, tensor->src1, tensor->opt[0], tensor); } break; - case GGML_OP_DIAG: + case GGML_V2_OP_DIAG: { - ggml_compute_forward_diag(params, tensor->src0, tensor); + ggml_v2_compute_forward_diag(params, tensor->src0, tensor); } break; - case GGML_OP_DIAG_MASK_INF: + case GGML_V2_OP_DIAG_MASK_INF: { - ggml_compute_forward_diag_mask_inf(params, tensor->src0, tensor->src1, tensor); + ggml_v2_compute_forward_diag_mask_inf(params, tensor->src0, tensor->src1, tensor); } break; - case GGML_OP_DIAG_MASK_ZERO: + case GGML_V2_OP_DIAG_MASK_ZERO: { - ggml_compute_forward_diag_mask_zero(params, tensor->src0, 
tensor->src1, tensor); + ggml_v2_compute_forward_diag_mask_zero(params, tensor->src0, tensor->src1, tensor); } break; - case GGML_OP_SOFT_MAX: + case GGML_V2_OP_SOFT_MAX: { - ggml_compute_forward_soft_max(params, tensor->src0, tensor); + ggml_v2_compute_forward_soft_max(params, tensor->src0, tensor); } break; - case GGML_OP_ROPE: + case GGML_V2_OP_ROPE: { - ggml_compute_forward_rope(params, tensor->src0, tensor->src1, tensor); + ggml_v2_compute_forward_rope(params, tensor->src0, tensor->src1, tensor); } break; - case GGML_OP_ROPE_BACK: + case GGML_V2_OP_ROPE_BACK: { - ggml_compute_forward_rope_back(params, tensor->src0, tensor->src1, tensor); + ggml_v2_compute_forward_rope_back(params, tensor->src0, tensor->src1, tensor); } break; - case GGML_OP_ALIBI: + case GGML_V2_OP_ALIBI: { - ggml_compute_forward_alibi(params, tensor->src0, tensor->src1, tensor); + ggml_v2_compute_forward_alibi(params, tensor->src0, tensor->src1, tensor); } break; - case GGML_OP_CONV_1D_1S: + case GGML_V2_OP_CONV_1D_1S: { - ggml_compute_forward_conv_1d_1s(params, tensor->src0, tensor->src1, tensor); + ggml_v2_compute_forward_conv_1d_1s(params, tensor->src0, tensor->src1, tensor); } break; - case GGML_OP_CONV_1D_2S: + case GGML_V2_OP_CONV_1D_2S: { - ggml_compute_forward_conv_1d_2s(params, tensor->src0, tensor->src1, tensor); + ggml_v2_compute_forward_conv_1d_2s(params, tensor->src0, tensor->src1, tensor); } break; - case GGML_OP_FLASH_ATTN: + case GGML_V2_OP_FLASH_ATTN: { - int32_t t = ggml_get_i32_1d(tensor->opt[1], 0); - GGML_ASSERT(t == 0 || t == 1); + int32_t t = ggml_v2_get_i32_1d(tensor->opt[1], 0); + GGML_V2_ASSERT(t == 0 || t == 1); bool masked = t != 0; - ggml_compute_forward_flash_attn(params, tensor->src0, tensor->src1, tensor->opt[0], masked, tensor); + ggml_v2_compute_forward_flash_attn(params, tensor->src0, tensor->src1, tensor->opt[0], masked, tensor); } break; - case GGML_OP_FLASH_FF: + case GGML_V2_OP_FLASH_FF: { - ggml_compute_forward_flash_ff(params, tensor->src0, tensor->src1, tensor->opt[0], tensor->opt[1], tensor->opt[2], tensor); + ggml_v2_compute_forward_flash_ff(params, tensor->src0, tensor->src1, tensor->opt[0], tensor->opt[1], tensor->opt[2], tensor); } break; - case GGML_OP_MAP_UNARY: + case GGML_V2_OP_MAP_UNARY: { - const ggml_unary_op_f32_t fun = *((ggml_unary_op_f32_t *)tensor->opt[0]->data); - ggml_compute_forward_map_unary(params, tensor->src0, tensor, fun); + const ggml_v2_unary_op_f32_t fun = *((ggml_v2_unary_op_f32_t *)tensor->opt[0]->data); + ggml_v2_compute_forward_map_unary(params, tensor->src0, tensor, fun); } break; - case GGML_OP_MAP_BINARY: + case GGML_V2_OP_MAP_BINARY: { - const ggml_binary_op_f32_t fun = *((ggml_binary_op_f32_t *)tensor->opt[0]->data); - ggml_compute_forward_map_binary(params, tensor->src0, tensor->src1, tensor, fun); + const ggml_v2_binary_op_f32_t fun = *((ggml_v2_binary_op_f32_t *)tensor->opt[0]->data); + ggml_v2_compute_forward_map_binary(params, tensor->src0, tensor->src1, tensor, fun); } break; - case GGML_OP_NONE: + case GGML_V2_OP_NONE: { // nop } break; - case GGML_OP_COUNT: + case GGML_V2_OP_COUNT: { - GGML_ASSERT(false); + GGML_V2_ASSERT(false); } break; } } //////////////////////////////////////////////////////////////////////////////// -static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor * tensor, bool inplace) { - struct ggml_tensor * src0 = tensor->src0; - struct ggml_tensor * src1 = tensor->src1; +static void ggml_v2_compute_backward(struct ggml_v2_context * ctx, struct ggml_v2_tensor * tensor, bool inplace) { + 
struct ggml_v2_tensor * src0 = tensor->src0; + struct ggml_v2_tensor * src1 = tensor->src1; switch (tensor->op) { - case GGML_OP_DUP: + case GGML_V2_OP_DUP: { if (src0->grad) { - src0->grad = ggml_add_impl(ctx, src0->grad, tensor->grad, inplace); + src0->grad = ggml_v2_add_impl(ctx, src0->grad, tensor->grad, inplace); } } break; - case GGML_OP_ADD: + case GGML_V2_OP_ADD: { if (src0->grad) { - src0->grad = ggml_add_impl(ctx, src0->grad, tensor->grad, inplace); + src0->grad = ggml_v2_add_impl(ctx, src0->grad, tensor->grad, inplace); } if (src1->grad) { - src1->grad = ggml_add_impl(ctx, src1->grad, tensor->grad, inplace); + src1->grad = ggml_v2_add_impl(ctx, src1->grad, tensor->grad, inplace); } } break; - case GGML_OP_ADD1: + case GGML_V2_OP_ADD1: { if (src0->grad) { - src0->grad = ggml_add_impl(ctx, src0->grad, tensor->grad, inplace); + src0->grad = ggml_v2_add_impl(ctx, src0->grad, tensor->grad, inplace); } if (src1->grad) { - src1->grad = ggml_add_impl(ctx, + src1->grad = ggml_v2_add_impl(ctx, src1->grad, - ggml_mean(ctx, tensor->grad), // TODO: should probably be sum instead of mean + ggml_v2_mean(ctx, tensor->grad), // TODO: should probably be sum instead of mean inplace); } } break; - case GGML_OP_ACC: + case GGML_V2_OP_ACC: { if (src0->grad) { - src0->grad = ggml_add_impl(ctx, src0->grad, tensor->grad, inplace); + src0->grad = ggml_v2_add_impl(ctx, src0->grad, tensor->grad, inplace); } if (src1->grad) { - GGML_ASSERT(ggml_nelements(tensor->opt[0]) == 5); - GGML_ASSERT(tensor->opt[0]->type == GGML_TYPE_I32); + GGML_V2_ASSERT(ggml_v2_nelements(tensor->opt[0]) == 5); + GGML_V2_ASSERT(tensor->opt[0]->type == GGML_V2_TYPE_I32); const size_t nb1 = (( int32_t * ) tensor->opt[0]->data)[0]; const size_t nb2 = (( int32_t * ) tensor->opt[0]->data)[1]; const size_t nb3 = (( int32_t * ) tensor->opt[0]->data)[2]; const size_t offset = (( int32_t * ) tensor->opt[0]->data)[3]; - struct ggml_tensor * tensor_grad_view = ggml_view_4d(ctx, + struct ggml_v2_tensor * tensor_grad_view = ggml_v2_view_4d(ctx, tensor->grad, src1->grad->ne[0], src1->grad->ne[1], @@ -13044,134 +13044,134 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor nb1, nb2, nb3, offset); src1->grad = - ggml_add_impl(ctx, + ggml_v2_add_impl(ctx, src1->grad, - ggml_reshape(ctx, - ggml_cont(ctx, tensor_grad_view), + ggml_v2_reshape(ctx, + ggml_v2_cont(ctx, tensor_grad_view), src1->grad), inplace); } } break; - case GGML_OP_SUB: + case GGML_V2_OP_SUB: { if (src0->grad) { - src0->grad = ggml_add_impl(ctx, src0->grad, tensor->grad, inplace); + src0->grad = ggml_v2_add_impl(ctx, src0->grad, tensor->grad, inplace); } if (src1->grad) { - src1->grad = ggml_sub_impl(ctx, src1->grad, tensor->grad, inplace); + src1->grad = ggml_v2_sub_impl(ctx, src1->grad, tensor->grad, inplace); } } break; - case GGML_OP_MUL: + case GGML_V2_OP_MUL: { if (src0->grad) { src0->grad = - ggml_add_impl(ctx, + ggml_v2_add_impl(ctx, src0->grad, - ggml_mul(ctx, src1, tensor->grad), + ggml_v2_mul(ctx, src1, tensor->grad), inplace); } if (src1->grad) { src1->grad = - ggml_add_impl(ctx, + ggml_v2_add_impl(ctx, src1->grad, - ggml_mul(ctx, src0, tensor->grad), + ggml_v2_mul(ctx, src0, tensor->grad), inplace); } } break; - case GGML_OP_DIV: + case GGML_V2_OP_DIV: { if (src0->grad) { src0->grad = - ggml_add_impl(ctx, + ggml_v2_add_impl(ctx, src0->grad, - ggml_div(ctx, tensor->grad, src1), + ggml_v2_div(ctx, tensor->grad, src1), inplace); } if (src1->grad) { src1->grad = - ggml_sub_impl(ctx, + ggml_v2_sub_impl(ctx, src1->grad, - ggml_mul(ctx, + 
ggml_v2_mul(ctx, tensor->grad, - ggml_div(ctx, tensor, src1)), + ggml_v2_div(ctx, tensor, src1)), inplace); } } break; - case GGML_OP_SQR: + case GGML_V2_OP_SQR: { if (src0->grad) { src0->grad = - ggml_add_impl(ctx, + ggml_v2_add_impl(ctx, src0->grad, - ggml_scale(ctx, - ggml_mul(ctx, src0, tensor->grad), - ggml_new_f32(ctx, 2.0f)), + ggml_v2_scale(ctx, + ggml_v2_mul(ctx, src0, tensor->grad), + ggml_v2_new_f32(ctx, 2.0f)), inplace); } } break; - case GGML_OP_SQRT: + case GGML_V2_OP_SQRT: { if (src0->grad) { src0->grad = - ggml_add_impl(ctx, + ggml_v2_add_impl(ctx, src0->grad, - ggml_mul(ctx, + ggml_v2_mul(ctx, tensor->grad, // this was not catched by test_grad because in test_grad tensor->grad is 1 - ggml_div(ctx, - ggml_repeat(ctx, ggml_new_f32(ctx, 0.5f), tensor), + ggml_v2_div(ctx, + ggml_v2_repeat(ctx, ggml_v2_new_f32(ctx, 0.5f), tensor), tensor)), inplace); } } break; - case GGML_OP_LOG: + case GGML_V2_OP_LOG: { if (src0->grad) { src0->grad = - ggml_add_impl(ctx, + ggml_v2_add_impl(ctx, src0->grad, - ggml_div(ctx, + ggml_v2_div(ctx, tensor->grad, src0), inplace); } } break; - case GGML_OP_SUM: + case GGML_V2_OP_SUM: { if (src0->grad) { src0->grad = - ggml_add1_impl(ctx, + ggml_v2_add1_impl(ctx, src0->grad, tensor->grad, inplace); } } break; - case GGML_OP_SUM_ROWS: + case GGML_V2_OP_SUM_ROWS: { if (src0->grad) { src0->grad = - ggml_add_impl(ctx, + ggml_v2_add_impl(ctx, src0->grad, - ggml_repeat(ctx, + ggml_v2_repeat(ctx, tensor->grad, src0->grad), inplace); } } break; - case GGML_OP_MEAN: + case GGML_V2_OP_MEAN: { - GGML_ASSERT(false); // TODO: implement + GGML_V2_ASSERT(false); // TODO: implement } break; - case GGML_OP_REPEAT: + case GGML_V2_OP_REPEAT: { // necessary for llama if (src0->grad) { - GGML_ASSERT(src0->n_dims == 1 || src0->n_dims == 2); + GGML_V2_ASSERT(src0->n_dims == 1 || src0->n_dims == 2); const int nc = tensor->ne[0]; const int nr = tensor->ne[1]; const int nc0 = src0->ne[0]; const int nr0 = src0->ne[1]; - const int ncr = nc/nc0; // guaranteed to be an integer due to the check in ggml_can_repeat - const int nrr = nr/nr0; // guaranteed to be an integer due to the check in ggml_can_repeat + const int ncr = nc/nc0; // guaranteed to be an integer due to the check in ggml_v2_can_repeat + const int nrr = nr/nr0; // guaranteed to be an integer due to the check in ggml_v2_can_repeat // tensor->grad [nc,nr,1,1] // reshape [nc0,nc/nc0,nr0,nr/nr0] // permute [nc0,nr0,nc/nc0,nr/nr0] @@ -13185,107 +13185,107 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor int64_t ne[4] = {nc0,ncr,nr0,nrr}; - struct ggml_tensor* F00 = tensor->grad; - struct ggml_tensor* F01 = ggml_reshape (ctx, F00, ggml_new_tensor(ctx,tensor->grad->type,4,ne)); - struct ggml_tensor* F02 = ggml_permute (ctx, F01, 0,2,1,3); - struct ggml_tensor* F03 = ggml_cont (ctx, F02); - struct ggml_tensor* F04 = ggml_reshape_2d(ctx, F03, nc0*nr0, ncr*nrr); - struct ggml_tensor* F05 = ggml_transpose (ctx, F04); - struct ggml_tensor* F06 = ggml_cont (ctx, F05); - struct ggml_tensor* F07 = ggml_sum_rows (ctx, F06); - struct ggml_tensor* F08 = ggml_transpose (ctx, F07); - struct ggml_tensor* F09 = ggml_cont (ctx, F08); - struct ggml_tensor* F10 = ggml_reshape (ctx, F09, src0->grad); + struct ggml_v2_tensor* F00 = tensor->grad; + struct ggml_v2_tensor* F01 = ggml_v2_reshape (ctx, F00, ggml_v2_new_tensor(ctx,tensor->grad->type,4,ne)); + struct ggml_v2_tensor* F02 = ggml_v2_permute (ctx, F01, 0,2,1,3); + struct ggml_v2_tensor* F03 = ggml_v2_cont (ctx, F02); + struct ggml_v2_tensor* F04 = 
ggml_v2_reshape_2d(ctx, F03, nc0*nr0, ncr*nrr); + struct ggml_v2_tensor* F05 = ggml_v2_transpose (ctx, F04); + struct ggml_v2_tensor* F06 = ggml_v2_cont (ctx, F05); + struct ggml_v2_tensor* F07 = ggml_v2_sum_rows (ctx, F06); + struct ggml_v2_tensor* F08 = ggml_v2_transpose (ctx, F07); + struct ggml_v2_tensor* F09 = ggml_v2_cont (ctx, F08); + struct ggml_v2_tensor* F10 = ggml_v2_reshape (ctx, F09, src0->grad); src0->grad = - ggml_add_impl(ctx, + ggml_v2_add_impl(ctx, src0->grad, F10, inplace); } } break; - case GGML_OP_ABS: + case GGML_V2_OP_ABS: { if (src0->grad) { src0->grad = - ggml_add_impl(ctx, + ggml_v2_add_impl(ctx, src0->grad, - ggml_mul(ctx, - ggml_sgn(ctx, src0), + ggml_v2_mul(ctx, + ggml_v2_sgn(ctx, src0), tensor->grad), inplace); } } break; - case GGML_OP_SGN: + case GGML_V2_OP_SGN: { if (src0->grad) { // noop } } break; - case GGML_OP_NEG: + case GGML_V2_OP_NEG: { if (src0->grad) { - src0->grad = ggml_sub_impl(ctx, src0->grad, tensor->grad, inplace); + src0->grad = ggml_v2_sub_impl(ctx, src0->grad, tensor->grad, inplace); } } break; - case GGML_OP_STEP: + case GGML_V2_OP_STEP: { if (src0->grad) { // noop } } break; - case GGML_OP_RELU: + case GGML_V2_OP_RELU: { if (src0->grad) { - src0->grad = ggml_sub_impl(ctx, + src0->grad = ggml_v2_sub_impl(ctx, src0->grad, - ggml_mul(ctx, - ggml_step(ctx, src0), + ggml_v2_mul(ctx, + ggml_v2_step(ctx, src0), tensor->grad), inplace); } } break; - case GGML_OP_GELU: + case GGML_V2_OP_GELU: { - GGML_ASSERT(false); // TODO: not implemented + GGML_V2_ASSERT(false); // TODO: not implemented } break; - case GGML_OP_ALIBI: + case GGML_V2_OP_ALIBI: { - GGML_ASSERT(false); // TODO: not implemented + GGML_V2_ASSERT(false); // TODO: not implemented } break; - case GGML_OP_SILU: + case GGML_V2_OP_SILU: { // necessary for llama if (src0->grad) { - src0->grad = ggml_add_impl(ctx, + src0->grad = ggml_v2_add_impl(ctx, src0->grad, - ggml_silu_back(ctx, src0, tensor->grad), + ggml_v2_silu_back(ctx, src0, tensor->grad), inplace); } } break; - case GGML_OP_SILU_BACK: + case GGML_V2_OP_SILU_BACK: { - GGML_ASSERT(false); // TODO: not implemented + GGML_V2_ASSERT(false); // TODO: not implemented } break; - case GGML_OP_NORM: + case GGML_V2_OP_NORM: { - GGML_ASSERT(false); // TODO: not implemented + GGML_V2_ASSERT(false); // TODO: not implemented } break; - case GGML_OP_RMS_NORM: + case GGML_V2_OP_RMS_NORM: { // necessary for llama if (src0->grad) { - src0->grad = ggml_add_impl(ctx, + src0->grad = ggml_v2_add_impl(ctx, src0->grad, - ggml_rms_norm_back(ctx, src0, tensor->grad), + ggml_v2_rms_norm_back(ctx, src0, tensor->grad), inplace); } } break; - case GGML_OP_RMS_NORM_BACK: + case GGML_V2_OP_RMS_NORM_BACK: { - GGML_ASSERT(false); // TODO: not implemented + GGML_V2_ASSERT(false); // TODO: not implemented } break; - case GGML_OP_MUL_MAT: + case GGML_V2_OP_MUL_MAT: { // https://cs231n.github.io/optimization-2/#staged // # forward pass @@ -13304,73 +13304,73 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor // necessary for llama if (src0->grad) { - // TODO: this requires outer product - ggml_out_prod(ctx, src1, tensor->grad); + // TODO: this requires outer product - ggml_v2_out_prod(ctx, src1, tensor->grad); src0->grad = - ggml_add_impl(ctx, + ggml_v2_add_impl(ctx, src0->grad, // ds0 = dt.dot(s1.T) - // ggml_out_prod(ctx, // [n,m] + // ggml_v2_out_prod(ctx, // [n,m] // src1, // [n,p] // tensor->grad), // [m,p] // for now just using A*B==(B.T*A.T).T - ggml_cont(ctx, // [n,m] - ggml_transpose(ctx, // [n,m] - ggml_mul_mat(ctx, // [m,n] - 
ggml_cont(ctx, // [p,m] - ggml_transpose(ctx, // [p,m] + ggml_v2_cont(ctx, // [n,m] + ggml_v2_transpose(ctx, // [n,m] + ggml_v2_mul_mat(ctx, // [m,n] + ggml_v2_cont(ctx, // [p,m] + ggml_v2_transpose(ctx, // [p,m] tensor->grad)), // [m,p] - ggml_cont(ctx, // [p,n] - ggml_transpose(ctx, // [p,n] + ggml_v2_cont(ctx, // [p,n] + ggml_v2_transpose(ctx, // [p,n] src1))))), // [n,p] inplace); } if (src1->grad) { src1->grad = - ggml_add_impl(ctx, + ggml_v2_add_impl(ctx, src1->grad, // ds1 = s0.T.dot(dt): - ggml_mul_mat(ctx, // [n,p] - ggml_cont(ctx, // [m,n] - ggml_transpose(ctx, src0)), // [m,n] + ggml_v2_mul_mat(ctx, // [n,p] + ggml_v2_cont(ctx, // [m,n] + ggml_v2_transpose(ctx, src0)), // [m,n] tensor->grad), // [m,p] inplace); } } break; - case GGML_OP_SCALE: + case GGML_V2_OP_SCALE: { // necessary for llama if (src0->grad) { src0->grad = - ggml_add_impl(ctx, + ggml_v2_add_impl(ctx, src0->grad, - ggml_scale_impl(ctx, tensor->grad, src1, false), + ggml_v2_scale_impl(ctx, tensor->grad, src1, false), inplace); } if (src1->grad) { src1->grad = - ggml_add_impl(ctx, + ggml_v2_add_impl(ctx, src1->grad, - ggml_sum(ctx, ggml_mul_impl(ctx, tensor->grad, src0, false)), + ggml_v2_sum(ctx, ggml_v2_mul_impl(ctx, tensor->grad, src0, false)), inplace); } } break; - case GGML_OP_SET: + case GGML_V2_OP_SET: { - GGML_ASSERT(ggml_nelements(tensor->opt[0]) == 5); - GGML_ASSERT(tensor->opt[0]->type == GGML_TYPE_I32); + GGML_V2_ASSERT(ggml_v2_nelements(tensor->opt[0]) == 5); + GGML_V2_ASSERT(tensor->opt[0]->type == GGML_V2_TYPE_I32); const size_t nb1 = (( int32_t * ) tensor->opt[0]->data)[0]; const size_t nb2 = (( int32_t * ) tensor->opt[0]->data)[1]; const size_t nb3 = (( int32_t * ) tensor->opt[0]->data)[2]; const size_t offset = (( int32_t * ) tensor->opt[0]->data)[3]; - struct ggml_tensor * tensor_grad_view = NULL; + struct ggml_v2_tensor * tensor_grad_view = NULL; if (src0->grad || src1->grad) { - GGML_ASSERT(src0->type == tensor->type); - GGML_ASSERT(tensor->grad->type == tensor->type); - GGML_ASSERT(tensor->grad->type == src1->grad->type); + GGML_V2_ASSERT(src0->type == tensor->type); + GGML_V2_ASSERT(tensor->grad->type == tensor->type); + GGML_V2_ASSERT(tensor->grad->type == src1->grad->type); - tensor_grad_view = ggml_view_4d(ctx, + tensor_grad_view = ggml_v2_view_4d(ctx, tensor->grad, src1->grad->ne[0], src1->grad->ne[1], @@ -13380,26 +13380,26 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor } if (src0->grad) { - src0->grad = ggml_add_impl(ctx, + src0->grad = ggml_v2_add_impl(ctx, src0->grad, - ggml_acc_impl(ctx, + ggml_v2_acc_impl(ctx, tensor->grad, - ggml_neg(ctx, tensor_grad_view), + ggml_v2_neg(ctx, tensor_grad_view), nb1, nb2, nb3, offset, false), inplace); } if (src1->grad) { src1->grad = - ggml_add_impl(ctx, + ggml_v2_add_impl(ctx, src1->grad, - ggml_reshape(ctx, - ggml_cont(ctx, tensor_grad_view), + ggml_v2_reshape(ctx, + ggml_v2_cont(ctx, tensor_grad_view), src1->grad), inplace); } } break; - case GGML_OP_CPY: + case GGML_V2_OP_CPY: { // necessary for llama // cpy overwrites value of src1 by src0 and returns view(src1) @@ -13407,32 +13407,32 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor // tensor = src0 * 1 + src1 * 0 if (src0->grad) { // dsrc0 = dtensor * 1 - src0->grad = ggml_add_impl(ctx, src0->grad, tensor->grad, inplace); + src0->grad = ggml_v2_add_impl(ctx, src0->grad, tensor->grad, inplace); } if (src1->grad) { // dsrc1 = dtensor * 0 -> noop } } break; - case GGML_OP_CONT: + case GGML_V2_OP_CONT: { // same as cpy if 
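The comments in the GGML_V2_OP_MUL_MAT backward case above spell out the two gradient identities the code assembles from ggml_v2_mul_mat, ggml_v2_transpose and ggml_v2_cont: ds0 = dt.dot(s1.T) and ds1 = s0.T.dot(dt), with the A*B == (B.T*A.T).T detour used only because, per the TODO, an outer-product primitive is not available yet. A minimal sketch of those two formulas on plain row-major float arrays standing in for ggml_v2 tensors (illustrative only, not from the patched sources):

    /* Illustrative sketch, not from the patched sources:
     * gradients of C = A*B on row-major arrays, mirroring the comments above:
     *   dA = dC * B^T   (ds0 = dt.dot(s1.T))
     *   dB = A^T * dC   (ds1 = s0.T.dot(dt))
     * A is m x k, B is k x n, C and dC are m x n. */
    #include <stdio.h>

    /* Z = X * Y^T with X: rows x inner, Y: cols x inner, Z: rows x cols */
    static void matmul_nt(const float *X, const float *Y, float *Z,
                          int rows, int cols, int inner) {
        for (int i = 0; i < rows; ++i)
            for (int j = 0; j < cols; ++j) {
                float s = 0.0f;
                for (int t = 0; t < inner; ++t) s += X[i*inner + t] * Y[j*inner + t];
                Z[i*cols + j] = s;
            }
    }

    /* Z = X^T * Y with X: inner x rows, Y: inner x cols, Z: rows x cols */
    static void matmul_tn(const float *X, const float *Y, float *Z,
                          int rows, int cols, int inner) {
        for (int i = 0; i < rows; ++i)
            for (int j = 0; j < cols; ++j) {
                float s = 0.0f;
                for (int t = 0; t < inner; ++t) s += X[t*rows + i] * Y[t*cols + j];
                Z[i*cols + j] = s;
            }
    }

    int main(void) {
        enum { M = 2, K = 3, N = 2 };
        float A[M*K]  = {1, 2, 3, 4, 5, 6};   /* forward inputs */
        float B[K*N]  = {1, 0, 0, 1, 1, 1};
        float dC[M*N] = {1, 1, 1, 1};         /* upstream gradient dL/dC */
        float dA[M*K], dB[K*N];

        matmul_nt(dC, B, dA, M, K, N);        /* dA = dC * B^T */
        matmul_tn(A, dC, dB, K, N, M);        /* dB = A^T * dC */

        for (int i = 0; i < M*K; ++i) printf("dA[%d] = %g\n", i, dA[i]);
        for (int i = 0; i < K*N; ++i) printf("dB[%d] = %g\n", i, dB[i]);
        return 0;
    }
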
(src0->grad) { - GGML_ASSERT(ggml_is_contiguous(src0->grad)); - GGML_ASSERT(ggml_is_contiguous(tensor->grad)); - src0->grad = ggml_add_impl(ctx, src0->grad, tensor->grad, inplace); + GGML_V2_ASSERT(ggml_v2_is_contiguous(src0->grad)); + GGML_V2_ASSERT(ggml_v2_is_contiguous(tensor->grad)); + src0->grad = ggml_v2_add_impl(ctx, src0->grad, tensor->grad, inplace); } } break; - case GGML_OP_RESHAPE: + case GGML_V2_OP_RESHAPE: { // necessary for llama if (src0->grad) { src0->grad = - ggml_add_impl(ctx, src0->grad, - ggml_reshape(ctx, tensor->grad, src0->grad), + ggml_v2_add_impl(ctx, src0->grad, + ggml_v2_reshape(ctx, tensor->grad, src0->grad), inplace); } } break; - case GGML_OP_VIEW: + case GGML_V2_OP_VIEW: { // necessary for llama if (src0->grad) { @@ -13445,22 +13445,22 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor if (src0->type != src0->grad->type) { // gradient is typically F32, but src0 could be other type - size_t ng = ggml_element_size(src0->grad); - size_t n0 = ggml_element_size(src0); - GGML_ASSERT(offset % n0 == 0); - GGML_ASSERT(nb1 % n0 == 0); - GGML_ASSERT(nb2 % n0 == 0); - GGML_ASSERT(nb3 % n0 == 0); + size_t ng = ggml_v2_element_size(src0->grad); + size_t n0 = ggml_v2_element_size(src0); + GGML_V2_ASSERT(offset % n0 == 0); + GGML_V2_ASSERT(nb1 % n0 == 0); + GGML_V2_ASSERT(nb2 % n0 == 0); + GGML_V2_ASSERT(nb3 % n0 == 0); offset = (offset / n0) * ng; nb1 = (nb1 / n0) * ng; nb2 = (nb2 / n0) * ng; nb3 = (nb3 / n0) * ng; } - src0->grad = ggml_acc_impl(ctx, src0->grad, tensor->grad, nb1, nb2, nb3, offset, inplace); + src0->grad = ggml_v2_acc_impl(ctx, src0->grad, tensor->grad, nb1, nb2, nb3, offset, inplace); } } break; - case GGML_OP_PERMUTE: + case GGML_V2_OP_PERMUTE: { // necessary for llama if (src0->grad) { @@ -13474,8 +13474,8 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor axes_backward[axis2] = 2; axes_backward[axis3] = 3; src0->grad = - ggml_add_impl(ctx, src0->grad, - ggml_permute(ctx, + ggml_v2_add_impl(ctx, src0->grad, + ggml_v2_permute(ctx, tensor->grad, axes_backward[0], axes_backward[1], @@ -13484,70 +13484,70 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor inplace); } } break; - case GGML_OP_TRANSPOSE: + case GGML_V2_OP_TRANSPOSE: { // necessary for llama if (src0->grad) { src0->grad = - ggml_add_impl(ctx, src0->grad, - ggml_transpose(ctx, tensor->grad), + ggml_v2_add_impl(ctx, src0->grad, + ggml_v2_transpose(ctx, tensor->grad), inplace); } } break; - case GGML_OP_GET_ROWS: + case GGML_V2_OP_GET_ROWS: { // necessary for llama (only for tokenizer) if (src0->grad) { src0->grad = - ggml_add_impl(ctx, src0->grad, - ggml_get_rows_back(ctx, tensor->grad, src1, src0->grad), + ggml_v2_add_impl(ctx, src0->grad, + ggml_v2_get_rows_back(ctx, tensor->grad, src1, src0->grad), inplace); } if (src1->grad) { // noop } } break; - case GGML_OP_GET_ROWS_BACK: + case GGML_V2_OP_GET_ROWS_BACK: { - GGML_ASSERT(false); // TODO: not implemented + GGML_V2_ASSERT(false); // TODO: not implemented } break; - case GGML_OP_DIAG: + case GGML_V2_OP_DIAG: { - GGML_ASSERT(false); // TODO: not implemented + GGML_V2_ASSERT(false); // TODO: not implemented } break; - case GGML_OP_DIAG_MASK_INF: + case GGML_V2_OP_DIAG_MASK_INF: { // necessary for llama if (src0->grad) { - assert(src1->type == GGML_TYPE_I32); - assert(ggml_nelements(src1) == 2); + assert(src1->type == GGML_V2_TYPE_I32); + assert(ggml_v2_nelements(src1) == 2); const int n_past = ((int32_t *) src1->data)[0]; src0->grad = - 
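The GGML_V2_OP_VIEW backward case above rescales the recorded byte offset and strides when the gradient is stored in a different element size than the viewed tensor (gradient typically F32, src0 possibly F16 or quantized): each quantity is divided by the old element size and multiplied by the new one, after asserting divisibility. A minimal sketch of that conversion; rescale_bytes is a hypothetical helper:

    /* Illustrative sketch, not from the patched sources: rescaling a byte
     * offset/stride from one element size to another, as the GGML_V2_OP_VIEW
     * backward case does when the gradient type differs from the viewed
     * tensor's type. rescale_bytes is a hypothetical helper name. */
    #include <assert.h>
    #include <stddef.h>
    #include <stdio.h>

    static size_t rescale_bytes(size_t nbytes, size_t elem_src, size_t elem_dst) {
        assert(nbytes % elem_src == 0);        /* must cover whole elements */
        return (nbytes / elem_src) * elem_dst; /* same element count, new element size */
    }

    int main(void) {
        /* e.g. a view recorded on an F16 tensor (2 bytes/element) whose
           gradient is accumulated in F32 (4 bytes/element) */
        size_t offset = 128, nb1 = 64;
        printf("offset: %zu -> %zu bytes\n", offset, rescale_bytes(offset, 2, 4));
        printf("nb1:    %zu -> %zu bytes\n", nb1,    rescale_bytes(nb1,    2, 4));
        return 0;
    }
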
ggml_add_impl(ctx, src0->grad, - ggml_diag_mask_zero_impl(ctx, tensor->grad, n_past, false), + ggml_v2_add_impl(ctx, src0->grad, + ggml_v2_diag_mask_zero_impl(ctx, tensor->grad, n_past, false), inplace); } if (src1->grad) { // noop } } break; - case GGML_OP_DIAG_MASK_ZERO: + case GGML_V2_OP_DIAG_MASK_ZERO: { // necessary for llama if (src0->grad) { - assert(src1->type == GGML_TYPE_I32); - assert(ggml_nelements(src1) == 2); + assert(src1->type == GGML_V2_TYPE_I32); + assert(ggml_v2_nelements(src1) == 2); const int n_past = ((int32_t *) src1->data)[0]; src0->grad = - ggml_add_impl(ctx, src0->grad, - ggml_diag_mask_zero_impl(ctx, tensor->grad, n_past, false), + ggml_v2_add_impl(ctx, src0->grad, + ggml_v2_diag_mask_zero_impl(ctx, tensor->grad, n_past, false), inplace); } if (src1->grad) { // noop } } break; - case GGML_OP_SOFT_MAX: + case GGML_V2_OP_SOFT_MAX: { // necessary for llama if (src0->grad) { @@ -13565,30 +13565,30 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor tensor->ne[1]*tensor->ne[2], tensor->ne[3] }; - struct ggml_tensor * tensor2 = ggml_cont(ctx, - ggml_reshape_4d(ctx, - ggml_cont(ctx, tensor), + struct ggml_v2_tensor * tensor2 = ggml_v2_cont(ctx, + ggml_v2_reshape_4d(ctx, + ggml_v2_cont(ctx, tensor), ne2[0], ne2[1], ne2[2], ne2[3])); - struct ggml_tensor * grad2 = ggml_cont(ctx, - ggml_reshape_4d(ctx, - ggml_cont(ctx, tensor->grad), + struct ggml_v2_tensor * grad2 = ggml_v2_cont(ctx, + ggml_v2_reshape_4d(ctx, + ggml_v2_cont(ctx, tensor->grad), ne2[0], ne2[1], ne2[2], ne2[3])); - struct ggml_tensor * tensor2_t = ggml_cont(ctx, // [1,ne0,ne1*ne2,ne3] - ggml_permute(ctx, // [1,ne0,ne1*ne2,ne3] + struct ggml_v2_tensor * tensor2_t = ggml_v2_cont(ctx, // [1,ne0,ne1*ne2,ne3] + ggml_v2_permute(ctx, // [1,ne0,ne1*ne2,ne3] tensor2, // [ne0,1,ne1*ne2,ne3] 1, 0, 2, 3)); src0->grad = - ggml_add_impl(ctx, + ggml_v2_add_impl(ctx, src0->grad, // [ne0,ne1,ne2,ne3] - ggml_reshape(ctx, // [ne0,ne1,ne2,ne3] - ggml_mul_mat(ctx, // [ne0,1,ne1*ne2,ne3] - ggml_sub(ctx, // [ne0,ne0,ne1*ne2,ne3] - ggml_diag(ctx, // [ne0,ne0,ne1*ne2,ne3] + ggml_v2_reshape(ctx, // [ne0,ne1,ne2,ne3] + ggml_v2_mul_mat(ctx, // [ne0,1,ne1*ne2,ne3] + ggml_v2_sub(ctx, // [ne0,ne0,ne1*ne2,ne3] + ggml_v2_diag(ctx, // [ne0,ne0,ne1*ne2,ne3] tensor2), // [ne0,1,ne1*ne2,ne3] - ggml_mul_mat(ctx, // [ne0,ne0,ne1*ne2,ne3] + ggml_v2_mul_mat(ctx, // [ne0,ne0,ne1*ne2,ne3] tensor2_t, // [1,ne0,ne1*ne2,ne3] tensor2_t)), // [1,ne0,ne1*ne2,ne3] grad2), // [ne0,1,ne1*ne2,ne3] @@ -13596,18 +13596,18 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor inplace); } } break; - case GGML_OP_ROPE: + case GGML_V2_OP_ROPE: { // necessary for llama if (src0->grad) { - assert(src1->type == GGML_TYPE_I32); - assert(ggml_nelements(src1) == 3); + assert(src1->type == GGML_V2_TYPE_I32); + assert(ggml_v2_nelements(src1) == 3); const int n_past = ((int32_t *) src1->data)[0]; const int n_dims = ((int32_t *) src1->data)[1]; const int mode = ((int32_t *) src1->data)[2]; - src0->grad = ggml_add_impl(ctx, + src0->grad = ggml_v2_add_impl(ctx, src0->grad, - ggml_rope_back(ctx, + ggml_v2_rope_back(ctx, tensor->grad, n_past, n_dims, @@ -13618,17 +13618,17 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor // noop } } break; - case GGML_OP_ROPE_BACK: + case GGML_V2_OP_ROPE_BACK: { if (src0->grad) { - assert(src1->type == GGML_TYPE_I32); - assert(ggml_nelements(src1) == 3); + assert(src1->type == GGML_V2_TYPE_I32); + assert(ggml_v2_nelements(src1) == 3); const int n_past = 
((int32_t *) src1->data)[0]; const int n_dims = ((int32_t *) src1->data)[1]; const int mode = ((int32_t *) src1->data)[2]; - src0->grad = ggml_add_impl(ctx, + src0->grad = ggml_v2_add_impl(ctx, src0->grad, - ggml_rope(ctx, + ggml_v2_rope(ctx, tensor->grad, n_past, n_dims, @@ -13639,44 +13639,44 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor // noop } } break; - case GGML_OP_CONV_1D_1S: + case GGML_V2_OP_CONV_1D_1S: { - GGML_ASSERT(false); // TODO: not implemented + GGML_V2_ASSERT(false); // TODO: not implemented } break; - case GGML_OP_CONV_1D_2S: + case GGML_V2_OP_CONV_1D_2S: { - GGML_ASSERT(false); // TODO: not implemented + GGML_V2_ASSERT(false); // TODO: not implemented } break; - case GGML_OP_FLASH_ATTN: + case GGML_V2_OP_FLASH_ATTN: { - GGML_ASSERT(false); // not supported + GGML_V2_ASSERT(false); // not supported } break; - case GGML_OP_FLASH_FF: + case GGML_V2_OP_FLASH_FF: { - GGML_ASSERT(false); // not supported + GGML_V2_ASSERT(false); // not supported } break; - case GGML_OP_MAP_UNARY: - case GGML_OP_MAP_BINARY: + case GGML_V2_OP_MAP_UNARY: + case GGML_V2_OP_MAP_BINARY: { - GGML_ASSERT(false); // not supported + GGML_V2_ASSERT(false); // not supported } break; - case GGML_OP_NONE: + case GGML_V2_OP_NONE: { // nop } break; - case GGML_OP_COUNT: + case GGML_V2_OP_COUNT: { - GGML_ASSERT(false); + GGML_V2_ASSERT(false); } break; } } -static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor * node) { +static void ggml_v2_visit_parents(struct ggml_v2_cgraph * cgraph, struct ggml_v2_tensor * node) { if (node->grad == NULL) { // this usually happens when we generate intermediate nodes from constants in the backward pass // it can also happen during forward pass, if the user performs computations with constants - if (node->op != GGML_OP_NONE) { - //GGML_PRINT_DEBUG("%s: warning: node %p has no grad, but op %d\n", __func__, (void *) node, node->op); + if (node->op != GGML_V2_OP_NONE) { + //GGML_V2_PRINT_DEBUG("%s: warning: node %p has no grad, but op %d\n", __func__, (void *) node, node->op); } } @@ -13694,27 +13694,27 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor * } if (node->src0) { - ggml_visit_parents(cgraph, node->src0); + ggml_v2_visit_parents(cgraph, node->src0); } if (node->src1) { - ggml_visit_parents(cgraph, node->src1); + ggml_v2_visit_parents(cgraph, node->src1); } - for (int i = 0; i < GGML_MAX_OPT; ++i) { + for (int i = 0; i < GGML_V2_MAX_OPT; ++i) { if (node->opt[i]) { - ggml_visit_parents(cgraph, node->opt[i]); + ggml_v2_visit_parents(cgraph, node->opt[i]); } } - if (node->op == GGML_OP_NONE && node->grad == NULL) { + if (node->op == GGML_V2_OP_NONE && node->grad == NULL) { // reached a leaf node, not part of the gradient graph (e.g. 
a constant) - GGML_ASSERT(cgraph->n_leafs < GGML_MAX_NODES); + GGML_V2_ASSERT(cgraph->n_leafs < GGML_V2_MAX_NODES); cgraph->leafs[cgraph->n_leafs] = node; cgraph->n_leafs++; } else { - GGML_ASSERT(cgraph->n_nodes < GGML_MAX_NODES); + GGML_V2_ASSERT(cgraph->n_nodes < GGML_V2_MAX_NODES); cgraph->nodes[cgraph->n_nodes] = node; cgraph->grads[cgraph->n_nodes] = node->grad; @@ -13722,7 +13722,7 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor * } } -static void ggml_build_forward_impl(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor, bool expand) { +static void ggml_v2_build_forward_impl(struct ggml_v2_cgraph * cgraph, struct ggml_v2_tensor * tensor, bool expand) { if (!expand) { cgraph->n_nodes = 0; cgraph->n_leafs = 0; @@ -13731,26 +13731,26 @@ static void ggml_build_forward_impl(struct ggml_cgraph * cgraph, struct ggml_ten const int n0 = cgraph->n_nodes; UNUSED(n0); - ggml_visit_parents(cgraph, tensor); + ggml_v2_visit_parents(cgraph, tensor); const int n_new = cgraph->n_nodes - n0; - GGML_PRINT_DEBUG("%s: visited %d new nodes\n", __func__, n_new); + GGML_V2_PRINT_DEBUG("%s: visited %d new nodes\n", __func__, n_new); if (n_new > 0) { // the last added node should always be starting point - GGML_ASSERT(cgraph->nodes[cgraph->n_nodes - 1] == tensor); + GGML_V2_ASSERT(cgraph->nodes[cgraph->n_nodes - 1] == tensor); } } -void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor) { - ggml_build_forward_impl(cgraph, tensor, true); +void ggml_v2_build_forward_expand(struct ggml_v2_cgraph * cgraph, struct ggml_v2_tensor * tensor) { + ggml_v2_build_forward_impl(cgraph, tensor, true); } -struct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) { - struct ggml_cgraph result = { +struct ggml_v2_cgraph ggml_v2_build_forward(struct ggml_v2_tensor * tensor) { + struct ggml_v2_cgraph result = { /*.n_nodes =*/ 0, /*.n_leafs =*/ 0, - /*.n_threads =*/ GGML_DEFAULT_N_THREADS, + /*.n_threads =*/ GGML_V2_DEFAULT_N_THREADS, /*.work_size =*/ 0, /*.work =*/ NULL, /*.nodes =*/ { NULL }, @@ -13761,43 +13761,43 @@ struct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) { /*.perf_time_us =*/ 0, }; - ggml_build_forward_impl(&result, tensor, false); + ggml_v2_build_forward_impl(&result, tensor, false); return result; } -struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep) { - struct ggml_cgraph result = *gf; +struct ggml_v2_cgraph ggml_v2_build_backward(struct ggml_v2_context * ctx, struct ggml_v2_cgraph * gf, bool keep) { + struct ggml_v2_cgraph result = *gf; - GGML_ASSERT(gf->n_nodes > 0); + GGML_V2_ASSERT(gf->n_nodes > 0); // if we are keeping the gradient graph, we have to detach the gradient nodes from the original graph if (keep) { for (int i = 0; i < gf->n_nodes; i++) { - struct ggml_tensor * node = gf->nodes[i]; + struct ggml_v2_tensor * node = gf->nodes[i]; if (node->grad) { - node->grad = ggml_dup_tensor(ctx, node); + node->grad = ggml_v2_dup_tensor(ctx, node); gf->grads[i] = node->grad; } } } for (int i = gf->n_nodes - 1; i >= 0; i--) { - struct ggml_tensor * node = gf->nodes[i]; + struct ggml_v2_tensor * node = gf->nodes[i]; // because we detached the grad nodes from the original graph, we can afford inplace operations if (node->grad) { - ggml_compute_backward(ctx, node, keep); + ggml_v2_compute_backward(ctx, node, keep); } } for (int i = gf->n_nodes - 1; i >= 0; i--) { - struct ggml_tensor * node = gf->nodes[i]; + struct ggml_v2_tensor * node = gf->nodes[i]; if 
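ggml_v2_visit_parents above is a post-order depth-first traversal: it recurses into src0, src1 and the opt[] inputs before appending the node itself, so cgraph->nodes ends up in dependency order with the tensor passed to ggml_v2_build_forward_impl last (which the GGML_V2_ASSERT at the end checks). A toy, self-contained version of that traversal on a fixed-size DAG; node names and limits are illustrative:

    /* Illustrative sketch, not from the patched sources: the post-order DFS
     * that ggml_v2_visit_parents performs, on a toy fixed-size DAG. Visiting
     * inputs first guarantees every node appears after its dependencies. */
    #include <stdbool.h>
    #include <stdio.h>

    #define MAX_NODES 16
    #define MAX_SRC    2

    struct node {
        const char *name;
        int src[MAX_SRC];          /* indices of inputs, -1 if unused */
    };

    static struct node graph[MAX_NODES];
    static bool visited[MAX_NODES];
    static int  order[MAX_NODES];
    static int  n_order = 0;

    static void visit(int i) {
        if (i < 0 || visited[i]) return;
        visited[i] = true;
        for (int k = 0; k < MAX_SRC; ++k) visit(graph[i].src[k]); /* inputs first */
        order[n_order++] = i;                                     /* then the node */
    }

    int main(void) {
        /* c = a*b; d = c + a   (indices: a=0, b=1, c=2, d=3) */
        graph[0] = (struct node){ "a",   {-1, -1} };
        graph[1] = (struct node){ "b",   {-1, -1} };
        graph[2] = (struct node){ "a*b", {  0,  1} };
        graph[3] = (struct node){ "c+a", {  2,  0} };

        visit(3); /* build the graph ending at d; d comes out last */

        for (int i = 0; i < n_order; ++i) printf("%d: %s\n", i, graph[order[i]].name);
        return 0;
    }
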
(node->is_param) { - GGML_PRINT_DEBUG("%s: found root node %p\n", __func__, (void *) node); - ggml_build_forward_impl(&result, node->grad, true); + GGML_V2_PRINT_DEBUG("%s: found root node %p\n", __func__, (void *) node); + ggml_v2_build_forward_impl(&result, node->grad, true); } } @@ -13815,60 +13815,60 @@ struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cg //#include // -//typedef os_unfair_lock ggml_lock_t; +//typedef os_unfair_lock ggml_v2_lock_t; // -//#define ggml_lock_init(x) UNUSED(x) -//#define ggml_lock_destroy(x) UNUSED(x) -//#define ggml_lock_lock os_unfair_lock_lock -//#define ggml_lock_unlock os_unfair_lock_unlock +//#define ggml_v2_lock_init(x) UNUSED(x) +//#define ggml_v2_lock_destroy(x) UNUSED(x) +//#define ggml_v2_lock_lock os_unfair_lock_lock +//#define ggml_v2_lock_unlock os_unfair_lock_unlock // -//#define GGML_LOCK_INITIALIZER OS_UNFAIR_LOCK_INIT +//#define GGML_V2_LOCK_INITIALIZER OS_UNFAIR_LOCK_INIT -typedef int ggml_lock_t; +typedef int ggml_v2_lock_t; -#define ggml_lock_init(x) UNUSED(x) -#define ggml_lock_destroy(x) UNUSED(x) -#define ggml_lock_lock(x) UNUSED(x) -#define ggml_lock_unlock(x) UNUSED(x) +#define ggml_v2_lock_init(x) UNUSED(x) +#define ggml_v2_lock_destroy(x) UNUSED(x) +#define ggml_v2_lock_lock(x) UNUSED(x) +#define ggml_v2_lock_unlock(x) UNUSED(x) -#define GGML_LOCK_INITIALIZER 0 +#define GGML_V2_LOCK_INITIALIZER 0 -typedef pthread_t ggml_thread_t; +typedef pthread_t ggml_v2_thread_t; -#define ggml_thread_create pthread_create -#define ggml_thread_join pthread_join +#define ggml_v2_thread_create pthread_create +#define ggml_v2_thread_join pthread_join #else -//typedef pthread_spinlock_t ggml_lock_t; +//typedef pthread_spinlock_t ggml_v2_lock_t; -//#define ggml_lock_init(x) pthread_spin_init(x, PTHREAD_PROCESS_PRIVATE) -//#define ggml_lock_destroy pthread_spin_destroy -//#define ggml_lock_lock pthread_spin_lock -//#define ggml_lock_unlock pthread_spin_unlock +//#define ggml_v2_lock_init(x) pthread_spin_init(x, PTHREAD_PROCESS_PRIVATE) +//#define ggml_v2_lock_destroy pthread_spin_destroy +//#define ggml_v2_lock_lock pthread_spin_lock +//#define ggml_v2_lock_unlock pthread_spin_unlock -typedef int ggml_lock_t; +typedef int ggml_v2_lock_t; -#define ggml_lock_init(x) UNUSED(x) -#define ggml_lock_destroy(x) UNUSED(x) +#define ggml_v2_lock_init(x) UNUSED(x) +#define ggml_v2_lock_destroy(x) UNUSED(x) #if defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64)) -#define ggml_lock_lock(x) _mm_pause() +#define ggml_v2_lock_lock(x) _mm_pause() #else -#define ggml_lock_lock(x) UNUSED(x) +#define ggml_v2_lock_lock(x) UNUSED(x) #endif -#define ggml_lock_unlock(x) UNUSED(x) +#define ggml_v2_lock_unlock(x) UNUSED(x) -#define GGML_LOCK_INITIALIZER 0 +#define GGML_V2_LOCK_INITIALIZER 0 -typedef pthread_t ggml_thread_t; +typedef pthread_t ggml_v2_thread_t; -#define ggml_thread_create pthread_create -#define ggml_thread_join pthread_join +#define ggml_v2_thread_create pthread_create +#define ggml_v2_thread_join pthread_join #endif -struct ggml_compute_state_shared { - ggml_lock_t spin; +struct ggml_v2_compute_state_shared { + ggml_v2_lock_t spin; int n_threads; @@ -13878,17 +13878,17 @@ struct ggml_compute_state_shared { atomic_bool stop; // stop all threads }; -struct ggml_compute_state { - ggml_thread_t thrd; +struct ggml_v2_compute_state { + ggml_v2_thread_t thrd; - struct ggml_compute_params params; - struct ggml_tensor * node; + struct ggml_v2_compute_params params; + struct ggml_v2_tensor * node; - struct 
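On the pthread path above, the ggml_v2_lock_* macros are not real locks: ggml_v2_lock_lock expands to _mm_pause() on x86-64 and to a no-op elsewhere, so the worker threads busy-wait on atomics and the "lock" is only a CPU relax hint inside the wait loops. A minimal sketch of that spin-wait pattern; the cpu_relax and spin_until_work names are illustrative:

    /* Illustrative sketch, not from the patched sources: what the
     * ggml_v2_lock_* macros amount to on the pthread path - a busy wait with
     * a CPU relax hint, not a mutex. cpu_relax/spin_until_work are
     * illustrative names. */
    #include <stdatomic.h>
    #include <stdio.h>

    #if defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64))
    #include <immintrin.h>
    #define cpu_relax() _mm_pause()
    #else
    #define cpu_relax() ((void)0)
    #endif

    static atomic_bool has_work = false;

    static void spin_until_work(void) {
        /* spin until another thread sets the flag, issuing the pause hint
           on each iteration so the core yields pipeline resources */
        while (!atomic_load(&has_work)) {
            cpu_relax();
        }
    }

    int main(void) {
        atomic_store(&has_work, true); /* single-threaded demo: flag already set */
        spin_until_work();
        printf("work available\n");
        return 0;
    }
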
ggml_compute_state_shared * shared; + struct ggml_v2_compute_state_shared * shared; }; -static thread_ret_t ggml_graph_compute_thread(void * data) { - struct ggml_compute_state * state = (struct ggml_compute_state *) data; +static thread_ret_t ggml_v2_graph_compute_thread(void * data) { + struct ggml_v2_compute_state * state = (struct ggml_v2_compute_state *) data; const int n_threads = state->shared->n_threads; @@ -13900,8 +13900,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { if (atomic_load(&state->shared->stop)) { return 0; } - ggml_lock_lock (&state->shared->spin); - ggml_lock_unlock(&state->shared->spin); + ggml_v2_lock_lock (&state->shared->spin); + ggml_v2_lock_unlock(&state->shared->spin); } } @@ -13912,8 +13912,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { if (atomic_load(&state->shared->stop)) { return 0; } - ggml_lock_lock (&state->shared->spin); - ggml_lock_unlock(&state->shared->spin); + ggml_v2_lock_lock (&state->shared->spin); + ggml_v2_lock_unlock(&state->shared->spin); } // check if we should stop @@ -13923,7 +13923,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { if (state->node) { if (state->params.ith < state->params.nth) { - ggml_compute_forward(&state->params, state->node); + ggml_v2_compute_forward(&state->params, state->node); } state->node = NULL; @@ -13935,40 +13935,40 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { return 0; } -void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) { +void ggml_v2_graph_compute(struct ggml_v2_context * ctx, struct ggml_v2_cgraph * cgraph) { const int n_threads = cgraph->n_threads; - struct ggml_compute_state_shared state_shared = { - /*.spin =*/ GGML_LOCK_INITIALIZER, + struct ggml_v2_compute_state_shared state_shared = { + /*.spin =*/ GGML_V2_LOCK_INITIALIZER, /*.n_threads =*/ n_threads, /*.n_ready =*/ 0, /*.has_work =*/ false, /*.stop =*/ false, }; - struct ggml_compute_state * workers = n_threads > 1 ? alloca(sizeof(struct ggml_compute_state)*(n_threads - 1)) : NULL; + struct ggml_v2_compute_state * workers = n_threads > 1 ? alloca(sizeof(struct ggml_v2_compute_state)*(n_threads - 1)) : NULL; // create thread pool if (n_threads > 1) { - ggml_lock_init(&state_shared.spin); + ggml_v2_lock_init(&state_shared.spin); atomic_store(&state_shared.has_work, true); for (int j = 0; j < n_threads - 1; j++) { - workers[j] = (struct ggml_compute_state) { + workers[j] = (struct ggml_v2_compute_state) { .thrd = 0, .params = { - .type = GGML_TASK_COMPUTE, + .type = GGML_V2_TASK_COMPUTE, .ith = j + 1, .nth = n_threads, - .wsize = cgraph->work ? ggml_nbytes(cgraph->work) : 0, + .wsize = cgraph->work ? ggml_v2_nbytes(cgraph->work) : 0, .wdata = cgraph->work ? 
cgraph->work->data : NULL, }, .node = NULL, .shared = &state_shared, }; - int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]); - GGML_ASSERT(rc == 0); + int rc = ggml_v2_thread_create(&workers[j].thrd, NULL, ggml_v2_graph_compute_thread, &workers[j]); + GGML_V2_ASSERT(rc == 0); UNUSED(rc); } } @@ -13979,80 +13979,80 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) // thread scheduling for the different operations for (int i = 0; i < cgraph->n_nodes; i++) { - struct ggml_tensor * node = cgraph->nodes[i]; + struct ggml_v2_tensor * node = cgraph->nodes[i]; switch (node->op) { - case GGML_OP_CPY: - case GGML_OP_DUP: + case GGML_V2_OP_CPY: + case GGML_V2_OP_DUP: { node->n_tasks = n_threads; size_t cur = 0; - if (ggml_is_quantized(node->type)) { - cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->ne[0] * n_threads; + if (ggml_v2_is_quantized(node->type)) { + cur = GGML_V2_TYPE_SIZE[GGML_V2_TYPE_F32] * node->ne[0] * n_threads; } work_size = MAX(work_size, cur); } break; - case GGML_OP_ADD: - case GGML_OP_ADD1: + case GGML_V2_OP_ADD: + case GGML_V2_OP_ADD1: { node->n_tasks = n_threads; size_t cur = 0; - if (ggml_is_quantized(node->src0->type)) { - cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src0->ne[0] * n_threads; + if (ggml_v2_is_quantized(node->src0->type)) { + cur = GGML_V2_TYPE_SIZE[GGML_V2_TYPE_F32] * node->src0->ne[0] * n_threads; } work_size = MAX(work_size, cur); } break; - case GGML_OP_ACC: + case GGML_V2_OP_ACC: { node->n_tasks = n_threads; size_t cur = 0; - if (ggml_is_quantized(node->src0->type)) { - cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src1->ne[0] * n_threads; + if (ggml_v2_is_quantized(node->src0->type)) { + cur = GGML_V2_TYPE_SIZE[GGML_V2_TYPE_F32] * node->src1->ne[0] * n_threads; } work_size = MAX(work_size, cur); } break; - case GGML_OP_SUB: - case GGML_OP_DIV: - case GGML_OP_SQR: - case GGML_OP_SQRT: - case GGML_OP_LOG: - case GGML_OP_SUM: - case GGML_OP_SUM_ROWS: - case GGML_OP_MEAN: - case GGML_OP_REPEAT: - case GGML_OP_ABS: - case GGML_OP_SGN: - case GGML_OP_NEG: - case GGML_OP_STEP: - case GGML_OP_RELU: + case GGML_V2_OP_SUB: + case GGML_V2_OP_DIV: + case GGML_V2_OP_SQR: + case GGML_V2_OP_SQRT: + case GGML_V2_OP_LOG: + case GGML_V2_OP_SUM: + case GGML_V2_OP_SUM_ROWS: + case GGML_V2_OP_MEAN: + case GGML_V2_OP_REPEAT: + case GGML_V2_OP_ABS: + case GGML_V2_OP_SGN: + case GGML_V2_OP_NEG: + case GGML_V2_OP_STEP: + case GGML_V2_OP_RELU: { node->n_tasks = 1; } break; - case GGML_OP_MUL: - case GGML_OP_GELU: - case GGML_OP_SILU: - case GGML_OP_SILU_BACK: - case GGML_OP_NORM: - case GGML_OP_RMS_NORM: - case GGML_OP_RMS_NORM_BACK: + case GGML_V2_OP_MUL: + case GGML_V2_OP_GELU: + case GGML_V2_OP_SILU: + case GGML_V2_OP_SILU_BACK: + case GGML_V2_OP_NORM: + case GGML_V2_OP_RMS_NORM: + case GGML_V2_OP_RMS_NORM_BACK: { node->n_tasks = n_threads; } break; - case GGML_OP_MUL_MAT: + case GGML_V2_OP_MUL_MAT: { node->n_tasks = n_threads; // TODO: use different scheduling for different matrix sizes - //const int nr0 = ggml_nrows(node->src0); - //const int nr1 = ggml_nrows(node->src1); + //const int nr0 = ggml_v2_nrows(node->src0); + //const int nr1 = ggml_v2_nrows(node->src1); //node->n_tasks = MIN(n_threads, MAX(1, nr0/128)); //printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks = %d\n", nr0, nr1, nr0*nr1, node->n_tasks); @@ -14060,207 +14060,207 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) size_t cur = 0; #if defined(GGML_USE_CUBLAS) - if (ggml_cuda_can_mul_mat(node->src0, 
node->src1, node)) { + if (ggml_v2_cuda_can_mul_mat(node->src0, node->src1, node)) { node->n_tasks = 1; // TODO: this actually is doing nothing // the threads are still spinning - cur = ggml_cuda_mul_mat_get_wsize(node->src0, node->src1, node); + cur = ggml_v2_cuda_mul_mat_get_wsize(node->src0, node->src1, node); } else #elif defined(GGML_USE_CLBLAST) - if (ggml_cl_can_mul_mat(node->src0, node->src1, node)) { + if (ggml_v2_cl_can_mul_mat(node->src0, node->src1, node)) { node->n_tasks = 1; // TODO: this actually is doing nothing // the threads are still spinning - cur = ggml_cl_mul_mat_get_wsize(node->src0, node->src1, node); + cur = ggml_v2_cl_mul_mat_get_wsize(node->src0, node->src1, node); } else #endif - if (node->src0->type == GGML_TYPE_F16 && node->src1->type == GGML_TYPE_F32) { + if (node->src0->type == GGML_V2_TYPE_F16 && node->src1->type == GGML_V2_TYPE_F32) { #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)|| defined(GGML_USE_CLBLAST) - if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { + if (ggml_v2_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { node->n_tasks = 1; // TODO: this actually is doing nothing // the threads are still spinning // here we need memory just for single 2D matrix from src0 - cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]); + cur = GGML_V2_TYPE_SIZE[GGML_V2_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]); } else { - cur = GGML_TYPE_SIZE[GGML_TYPE_F16]*ggml_nelements(node->src1); + cur = GGML_V2_TYPE_SIZE[GGML_V2_TYPE_F16]*ggml_v2_nelements(node->src1); } #else - cur = GGML_TYPE_SIZE[GGML_TYPE_F16]*ggml_nelements(node->src1); + cur = GGML_V2_TYPE_SIZE[GGML_V2_TYPE_F16]*ggml_v2_nelements(node->src1); #endif - } else if (node->src0->type == GGML_TYPE_F32 && node->src1->type == GGML_TYPE_F32) { + } else if (node->src0->type == GGML_V2_TYPE_F32 && node->src1->type == GGML_V2_TYPE_F32) { cur = 0; #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) - if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { + if (ggml_v2_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { node->n_tasks = 1; } #endif - } else if (ggml_is_quantized(node->src0->type) && node->src1->type == GGML_TYPE_F32) { + } else if (ggml_v2_is_quantized(node->src0->type) && node->src1->type == GGML_V2_TYPE_F32) { #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) - if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { + if (ggml_v2_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { node->n_tasks = 1; - cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]); + cur = GGML_V2_TYPE_SIZE[GGML_V2_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]); } else #endif { - const enum ggml_type type_q = get_quantize_fn(node->src0->type).vec_dot_type; - cur = GGML_TYPE_SIZE[type_q]*ggml_nelements(node->src1)/GGML_BLCK_SIZE[type_q]; + const enum ggml_v2_type type_q = get_quantize_fn(node->src0->type).vec_dot_type; + cur = GGML_V2_TYPE_SIZE[type_q]*ggml_v2_nelements(node->src1)/GGML_V2_BLCK_SIZE[type_q]; } } else { - GGML_ASSERT(false); + GGML_V2_ASSERT(false); } work_size = MAX(work_size, cur); } break; - case GGML_OP_SCALE: + case GGML_V2_OP_SCALE: { node->n_tasks = n_threads; } break; - case GGML_OP_SET: - case GGML_OP_CONT: - case GGML_OP_RESHAPE: - case GGML_OP_VIEW: - case GGML_OP_PERMUTE: - case GGML_OP_TRANSPOSE: - case GGML_OP_GET_ROWS: - case GGML_OP_GET_ROWS_BACK: 
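For the quantized mat-mul path above (when the BLAS route is not taken), the work buffer must hold src1 converted to the quantization type returned by get_quantize_fn(...).vec_dot_type, hence cur = GGML_V2_TYPE_SIZE[type_q] * ggml_v2_nelements(src1) / GGML_V2_BLCK_SIZE[type_q]. The same arithmetic on plain integers; the block geometry below is a placeholder, not read from the ggml_v2 type tables:

    /* Illustrative sketch, not from the patched sources: the work-buffer
     * sizing used on the quantized mat-mul path - bytes needed to hold src1
     * once converted to the dot-product quantization type. Block geometry is
     * a placeholder. */
    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    /* mirrors TYPE_SIZE[type_q] * nelements(src1) / BLCK_SIZE[type_q]
       (assumes n_elements is a multiple of the block size) */
    static size_t quantized_size(int64_t n_elements, int64_t elems_per_block,
                                 size_t bytes_per_block) {
        return (size_t)(n_elements / elems_per_block) * bytes_per_block;
    }

    int main(void) {
        const int64_t ne_src1 = 4096 * 32;  /* pretend activation matrix */
        printf("work bytes: %zu\n", quantized_size(ne_src1, 32, 36));
        return 0;
    }
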
- case GGML_OP_DIAG: - case GGML_OP_DIAG_MASK_ZERO: + case GGML_V2_OP_SET: + case GGML_V2_OP_CONT: + case GGML_V2_OP_RESHAPE: + case GGML_V2_OP_VIEW: + case GGML_V2_OP_PERMUTE: + case GGML_V2_OP_TRANSPOSE: + case GGML_V2_OP_GET_ROWS: + case GGML_V2_OP_GET_ROWS_BACK: + case GGML_V2_OP_DIAG: + case GGML_V2_OP_DIAG_MASK_ZERO: { node->n_tasks = 1; } break; - case GGML_OP_DIAG_MASK_INF: - case GGML_OP_SOFT_MAX: - case GGML_OP_ROPE: - case GGML_OP_ROPE_BACK: + case GGML_V2_OP_DIAG_MASK_INF: + case GGML_V2_OP_SOFT_MAX: + case GGML_V2_OP_ROPE: + case GGML_V2_OP_ROPE_BACK: { node->n_tasks = n_threads; } break; - case GGML_OP_ALIBI: + case GGML_V2_OP_ALIBI: { node->n_tasks = 1; //TODO } break; - case GGML_OP_CONV_1D_1S: - case GGML_OP_CONV_1D_2S: + case GGML_V2_OP_CONV_1D_1S: + case GGML_V2_OP_CONV_1D_2S: { node->n_tasks = n_threads; - GGML_ASSERT(node->src0->ne[3] == 1); - GGML_ASSERT(node->src1->ne[2] == 1); - GGML_ASSERT(node->src1->ne[3] == 1); + GGML_V2_ASSERT(node->src0->ne[3] == 1); + GGML_V2_ASSERT(node->src1->ne[2] == 1); + GGML_V2_ASSERT(node->src1->ne[3] == 1); size_t cur = 0; const int nk = node->src0->ne[0]; - if (node->src0->type == GGML_TYPE_F16 && - node->src1->type == GGML_TYPE_F32) { - cur = sizeof(ggml_fp16_t)*( - nk*ggml_up32(node->src0->ne[1])*node->src0->ne[2] + + if (node->src0->type == GGML_V2_TYPE_F16 && + node->src1->type == GGML_V2_TYPE_F32) { + cur = sizeof(ggml_v2_fp16_t)*( + nk*ggml_v2_up32(node->src0->ne[1])*node->src0->ne[2] + ( 2*(nk/2) + node->src1->ne[0])*node->src1->ne[1] ); - } else if (node->src0->type == GGML_TYPE_F32 && - node->src1->type == GGML_TYPE_F32) { + } else if (node->src0->type == GGML_V2_TYPE_F32 && + node->src1->type == GGML_V2_TYPE_F32) { cur = sizeof(float)*( - nk*ggml_up32(node->src0->ne[1])*node->src0->ne[2] + + nk*ggml_v2_up32(node->src0->ne[1])*node->src0->ne[2] + ( 2*(nk/2) + node->src1->ne[0])*node->src1->ne[1] ); } else { - GGML_ASSERT(false); + GGML_V2_ASSERT(false); } work_size = MAX(work_size, cur); } break; - case GGML_OP_FLASH_ATTN: + case GGML_V2_OP_FLASH_ATTN: { node->n_tasks = n_threads; size_t cur = 0; - const int64_t ne11 = ggml_up(node->src1->ne[1], GGML_SOFT_MAX_UNROLL); + const int64_t ne11 = ggml_v2_up(node->src1->ne[1], GGML_V2_SOFT_MAX_UNROLL); - if (node->src1->type == GGML_TYPE_F32) { + if (node->src1->type == GGML_V2_TYPE_F32) { cur = sizeof(float)*ne11*node->n_tasks; // TODO: this can become (n_tasks-1) cur += sizeof(float)*ne11*node->n_tasks; // this is overestimated by x2 } - if (node->src1->type == GGML_TYPE_F16) { + if (node->src1->type == GGML_V2_TYPE_F16) { cur = sizeof(float)*ne11*node->n_tasks; // TODO: this can become (n_tasks-1) cur += sizeof(float)*ne11*node->n_tasks; // this is overestimated by x2 } work_size = MAX(work_size, cur); } break; - case GGML_OP_FLASH_FF: + case GGML_V2_OP_FLASH_FF: { node->n_tasks = n_threads; size_t cur = 0; - if (node->src1->type == GGML_TYPE_F32) { + if (node->src1->type == GGML_V2_TYPE_F32) { cur = sizeof(float)*node->src1->ne[1]*node->n_tasks; // TODO: this can become (n_tasks-1) cur += sizeof(float)*node->src1->ne[1]*node->n_tasks; // this is overestimated by x2 } - if (node->src1->type == GGML_TYPE_F16) { + if (node->src1->type == GGML_V2_TYPE_F16) { cur = sizeof(float)*node->src1->ne[1]*node->n_tasks; // TODO: this can become (n_tasks-1) cur += sizeof(float)*node->src1->ne[1]*node->n_tasks; // this is overestimated by x2 } work_size = MAX(work_size, cur); } break; - case GGML_OP_MAP_UNARY: - case GGML_OP_MAP_BINARY: + case GGML_V2_OP_MAP_UNARY: + case 
GGML_V2_OP_MAP_BINARY: { node->n_tasks = 1; } break; - case GGML_OP_NONE: + case GGML_V2_OP_NONE: { node->n_tasks = 1; } break; - case GGML_OP_COUNT: + case GGML_V2_OP_COUNT: { - GGML_ASSERT(false); + GGML_V2_ASSERT(false); } break; } } if (cgraph->work != NULL && work_size > cgraph->work_size) { - GGML_ASSERT(false); // TODO: better handling + GGML_V2_ASSERT(false); // TODO: better handling } if (work_size > 0 && cgraph->work == NULL) { cgraph->work_size = work_size + CACHE_LINE_SIZE*(n_threads - 1); - GGML_PRINT_DEBUG("%s: allocating work buffer for graph (%zu bytes)\n", __func__, cgraph->work_size); - cgraph->work = ggml_new_tensor_1d(ctx, GGML_TYPE_I8, cgraph->work_size); + GGML_V2_PRINT_DEBUG("%s: allocating work buffer for graph (%zu bytes)\n", __func__, cgraph->work_size); + cgraph->work = ggml_v2_new_tensor_1d(ctx, GGML_V2_TYPE_I8, cgraph->work_size); } } - const int64_t perf_start_cycles = ggml_perf_cycles(); - const int64_t perf_start_time_us = ggml_perf_time_us(); + const int64_t perf_start_cycles = ggml_v2_perf_cycles(); + const int64_t perf_start_time_us = ggml_v2_perf_time_us(); for (int i = 0; i < cgraph->n_nodes; i++) { - GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, i, cgraph->n_nodes); + GGML_V2_PRINT_DEBUG_5("%s: %d/%d\n", __func__, i, cgraph->n_nodes); - struct ggml_tensor * node = cgraph->nodes[i]; + struct ggml_v2_tensor * node = cgraph->nodes[i]; // TODO: this could be used to avoid unnecessary computations, but it needs to be improved //if (node->grad == NULL && node->perf_runs > 0) { // continue; //} - const int64_t perf_node_start_cycles = ggml_perf_cycles(); - const int64_t perf_node_start_time_us = ggml_perf_time_us(); + const int64_t perf_node_start_cycles = ggml_v2_perf_cycles(); + const int64_t perf_node_start_time_us = ggml_v2_perf_time_us(); // INIT - struct ggml_compute_params params = { - /*.type =*/ GGML_TASK_INIT, + struct ggml_v2_compute_params params = { + /*.type =*/ GGML_V2_TASK_INIT, /*.ith =*/ 0, /*.nth =*/ node->n_tasks, - /*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0, + /*.wsize =*/ cgraph->work ? ggml_v2_nbytes(cgraph->work) : 0, /*.wdata =*/ cgraph->work ? cgraph->work->data : NULL, }; - ggml_compute_forward(¶ms, node); + ggml_v2_compute_forward(¶ms, node); // COMPUTE if (node->n_tasks > 1) { @@ -14269,17 +14269,17 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) } while (atomic_load(&state_shared.has_work)) { - ggml_lock_lock (&state_shared.spin); - ggml_lock_unlock(&state_shared.spin); + ggml_v2_lock_lock (&state_shared.spin); + ggml_v2_lock_unlock(&state_shared.spin); } // launch thread pool for (int j = 0; j < n_threads - 1; j++) { - workers[j].params = (struct ggml_compute_params) { - .type = GGML_TASK_COMPUTE, + workers[j].params = (struct ggml_v2_compute_params) { + .type = GGML_V2_TASK_COMPUTE, .ith = j + 1, .nth = node->n_tasks, - .wsize = cgraph->work ? ggml_nbytes(cgraph->work) : 0, + .wsize = cgraph->work ? ggml_v2_nbytes(cgraph->work) : 0, .wdata = cgraph->work ? 
cgraph->work->data : NULL, }; workers[j].node = node; @@ -14288,15 +14288,15 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) atomic_fetch_sub(&state_shared.n_ready, 1); while (atomic_load(&state_shared.n_ready) > 0) { - ggml_lock_lock (&state_shared.spin); - ggml_lock_unlock(&state_shared.spin); + ggml_v2_lock_lock (&state_shared.spin); + ggml_v2_lock_unlock(&state_shared.spin); } atomic_store(&state_shared.has_work, true); } - params.type = GGML_TASK_COMPUTE; - ggml_compute_forward(¶ms, node); + params.type = GGML_V2_TASK_COMPUTE; + ggml_v2_compute_forward(¶ms, node); // wait for thread pool if (node->n_tasks > 1) { @@ -14305,15 +14305,15 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) } while (atomic_load(&state_shared.has_work)) { - ggml_lock_lock (&state_shared.spin); - ggml_lock_unlock(&state_shared.spin); + ggml_v2_lock_lock (&state_shared.spin); + ggml_v2_lock_unlock(&state_shared.spin); } atomic_fetch_sub(&state_shared.n_ready, 1); while (atomic_load(&state_shared.n_ready) != 0) { - ggml_lock_lock (&state_shared.spin); - ggml_lock_unlock(&state_shared.spin); + ggml_v2_lock_lock (&state_shared.spin); + ggml_v2_lock_unlock(&state_shared.spin); } } @@ -14324,17 +14324,17 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) } while (atomic_load(&state_shared.has_work)) { - ggml_lock_lock (&state_shared.spin); - ggml_lock_unlock(&state_shared.spin); + ggml_v2_lock_lock (&state_shared.spin); + ggml_v2_lock_unlock(&state_shared.spin); } // launch thread pool for (int j = 0; j < n_threads - 1; j++) { - workers[j].params = (struct ggml_compute_params) { - .type = GGML_TASK_FINALIZE, + workers[j].params = (struct ggml_v2_compute_params) { + .type = GGML_V2_TASK_FINALIZE, .ith = j + 1, .nth = node->n_tasks, - .wsize = cgraph->work ? ggml_nbytes(cgraph->work) : 0, + .wsize = cgraph->work ? ggml_v2_nbytes(cgraph->work) : 0, .wdata = cgraph->work ? 
cgraph->work->data : NULL, }; workers[j].node = node; @@ -14343,15 +14343,15 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) atomic_fetch_sub(&state_shared.n_ready, 1); while (atomic_load(&state_shared.n_ready) > 0) { - ggml_lock_lock (&state_shared.spin); - ggml_lock_unlock(&state_shared.spin); + ggml_v2_lock_lock (&state_shared.spin); + ggml_v2_lock_unlock(&state_shared.spin); } atomic_store(&state_shared.has_work, true); } - params.type = GGML_TASK_FINALIZE; - ggml_compute_forward(¶ms, node); + params.type = GGML_V2_TASK_FINALIZE; + ggml_v2_compute_forward(¶ms, node); // wait for thread pool if (node->n_tasks > 1) { @@ -14360,22 +14360,22 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) } while (atomic_load(&state_shared.has_work)) { - ggml_lock_lock (&state_shared.spin); - ggml_lock_unlock(&state_shared.spin); + ggml_v2_lock_lock (&state_shared.spin); + ggml_v2_lock_unlock(&state_shared.spin); } atomic_fetch_sub(&state_shared.n_ready, 1); while (atomic_load(&state_shared.n_ready) != 0) { - ggml_lock_lock (&state_shared.spin); - ggml_lock_unlock(&state_shared.spin); + ggml_v2_lock_lock (&state_shared.spin); + ggml_v2_lock_unlock(&state_shared.spin); } } // performance stats (node) { - int64_t perf_cycles_cur = ggml_perf_cycles() - perf_node_start_cycles; - int64_t perf_time_us_cur = ggml_perf_time_us() - perf_node_start_time_us; + int64_t perf_cycles_cur = ggml_v2_perf_cycles() - perf_node_start_cycles; + int64_t perf_time_us_cur = ggml_v2_perf_time_us() - perf_node_start_time_us; node->perf_runs++; node->perf_cycles += perf_cycles_cur; @@ -14389,89 +14389,89 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) atomic_store(&state_shared.has_work, true); for (int j = 0; j < n_threads - 1; j++) { - int rc = ggml_thread_join(workers[j].thrd, NULL); - GGML_ASSERT(rc == 0); + int rc = ggml_v2_thread_join(workers[j].thrd, NULL); + GGML_V2_ASSERT(rc == 0); UNUSED(rc); } - ggml_lock_destroy(&state_shared.spin); + ggml_v2_lock_destroy(&state_shared.spin); } // performance stats (graph) { - int64_t perf_cycles_cur = ggml_perf_cycles() - perf_start_cycles; - int64_t perf_time_us_cur = ggml_perf_time_us() - perf_start_time_us; + int64_t perf_cycles_cur = ggml_v2_perf_cycles() - perf_start_cycles; + int64_t perf_time_us_cur = ggml_v2_perf_time_us() - perf_start_time_us; cgraph->perf_runs++; cgraph->perf_cycles += perf_cycles_cur; cgraph->perf_time_us += perf_time_us_cur; - GGML_PRINT_DEBUG("%s: perf (%d) - cpu = %.3f / %.3f ms, wall = %.3f / %.3f ms\n", + GGML_V2_PRINT_DEBUG("%s: perf (%d) - cpu = %.3f / %.3f ms, wall = %.3f / %.3f ms\n", __func__, cgraph->perf_runs, - (double) perf_cycles_cur / (double) ggml_cycles_per_ms(), - (double) cgraph->perf_cycles / (double) ggml_cycles_per_ms() / (double) cgraph->perf_runs, + (double) perf_cycles_cur / (double) ggml_v2_cycles_per_ms(), + (double) cgraph->perf_cycles / (double) ggml_v2_cycles_per_ms() / (double) cgraph->perf_runs, (double) perf_time_us_cur / 1000.0, (double) cgraph->perf_time_us / 1000.0 / cgraph->perf_runs); } } -void ggml_graph_reset(struct ggml_cgraph * cgraph) { +void ggml_v2_graph_reset(struct ggml_v2_cgraph * cgraph) { for (int i = 0; i < cgraph->n_nodes; i++) { - struct ggml_tensor * grad = cgraph->grads[i]; + struct ggml_v2_tensor * grad = cgraph->grads[i]; if (grad) { - ggml_set_zero(grad); + ggml_v2_set_zero(grad); } } } -void ggml_graph_print(const struct ggml_cgraph * cgraph) { - int64_t perf_total_per_op_us[GGML_OP_COUNT] = {0}; 
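The per-node bookkeeping above (perf_runs, perf_cycles, perf_time_us) is what ggml_v2_graph_print later folds into per-op totals. The same pattern in miniature, with clock() standing in for ggml_v2_perf_time_us and a made-up two-op graph:

    /* Illustrative sketch, not from the patched sources: the per-node timing
     * pattern that feeds ggml_v2_graph_print's per-op totals. clock() stands
     * in for ggml_v2_perf_time_us; the toy op enum and nodes are made up. */
    #include <stdint.h>
    #include <stdio.h>
    #include <time.h>

    enum toy_op { TOY_OP_ADD, TOY_OP_MUL, TOY_OP_COUNT };

    struct toy_node {
        enum toy_op op;
        int         perf_runs;
        int64_t     perf_time_us;
    };

    static int64_t time_us(void) {
        /* CPU time, good enough for a sketch */
        return (int64_t)clock() * 1000000 / CLOCKS_PER_SEC;
    }

    static void run_node(struct toy_node *node) {
        const int64_t t0 = time_us();
        /* ... compute the node here ... */
        node->perf_runs++;
        node->perf_time_us += time_us() - t0;
    }

    int main(void) {
        struct toy_node nodes[2] = { {TOY_OP_ADD, 0, 0}, {TOY_OP_MUL, 0, 0} };
        int64_t per_op_us[TOY_OP_COUNT] = {0};

        for (int i = 0; i < 2; ++i) run_node(&nodes[i]);
        for (int i = 0; i < 2; ++i) per_op_us[nodes[i].op] += nodes[i].perf_time_us;

        printf("ADD: %lld us, MUL: %lld us\n",
               (long long)per_op_us[TOY_OP_ADD], (long long)per_op_us[TOY_OP_MUL]);
        return 0;
    }
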
+void ggml_v2_graph_print(const struct ggml_v2_cgraph * cgraph) { + int64_t perf_total_per_op_us[GGML_V2_OP_COUNT] = {0}; - GGML_PRINT("=== GRAPH ===\n"); + GGML_V2_PRINT("=== GRAPH ===\n"); - GGML_PRINT_DEBUG("n_threads = %d\n", cgraph->n_threads); - GGML_PRINT_DEBUG("total work size = %zu bytes\n", cgraph->work_size); + GGML_V2_PRINT_DEBUG("n_threads = %d\n", cgraph->n_threads); + GGML_V2_PRINT_DEBUG("total work size = %zu bytes\n", cgraph->work_size); - GGML_PRINT("n_nodes = %d\n", cgraph->n_nodes); + GGML_V2_PRINT("n_nodes = %d\n", cgraph->n_nodes); for (int i = 0; i < cgraph->n_nodes; i++) { - struct ggml_tensor * node = cgraph->nodes[i]; + struct ggml_v2_tensor * node = cgraph->nodes[i]; perf_total_per_op_us[node->op] += MAX(1, node->perf_time_us); - GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 ", %5" PRId64 "] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n", + GGML_V2_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 ", %5" PRId64 "] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n", i, node->ne[0], node->ne[1], node->ne[2], - GGML_OP_LABEL[node->op], node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs, - (double) node->perf_cycles / (double) ggml_cycles_per_ms(), - (double) node->perf_cycles / (double) ggml_cycles_per_ms() / (double) node->perf_runs, + GGML_V2_OP_LABEL[node->op], node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs, + (double) node->perf_cycles / (double) ggml_v2_cycles_per_ms(), + (double) node->perf_cycles / (double) ggml_v2_cycles_per_ms() / (double) node->perf_runs, (double) node->perf_time_us / 1000.0, (double) node->perf_time_us / 1000.0 / node->perf_runs); } - GGML_PRINT("n_leafs = %d\n", cgraph->n_leafs); + GGML_V2_PRINT("n_leafs = %d\n", cgraph->n_leafs); for (int i = 0; i < cgraph->n_leafs; i++) { - struct ggml_tensor * node = cgraph->leafs[i]; + struct ggml_v2_tensor * node = cgraph->leafs[i]; - GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 "] %8s\n", + GGML_V2_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 "] %8s\n", i, node->ne[0], node->ne[1], - GGML_OP_LABEL[node->op]); + GGML_V2_OP_LABEL[node->op]); } - for (int i = 0; i < GGML_OP_COUNT; i++) { + for (int i = 0; i < GGML_V2_OP_COUNT; i++) { if (perf_total_per_op_us[i] == 0) { continue; } - GGML_PRINT("perf_total_per_op_us[%16s] = %7.3f ms\n", GGML_OP_LABEL[i], (double) perf_total_per_op_us[i] / 1000.0); + GGML_V2_PRINT("perf_total_per_op_us[%16s] = %7.3f ms\n", GGML_V2_OP_LABEL[i], (double) perf_total_per_op_us[i] / 1000.0); } - GGML_PRINT("========================================\n"); + GGML_V2_PRINT("========================================\n"); } // check if node is part of the graph -static bool ggml_graph_find(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node) { +static bool ggml_v2_graph_find(const struct ggml_v2_cgraph * cgraph, const struct ggml_v2_tensor * node) { if (cgraph == NULL) { return true; } @@ -14485,9 +14485,9 @@ static bool ggml_graph_find(const struct ggml_cgraph * cgraph, const struct ggml return false; } -static struct ggml_tensor * ggml_graph_get_parent(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node) { +static struct ggml_v2_tensor * ggml_v2_graph_get_parent(const struct ggml_v2_cgraph * cgraph, const struct ggml_v2_tensor * node) { for (int i = 0; i < cgraph->n_nodes; i++) { - struct ggml_tensor * parent = cgraph->nodes[i]; + struct ggml_v2_tensor * parent = cgraph->nodes[i]; if (parent->grad == node) { return parent; @@ -14497,27 +14497,27 @@ static struct ggml_tensor * 
ggml_graph_get_parent(const struct ggml_cgraph * cgr return NULL; } -void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename) { +void ggml_v2_graph_dump_dot(const struct ggml_v2_cgraph * gb, const struct ggml_v2_cgraph * gf, const char * filename) { char color[16]; FILE * fp = fopen(filename, "w"); - GGML_ASSERT(fp); + GGML_V2_ASSERT(fp); fprintf(fp, "digraph G {\n"); fprintf(fp, " newrank = true;\n"); fprintf(fp, " rankdir = LR;\n"); for (int i = 0; i < gb->n_nodes; i++) { - struct ggml_tensor * node = gb->nodes[i]; + struct ggml_v2_tensor * node = gb->nodes[i]; - if (ggml_graph_get_parent(gb, node) != NULL) { + if (ggml_v2_graph_get_parent(gb, node) != NULL) { continue; } if (node->is_param) { snprintf(color, sizeof(color), "yellow"); } else if (node->grad) { - if (ggml_graph_find(gf, node)) { + if (ggml_v2_graph_find(gf, node)) { snprintf(color, sizeof(color), "green"); } else { snprintf(color, sizeof(color), "lightblue"); @@ -14537,17 +14537,17 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph fprintf(fp, "%d [%" PRId64 ", %" PRId64 "] | %s", i, node->ne[0], node->ne[1], - GGML_OP_SYMBOL[node->op]); + GGML_V2_OP_SYMBOL[node->op]); if (node->grad) { - fprintf(fp, " | %s\"; ]\n", GGML_OP_SYMBOL[node->grad->op]); + fprintf(fp, " | %s\"; ]\n", GGML_V2_OP_SYMBOL[node->grad->op]); } else { fprintf(fp, "\"; ]\n"); } } for (int i = 0; i < gb->n_leafs; i++) { - struct ggml_tensor * node = gb->leafs[i]; + struct ggml_v2_tensor * node = gb->leafs[i]; snprintf(color, sizeof(color), "pink"); @@ -14559,12 +14559,12 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph if (strlen(node->name) > 0) { fprintf(fp, "%s | ", node->name); } - if (ggml_nelements(node) == 1) { - if (node->type == GGML_TYPE_I8 || node->type == GGML_TYPE_I16 || node->type == GGML_TYPE_I32) { - fprintf(fp, "%d", ggml_get_i32_1d(node, 0)); + if (ggml_v2_nelements(node) == 1) { + if (node->type == GGML_V2_TYPE_I8 || node->type == GGML_V2_TYPE_I16 || node->type == GGML_V2_TYPE_I32) { + fprintf(fp, "%d", ggml_v2_get_i32_1d(node, 0)); } else { - fprintf(fp, "%.1e", (double)ggml_get_f32_1d(node, 0)); + fprintf(fp, "%.1e", (double)ggml_v2_get_f32_1d(node, 0)); } } else { @@ -14574,12 +14574,12 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph } for (int i = 0; i < gb->n_nodes; i++) { - struct ggml_tensor * node = gb->nodes[i]; + struct ggml_v2_tensor * node = gb->nodes[i]; - struct ggml_tensor * parent = ggml_graph_get_parent(gb, node); + struct ggml_v2_tensor * parent = ggml_v2_graph_get_parent(gb, node); if (node->src0) { - struct ggml_tensor * parent0 = ggml_graph_get_parent(gb, node->src0); + struct ggml_v2_tensor * parent0 = ggml_v2_graph_get_parent(gb, node->src0); fprintf(fp, " \"%p\":%s -> \"%p\":%s [ arrowhead = %s; style = %s; label = \"x\"; ]\n", parent0 ? (void *) parent0 : (void *) node->src0, @@ -14591,7 +14591,7 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph } if (node->src1) { - struct ggml_tensor * parent1 = ggml_graph_get_parent(gb, node->src1); + struct ggml_v2_tensor * parent1 = ggml_v2_graph_get_parent(gb, node->src1); fprintf(fp, " \"%p\":%s -> \"%p\":%s [ arrowhead = %s; style = %s; label = \"y\"; ]\n", parent1 ? 
(void *) parent1 : (void *) node->src1, @@ -14604,7 +14604,7 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph } for (int i = 0; i < gb->n_leafs; i++) { - struct ggml_tensor * node = gb->leafs[i]; + struct ggml_v2_tensor * node = gb->leafs[i]; if (node->src0) { fprintf(fp, " \"%p\":%s -> \"%p\":%s [ label = \"x\"; ]\n", @@ -14623,40 +14623,40 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph fclose(fp); - GGML_PRINT("%s: dot -Tpng %s -o %s.png && open %s.png\n", __func__, filename, filename, filename); + GGML_V2_PRINT("%s: dot -Tpng %s -o %s.png && open %s.png\n", __func__, filename, filename, filename); } //////////////////////////////////////////////////////////////////////////////// -static void ggml_opt_set_params(int np, struct ggml_tensor * const ps[], const float * x) { +static void ggml_v2_opt_set_params(int np, struct ggml_v2_tensor * const ps[], const float * x) { int i = 0; for (int p = 0; p < np; ++p) { - const int64_t ne = ggml_nelements(ps[p]) ; + const int64_t ne = ggml_v2_nelements(ps[p]) ; // TODO: add function to set tensor from array for (int64_t j = 0; j < ne; ++j) { - ggml_set_f32_1d(ps[p], j, x[i++]); + ggml_v2_set_f32_1d(ps[p], j, x[i++]); } } } -static void ggml_opt_get_params(int np, struct ggml_tensor * const ps[], float * x) { +static void ggml_v2_opt_get_params(int np, struct ggml_v2_tensor * const ps[], float * x) { int i = 0; for (int p = 0; p < np; ++p) { - const int64_t ne = ggml_nelements(ps[p]) ; + const int64_t ne = ggml_v2_nelements(ps[p]) ; // TODO: add function to get all elements at once for (int64_t j = 0; j < ne; ++j) { - x[i++] = ggml_get_f32_1d(ps[p], j); + x[i++] = ggml_v2_get_f32_1d(ps[p], j); } } } -static void ggml_opt_get_grad(int np, struct ggml_tensor * const ps[], float * g) { +static void ggml_v2_opt_get_grad(int np, struct ggml_v2_tensor * const ps[], float * g) { int i = 0; for (int p = 0; p < np; ++p) { - const int64_t ne = ggml_nelements(ps[p]) ; + const int64_t ne = ggml_v2_nelements(ps[p]) ; // TODO: add function to get all elements at once for (int64_t j = 0; j < ne; ++j) { - g[i++] = ggml_get_f32_1d(ps[p]->grad, j); + g[i++] = ggml_v2_get_f32_1d(ps[p]->grad, j); } } } @@ -14667,30 +14667,30 @@ static void ggml_opt_get_grad(int np, struct ggml_tensor * const ps[], float * g // ref: https://arxiv.org/pdf/1412.6980.pdf // -static enum ggml_opt_result ggml_opt_adam( - struct ggml_context * ctx, - struct ggml_opt_params params, - struct ggml_tensor * f, - struct ggml_cgraph * gf, - struct ggml_cgraph * gb) { - GGML_ASSERT(ggml_is_scalar(f)); +static enum ggml_v2_opt_result ggml_v2_opt_adam( + struct ggml_v2_context * ctx, + struct ggml_v2_opt_params params, + struct ggml_v2_tensor * f, + struct ggml_v2_cgraph * gf, + struct ggml_v2_cgraph * gb) { + GGML_V2_ASSERT(ggml_v2_is_scalar(f)); gf->n_threads = params.n_threads; gb->n_threads = params.n_threads; // these will store the parameters we want to optimize - struct ggml_tensor * ps[GGML_MAX_PARAMS]; + struct ggml_v2_tensor * ps[GGML_V2_MAX_PARAMS]; int np = 0; int nx = 0; for (int i = 0; i < gf->n_nodes; ++i) { if (gf->nodes[i]->is_param) { - GGML_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op); + GGML_V2_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op); - GGML_ASSERT(np < GGML_MAX_PARAMS); + GGML_V2_ASSERT(np < GGML_V2_MAX_PARAMS); ps[np++] = gf->nodes[i]; - nx += ggml_nelements(gf->nodes[i]); + nx += ggml_v2_nelements(gf->nodes[i]); } } @@ -14700,29 +14700,29 @@ 
static enum ggml_opt_result ggml_opt_adam( const float beta2 = params.adam.beta2; const float eps = params.adam.eps; - float * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // view of the parameters - float * g1 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // gradient - float * g2 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // gradient squared - float * m = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // first moment - float * v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // second moment - float * mh = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // first moment hat - float * vh = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // second moment hat + float * x = ggml_v2_new_tensor_1d(ctx, GGML_V2_TYPE_F32, nx)->data; // view of the parameters + float * g1 = ggml_v2_new_tensor_1d(ctx, GGML_V2_TYPE_F32, nx)->data; // gradient + float * g2 = ggml_v2_new_tensor_1d(ctx, GGML_V2_TYPE_F32, nx)->data; // gradient squared + float * m = ggml_v2_new_tensor_1d(ctx, GGML_V2_TYPE_F32, nx)->data; // first moment + float * v = ggml_v2_new_tensor_1d(ctx, GGML_V2_TYPE_F32, nx)->data; // second moment + float * mh = ggml_v2_new_tensor_1d(ctx, GGML_V2_TYPE_F32, nx)->data; // first moment hat + float * vh = ggml_v2_new_tensor_1d(ctx, GGML_V2_TYPE_F32, nx)->data; // second moment hat - float * pf = params.past > 0 ? ggml_new_tensor_1d(ctx, GGML_TYPE_F32, params.past)->data : NULL; // past function values + float * pf = params.past > 0 ? ggml_v2_new_tensor_1d(ctx, GGML_V2_TYPE_F32, params.past)->data : NULL; // past function values // initialize - ggml_vec_set_f32(nx, m, 0.0f); - ggml_vec_set_f32(nx, v, 0.0f); + ggml_v2_vec_set_f32(nx, m, 0.0f); + ggml_v2_vec_set_f32(nx, v, 0.0f); // update view - ggml_opt_get_params(np, ps, x); + ggml_v2_opt_get_params(np, ps, x); // compute the function value - ggml_graph_reset (gf); - ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute(ctx, gb); + ggml_v2_graph_reset (gf); + ggml_v2_set_f32 (f->grad, 1.0f); + ggml_v2_graph_compute(ctx, gb); - float fx_prev = ggml_get_f32_1d(f, 0); + float fx_prev = ggml_v2_get_f32_1d(f, 0); if (pf) { pf[0] = fx_prev; } @@ -14732,67 +14732,67 @@ static enum ggml_opt_result ggml_opt_adam( // run the optimizer for (int t = 0; t < params.adam.n_iter; ++t) { - GGML_PRINT_DEBUG ("=== iter %d ===\n", t); + GGML_V2_PRINT_DEBUG ("=== iter %d ===\n", t); - GGML_PRINT_DEBUG ("f = %10.6f\n", ggml_get_f32_1d(f, 0)); - GGML_PRINT_DEBUG_5("df/dx0 = %10.6f\n", ggml_get_f32_1d(ps[0]->grad, 0)); - GGML_PRINT_DEBUG_5("df/dx1 = %10.6f\n", ggml_get_f32_1d(ps[1]->grad, 0)); + GGML_V2_PRINT_DEBUG ("f = %10.6f\n", ggml_v2_get_f32_1d(f, 0)); + GGML_V2_PRINT_DEBUG_5("df/dx0 = %10.6f\n", ggml_v2_get_f32_1d(ps[0]->grad, 0)); + GGML_V2_PRINT_DEBUG_5("df/dx1 = %10.6f\n", ggml_v2_get_f32_1d(ps[1]->grad, 0)); for (int i = 0; i < np; ++i) { - GGML_PRINT_DEBUG("param %d: %10.6f, g = %10.6f\n", i, - ggml_get_f32_1d(ps[i], 0), ggml_get_f32_1d(ps[i]->grad, 0)); + GGML_V2_PRINT_DEBUG("param %d: %10.6f, g = %10.6f\n", i, + ggml_v2_get_f32_1d(ps[i], 0), ggml_v2_get_f32_1d(ps[i]->grad, 0)); } - const int64_t t_start_wall = ggml_time_us(); - const int64_t t_start_cpu = ggml_cycles(); + const int64_t t_start_wall = ggml_v2_time_us(); + const int64_t t_start_cpu = ggml_v2_cycles(); UNUSED(t_start_wall); UNUSED(t_start_cpu); { // update the gradient - ggml_opt_get_grad(np, ps, g1); + ggml_v2_opt_get_grad(np, ps, g1); // m_t = beta1*m_t-1 + (1 - beta1)*g_t - ggml_vec_scale_f32(nx, m, beta1); - ggml_vec_mad_f32 (nx, m, g1, 1.0f - beta1); + 
ggml_v2_vec_scale_f32(nx, m, beta1); + ggml_v2_vec_mad_f32 (nx, m, g1, 1.0f - beta1); // g2 = g1^2 - ggml_vec_sqr_f32 (nx, g2, g1); + ggml_v2_vec_sqr_f32 (nx, g2, g1); // v_t = beta2*v_t-1 + (1 - beta2)*g_t^2 - ggml_vec_scale_f32(nx, v, beta2); - ggml_vec_mad_f32 (nx, v, g2, 1.0f - beta2); + ggml_v2_vec_scale_f32(nx, v, beta2); + ggml_v2_vec_mad_f32 (nx, v, g2, 1.0f - beta2); // m^hat = m_t / (1 - beta1^t) // v^hat = v_t / (1 - beta2^t) // x_t = x_t-1 - alpha*m^hat/(sqrt(v^hat) + eps) - ggml_vec_cpy_f32 (nx, mh, m); - ggml_vec_cpy_f32 (nx, vh, v); + ggml_v2_vec_cpy_f32 (nx, mh, m); + ggml_v2_vec_cpy_f32 (nx, vh, v); - ggml_vec_scale_f32(nx, mh, alpha/(1.0f - powf(beta1, t + 1))); - ggml_vec_scale_f32(nx, vh, 1.0f/(1.0f - powf(beta2, t + 1))); + ggml_v2_vec_scale_f32(nx, mh, alpha/(1.0f - powf(beta1, t + 1))); + ggml_v2_vec_scale_f32(nx, vh, 1.0f/(1.0f - powf(beta2, t + 1))); - ggml_vec_sqrt_f32 (nx, vh, vh); - ggml_vec_acc1_f32 (nx, vh, eps); + ggml_v2_vec_sqrt_f32 (nx, vh, vh); + ggml_v2_vec_acc1_f32 (nx, vh, eps); - ggml_vec_div_f32 (nx, mh, mh, vh); - ggml_vec_sub_f32 (nx, x, x, mh); + ggml_v2_vec_div_f32 (nx, mh, mh, vh); + ggml_v2_vec_sub_f32 (nx, x, x, mh); // update the parameters - ggml_opt_set_params(np, ps, x); + ggml_v2_opt_set_params(np, ps, x); } - ggml_graph_reset (gf); - ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute(ctx, gb); + ggml_v2_graph_reset (gf); + ggml_v2_set_f32 (f->grad, 1.0f); + ggml_v2_graph_compute(ctx, gb); - const float fx = ggml_get_f32_1d(f, 0); + const float fx = ggml_v2_get_f32_1d(f, 0); // check convergence if (fabsf(fx - fx_prev)/fx < params.adam.eps_f) { - GGML_PRINT_DEBUG("converged\n"); + GGML_V2_PRINT_DEBUG("converged\n"); - return GGML_OPT_OK; + return GGML_V2_OPT_OK; } // delta-based convergence test @@ -14802,7 +14802,7 @@ static enum ggml_opt_result ggml_opt_adam( const float rate = (pf[t%params.past] - fx)/fx; if (fabsf(rate) < params.delta) { - return GGML_OPT_OK; + return GGML_V2_OPT_OK; } } @@ -14818,7 +14818,7 @@ static enum ggml_opt_result ggml_opt_adam( ++n_no_improvement; if (n_no_improvement >= params.max_no_improvement) { - return GGML_OPT_OK; + return GGML_V2_OPT_OK; } } } @@ -14826,17 +14826,17 @@ static enum ggml_opt_result ggml_opt_adam( fx_prev = fx; { - const int64_t t_end_cpu = ggml_cycles(); - GGML_PRINT_DEBUG("time iter: %5.3f s\n", ((float)(t_end_cpu - t_start_cpu))/CLOCKS_PER_SEC); + const int64_t t_end_cpu = ggml_v2_cycles(); + GGML_V2_PRINT_DEBUG("time iter: %5.3f s\n", ((float)(t_end_cpu - t_start_cpu))/CLOCKS_PER_SEC); UNUSED(t_end_cpu); - const int64_t t_end_wall = ggml_time_us(); - GGML_PRINT_DEBUG("wall time iter: %5.3f s\n", (t_end_wall - t_start_wall)/1e6); + const int64_t t_end_wall = ggml_v2_time_us(); + GGML_V2_PRINT_DEBUG("wall time iter: %5.3f s\n", (t_end_wall - t_start_wall)/1e6); UNUSED(t_end_wall); } } - return GGML_OPT_DID_NOT_CONVERGE; + return GGML_V2_OPT_DID_NOT_CONVERGE; } // @@ -14847,16 +14847,16 @@ static enum ggml_opt_result ggml_opt_adam( // https://github.com/chokkan/liblbfgs // -struct ggml_lbfgs_iteration_data { +struct ggml_v2_lbfgs_iteration_data { float alpha; float ys; float * s; float * y; }; -static enum ggml_opt_result linesearch_backtracking( - struct ggml_context * ctx, - const struct ggml_opt_params * params, +static enum ggml_v2_opt_result linesearch_backtracking( + struct ggml_v2_context * ctx, + const struct ggml_v2_opt_params * params, int nx, float * x, float * fx, @@ -14864,11 +14864,11 @@ static enum ggml_opt_result linesearch_backtracking( float * d, float * step, const 
float * xp, - struct ggml_tensor * f, - struct ggml_cgraph * gf, - struct ggml_cgraph * gb, + struct ggml_v2_tensor * f, + struct ggml_v2_cgraph * gf, + struct ggml_v2_cgraph * gb, const int np, - struct ggml_tensor * ps[]) { + struct ggml_v2_tensor * ps[]) { int count = 0; float width = 0.0f; @@ -14881,15 +14881,15 @@ static enum ggml_opt_result linesearch_backtracking( const float inc = 2.1f; if (*step <= 0.f) { - return GGML_LINESEARCH_INVALID_PARAMETERS; + return GGML_V2_LINESEARCH_INVALID_PARAMETERS; } // compute the initial gradient in the search direction - ggml_vec_dot_f32(nx, &dginit, g, d); + ggml_v2_vec_dot_f32(nx, &dginit, g, d); // make sure that d points to a descent direction if (0 < dginit) { - return GGML_LINESEARCH_FAIL; + return GGML_V2_LINESEARCH_FAIL; } // initialize local variables @@ -14897,20 +14897,20 @@ static enum ggml_opt_result linesearch_backtracking( dgtest = params->lbfgs.ftol*dginit; while (true) { - ggml_vec_cpy_f32(nx, x, xp); - ggml_vec_mad_f32(nx, x, d, *step); + ggml_v2_vec_cpy_f32(nx, x, xp); + ggml_v2_vec_mad_f32(nx, x, d, *step); // evaluate the function and gradient values { - ggml_opt_set_params(np, ps, x); + ggml_v2_opt_set_params(np, ps, x); - ggml_graph_reset (gf); - ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute(ctx, gb); + ggml_v2_graph_reset (gf); + ggml_v2_set_f32 (f->grad, 1.0f); + ggml_v2_graph_compute(ctx, gb); - ggml_opt_get_grad(np, ps, g); + ggml_v2_opt_get_grad(np, ps, g); - *fx = ggml_get_f32_1d(f, 0); + *fx = ggml_v2_get_f32_1d(f, 0); } ++count; @@ -14919,17 +14919,17 @@ static enum ggml_opt_result linesearch_backtracking( width = dec; } else { // Armijo condition is satisfied - if (params->lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_ARMIJO) { + if (params->lbfgs.linesearch == GGML_V2_LINESEARCH_BACKTRACKING_ARMIJO) { return count; } - ggml_vec_dot_f32(nx, &dg, g, d); + ggml_v2_vec_dot_f32(nx, &dg, g, d); // check the Wolfe condition if (dg < params->lbfgs.wolfe * dginit) { width = inc; } else { - if(params->lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_WOLFE) { + if(params->lbfgs.linesearch == GGML_V2_LINESEARCH_BACKTRACKING_WOLFE) { // regular Wolfe conditions return count; } @@ -14937,7 +14937,7 @@ static enum ggml_opt_result linesearch_backtracking( if(dg > -params->lbfgs.wolfe*dginit) { width = dec; } else { - // strong Wolfe condition (GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE) + // strong Wolfe condition (GGML_V2_LINESEARCH_BACKTRACKING_STRONG_WOLFE) return count; } return count; @@ -14945,31 +14945,31 @@ static enum ggml_opt_result linesearch_backtracking( } if (*step < params->lbfgs.min_step) { - return GGML_LINESEARCH_MINIMUM_STEP; + return GGML_V2_LINESEARCH_MINIMUM_STEP; } if (*step > params->lbfgs.max_step) { - return GGML_LINESEARCH_MAXIMUM_STEP; + return GGML_V2_LINESEARCH_MAXIMUM_STEP; } if (params->lbfgs.max_linesearch <= count) { - return GGML_LINESEARCH_MAXIMUM_ITERATIONS; + return GGML_V2_LINESEARCH_MAXIMUM_ITERATIONS; } (*step) *= width; } - return GGML_LINESEARCH_FAIL; + return GGML_V2_LINESEARCH_FAIL; } -static enum ggml_opt_result ggml_opt_lbfgs( - struct ggml_context * ctx, - struct ggml_opt_params params, - struct ggml_tensor * f, - struct ggml_cgraph * gf, - struct ggml_cgraph * gb) { - if (params.lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_WOLFE || - params.lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE) { +static enum ggml_v2_opt_result ggml_v2_opt_lbfgs( + struct ggml_v2_context * ctx, + struct ggml_v2_opt_params params, + struct ggml_v2_tensor * f, + struct 
ggml_v2_cgraph * gf, + struct ggml_v2_cgraph * gb) { + if (params.lbfgs.linesearch == GGML_V2_LINESEARCH_BACKTRACKING_WOLFE || + params.lbfgs.linesearch == GGML_V2_LINESEARCH_BACKTRACKING_STRONG_WOLFE) { if (params.lbfgs.wolfe <= params.lbfgs.ftol || 1.f <= params.lbfgs.wolfe) { - return GGML_OPT_INVALID_WOLFE; + return GGML_V2_OPT_INVALID_WOLFE; } } @@ -14979,28 +14979,28 @@ static enum ggml_opt_result ggml_opt_lbfgs( const int m = params.lbfgs.m; // these will store the parameters we want to optimize - struct ggml_tensor * ps[GGML_MAX_PARAMS]; + struct ggml_v2_tensor * ps[GGML_V2_MAX_PARAMS]; int np = 0; int nx = 0; for (int i = 0; i < gf->n_nodes; ++i) { if (gf->nodes[i]->is_param) { - GGML_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op); + GGML_V2_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op); - GGML_ASSERT(np < GGML_MAX_PARAMS); + GGML_V2_ASSERT(np < GGML_V2_MAX_PARAMS); ps[np++] = gf->nodes[i]; - nx += ggml_nelements(gf->nodes[i]); + nx += ggml_v2_nelements(gf->nodes[i]); } } - float * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // current parameters - float * xp = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // previous parameters - float * g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // current gradient - float * gp = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // previous gradient - float * d = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // search direction + float * x = ggml_v2_new_tensor_1d(ctx, GGML_V2_TYPE_F32, nx)->data; // current parameters + float * xp = ggml_v2_new_tensor_1d(ctx, GGML_V2_TYPE_F32, nx)->data; // previous parameters + float * g = ggml_v2_new_tensor_1d(ctx, GGML_V2_TYPE_F32, nx)->data; // current gradient + float * gp = ggml_v2_new_tensor_1d(ctx, GGML_V2_TYPE_F32, nx)->data; // previous gradient + float * d = ggml_v2_new_tensor_1d(ctx, GGML_V2_TYPE_F32, nx)->data; // search direction - float * pf = params.past > 0 ? ggml_new_tensor_1d(ctx, GGML_TYPE_F32, params.past)->data : NULL; // past function values + float * pf = params.past > 0 ? 
ggml_v2_new_tensor_1d(ctx, GGML_V2_TYPE_F32, params.past)->data : NULL; // past function values float fx = 0.0f; // cost function value float xnorm = 0.0f; // ||x|| @@ -15008,29 +15008,29 @@ static enum ggml_opt_result ggml_opt_lbfgs( float step = 0.0f; // initialize x from the graph nodes - ggml_opt_get_params(np, ps, x); + ggml_v2_opt_get_params(np, ps, x); // the L-BFGS memory - struct ggml_lbfgs_iteration_data * lm = alloca(sizeof(struct ggml_lbfgs_iteration_data)*m); + struct ggml_v2_lbfgs_iteration_data * lm = alloca(sizeof(struct ggml_v2_lbfgs_iteration_data)*m); for (int i = 0; i < m; ++i) { lm[i].alpha = 0.0f; lm[i].ys = 0.0f; - lm[i].s = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; - lm[i].y = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; + lm[i].s = ggml_v2_new_tensor_1d(ctx, GGML_V2_TYPE_F32, nx)->data; + lm[i].y = ggml_v2_new_tensor_1d(ctx, GGML_V2_TYPE_F32, nx)->data; } // evaluate the function value and its gradient { - ggml_opt_set_params(np, ps, x); + ggml_v2_opt_set_params(np, ps, x); - ggml_graph_reset (gf); - ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute(ctx, gb); + ggml_v2_graph_reset (gf); + ggml_v2_set_f32 (f->grad, 1.0f); + ggml_v2_graph_compute(ctx, gb); - ggml_opt_get_grad(np, ps, g); + ggml_v2_opt_get_grad(np, ps, g); - fx = ggml_get_f32_1d(f, 0); + fx = ggml_v2_get_f32_1d(f, 0); } if (pf) { @@ -15040,11 +15040,11 @@ static enum ggml_opt_result ggml_opt_lbfgs( float fx_best = fx; // search direction = -gradient - ggml_vec_neg_f32(nx, d, g); + ggml_v2_vec_neg_f32(nx, d, g); // ||x||, ||g|| - ggml_vec_norm_f32(nx, &xnorm, x); - ggml_vec_norm_f32(nx, &gnorm, g); + ggml_v2_vec_norm_f32(nx, &xnorm, x); + ggml_v2_vec_norm_f32(nx, &gnorm, g); if (xnorm < 1.0f) { xnorm = 1.0f; @@ -15052,11 +15052,11 @@ static enum ggml_opt_result ggml_opt_lbfgs( // already optimized if (gnorm/xnorm <= params.lbfgs.eps) { - return GGML_OPT_OK; + return GGML_V2_OPT_OK; } // initial step - ggml_vec_norm_inv_f32(nx, &step, d); + ggml_v2_vec_norm_inv_f32(nx, &step, d); int j = 0; int k = 1; @@ -15071,30 +15071,30 @@ static enum ggml_opt_result ggml_opt_lbfgs( while (true) { // store the current position and gradient vectors - ggml_vec_cpy_f32(nx, xp, x); - ggml_vec_cpy_f32(nx, gp, g); + ggml_v2_vec_cpy_f32(nx, xp, x); + ggml_v2_vec_cpy_f32(nx, gp, g); ls = linesearch_backtracking(ctx, ¶ms, nx, x, &fx, g, d, &step, xp, f, gf, gb, np, ps); if (ls < 0) { // linesearch failed - go back to the previous point and return - ggml_vec_cpy_f32(nx, x, xp); - ggml_vec_cpy_f32(nx, g, gp); + ggml_v2_vec_cpy_f32(nx, x, xp); + ggml_v2_vec_cpy_f32(nx, g, gp); return ls; } - ggml_vec_norm_f32(nx, &xnorm, x); - ggml_vec_norm_f32(nx, &gnorm, g); + ggml_v2_vec_norm_f32(nx, &xnorm, x); + ggml_v2_vec_norm_f32(nx, &gnorm, g); - GGML_PRINT_DEBUG("f = %10.6f\n", ggml_get_f32_1d(f, 0)); + GGML_V2_PRINT_DEBUG("f = %10.6f\n", ggml_v2_get_f32_1d(f, 0)); if (xnorm < 1.0f) { xnorm = 1.0f; } if (gnorm/xnorm <= params.lbfgs.eps) { // converged - return GGML_OPT_OK; + return GGML_V2_OPT_OK; } // delta-based convergence test @@ -15104,7 +15104,7 @@ static enum ggml_opt_result ggml_opt_lbfgs( const float rate = (pf[k%params.past] - fx)/fx; if (fabsf(rate) < params.delta) { - return GGML_OPT_OK; + return GGML_V2_OPT_OK; } } @@ -15120,29 +15120,29 @@ static enum ggml_opt_result ggml_opt_lbfgs( n_no_improvement++; if (n_no_improvement >= params.max_no_improvement) { - return GGML_OPT_OK; + return GGML_V2_OPT_OK; } } } if (params.lbfgs.n_iter != 0 && params.lbfgs.n_iter < k + 1) { // reached the maximum number of 
iterations - return GGML_OPT_DID_NOT_CONVERGE; + return GGML_V2_OPT_DID_NOT_CONVERGE; } // update vectors s and y: // s_{k+1} = x_{k+1} - x_{k} = \step * d_{k}. // y_{k+1} = g_{k+1} - g_{k}. // - ggml_vec_sub_f32(nx, lm[end].s, x, xp); - ggml_vec_sub_f32(nx, lm[end].y, g, gp); + ggml_v2_vec_sub_f32(nx, lm[end].s, x, xp); + ggml_v2_vec_sub_f32(nx, lm[end].y, g, gp); // compute scalars ys and yy: // ys = y^t \cdot s -> 1 / \rho. // yy = y^t \cdot y. // - ggml_vec_dot_f32(nx, &ys, lm[end].y, lm[end].s); - ggml_vec_dot_f32(nx, &yy, lm[end].y, lm[end].y); + ggml_v2_vec_dot_f32(nx, &ys, lm[end].y, lm[end].s); + ggml_v2_vec_dot_f32(nx, &yy, lm[end].y, lm[end].y); lm[end].ys = ys; @@ -15154,43 +15154,43 @@ static enum ggml_opt_result ggml_opt_lbfgs( end = (end + 1)%m; // initialize search direction with -g - ggml_vec_neg_f32(nx, d, g); + ggml_v2_vec_neg_f32(nx, d, g); j = end; for (int i = 0; i < bound; ++i) { j = (j + m - 1) % m; // \alpha_{j} = \rho_{j} s^{t}_{j} \cdot q_{k+1} - ggml_vec_dot_f32(nx, &lm[j].alpha, lm[j].s, d); + ggml_v2_vec_dot_f32(nx, &lm[j].alpha, lm[j].s, d); lm[j].alpha /= lm[j].ys; // q_{i} = q_{i+1} - \alpha_{i} y_{i} - ggml_vec_mad_f32(nx, d, lm[j].y, -lm[j].alpha); + ggml_v2_vec_mad_f32(nx, d, lm[j].y, -lm[j].alpha); } - ggml_vec_scale_f32(nx, d, ys/yy); + ggml_v2_vec_scale_f32(nx, d, ys/yy); for (int i = 0; i < bound; ++i) { // \beta_{j} = \rho_{j} y^t_{j} \cdot \gamma_{i} - ggml_vec_dot_f32(nx, &beta, lm[j].y, d); + ggml_v2_vec_dot_f32(nx, &beta, lm[j].y, d); beta /= lm[j].ys; // \gamma_{i+1} = \gamma_{i} + (\alpha_{j} - \beta_{j}) s_{j} - ggml_vec_mad_f32(nx, d, lm[j].s, lm[j].alpha - beta); + ggml_v2_vec_mad_f32(nx, d, lm[j].s, lm[j].alpha - beta); j = (j + 1)%m; } step = 1.0; } - return GGML_OPT_DID_NOT_CONVERGE; + return GGML_V2_OPT_DID_NOT_CONVERGE; } -struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) { - struct ggml_opt_params result; +struct ggml_v2_opt_params ggml_v2_opt_default_params(enum ggml_v2_opt_type type) { + struct ggml_v2_opt_params result; switch (type) { - case GGML_OPT_ADAM: + case GGML_V2_OPT_ADAM: { - result = (struct ggml_opt_params) { - .type = GGML_OPT_ADAM, + result = (struct ggml_v2_opt_params) { + .type = GGML_V2_OPT_ADAM, .n_threads = 1, .past = 0, .delta = 1e-5f, @@ -15211,10 +15211,10 @@ struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) { }, }; } break; - case GGML_OPT_LBFGS: + case GGML_V2_OPT_LBFGS: { - result = (struct ggml_opt_params) { - .type = GGML_OPT_LBFGS, + result = (struct ggml_v2_opt_params) { + .type = GGML_V2_OPT_LBFGS, .n_threads = 1, .past = 0, .delta = 1e-5f, @@ -15235,7 +15235,7 @@ struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) { .min_step = 1e-20f, .max_step = 1e+20f, - .linesearch = GGML_LINESEARCH_DEFAULT, + .linesearch = GGML_V2_LINESEARCH_DEFAULT, }, }; } break; @@ -15244,55 +15244,55 @@ struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) { return result; } -enum ggml_opt_result ggml_opt( - struct ggml_context * ctx, - struct ggml_opt_params params, - struct ggml_tensor * f) { +enum ggml_v2_opt_result ggml_v2_opt( + struct ggml_v2_context * ctx, + struct ggml_v2_opt_params params, + struct ggml_v2_tensor * f) { bool free_ctx = false; if (ctx == NULL) { - struct ggml_init_params params_ctx = { + struct ggml_v2_init_params params_ctx = { .mem_size = 16*1024*1024, .mem_buffer = NULL, .no_alloc = false, }; - ctx = ggml_init(params_ctx); + ctx = ggml_v2_init(params_ctx); if (ctx == NULL) { - return GGML_OPT_NO_CONTEXT; + return 
GGML_V2_OPT_NO_CONTEXT; } free_ctx = true; } - enum ggml_opt_result result = GGML_OPT_OK; + enum ggml_v2_opt_result result = GGML_V2_OPT_OK; // build forward + backward compute graphs - struct ggml_cgraph gf = ggml_build_forward (f); - struct ggml_cgraph gb = ggml_build_backward(ctx, &gf, true); + struct ggml_v2_cgraph gf = ggml_v2_build_forward (f); + struct ggml_v2_cgraph gb = ggml_v2_build_backward(ctx, &gf, true); switch (params.type) { - case GGML_OPT_ADAM: + case GGML_V2_OPT_ADAM: { - result = ggml_opt_adam(ctx, params, f, &gf, &gb); + result = ggml_v2_opt_adam(ctx, params, f, &gf, &gb); } break; - case GGML_OPT_LBFGS: + case GGML_V2_OPT_LBFGS: { - result = ggml_opt_lbfgs(ctx, params, f, &gf, &gb); + result = ggml_v2_opt_lbfgs(ctx, params, f, &gf, &gb); } break; } if (params.print_forward_graph) { - ggml_graph_print (&gf); - ggml_graph_dump_dot(&gf, NULL, "opt-forward.dot"); + ggml_v2_graph_print (&gf); + ggml_v2_graph_dump_dot(&gf, NULL, "opt-forward.dot"); } if (params.print_backward_graph) { - ggml_graph_print (&gb); - ggml_graph_dump_dot(&gb, &gf, "opt-backward.dot"); + ggml_v2_graph_print (&gb); + ggml_v2_graph_dump_dot(&gb, &gf, "opt-backward.dot"); } if (free_ctx) { - ggml_free(ctx); + ggml_v2_free(ctx); } return result; @@ -15300,7 +15300,7 @@ enum ggml_opt_result ggml_opt( //////////////////////////////////////////////////////////////////////////////// -size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist) { +size_t ggml_v2_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist) { assert(k % QK4_0 == 0); const int nb = k / QK4_0; @@ -15323,7 +15323,7 @@ size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * return (n/QK4_0*sizeof(block_q4_0)); } -size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist) { +size_t ggml_v2_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist) { assert(k % QK4_1 == 0); const int nb = k / QK4_1; @@ -15346,7 +15346,7 @@ size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * return (n/QK4_1*sizeof(block_q4_1)); } -size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist) { +size_t ggml_v2_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist) { assert(k % QK5_0 == 0); const int nb = k / QK5_0; @@ -15376,7 +15376,7 @@ size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * return (n/QK5_0*sizeof(block_q5_0)); } -size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist) { +size_t ggml_v2_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist) { assert(k % QK5_1 == 0); const int nb = k / QK5_1; @@ -15406,7 +15406,7 @@ size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * return (n/QK5_1*sizeof(block_q5_1)); } -size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist) { +size_t ggml_v2_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist) { assert(k % QK8_0 == 0); const int nb = k / QK8_0; @@ -15427,38 +15427,38 @@ size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * return (n/QK8_0*sizeof(block_q8_0)); } -size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist) { +size_t ggml_v2_quantize_chunk(enum ggml_v2_type type, const float * src, void * dst, int start, int n, int64_t * hist) { size_t result = 0; switch (type) { - 
case GGML_TYPE_Q4_0: + case GGML_V2_TYPE_Q4_0: { - GGML_ASSERT(start % QK4_0 == 0); + GGML_V2_ASSERT(start % QK4_0 == 0); block_q4_0 * block = (block_q4_0*)dst + start / QK4_0; - result = ggml_quantize_q4_0(src + start, block, n, n, hist); + result = ggml_v2_quantize_q4_0(src + start, block, n, n, hist); } break; - case GGML_TYPE_Q4_1: + case GGML_V2_TYPE_Q4_1: { - GGML_ASSERT(start % QK4_1 == 0); + GGML_V2_ASSERT(start % QK4_1 == 0); block_q4_1 * block = (block_q4_1*)dst + start / QK4_1; - result = ggml_quantize_q4_1(src + start, block, n, n, hist); + result = ggml_v2_quantize_q4_1(src + start, block, n, n, hist); } break; - case GGML_TYPE_Q5_0: + case GGML_V2_TYPE_Q5_0: { - GGML_ASSERT(start % QK5_0 == 0); + GGML_V2_ASSERT(start % QK5_0 == 0); block_q5_0 * block = (block_q5_0*)dst + start / QK5_0; - result = ggml_quantize_q5_0(src + start, block, n, n, hist); + result = ggml_v2_quantize_q5_0(src + start, block, n, n, hist); } break; - case GGML_TYPE_Q5_1: + case GGML_V2_TYPE_Q5_1: { - GGML_ASSERT(start % QK5_1 == 0); + GGML_V2_ASSERT(start % QK5_1 == 0); block_q5_1 * block = (block_q5_1*)dst + start / QK5_1; - result = ggml_quantize_q5_1(src + start, block, n, n, hist); + result = ggml_v2_quantize_q5_1(src + start, block, n, n, hist); } break; - case GGML_TYPE_Q8_0: + case GGML_V2_TYPE_Q8_0: { - GGML_ASSERT(start % QK8_0 == 0); + GGML_V2_ASSERT(start % QK8_0 == 0); block_q8_0 * block = (block_q8_0*)dst + start / QK8_0; - result = ggml_quantize_q8_0(src + start, block, n, n, hist); + result = ggml_v2_quantize_q8_0(src + start, block, n, n, hist); } break; default: assert(false); @@ -15468,7 +15468,7 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i //////////////////////////////////////////////////////////////////////////////// -int ggml_cpu_has_avx(void) { +int ggml_v2_cpu_has_avx(void) { #if defined(__AVX__) return 1; #else @@ -15476,7 +15476,7 @@ int ggml_cpu_has_avx(void) { #endif } -int ggml_cpu_has_avx2(void) { +int ggml_v2_cpu_has_avx2(void) { #if defined(__AVX2__) return 1; #else @@ -15484,7 +15484,7 @@ int ggml_cpu_has_avx2(void) { #endif } -int ggml_cpu_has_avx512(void) { +int ggml_v2_cpu_has_avx512(void) { #if defined(__AVX512F__) return 1; #else @@ -15492,7 +15492,7 @@ int ggml_cpu_has_avx512(void) { #endif } -int ggml_cpu_has_avx512_vbmi(void) { +int ggml_v2_cpu_has_avx512_vbmi(void) { #if defined(__AVX512VBMI__) return 1; #else @@ -15500,7 +15500,7 @@ int ggml_cpu_has_avx512_vbmi(void) { #endif } -int ggml_cpu_has_avx512_vnni(void) { +int ggml_v2_cpu_has_avx512_vnni(void) { #if defined(__AVX512VNNI__) return 1; #else @@ -15508,7 +15508,7 @@ int ggml_cpu_has_avx512_vnni(void) { #endif } -int ggml_cpu_has_fma(void) { +int ggml_v2_cpu_has_fma(void) { #if defined(__FMA__) return 1; #else @@ -15516,7 +15516,7 @@ int ggml_cpu_has_fma(void) { #endif } -int ggml_cpu_has_neon(void) { +int ggml_v2_cpu_has_neon(void) { #if defined(__ARM_NEON) return 1; #else @@ -15524,7 +15524,7 @@ int ggml_cpu_has_neon(void) { #endif } -int ggml_cpu_has_arm_fma(void) { +int ggml_v2_cpu_has_arm_fma(void) { #if defined(__ARM_FEATURE_FMA) return 1; #else @@ -15532,7 +15532,7 @@ int ggml_cpu_has_arm_fma(void) { #endif } -int ggml_cpu_has_f16c(void) { +int ggml_v2_cpu_has_f16c(void) { #if defined(__F16C__) return 1; #else @@ -15540,7 +15540,7 @@ int ggml_cpu_has_f16c(void) { #endif } -int ggml_cpu_has_fp16_va(void) { +int ggml_v2_cpu_has_fp16_va(void) { #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) return 1; #else @@ -15548,7 +15548,7 @@ int ggml_cpu_has_fp16_va(void) 
{ #endif } -int ggml_cpu_has_wasm_simd(void) { +int ggml_v2_cpu_has_wasm_simd(void) { #if defined(__wasm_simd128__) return 1; #else @@ -15556,7 +15556,7 @@ int ggml_cpu_has_wasm_simd(void) { #endif } -int ggml_cpu_has_blas(void) { +int ggml_v2_cpu_has_blas(void) { #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) return 1; #else @@ -15564,7 +15564,7 @@ int ggml_cpu_has_blas(void) { #endif } -int ggml_cpu_has_cublas(void) { +int ggml_v2_cpu_has_cublas(void) { #if defined(GGML_USE_CUBLAS) return 1; #else @@ -15572,7 +15572,7 @@ int ggml_cpu_has_cublas(void) { #endif } -int ggml_cpu_has_clblast(void) { +int ggml_v2_cpu_has_clblast(void) { #if defined(GGML_USE_CLBLAST) return 1; #else @@ -15580,11 +15580,11 @@ int ggml_cpu_has_clblast(void) { #endif } -int ggml_cpu_has_gpublas(void) { - return ggml_cpu_has_cublas() || ggml_cpu_has_clblast(); +int ggml_v2_cpu_has_gpublas(void) { + return ggml_v2_cpu_has_cublas() || ggml_v2_cpu_has_clblast(); } -int ggml_cpu_has_sse3(void) { +int ggml_v2_cpu_has_sse3(void) { #if defined(__SSE3__) return 1; #else @@ -15592,7 +15592,7 @@ int ggml_cpu_has_sse3(void) { #endif } -int ggml_cpu_has_vsx(void) { +int ggml_v2_cpu_has_vsx(void) { #if defined(__POWER9_VECTOR__) return 1; #else @@ -16277,7 +16277,7 @@ static void quantize_row_q4_2_reference_v2(const float * restrict x, block_q4_2 const float id = d ? 1.0f/d : 0.0f; - y[i].d = GGML_FP32_TO_FP16(d); + y[i].d = GGML_V2_FP32_TO_FP16(d); for (int l = 0; l < QK4_2; l += 2) { const float v0 = x[i*QK4_2 + l + 0]*id; @@ -16319,8 +16319,8 @@ static void quantize_row_q4_3_reference_v2(const float * restrict x, block_q4_3 const float d = (max - min) / ((1 << 4) - 1); const float id = d ? 1.0f/d : 0.0f; - y[i].d = GGML_FP32_TO_FP16(d); - y[i].m = GGML_FP32_TO_FP16(min); + y[i].d = GGML_V2_FP32_TO_FP16(d); + y[i].m = GGML_V2_FP32_TO_FP16(min); for (int l = 0; l < QK4_3; l += 2) { const float v0 = (x[i*QK4_3 + l + 0] - min)*id; @@ -16364,7 +16364,7 @@ static void quantize_row_q5_0_reference_v2(const float * restrict x, block_q5_0 const float d = max / -16; const float id = d ? 1.0f/d : 0.0f; - y[i].d = GGML_FP32_TO_FP16(d); + y[i].d = GGML_V2_FP32_TO_FP16(d); uint32_t qh = 0; @@ -16411,8 +16411,8 @@ static void quantize_row_q5_1_reference_v2(const float * restrict x, block_q5_1 const float d = (max - min) / ((1 << 5) - 1); const float id = d ? 
1.0f/d : 0.0f; - y[i].d = GGML_FP32_TO_FP16(d); - y[i].m = GGML_FP32_TO_FP16(min); + y[i].d = GGML_V2_FP32_TO_FP16(d); + y[i].m = GGML_V2_FP32_TO_FP16(min); uint32_t qh = 0; @@ -17026,7 +17026,7 @@ static void dequantize_row_q4_2_v2(const void * restrict vx, float * restrict y, const block_q4_2 * restrict x = vx; for (int i = 0; i < nb; i++) { - const float d = GGML_FP16_TO_FP32(x[i].d); + const float d = GGML_V2_FP16_TO_FP32(x[i].d); const uint8_t * restrict pp = x[i].qs; @@ -17055,8 +17055,8 @@ static void dequantize_row_q4_3_v2(const void * restrict vx, float * restrict y, const block_q4_3 * restrict x = vx; for (int i = 0; i < nb; i++) { - const float d = GGML_FP16_TO_FP32(x[i].d); - const float m = GGML_FP16_TO_FP32(x[i].m); + const float d = GGML_V2_FP16_TO_FP32(x[i].d); + const float m = GGML_V2_FP16_TO_FP32(x[i].m); const uint8_t * restrict pp = x[i].qs; @@ -17085,7 +17085,7 @@ static void dequantize_row_q5_0_v2(const void * restrict vx, float * restrict y, const block_q5_0 * restrict x = vx; for (int i = 0; i < nb; i++) { - const float d = GGML_FP16_TO_FP32(x[i].d); + const float d = GGML_V2_FP16_TO_FP32(x[i].d); const uint8_t * restrict pp = x[i].qs; @@ -17121,8 +17121,8 @@ static void dequantize_row_q5_1_v2(const void * restrict vx, float * restrict y, const block_q5_1 * restrict x = vx; for (int i = 0; i < nb; i++) { - const float d = GGML_FP16_TO_FP32(x[i].d); - const float m = GGML_FP16_TO_FP32(x[i].m); + const float d = GGML_V2_FP16_TO_FP32(x[i].d); + const float m = GGML_V2_FP16_TO_FP32(x[i].m); const uint8_t * restrict pp = x[i].qs; @@ -17168,13 +17168,13 @@ static void dequantize_row_q8_0_v2(const void * restrict vx, float * restrict y, } } -static void ggml_vec_dot_q4_0_q8_0_v2(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); -static void ggml_vec_dot_q4_1_q8_1_v2(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); -static void ggml_vec_dot_q4_2_q8_0_v2(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); -static void ggml_vec_dot_q4_3_q8_1_v2(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); -static void ggml_vec_dot_q5_0_q8_0_v2(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); -static void ggml_vec_dot_q5_1_q8_1_v2(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); -static void ggml_vec_dot_q8_0_q8_0_v2(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); +static void ggml_v2_vec_dot_q4_0_q8_0_v2(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); +static void ggml_v2_vec_dot_q4_1_q8_1_v2(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); +static void ggml_v2_vec_dot_q4_2_q8_0_v2(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); +static void ggml_v2_vec_dot_q4_3_q8_1_v2(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); +static void ggml_v2_vec_dot_q5_0_q8_0_v2(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); +static void ggml_v2_vec_dot_q5_1_q8_1_v2(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); +static void ggml_v2_vec_dot_q8_0_q8_0_v2(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); inline void SetQuantsUnshuffled(bool unshuffle) { @@ -17186,75 +17186,75 @@ inline bool 
GetQuantsUnshuffled() } //TODO: integrate backwards compat -static const quantize_fns_t quantize_fns_v2[GGML_TYPE_COUNT] = { - [GGML_TYPE_Q4_0] = { +static const quantize_fns_t quantize_fns_v2[GGML_V2_TYPE_COUNT] = { + [GGML_V2_TYPE_Q4_0] = { .dequantize_row_q = dequantize_row_q4_0_v2, .quantize_row_q = quantize_row_q4_0_v2, .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_0_reference_v2, .quantize_row_q_dot = quantize_row_q8_0_v2, - .vec_dot_q = ggml_vec_dot_q4_0_q8_0_v2, - .vec_dot_type = GGML_TYPE_Q8_0, + .vec_dot_q = ggml_v2_vec_dot_q4_0_q8_0_v2, + .vec_dot_type = GGML_V2_TYPE_Q8_0, }, - [GGML_TYPE_Q4_1] = { + [GGML_V2_TYPE_Q4_1] = { .dequantize_row_q = dequantize_row_q4_1_v2, .quantize_row_q = quantize_row_q4_1_v2, .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_1_reference_v2, .quantize_row_q_dot = quantize_row_q8_1_v2, - .vec_dot_q = ggml_vec_dot_q4_1_q8_1_v2, - .vec_dot_type = GGML_TYPE_Q8_1B, + .vec_dot_q = ggml_v2_vec_dot_q4_1_q8_1_v2, + .vec_dot_type = GGML_V2_TYPE_Q8_1B, }, - [GGML_TYPE_Q4_2] = { + [GGML_V2_TYPE_Q4_2] = { .dequantize_row_q = dequantize_row_q4_2_v2, .quantize_row_q = quantize_row_q4_2_v2, .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_2_reference_v2, .quantize_row_q_dot = quantize_row_q8_0_v2, - .vec_dot_q = ggml_vec_dot_q4_2_q8_0_v2, - .vec_dot_type = GGML_TYPE_Q8_0, + .vec_dot_q = ggml_v2_vec_dot_q4_2_q8_0_v2, + .vec_dot_type = GGML_V2_TYPE_Q8_0, }, - [GGML_TYPE_Q4_3] = { + [GGML_V2_TYPE_Q4_3] = { .dequantize_row_q = dequantize_row_q4_3_v2, .quantize_row_q = quantize_row_q4_3_v2, .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_3_reference_v2, .quantize_row_q_dot = quantize_row_q8_1_v2, - .vec_dot_q = ggml_vec_dot_q4_3_q8_1_v2, - .vec_dot_type = GGML_TYPE_Q8_1B, + .vec_dot_q = ggml_v2_vec_dot_q4_3_q8_1_v2, + .vec_dot_type = GGML_V2_TYPE_Q8_1B, }, - [GGML_TYPE_Q5_0] = { + [GGML_V2_TYPE_Q5_0] = { .dequantize_row_q = dequantize_row_q5_0_v2, .quantize_row_q = quantize_row_q5_0_v2, .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q5_0_reference_v2, .quantize_row_q_dot = quantize_row_q8_0_v2, - .vec_dot_q = ggml_vec_dot_q5_0_q8_0_v2, - .vec_dot_type = GGML_TYPE_Q8_0, + .vec_dot_q = ggml_v2_vec_dot_q5_0_q8_0_v2, + .vec_dot_type = GGML_V2_TYPE_Q8_0, }, - [GGML_TYPE_Q5_1] = { + [GGML_V2_TYPE_Q5_1] = { .dequantize_row_q = dequantize_row_q5_1_v2, .quantize_row_q = quantize_row_q5_1_v2, .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q5_1_reference_v2, .quantize_row_q_dot = quantize_row_q8_1_v2, - .vec_dot_q = ggml_vec_dot_q5_1_q8_1_v2, - .vec_dot_type = GGML_TYPE_Q8_1B, + .vec_dot_q = ggml_v2_vec_dot_q5_1_q8_1_v2, + .vec_dot_type = GGML_V2_TYPE_Q8_1B, }, - [GGML_TYPE_Q8_0] = { + [GGML_V2_TYPE_Q8_0] = { .dequantize_row_q = dequantize_row_q8_0_v2, .quantize_row_q = quantize_row_q8_0_v2, .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q8_0_reference_v2, .quantize_row_q_dot = quantize_row_q8_0_v2, - .vec_dot_q = ggml_vec_dot_q8_0_q8_0_v2, - .vec_dot_type = GGML_TYPE_Q8_0, + .vec_dot_q = ggml_v2_vec_dot_q8_0_q8_0_v2, + .vec_dot_type = GGML_V2_TYPE_Q8_0, }, - [GGML_TYPE_Q8_1B] = { + [GGML_V2_TYPE_Q8_1B] = { .dequantize_row_q = NULL, // TODO .quantize_row_q = quantize_row_q8_1_v2, .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q8_1_reference_v2, .quantize_row_q_dot = quantize_row_q8_1_v2, .vec_dot_q = NULL, // TODO - .vec_dot_type = GGML_TYPE_Q8_1B, + .vec_dot_type = GGML_V2_TYPE_Q8_1B, }, }; -static void ggml_vec_dot_q4_0_q8_0_v2(const int n, float * restrict s, const 
void * restrict vx, const void * restrict vy) { +static void ggml_v2_vec_dot_q4_0_q8_0_v2(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { const int nb = n / QK8_0; assert(n % QK8_0 == 0); @@ -17423,7 +17423,7 @@ static void ggml_vec_dot_q4_0_q8_0_v2(const int n, float * restrict s, const voi #endif } -static void ggml_vec_dot_q4_1_q8_1_v2(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +static void ggml_v2_vec_dot_q4_1_q8_1_v2(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { const int nb = n / QK8_1; assert(n % QK8_1 == 0); @@ -17557,7 +17557,7 @@ static void ggml_vec_dot_q4_1_q8_1_v2(const int n, float * restrict s, const voi #endif } -static void ggml_vec_dot_q4_2_q8_0_v2(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +static void ggml_v2_vec_dot_q4_2_q8_0_v2(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { const int nb = n / QK8_0; assert(n % QK8_0 == 0); @@ -17612,12 +17612,12 @@ static void ggml_vec_dot_q4_2_q8_0_v2(const int n, float * restrict s, const voi #if defined(__ARM_FEATURE_DOTPROD) sumv0 = vmlaq_n_f32(sumv0, vaddq_f32( - vmulq_n_f32(vcvtq_f32_s32(vdotq_s32(vdupq_n_s32(0), v0_0lz, v1_0l)), GGML_FP16_TO_FP32(x0_0->d)), - vmulq_n_f32(vcvtq_f32_s32(vdotq_s32(vdupq_n_s32(0), v0_0hz, v1_0h)), GGML_FP16_TO_FP32(x0_1->d))), y0->d); + vmulq_n_f32(vcvtq_f32_s32(vdotq_s32(vdupq_n_s32(0), v0_0lz, v1_0l)), GGML_V2_FP16_TO_FP32(x0_0->d)), + vmulq_n_f32(vcvtq_f32_s32(vdotq_s32(vdupq_n_s32(0), v0_0hz, v1_0h)), GGML_V2_FP16_TO_FP32(x0_1->d))), y0->d); sumv1 = vmlaq_n_f32(sumv1, vaddq_f32( - vmulq_n_f32(vcvtq_f32_s32(vdotq_s32(vdupq_n_s32(0), v0_1lz, v1_1l)), GGML_FP16_TO_FP32(x1_0->d)), - vmulq_n_f32(vcvtq_f32_s32(vdotq_s32(vdupq_n_s32(0), v0_1hz, v1_1h)), GGML_FP16_TO_FP32(x1_1->d))), y1->d); + vmulq_n_f32(vcvtq_f32_s32(vdotq_s32(vdupq_n_s32(0), v0_1lz, v1_1l)), GGML_V2_FP16_TO_FP32(x1_0->d)), + vmulq_n_f32(vcvtq_f32_s32(vdotq_s32(vdupq_n_s32(0), v0_1hz, v1_1h)), GGML_V2_FP16_TO_FP32(x1_1->d))), y1->d); #else const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lz), vget_low_s8 (v1_0l)); const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lz), vget_high_s8(v1_0l)); @@ -17635,12 +17635,12 @@ static void ggml_vec_dot_q4_2_q8_0_v2(const int n, float * restrict s, const voi const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h)); sumv0 = vmlaq_n_f32(sumv0, vaddq_f32( - vmulq_n_f32(vcvtq_f32_s32(pl0), GGML_FP16_TO_FP32(x0_0->d)), - vmulq_n_f32(vcvtq_f32_s32(ph0), GGML_FP16_TO_FP32(x0_1->d))), y0->d); + vmulq_n_f32(vcvtq_f32_s32(pl0), GGML_V2_FP16_TO_FP32(x0_0->d)), + vmulq_n_f32(vcvtq_f32_s32(ph0), GGML_V2_FP16_TO_FP32(x0_1->d))), y0->d); sumv1 = vmlaq_n_f32(sumv1, vaddq_f32( - vmulq_n_f32(vcvtq_f32_s32(pl1), GGML_FP16_TO_FP32(x1_0->d)), - vmulq_n_f32(vcvtq_f32_s32(ph1), GGML_FP16_TO_FP32(x1_1->d))), y1->d); + vmulq_n_f32(vcvtq_f32_s32(pl1), GGML_V2_FP16_TO_FP32(x1_0->d)), + vmulq_n_f32(vcvtq_f32_s32(ph1), GGML_V2_FP16_TO_FP32(x1_1->d))), y1->d); #endif } @@ -17652,8 +17652,8 @@ static void ggml_vec_dot_q4_2_q8_0_v2(const int n, float * restrict s, const voi // Main loop for (int i = 0; i < nb; i++) { /* Compute combined scale for the block */ - const __m128 d0 = _mm_set1_ps(GGML_FP16_TO_FP32(x[2*i + 0].d)); - const __m128 d1 = _mm_set1_ps(GGML_FP16_TO_FP32(x[2*i + 1].d)); + const __m128 d0 = _mm_set1_ps(GGML_V2_FP16_TO_FP32(x[2*i + 0].d)); + const __m128 d1 = 
_mm_set1_ps(GGML_V2_FP16_TO_FP32(x[2*i + 1].d)); const __m256 d = _mm256_mul_ps(_mm256_set_m128(d1, d0), _mm256_broadcast_ss(&y[i].d)); __m128i bx0 = bytes_from_nibbles_16(x[2*i + 0].qs); @@ -17681,8 +17681,8 @@ static void ggml_vec_dot_q4_2_q8_0_v2(const int n, float * restrict s, const voi const uint8_t * restrict x1 = x[2*i + 1].qs; const int8_t * restrict y0 = y[i].qs; - const float d0 = GGML_FP16_TO_FP32(x[2*i + 0].d); - const float d1 = GGML_FP16_TO_FP32(x[2*i + 1].d); + const float d0 = GGML_V2_FP16_TO_FP32(x[2*i + 0].d); + const float d1 = GGML_V2_FP16_TO_FP32(x[2*i + 1].d); int sumi_0 = 0; int sumi_1 = 0; @@ -17714,7 +17714,7 @@ static void ggml_vec_dot_q4_2_q8_0_v2(const int n, float * restrict s, const voi #endif } -static void ggml_vec_dot_q4_3_q8_1_v2(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +static void ggml_v2_vec_dot_q4_3_q8_1_v2(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { const int nb = n / QK8_1; assert(n % QK8_1 == 0); @@ -17737,8 +17737,8 @@ static void ggml_vec_dot_q4_3_q8_1_v2(const int n, float * restrict s, const voi const block_q8_1_v2 * restrict y0 = &y[i + 0]; - summs0 += GGML_FP16_TO_FP32(x0_0->m) * y0->s0; - summs1 += GGML_FP16_TO_FP32(x0_1->m) * y0->s1; + summs0 += GGML_V2_FP16_TO_FP32(x0_0->m) * y0->s0; + summs1 += GGML_V2_FP16_TO_FP32(x0_1->m) * y0->s1; const uint8x16_t v0_0 = vcombine_u8(vld1_u8(x0_0->qs), vld1_u8(x0_1->qs)); @@ -17754,8 +17754,8 @@ static void ggml_vec_dot_q4_3_q8_1_v2(const int n, float * restrict s, const voi const int8x16_t v1_0l = vld1q_s8(y0->qs); const int8x16_t v1_0h = vld1q_s8(y0->qs + 16); - const float x0_0d = GGML_FP16_TO_FP32(x0_0->d); - const float x0_1d = GGML_FP16_TO_FP32(x0_1->d); + const float x0_0d = GGML_V2_FP16_TO_FP32(x0_0->d); + const float x0_1d = GGML_V2_FP16_TO_FP32(x0_1->d); #if defined(__ARM_FEATURE_DOTPROD) sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vdotq_s32(vdupq_n_s32(0), v0_0lz, v1_0l)), x0_0d*y0->d); @@ -17782,12 +17782,12 @@ static void ggml_vec_dot_q4_3_q8_1_v2(const int n, float * restrict s, const voi // Main loop for (int i = 0; i < nb; i++) { - const __m128 d0 = _mm_set1_ps(GGML_FP16_TO_FP32(x[2*i + 0].d)); - const __m128 d1 = _mm_set1_ps(GGML_FP16_TO_FP32(x[2*i + 1].d)); + const __m128 d0 = _mm_set1_ps(GGML_V2_FP16_TO_FP32(x[2*i + 0].d)); + const __m128 d1 = _mm_set1_ps(GGML_V2_FP16_TO_FP32(x[2*i + 1].d)); const __m256 dx = _mm256_set_m128(d1, d0); - summs += GGML_FP16_TO_FP32(x[2*i + 0].m) * y[i].s0 - + GGML_FP16_TO_FP32(x[2*i + 1].m) * y[i].s1; + summs += GGML_V2_FP16_TO_FP32(x[2*i + 0].m) * y[i].s0 + + GGML_V2_FP16_TO_FP32(x[2*i + 1].m) * y[i].s1; const __m128i bx0 = bytes_from_nibbles_16(x[2*i + 0].qs); const __m128i bx1 = bytes_from_nibbles_16(x[2*i + 1].qs); @@ -17810,10 +17810,10 @@ static void ggml_vec_dot_q4_3_q8_1_v2(const int n, float * restrict s, const voi const uint8_t * restrict x1 = x[2*i + 1].qs; const int8_t * restrict y0 = y[i].qs; - const float d0 = GGML_FP16_TO_FP32(x[2*i + 0].d); - const float m0 = GGML_FP16_TO_FP32(x[2*i + 0].m); - const float d1 = GGML_FP16_TO_FP32(x[2*i + 1].d); - const float m1 = GGML_FP16_TO_FP32(x[2*i + 1].m); + const float d0 = GGML_V2_FP16_TO_FP32(x[2*i + 0].d); + const float m0 = GGML_V2_FP16_TO_FP32(x[2*i + 0].m); + const float d1 = GGML_V2_FP16_TO_FP32(x[2*i + 1].d); + const float m1 = GGML_V2_FP16_TO_FP32(x[2*i + 1].m); int sxy_0 = 0; int sxy_1 = 0; @@ -17844,7 +17844,7 @@ static void ggml_vec_dot_q4_3_q8_1_v2(const int n, float * restrict s, const voi #endif } 
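Whether the SIMD path (NEON/AVX/WASM) or the scalar fallback is taken, each of these *_v2 dot kernels follows the same blockwise pattern: accumulate an integer dot product over one quantized block, then scale the result by the per-block scale(s), which for q4_2/q4_3/q5_x are stored as FP16 and recovered with GGML_V2_FP16_TO_FP32. The sketch below shows that pattern in plain scalar C; the simplified my_block_q4 / my_block_q8 layouts and the my_vec_dot_q4_q8 name are illustrative assumptions for this sketch only, not the actual block definitions in this file.

#include <stdint.h>

#define MY_QK 16

// Illustrative, simplified block layouts (assumptions for this sketch only --
// NOT the real block_q4_2 / block_q8_0 structs used by the kernels above,
// which store their scale as ggml_v2_fp16_t and convert it via GGML_V2_FP16_TO_FP32).
typedef struct { float d; uint8_t qs[MY_QK/2]; } my_block_q4; // 4-bit values, two per byte
typedef struct { float d; int8_t  qs[MY_QK];   } my_block_q8; // 8-bit values

// Scalar blockwise dot product: integer multiply-accumulate within each block,
// then a single float multiply by the combined per-block scale d_x * d_y.
static float my_vec_dot_q4_q8(int n, const my_block_q4 * x, const my_block_q8 * y) {
    const int nb = n / MY_QK;
    float sum = 0.0f;
    for (int i = 0; i < nb; ++i) {
        int sumi = 0;
        for (int j = 0; j < MY_QK/2; ++j) {
            // unpack two 4-bit values from one byte and re-center them around zero
            const int v0 = (x[i].qs[j] & 0x0F) - 8;
            const int v1 = (x[i].qs[j] >>   4) - 8;
            sumi += v0 * y[i].qs[2*j + 0] + v1 * y[i].qs[2*j + 1];
        }
        sum += (float) sumi * x[i].d * y[i].d; // apply per-block scales once
    }
    return sum;
}

The real kernels differ mainly in how the nibbles and extra high bits (qh) are unpacked and in whether an additional per-block minimum m contributes a separate summs term, as in the q4_1/q4_3/q5_1 variants above.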
-static void ggml_vec_dot_q5_0_q8_0_v2(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +static void ggml_v2_vec_dot_q5_0_q8_0_v2(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { const int nb = n / QK8_0; assert(n % QK8_0 == 0); @@ -17896,7 +17896,7 @@ static void ggml_vec_dot_q5_0_q8_0_v2(const int n, float * restrict s, const voi const int8x16_t v1l = vld1q_s8(y0->qs); const int8x16_t v1h = vld1q_s8(y0->qs + 16); - const float x0d = GGML_FP16_TO_FP32(x0->d); + const float x0d = GGML_V2_FP16_TO_FP32(x0->d); #if defined(__ARM_FEATURE_DOTPROD) sumv = vmlaq_n_f32(sumv, vcvtq_f32_s32(vaddq_s32( @@ -17969,7 +17969,7 @@ static void ggml_vec_dot_q5_0_q8_0_v2(const int n, float * restrict s, const voi const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h); const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h); - const float x0d = GGML_FP16_TO_FP32(x0->d); + const float x0d = GGML_V2_FP16_TO_FP32(x0->d); // dot product sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4( @@ -17989,7 +17989,7 @@ static void ggml_vec_dot_q5_0_q8_0_v2(const int n, float * restrict s, const voi // Main loop for (int i = 0; i < nb; i++) { /* Compute combined scale for the block */ - const __m256 d = _mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d)), _mm256_broadcast_ss(&y[i].d)); + const __m256 d = _mm256_mul_ps(_mm256_set1_ps(GGML_V2_FP16_TO_FP32(x[i].d)), _mm256_broadcast_ss(&y[i].d)); __m256i bx = bytes_from_nibbles_32_v2(x[i].qs); __m256i bxhi = bytes_from_bits_32(x[i].qh); @@ -18015,7 +18015,7 @@ static void ggml_vec_dot_q5_0_q8_0_v2(const int n, float * restrict s, const voi uint32_t qh; memcpy(&qh, x[i].qh, sizeof(qh)); - const float d = GGML_FP16_TO_FP32(x[i].d); + const float d = GGML_V2_FP16_TO_FP32(x[i].d); int sxy = 0; @@ -18040,7 +18040,7 @@ static void ggml_vec_dot_q5_0_q8_0_v2(const int n, float * restrict s, const voi #endif } -static void ggml_vec_dot_q5_1_q8_1_v2(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +static void ggml_v2_vec_dot_q5_1_q8_1_v2(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { const int nb = n / QK8_1; assert(n % QK8_1 == 0); @@ -18061,7 +18061,7 @@ static void ggml_vec_dot_q5_1_q8_1_v2(const int n, float * restrict s, const voi const block_q5_1 * restrict x0 = &x[i]; const block_q8_1_v2 * restrict y0 = &y[i]; - summs += GGML_FP16_TO_FP32(x0->m) * (y0->s0 + y0->s1); + summs += GGML_V2_FP16_TO_FP32(x0->m) * (y0->s0 + y0->s1); // extract the 5th bit uint32_t qh; @@ -18093,7 +18093,7 @@ static void ggml_vec_dot_q5_1_q8_1_v2(const int n, float * restrict s, const voi const int8x16_t v1l = vld1q_s8(y0->qs); const int8x16_t v1h = vld1q_s8(y0->qs + 16); - const float x0d = GGML_FP16_TO_FP32(x0->d); + const float x0d = GGML_V2_FP16_TO_FP32(x0->d); #if defined(__ARM_FEATURE_DOTPROD) sumv = vmlaq_n_f32(sumv, vcvtq_f32_s32(vaddq_s32( @@ -18124,7 +18124,7 @@ static void ggml_vec_dot_q5_1_q8_1_v2(const int n, float * restrict s, const voi const block_q5_1 * restrict x0 = &x[i]; const block_q8_1_v2 * restrict y0 = &y[i]; - summs += GGML_FP16_TO_FP32(x0->m) * (y0->s0 + y0->s1); + summs += GGML_V2_FP16_TO_FP32(x0->m) * (y0->s0 + y0->s1); const v128_t m4b = wasm_i8x16_splat(0x0F); @@ -18171,7 +18171,7 @@ static void ggml_vec_dot_q5_1_q8_1_v2(const int n, float * restrict s, const voi const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h); const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h); - const float x0d = 
GGML_FP16_TO_FP32(x0->d); + const float x0d = GGML_V2_FP16_TO_FP32(x0->d); // dot product sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4( @@ -18191,9 +18191,9 @@ static void ggml_vec_dot_q5_1_q8_1_v2(const int n, float * restrict s, const voi // Main loop for (int i = 0; i < nb; i++) { - const __m256 dx = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d)); + const __m256 dx = _mm256_set1_ps(GGML_V2_FP16_TO_FP32(x[i].d)); - summs += GGML_FP16_TO_FP32(x[i].m) * (y[i].s0 + y[i].s1); + summs += GGML_V2_FP16_TO_FP32(x[i].m) * (y[i].s0 + y[i].s1); __m256i bx = bytes_from_nibbles_32_v2(x[i].qs); __m256i bxhi = bytes_from_bits_32(x[i].qh); @@ -18219,8 +18219,8 @@ static void ggml_vec_dot_q5_1_q8_1_v2(const int n, float * restrict s, const voi uint32_t qh; memcpy(&qh, x[i].qh, sizeof(qh)); - const float d = GGML_FP16_TO_FP32(x[i].d); - const float m = GGML_FP16_TO_FP32(x[i].m); + const float d = GGML_V2_FP16_TO_FP32(x[i].d); + const float m = GGML_V2_FP16_TO_FP32(x[i].m); int sxy = 0; @@ -18246,7 +18246,7 @@ static void ggml_vec_dot_q5_1_q8_1_v2(const int n, float * restrict s, const voi #endif } -static void ggml_vec_dot_q8_0_q8_0_v2(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +static void ggml_v2_vec_dot_q8_0_q8_0_v2(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { const int nb = n / QK8_0; assert(n % QK8_0 == 0); @@ -18354,7 +18354,7 @@ static void ggml_vec_dot_q8_0_q8_0_v2(const int n, float * restrict s, const voi //////////////////////////////////////////////////////////////////////////////// -size_t ggml_quantize_q4_0_v2(const float * src, void * dst, int n, int k, int64_t * hist) { +size_t ggml_v2_quantize_q4_0_v2(const float * src, void * dst, int n, int k, int64_t * hist) { assert(k % QK4_0 == 0); const int nb = k / QK4_0; @@ -18377,7 +18377,7 @@ size_t ggml_quantize_q4_0_v2(const float * src, void * dst, int n, int k, int64_ return (n/QK4_0*sizeof(block_q4_0)); } -size_t ggml_quantize_q4_1_v2(const float * src, void * dst, int n, int k, int64_t * hist) { +size_t ggml_v2_quantize_q4_1_v2(const float * src, void * dst, int n, int k, int64_t * hist) { assert(k % QK4_1 == 0); const int nb = k / QK4_1; @@ -18400,7 +18400,7 @@ size_t ggml_quantize_q4_1_v2(const float * src, void * dst, int n, int k, int64_ return (n/QK4_1*sizeof(block_q4_1)); } -size_t ggml_quantize_q4_2_v2(const float * src, void * dst, int n, int k, int64_t * hist) { +size_t ggml_v2_quantize_q4_2_v2(const float * src, void * dst, int n, int k, int64_t * hist) { assert(k % QK4_2 == 0); const int nb = k / QK4_2; @@ -18423,7 +18423,7 @@ size_t ggml_quantize_q4_2_v2(const float * src, void * dst, int n, int k, int64_ return (n/QK4_2*sizeof(block_q4_2)); } -size_t ggml_quantize_q4_3_v2(const float * src, void * dst, int n, int k, int64_t * hist) { +size_t ggml_v2_quantize_q4_3_v2(const float * src, void * dst, int n, int k, int64_t * hist) { assert(k % QK4_3 == 0); const int nb = k / QK4_3; @@ -18446,7 +18446,7 @@ size_t ggml_quantize_q4_3_v2(const float * src, void * dst, int n, int k, int64_ return (n/QK4_3*sizeof(block_q4_3)); } -size_t ggml_quantize_q5_0_v2(const float * src, void * dst, int n, int k, int64_t * hist) { +size_t ggml_v2_quantize_q5_0_v2(const float * src, void * dst, int n, int k, int64_t * hist) { assert(k % QK5_0 == 0); const int nb = k / QK5_0; @@ -18476,7 +18476,7 @@ size_t ggml_quantize_q5_0_v2(const float * src, void * dst, int n, int k, int64_ return (n/QK5_0*sizeof(block_q5_0)); } -size_t 
ggml_quantize_q5_1_v2(const float * src, void * dst, int n, int k, int64_t * hist) { +size_t ggml_v2_quantize_q5_1_v2(const float * src, void * dst, int n, int k, int64_t * hist) { assert(k % QK5_1 == 0); const int nb = k / QK5_1; @@ -18506,7 +18506,7 @@ size_t ggml_quantize_q5_1_v2(const float * src, void * dst, int n, int k, int64_ return (n/QK5_1*sizeof(block_q5_1)); } -size_t ggml_quantize_q8_0_v2(const float * src, void * dst, int n, int k, int64_t * hist) { +size_t ggml_v2_quantize_q8_0_v2(const float * src, void * dst, int n, int k, int64_t * hist) { assert(k % QK8_0 == 0); const int nb = k / QK8_0; @@ -18528,50 +18528,50 @@ size_t ggml_quantize_q8_0_v2(const float * src, void * dst, int n, int k, int64_ } //TODO: integrate -size_t ggml_quantize_chunk_v2(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist) { +size_t ggml_v2_quantize_chunk_v2(enum ggml_v2_type type, const float * src, void * dst, int start, int n, int64_t * hist) { size_t result = 0; switch (type) { - case GGML_TYPE_Q4_0: + case GGML_V2_TYPE_Q4_0: { - GGML_ASSERT(start % QK4_0 == 0); + GGML_V2_ASSERT(start % QK4_0 == 0); block_q4_0 * block = (block_q4_0*)dst + start / QK4_0; - result = ggml_quantize_q4_0_v2(src + start, block, n, n, hist); + result = ggml_v2_quantize_q4_0_v2(src + start, block, n, n, hist); } break; - case GGML_TYPE_Q4_1: + case GGML_V2_TYPE_Q4_1: { - GGML_ASSERT(start % QK4_1 == 0); + GGML_V2_ASSERT(start % QK4_1 == 0); block_q4_1 * block = (block_q4_1*)dst + start / QK4_1; - result = ggml_quantize_q4_1_v2(src + start, block, n, n, hist); + result = ggml_v2_quantize_q4_1_v2(src + start, block, n, n, hist); } break; - case GGML_TYPE_Q4_2: + case GGML_V2_TYPE_Q4_2: { - GGML_ASSERT(start % QK4_2 == 0); + GGML_V2_ASSERT(start % QK4_2 == 0); block_q4_2 * block = (block_q4_2*)dst + start / QK4_2; - result = ggml_quantize_q4_2_v2(src + start, block, n, n, hist); + result = ggml_v2_quantize_q4_2_v2(src + start, block, n, n, hist); } break; - case GGML_TYPE_Q4_3: + case GGML_V2_TYPE_Q4_3: { - GGML_ASSERT(start % QK4_3 == 0); + GGML_V2_ASSERT(start % QK4_3 == 0); block_q4_3 * block = (block_q4_3*)dst + start / QK4_3; - result = ggml_quantize_q4_3_v2(src + start, block, n, n, hist); + result = ggml_v2_quantize_q4_3_v2(src + start, block, n, n, hist); } break; - case GGML_TYPE_Q5_0: + case GGML_V2_TYPE_Q5_0: { - GGML_ASSERT(start % QK5_0 == 0); + GGML_V2_ASSERT(start % QK5_0 == 0); block_q5_0 * block = (block_q5_0*)dst + start / QK5_0; - result = ggml_quantize_q5_0_v2(src + start, block, n, n, hist); + result = ggml_v2_quantize_q5_0_v2(src + start, block, n, n, hist); } break; - case GGML_TYPE_Q5_1: + case GGML_V2_TYPE_Q5_1: { - GGML_ASSERT(start % QK5_1 == 0); + GGML_V2_ASSERT(start % QK5_1 == 0); block_q5_1 * block = (block_q5_1*)dst + start / QK5_1; - result = ggml_quantize_q5_1_v2(src + start, block, n, n, hist); + result = ggml_v2_quantize_q5_1_v2(src + start, block, n, n, hist); } break; - case GGML_TYPE_Q8_0: + case GGML_V2_TYPE_Q8_0: { - GGML_ASSERT(start % QK8_0 == 0); + GGML_V2_ASSERT(start % QK8_0 == 0); block_q8_0 * block = (block_q8_0*)dst + start / QK8_0; - result = ggml_quantize_q8_0_v2(src + start, block, n, n, hist); + result = ggml_v2_quantize_q8_0_v2(src + start, block, n, n, hist); } break; default: assert(false); diff --git a/otherarch/ggml_v2.h b/otherarch/ggml_v2.h new file mode 100644 index 000000000..f24d6748d --- /dev/null +++ b/otherarch/ggml_v2.h @@ -0,0 +1,1143 @@ +#pragma once + +// +// GGML Tensor Library +// +// This documentation is still a work 
in progress. +// If you wish some specific topics to be covered, feel free to drop a comment: +// +// https://github.com/ggerganov/whisper.cpp/issues/40 +// +// ## Overview +// +// This library implements: +// +// - a set of tensor operations +// - automatic differentiation +// - basic optimization algorithms +// +// The aim of this library is to provide a minimalistic approach for various machine learning tasks. This includes, +// but is not limited to, the following: +// +// - linear regression +// - support vector machines +// - neural networks +// +// The library allows the user to define a certain function using the available tensor operations. This function +// definition is represented internally via a computation graph. Each tensor operation in the function definition +// corresponds to a node in the graph. Having the computation graph defined, the user can choose to compute the +// function's value and/or its gradient with respect to the input variables. Optionally, the function can be optimized +// using one of the available optimization algorithms. +// +// For example, here we define the function: f(x) = a*x^2 + b +// +// { +// struct ggml_v2_init_params params = { +// .mem_size = 16*1024*1024, +// .mem_buffer = NULL, +// }; +// +// // memory allocation happens here +// struct ggml_v2_context * ctx = ggml_v2_init(params); +// +// struct ggml_v2_tensor * x = ggml_v2_new_tensor_1d(ctx, GGML_V2_TYPE_F32, 1); +// +// ggml_v2_set_param(ctx, x); // x is an input variable +// +// struct ggml_v2_tensor * a = ggml_v2_new_tensor_1d(ctx, GGML_V2_TYPE_F32, 1); +// struct ggml_v2_tensor * b = ggml_v2_new_tensor_1d(ctx, GGML_V2_TYPE_F32, 1); +// struct ggml_v2_tensor * x2 = ggml_v2_mul(ctx, x, x); +// struct ggml_v2_tensor * f = ggml_v2_add(ctx, ggml_v2_mul(ctx, a, x2), b); +// +// ... +// } +// +// Notice that the function definition above does not involve any actual computation. The computation is performed only +// when the user explicitly requests it. For example, to compute the function's value at x = 2.0: +// +// { +// ... +// +// struct ggml_v2_cgraph gf = ggml_v2_build_forward(f); +// +// // set the input variable and parameter values +// ggml_v2_set_f32(x, 2.0f); +// ggml_v2_set_f32(a, 3.0f); +// ggml_v2_set_f32(b, 4.0f); +// +// ggml_v2_graph_compute(ctx0, &gf); +// +// printf("f = %f\n", ggml_v2_get_f32_1d(f, 0)); +// +// ... +// } +// +// The actual computation is performed in the ggml_v2_graph_compute() function. +// +// The ggml_v2_new_tensor_...() functions create new tensors. They are allocated in the memory buffer provided to the +// ggml_v2_init() function. You have to be careful not to exceed the memory buffer size. Therefore, you have to know +// in advance how much memory you need for your computation. Alternatively, you can allocate a large enough memory +// and after defining the computation graph, call the ggml_v2_used_mem() function to find out how much memory was +// actually needed. +// +// The ggml_v2_set_param() function marks a tensor as an input variable. This is used by the automatic +// differentiation and optimization algorithms. +// +// The described approach allows to define the function graph once and then compute its forward or backward graphs +// multiple times. All computations will use the same memory buffer allocated in the ggml_v2_init() function. This way +// the user can avoid the memory allocation overhead at runtime. +// +// The library supports multi-dimensional tensors - up to 4 dimensions. 
The FP16 and FP32 data types are first class +// citizens, but in theory the library can be extended to support FP8 and integer data types. +// +// Each tensor operation produces a new tensor. Initially the library was envisioned to support only the use of unary +// and binary operations. Most of the available operations fall into one of these two categories. With time, it became +// clear that the library needs to support more complex operations. The way to support these operations is not clear +// yet, but a few examples are demonstrated in the following operations: +// +// - ggml_v2_permute() +// - ggml_v2_conv_1d_1s() +// - ggml_v2_conv_1d_2s() +// +// For each tensor operator, the library implements a forward and backward computation function. The forward function +// computes the output tensor value given the input tensor values. The backward function computes the adjoint of the +// input tensors given the adjoint of the output tensor. For a detailed explanation of what this means, take a +// calculus class, or watch the following video: +// +// What is Automatic Differentiation? +// https://www.youtube.com/watch?v=wG_nF1awSSY +// +// +// ## Tensor data (struct ggml_v2_tensor) +// +// The tensors are stored in memory via the ggml_v2_tensor struct. The structure provides information about the size of +// the tensor, the data type, and the memory buffer where the tensor data is stored. Additionally, it contains +// pointers to the "source" tensors - i.e. the tensors that were used to compute the current tensor. For example: +// +// { +// struct ggml_v2_tensor * c = ggml_v2_add(ctx, a, b); +// +// assert(c->src[0] == a); +// assert(c->src[1] == b); +// } +// +// The multi-dimensional tensors are stored in row-major order. The ggml_v2_tensor struct contains fields for the +// number of elements in each dimension ("ne") as well as the number of bytes ("nb", a.k.a. stride). This allows +// to store tensors that are not contiguous in memory, which is useful for operations such as transposition and +// permutation. All tensor operations have to take the stride into account and not assume that the tensor is +// contiguous in memory. +// +// The data of the tensor is accessed via the "data" pointer. For example: +// +// { +// struct ggml_v2_tensor * a = ggml_v2_new_tensor_2d(ctx, GGML_V2_TYPE_F32, 2, 3); +// +// // a[1, 2] = 1.0f; +// *(float *) ((char *) a->data + 2*a->nb[1] + 1*a->nb[0]) = 1.0f; +// +// // a[2, 0] = 2.0f; +// *(float *) ((char *) a->data + 0*a->nb[1] + 2*a->nb[0]) = 2.0f; +// +// ... +// } +// +// Alternatively, there are helper functions, such as ggml_v2_get_f32_1d() and ggml_v2_set_f32_1d() that can be used. 
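As a quick illustration of the stride convention just described, here is a hedged sketch that computes an element address directly from nb[]; the ggml_v2_f32_at helper is hypothetical (it is not declared in this header) and assumes an F32 tensor.

#include <stdint.h>
#include "ggml_v2.h" // assumed include; provides struct ggml_v2_tensor with nb[] and data

// Hypothetical helper (not part of this header): pointer to the F32 element at
// indices (i0, i1, i2, i3), using the nb[] byte strides exactly as documented above,
// so it also works for non-contiguous (permuted/transposed) tensors.
static inline float * ggml_v2_f32_at(struct ggml_v2_tensor * t,
                                     int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
    return (float *) ((char *) t->data
        + i0*t->nb[0]
        + i1*t->nb[1]
        + i2*t->nb[2]
        + i3*t->nb[3]);
}

// Usage, equivalent to the a[1, 2] example above:
//   *ggml_v2_f32_at(a, 1, 2, 0, 0) = 1.0f;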
+// +// ## The matrix multiplication operator (ggml_v2_mul_mat) +// +// TODO +// +// +// ## Multi-threading +// +// TODO +// +// +// ## Overview of ggml.c +// +// TODO +// +// +// ## SIMD optimizations +// +// TODO +// +// +// ## Debugging ggml +// +// TODO +// +// + +#ifdef GGML_V2_SHARED +# if defined(_WIN32) && !defined(__MINGW32__) +# ifdef GGML_V2_BUILD +# define GGML_V2_API __declspec(dllexport) +# else +# define GGML_V2_API __declspec(dllimport) +# endif +# else +# define GGML_V2_API __attribute__ ((visibility ("default"))) +# endif +#else +# define GGML_V2_API +#endif + +#include +#include +#include + +#define GGML_V2_FILE_MAGIC 0x67676d6c // "ggml" +#define GGML_V2_FILE_VERSION 1 + +#define GGML_V2_QNT_VERSION 1 // bump this on quantization format changes +#define GGML_V2_QNT_VERSION_FACTOR 1000 // do not change this + +#define GGML_V2_MAX_DIMS 4 +#define GGML_V2_MAX_NODES 4096 +#define GGML_V2_MAX_PARAMS 256 +#define GGML_V2_MAX_CONTEXTS 64 +#define GGML_V2_MAX_OPT 4 +#define GGML_V2_DEFAULT_N_THREADS 4 + +#define GGML_V2_ASSERT(x) \ + do { \ + if (!(x)) { \ + fprintf(stderr, "GGML_V2_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \ + abort(); \ + } \ + } while (0) + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef __ARM_NEON + // we use the built-in 16-bit float type + typedef __fp16 ggml_v2_fp16_t; +#else + typedef uint16_t ggml_v2_fp16_t; +#endif + + // convert FP16 <-> FP32 + GGML_V2_API float ggml_v2_fp16_to_fp32(ggml_v2_fp16_t x); + GGML_V2_API ggml_v2_fp16_t ggml_v2_fp32_to_fp16(float x); + + GGML_V2_API void ggml_v2_fp16_to_fp32_row(const ggml_v2_fp16_t * x, float * y, size_t n); + GGML_V2_API void ggml_v2_fp32_to_fp16_row(const float * x, ggml_v2_fp16_t * y, size_t n); + + struct ggml_v2_object; + struct ggml_v2_context; + + enum ggml_v2_type { + GGML_V2_TYPE_F32 = 0, + GGML_V2_TYPE_F16 = 1, + GGML_V2_TYPE_Q4_0 = 2, + GGML_V2_TYPE_Q4_1 = 3, + GGML_V2_TYPE_Q4_2 = 4, //support has been removed + GGML_V2_TYPE_Q4_3 = 5, //support has been removed + GGML_V2_TYPE_Q5_0 = 6, + GGML_V2_TYPE_Q5_1 = 7, + GGML_V2_TYPE_Q8_0 = 8, + GGML_V2_TYPE_Q8_1 = 9, + GGML_V2_TYPE_I8, + GGML_V2_TYPE_I16, + GGML_V2_TYPE_I32, + GGML_V2_TYPE_Q8_1B = 13, //legacy q8_1 + GGML_V2_TYPE_COUNT, + }; + + enum ggml_v2_backend { + GGML_V2_BACKEND_CPU = 0, + GGML_V2_BACKEND_CUDA = 1, + GGML_V2_BACKEND_CL = 2, + }; + + // model file types + enum ggml_v2_ftype { + GGML_V2_FTYPE_UNKNOWN = -1, + GGML_V2_FTYPE_ALL_F32 = 0, + GGML_V2_FTYPE_MOSTLY_F16 = 1, // except 1d tensors + GGML_V2_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors + GGML_V2_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors + GGML_V2_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16 + GGML_V2_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors + GGML_V2_FTYPE_MOSTLY_Q4_3 = 6, // except 1d tensors + GGML_V2_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors + GGML_V2_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors + GGML_V2_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors + }; + + // available tensor operations: + enum ggml_v2_op { + GGML_V2_OP_NONE = 0, + + GGML_V2_OP_DUP, + GGML_V2_OP_ADD, + GGML_V2_OP_ADD1, + GGML_V2_OP_ACC, + GGML_V2_OP_SUB, + GGML_V2_OP_MUL, + GGML_V2_OP_DIV, + GGML_V2_OP_SQR, + GGML_V2_OP_SQRT, + GGML_V2_OP_LOG, + GGML_V2_OP_SUM, + GGML_V2_OP_SUM_ROWS, + GGML_V2_OP_MEAN, + GGML_V2_OP_REPEAT, + GGML_V2_OP_ABS, + GGML_V2_OP_SGN, + GGML_V2_OP_NEG, + GGML_V2_OP_STEP, + GGML_V2_OP_RELU, + GGML_V2_OP_GELU, + GGML_V2_OP_SILU, + GGML_V2_OP_SILU_BACK, + GGML_V2_OP_NORM, // normalize + GGML_V2_OP_RMS_NORM, + GGML_V2_OP_RMS_NORM_BACK, 
+ + GGML_V2_OP_MUL_MAT, + + GGML_V2_OP_SCALE, + GGML_V2_OP_SET, + GGML_V2_OP_CPY, + GGML_V2_OP_CONT, + GGML_V2_OP_RESHAPE, + GGML_V2_OP_VIEW, + GGML_V2_OP_PERMUTE, + GGML_V2_OP_TRANSPOSE, + GGML_V2_OP_GET_ROWS, + GGML_V2_OP_GET_ROWS_BACK, + GGML_V2_OP_DIAG, + GGML_V2_OP_DIAG_MASK_INF, + GGML_V2_OP_DIAG_MASK_ZERO, + GGML_V2_OP_SOFT_MAX, + GGML_V2_OP_ROPE, + GGML_V2_OP_ROPE_BACK, + GGML_V2_OP_ALIBI, + GGML_V2_OP_CONV_1D_1S, + GGML_V2_OP_CONV_1D_2S, + + GGML_V2_OP_FLASH_ATTN, + GGML_V2_OP_FLASH_FF, + + GGML_V2_OP_MAP_UNARY, + GGML_V2_OP_MAP_BINARY, + + GGML_V2_OP_COUNT, + }; + + + // ggml object + struct ggml_v2_object { + size_t offs; + size_t size; + + struct ggml_v2_object * next; + + char padding[8]; + }; + + static const size_t GGML_V2_OBJECT_SIZE = sizeof(struct ggml_v2_object); + + // n-dimensional tensor + struct ggml_v2_tensor { + enum ggml_v2_type type; + enum ggml_v2_backend backend; + + int n_dims; + int64_t ne[GGML_V2_MAX_DIMS]; // number of elements + size_t nb[GGML_V2_MAX_DIMS]; // stride in bytes: + // nb[0] = sizeof(type) + // nb[1] = nb[0] * ne[0] + padding + // nb[i] = nb[i-1] * ne[i-1] + + // compute data + enum ggml_v2_op op; + + bool is_param; + + struct ggml_v2_tensor * grad; + struct ggml_v2_tensor * src0; + struct ggml_v2_tensor * src1; + struct ggml_v2_tensor * opt[GGML_V2_MAX_OPT]; + + // thread scheduling + int n_tasks; + + // performance + int perf_runs; + int64_t perf_cycles; + int64_t perf_time_us; + + void * data; + + char name[32]; + + char padding[16]; + }; + + // computation graph + struct ggml_v2_cgraph { + int n_nodes; + int n_leafs; + int n_threads; + + size_t work_size; + struct ggml_v2_tensor * work; + + struct ggml_v2_tensor * nodes[GGML_V2_MAX_NODES]; + struct ggml_v2_tensor * grads[GGML_V2_MAX_NODES]; + struct ggml_v2_tensor * leafs[GGML_V2_MAX_NODES]; + + // performance + int perf_runs; + int64_t perf_cycles; + int64_t perf_time_us; + }; + + // scratch buffer + struct ggml_v2_scratch { + size_t offs; + size_t size; + void * data; + }; + + struct ggml_v2_init_params { + // memory pool + size_t mem_size; // bytes + void * mem_buffer; // if NULL, memory will be allocated internally + bool no_alloc; // don't allocate memory for the tensor data + }; + + // misc + + GGML_V2_API void ggml_v2_time_init(void); // call this once at the beginning of the program + GGML_V2_API int64_t ggml_v2_time_ms(void); + GGML_V2_API int64_t ggml_v2_time_us(void); + GGML_V2_API int64_t ggml_v2_cycles(void); + GGML_V2_API int64_t ggml_v2_cycles_per_ms(void); + + GGML_V2_API void ggml_v2_print_object (const struct ggml_v2_object * obj); + GGML_V2_API void ggml_v2_print_objects(const struct ggml_v2_context * ctx); + + GGML_V2_API int64_t ggml_v2_nelements(const struct ggml_v2_tensor * tensor); + GGML_V2_API size_t ggml_v2_nbytes (const struct ggml_v2_tensor * tensor); + + GGML_V2_API int ggml_v2_blck_size (enum ggml_v2_type type); + GGML_V2_API size_t ggml_v2_type_size (enum ggml_v2_type type); // size in bytes for all elements in a block + GGML_V2_API float ggml_v2_type_sizef(enum ggml_v2_type type); // ggml_v2_type_size()/ggml_v2_blck_size() as float + + GGML_V2_API const char * ggml_v2_type_name(enum ggml_v2_type type); + + GGML_V2_API size_t ggml_v2_element_size(const struct ggml_v2_tensor * tensor); + + GGML_V2_API bool ggml_v2_is_quantized(enum ggml_v2_type type); + + // TODO: temporary until model loading of ggml examples is refactored + GGML_V2_API enum ggml_v2_type ggml_v2_ftype_to_ggml_v2_type(enum ggml_v2_ftype ftype); + + // main + + GGML_V2_API struct 
ggml_v2_context * ggml_v2_init(struct ggml_v2_init_params params); + GGML_V2_API void ggml_v2_free(struct ggml_v2_context * ctx); + + GGML_V2_API size_t ggml_v2_used_mem(const struct ggml_v2_context * ctx); + + GGML_V2_API size_t ggml_v2_set_scratch(struct ggml_v2_context * ctx, struct ggml_v2_scratch scratch); + + GGML_V2_API struct ggml_v2_tensor * ggml_v2_new_tensor( + struct ggml_v2_context * ctx, + enum ggml_v2_type type, + int n_dims, + const int64_t *ne); + + GGML_V2_API struct ggml_v2_tensor * ggml_v2_new_tensor_1d( + struct ggml_v2_context * ctx, + enum ggml_v2_type type, + int64_t ne0); + + GGML_V2_API struct ggml_v2_tensor * ggml_v2_new_tensor_2d( + struct ggml_v2_context * ctx, + enum ggml_v2_type type, + int64_t ne0, + int64_t ne1); + + GGML_V2_API struct ggml_v2_tensor * ggml_v2_new_tensor_3d( + struct ggml_v2_context * ctx, + enum ggml_v2_type type, + int64_t ne0, + int64_t ne1, + int64_t ne2); + + GGML_V2_API struct ggml_v2_tensor * ggml_v2_new_tensor_4d( + struct ggml_v2_context * ctx, + enum ggml_v2_type type, + int64_t ne0, + int64_t ne1, + int64_t ne2, + int64_t ne3); + + GGML_V2_API struct ggml_v2_tensor * ggml_v2_new_i32(struct ggml_v2_context * ctx, int32_t value); + GGML_V2_API struct ggml_v2_tensor * ggml_v2_new_f32(struct ggml_v2_context * ctx, float value); + + GGML_V2_API struct ggml_v2_tensor * ggml_v2_dup_tensor (struct ggml_v2_context * ctx, const struct ggml_v2_tensor * src); + GGML_V2_API struct ggml_v2_tensor * ggml_v2_view_tensor(struct ggml_v2_context * ctx, const struct ggml_v2_tensor * src); + + GGML_V2_API struct ggml_v2_tensor * ggml_v2_set_zero(struct ggml_v2_tensor * tensor); + GGML_V2_API struct ggml_v2_tensor * ggml_v2_set_i32 (struct ggml_v2_tensor * tensor, int32_t value); + GGML_V2_API struct ggml_v2_tensor * ggml_v2_set_f32 (struct ggml_v2_tensor * tensor, float value); + + GGML_V2_API int32_t ggml_v2_get_i32_1d(const struct ggml_v2_tensor * tensor, int i); + GGML_V2_API void ggml_v2_set_i32_1d(const struct ggml_v2_tensor * tensor, int i, int32_t value); + + GGML_V2_API float ggml_v2_get_f32_1d(const struct ggml_v2_tensor * tensor, int i); + GGML_V2_API void ggml_v2_set_f32_1d(const struct ggml_v2_tensor * tensor, int i, float value); + + GGML_V2_API void * ggml_v2_get_data (const struct ggml_v2_tensor * tensor); + GGML_V2_API float * ggml_v2_get_data_f32(const struct ggml_v2_tensor * tensor); + + GGML_V2_API const char * ggml_v2_get_name(const struct ggml_v2_tensor * tensor); + GGML_V2_API void ggml_v2_set_name(struct ggml_v2_tensor * tensor, const char * name); + + // + // operations on tensors with backpropagation + // + + GGML_V2_API struct ggml_v2_tensor * ggml_v2_dup( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a); + + GGML_V2_API struct ggml_v2_tensor * ggml_v2_add( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + struct ggml_v2_tensor * b); + + GGML_V2_API struct ggml_v2_tensor * ggml_v2_add_inplace( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + struct ggml_v2_tensor * b); + + GGML_V2_API struct ggml_v2_tensor * ggml_v2_add1( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + struct ggml_v2_tensor * b); + + GGML_V2_API struct ggml_v2_tensor * ggml_v2_acc( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + struct ggml_v2_tensor * b, + size_t nb1, + size_t nb2, + size_t nb3, + size_t offset); + + GGML_V2_API struct ggml_v2_tensor * ggml_v2_acc_inplace( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + struct ggml_v2_tensor * b, + size_t nb1, + size_t nb2, 
+ size_t nb3, + size_t offset); + + GGML_V2_API struct ggml_v2_tensor * ggml_v2_sub( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + struct ggml_v2_tensor * b); + + GGML_V2_API struct ggml_v2_tensor * ggml_v2_mul( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + struct ggml_v2_tensor * b); + + GGML_V2_API struct ggml_v2_tensor * ggml_v2_div( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + struct ggml_v2_tensor * b); + + GGML_V2_API struct ggml_v2_tensor * ggml_v2_sqr( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a); + + GGML_V2_API struct ggml_v2_tensor * ggml_v2_sqrt( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a); + + GGML_V2_API struct ggml_v2_tensor * ggml_v2_log( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a); + + GGML_V2_API struct ggml_v2_tensor * ggml_v2_log_inplace( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a); + + // return scalar + GGML_V2_API struct ggml_v2_tensor * ggml_v2_sum( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a); + + // sums along rows, with input shape [a,b,c,d] return shape [1,b,c,d] + GGML_V2_API struct ggml_v2_tensor * ggml_v2_sum_rows( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a); + + // mean along rows + GGML_V2_API struct ggml_v2_tensor * ggml_v2_mean( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a); + + // if a is the same shape as b, and a is not parameter, return a + // otherwise, return a new tensor: repeat(a) to fit in b + GGML_V2_API struct ggml_v2_tensor * ggml_v2_repeat( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + struct ggml_v2_tensor * b); + + GGML_V2_API struct ggml_v2_tensor * ggml_v2_abs( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a); + + GGML_V2_API struct ggml_v2_tensor * ggml_v2_sgn( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a); + + GGML_V2_API struct ggml_v2_tensor * ggml_v2_neg( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a); + + GGML_V2_API struct ggml_v2_tensor * ggml_v2_step( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a); + + GGML_V2_API struct ggml_v2_tensor * ggml_v2_relu( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a); + + // TODO: double-check this computation is correct + GGML_V2_API struct ggml_v2_tensor * ggml_v2_gelu( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a); + + GGML_V2_API struct ggml_v2_tensor * ggml_v2_silu( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a); + + // a - x + // b - dy + GGML_V2_API struct ggml_v2_tensor * ggml_v2_silu_back( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + struct ggml_v2_tensor * b); + + // normalize along rows + // TODO: eps is hardcoded to 1e-5 for now + GGML_V2_API struct ggml_v2_tensor * ggml_v2_norm( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a); + + GGML_V2_API struct ggml_v2_tensor * ggml_v2_rms_norm( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a); + + // a - x + // b - dy + GGML_V2_API struct ggml_v2_tensor * ggml_v2_rms_norm_back( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + struct ggml_v2_tensor * b); + + // A: m rows, n columns + // B: p rows, n columns (i.e. 
we transpose it internally) + // result is m columns, p rows + GGML_V2_API struct ggml_v2_tensor * ggml_v2_mul_mat( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + struct ggml_v2_tensor * b); + + // + // operations on tensors without backpropagation + // + + GGML_V2_API struct ggml_v2_tensor * ggml_v2_scale( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + struct ggml_v2_tensor * b); + + // in-place, returns view(a) + GGML_V2_API struct ggml_v2_tensor * ggml_v2_scale_inplace( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + struct ggml_v2_tensor * b); + + // b -> view(a,offset,nb1,nb2,3), return modified a + GGML_V2_API struct ggml_v2_tensor * ggml_v2_set( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + struct ggml_v2_tensor * b, + size_t nb1, + size_t nb2, + size_t nb3, + size_t offset); + + // b -> view(a,offset,nb1,nb2,3), return view(a) + GGML_V2_API struct ggml_v2_tensor * ggml_v2_set_inplace( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + struct ggml_v2_tensor * b, + size_t nb1, + size_t nb2, + size_t nb3, + size_t offset); + + GGML_V2_API struct ggml_v2_tensor * ggml_v2_set_1d( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + struct ggml_v2_tensor * b, + size_t offset); + + GGML_V2_API struct ggml_v2_tensor * ggml_v2_set_1d_inplace( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + struct ggml_v2_tensor * b, + size_t offset); + + // b -> view(a,offset,nb1,nb2,3), return modified a + GGML_V2_API struct ggml_v2_tensor * ggml_v2_set_2d( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + struct ggml_v2_tensor * b, + size_t nb1, + size_t offset); + + // b -> view(a,offset,nb1,nb2,3), return view(a) + GGML_V2_API struct ggml_v2_tensor * ggml_v2_set_2d_inplace( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + struct ggml_v2_tensor * b, + size_t nb1, + size_t offset); + + + // a -> b, return view(b) + GGML_V2_API struct ggml_v2_tensor * ggml_v2_cpy( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + struct ggml_v2_tensor * b); + + // make contiguous + GGML_V2_API struct ggml_v2_tensor * ggml_v2_cont( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a); + + // return view(a), b specifies the new shape + // TODO: when we start computing gradient, make a copy instead of view + GGML_V2_API struct ggml_v2_tensor * ggml_v2_reshape( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + struct ggml_v2_tensor * b); + + // return view(a) + // TODO: when we start computing gradient, make a copy instead of view + GGML_V2_API struct ggml_v2_tensor * ggml_v2_reshape_1d( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + int64_t ne0); + + GGML_V2_API struct ggml_v2_tensor * ggml_v2_reshape_2d( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + int64_t ne0, + int64_t ne1); + + // return view(a) + // TODO: when we start computing gradient, make a copy instead of view + GGML_V2_API struct ggml_v2_tensor * ggml_v2_reshape_3d( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + int64_t ne0, + int64_t ne1, + int64_t ne2); + + GGML_V2_API struct ggml_v2_tensor * ggml_v2_reshape_4d( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + int64_t ne0, + int64_t ne1, + int64_t ne2, + int64_t ne3); + + // offset in bytes + GGML_V2_API struct ggml_v2_tensor * ggml_v2_view_1d( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + int64_t ne0, + size_t offset); + + GGML_V2_API struct ggml_v2_tensor * 
ggml_v2_view_2d( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + int64_t ne0, + int64_t ne1, + size_t nb1, // row stride in bytes + size_t offset); + + GGML_V2_API struct ggml_v2_tensor * ggml_v2_view_3d( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + int64_t ne0, + int64_t ne1, + int64_t ne2, + size_t nb1, // row stride in bytes + size_t nb2, // slice stride in bytes + size_t offset); + + GGML_V2_API struct ggml_v2_tensor * ggml_v2_view_4d( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + int64_t ne0, + int64_t ne1, + int64_t ne2, + int64_t ne3, + size_t nb1, // row stride in bytes + size_t nb2, // slice stride in bytes + size_t nb3, + size_t offset); + + GGML_V2_API struct ggml_v2_tensor * ggml_v2_permute( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + int axis0, + int axis1, + int axis2, + int axis3); + + // alias for ggml_v2_permute(ctx, a, 1, 0, 2, 3) + GGML_V2_API struct ggml_v2_tensor * ggml_v2_transpose( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a); + + GGML_V2_API struct ggml_v2_tensor * ggml_v2_get_rows( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + struct ggml_v2_tensor * b); + + GGML_V2_API struct ggml_v2_tensor * ggml_v2_get_rows_back( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + struct ggml_v2_tensor * b, + struct ggml_v2_tensor * c); + + GGML_V2_API struct ggml_v2_tensor * ggml_v2_diag( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a); + + // set elements above the diagonal to -INF + GGML_V2_API struct ggml_v2_tensor * ggml_v2_diag_mask_inf( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + int n_past); + + // in-place, returns view(a) + GGML_V2_API struct ggml_v2_tensor * ggml_v2_diag_mask_inf_inplace( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + int n_past); + + // set elements above the diagonal to 0 + GGML_V2_API struct ggml_v2_tensor * ggml_v2_diag_mask_zero( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + int n_past); + + // in-place, returns view(a) + GGML_V2_API struct ggml_v2_tensor * ggml_v2_diag_mask_zero_inplace( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + int n_past); + + GGML_V2_API struct ggml_v2_tensor * ggml_v2_soft_max( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a); + + // in-place, returns view(a) + GGML_V2_API struct ggml_v2_tensor * ggml_v2_soft_max_inplace( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a); + + // rotary position embedding + // if mode & 1 == 1, skip n_past elements + // if mode & 2 == 1, GPT-NeoX style + // TODO: avoid creating a new tensor every time + GGML_V2_API struct ggml_v2_tensor * ggml_v2_rope( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + int n_past, + int n_dims, + int mode); + + // in-place, returns view(a) + GGML_V2_API struct ggml_v2_tensor * ggml_v2_rope_inplace( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + int n_past, + int n_dims, + int mode); + + // rotary position embedding backward, i.e compute dx from dy + // a - dy + GGML_V2_API struct ggml_v2_tensor * ggml_v2_rope_back( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + int n_past, + int n_dims, + int mode); + + // alibi position embedding + // in-place, returns view(a) + struct ggml_v2_tensor * ggml_v2_alibi( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + int n_past, + int n_head); + + // padding = 1 + // TODO: we don't support extra parameters for now + // that's why we are hard-coding the 
stride, padding, and dilation + // not great .. + GGML_V2_API struct ggml_v2_tensor * ggml_v2_conv_1d_1s( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + struct ggml_v2_tensor * b); + + GGML_V2_API struct ggml_v2_tensor * ggml_v2_conv_1d_2s( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + struct ggml_v2_tensor * b); + + GGML_V2_API struct ggml_v2_tensor * ggml_v2_flash_attn( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * q, + struct ggml_v2_tensor * k, + struct ggml_v2_tensor * v, + bool masked); + + GGML_V2_API struct ggml_v2_tensor * ggml_v2_flash_ff( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + struct ggml_v2_tensor * b0, + struct ggml_v2_tensor * b1, + struct ggml_v2_tensor * c0, + struct ggml_v2_tensor * c1); + + // Mapping operations + typedef void (*ggml_v2_unary_op_f32_t)(const int, float *, const float *); + typedef void (*ggml_v2_binary_op_f32_t)(const int, float *, const float *, const float *); + + GGML_V2_API struct ggml_v2_tensor * ggml_v2_map_unary_f32( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + ggml_v2_unary_op_f32_t fun); + + GGML_V2_API struct ggml_v2_tensor * ggml_v2_map_binary_f32( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * a, + struct ggml_v2_tensor * b, + ggml_v2_binary_op_f32_t fun); + + // + // automatic differentiation + // + + GGML_V2_API void ggml_v2_set_param( + struct ggml_v2_context * ctx, + struct ggml_v2_tensor * tensor); + + GGML_V2_API void ggml_v2_build_forward_expand(struct ggml_v2_cgraph * cgraph, struct ggml_v2_tensor * tensor); + + GGML_V2_API struct ggml_v2_cgraph ggml_v2_build_forward (struct ggml_v2_tensor * tensor); + GGML_V2_API struct ggml_v2_cgraph ggml_v2_build_backward(struct ggml_v2_context * ctx, struct ggml_v2_cgraph * gf, bool keep); + + GGML_V2_API void ggml_v2_graph_compute(struct ggml_v2_context * ctx, struct ggml_v2_cgraph * cgraph); + GGML_V2_API void ggml_v2_graph_reset (struct ggml_v2_cgraph * cgraph); + + // print info and performance information for the graph + GGML_V2_API void ggml_v2_graph_print(const struct ggml_v2_cgraph * cgraph); + + // dump the graph into a file using the dot format + GGML_V2_API void ggml_v2_graph_dump_dot(const struct ggml_v2_cgraph * gb, const struct ggml_v2_cgraph * gf, const char * filename); + + // + // optimization + // + + // optimization methods + enum ggml_v2_opt_type { + GGML_V2_OPT_ADAM, + GGML_V2_OPT_LBFGS, + }; + + // linesearch methods + enum ggml_v2_linesearch { + GGML_V2_LINESEARCH_DEFAULT = 1, + + GGML_V2_LINESEARCH_BACKTRACKING_ARMIJO = 0, + GGML_V2_LINESEARCH_BACKTRACKING_WOLFE = 1, + GGML_V2_LINESEARCH_BACKTRACKING_STRONG_WOLFE = 2, + }; + + // optimization return values + enum ggml_v2_opt_result { + GGML_V2_OPT_OK = 0, + GGML_V2_OPT_DID_NOT_CONVERGE, + GGML_V2_OPT_NO_CONTEXT, + GGML_V2_OPT_INVALID_WOLFE, + GGML_V2_OPT_FAIL, + + GGML_V2_LINESEARCH_FAIL = -128, + GGML_V2_LINESEARCH_MINIMUM_STEP, + GGML_V2_LINESEARCH_MAXIMUM_STEP, + GGML_V2_LINESEARCH_MAXIMUM_ITERATIONS, + GGML_V2_LINESEARCH_INVALID_PARAMETERS, + }; + + // optimization parameters + // + // see ggml.c (ggml_v2_opt_default_params) for default values + // + struct ggml_v2_opt_params { + enum ggml_v2_opt_type type; + + int n_threads; + + // delta-based convergence test + // + // if past == 0 - disabled + // if past > 0: + // stop if |f(x) - f(x_past)| < delta * max(1, |f(x)|) + // + int past; + float delta; + + // maximum number of iterations without improvement + // + // if 0 - disabled + // if > 0: + // assume convergence if no 
cost improvement in this number of iterations + // + int max_no_improvement; + + bool print_forward_graph; + bool print_backward_graph; + + // ADAM parameters + struct { + int n_iter; + + float alpha; // learning rate + float beta1; + float beta2; + float eps; // epsilon for numerical stability + float eps_f; // epsilon for convergence test + float eps_g; // epsilon for convergence test + } adam; + + // LBFGS parameters + struct { + int m; // number of corrections to approximate the inv. Hessian + int n_iter; + int max_linesearch; + + float eps; // convergence tolerance + float ftol; // line search tolerance + float wolfe; + float min_step; + float max_step; + + enum ggml_v2_linesearch linesearch; + } lbfgs; + }; + + GGML_V2_API struct ggml_v2_opt_params ggml_v2_opt_default_params(enum ggml_v2_opt_type type); + + // optimize the function defined by the tensor f + GGML_V2_API enum ggml_v2_opt_result ggml_v2_opt( + struct ggml_v2_context * ctx, + struct ggml_v2_opt_params params, + struct ggml_v2_tensor * f); + + // + // quantization + // + + GGML_V2_API size_t ggml_v2_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist); + GGML_V2_API size_t ggml_v2_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist); + GGML_V2_API size_t ggml_v2_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist); + GGML_V2_API size_t ggml_v2_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist); + GGML_V2_API size_t ggml_v2_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist); + + GGML_V2_API size_t ggml_v2_quantize_q4_0_v2(const float * src, void * dst, int n, int k, int64_t * hist); + GGML_V2_API size_t ggml_v2_quantize_q4_1_v2(const float * src, void * dst, int n, int k, int64_t * hist); + GGML_V2_API size_t ggml_v2_quantize_q4_2_v2(const float * src, void * dst, int n, int k, int64_t * hist); + GGML_V2_API size_t ggml_v2_quantize_q4_3_v2(const float * src, void * dst, int n, int k, int64_t * hist); + GGML_V2_API size_t ggml_v2_quantize_q5_0_v2(const float * src, void * dst, int n, int k, int64_t * hist); + GGML_V2_API size_t ggml_v2_quantize_q5_1_v2(const float * src, void * dst, int n, int k, int64_t * hist); + GGML_V2_API size_t ggml_v2_quantize_q8_0_v2(const float * src, void * dst, int n, int k, int64_t * hist); + + GGML_V2_API size_t ggml_v2_quantize_chunk(enum ggml_v2_type type, const float * src, void * dst, int start, int n, int64_t * hist); + GGML_V2_API size_t ggml_v2_quantize_chunk_v2(enum ggml_v2_type type, const float * src, void * dst, int start, int n, int64_t * hist); + // + // system info + // + + void SetQuantsUnshuffled(bool unshuffled); + bool GetQuantsUnshuffled(); + + GGML_V2_API int ggml_v2_cpu_has_avx (void); + GGML_V2_API int ggml_v2_cpu_has_avx2 (void); + GGML_V2_API int ggml_v2_cpu_has_avx512 (void); + GGML_V2_API int ggml_v2_cpu_has_avx512_vbmi(void); + GGML_V2_API int ggml_v2_cpu_has_avx512_vnni(void); + GGML_V2_API int ggml_v2_cpu_has_fma (void); + GGML_V2_API int ggml_v2_cpu_has_neon (void); + GGML_V2_API int ggml_v2_cpu_has_arm_fma (void); + GGML_V2_API int ggml_v2_cpu_has_f16c (void); + GGML_V2_API int ggml_v2_cpu_has_fp16_va (void); + GGML_V2_API int ggml_v2_cpu_has_wasm_simd (void); + GGML_V2_API int ggml_v2_cpu_has_blas (void); + GGML_V2_API int ggml_v2_cpu_has_cublas (void); + GGML_V2_API int ggml_v2_cpu_has_clblast (void); + GGML_V2_API int ggml_v2_cpu_has_gpublas (void); + GGML_V2_API int ggml_v2_cpu_has_sse3 (void); + GGML_V2_API int ggml_v2_cpu_has_vsx (void); + + // + 
// Internal types and functions exposed for tests and benchmarks + // + +#ifdef __cplusplus + // restrict not standard in C++ +#define GGML_V2_RESTRICT +#else +#define GGML_V2_RESTRICT restrict +#endif + typedef void (*dequantize_row_q_t)(const void * GGML_V2_RESTRICT x, float * GGML_V2_RESTRICT y, int k); + typedef void (*quantize_row_q_t) (const float * GGML_V2_RESTRICT x, void * GGML_V2_RESTRICT y, int k); + typedef void (*vec_dot_q_t) (const int n, float * GGML_V2_RESTRICT s, const void * GGML_V2_RESTRICT x, const void * GGML_V2_RESTRICT y); + + typedef struct { + dequantize_row_q_t dequantize_row_q; + quantize_row_q_t quantize_row_q; + quantize_row_q_t quantize_row_q_reference; + quantize_row_q_t quantize_row_q_dot; + vec_dot_q_t vec_dot_q; + enum ggml_v2_type vec_dot_type; + } quantize_fns_t; + + quantize_fns_t ggml_v2_internal_get_quantize_fn(size_t i); + +#ifdef __cplusplus +} +#endif diff --git a/otherarch/gpt2_v2.cpp b/otherarch/gpt2_v2.cpp index 5e384cc44..dd356b39d 100644 --- a/otherarch/gpt2_v2.cpp +++ b/otherarch/gpt2_v2.cpp @@ -1,4 +1,4 @@ -#include "ggml.h" +#include "ggml_v2.h" #include "otherarch.h" #include "utils.h" @@ -16,7 +16,7 @@ #include "model_adapter.h" #if defined(GGML_USE_CLBLAST) -#include "ggml-opencl.h" +#include "ggml_v2-opencl.h" #endif // load the model's weights from a file @@ -50,7 +50,7 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer)); fin.read((char *) &hparams.ftype, sizeof(hparams.ftype)); - const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR; + const int32_t qntvr = hparams.ftype / GGML_V2_QNT_VERSION_FACTOR; printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); @@ -60,7 +60,7 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g printf("%s: ftype = %d\n", __func__, hparams.ftype); printf("%s: qntvr = %d\n", __func__, qntvr); - hparams.ftype %= GGML_QNT_VERSION_FACTOR; + hparams.ftype %= GGML_V2_QNT_VERSION_FACTOR; } // load vocab @@ -87,12 +87,12 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g } } - auto memory_type = GGML_TYPE_F16; + auto memory_type = GGML_V2_TYPE_F16; // for the big tensors, we have the option to store the data in 16-bit floats or quantized // in order to save memory and also to speed up the computation - ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype)); - if (wtype == GGML_TYPE_COUNT) { + ggml_v2_type wtype = ggml_v2_ftype_to_ggml_v2_type((ggml_v2_ftype) (model.hparams.ftype)); + if (wtype == GGML_V2_TYPE_COUNT) { fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n", __func__, fname.c_str(), model.hparams.ftype); return ModelLoadResult::FAIL; @@ -110,51 +110,51 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g const int n_ctx = hparams.n_ctx; const int n_vocab = hparams.n_vocab; - ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g - ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b + ctx_size += n_embd*ggml_v2_type_sizef(GGML_V2_TYPE_F32); // ln_f_g + ctx_size += n_embd*ggml_v2_type_sizef(GGML_V2_TYPE_F32); // ln_f_b - ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // wte - ctx_size += n_ctx*n_embd*ggml_type_sizef(GGML_TYPE_F32); // wpe - ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // lm_head + ctx_size += n_vocab*n_embd*ggml_v2_type_sizef(wtype); // wte + ctx_size += 
n_ctx*n_embd*ggml_v2_type_sizef(GGML_V2_TYPE_F32); // wpe + ctx_size += n_vocab*n_embd*ggml_v2_type_sizef(wtype); // lm_head - ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g - ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b + ctx_size += n_layer*(n_embd*ggml_v2_type_sizef(GGML_V2_TYPE_F32)); // ln_1_g + ctx_size += n_layer*(n_embd*ggml_v2_type_sizef(GGML_V2_TYPE_F32)); // ln_1_b - ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g - ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b + ctx_size += n_layer*(n_embd*ggml_v2_type_sizef(GGML_V2_TYPE_F32)); // ln_2_g + ctx_size += n_layer*(n_embd*ggml_v2_type_sizef(GGML_V2_TYPE_F32)); // ln_2_b - ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_attn_w - ctx_size += n_layer*( 3*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b + ctx_size += n_layer*(3*n_embd*n_embd*ggml_v2_type_sizef(wtype)); // c_attn_attn_w + ctx_size += n_layer*( 3*n_embd*ggml_v2_type_sizef(GGML_V2_TYPE_F32)); // c_attn_attn_b - ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w - ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_proj_b + ctx_size += n_layer*(n_embd*n_embd*ggml_v2_type_sizef(wtype)); // c_attn_proj_w + ctx_size += n_layer*( n_embd*ggml_v2_type_sizef(GGML_V2_TYPE_F32)); // c_attn_proj_b - ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w - ctx_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b + ctx_size += n_layer*(4*n_embd*n_embd*ggml_v2_type_sizef(wtype)); // c_mlp_fc_w + ctx_size += n_layer*( 4*n_embd*ggml_v2_type_sizef(GGML_V2_TYPE_F32)); // c_mlp_fc_b - ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w - ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b + ctx_size += n_layer*(4*n_embd*n_embd*ggml_v2_type_sizef(wtype)); // c_mlp_proj_w + ctx_size += n_layer*( n_embd*ggml_v2_type_sizef(GGML_V2_TYPE_F32)); // c_mlp_proj_b - ctx_size += 1.5*(n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // memory_k - ctx_size += 1.5*(n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // memory_v + ctx_size += 1.5*(n_ctx*n_layer*n_embd*ggml_v2_type_sizef(GGML_V2_TYPE_F32)); // memory_k + ctx_size += 1.5*(n_ctx*n_layer*n_embd*ggml_v2_type_sizef(GGML_V2_TYPE_F32)); // memory_v ctx_size += (6 + 12*n_layer)*512; // object overhead - printf("%s: ggml tensor size = %d bytes\n", __func__, (int) sizeof(ggml_tensor)); + printf("%s: ggml tensor size = %d bytes\n", __func__, (int) sizeof(ggml_v2_tensor)); printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0)); } // create the ggml context { - struct ggml_init_params params; + struct ggml_v2_init_params params; params.mem_size = ctx_size; params.mem_buffer = NULL; params.no_alloc = false; - model.ctx = ggml_init(params); + model.ctx = ggml_v2_init(params); if (!model.ctx) { - fprintf(stderr, "%s: ggml_init() failed\n", __func__); + fprintf(stderr, "%s: ggml_v2_init() failed\n", __func__); return ModelLoadResult::FAIL; } } @@ -170,12 +170,12 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g model.layers.resize(n_layer); - model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + model.ln_f_g = ggml_v2_new_tensor_1d(ctx, GGML_V2_TYPE_F32, n_embd); + model.ln_f_b = ggml_v2_new_tensor_1d(ctx, GGML_V2_TYPE_F32, n_embd); - model.wte = 
ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); - model.wpe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ctx); - model.lm_head = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); + model.wte = ggml_v2_new_tensor_2d(ctx, wtype, n_embd, n_vocab); + model.wpe = ggml_v2_new_tensor_2d(ctx, GGML_V2_TYPE_F32, n_embd, n_ctx); + model.lm_head = ggml_v2_new_tensor_2d(ctx, wtype, n_embd, n_vocab); // map by name model.tensors["model/ln_f/g"] = model.ln_f_g; @@ -188,23 +188,23 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g for (int i = 0; i < n_layer; ++i) { auto & layer = model.layers[i]; - layer.ln_1_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + layer.ln_1_g = ggml_v2_new_tensor_1d(ctx, GGML_V2_TYPE_F32, n_embd); + layer.ln_1_b = ggml_v2_new_tensor_1d(ctx, GGML_V2_TYPE_F32, n_embd); - layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + layer.ln_2_g = ggml_v2_new_tensor_1d(ctx, GGML_V2_TYPE_F32, n_embd); + layer.ln_2_b = ggml_v2_new_tensor_1d(ctx, GGML_V2_TYPE_F32, n_embd); - layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 3*n_embd); - layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd); + layer.c_attn_attn_w = ggml_v2_new_tensor_2d(ctx, wtype, n_embd, 3*n_embd); + layer.c_attn_attn_b = ggml_v2_new_tensor_1d(ctx, GGML_V2_TYPE_F32, 3*n_embd); - layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); - layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + layer.c_attn_proj_w = ggml_v2_new_tensor_2d(ctx, wtype, n_embd, n_embd); + layer.c_attn_proj_b = ggml_v2_new_tensor_1d(ctx, GGML_V2_TYPE_F32, n_embd); - layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd); - layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd); + layer.c_mlp_fc_w = ggml_v2_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd); + layer.c_mlp_fc_b = ggml_v2_new_tensor_1d(ctx, GGML_V2_TYPE_F32, 4*n_embd); - layer.c_mlp_proj_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd); - layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + layer.c_mlp_proj_w = ggml_v2_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd); + layer.c_mlp_proj_b = ggml_v2_new_tensor_1d(ctx, GGML_V2_TYPE_F32, n_embd); // map by name model.tensors["model/h" + std::to_string(i) + "/ln_1/g"] = layer.ln_1_g; @@ -238,10 +238,10 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g const int n_mem = n_layer*n_ctx; const int n_elements = n_embd*n_mem; - model.memory_k = ggml_new_tensor_1d(ctx, memory_type, n_elements*1.5); - model.memory_v = ggml_new_tensor_1d(ctx, memory_type, n_elements*1.5); + model.memory_k = ggml_v2_new_tensor_1d(ctx, memory_type, n_elements*1.5); + model.memory_v = ggml_v2_new_tensor_1d(ctx, memory_type, n_elements*1.5); - const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v); + const size_t memory_size = ggml_v2_nbytes(model.memory_k) + ggml_v2_nbytes(model.memory_v); printf("%s: memory size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem); } @@ -281,7 +281,7 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g } auto tensor = model.tensors[name.data()]; - if (ggml_nelements(tensor) != nelements) { + if (ggml_v2_nelements(tensor) != nelements) { fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data()); return 
ModelLoadResult::FAIL; } @@ -294,29 +294,29 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g // for debugging if (0) { - printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor)); + printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ggml_v2_type_name(ggml_v2_type(ttype)), ggml_v2_nbytes(tensor)/1024.0/1024.0, ggml_v2_nbytes(tensor)); } - const size_t bpe = ggml_type_size(ggml_type(ttype)); + const size_t bpe = ggml_v2_type_size(ggml_v2_type(ttype)); - if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) { + if ((nelements*bpe)/ggml_v2_blck_size(tensor->type) != ggml_v2_nbytes(tensor)) { fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n", - __func__, name.data(), ggml_nbytes(tensor), nelements*bpe); + __func__, name.data(), ggml_v2_nbytes(tensor), nelements*bpe); return ModelLoadResult::FAIL; } - fin.read(reinterpret_cast(tensor->data), ggml_nbytes(tensor)); + fin.read(reinterpret_cast(tensor->data), ggml_v2_nbytes(tensor)); // GPT-2 models share the WTE tensor as the LM head if (name == "model/wte" && has_lm_head == false) { - memcpy(model.lm_head->data, tensor->data, ggml_nbytes(tensor)); + memcpy(model.lm_head->data, tensor->data, ggml_v2_nbytes(tensor)); } if (name == "model/lm_head") { has_lm_head = true; } - total_size += ggml_nbytes(tensor); + total_size += ggml_v2_nbytes(tensor); } printf("%s: model size = %8.2f MB\n", __func__, total_size/1024.0/1024.0); @@ -341,18 +341,18 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g // for (int i = 0; i < n_gpu; ++i) { // const auto & layer = model.layers[i]; -// ggml_cl_transform_tensor(layer.ln_1_g); vram_total += ggml_nbytes(layer.ln_1_g); -// ggml_cl_transform_tensor(layer.ln_1_b); vram_total += ggml_nbytes(layer.ln_1_b); -// ggml_cl_transform_tensor(layer.ln_2_g); vram_total += ggml_nbytes(layer.ln_2_g); -// ggml_cl_transform_tensor(layer.ln_2_b); vram_total += ggml_nbytes(layer.ln_2_b); -// ggml_cl_transform_tensor(layer.c_attn_attn_w); vram_total += ggml_nbytes(layer.c_attn_attn_w); -// ggml_cl_transform_tensor(layer.c_attn_attn_b); vram_total += ggml_nbytes(layer.c_attn_attn_b); -// ggml_cl_transform_tensor(layer.c_attn_proj_w); vram_total += ggml_nbytes(layer.c_attn_proj_w); -// ggml_cl_transform_tensor(layer.c_attn_proj_b); vram_total += ggml_nbytes(layer.c_attn_proj_b); -// ggml_cl_transform_tensor(layer.c_mlp_fc_w); vram_total += ggml_nbytes(layer.c_mlp_fc_w); -// ggml_cl_transform_tensor(layer.c_mlp_fc_b); vram_total += ggml_nbytes(layer.c_mlp_fc_b); -// ggml_cl_transform_tensor(layer.c_mlp_proj_w); vram_total += ggml_nbytes(layer.c_mlp_proj_w); -// ggml_cl_transform_tensor(layer.c_mlp_proj_b); vram_total += ggml_nbytes(layer.c_mlp_proj_b); +// ggml_v2_cl_transform_tensor(layer.ln_1_g); vram_total += ggml_v2_nbytes(layer.ln_1_g); +// ggml_v2_cl_transform_tensor(layer.ln_1_b); vram_total += ggml_v2_nbytes(layer.ln_1_b); +// ggml_v2_cl_transform_tensor(layer.ln_2_g); vram_total += ggml_v2_nbytes(layer.ln_2_g); +// ggml_v2_cl_transform_tensor(layer.ln_2_b); vram_total += ggml_v2_nbytes(layer.ln_2_b); +// ggml_v2_cl_transform_tensor(layer.c_attn_attn_w); vram_total += ggml_v2_nbytes(layer.c_attn_attn_w); +// ggml_v2_cl_transform_tensor(layer.c_attn_attn_b); vram_total += ggml_v2_nbytes(layer.c_attn_attn_b); +// 
ggml_v2_cl_transform_tensor(layer.c_attn_proj_w); vram_total += ggml_v2_nbytes(layer.c_attn_proj_w); +// ggml_v2_cl_transform_tensor(layer.c_attn_proj_b); vram_total += ggml_v2_nbytes(layer.c_attn_proj_b); +// ggml_v2_cl_transform_tensor(layer.c_mlp_fc_w); vram_total += ggml_v2_nbytes(layer.c_mlp_fc_w); +// ggml_v2_cl_transform_tensor(layer.c_mlp_fc_b); vram_total += ggml_v2_nbytes(layer.c_mlp_fc_b); +// ggml_v2_cl_transform_tensor(layer.c_mlp_proj_w); vram_total += ggml_v2_nbytes(layer.c_mlp_proj_w); +// ggml_v2_cl_transform_tensor(layer.c_mlp_proj_b); vram_total += ggml_v2_nbytes(layer.c_mlp_proj_b); // } // fprintf(stderr, "%s: [opencl] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024); @@ -417,45 +417,45 @@ bool gpt2_eval( } } - struct ggml_init_params params; + struct ggml_v2_init_params params; params.mem_size = buf_size; params.mem_buffer = buf; params.no_alloc = false; - struct ggml_context * ctx0 = ggml_init(params); - struct ggml_cgraph gf = {}; + struct ggml_v2_context * ctx0 = ggml_v2_init(params); + struct ggml_v2_cgraph gf = {}; gf.n_threads = n_threads; - struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); - memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd)); + struct ggml_v2_tensor * embd = ggml_v2_new_tensor_1d(ctx0, GGML_V2_TYPE_I32, N); + memcpy(embd->data, embd_inp.data(), N*ggml_v2_element_size(embd)); - struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + struct ggml_v2_tensor * position = ggml_v2_new_tensor_1d(ctx0, GGML_V2_TYPE_I32, N); for (int i = 0; i < N; ++i) { ((int32_t *) position->data)[i] = n_past + i; } // wte + wpe - struct ggml_tensor * inpL = - ggml_add(ctx0, - ggml_get_rows(ctx0, model.wte, embd), - ggml_get_rows(ctx0, model.wpe, position)); + struct ggml_v2_tensor * inpL = + ggml_v2_add(ctx0, + ggml_v2_get_rows(ctx0, model.wte, embd), + ggml_v2_get_rows(ctx0, model.wpe, position)); for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * cur; + struct ggml_v2_tensor * cur; // norm { // [ 768, N] - cur = ggml_norm(ctx0, inpL); + cur = ggml_v2_norm(ctx0, inpL); // cur = ln_1_g*cur + ln_1_b // [ 768, N] - cur = ggml_add(ctx0, - ggml_mul(ctx0, - ggml_repeat(ctx0, model.layers[il].ln_1_g, cur), + cur = ggml_v2_add(ctx0, + ggml_v2_mul(ctx0, + ggml_v2_repeat(ctx0, model.layers[il].ln_1_g, cur), cur), - ggml_repeat(ctx0, model.layers[il].ln_1_b, cur)); + ggml_v2_repeat(ctx0, model.layers[il].ln_1_b, cur)); } // attn @@ -467,104 +467,104 @@ bool gpt2_eval( // cur = attn_w*cur + attn_b // [2304, N] { - cur = ggml_mul_mat(ctx0, + cur = ggml_v2_mul_mat(ctx0, model.layers[il].c_attn_attn_w, cur); - cur = ggml_add(ctx0, - ggml_repeat(ctx0, model.layers[il].c_attn_attn_b, cur), + cur = ggml_v2_add(ctx0, + ggml_v2_repeat(ctx0, model.layers[il].c_attn_attn_b, cur), cur); } // self-attention { - struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd); - struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd); - struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd); + struct ggml_v2_tensor * Qcur = ggml_v2_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd); + struct ggml_v2_tensor * Kcur = ggml_v2_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd); + struct ggml_v2_tensor * Vcur = ggml_v2_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd); // store key and value to memory if (N >= 1) { - struct ggml_tensor * k = 
ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past)); - struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past)); + struct ggml_v2_tensor * k = ggml_v2_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_v2_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past)); + struct ggml_v2_tensor * v = ggml_v2_view_1d(ctx0, model.memory_v, N*n_embd, (ggml_v2_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past)); - ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k)); - ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v)); + ggml_v2_build_forward_expand(&gf, ggml_v2_cpy(ctx0, Kcur, k)); + ggml_v2_build_forward_expand(&gf, ggml_v2_cpy(ctx0, Vcur, v)); } // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3) // [64, N, 12] - struct ggml_tensor * Q = - ggml_permute(ctx0, - ggml_cpy(ctx0, + struct ggml_v2_tensor * Q = + ggml_v2_permute(ctx0, + ggml_v2_cpy(ctx0, Qcur, - ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)), + ggml_v2_new_tensor_3d(ctx0, GGML_V2_TYPE_F32, n_embd/n_head, n_head, N)), 0, 2, 1, 3); // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3) // [64, n_past + N, 12] - struct ggml_tensor * K = - ggml_permute(ctx0, - ggml_reshape_3d(ctx0, - ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd), + struct ggml_v2_tensor * K = + ggml_v2_permute(ctx0, + ggml_v2_reshape_3d(ctx0, + ggml_v2_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_v2_element_size(model.memory_k)*n_embd), n_embd/n_head, n_head, n_past + N), 0, 2, 1, 3); // GG: flash attention - //struct ggml_tensor * V = - // ggml_cpy(ctx0, - // ggml_permute(ctx0, - // ggml_reshape_3d(ctx0, - // ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd), + //struct ggml_v2_tensor * V = + // ggml_v2_cpy(ctx0, + // ggml_v2_permute(ctx0, + // ggml_v2_reshape_3d(ctx0, + // ggml_v2_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_v2_element_size(model.memory_v)*n_embd), // n_embd/n_head, n_head, n_past + N), // 1, 2, 0, 3), - // ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_past + N, n_embd/n_head, n_head)); + // ggml_v2_new_tensor_3d(ctx0, GGML_V2_TYPE_F32, n_past + N, n_embd/n_head, n_head)); - //struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, true); + //struct ggml_v2_tensor * KQV = ggml_v2_flash_attn(ctx0, Q, K, V, true); // K * Q // [n_past + N, N, 12] - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + struct ggml_v2_tensor * KQ = ggml_v2_mul_mat(ctx0, K, Q); // KQ_scaled = KQ / sqrt(n_embd/n_head) // [n_past + N, N, 12] - struct ggml_tensor * KQ_scaled = - ggml_scale_inplace(ctx0, + struct ggml_v2_tensor * KQ_scaled = + ggml_v2_scale_inplace(ctx0, KQ, - ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head)) + ggml_v2_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head)) ); // KQ_masked = mask_past(KQ_scaled) // [n_past + N, N, 12] - struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); + struct ggml_v2_tensor * KQ_masked = ggml_v2_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); // KQ = soft_max(KQ_masked) // [n_past + N, N, 12] - struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); + struct ggml_v2_tensor * KQ_soft_max = ggml_v2_soft_max_inplace(ctx0, KQ_masked); // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous() // [n_past 
+ N, 64, 12] - struct ggml_tensor * V_trans = - ggml_cpy(ctx0, - ggml_permute(ctx0, - ggml_reshape_3d(ctx0, - ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd), + struct ggml_v2_tensor * V_trans = + ggml_v2_cpy(ctx0, + ggml_v2_permute(ctx0, + ggml_v2_reshape_3d(ctx0, + ggml_v2_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_v2_element_size(model.memory_v)*n_embd), n_embd/n_head, n_head, n_past + N), 1, 2, 0, 3), - ggml_new_tensor_3d(ctx0, model.memory_v->type, n_past + N, n_embd/n_head, n_head)); + ggml_v2_new_tensor_3d(ctx0, model.memory_v->type, n_past + N, n_embd/n_head, n_head)); // KQV = transpose(V) * KQ_soft_max // [64, N, 12] - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max); + struct ggml_v2_tensor * KQV = ggml_v2_mul_mat(ctx0, V_trans, KQ_soft_max); // KQV_merged = KQV.permute(0, 2, 1, 3) // [64, 12, N] - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + struct ggml_v2_tensor * KQV_merged = ggml_v2_permute(ctx0, KQV, 0, 2, 1, 3); // cur = KQV_merged.contiguous().view(n_embd, N) // [768, N] - cur = ggml_cpy(ctx0, + cur = ggml_v2_cpy(ctx0, KQV_merged, - ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); + ggml_v2_new_tensor_2d(ctx0, GGML_V2_TYPE_F32, n_embd, N)); } // projection @@ -576,33 +576,33 @@ bool gpt2_eval( // cur = proj_w*cur + proj_b // [768, N] { - cur = ggml_mul_mat(ctx0, + cur = ggml_v2_mul_mat(ctx0, model.layers[il].c_attn_proj_w, cur); - cur = ggml_add(ctx0, - ggml_repeat(ctx0, model.layers[il].c_attn_proj_b, cur), + cur = ggml_v2_add(ctx0, + ggml_v2_repeat(ctx0, model.layers[il].c_attn_proj_b, cur), cur); } // add the input - cur = ggml_add(ctx0, cur, inpL); + cur = ggml_v2_add(ctx0, cur, inpL); - struct ggml_tensor * inpFF = cur; + struct ggml_v2_tensor * inpFF = cur; // feed-forward network { // norm { - cur = ggml_norm(ctx0, inpFF); + cur = ggml_v2_norm(ctx0, inpFF); // cur = ln_2_g*cur + ln_2_b // [ 768, N] - cur = ggml_add(ctx0, - ggml_mul(ctx0, - ggml_repeat(ctx0, model.layers[il].ln_2_g, cur), + cur = ggml_v2_add(ctx0, + ggml_v2_mul(ctx0, + ggml_v2_repeat(ctx0, model.layers[il].ln_2_g, cur), cur), - ggml_repeat(ctx0, model.layers[il].ln_2_b, cur)); + ggml_v2_repeat(ctx0, model.layers[il].ln_2_b, cur)); } // fully connected @@ -613,17 +613,17 @@ bool gpt2_eval( // // cur = fc_w*cur + fc_b // [3072, N] - cur = ggml_mul_mat(ctx0, + cur = ggml_v2_mul_mat(ctx0, model.layers[il].c_mlp_fc_w, cur); - cur = ggml_add(ctx0, - ggml_repeat(ctx0, model.layers[il].c_mlp_fc_b, cur), + cur = ggml_v2_add(ctx0, + ggml_v2_repeat(ctx0, model.layers[il].c_mlp_fc_b, cur), cur); // GELU activation // [3072, N] - cur = ggml_gelu(ctx0, cur); + cur = ggml_v2_gelu(ctx0, cur); // projection // [ 768, 3072] - model.layers[il].c_mlp_proj_w @@ -633,63 +633,63 @@ bool gpt2_eval( // // cur = proj_w*cur + proj_b // [768, N] - cur = ggml_mul_mat(ctx0, + cur = ggml_v2_mul_mat(ctx0, model.layers[il].c_mlp_proj_w, cur); - cur = ggml_add(ctx0, - ggml_repeat(ctx0, model.layers[il].c_mlp_proj_b, cur), + cur = ggml_v2_add(ctx0, + ggml_v2_repeat(ctx0, model.layers[il].c_mlp_proj_b, cur), cur); } // input for next layer - inpL = ggml_add(ctx0, cur, inpFF); + inpL = ggml_v2_add(ctx0, cur, inpFF); } // norm { // [ 768, N] - inpL = ggml_norm(ctx0, inpL); + inpL = ggml_v2_norm(ctx0, inpL); // inpL = ln_f_g*inpL + ln_f_b // [ 768, N] - inpL = ggml_add(ctx0, - ggml_mul(ctx0, - ggml_repeat(ctx0, model.ln_f_g, inpL), + inpL = ggml_v2_add(ctx0, + ggml_v2_mul(ctx0, + ggml_v2_repeat(ctx0, 
model.ln_f_g, inpL), inpL), - ggml_repeat(ctx0, model.ln_f_b, inpL)); + ggml_v2_repeat(ctx0, model.ln_f_b, inpL)); } // inpL = WTE * inpL // [ 768, 50257] - model.lm_head // [ 768, N] - inpL - inpL = ggml_mul_mat(ctx0, model.lm_head, inpL); + inpL = ggml_v2_mul_mat(ctx0, model.lm_head, inpL); // logits -> probs - //inpL = ggml_soft_max_inplace(ctx0, inpL); + //inpL = ggml_v2_soft_max_inplace(ctx0, inpL); // run the computation - ggml_build_forward_expand(&gf, inpL); - ggml_graph_compute (ctx0, &gf); + ggml_v2_build_forward_expand(&gf, inpL); + ggml_v2_graph_compute (ctx0, &gf); //if (n_past%100 == 0) { - // ggml_graph_print (&gf); - // ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot"); + // ggml_v2_graph_print (&gf); + // ggml_v2_graph_dump_dot(&gf, NULL, "gpt-2.dot"); //} //embd_w.resize(n_vocab*N); - //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N); + //memcpy(embd_w.data(), ggml_v2_get_data(inpL), sizeof(float)*n_vocab*N); // return result just for the last token embd_w.resize(n_vocab); - memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab); + memcpy(embd_w.data(), (float *) ggml_v2_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab); if (mem_per_token == 0) { - mem_per_token = ggml_used_mem(ctx0)/N; + mem_per_token = ggml_v2_used_mem(ctx0)/N; } - //printf("used_mem = %zu\n", ggml_used_mem(ctx0)); + //printf("used_mem = %zu\n", ggml_v2_used_mem(ctx0)); - ggml_free(ctx0); + ggml_v2_free(ctx0); return true; } \ No newline at end of file diff --git a/otherarch/gptj_v2.cpp b/otherarch/gptj_v2.cpp index b480d6f36..0b678df65 100644 --- a/otherarch/gptj_v2.cpp +++ b/otherarch/gptj_v2.cpp @@ -1,4 +1,4 @@ -#include "ggml.h" +#include "ggml_v2.h" #include "otherarch.h" #include "utils.h" @@ -49,7 +49,7 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g fin.read((char *) &hparams.n_rot, sizeof(hparams.n_rot)); fin.read((char *) &hparams.ftype, sizeof(hparams.ftype)); - const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR; + const int32_t qntvr = hparams.ftype / GGML_V2_QNT_VERSION_FACTOR; printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); @@ -60,7 +60,7 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g printf("%s: ftype = %d\n", __func__, hparams.ftype); printf("%s: qntvr = %d\n", __func__, qntvr); - hparams.ftype %= GGML_QNT_VERSION_FACTOR; + hparams.ftype %= GGML_V2_QNT_VERSION_FACTOR; } // load vocab @@ -89,8 +89,8 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g // for the big tensors, we have the option to store the data in 16-bit floats or quantized // in order to save memory and also to speed up the computation - ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype)); - if (wtype == GGML_TYPE_COUNT) { + ggml_v2_type wtype = ggml_v2_ftype_to_ggml_v2_type((ggml_v2_ftype) (model.hparams.ftype)); + if (wtype == GGML_V2_TYPE_COUNT) { fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n", __func__, fname.c_str(), model.hparams.ftype); return ModelLoadResult::FAIL; @@ -98,7 +98,7 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g auto & ctx = model.ctx; - auto memory_type = GGML_TYPE_F16; + auto memory_type = GGML_V2_TYPE_F16; size_t ctx_size = 0; @@ -110,31 +110,31 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g const int n_ctx = hparams.n_ctx; const int n_vocab = 
hparams.n_vocab; - ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g - ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b + ctx_size += n_embd*ggml_v2_type_sizef(GGML_V2_TYPE_F32); // ln_f_g + ctx_size += n_embd*ggml_v2_type_sizef(GGML_V2_TYPE_F32); // ln_f_b - ctx_size += n_embd*n_vocab*ggml_type_sizef(wtype); // wte + ctx_size += n_embd*n_vocab*ggml_v2_type_sizef(wtype); // wte - ctx_size += n_embd*n_vocab*ggml_type_sizef(wtype); // lmh_g - ctx_size += n_vocab*ggml_type_sizef(GGML_TYPE_F32); // lmh_b + ctx_size += n_embd*n_vocab*ggml_v2_type_sizef(wtype); // lmh_g + ctx_size += n_vocab*ggml_v2_type_sizef(GGML_V2_TYPE_F32); // lmh_b - ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g - ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b + ctx_size += n_layer*(n_embd*ggml_v2_type_sizef(GGML_V2_TYPE_F32)); // ln_1_g + ctx_size += n_layer*(n_embd*ggml_v2_type_sizef(GGML_V2_TYPE_F32)); // ln_1_b - ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_q_proj_w - ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_k_proj_w - ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_v_proj_w + ctx_size += n_layer*(n_embd*n_embd*ggml_v2_type_sizef(wtype)); // c_attn_q_proj_w + ctx_size += n_layer*(n_embd*n_embd*ggml_v2_type_sizef(wtype)); // c_attn_k_proj_w + ctx_size += n_layer*(n_embd*n_embd*ggml_v2_type_sizef(wtype)); // c_attn_v_proj_w - ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w + ctx_size += n_layer*(n_embd*n_embd*ggml_v2_type_sizef(wtype)); // c_attn_proj_w - ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w - ctx_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b + ctx_size += n_layer*(4*n_embd*n_embd*ggml_v2_type_sizef(wtype)); // c_mlp_fc_w + ctx_size += n_layer*( 4*n_embd*ggml_v2_type_sizef(GGML_V2_TYPE_F32)); // c_mlp_fc_b - ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w - ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b + ctx_size += n_layer*(4*n_embd*n_embd*ggml_v2_type_sizef(wtype)); // c_mlp_proj_w + ctx_size += n_layer*( n_embd*ggml_v2_type_sizef(GGML_V2_TYPE_F32)); // c_mlp_proj_b - ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(memory_type); // memory_k - ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(memory_type); // memory_v + ctx_size += n_ctx*n_layer*n_embd*ggml_v2_type_sizef(memory_type); // memory_k + ctx_size += n_ctx*n_layer*n_embd*ggml_v2_type_sizef(memory_type); // memory_v ctx_size += (5 + 10*n_layer)*512; // object overhead @@ -143,15 +143,15 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g // create the ggml context { - struct ggml_init_params params; + struct ggml_v2_init_params params; params.mem_size = ctx_size; params.mem_buffer = NULL; params.no_alloc = false; - model.ctx = ggml_init(params); + model.ctx = ggml_v2_init(params); if (!model.ctx) { - fprintf(stderr, "%s: ggml_init() failed\n", __func__); + fprintf(stderr, "%s: ggml_v2_init() failed\n", __func__); return ModelLoadResult::FAIL; } } @@ -166,13 +166,13 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g model.layers.resize(n_layer); - model.wte = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); + model.wte = ggml_v2_new_tensor_2d(ctx, wtype, n_embd, n_vocab); - model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 
n_embd); + model.ln_f_g = ggml_v2_new_tensor_1d(ctx, GGML_V2_TYPE_F32, n_embd); + model.ln_f_b = ggml_v2_new_tensor_1d(ctx, GGML_V2_TYPE_F32, n_embd); - model.lmh_g = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); - model.lmh_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_vocab); + model.lmh_g = ggml_v2_new_tensor_2d(ctx, wtype, n_embd, n_vocab); + model.lmh_b = ggml_v2_new_tensor_1d(ctx, GGML_V2_TYPE_F32, n_vocab); // map by name model.tensors["transformer.wte.weight"] = model.wte; @@ -186,20 +186,20 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g for (int i = 0; i < n_layer; ++i) { auto & layer = model.layers[i]; - layer.ln_1_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + layer.ln_1_g = ggml_v2_new_tensor_1d(ctx, GGML_V2_TYPE_F32, n_embd); + layer.ln_1_b = ggml_v2_new_tensor_1d(ctx, GGML_V2_TYPE_F32, n_embd); - layer.c_attn_q_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); - layer.c_attn_k_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); - layer.c_attn_v_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); + layer.c_attn_q_proj_w = ggml_v2_new_tensor_2d(ctx, wtype, n_embd, n_embd); + layer.c_attn_k_proj_w = ggml_v2_new_tensor_2d(ctx, wtype, n_embd, n_embd); + layer.c_attn_v_proj_w = ggml_v2_new_tensor_2d(ctx, wtype, n_embd, n_embd); - layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); + layer.c_attn_proj_w = ggml_v2_new_tensor_2d(ctx, wtype, n_embd, n_embd); - layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd); - layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd); + layer.c_mlp_fc_w = ggml_v2_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd); + layer.c_mlp_fc_b = ggml_v2_new_tensor_1d(ctx, GGML_V2_TYPE_F32, 4*n_embd); - layer.c_mlp_proj_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd); - layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + layer.c_mlp_proj_w = ggml_v2_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd); + layer.c_mlp_proj_b = ggml_v2_new_tensor_1d(ctx, GGML_V2_TYPE_F32, n_embd); // map by name model.tensors["transformer.h." + std::to_string(i) + ".ln_1.weight"] = layer.ln_1_g; @@ -230,10 +230,10 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g const int n_mem = n_layer*n_ctx; const int n_elements = n_embd*n_mem; - model.memory_k = ggml_new_tensor_1d(ctx, memory_type, n_elements); - model.memory_v = ggml_new_tensor_1d(ctx, memory_type, n_elements); + model.memory_k = ggml_v2_new_tensor_1d(ctx, memory_type, n_elements); + model.memory_v = ggml_v2_new_tensor_1d(ctx, memory_type, n_elements); - const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v); + const size_t memory_size = ggml_v2_nbytes(model.memory_k) + ggml_v2_nbytes(model.memory_v); printf("%s: memory_size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem); } @@ -274,7 +274,7 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g } auto tensor = model.tensors[name.data()]; - if (ggml_nelements(tensor) != nelements) { + if (ggml_v2_nelements(tensor) != nelements) { fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data()); return ModelLoadResult::FAIL; } @@ -286,7 +286,7 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g if(tensor->ne[0]==ne[1] && tensor->ne[1]==ne[0] && should_transpose_layer(name)) { printf("\nFound a transposed tensor. 
This could be an older or newer model. Retrying load..."); - ggml_free(ctx); + ggml_v2_free(ctx); return ModelLoadResult::RETRY_LOAD; } else @@ -300,21 +300,21 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g // for debugging if (0) { - printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor)); + printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ggml_v2_type_name(ggml_v2_type(ttype)), ggml_v2_nbytes(tensor)/1024.0/1024.0, ggml_v2_nbytes(tensor)); } - const size_t bpe = ggml_type_size(ggml_type(ttype)); + const size_t bpe = ggml_v2_type_size(ggml_v2_type(ttype)); - if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) { + if ((nelements*bpe)/ggml_v2_blck_size(tensor->type) != ggml_v2_nbytes(tensor)) { fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n", - __func__, name.data(), ggml_nbytes(tensor), nelements*bpe); + __func__, name.data(), ggml_v2_nbytes(tensor), nelements*bpe); return ModelLoadResult::FAIL; } - fin.read(reinterpret_cast(tensor->data), ggml_nbytes(tensor)); + fin.read(reinterpret_cast(tensor->data), ggml_v2_nbytes(tensor)); - //printf("%42s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ttype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0); - total_size += ggml_nbytes(tensor); + //printf("%42s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ttype == 0 ? "float" : "f16", ggml_v2_nbytes(tensor)/1024.0/1024.0); + total_size += ggml_v2_nbytes(tensor); if (++n_tensors % 8 == 0) { printf("."); fflush(stdout); @@ -344,16 +344,16 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g // for (int i = 0; i < n_gpu; ++i) { // const auto & layer = model.layers[i]; -// ggml_cl_transform_tensor(layer.ln_1_g); vram_total += ggml_nbytes(layer.ln_1_g); -// ggml_cl_transform_tensor(layer.ln_1_b); vram_total += ggml_nbytes(layer.ln_1_b); -// ggml_cl_transform_tensor(layer.c_attn_q_proj_w); vram_total += ggml_nbytes(layer.c_attn_q_proj_w); -// ggml_cl_transform_tensor(layer.c_attn_k_proj_w); vram_total += ggml_nbytes(layer.c_attn_k_proj_w); -// ggml_cl_transform_tensor(layer.c_attn_v_proj_w); vram_total += ggml_nbytes(layer.c_attn_v_proj_w); -// ggml_cl_transform_tensor(layer.c_attn_proj_w); vram_total += ggml_nbytes(layer.c_attn_proj_w); -// ggml_cl_transform_tensor(layer.c_mlp_fc_w); vram_total += ggml_nbytes(layer.c_mlp_fc_w); -// ggml_cl_transform_tensor(layer.c_mlp_fc_b); vram_total += ggml_nbytes(layer.c_mlp_fc_b); -// ggml_cl_transform_tensor(layer.c_mlp_proj_w); vram_total += ggml_nbytes(layer.c_mlp_proj_w); -// ggml_cl_transform_tensor(layer.c_mlp_proj_b); vram_total += ggml_nbytes(layer.c_mlp_proj_b); +// ggml_v2_cl_transform_tensor(layer.ln_1_g); vram_total += ggml_v2_nbytes(layer.ln_1_g); +// ggml_v2_cl_transform_tensor(layer.ln_1_b); vram_total += ggml_v2_nbytes(layer.ln_1_b); +// ggml_v2_cl_transform_tensor(layer.c_attn_q_proj_w); vram_total += ggml_v2_nbytes(layer.c_attn_q_proj_w); +// ggml_v2_cl_transform_tensor(layer.c_attn_k_proj_w); vram_total += ggml_v2_nbytes(layer.c_attn_k_proj_w); +// ggml_v2_cl_transform_tensor(layer.c_attn_v_proj_w); vram_total += ggml_v2_nbytes(layer.c_attn_v_proj_w); +// ggml_v2_cl_transform_tensor(layer.c_attn_proj_w); vram_total += ggml_v2_nbytes(layer.c_attn_proj_w); +// ggml_v2_cl_transform_tensor(layer.c_mlp_fc_w); 
vram_total += ggml_v2_nbytes(layer.c_mlp_fc_w); +// ggml_v2_cl_transform_tensor(layer.c_mlp_fc_b); vram_total += ggml_v2_nbytes(layer.c_mlp_fc_b); +// ggml_v2_cl_transform_tensor(layer.c_mlp_proj_w); vram_total += ggml_v2_nbytes(layer.c_mlp_proj_w); +// ggml_v2_cl_transform_tensor(layer.c_mlp_proj_b); vram_total += ggml_v2_nbytes(layer.c_mlp_proj_b); // } // fprintf(stderr, "%s: [opencl] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024); @@ -420,193 +420,193 @@ bool gptj_eval( } } - struct ggml_init_params params; + struct ggml_v2_init_params params; params.mem_size = buf_size; params.mem_buffer = buf; params.no_alloc = false; - struct ggml_context * ctx0 = ggml_init(params); - struct ggml_cgraph gf = {}; + struct ggml_v2_context * ctx0 = ggml_v2_init(params); + struct ggml_v2_cgraph gf = {}; gf.n_threads = n_threads; - struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); - memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd)); + struct ggml_v2_tensor * embd = ggml_v2_new_tensor_1d(ctx0, GGML_V2_TYPE_I32, N); + memcpy(embd->data, embd_inp.data(), N*ggml_v2_element_size(embd)); // wte - struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.wte, embd); + struct ggml_v2_tensor * inpL = ggml_v2_get_rows(ctx0, model.wte, embd); for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * cur; + struct ggml_v2_tensor * cur; // norm { - cur = ggml_norm(ctx0, inpL); + cur = ggml_v2_norm(ctx0, inpL); // cur = ln_1_g*cur + ln_1_b - cur = ggml_add(ctx0, - ggml_mul(ctx0, - ggml_repeat(ctx0, model.layers[il].ln_1_g, cur), + cur = ggml_v2_add(ctx0, + ggml_v2_mul(ctx0, + ggml_v2_repeat(ctx0, model.layers[il].ln_1_g, cur), cur), - ggml_repeat(ctx0, model.layers[il].ln_1_b, cur)); + ggml_v2_repeat(ctx0, model.layers[il].ln_1_b, cur)); } - struct ggml_tensor * inpSA = cur; + struct ggml_v2_tensor * inpSA = cur; // self-attention { - struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].c_attn_q_proj_w, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0); - struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].c_attn_k_proj_w, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0); + struct ggml_v2_tensor * Qcur = ggml_v2_rope_inplace(ctx0, ggml_v2_reshape_3d(ctx0, ggml_v2_mul_mat(ctx0, model.layers[il].c_attn_q_proj_w, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0); + struct ggml_v2_tensor * Kcur = ggml_v2_rope_inplace(ctx0, ggml_v2_reshape_3d(ctx0, ggml_v2_mul_mat(ctx0, model.layers[il].c_attn_k_proj_w, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0); // store key and value to memory { - struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_mul_mat(ctx0, model.layers[il].c_attn_v_proj_w, cur)); + struct ggml_v2_tensor * Vcur = ggml_v2_transpose(ctx0, ggml_v2_mul_mat(ctx0, model.layers[il].c_attn_v_proj_w, cur)); - struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past)); - struct ggml_tensor * v = ggml_view_2d(ctx0, model.memory_v, N, n_embd, - ( n_ctx)*ggml_element_size(model.memory_v), - (il*n_ctx)*ggml_element_size(model.memory_v)*n_embd + n_past*ggml_element_size(model.memory_v)); + struct ggml_v2_tensor * k = ggml_v2_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_v2_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past)); + struct ggml_v2_tensor * v = ggml_v2_view_2d(ctx0, model.memory_v, N, n_embd, + ( n_ctx)*ggml_v2_element_size(model.memory_v), + 
(il*n_ctx)*ggml_v2_element_size(model.memory_v)*n_embd + n_past*ggml_v2_element_size(model.memory_v)); - ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k)); - ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v)); + ggml_v2_build_forward_expand(&gf, ggml_v2_cpy(ctx0, Kcur, k)); + ggml_v2_build_forward_expand(&gf, ggml_v2_cpy(ctx0, Vcur, v)); } // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3) - struct ggml_tensor * Q = - ggml_permute(ctx0, + struct ggml_v2_tensor * Q = + ggml_v2_permute(ctx0, Qcur, 0, 2, 1, 3); // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3) - struct ggml_tensor * K = - ggml_permute(ctx0, - ggml_reshape_3d(ctx0, - ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd), + struct ggml_v2_tensor * K = + ggml_v2_permute(ctx0, + ggml_v2_reshape_3d(ctx0, + ggml_v2_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_v2_element_size(model.memory_k)*n_embd), n_embd/n_head, n_head, n_past + N), 0, 2, 1, 3); // K * Q - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + struct ggml_v2_tensor * KQ = ggml_v2_mul_mat(ctx0, K, Q); // KQ_scaled = KQ / sqrt(n_embd/n_head) - struct ggml_tensor * KQ_scaled = - ggml_scale_inplace(ctx0, + struct ggml_v2_tensor * KQ_scaled = + ggml_v2_scale_inplace(ctx0, KQ, - ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head)) + ggml_v2_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head)) ); // KQ_masked = mask_past(KQ_scaled) - struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); + struct ggml_v2_tensor * KQ_masked = ggml_v2_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); // KQ = soft_max(KQ_masked) - struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); + struct ggml_v2_tensor * KQ_soft_max = ggml_v2_soft_max_inplace(ctx0, KQ_masked); // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous() - struct ggml_tensor * V = - ggml_view_3d(ctx0, model.memory_v, + struct ggml_v2_tensor * V = + ggml_v2_view_3d(ctx0, model.memory_v, n_past + N, n_embd/n_head, n_head, - n_ctx*ggml_element_size(model.memory_v), - n_ctx*ggml_element_size(model.memory_v)*n_embd/n_head, - il*n_ctx*ggml_element_size(model.memory_v)*n_embd); + n_ctx*ggml_v2_element_size(model.memory_v), + n_ctx*ggml_v2_element_size(model.memory_v)*n_embd/n_head, + il*n_ctx*ggml_v2_element_size(model.memory_v)*n_embd); // KQV = transpose(V) * KQ_soft_max - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); + struct ggml_v2_tensor * KQV = ggml_v2_mul_mat(ctx0, V, KQ_soft_max); // KQV_merged = KQV.permute(0, 2, 1, 3) - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + struct ggml_v2_tensor * KQV_merged = ggml_v2_permute(ctx0, KQV, 0, 2, 1, 3); // cur = KQV_merged.contiguous().view(n_embd, N) - cur = ggml_cpy(ctx0, + cur = ggml_v2_cpy(ctx0, KQV_merged, - ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); + ggml_v2_new_tensor_2d(ctx0, GGML_V2_TYPE_F32, n_embd, N)); // projection (no bias) - cur = ggml_mul_mat(ctx0, + cur = ggml_v2_mul_mat(ctx0, model.layers[il].c_attn_proj_w, cur); } - struct ggml_tensor * inpFF = cur; + struct ggml_v2_tensor * inpFF = cur; // feed-forward network // this is independent of the self-attention result, so it could be done in parallel to the self-attention { // note here we pass inpSA instead of cur - cur = ggml_mul_mat(ctx0, + cur = ggml_v2_mul_mat(ctx0, model.layers[il].c_mlp_fc_w, inpSA); - cur = ggml_add(ctx0, - ggml_repeat(ctx0, 
model.layers[il].c_mlp_fc_b, cur), + cur = ggml_v2_add(ctx0, + ggml_v2_repeat(ctx0, model.layers[il].c_mlp_fc_b, cur), cur); // GELU activation - cur = ggml_gelu(ctx0, cur); + cur = ggml_v2_gelu(ctx0, cur); // projection // cur = proj_w*cur + proj_b - cur = ggml_mul_mat(ctx0, + cur = ggml_v2_mul_mat(ctx0, model.layers[il].c_mlp_proj_w, cur); - cur = ggml_add(ctx0, - ggml_repeat(ctx0, model.layers[il].c_mlp_proj_b, cur), + cur = ggml_v2_add(ctx0, + ggml_v2_repeat(ctx0, model.layers[il].c_mlp_proj_b, cur), cur); } // self-attention + FF - cur = ggml_add(ctx0, cur, inpFF); + cur = ggml_v2_add(ctx0, cur, inpFF); // input for next layer - inpL = ggml_add(ctx0, cur, inpL); + inpL = ggml_v2_add(ctx0, cur, inpL); } // norm { - inpL = ggml_norm(ctx0, inpL); + inpL = ggml_v2_norm(ctx0, inpL); // inpL = ln_f_g*inpL + ln_f_b - inpL = ggml_add(ctx0, - ggml_mul(ctx0, - ggml_repeat(ctx0, model.ln_f_g, inpL), + inpL = ggml_v2_add(ctx0, + ggml_v2_mul(ctx0, + ggml_v2_repeat(ctx0, model.ln_f_g, inpL), inpL), - ggml_repeat(ctx0, model.ln_f_b, inpL)); + ggml_v2_repeat(ctx0, model.ln_f_b, inpL)); } // lm_head { - inpL = ggml_mul_mat(ctx0, model.lmh_g, inpL); + inpL = ggml_v2_mul_mat(ctx0, model.lmh_g, inpL); - inpL = ggml_add(ctx0, - ggml_repeat(ctx0, model.lmh_b, inpL), + inpL = ggml_v2_add(ctx0, + ggml_v2_repeat(ctx0, model.lmh_b, inpL), inpL); } // logits -> probs - //inpL = ggml_soft_max_inplace(ctx0, inpL); + //inpL = ggml_v2_soft_max_inplace(ctx0, inpL); // run the computation - ggml_build_forward_expand(&gf, inpL); - ggml_graph_compute (ctx0, &gf); + ggml_v2_build_forward_expand(&gf, inpL); + ggml_v2_graph_compute (ctx0, &gf); //if (n_past%100 == 0) { - // ggml_graph_print (&gf); - // ggml_graph_dump_dot(&gf, NULL, "gpt-j.dot"); + // ggml_v2_graph_print (&gf); + // ggml_v2_graph_dump_dot(&gf, NULL, "gpt-j.dot"); //} //embd_w.resize(n_vocab*N); - //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N); + //memcpy(embd_w.data(), ggml_v2_get_data(inpL), sizeof(float)*n_vocab*N); // return result for just the last token embd_w.resize(n_vocab); - memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab); + memcpy(embd_w.data(), (float *) ggml_v2_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab); if (mem_per_token == 0) { - mem_per_token = ggml_used_mem(ctx0)/N; + mem_per_token = ggml_v2_used_mem(ctx0)/N; } - //printf("used_mem = %zu\n", ggml_used_mem(ctx0)); + //printf("used_mem = %zu\n", ggml_v2_used_mem(ctx0)); - ggml_free(ctx0); + ggml_v2_free(ctx0); return true; } \ No newline at end of file diff --git a/llama.cpp b/otherarch/llama_v2.cpp similarity index 62% rename from llama.cpp rename to otherarch/llama_v2.cpp index c8d90d3d7..85de44e06 100644 --- a/llama.cpp +++ b/otherarch/llama_v2.cpp @@ -6,13 +6,13 @@ #endif #include "llama-util.h" -#include "llama.h" +#include "llama_v2.h" -#include "ggml.h" +#include "ggml_v2.h" #ifdef GGML_USE_CUBLAS #include "ggml-cuda.h" #elif defined(GGML_USE_CLBLAST) -#include "ggml-opencl.h" +#include "ggml_v2-opencl.h" #endif #include @@ -36,7 +36,7 @@ #include #define LLAMA_USE_SCRATCH -#define LLAMA_MAX_SCRATCH_BUFFERS 16 +#define LLAMA_V2_MAX_SCRATCH_BUFFERS 16 // available llama models enum e_model { @@ -105,7 +105,7 @@ static const std::map & MEM_REQ_EVAL() } // default hparams (LLaMA 7B) -struct llama_hparams { +struct llama_v2_hparams { uint32_t n_vocab = 32000; uint32_t n_ctx = 512; // this is provided as user input? 
uint32_t n_embd = 4096; @@ -113,89 +113,89 @@ struct llama_hparams { uint32_t n_head = 32; uint32_t n_layer = 32; uint32_t n_rot = 64; - enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16; + enum llama_v2_ftype ftype = LLAMA_V2_FTYPE_MOSTLY_F16; - bool operator!=(const llama_hparams & other) const { - return memcmp(this, &other, sizeof(llama_hparams)); + bool operator!=(const llama_v2_hparams & other) const { + return memcmp(this, &other, sizeof(llama_v2_hparams)); } }; -struct llama_layer { +struct llama_v2_layer { // normalization - struct ggml_tensor * attention_norm; + struct ggml_v2_tensor * attention_norm; // attention - struct ggml_tensor * wq; - struct ggml_tensor * wk; - struct ggml_tensor * wv; - struct ggml_tensor * wo; + struct ggml_v2_tensor * wq; + struct ggml_v2_tensor * wk; + struct ggml_v2_tensor * wv; + struct ggml_v2_tensor * wo; // normalization - struct ggml_tensor * ffn_norm; + struct ggml_v2_tensor * ffn_norm; // ff - struct ggml_tensor * w1; - struct ggml_tensor * w2; - struct ggml_tensor * w3; + struct ggml_v2_tensor * w1; + struct ggml_v2_tensor * w2; + struct ggml_v2_tensor * w3; }; -struct llama_kv_cache { - struct ggml_tensor * k; - struct ggml_tensor * v; +struct llama_v2_kv_cache { + struct ggml_v2_tensor * k; + struct ggml_v2_tensor * v; - struct ggml_context * ctx = NULL; + struct ggml_v2_context * ctx = NULL; - llama_ctx_buffer buf; + llama_v2_ctx_buffer buf; int n; // number of tokens currently in the cache - ~llama_kv_cache() { + ~llama_v2_kv_cache() { if (ctx) { - ggml_free(ctx); + ggml_v2_free(ctx); } } }; -struct llama_model { +struct llama_v2_model { e_model type = MODEL_UNKNOWN; - llama_hparams hparams; + llama_v2_hparams hparams; - struct ggml_tensor * tok_embeddings; + struct ggml_v2_tensor * tok_embeddings; - struct ggml_tensor * norm; - struct ggml_tensor * output; + struct ggml_v2_tensor * norm; + struct ggml_v2_tensor * output; - std::vector layers; + std::vector layers; // context - struct ggml_context * ctx = NULL; + struct ggml_v2_context * ctx = NULL; // key + value cache for the self attention - // TODO: move to llama_state - struct llama_kv_cache kv_self; + // TODO: move to llama_v2_state + struct llama_v2_kv_cache kv_self; // the model memory buffer - llama_ctx_buffer buf; + llama_v2_ctx_buffer buf; // model memory mapped file - std::unique_ptr mapping; + std::unique_ptr mapping; // objects representing data potentially being locked in memory - llama_mlock mlock_buf; - llama_mlock mlock_mmap; + llama_v2_mlock mlock_buf; + llama_v2_mlock mlock_mmap; // for quantize-stats only - std::vector> tensors_by_name; + std::vector> tensors_by_name; - ~llama_model() { + ~llama_v2_model() { if (ctx) { - ggml_free(ctx); + ggml_v2_free(ctx); } } }; -struct llama_vocab { +struct llama_v2_vocab { using id = int32_t; using token = std::string; @@ -208,7 +208,7 @@ struct llama_vocab { std::vector id_to_token; }; -struct llama_context { +struct llama_v2_context { std::mt19937 rng; int64_t t_load_us = 0; @@ -223,8 +223,8 @@ struct llama_context { int32_t n_eval = 0; // number of eval calls int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1) - llama_model model; - llama_vocab vocab; + llama_v2_model model; + llama_v2_vocab vocab; size_t mem_per_token = 0; @@ -236,22 +236,22 @@ struct llama_context { std::vector embedding; // memory buffers used to evaluate the model - // TODO: move in llama_state - llama_ctx_buffer buf_compute; - llama_ctx_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS]; + // TODO: move in llama_v2_state + 
llama_v2_ctx_buffer buf_compute; + llama_v2_ctx_buffer buf_scratch[LLAMA_V2_MAX_SCRATCH_BUFFERS]; int buf_last = 0; - size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 }; + size_t buf_max_size[LLAMA_V2_MAX_SCRATCH_BUFFERS] = { 0 }; - void use_buf(struct ggml_context * ctx, int i) { + void use_buf(struct ggml_v2_context * ctx, int i) { #if defined(LLAMA_USE_SCRATCH) size_t last_size = 0; if (i == -1) { - last_size = ggml_set_scratch(ctx, { 0, 0, nullptr, }); + last_size = ggml_v2_set_scratch(ctx, { 0, 0, nullptr, }); } else { auto & buf = buf_scratch[i]; - last_size = ggml_set_scratch(ctx, { 0, buf.size, buf.addr, }); + last_size = ggml_v2_set_scratch(ctx, { 0, buf.size, buf.addr, }); } if (buf_last >= 0) { @@ -292,7 +292,7 @@ static size_t checked_div(size_t a, size_t b) { return a / b; } -static std::string llama_format_tensor_shape(const std::vector & ne) { +static std::string llama_v2_format_tensor_shape(const std::vector & ne) { char buf[256]; snprintf(buf, sizeof(buf), "%5u", ne.at(0)); for (size_t i = 1; i < ne.size(); i++) { @@ -301,44 +301,44 @@ static std::string llama_format_tensor_shape(const std::vector & ne) { return buf; } -static size_t llama_calc_tensor_size(const std::vector & ne, enum ggml_type type) { - size_t size = ggml_type_size(type); +static size_t llama_v2_calc_tensor_size(const std::vector & ne, enum ggml_v2_type type) { + size_t size = ggml_v2_type_size(type); for (uint32_t dim : ne) { size = checked_mul(size, dim); } - return size / ggml_blck_size(type); + return size / ggml_v2_blck_size(type); } -struct llama_load_tensor_shard { +struct llama_v2_load_tensor_shard { std::vector ne; size_t size; - enum ggml_type type; + enum ggml_v2_type type; size_t file_idx; size_t file_off; void calc_size() { - size = llama_calc_tensor_size(ne, type); + size = llama_v2_calc_tensor_size(ne, type); } }; -enum llama_split_type { +enum llama_v2_split_type { SPLIT_NONE, SPLIT_BY_COLUMNS, SPLIT_BY_ROWS }; -struct llama_load_tensor { - std::vector shards; +struct llama_v2_load_tensor { + std::vector shards; std::string name; - enum ggml_type type = GGML_TYPE_F32; - llama_split_type split_type = SPLIT_NONE; + enum ggml_v2_type type = GGML_V2_TYPE_F32; + llama_v2_split_type split_type = SPLIT_NONE; std::vector ne; size_t size; - struct ggml_tensor * ggml_tensor = NULL; + struct ggml_v2_tensor * ggml_v2_tensor = NULL; uint8_t * data; - llama_load_tensor(const std::string & name) : name(name) {} + llama_v2_load_tensor(const std::string & name) : name(name) {} void calc_all() { calc_type(); @@ -375,11 +375,11 @@ struct llama_load_tensor { for (const auto & shard : shards) { if (shard.ne != first_shard.ne) { throw format("inconsistent tensor shard shape in '%s': first was %s, other was %s", - name.c_str(), llama_format_tensor_shape(first_shard.ne).c_str(), llama_format_tensor_shape(shard.ne).c_str()); + name.c_str(), llama_v2_format_tensor_shape(first_shard.ne).c_str(), llama_v2_format_tensor_shape(shard.ne).c_str()); } } ne = first_shard.ne; - LLAMA_ASSERT(shards.size() <= UINT32_MAX); + LLAMA_V2_ASSERT(shards.size() <= UINT32_MAX); uint32_t n_shards = (uint32_t) shards.size(); switch (split_type) { case SPLIT_NONE: @@ -397,31 +397,31 @@ struct llama_load_tensor { } void calc_size() { - size = llama_calc_tensor_size(ne, type); + size = llama_v2_calc_tensor_size(ne, type); } }; -struct llama_load_tensors_map { +struct llama_v2_load_tensors_map { // tensors is kept in a separate vector to preserve file order - std::vector tensors; + std::vector tensors; std::unordered_map name_to_idx; }; 
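
// Illustrative sketch, not part of the patch: llama_v2_calc_tensor_size() above
// derives a tensor's byte size as type_size * product(ne) / block_size, because
// block-quantized ggml_v2 types store a whole block of weights in one fixed-size
// record. The standalone program below reproduces that arithmetic for the 7B
// tok_embeddings shape (n_embd=4096, n_vocab=32000 per the default hparams in
// this diff); the concrete type_size/block_size values are assumptions for
// illustration and can differ between ggml_v2 builds.
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

static size_t sketch_calc_tensor_size(const std::vector<uint32_t> & ne,
                                      size_t type_size, size_t block_size) {
    size_t size = type_size;      // bytes per quantization block
    for (uint32_t dim : ne) {
        size *= dim;              // the real loader uses checked_mul() to guard overflow
    }
    return size / block_size;     // elements-per-block divides back out
}

int main() {
    const std::vector<uint32_t> ne = {4096, 32000};                      // e.g. tok_embeddings
    printf("f32  : %zu bytes\n", sketch_calc_tensor_size(ne, 4, 1));     // 4 bytes per element, no blocking
    printf("q4_0 : %zu bytes\n", sketch_calc_tensor_size(ne, 18, 32));   // assumed 18-byte block of 32 weights
    return 0;
}
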
-enum llama_file_version { - LLAMA_FILE_VERSION_GGML, - LLAMA_FILE_VERSION_GGMF_V1, // added version field and scores in vocab - LLAMA_FILE_VERSION_GGJT_V1, // added padding - LLAMA_FILE_VERSION_GGJT_V2, // changed quantization format - LLAMA_FILE_VERSION_GGJT_V3, // changed Q4 and Q8 quantization format +enum llama_v2_file_version { + LLAMA_V2_FILE_VERSION_GGML, + LLAMA_V2_FILE_VERSION_GGMF_V1, // added version field and scores in vocab + LLAMA_V2_FILE_VERSION_GGJT_V1, // added padding + LLAMA_V2_FILE_VERSION_GGJT_V2, // changed quantization format + LLAMA_V2_FILE_VERSION_GGJT_V3, // changed Q4 and Q8 quantization format }; -struct llama_file_loader { - llama_file file; - llama_file_version file_version; - llama_hparams hparams; - llama_vocab vocab; +struct llama_v2_file_loader { + llama_v2_file file; + llama_v2_file_version file_version; + llama_v2_hparams hparams; + llama_v2_vocab vocab; - llama_file_loader(const char * fname, size_t file_idx, llama_load_tensors_map & tensors_map) + llama_v2_file_loader(const char * fname, size_t file_idx, llama_v2_load_tensors_map & tensors_map) : file(fname, "rb") { fprintf(stderr, "llama.cpp: loading model from %s\n", fname); read_magic(); @@ -438,15 +438,15 @@ struct llama_file_loader { } if (magic == 'ggml' && version == 0) { - file_version = LLAMA_FILE_VERSION_GGML; + file_version = LLAMA_V2_FILE_VERSION_GGML; } else if (magic == 'ggmf' && version == 1) { - file_version = LLAMA_FILE_VERSION_GGMF_V1; + file_version = LLAMA_V2_FILE_VERSION_GGMF_V1; } else if (magic == 'ggjt' && version == 1) { - file_version = LLAMA_FILE_VERSION_GGJT_V1; + file_version = LLAMA_V2_FILE_VERSION_GGJT_V1; } else if (magic == 'ggjt' && version == 2) { - file_version = LLAMA_FILE_VERSION_GGJT_V2; + file_version = LLAMA_V2_FILE_VERSION_GGJT_V2; } else if (magic == 'ggjt' && version == 3) { - file_version = LLAMA_FILE_VERSION_GGJT_V3; + file_version = LLAMA_V2_FILE_VERSION_GGJT_V3; } else { throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?", magic, version); @@ -459,13 +459,13 @@ struct llama_file_loader { hparams.n_head = file.read_u32(); hparams.n_layer = file.read_u32(); hparams.n_rot = file.read_u32(); - hparams.ftype = (enum llama_ftype) file.read_u32(); + hparams.ftype = (enum llama_v2_ftype) file.read_u32(); } void read_vocab() { vocab.id_to_token.resize(hparams.n_vocab); int32_t vocabloops = hparams.n_vocab; - if(vocabloops==32001 && file_version == LLAMA_FILE_VERSION_GGML) + if(vocabloops==32001 && file_version == LLAMA_V2_FILE_VERSION_GGML) { printf("---\n!! 
WARNING: Model appears to be GPT4ALL v1 model, triggering compatibility fix !!\n---\n"); vocabloops -= 1; @@ -476,7 +476,7 @@ struct llama_file_loader { std::string word = file.read_string(len); float score = 0.0f; - if (file_version >= LLAMA_FILE_VERSION_GGMF_V1) { + if (file_version >= LLAMA_V2_FILE_VERSION_GGMF_V1) { file.read_raw(&score, sizeof(score)); } @@ -487,12 +487,12 @@ struct llama_file_loader { tok_score.score = score; } } - void read_tensor_metadata(size_t file_idx, llama_load_tensors_map & tensors_map) { + void read_tensor_metadata(size_t file_idx, llama_v2_load_tensors_map & tensors_map) { while (file.tell() < file.size) { - llama_load_tensor_shard shard; + llama_v2_load_tensor_shard shard; uint32_t n_dims = file.read_u32(); uint32_t name_len = file.read_u32(); - shard.type = (enum ggml_type) file.read_u32(); + shard.type = (enum ggml_v2_type) file.read_u32(); shard.ne.resize(n_dims); file.read_raw(shard.ne.data(), sizeof(shard.ne[0]) * n_dims); std::string name = file.read_string(name_len); @@ -500,22 +500,22 @@ struct llama_file_loader { throw format("llama.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims); } switch (shard.type) { - case GGML_TYPE_F32: - case GGML_TYPE_F16: - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q4_2: - case GGML_TYPE_Q4_3: - case GGML_TYPE_Q5_0: - case GGML_TYPE_Q5_1: - case GGML_TYPE_Q8_0: + case GGML_V2_TYPE_F32: + case GGML_V2_TYPE_F16: + case GGML_V2_TYPE_Q4_0: + case GGML_V2_TYPE_Q4_1: + case GGML_V2_TYPE_Q4_2: + case GGML_V2_TYPE_Q4_3: + case GGML_V2_TYPE_Q5_0: + case GGML_V2_TYPE_Q5_1: + case GGML_V2_TYPE_Q8_0: break; default: { throw format("unrecognized tensor type %u\n", shard.type); } } - if (file_version >= LLAMA_FILE_VERSION_GGJT_V1) { + if (file_version >= LLAMA_V2_FILE_VERSION_GGJT_V1) { // skip to the next multiple of 32 bytes file.seek(-file.tell() & 31, SEEK_CUR); } @@ -539,10 +539,10 @@ struct llama_file_loader { } }; -struct llama_file_saver { - llama_file file; - llama_file_loader * any_file_loader; - llama_file_saver(const char * fname, llama_file_loader * any_file_loader, enum llama_ftype new_ftype) +struct llama_v2_file_saver { + llama_v2_file file; + llama_v2_file_loader * any_file_loader; + llama_v2_file_saver(const char * fname, llama_v2_file_loader * any_file_loader, enum llama_v2_ftype new_ftype) : file(fname, "wb"), any_file_loader(any_file_loader) { fprintf(stderr, "llama.cpp: saving model to %s\n", fname); write_magic(); @@ -550,11 +550,11 @@ struct llama_file_saver { write_vocab(); } void write_magic() { - file.write_u32(LLAMA_FILE_MAGIC); // magic - file.write_u32(LLAMA_FILE_VERSION); // version + file.write_u32(LLAMA_V2_FILE_MAGIC); // magic + file.write_u32(LLAMA_V2_FILE_VERSION); // version } - void write_hparams(enum llama_ftype new_ftype) { - const llama_hparams & hparams = any_file_loader->hparams; + void write_hparams(enum llama_v2_ftype new_ftype) { + const llama_v2_hparams & hparams = any_file_loader->hparams; file.write_u32(hparams.n_vocab); file.write_u32(hparams.n_embd); file.write_u32(hparams.n_mult); @@ -564,7 +564,7 @@ struct llama_file_saver { file.write_u32(new_ftype); } void write_vocab() { - if (any_file_loader->file_version == LLAMA_FILE_VERSION_GGML) { + if (any_file_loader->file_version == LLAMA_V2_FILE_VERSION_GGML) { fprintf(stderr, "llama.cpp: WARNING: input is an old file that doesn't have scores; will add dummy scores\n"); } uint32_t n_vocab = any_file_loader->hparams.n_vocab; @@ -575,19 +575,19 @@ struct llama_file_saver { 
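
// Illustrative sketch, not part of the patch: the on-disk layout that
// llama_v2_file_loader above parses for a single-part model file. The order of
// the seven hparams fields follows read_hparams()/write_hparams() in this diff;
// the "no version field for plain 'ggml' files" behaviour and all names here are
// simplifying assumptions, with error handling and the multi-part/mmap logic left out.
#include <cstdint>
#include <cstdio>
#include <string>
#include <vector>

static uint32_t read_u32(FILE * f) {
    uint32_t v = 0;
    if (fread(&v, sizeof(v), 1, f) != 1) { perror("read_u32"); }
    return v;
}

int main(int argc, char ** argv) {
    if (argc < 2) { fprintf(stderr, "usage: %s model.bin\n", argv[0]); return 1; }
    FILE * f = fopen(argv[1], "rb");
    if (!f) { perror("fopen"); return 1; }

    const uint32_t magic   = read_u32(f);                         // 'ggml', 'ggmf' or 'ggjt'
    const uint32_t version = (magic == 'ggml') ? 0 : read_u32(f); // assumed: legacy 'ggml' has no version field

    // hparams, in the order write_hparams() emits them
    uint32_t n_vocab = read_u32(f);
    uint32_t n_embd  = read_u32(f);
    uint32_t n_mult  = read_u32(f);
    uint32_t n_head  = read_u32(f);
    uint32_t n_layer = read_u32(f);
    uint32_t n_rot   = read_u32(f);
    uint32_t ftype   = read_u32(f);

    // vocab: (u32 length, bytes[, float score]) per token; the score only exists
    // from the 'ggmf' revision onwards, matching read_vocab() above.
    const bool has_scores = !(magic == 'ggml' && version == 0);
    std::vector<std::string> tokens(n_vocab);
    for (uint32_t i = 0; i < n_vocab; ++i) {
        uint32_t len = read_u32(f);
        tokens[i].resize(len);
        if (len) fread(&tokens[i][0], 1, len, f);
        if (has_scores) { float score = 0.0f; fread(&score, sizeof(score), 1, f); }
    }

    printf("magic=%08x version=%u n_vocab=%u n_embd=%u n_mult=%u n_layer=%u ftype=%u\n",
           magic, version, n_vocab, n_embd, n_mult, n_layer, ftype);
    fclose(f);
    return 0;
}
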
file.write_raw(&token_score.score, sizeof(token_score.score)); } } - void write_tensor(llama_load_tensor & tensor, enum ggml_type new_type, const void * new_data, size_t new_size) { + void write_tensor(llama_v2_load_tensor & tensor, enum ggml_v2_type new_type, const void * new_data, size_t new_size) { switch (new_type) { - case GGML_TYPE_F32: - case GGML_TYPE_F16: - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q4_2: - case GGML_TYPE_Q4_3: - case GGML_TYPE_Q5_0: - case GGML_TYPE_Q5_1: - case GGML_TYPE_Q8_0: + case GGML_V2_TYPE_F32: + case GGML_V2_TYPE_F16: + case GGML_V2_TYPE_Q4_0: + case GGML_V2_TYPE_Q4_1: + case GGML_V2_TYPE_Q4_2: + case GGML_V2_TYPE_Q4_3: + case GGML_V2_TYPE_Q5_0: + case GGML_V2_TYPE_Q5_1: + case GGML_V2_TYPE_Q8_0: break; - default: LLAMA_ASSERT(false); + default: LLAMA_V2_ASSERT(false); } file.write_u32((uint32_t) tensor.ne.size()); file.write_u32((uint32_t) tensor.name.size()); @@ -595,32 +595,32 @@ struct llama_file_saver { file.write_raw(tensor.ne.data(), sizeof(tensor.ne[0]) * tensor.ne.size()); file.write_raw(tensor.name.data(), tensor.name.size()); file.seek(-file.tell() & 31, SEEK_CUR); - LLAMA_ASSERT(new_size == llama_calc_tensor_size(tensor.ne, new_type)); + LLAMA_V2_ASSERT(new_size == llama_v2_calc_tensor_size(tensor.ne, new_type)); file.write_raw(new_data, new_size); } }; -struct llama_model_loader { - std::vector> file_loaders; - llama_load_tensors_map tensors_map; +struct llama_v2_model_loader { + std::vector> file_loaders; + llama_v2_load_tensors_map tensors_map; bool use_mmap; - size_t num_ggml_tensors_created = 0; - struct ggml_context * ggml_ctx = NULL; - std::unique_ptr mapping; + size_t num_ggml_v2_tensors_created = 0; + struct ggml_v2_context * ggml_v2_ctx = NULL; + std::unique_ptr mapping; - llama_model_loader(const std::string & fname_base, bool use_mmap, bool vocab_only) { - auto * first_file = new llama_file_loader(fname_base.c_str(), 0, tensors_map); + llama_v2_model_loader(const std::string & fname_base, bool use_mmap, bool vocab_only) { + auto * first_file = new llama_v2_file_loader(fname_base.c_str(), 0, tensors_map); file_loaders.emplace_back(first_file); uint32_t n_parts = vocab_only ? 1 : guess_n_parts(); for (uint32_t i = 1; i < n_parts; i++) { std::string fname = fname_base + "." 
+ std::to_string(i); - auto * ith_file = new llama_file_loader(fname.c_str(), i, tensors_map); + auto * ith_file = new llama_v2_file_loader(fname.c_str(), i, tensors_map); file_loaders.emplace_back(ith_file); if (ith_file->hparams != first_file->hparams) { throw format("llama.cpp: hparams inconsistent between files"); } } - if (!llama_mmap::SUPPORTED) { + if (!llama_v2_mmap::SUPPORTED) { use_mmap = false; } if (use_mmap && alignment_prevents_mmap()) { @@ -628,14 +628,14 @@ struct llama_model_loader { use_mmap = false; } this->use_mmap = use_mmap; - for (llama_load_tensor & lt : tensors_map.tensors) { + for (llama_v2_load_tensor & lt : tensors_map.tensors) { lt.calc_all(); } } bool alignment_prevents_mmap() { - for (const llama_load_tensor & lt : tensors_map.tensors) { - for (const llama_load_tensor_shard & shard : lt.shards) { + for (const llama_v2_load_tensor & lt : tensors_map.tensors) { + for (const llama_v2_load_tensor_shard & shard : lt.shards) { if (shard.file_off & 3) { return true; } @@ -649,61 +649,61 @@ struct llama_model_loader { if (it == tensors_map.name_to_idx.end()) { throw std::string("missing tok_embeddings.weight"); } - const llama_load_tensor & lt = tensors_map.tensors.at(it->second); + const llama_v2_load_tensor & lt = tensors_map.tensors.at(it->second); return file_loaders.at(0)->hparams.n_embd / lt.shards.at(0).ne.at(0); } void calc_sizes(size_t * ctx_size_p, size_t * mmapped_size_p) const { *ctx_size_p = *mmapped_size_p = 0; - for (const llama_load_tensor & lt : tensors_map.tensors) { - *ctx_size_p += sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE; + for (const llama_v2_load_tensor & lt : tensors_map.tensors) { + *ctx_size_p += sizeof(struct ggml_v2_tensor) + GGML_V2_OBJECT_SIZE; *(use_mmap ? mmapped_size_p : ctx_size_p) += lt.size; } } - struct ggml_tensor * get_tensor(const std::string & name, const std::vector & ne) { + struct ggml_v2_tensor * get_tensor(const std::string & name, const std::vector & ne) { auto it = tensors_map.name_to_idx.find(name); if (it == tensors_map.name_to_idx.end()) { throw format("llama.cpp: tensor '%s' is missing from model", name.c_str()); } - llama_load_tensor & lt = tensors_map.tensors.at(it->second); + llama_v2_load_tensor & lt = tensors_map.tensors.at(it->second); if (lt.ne != ne) { throw format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s", - name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str()); + name.c_str(), llama_v2_format_tensor_shape(ne).c_str(), llama_v2_format_tensor_shape(lt.ne).c_str()); } return get_tensor_for(lt); } - struct ggml_tensor * get_tensor_for(llama_load_tensor & lt) { - struct ggml_tensor * tensor; + struct ggml_v2_tensor * get_tensor_for(llama_v2_load_tensor & lt) { + struct ggml_v2_tensor * tensor; if (lt.ne.size() == 2) { - tensor = ggml_new_tensor_2d(ggml_ctx, lt.type, lt.ne.at(0), lt.ne.at(1)); + tensor = ggml_v2_new_tensor_2d(ggml_v2_ctx, lt.type, lt.ne.at(0), lt.ne.at(1)); } else { - LLAMA_ASSERT(lt.ne.size() == 1); - tensor = ggml_new_tensor_1d(ggml_ctx, lt.type, lt.ne.at(0)); + LLAMA_V2_ASSERT(lt.ne.size() == 1); + tensor = ggml_v2_new_tensor_1d(ggml_v2_ctx, lt.type, lt.ne.at(0)); } - ggml_set_name(tensor, lt.name.c_str()); - LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor - lt.ggml_tensor = tensor; - num_ggml_tensors_created++; + ggml_v2_set_name(tensor, lt.name.c_str()); + LLAMA_V2_ASSERT(lt.ggml_v2_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor + 
lt.ggml_v2_tensor = tensor; + num_ggml_v2_tensors_created++; return tensor; } void done_getting_tensors() const { - if (num_ggml_tensors_created != tensors_map.tensors.size()) { + if (num_ggml_v2_tensors_created != tensors_map.tensors.size()) { throw std::string("llama.cpp: file contained more tensors than expected"); } } - void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) { + void load_all_data(llama_v2_progress_callback progress_callback, void * progress_callback_user_data, llama_v2_mlock * lmlock) { size_t data_size = 0; - for (const llama_load_tensor & lt : tensors_map.tensors) { + for (const llama_v2_load_tensor & lt : tensors_map.tensors) { data_size += lt.size; } if (use_mmap) { - mapping.reset(new llama_mmap(&file_loaders.at(0)->file)); + mapping.reset(new llama_v2_mmap(&file_loaders.at(0)->file)); if (!lmlock) { // Don't call the callback since the actual loading will be lazy // and we can't measure it. @@ -715,14 +715,14 @@ struct llama_model_loader { } size_t done_size = 0; - for (llama_load_tensor & lt : tensors_map.tensors) { + for (llama_v2_load_tensor & lt : tensors_map.tensors) { if (progress_callback) { progress_callback((float) done_size / data_size, progress_callback_user_data); } - LLAMA_ASSERT(lt.ggml_tensor); // unused tensors should have been caught by load_data already - lt.data = (uint8_t *) lt.ggml_tensor->data; + LLAMA_V2_ASSERT(lt.ggml_v2_tensor); // unused tensors should have been caught by load_data already + lt.data = (uint8_t *) lt.ggml_v2_tensor->data; load_data_for(lt); - lt.ggml_tensor->data = lt.data; + lt.ggml_v2_tensor->data = lt.data; done_size += lt.size; if (use_mmap && lmlock) { lmlock->grow_to(done_size); @@ -733,29 +733,29 @@ struct llama_model_loader { } } - void load_data_for(llama_load_tensor & lt) { + void load_data_for(llama_v2_load_tensor & lt) { if (use_mmap) { - LLAMA_ASSERT(lt.shards.size() == 1); + LLAMA_V2_ASSERT(lt.shards.size() == 1); lt.data = (uint8_t *) mapping->addr + lt.shards.at(0).file_off; } else if (lt.split_type == SPLIT_NONE) { - llama_file & file = file_loaders.at(lt.shards.at(0).file_idx)->file; + llama_v2_file & file = file_loaders.at(lt.shards.at(0).file_idx)->file; file.seek(lt.shards.at(0).file_off, SEEK_SET); file.read_raw(lt.data, lt.size); } else if (lt.split_type == SPLIT_BY_ROWS) { size_t offset = 0; - for (llama_load_tensor_shard & shard : lt.shards) { - llama_file & file = file_loaders.at(shard.file_idx)->file; + for (llama_v2_load_tensor_shard & shard : lt.shards) { + llama_v2_file & file = file_loaders.at(shard.file_idx)->file; file.seek(shard.file_off, SEEK_SET); file.read_raw(lt.data + offset, shard.size); offset += shard.size; } - LLAMA_ASSERT(offset == lt.size); + LLAMA_V2_ASSERT(offset == lt.size); } else if (lt.split_type == SPLIT_BY_COLUMNS) { // Let's load the data into temporary buffers to ensure the OS performs large loads. 
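
// Illustrative sketch, not part of the patch: how the SPLIT_BY_COLUMNS branch
// just below stitches per-file column shards back into one contiguous row-major
// tensor. Each shard holds every row but only a slice of the columns, so the
// loader walks the rows and copies one per-shard row slice at a time. The toy
// 4x6 int matrix and the names here are invented for demonstration.
#include <cstdio>
#include <cstring>
#include <vector>

int main() {
    const size_t rows = 4, cols_per_shard = 3, n_shards = 2;
    const size_t row_bytes_per_shard = cols_per_shard * sizeof(int);

    // two column shards: shard 0 holds columns 0..2, shard 1 holds columns 3..5
    std::vector<std::vector<int>> shards(n_shards, std::vector<int>(rows * cols_per_shard));
    for (size_t s = 0; s < n_shards; ++s)
        for (size_t r = 0; r < rows; ++r)
            for (size_t c = 0; c < cols_per_shard; ++c)
                shards[s][r * cols_per_shard + c] = int(10 * r + s * cols_per_shard + c);

    // destination: rows x (n_shards * cols_per_shard), row-major, like lt.data
    std::vector<int> full(rows * n_shards * cols_per_shard);
    size_t out_offset = 0;
    for (size_t r = 0; r < rows; ++r) {
        for (size_t s = 0; s < n_shards; ++s) {
            std::memcpy(reinterpret_cast<char *>(full.data()) + out_offset,
                        reinterpret_cast<const char *>(shards[s].data()) + r * row_bytes_per_shard,
                        row_bytes_per_shard);
            out_offset += row_bytes_per_shard;
        }
    }

    for (size_t r = 0; r < rows; ++r) {
        for (size_t c = 0; c < n_shards * cols_per_shard; ++c)
            printf("%2d ", full[r * n_shards * cols_per_shard + c]);
        printf("\n");   // each printed row reads 0..5, 10..15, ... once reassembled
    }
    return 0;
}
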
- std::vector tmp_bufs(lt.shards.size()); + std::vector tmp_bufs(lt.shards.size()); for (size_t i = 0; i < lt.shards.size(); i++) { - llama_load_tensor_shard & shard = lt.shards.at(i); - llama_file & file = file_loaders.at(shard.file_idx)->file; + llama_v2_load_tensor_shard & shard = lt.shards.at(i); + llama_v2_file & file = file_loaders.at(shard.file_idx)->file; file.seek(shard.file_off, SEEK_SET); tmp_bufs.at(i).resize(shard.size); file.read_raw(tmp_bufs.at(i).addr, shard.size); @@ -765,28 +765,28 @@ struct llama_model_loader { size_t per_shard_row_size = lt.shards.at(0).size / num_rows; size_t out_offset = 0; for (size_t row = 0; row < num_rows; row++) { - for (llama_buffer & tmp_buf : tmp_bufs) { + for (llama_v2_buffer & tmp_buf : tmp_bufs) { memcpy(lt.data + out_offset, tmp_buf.addr + row * per_shard_row_size, per_shard_row_size); out_offset += per_shard_row_size; } } - LLAMA_ASSERT(out_offset == lt.size); + LLAMA_V2_ASSERT(out_offset == lt.size); } if (0) { print_checksum(lt); } } - static void print_checksum(llama_load_tensor & lt) { + static void print_checksum(llama_v2_load_tensor & lt) { uint32_t sum = 0; for (size_t i = 0; i < lt.size; i++) { uint8_t byte = lt.data[i]; sum = byte + (sum << 6) + (sum << 16) - sum; // sdbm hash } fprintf(stderr, "%s checksum: %#08x (%s, size %zu)\n", lt.name.c_str(), sum, - llama_format_tensor_shape(lt.ne).c_str(), lt.size); + llama_v2_format_tensor_shape(lt.ne).c_str(), lt.size); } }; @@ -797,9 +797,9 @@ struct llama_model_loader { // static bool kv_cache_init( - const struct llama_hparams & hparams, - struct llama_kv_cache & cache, - ggml_type wtype, + const struct llama_v2_hparams & hparams, + struct llama_v2_kv_cache & cache, + ggml_v2_type wtype, int n_ctx) { const int n_embd = hparams.n_embd; const int n_layer = hparams.n_layer; @@ -807,30 +807,30 @@ static bool kv_cache_init( const int64_t n_mem = n_layer*n_ctx; const int64_t n_elements = n_embd*n_mem; - cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB); + cache.buf.resize(2u*n_elements*ggml_v2_type_size(wtype) + 2u*MB); - struct ggml_init_params params; + struct ggml_v2_init_params params; params.mem_size = cache.buf.size; params.mem_buffer = cache.buf.addr; params.no_alloc = false; - cache.ctx = ggml_init(params); + cache.ctx = ggml_v2_init(params); if (!cache.ctx) { fprintf(stderr, "%s: failed to allocate memory for kv cache\n", __func__); return false; } - cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_elements); - cache.v = ggml_new_tensor_1d(cache.ctx, wtype, n_elements); - ggml_set_name(cache.k, "cache_k"); - ggml_set_name(cache.v, "cache_v"); + cache.k = ggml_v2_new_tensor_1d(cache.ctx, wtype, n_elements); + cache.v = ggml_v2_new_tensor_1d(cache.ctx, wtype, n_elements); + ggml_v2_set_name(cache.k, "cache_k"); + ggml_v2_set_name(cache.v, "cache_v"); return true; } -struct llama_context_params llama_context_default_params() { - struct llama_context_params result = { +struct llama_v2_context_params llama_v2_context_default_params() { + struct llama_v2_context_params result = { /*.n_ctx =*/ 512, /*.gpu_layers =*/ 0, /*.seed =*/ -1, @@ -847,48 +847,48 @@ struct llama_context_params llama_context_default_params() { return result; } -bool llama_mmap_supported() { - return llama_mmap::SUPPORTED; +bool llama_v2_mmap_supported() { + return llama_v2_mmap::SUPPORTED; } -bool llama_mlock_supported() { - return llama_mlock::SUPPORTED; +bool llama_v2_mlock_supported() { + return llama_v2_mlock::SUPPORTED; } // // model loading // -static const char 
*llama_file_version_name(llama_file_version version) { +static const char *llama_v2_file_version_name(llama_v2_file_version version) { switch (version) { - case LLAMA_FILE_VERSION_GGML: return "'ggml' (old version with low tokenizer quality and no mmap support)"; - case LLAMA_FILE_VERSION_GGMF_V1: return "ggmf v1 (old version with no mmap support)"; - case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 (pre #1405)"; - case LLAMA_FILE_VERSION_GGJT_V2: return "ggjt v2 (pre #1508)"; - case LLAMA_FILE_VERSION_GGJT_V3: return "ggjt v3 (latest)"; + case LLAMA_V2_FILE_VERSION_GGML: return "'ggml' (old version with low tokenizer quality and no mmap support)"; + case LLAMA_V2_FILE_VERSION_GGMF_V1: return "ggmf v1 (old version with no mmap support)"; + case LLAMA_V2_FILE_VERSION_GGJT_V1: return "ggjt v1 (pre #1405)"; + case LLAMA_V2_FILE_VERSION_GGJT_V2: return "ggjt v2 (pre #1508)"; + case LLAMA_V2_FILE_VERSION_GGJT_V3: return "ggjt v3 (latest)"; } return "unknown"; } -static const char *llama_ftype_name(enum llama_ftype ftype) { +static const char *llama_v2_ftype_name(enum llama_v2_ftype ftype) { switch (ftype) { - case LLAMA_FTYPE_ALL_F32: return "all F32"; - case LLAMA_FTYPE_MOSTLY_F16: return "mostly F16"; - case LLAMA_FTYPE_MOSTLY_Q4_0: return "mostly Q4_0"; - case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1"; - case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16: + case LLAMA_V2_FTYPE_ALL_F32: return "all F32"; + case LLAMA_V2_FTYPE_MOSTLY_F16: return "mostly F16"; + case LLAMA_V2_FTYPE_MOSTLY_Q4_0: return "mostly Q4_0"; + case LLAMA_V2_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1"; + case LLAMA_V2_FTYPE_MOSTLY_Q4_1_SOME_F16: return "mostly Q4_1, some F16"; - case LLAMA_FTYPE_MOSTLY_Q4_2: return "mostly Q4_2"; - case LLAMA_FTYPE_MOSTLY_Q4_3: return "mostly Q4_3"; - case LLAMA_FTYPE_MOSTLY_Q5_0: return "mostly Q5_0"; - case LLAMA_FTYPE_MOSTLY_Q5_1: return "mostly Q5_1"; - case LLAMA_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0"; + case LLAMA_V2_FTYPE_MOSTLY_Q4_2: return "mostly Q4_2"; + case LLAMA_V2_FTYPE_MOSTLY_Q4_3: return "mostly Q4_3"; + case LLAMA_V2_FTYPE_MOSTLY_Q5_0: return "mostly Q5_0"; + case LLAMA_V2_FTYPE_MOSTLY_Q5_1: return "mostly Q5_1"; + case LLAMA_V2_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0"; default: return "unknown, may not work"; } } -static const char *llama_model_type_name(e_model type) { +static const char *llama_v2_model_type_name(e_model type) { switch (type) { case MODEL_7B: return "7B"; case MODEL_13B: return "13B"; @@ -900,26 +900,26 @@ static const char *llama_model_type_name(e_model type) { } } -static void llama_model_load_internal( +static void llama_v2_model_load_internal( const std::string & fname, - llama_context & lctx, + llama_v2_context & lctx, int n_ctx, int n_gpu_layers, - ggml_type memory_type, + ggml_v2_type memory_type, bool use_mmap, bool use_mlock, bool vocab_only, - llama_progress_callback progress_callback, + llama_v2_progress_callback progress_callback, void * progress_callback_user_data) { - lctx.t_start_us = ggml_time_us(); + lctx.t_start_us = ggml_v2_time_us(); - std::unique_ptr ml(new llama_model_loader(fname, use_mmap, vocab_only)); + std::unique_ptr ml(new llama_v2_model_loader(fname, use_mmap, vocab_only)); lctx.vocab = std::move(ml->file_loaders.at(0)->vocab); auto & model = lctx.model; model.hparams = ml->file_loaders.at(0)->hparams; - llama_file_version file_version = ml->file_loaders.at(0)->file_version; + llama_v2_file_version file_version = ml->file_loaders.at(0)->file_version; auto & hparams = model.hparams; uint32_t n_ff = ((2*(4*hparams.n_embd)/3 + 
hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult; @@ -936,7 +936,7 @@ static void llama_model_load_internal( } { - fprintf(stderr, "%s: format = %s\n", __func__, llama_file_version_name(file_version)); + fprintf(stderr, "%s: format = %s\n", __func__, llama_v2_file_version_name(file_version)); fprintf(stderr, "%s: n_vocab = %u\n", __func__, hparams.n_vocab); fprintf(stderr, "%s: n_ctx = %u\n", __func__, hparams.n_ctx); fprintf(stderr, "%s: n_embd = %u\n", __func__, hparams.n_embd); @@ -944,24 +944,24 @@ static void llama_model_load_internal( fprintf(stderr, "%s: n_head = %u\n", __func__, hparams.n_head); fprintf(stderr, "%s: n_layer = %u\n", __func__, hparams.n_layer); fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot); - fprintf(stderr, "%s: ftype = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype)); + fprintf(stderr, "%s: ftype = %u (%s)\n", __func__, hparams.ftype, llama_v2_ftype_name(hparams.ftype)); fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff); fprintf(stderr, "%s: n_parts = %zu\n", __func__, ml->file_loaders.size()); - fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type)); + fprintf(stderr, "%s: model size = %s\n", __func__, llama_v2_model_type_name(model.type)); } - if (file_version < LLAMA_FILE_VERSION_GGJT_V2) { - if (hparams.ftype != LLAMA_FTYPE_ALL_F32 && - hparams.ftype != LLAMA_FTYPE_MOSTLY_F16 && - hparams.ftype != LLAMA_FTYPE_MOSTLY_Q8_0) { + if (file_version < LLAMA_V2_FILE_VERSION_GGJT_V2) { + if (hparams.ftype != LLAMA_V2_FTYPE_ALL_F32 && + hparams.ftype != LLAMA_V2_FTYPE_MOSTLY_F16 && + hparams.ftype != LLAMA_V2_FTYPE_MOSTLY_Q8_0) { printf("\nLegacy LLAMA GGJT v1 compatability changes triggered.\n"); } } - if (file_version < LLAMA_FILE_VERSION_GGJT_V3) { - if (hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 || - hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_1 || - hparams.ftype == LLAMA_FTYPE_MOSTLY_Q8_0) { + if (file_version < LLAMA_V2_FILE_VERSION_GGJT_V3) { + if (hparams.ftype == LLAMA_V2_FTYPE_MOSTLY_Q4_0 || + hparams.ftype == LLAMA_V2_FTYPE_MOSTLY_Q4_1 || + hparams.ftype == LLAMA_V2_FTYPE_MOSTLY_Q8_0) { printf("\nLegacy LLAMA GGJT v2 compatability changes triggered.\n"); } } @@ -979,7 +979,7 @@ static void llama_model_load_internal( // print memory requirements { - const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1; + const size_t scale = memory_type == GGML_V2_TYPE_F32 ? 
2 : 1; // this is the total memory required to run the inference const size_t mem_required = @@ -989,7 +989,7 @@ static void llama_model_load_internal( MEM_REQ_SCRATCH1().at(model.type) + MEM_REQ_EVAL().at(model.type); - // this is the memory required by one llama_state + // this is the memory required by one llama_v2_state const size_t mem_required_state = scale*MEM_REQ_KV_SELF().at(model.type); @@ -1005,15 +1005,15 @@ static void llama_model_load_internal( lctx.model.mlock_buf.grow_to(lctx.model.buf.size); } - struct ggml_init_params params = { + struct ggml_v2_init_params params = { /*.mem_size =*/ lctx.model.buf.size, /*.mem_buffer =*/ lctx.model.buf.addr, /*.no_alloc =*/ ml->use_mmap, }; - model.ctx = ggml_init(params); + model.ctx = ggml_v2_init(params); if (!model.ctx) { - throw format("ggml_init() failed"); + throw format("ggml_v2_init() failed"); } } @@ -1023,7 +1023,7 @@ static void llama_model_load_internal( const uint32_t n_layer = hparams.n_layer; const uint32_t n_vocab = hparams.n_vocab; - ml->ggml_ctx = ctx; + ml->ggml_v2_ctx = ctx; model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab}); model.norm = ml->get_tensor("norm.weight", {n_embd}); @@ -1053,8 +1053,8 @@ static void llama_model_load_internal( ml->done_getting_tensors(); // populate `tensors_by_name` - for (llama_load_tensor & lt : ml->tensors_map.tensors) { - model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor); + for (llama_v2_load_tensor & lt : ml->tensors_map.tensors) { + model.tensors_by_name.emplace_back(lt.name, lt.ggml_v2_tensor); } ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL); @@ -1071,17 +1071,17 @@ static void llama_model_load_internal( for (int i = 0; i < n_gpu; ++i) { const auto & layer = model.layers[i]; - ggml_cuda_transform_tensor(layer.wq); vram_total += ggml_nbytes(layer.wq); - ggml_cuda_transform_tensor(layer.wk); vram_total += ggml_nbytes(layer.wk); - ggml_cuda_transform_tensor(layer.wv); vram_total += ggml_nbytes(layer.wv); - ggml_cuda_transform_tensor(layer.wo); vram_total += ggml_nbytes(layer.wo); - ggml_cuda_transform_tensor(layer.w1); vram_total += ggml_nbytes(layer.w1); - ggml_cuda_transform_tensor(layer.w2); vram_total += ggml_nbytes(layer.w2); - ggml_cuda_transform_tensor(layer.w3); vram_total += ggml_nbytes(layer.w3); + ggml_v2_cuda_transform_tensor(layer.wq); vram_total += ggml_v2_nbytes(layer.wq); + ggml_v2_cuda_transform_tensor(layer.wk); vram_total += ggml_v2_nbytes(layer.wk); + ggml_v2_cuda_transform_tensor(layer.wv); vram_total += ggml_v2_nbytes(layer.wv); + ggml_v2_cuda_transform_tensor(layer.wo); vram_total += ggml_v2_nbytes(layer.wo); + ggml_v2_cuda_transform_tensor(layer.w1); vram_total += ggml_v2_nbytes(layer.w1); + ggml_v2_cuda_transform_tensor(layer.w2); vram_total += ggml_v2_nbytes(layer.w2); + ggml_v2_cuda_transform_tensor(layer.w3); vram_total += ggml_v2_nbytes(layer.w3); } if (n_gpu_layers > (int) hparams.n_layer) { fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__); - ggml_cuda_transform_tensor(model.output); vram_total += ggml_nbytes(model.output); + ggml_v2_cuda_transform_tensor(model.output); vram_total += ggml_v2_nbytes(model.output); } fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024); @@ -1099,17 +1099,17 @@ static void llama_model_load_internal( for (int i = 0; i < n_gpu; ++i) { const auto & layer = model.layers[i]; - ggml_cl_transform_tensor(layer.wq); vram_total += ggml_nbytes(layer.wq); - 
ggml_cl_transform_tensor(layer.wk); vram_total += ggml_nbytes(layer.wk); - ggml_cl_transform_tensor(layer.wv); vram_total += ggml_nbytes(layer.wv); - ggml_cl_transform_tensor(layer.wo); vram_total += ggml_nbytes(layer.wo); - ggml_cl_transform_tensor(layer.w1); vram_total += ggml_nbytes(layer.w1); - ggml_cl_transform_tensor(layer.w2); vram_total += ggml_nbytes(layer.w2); - ggml_cl_transform_tensor(layer.w3); vram_total += ggml_nbytes(layer.w3); + ggml_v2_cl_transform_tensor(layer.wq); vram_total += ggml_v2_nbytes(layer.wq); + ggml_v2_cl_transform_tensor(layer.wk); vram_total += ggml_v2_nbytes(layer.wk); + ggml_v2_cl_transform_tensor(layer.wv); vram_total += ggml_v2_nbytes(layer.wv); + ggml_v2_cl_transform_tensor(layer.wo); vram_total += ggml_v2_nbytes(layer.wo); + ggml_v2_cl_transform_tensor(layer.w1); vram_total += ggml_v2_nbytes(layer.w1); + ggml_v2_cl_transform_tensor(layer.w2); vram_total += ggml_v2_nbytes(layer.w2); + ggml_v2_cl_transform_tensor(layer.w3); vram_total += ggml_v2_nbytes(layer.w3); } if (n_gpu_layers > (int) hparams.n_layer) { fprintf(stderr, "%s: [opencl] offloading output layer to GPU\n", __func__); - ggml_cl_transform_tensor(model.output); vram_total += ggml_nbytes(model.output); + ggml_v2_cl_transform_tensor(model.output); vram_total += ggml_v2_nbytes(model.output); } fprintf(stderr, "%s: [opencl] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024); @@ -1128,22 +1128,22 @@ static void llama_model_load_internal( // loading time will be recalculate after the first eval, so // we take page faults deferred by mmap() into consideration - lctx.t_load_us = ggml_time_us() - lctx.t_start_us; + lctx.t_load_us = ggml_v2_time_us() - lctx.t_start_us; } -static bool llama_model_load( +static bool llama_v2_model_load( const std::string & fname, - llama_context & lctx, + llama_v2_context & lctx, int n_ctx, int n_gpu_layers, - ggml_type memory_type, + ggml_v2_type memory_type, bool use_mmap, bool use_mlock, bool vocab_only, - llama_progress_callback progress_callback, + llama_v2_progress_callback progress_callback, void *progress_callback_user_data) { try { - llama_model_load_internal(fname, lctx, n_ctx, n_gpu_layers, memory_type, use_mmap, use_mlock, + llama_v2_model_load_internal(fname, lctx, n_ctx, n_gpu_layers, memory_type, use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data); return true; } catch (const std::string & err) { @@ -1159,20 +1159,20 @@ static bool llama_model_load( // - n_past: the context size so far // - n_threads: number of threads to use // -static bool llama_eval_internal( - llama_context & lctx, - const llama_token * tokens, +static bool llama_v2_eval_internal( + llama_v2_context & lctx, + const llama_v2_token * tokens, const int n_tokens, const int n_past, const int n_threads) { // enforce that the first token is BOS (not needed, messes with my context manip code) - //if (n_past == 0 && tokens[0] != llama_token_bos()) { + //if (n_past == 0 && tokens[0] != llama_v2_token_bos()) { //fprintf(stderr, "%s: first token must be BOS\n", __func__); // return false; //never fail. Not even in the face of Armageddon. 
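
// Illustrative sketch, not part of the patch: the per-head arithmetic that the
// ggml_v2 graph built below expresses (KQ scaled by 1/sqrt(n_embd/n_head),
// softmax, then the weighted sum over cached V) for a single new token, which
// attends to every cached position so no causal mask is needed here. The plain
// float arrays, sizes and names are invented stand-ins for the KV-cache tensors.
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
    const int head_dim = 4;   // n_embd / n_head
    const int n_kv     = 3;   // n_past + N cached positions

    std::vector<float> q(head_dim, 0.5f);           // RoPE'd query of the new token
    std::vector<float> K(n_kv * head_dim, 0.25f);   // cached, RoPE'd keys
    std::vector<float> V(n_kv * head_dim, 1.0f);    // cached values

    // KQ_scaled = (K * q) / sqrt(head_dim), then softmax -> attention weights
    std::vector<float> w(n_kv);
    float maxw = -1e30f;
    for (int t = 0; t < n_kv; ++t) {
        float dot = 0.0f;
        for (int d = 0; d < head_dim; ++d) dot += K[t * head_dim + d] * q[d];
        w[t] = dot / std::sqrt(float(head_dim));
        maxw = std::max(maxw, w[t]);
    }
    float sum = 0.0f;
    for (float & x : w) { x = std::exp(x - maxw); sum += x; }
    for (float & x : w) { x /= sum; }

    // KQV: weighted sum of the cached values
    std::vector<float> out(head_dim, 0.0f);
    for (int t = 0; t < n_kv; ++t)
        for (int d = 0; d < head_dim; ++d)
            out[d] += w[t] * V[t * head_dim + d];

    for (int d = 0; d < head_dim; ++d) printf("%.3f ", out[d]);
    printf("\n");
    return 0;
}
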
//} - const int64_t t_start_us = ggml_time_us(); + const int64_t t_start_us = ggml_v2_time_us(); const int N = n_tokens; @@ -1181,7 +1181,7 @@ static bool llama_eval_internal( const auto & kv_self = model.kv_self; - LLAMA_ASSERT(!!kv_self.ctx); + LLAMA_V2_ASSERT(!!kv_self.ctx); const int n_embd = hparams.n_embd; const int n_layer = hparams.n_layer; @@ -1193,171 +1193,171 @@ static bool llama_eval_internal( auto & mem_per_token = lctx.mem_per_token; auto & buf_compute = lctx.buf_compute; - struct ggml_init_params params = { + struct ggml_v2_init_params params = { /*.mem_size =*/ buf_compute.size, /*.mem_buffer =*/ buf_compute.addr, /*.no_alloc =*/ false, }; - struct ggml_context * ctx0 = ggml_init(params); + struct ggml_v2_context * ctx0 = ggml_v2_init(params); // for big prompts, if BLAS is enabled, it is better to use only one thread // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance - ggml_cgraph gf = {}; - gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads; + ggml_v2_cgraph gf = {}; + gf.n_threads = N >= 32 && ggml_v2_cpu_has_blas() && !ggml_v2_cpu_has_gpublas() ? 1 : n_threads; - struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); - ggml_set_name(embd, "embd"); - memcpy(embd->data, tokens, N*ggml_element_size(embd)); + struct ggml_v2_tensor * embd = ggml_v2_new_tensor_1d(ctx0, GGML_V2_TYPE_I32, N); + ggml_v2_set_name(embd, "embd"); + memcpy(embd->data, tokens, N*ggml_v2_element_size(embd)); - struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd); + struct ggml_v2_tensor * inpL = ggml_v2_get_rows(ctx0, model.tok_embeddings, embd); for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; + struct ggml_v2_tensor * inpSA = inpL; - struct ggml_tensor * cur; + struct ggml_v2_tensor * cur; lctx.use_buf(ctx0, 0); // norm { - cur = ggml_rms_norm(ctx0, inpL); + cur = ggml_v2_rms_norm(ctx0, inpL); // cur = attention_norm*cur - cur = ggml_mul(ctx0, - ggml_repeat(ctx0, model.layers[il].attention_norm, cur), + cur = ggml_v2_mul(ctx0, + ggml_v2_repeat(ctx0, model.layers[il].attention_norm, cur), cur); } // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0); - struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0); - ggml_set_name(Qcur, "Qcur"); - ggml_set_name(Kcur, "Kcur"); + struct ggml_v2_tensor * Qcur = ggml_v2_rope_inplace(ctx0, ggml_v2_reshape_3d(ctx0, ggml_v2_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0); + struct ggml_v2_tensor * Kcur = ggml_v2_rope_inplace(ctx0, ggml_v2_reshape_3d(ctx0, ggml_v2_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0); + ggml_v2_set_name(Qcur, "Qcur"); + ggml_v2_set_name(Kcur, "Kcur"); // store key and value to memory { // compute the transposed [N, n_embd] V matrix - struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), n_embd, N)); + struct ggml_v2_tensor * Vcur = ggml_v2_transpose(ctx0, ggml_v2_reshape_2d(ctx0, ggml_v2_mul_mat(ctx0, model.layers[il].wv, cur), n_embd, N)); - struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past)); - struct 
ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd, - ( n_ctx)*ggml_element_size(kv_self.v), - (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v)); + struct ggml_v2_tensor * k = ggml_v2_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_v2_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past)); + struct ggml_v2_tensor * v = ggml_v2_view_2d(ctx0, kv_self.v, N, n_embd, + ( n_ctx)*ggml_v2_element_size(kv_self.v), + (il*n_ctx)*ggml_v2_element_size(kv_self.v)*n_embd + n_past*ggml_v2_element_size(kv_self.v)); // important: storing RoPE-ed version of K in the KV cache! - ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k)); - ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v)); + ggml_v2_build_forward_expand(&gf, ggml_v2_cpy(ctx0, Kcur, k)); + ggml_v2_build_forward_expand(&gf, ggml_v2_cpy(ctx0, Vcur, v)); } - struct ggml_tensor * Q = - ggml_permute(ctx0, + struct ggml_v2_tensor * Q = + ggml_v2_permute(ctx0, Qcur, 0, 2, 1, 3); - ggml_set_name(Q, "Q"); + ggml_v2_set_name(Q, "Q"); - struct ggml_tensor * K = - ggml_permute(ctx0, - ggml_reshape_3d(ctx0, - ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd), + struct ggml_v2_tensor * K = + ggml_v2_permute(ctx0, + ggml_v2_reshape_3d(ctx0, + ggml_v2_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_v2_element_size(kv_self.k)*n_embd), n_embd/n_head, n_head, n_past + N), 0, 2, 1, 3); - ggml_set_name(K, "K"); + ggml_v2_set_name(K, "K"); // K * Q - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - ggml_set_name(KQ, "KQ"); + struct ggml_v2_tensor * KQ = ggml_v2_mul_mat(ctx0, K, Q); + ggml_v2_set_name(KQ, "KQ"); // KQ_scaled = KQ / sqrt(n_embd/n_head) - struct ggml_tensor * KQ_scale = ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head)); - ggml_set_name(KQ_scale, "1/sqrt(n_embd/n_head)"); + struct ggml_v2_tensor * KQ_scale = ggml_v2_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head)); + ggml_v2_set_name(KQ_scale, "1/sqrt(n_embd/n_head)"); // KQ_scaled shape [n_past + N, N, n_head, 1] - struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale); - ggml_set_name(KQ_scaled, "KQ_scaled"); + struct ggml_v2_tensor * KQ_scaled = ggml_v2_scale_inplace(ctx0, KQ, KQ_scale); + ggml_v2_set_name(KQ_scaled, "KQ_scaled"); // KQ_masked = mask_past(KQ_scaled) - struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); - ggml_set_name(KQ_masked, "KQ_masked"); + struct ggml_v2_tensor * KQ_masked = ggml_v2_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); + ggml_v2_set_name(KQ_masked, "KQ_masked"); // KQ = soft_max(KQ_masked) - struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); - ggml_set_name(KQ_soft_max, "KQ_soft_max"); + struct ggml_v2_tensor * KQ_soft_max = ggml_v2_soft_max_inplace(ctx0, KQ_masked); + ggml_v2_set_name(KQ_soft_max, "KQ_soft_max"); // split cached V into n_head heads - struct ggml_tensor * V = - ggml_view_3d(ctx0, kv_self.v, + struct ggml_v2_tensor * V = + ggml_v2_view_3d(ctx0, kv_self.v, n_past + N, n_embd/n_head, n_head, - n_ctx*ggml_element_size(kv_self.v), - n_ctx*ggml_element_size(kv_self.v)*n_embd/n_head, - il*n_ctx*ggml_element_size(kv_self.v)*n_embd); - ggml_set_name(V, "V"); + n_ctx*ggml_v2_element_size(kv_self.v), + n_ctx*ggml_v2_element_size(kv_self.v)*n_embd/n_head, + il*n_ctx*ggml_v2_element_size(kv_self.v)*n_embd); + ggml_v2_set_name(V, "V"); #if 1 - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - ggml_set_name(KQV, "KQV"); + struct ggml_v2_tensor * KQV = ggml_v2_mul_mat(ctx0, V, 
KQ_soft_max); + ggml_v2_set_name(KQV, "KQV"); #else // make V contiguous in memory to speed up the matmul, however we waste time on the copy // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation // is there a better way? - struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd/n_head, n_head)); - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max); + struct ggml_v2_tensor * V_cont = ggml_v2_cpy(ctx0, V, ggml_v2_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd/n_head, n_head)); + struct ggml_v2_tensor * KQV = ggml_v2_mul_mat(ctx0, V_cont, KQ_soft_max); #endif // KQV_merged = KQV.permute(0, 2, 1, 3) - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - ggml_set_name(KQV_merged, "KQV_merged"); + struct ggml_v2_tensor * KQV_merged = ggml_v2_permute(ctx0, KQV, 0, 2, 1, 3); + ggml_v2_set_name(KQV_merged, "KQV_merged"); // cur = KQV_merged.contiguous().view(n_embd, N) - cur = ggml_cpy(ctx0, + cur = ggml_v2_cpy(ctx0, KQV_merged, - ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); - ggml_set_name(cur, "KQV_merged_contiguous"); + ggml_v2_new_tensor_2d(ctx0, GGML_V2_TYPE_F32, n_embd, N)); + ggml_v2_set_name(cur, "KQV_merged_contiguous"); // projection (no bias) - cur = ggml_mul_mat(ctx0, + cur = ggml_v2_mul_mat(ctx0, model.layers[il].wo, cur); } lctx.use_buf(ctx0, 1); - struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA); + struct ggml_v2_tensor * inpFF = ggml_v2_add(ctx0, cur, inpSA); // feed-forward network { // norm { - cur = ggml_rms_norm(ctx0, inpFF); + cur = ggml_v2_rms_norm(ctx0, inpFF); // cur = ffn_norm*cur - cur = ggml_mul(ctx0, - ggml_repeat(ctx0, model.layers[il].ffn_norm, cur), + cur = ggml_v2_mul(ctx0, + ggml_v2_repeat(ctx0, model.layers[il].ffn_norm, cur), cur); } - struct ggml_tensor * tmp = ggml_mul_mat(ctx0, + struct ggml_v2_tensor * tmp = ggml_v2_mul_mat(ctx0, model.layers[il].w3, cur); - cur = ggml_mul_mat(ctx0, + cur = ggml_v2_mul_mat(ctx0, model.layers[il].w1, cur); // SILU activation - cur = ggml_silu(ctx0, cur); + cur = ggml_v2_silu(ctx0, cur); - cur = ggml_mul(ctx0, cur, tmp); + cur = ggml_v2_mul(ctx0, cur, tmp); - cur = ggml_mul_mat(ctx0, + cur = ggml_v2_mul_mat(ctx0, model.layers[il].w2, cur); } - cur = ggml_add(ctx0, cur, inpFF); + cur = ggml_v2_add(ctx0, cur, inpFF); // input for next layer inpL = cur; @@ -1366,46 +1366,46 @@ static bool llama_eval_internal( lctx.use_buf(ctx0, 0); // used at the end to optionally extract the embeddings - struct ggml_tensor * embeddings = NULL; + struct ggml_v2_tensor * embeddings = NULL; // norm { - inpL = ggml_rms_norm(ctx0, inpL); + inpL = ggml_v2_rms_norm(ctx0, inpL); // inpL = norm*inpL - inpL = ggml_mul(ctx0, - ggml_repeat(ctx0, model.norm, inpL), + inpL = ggml_v2_mul(ctx0, + ggml_v2_repeat(ctx0, model.norm, inpL), inpL); embeddings = inpL; } // lm_head - inpL = ggml_mul_mat(ctx0, model.output, inpL); + inpL = ggml_v2_mul_mat(ctx0, model.output, inpL); lctx.use_buf(ctx0, -1); // logits -> probs - //inpL = ggml_soft_max_inplace(ctx0, inpL); + //inpL = ggml_v2_soft_max_inplace(ctx0, inpL); // run the computation - ggml_build_forward_expand(&gf, inpL); - ggml_graph_compute (ctx0, &gf); + ggml_v2_build_forward_expand(&gf, inpL); + ggml_v2_graph_compute (ctx0, &gf); -#ifdef GGML_PERF +#ifdef GGML_V2_PERF // print timing information per ggml operation (for debugging purposes) - // requires GGML_PERF to be defined - ggml_graph_print(&gf); + // requires GGML_V2_PERF to be defined + 
ggml_v2_graph_print(&gf); #endif // plot the computation graph in dot format (for debugging purposes) //if (n_past%100 == 0) { - // ggml_graph_dump_dot(&gf, NULL, "llama.dot"); + // ggml_v2_graph_dump_dot(&gf, NULL, "llama.dot"); //} //embd_w.resize(n_vocab*N); - //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N); + //memcpy(embd_w.data(), ggml_v2_get_data(inpL), sizeof(float)*n_vocab*N); // update kv token count lctx.model.kv_self.n = n_past + N; @@ -1416,11 +1416,11 @@ static bool llama_eval_internal( if (lctx.logits_all) { logits_out.resize(n_vocab * N); - memcpy(logits_out.data(), (float *) ggml_get_data(inpL), sizeof(float)*n_vocab*N); + memcpy(logits_out.data(), (float *) ggml_v2_get_data(inpL), sizeof(float)*n_vocab*N); } else { // return result for just the last token logits_out.resize(n_vocab); - memcpy(logits_out.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab); + memcpy(logits_out.data(), (float *) ggml_v2_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab); } } @@ -1429,29 +1429,29 @@ static bool llama_eval_internal( auto & embedding_out = lctx.embedding; embedding_out.resize(n_embd); - memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(N - 1)), sizeof(float)*n_embd); + memcpy(embedding_out.data(), (float *) ggml_v2_get_data(embeddings) + (n_embd*(N - 1)), sizeof(float)*n_embd); } if (mem_per_token == 0) { - mem_per_token = ggml_used_mem(ctx0)/N; + mem_per_token = ggml_v2_used_mem(ctx0)/N; } #if 0 printf("\n%s: used_mem = %.3f MB, scratch -- %.3f MB %.3f MB\n", __func__, - ggml_used_mem(ctx0)/1024.0/1024.0, + ggml_v2_used_mem(ctx0)/1024.0/1024.0, lctx.get_buf_max_mem(0)/1024.0/1024.0, lctx.get_buf_max_mem(1)/1024.0/1024.0); #endif - ggml_free(ctx0); + ggml_v2_free(ctx0); // measure the performance only for the single-token evals if (N == 1) { - lctx.t_eval_us += ggml_time_us() - t_start_us; + lctx.t_eval_us += ggml_v2_time_us() - t_start_us; lctx.n_eval++; } else if (N > 1) { - lctx.t_p_eval_us += ggml_time_us() - t_start_us; + lctx.t_p_eval_us += ggml_v2_time_us() - t_start_us; lctx.n_p_eval += N; } @@ -1468,7 +1468,7 @@ static size_t utf8_len(char src) { return lookup[highbits]; } -struct llama_sp_symbol { +struct llama_v2_sp_symbol { using index = int; index prev; index next; @@ -1476,33 +1476,33 @@ struct llama_sp_symbol { size_t n; }; -static_assert(std::is_trivially_copyable::value, "llama_sp_symbol is not trivially copyable"); +static_assert(std::is_trivially_copyable::value, "llama_v2_sp_symbol is not trivially copyable"); -struct llama_sp_bigram { +struct llama_v2_sp_bigram { struct comparator { - bool operator()(llama_sp_bigram & l, llama_sp_bigram & r) { + bool operator()(llama_v2_sp_bigram & l, llama_v2_sp_bigram & r) { return (l.score < r.score) || (l.score == r.score && l.left > r.left); } }; - using queue_storage = std::vector; - using queue = std::priority_queue; - llama_sp_symbol::index left; - llama_sp_symbol::index right; + using queue_storage = std::vector; + using queue = std::priority_queue; + llama_v2_sp_symbol::index left; + llama_v2_sp_symbol::index right; float score; size_t size; }; // original implementation: // https://github.com/ggerganov/llama.cpp/commit/074bea2eb1f1349a0118239c4152914aecaa1be4 -struct llama_tokenizer { - llama_tokenizer(const llama_vocab & vocab): vocab_(vocab) {} +struct llama_v2_tokenizer { + llama_v2_tokenizer(const llama_v2_vocab & vocab): vocab_(vocab) {} - void tokenize(const std::string & text, std::vector & output) { + void tokenize(const 
std::string & text, std::vector & output) { // split string into utf8 chars int index = 0; size_t offs = 0; while (offs < text.size()) { - llama_sp_symbol sym; + llama_v2_sp_symbol sym; size_t char_len = std::min(text.size() - offs, utf8_len(text[offs])); sym.text = text.c_str() + offs; sym.n = char_len; @@ -1556,7 +1556,7 @@ struct llama_tokenizer { if (token == vocab_.token_to_id.end()) { // output any symbols that did not form tokens as bytes. for (int j = 0; j < (int) symbol.n; ++j) { - llama_vocab::id token_id = static_cast(symbol.text[j]) + 3; + llama_v2_vocab::id token_id = static_cast(symbol.text[j]) + 3; output.push_back(token_id); } } else { @@ -1584,7 +1584,7 @@ private: const auto &tok_score = vocab_.id_to_token[(*token).second]; - llama_sp_bigram bigram; + llama_v2_sp_bigram bigram; bigram.left = left; bigram.right = right; bigram.score = tok_score.score; @@ -1592,21 +1592,21 @@ private: work_queue_.push(bigram); } - const llama_vocab & vocab_; - std::vector symbols_; - llama_sp_bigram::queue work_queue_; + const llama_v2_vocab & vocab_; + std::vector symbols_; + llama_v2_sp_bigram::queue work_queue_; }; -static std::vector llama_tokenize(const llama_vocab & vocab, const std::string & text, bool bos) { - llama_tokenizer tokenizer(vocab); - std::vector output; +static std::vector llama_v2_tokenize(const llama_v2_vocab & vocab, const std::string & text, bool bos) { + llama_v2_tokenizer tokenizer(vocab); + std::vector output; if (text.empty()) { return output; } if (bos) { - output.push_back(llama_token_bos()); + output.push_back(llama_v2_token_bos()); } tokenizer.tokenize(text, output); @@ -1617,14 +1617,14 @@ static std::vector llama_tokenize(const llama_vocab & vocab, co // sampling // -void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates) { +void llama_v2_sample_softmax(struct llama_v2_context * ctx, llama_v2_token_data_array * candidates) { assert(candidates->size > 0); - const int64_t t_start_sample_us = ggml_time_us(); + const int64_t t_start_sample_us = ggml_v2_time_us(); // Sort the logits in descending order if (!candidates->sorted) { - std::sort(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) { + std::sort(candidates->data, candidates->data + candidates->size, [](const llama_v2_token_data & a, const llama_v2_token_data & b) { return a.logit > b.logit; }); candidates->sorted = true; @@ -1642,19 +1642,19 @@ void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * c } if (ctx) { - ctx->t_sample_us += ggml_time_us() - t_start_sample_us; + ctx->t_sample_us += ggml_v2_time_us() - t_start_sample_us; } } -void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep) { - const int64_t t_start_sample_us = ggml_time_us(); +void llama_v2_sample_top_k(struct llama_v2_context * ctx, llama_v2_token_data_array * candidates, int k, size_t min_keep) { + const int64_t t_start_sample_us = ggml_v2_time_us(); k = std::max(k, (int) min_keep); k = std::min(k, (int) candidates->size); // Sort scores in descending order if (!candidates->sorted) { - auto comp = [](const llama_token_data & a, const llama_token_data & b) { + auto comp = [](const llama_v2_token_data & a, const llama_v2_token_data & b) { return a.logit > b.logit; }; if (k == (int) candidates->size) { @@ -1667,18 +1667,18 @@ void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * can candidates->size = k; if (ctx) { - ctx->t_sample_us += 
ggml_time_us() - t_start_sample_us; + ctx->t_sample_us += ggml_v2_time_us() - t_start_sample_us; } } -void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) { +void llama_v2_sample_top_p(struct llama_v2_context * ctx, llama_v2_token_data_array * candidates, float p, size_t min_keep) { if (p >= 1.0f) { return; } - const int64_t t_start_sample_us = ggml_time_us(); + const int64_t t_start_sample_us = ggml_v2_time_us(); - llama_sample_softmax(ctx, candidates); + llama_v2_sample_softmax(ctx, candidates); // Compute the cumulative probabilities float cum_sum = 0.0f; @@ -1698,18 +1698,18 @@ void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * can candidates->size = last_idx; if (ctx) { - ctx->t_sample_us += ggml_time_us() - t_start_sample_us; + ctx->t_sample_us += ggml_v2_time_us() - t_start_sample_us; } } -void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep) { +void llama_v2_sample_tail_free(struct llama_v2_context * ctx, llama_v2_token_data_array * candidates, float z, size_t min_keep) { if (z >= 1.0f || candidates->size <= 2) { return; } - const int64_t t_start_sample_us = ggml_time_us(); + const int64_t t_start_sample_us = ggml_v2_time_us(); - llama_sample_softmax(nullptr, candidates); + llama_v2_sample_softmax(nullptr, candidates); // Compute the first and second derivatives std::vector first_derivatives(candidates->size - 1); @@ -1749,22 +1749,22 @@ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates->size = last_idx; if (ctx) { - ctx->t_sample_us += ggml_time_us() - t_start_sample_us; + ctx->t_sample_us += ggml_v2_time_us() - t_start_sample_us; } } -void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) { +void llama_v2_sample_typical(struct llama_v2_context * ctx, llama_v2_token_data_array * candidates, float p, size_t min_keep) { // Reference implementation: // https://github.com/huggingface/transformers/compare/main...cimeister:typical-sampling:typical-pr if (p >= 1.0f) { return; } - const int64_t t_start_sample_us = ggml_time_us(); + const int64_t t_start_sample_us = ggml_v2_time_us(); // Compute the softmax of logits and calculate entropy - llama_sample_softmax(nullptr, candidates); + llama_v2_sample_softmax(nullptr, candidates); float entropy = 0.0f; for (size_t i = 0; i < candidates->size; ++i) { @@ -1802,7 +1802,7 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c } // Resize the output vector to keep only the locally typical tokens - std::vector new_candidates; + std::vector new_candidates; for (size_t i = 0; i < last_idx; ++i) { size_t idx = indices[i]; new_candidates.push_back(candidates->data[idx]); @@ -1813,28 +1813,28 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c candidates->size = new_candidates.size(); if (ctx) { - ctx->t_sample_us += ggml_time_us() - t_start_sample_us; + ctx->t_sample_us += ggml_v2_time_us() - t_start_sample_us; } } -void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) { - const int64_t t_start_sample_us = ggml_time_us(); +void llama_v2_sample_temperature(struct llama_v2_context * ctx, llama_v2_token_data_array * candidates_p, float temp) { + const int64_t t_start_sample_us = ggml_v2_time_us(); for (size_t i = 0; i < candidates_p->size; ++i) { candidates_p->data[i].logit /= temp; } if 
(ctx) { - ctx->t_sample_us += ggml_time_us() - t_start_sample_us; + ctx->t_sample_us += ggml_v2_time_us() - t_start_sample_us; } } -void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty) { +void llama_v2_sample_repetition_penalty(struct llama_v2_context * ctx, llama_v2_token_data_array * candidates, const llama_v2_token * last_tokens, size_t last_tokens_size, float penalty) { if (last_tokens_size == 0 || penalty == 1.0f) { return; } - const int64_t t_start_sample_us = ggml_time_us(); + const int64_t t_start_sample_us = ggml_v2_time_us(); for (size_t i = 0; i < candidates->size; ++i) { const auto * token_iter = std::find(last_tokens, last_tokens + last_tokens_size, candidates->data[i].id); @@ -1854,19 +1854,19 @@ void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_dat candidates->sorted = false; if (ctx) { - ctx->t_sample_us += ggml_time_us() - t_start_sample_us; + ctx->t_sample_us += ggml_v2_time_us() - t_start_sample_us; } } -void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens_p, size_t last_tokens_size, float alpha_frequency, float alpha_presence) { +void llama_v2_sample_frequency_and_presence_penalties(struct llama_v2_context * ctx, llama_v2_token_data_array * candidates, const llama_v2_token * last_tokens_p, size_t last_tokens_size, float alpha_frequency, float alpha_presence) { if (last_tokens_size == 0 || (alpha_frequency == 0.0f && alpha_presence == 0.0f)) { return; } - const int64_t t_start_sample_us = ggml_time_us(); + const int64_t t_start_sample_us = ggml_v2_time_us(); // Create a frequency map to count occurrences of each token in last_tokens - std::unordered_map token_count; + std::unordered_map token_count; for (size_t i = 0; i < last_tokens_size; ++i) { token_count[last_tokens_p[i]]++; } @@ -1885,18 +1885,18 @@ void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, l candidates->sorted = false; if (ctx) { - ctx->t_sample_us += ggml_time_us() - t_start_sample_us; + ctx->t_sample_us += ggml_v2_time_us() - t_start_sample_us; } } -llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu) { +llama_v2_token llama_v2_sample_token_mirostat(struct llama_v2_context * ctx, llama_v2_token_data_array * candidates, float tau, float eta, int m, float * mu) { assert(ctx); - auto N = float(llama_n_vocab(ctx)); + auto N = float(llama_v2_n_vocab(ctx)); int64_t t_start_sample_us; - t_start_sample_us = ggml_time_us(); + t_start_sample_us = ggml_v2_time_us(); - llama_sample_softmax(nullptr, candidates); + llama_v2_sample_softmax(nullptr, candidates); // Estimate s_hat using the most probable m tokens float s_hat = 0.0; @@ -1915,15 +1915,15 @@ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_ float k = powf((epsilon_hat * powf(2, *mu)) / (1 - powf(N, -epsilon_hat)), 1 / s_hat); // Sample the next word X using top-k sampling - llama_sample_top_k(nullptr, candidates, int(k), 1); + llama_v2_sample_top_k(nullptr, candidates, int(k), 1); if (ctx) { - ctx->t_sample_us += ggml_time_us() - t_start_sample_us; + ctx->t_sample_us += ggml_v2_time_us() - t_start_sample_us; } - llama_token X = llama_sample_token(ctx, candidates); - t_start_sample_us = ggml_time_us(); + llama_v2_token X = llama_v2_sample_token(ctx, candidates); + 
t_start_sample_us = ggml_v2_time_us(); // Compute error as the difference between observed surprise and target surprise value - size_t X_idx = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) { + size_t X_idx = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_v2_token_data & candidate) { return candidate.id == X; })); float observed_surprise = -log2f(candidates->data[X_idx].p); @@ -1933,36 +1933,36 @@ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_ *mu = *mu - eta * e; if (ctx) { - ctx->t_sample_us += ggml_time_us() - t_start_sample_us; + ctx->t_sample_us += ggml_v2_time_us() - t_start_sample_us; ctx->n_sample++; } return X; } -llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu) { +llama_v2_token llama_v2_sample_token_mirostat_v2(struct llama_v2_context * ctx, llama_v2_token_data_array * candidates, float tau, float eta, float * mu) { assert(ctx); int64_t t_start_sample_us; - t_start_sample_us = ggml_time_us(); + t_start_sample_us = ggml_v2_time_us(); - llama_sample_softmax(ctx, candidates); + llama_v2_sample_softmax(ctx, candidates); // Truncate the words with surprise values greater than mu - candidates->size = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) { + candidates->size = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_v2_token_data & candidate) { return -log2f(candidate.p) > *mu; })); // Normalize the probabilities of the remaining words - llama_sample_softmax(ctx, candidates); + llama_v2_sample_softmax(ctx, candidates); // Sample the next word X from the remaining words if (ctx) { - ctx->t_sample_us += ggml_time_us() - t_start_sample_us; + ctx->t_sample_us += ggml_v2_time_us() - t_start_sample_us; } - llama_token X = llama_sample_token(ctx, candidates); - t_start_sample_us = ggml_time_us(); + llama_v2_token X = llama_v2_sample_token(ctx, candidates); + t_start_sample_us = ggml_v2_time_us(); // Compute error as the difference between observed surprise and target surprise value - size_t X_idx = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) { + size_t X_idx = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_v2_token_data & candidate) { return candidate.id == X; })); float observed_surprise = -log2f(candidates->data[X_idx].p); @@ -1972,31 +1972,31 @@ llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_tok *mu = *mu - eta * e; if (ctx) { - ctx->t_sample_us += ggml_time_us() - t_start_sample_us; + ctx->t_sample_us += ggml_v2_time_us() - t_start_sample_us; } return X; } -llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates) { - const int64_t t_start_sample_us = ggml_time_us(); +llama_v2_token llama_v2_sample_token_greedy(struct llama_v2_context * ctx, llama_v2_token_data_array * candidates) { + const int64_t t_start_sample_us = ggml_v2_time_us(); // Find max element - auto * max_iter = std::max_element(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) { + 
auto * max_iter = std::max_element(candidates->data, candidates->data + candidates->size, [](const llama_v2_token_data & a, const llama_v2_token_data & b) { return a.logit < b.logit; }); - llama_token result = max_iter->id; + llama_v2_token result = max_iter->id; if (ctx) { - ctx->t_sample_us += ggml_time_us() - t_start_sample_us; + ctx->t_sample_us += ggml_v2_time_us() - t_start_sample_us; ctx->n_sample++; } return result; } -llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates) { +llama_v2_token llama_v2_sample_token(struct llama_v2_context * ctx, llama_v2_token_data_array * candidates) { assert(ctx); - const int64_t t_start_sample_us = ggml_time_us(); - llama_sample_softmax(nullptr, candidates); + const int64_t t_start_sample_us = ggml_v2_time_us(); + llama_v2_sample_softmax(nullptr, candidates); std::vector probs; probs.reserve(candidates->size); @@ -2008,9 +2008,9 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra auto & rng = ctx->rng; int idx = dist(rng); - llama_token result = candidates->data[idx].id; + llama_v2_token result = candidates->data[idx].id; - ctx->t_sample_us += ggml_time_us() - t_start_sample_us; + ctx->t_sample_us += ggml_v2_time_us() - t_start_sample_us; ctx->n_sample++; return result; } @@ -2019,16 +2019,16 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra // quantization // -static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype ftype, int nthread) { - ggml_type quantized_type; +static void llama_v2_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_v2_ftype ftype, int nthread) { + ggml_v2_type quantized_type; switch (ftype) { - case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break; - case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break; - case LLAMA_FTYPE_MOSTLY_Q4_2: quantized_type = GGML_TYPE_Q4_2; break; - case LLAMA_FTYPE_MOSTLY_Q4_3: quantized_type = GGML_TYPE_Q4_3; break; - case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break; - case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break; - case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break; + case LLAMA_V2_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_V2_TYPE_Q4_0; break; + case LLAMA_V2_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_V2_TYPE_Q4_1; break; + case LLAMA_V2_FTYPE_MOSTLY_Q4_2: quantized_type = GGML_V2_TYPE_Q4_2; break; + case LLAMA_V2_FTYPE_MOSTLY_Q4_3: quantized_type = GGML_V2_TYPE_Q4_3; break; + case LLAMA_V2_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_V2_TYPE_Q5_0; break; + case LLAMA_V2_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_V2_TYPE_Q5_1; break; + case LLAMA_V2_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_V2_TYPE_Q8_0; break; default: throw format("invalid output file type %d\n", ftype); }; @@ -2036,9 +2036,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s nthread = std::thread::hardware_concurrency(); } - std::unique_ptr model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false, + std::unique_ptr model_loader(new llama_v2_model_loader(fname_inp, /*use_mmap*/ false, /*vocab_only*/ false)); - llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), ftype); + llama_v2_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), ftype); size_t total_size_org = 0; size_t total_size_new = 0; @@ -2048,16 +2048,16 @@ static void 
llama_model_quantize_internal(const std::string & fname_inp, const s std::mutex mutex; size_t idx = 0; - for (llama_load_tensor & tensor : model_loader->tensors_map.tensors) { - llama_buffer read_data; + for (llama_v2_load_tensor & tensor : model_loader->tensors_map.tensors) { + llama_v2_buffer read_data; read_data.resize(tensor.size); tensor.data = read_data.addr; model_loader->load_data_for(tensor); printf("[%4zu/%4zu] %36s - %16s, type = %6s, ", ++idx, model_loader->tensors_map.tensors.size(), - tensor.name.c_str(), llama_format_tensor_shape(tensor.ne).c_str(), - ggml_type_name(tensor.type)); + tensor.name.c_str(), llama_v2_format_tensor_shape(tensor.ne).c_str(), + ggml_v2_type_name(tensor.type)); // This used to be a regex, but has an extreme cost to compile times. bool quantize = tensor.name.rfind("weight") == tensor.name.size() - 6; // ends with 'weight'? @@ -2070,10 +2070,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s // quantize = false; //} - enum ggml_type new_type; + enum ggml_v2_type new_type; void * new_data; size_t new_size; - llama_buffer work; + llama_v2_buffer work; if (!quantize) { new_type = tensor.type; @@ -2084,18 +2084,18 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s new_type = quantized_type; float * f32_data; size_t nelements = tensor.ne.at(0) * tensor.ne.at(1); - llama_buffer f32_conv_buf; - if (tensor.type == GGML_TYPE_F32) { + llama_v2_buffer f32_conv_buf; + if (tensor.type == GGML_V2_TYPE_F32) { f32_data = (float *) tensor.data; - } else if (tensor.type == GGML_TYPE_F16) { + } else if (tensor.type == GGML_V2_TYPE_F16) { f32_conv_buf.resize(nelements * sizeof(float)); f32_data = (float *) f32_conv_buf.addr; - const auto * f16_data = (const ggml_fp16_t *) tensor.data; + const auto * f16_data = (const ggml_v2_fp16_t *) tensor.data; for (size_t i = 0; i < nelements; i++) { - f32_data[i] = ggml_fp16_to_fp32(f16_data[i]); + f32_data[i] = ggml_v2_fp16_to_fp32(f16_data[i]); } } else { - throw format("type %s unsupported for integer quantization", ggml_type_name(tensor.type)); + throw format("type %s unsupported for integer quantization", ggml_v2_type_name(tensor.type)); } printf("quantizing .. "); @@ -2109,7 +2109,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s const int nchunk = (nelements + chunk_size - 1)/chunk_size; const int nthread_use = nthread > 1 ? 
std::max(1, std::min(nthread, nchunk)) : 1; if (nthread_use < 2) { - new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nelements, hist_cur.data()); + new_size = ggml_v2_quantize_chunk(new_type, f32_data, new_data, 0, nelements, hist_cur.data()); } else { size_t counter = 0; new_size = 0; @@ -2133,7 +2133,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s if (local_hist.empty()) { local_hist.resize(hist_cur.size(), 0); } - local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data()); + local_size += ggml_v2_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data()); } }; if ((int) workers.size() < nthread_use - 1) { @@ -2184,12 +2184,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s // interface implementation // -struct llama_context * llama_init_from_file( +struct llama_v2_context * llama_v2_init_from_file( const char * path_model, - struct llama_context_params params) { - ggml_time_init(); + struct llama_v2_context_params params) { + ggml_v2_time_init(); - llama_context * ctx = new llama_context; + llama_v2_context * ctx = new llama_v2_context; if (params.seed < 0) { params.seed = time(NULL); @@ -2215,13 +2215,13 @@ struct llama_context * llama_init_from_file( ctx->rng = std::mt19937(params.seed); ctx->logits_all = params.logits_all; - ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32; + ggml_v2_type memory_type = params.f16_kv ? GGML_V2_TYPE_F16 : GGML_V2_TYPE_F32; - if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_gpu_layers, memory_type, + if (!llama_v2_model_load(path_model, *ctx, params.n_ctx, params.n_gpu_layers, memory_type, params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback, params.progress_callback_user_data)) { fprintf(stderr, "%s: failed to load model\n", __func__); - llama_free(ctx); + llama_v2_free(ctx); return nullptr; } @@ -2229,12 +2229,12 @@ struct llama_context * llama_init_from_file( if (!params.vocab_only) { if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx)) { fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__); - llama_free(ctx); + llama_v2_free(ctx); return nullptr; } { - const size_t memory_size = ggml_nbytes(ctx->model.kv_self.k) + ggml_nbytes(ctx->model.kv_self.v); + const size_t memory_size = ggml_v2_nbytes(ctx->model.kv_self.k) + ggml_v2_nbytes(ctx->model.kv_self.v); fprintf(stderr, "%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0); } @@ -2260,17 +2260,17 @@ struct llama_context * llama_init_from_file( return ctx; } -void llama_free(struct llama_context * ctx) { +void llama_v2_free(struct llama_v2_context * ctx) { delete ctx; } -int llama_model_quantize( +int llama_v2_model_quantize( const char * fname_inp, const char * fname_out, - enum llama_ftype ftype, + enum llama_v2_ftype ftype, int nthread) { try { - llama_model_quantize_internal(fname_inp, fname_out, ftype, nthread); + llama_v2_model_quantize_internal(fname_inp, fname_out, ftype, nthread); return 0; } catch (const std::string & err) { fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.c_str()); @@ -2278,12 +2278,12 @@ int llama_model_quantize( } } -int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) { +int llama_v2_apply_lora_from_file_internal(struct llama_v2_context * ctx, const char * 
path_lora, const char * path_base_model, int n_threads) { fprintf(stderr, "%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora); auto & model = ctx->model; - const int64_t t_start_lora_us = ggml_time_us(); + const int64_t t_start_lora_us = ggml_v2_time_us(); auto fin = std::ifstream(path_lora, std::ios::binary); if (!fin) { @@ -2320,46 +2320,46 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * // create a temporary ggml context to store the lora tensors // todo: calculate size from biggest possible tensor std::vector lora_buf(1024ull * 1024ull * 1024ull); - struct ggml_init_params params; + struct ggml_v2_init_params params; params.mem_size = lora_buf.size(); params.mem_buffer = lora_buf.data(); params.no_alloc = false; - ggml_context * lora_ctx = ggml_init(params); - std::unordered_map lora_tensors; + ggml_v2_context * lora_ctx = ggml_v2_init(params); + std::unordered_map lora_tensors; // create a name -> tensor map of the model to accelerate lookups - std::unordered_map model_tensors; + std::unordered_map model_tensors; for (auto & kv: model.tensors_by_name) { model_tensors.insert(kv); } // load base model - std::unique_ptr model_loader; - ggml_context * base_ctx = NULL; - llama_buffer base_buf; + std::unique_ptr model_loader; + ggml_v2_context * base_ctx = NULL; + llama_v2_buffer base_buf; if (path_base_model) { fprintf(stderr, "%s: loading base model from '%s'\n", __func__, path_base_model); - model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*vocab_only*/ false)); + model_loader.reset(new llama_v2_model_loader(path_base_model, /*use_mmap*/ true, /*vocab_only*/ false)); size_t ctx_size; size_t mmapped_size; model_loader->calc_sizes(&ctx_size, &mmapped_size); base_buf.resize(ctx_size); - ggml_init_params base_params; + ggml_v2_init_params base_params; base_params.mem_size = base_buf.size; base_params.mem_buffer = base_buf.addr; base_params.no_alloc = model_loader->use_mmap; - base_ctx = ggml_init(base_params); + base_ctx = ggml_v2_init(base_params); - model_loader->ggml_ctx = base_ctx; + model_loader->ggml_v2_ctx = base_ctx; - // maybe this should in llama_model_loader + // maybe this should in llama_v2_model_loader if (model_loader->use_mmap) { - model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ false)); + model_loader->mapping.reset(new llama_v2_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ false)); } } @@ -2409,10 +2409,10 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * } // create ggml tensor - ggml_type wtype; + ggml_v2_type wtype; switch (ftype) { - case 0: wtype = GGML_TYPE_F32; break; - case 1: wtype = GGML_TYPE_F16; break; + case 0: wtype = GGML_V2_TYPE_F32; break; + case 1: wtype = GGML_V2_TYPE_F16; break; default: { fprintf(stderr, "%s: invalid tensor data type '%d'\n", @@ -2420,9 +2420,9 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * return false; } } - ggml_tensor* lora_tensor; + ggml_v2_tensor* lora_tensor; if (n_dims == 2) { - lora_tensor = ggml_new_tensor_2d(lora_ctx, wtype, ne[0], ne[1]); + lora_tensor = ggml_v2_new_tensor_2d(lora_ctx, wtype, ne[0], ne[1]); } else { fprintf(stderr, "%s: unsupported tensor dimension %d\n", __func__, n_dims); @@ -2431,7 +2431,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * // load tensor data size_t offset = fin.tellg(); - size_t tensor_data_size = ggml_nbytes(lora_tensor); + 
size_t tensor_data_size = ggml_v2_nbytes(lora_tensor); offset = (offset + 31) & -32; fin.seekg(offset); fin.read((char*)lora_tensor->data, tensor_data_size); @@ -2442,8 +2442,8 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * if (lora_tensors.find(base_name + ".loraA") != lora_tensors.end() && lora_tensors.find(base_name + ".loraB") != lora_tensors.end()) { - ggml_tensor * dest_t = model_tensors[base_name]; - ggml_tensor * base_t; + ggml_v2_tensor * dest_t = model_tensors[base_name]; + ggml_v2_tensor * base_t; if (model_loader) { // load from base model if (model_loader->tensors_map.name_to_idx.find(base_name) == model_loader->tensors_map.name_to_idx.end()) { @@ -2451,17 +2451,17 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * return 1; } size_t idx = model_loader->tensors_map.name_to_idx[base_name]; - llama_load_tensor & lt = model_loader->tensors_map.tensors[idx]; + llama_v2_load_tensor & lt = model_loader->tensors_map.tensors[idx]; base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }); - lt.data = (uint8_t *) lt.ggml_tensor->data; + lt.data = (uint8_t *) lt.ggml_v2_tensor->data; model_loader->load_data_for(lt); - lt.ggml_tensor->data = lt.data; + lt.ggml_v2_tensor->data = lt.data; } else { base_t = dest_t; } - if (ggml_is_quantized(base_t->type)) { + if (ggml_v2_is_quantized(base_t->type)) { if (!warned) { fprintf(stderr, "%s: warning: using a lora adapter with a quantized model may result in poor quality, " "use a f16 or f32 base model with --lora-base\n", __func__); @@ -2469,8 +2469,8 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * } } - ggml_tensor * loraA = lora_tensors[base_name + ".loraA"]; - ggml_tensor * loraB = lora_tensors[base_name + ".loraB"]; + ggml_v2_tensor * loraA = lora_tensors[base_name + ".loraA"]; + ggml_v2_tensor * loraB = lora_tensors[base_name + ".loraB"]; if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) { fprintf(stderr, "%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");" @@ -2479,29 +2479,29 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * } // w = w + BA*s - ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB); + ggml_v2_tensor * BA = ggml_v2_mul_mat(lora_ctx, loraA, loraB); if (scaling != 1.0f) { - ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling); - BA = ggml_scale_inplace(lora_ctx, BA, scale_tensor); + ggml_v2_tensor * scale_tensor = ggml_v2_new_f32(lora_ctx, scaling); + BA = ggml_v2_scale_inplace(lora_ctx, BA, scale_tensor); } - ggml_tensor * r; + ggml_v2_tensor * r; if (base_t == dest_t) { - r = ggml_add_inplace(lora_ctx, dest_t, BA); + r = ggml_v2_add_inplace(lora_ctx, dest_t, BA); } else { - r = ggml_add(lora_ctx, base_t, BA); - r = ggml_cpy(lora_ctx, r, dest_t); + r = ggml_v2_add(lora_ctx, base_t, BA); + r = ggml_v2_cpy(lora_ctx, r, dest_t); } - struct ggml_cgraph gf = ggml_build_forward(r); + struct ggml_v2_cgraph gf = ggml_v2_build_forward(r); gf.n_threads = n_threads; - ggml_graph_compute(lora_ctx, &gf); + ggml_v2_graph_compute(lora_ctx, &gf); // we won't need these tensors again, reset the context to save memory - ggml_free(lora_ctx); - lora_ctx = ggml_init(params); + ggml_v2_free(lora_ctx); + lora_ctx = ggml_v2_init(params); lora_tensors.clear(); n_tensors++; @@ -2512,33 +2512,33 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * } // TODO: this should be in a destructor, it will leak 
on failure - ggml_free(lora_ctx); + ggml_v2_free(lora_ctx); if (base_ctx) { - ggml_free(base_ctx); + ggml_v2_free(base_ctx); } - const int64_t t_lora_us = ggml_time_us() - t_start_lora_us; + const int64_t t_lora_us = ggml_v2_time_us() - t_start_lora_us; fprintf(stderr, " done (%.2f ms)\n", t_lora_us / 1000.0); return 0; } -int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) { +int llama_v2_apply_lora_from_file(struct llama_v2_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) { try { - return llama_apply_lora_from_file_internal(ctx, path_lora, path_base_model, n_threads); + return llama_v2_apply_lora_from_file_internal(ctx, path_lora, path_base_model, n_threads); } catch (const std::string & err) { fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.c_str()); return 1; } } -int llama_get_kv_cache_token_count(const struct llama_context * ctx) { +int llama_v2_get_kv_cache_token_count(const struct llama_v2_context * ctx) { return ctx->model.kv_self.n; } -#define LLAMA_MAX_RNG_STATE (64*1024) +#define LLAMA_V2_MAX_RNG_STATE (64*1024) -void llama_set_rng_seed(struct llama_context * ctx, int seed) { +void llama_v2_set_rng_seed(struct llama_v2_context * ctx, int seed) { if (seed < 0) { seed = time(NULL); } @@ -2546,11 +2546,11 @@ void llama_set_rng_seed(struct llama_context * ctx, int seed) { } // Returns the *maximum* size of the state -size_t llama_get_state_size(const struct llama_context * ctx) { +size_t llama_v2_get_state_size(const struct llama_v2_context * ctx) { // we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state. // for reference, std::mt19937(1337) serializes to 6701 bytes. 
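For illustration only (this block is not part of the diff): the comment above relies on std::mt19937 being serializable through its stream operators, which is the same mechanism llama_v2_copy_state_data and llama_v2_set_state_data below use when they pack the engine into the fixed LLAMA_V2_MAX_RNG_STATE slot. A minimal standalone sketch of that round-trip into a zero-padded buffer, using only the standard library:

    // Standalone sketch (not from the patch): serialize a std::mt19937 into a
    // fixed-size, zero-padded buffer and restore it, mirroring what the
    // LLAMA_V2_MAX_RNG_STATE slot in the state blob is sized for.
    #include <cassert>
    #include <cstring>
    #include <random>
    #include <sstream>
    #include <string>
    #include <vector>

    int main() {
        const size_t max_rng_state = 64 * 1024;   // same budget as LLAMA_V2_MAX_RNG_STATE

        std::mt19937 rng(1337);
        (void) rng();                             // advance the engine a little

        // serialize: the textual state (roughly 6.7 KB) goes into a padded buffer
        std::ostringstream oss;
        oss << rng;
        const std::string state = oss.str();
        assert(state.size() <= max_rng_state);

        std::vector<char> buf(max_rng_state, 0);
        std::memcpy(buf.data(), state.data(), state.size());

        // restore into a fresh engine and confirm the two streams now agree
        std::mt19937 restored;
        std::istringstream iss(std::string(buf.data(), state.size()));
        iss >> restored;

        assert(rng() == restored());
        return 0;
    }

The restored engine picks up exactly where the serialized one left off, which is why the state blob only needs the textual dump plus its recorded length.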
const size_t s_rng_size = sizeof(size_t); - const size_t s_rng = LLAMA_MAX_RNG_STATE; + const size_t s_rng = LLAMA_V2_MAX_RNG_STATE; const size_t s_logits_capacity = sizeof(size_t); const size_t s_logits_size = sizeof(size_t); const size_t s_logits = ctx->logits.capacity() * sizeof(float); @@ -2577,7 +2577,7 @@ size_t llama_get_state_size(const struct llama_context * ctx) { } // Copies the state to the specified destination address -size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) { +size_t llama_v2_copy_state_data(struct llama_v2_context * ctx, uint8_t * dst) { uint8_t * out = dst; // copy rng @@ -2586,13 +2586,13 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) { rng_ss << ctx->rng; const size_t rng_size = rng_ss.str().size(); - char rng_buf[LLAMA_MAX_RNG_STATE]; + char rng_buf[LLAMA_V2_MAX_RNG_STATE]; - memset(&rng_buf[0], 0, LLAMA_MAX_RNG_STATE); + memset(&rng_buf[0], 0, LLAMA_V2_MAX_RNG_STATE); memcpy(&rng_buf[0], rng_ss.str().data(), rng_ss.str().size()); memcpy(out, &rng_size, sizeof(rng_size)); out += sizeof(rng_size); - memcpy(out, &rng_buf[0], LLAMA_MAX_RNG_STATE); out += LLAMA_MAX_RNG_STATE; + memcpy(out, &rng_buf[0], LLAMA_V2_MAX_RNG_STATE); out += LLAMA_V2_MAX_RNG_STATE; } // copy logits @@ -2631,69 +2631,69 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) { const int n_ctx = hparams.n_ctx; const size_t kv_size = kv_self.buf.size; - const int kv_ntok = llama_get_kv_cache_token_count(ctx); + const int kv_ntok = llama_v2_get_kv_cache_token_count(ctx); memcpy(out, &kv_size, sizeof(kv_size)); out += sizeof(kv_size); memcpy(out, &kv_ntok, sizeof(kv_ntok)); out += sizeof(kv_ntok); if (kv_size) { - const size_t elt_size = ggml_element_size(kv_self.k); + const size_t elt_size = ggml_v2_element_size(kv_self.k); char buffer[4096]; - ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true }); - ggml_cgraph gf{}; + ggml_v2_context * cpy_ctx = ggml_v2_init({ sizeof(buffer), buffer, /* no_alloc */ true }); + ggml_v2_cgraph gf{}; gf.n_threads = 1; - ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer); + ggml_v2_tensor * kout3d = ggml_v2_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer); kout3d->data = out; - out += ggml_nbytes(kout3d); + out += ggml_v2_nbytes(kout3d); - ggml_tensor * vout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_ntok, n_embd, n_layer); + ggml_v2_tensor * vout3d = ggml_v2_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_ntok, n_embd, n_layer); vout3d->data = out; - out += ggml_nbytes(vout3d); + out += ggml_v2_nbytes(vout3d); - ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k, + ggml_v2_tensor * k3d = ggml_v2_view_3d(cpy_ctx, kv_self.k, n_embd, kv_ntok, n_layer, elt_size*n_embd, elt_size*n_embd*n_ctx, 0); - ggml_tensor * v3d = ggml_view_3d(cpy_ctx, kv_self.v, + ggml_v2_tensor * v3d = ggml_v2_view_3d(cpy_ctx, kv_self.v, kv_ntok, n_embd, n_layer, elt_size*n_ctx, elt_size*n_ctx*n_embd, 0); - ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d)); - ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d)); - ggml_graph_compute(cpy_ctx, &gf); + ggml_v2_build_forward_expand(&gf, ggml_v2_cpy(cpy_ctx, k3d, kout3d)); + ggml_v2_build_forward_expand(&gf, ggml_v2_cpy(cpy_ctx, v3d, vout3d)); + ggml_v2_graph_compute(cpy_ctx, &gf); - ggml_free(cpy_ctx); + ggml_v2_free(cpy_ctx); } } const size_t written = out - dst; - const size_t max_size = llama_get_state_size(ctx); + const size_t max_size = 
llama_v2_get_state_size(ctx); - LLAMA_ASSERT(written <= max_size); + LLAMA_V2_ASSERT(written <= max_size); return written; } // Sets the state reading from the specified source address -size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) { +size_t llama_v2_set_state_data(struct llama_v2_context * ctx, const uint8_t * src) { const uint8_t * inp = src; // set rng { size_t rng_size; - char rng_buf[LLAMA_MAX_RNG_STATE]; + char rng_buf[LLAMA_V2_MAX_RNG_STATE]; memcpy(&rng_size, inp, sizeof(rng_size)); inp += sizeof(rng_size); - memcpy(&rng_buf[0], inp, LLAMA_MAX_RNG_STATE); inp += LLAMA_MAX_RNG_STATE; + memcpy(&rng_buf[0], inp, LLAMA_V2_MAX_RNG_STATE); inp += LLAMA_V2_MAX_RNG_STATE; std::stringstream rng_ss; rng_ss.str(std::string(&rng_buf[0], rng_size)); rng_ss >> ctx->rng; - LLAMA_ASSERT(rng_ss.fail() == false); + LLAMA_V2_ASSERT(rng_ss.fail() == false); } // set logits @@ -2704,7 +2704,7 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) { memcpy(&logits_cap, inp, sizeof(logits_cap)); inp += sizeof(logits_cap); memcpy(&logits_size, inp, sizeof(logits_size)); inp += sizeof(logits_size); - LLAMA_ASSERT(ctx->logits.capacity() == logits_cap); + LLAMA_V2_ASSERT(ctx->logits.capacity() == logits_cap); if (logits_size) { ctx->logits.resize(logits_size); @@ -2720,7 +2720,7 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) { memcpy(&embedding_size, inp, sizeof(embedding_size)); inp += sizeof(embedding_size); - LLAMA_ASSERT(ctx->embedding.capacity() == embedding_size); + LLAMA_V2_ASSERT(ctx->embedding.capacity() == embedding_size); if (embedding_size) { memcpy(ctx->embedding.data(), inp, embedding_size * sizeof(float)); @@ -2743,65 +2743,65 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) { memcpy(&kv_ntok, inp, sizeof(kv_ntok)); inp += sizeof(kv_ntok); if (kv_size) { - LLAMA_ASSERT(kv_self.buf.size == kv_size); + LLAMA_V2_ASSERT(kv_self.buf.size == kv_size); - const size_t elt_size = ggml_element_size(kv_self.k); + const size_t elt_size = ggml_v2_element_size(kv_self.k); char buffer[4096]; - ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true }); - ggml_cgraph gf{}; + ggml_v2_context * cpy_ctx = ggml_v2_init({ sizeof(buffer), buffer, /* no_alloc */ true }); + ggml_v2_cgraph gf{}; gf.n_threads = 1; - ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer); + ggml_v2_tensor * kin3d = ggml_v2_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer); kin3d->data = (void *) inp; - inp += ggml_nbytes(kin3d); + inp += ggml_v2_nbytes(kin3d); - ggml_tensor * vin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_ntok, n_embd, n_layer); + ggml_v2_tensor * vin3d = ggml_v2_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_ntok, n_embd, n_layer); vin3d->data = (void *) inp; - inp += ggml_nbytes(vin3d); + inp += ggml_v2_nbytes(vin3d); - ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k, + ggml_v2_tensor * k3d = ggml_v2_view_3d(cpy_ctx, kv_self.k, n_embd, kv_ntok, n_layer, elt_size*n_embd, elt_size*n_embd*n_ctx, 0); - ggml_tensor * v3d = ggml_view_3d(cpy_ctx, kv_self.v, + ggml_v2_tensor * v3d = ggml_v2_view_3d(cpy_ctx, kv_self.v, kv_ntok, n_embd, n_layer, elt_size*n_ctx, elt_size*n_ctx*n_embd, 0); - ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d)); - ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d)); - ggml_graph_compute(cpy_ctx, &gf); + ggml_v2_build_forward_expand(&gf, ggml_v2_cpy(cpy_ctx, kin3d, k3d)); + 
ggml_v2_build_forward_expand(&gf, ggml_v2_cpy(cpy_ctx, vin3d, v3d)); + ggml_v2_graph_compute(cpy_ctx, &gf); - ggml_free(cpy_ctx); + ggml_v2_free(cpy_ctx); } ctx->model.kv_self.n = kv_ntok; } const size_t nread = inp - src; - const size_t max_size = llama_get_state_size(ctx); + const size_t max_size = llama_v2_get_state_size(ctx); - LLAMA_ASSERT(nread <= max_size); + LLAMA_V2_ASSERT(nread <= max_size); return nread; } -bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { - llama_file file(path_session, "rb"); +bool llama_v2_load_session_file(struct llama_v2_context * ctx, const char * path_session, llama_v2_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { + llama_v2_file file(path_session, "rb"); // sanity checks { const uint32_t magic = file.read_u32(); const uint32_t version = file.read_u32(); - if (magic != LLAMA_SESSION_MAGIC || version != LLAMA_SESSION_VERSION) { + if (magic != LLAMA_V2_SESSION_MAGIC || version != LLAMA_V2_SESSION_VERSION) { fprintf(stderr, "%s : unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version); return false; } - llama_hparams session_hparams; - file.read_raw(&session_hparams, sizeof(llama_hparams)); + llama_v2_hparams session_hparams; + file.read_raw(&session_hparams, sizeof(llama_v2_hparams)); if (session_hparams != ctx->model.hparams) { fprintf(stderr, "%s : model hparams didn't match from session file!\n", __func__); @@ -2818,14 +2818,14 @@ bool llama_load_session_file(struct llama_context * ctx, const char * path_sessi return false; } - file.read_raw(tokens_out, sizeof(llama_token) * n_token_count); + file.read_raw(tokens_out, sizeof(llama_v2_token) * n_token_count); *n_token_count_out = n_token_count; } // restore the context state { const size_t n_state_size_cur = file.size - file.tell(); - const size_t n_state_size_max = llama_get_state_size(ctx); + const size_t n_state_size_max = llama_v2_get_state_size(ctx); if (n_state_size_cur > n_state_size_max) { fprintf(stderr, "%s : the state size in session file is too big! 
max %zu, got %zu\n", __func__, n_state_size_max, n_state_size_cur); @@ -2835,30 +2835,30 @@ bool llama_load_session_file(struct llama_context * ctx, const char * path_sessi std::vector state_data(n_state_size_max); file.read_raw(state_data.data(), n_state_size_cur); - llama_set_state_data(ctx, state_data.data()); + llama_v2_set_state_data(ctx, state_data.data()); } return true; } -bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) { - llama_file file(path_session, "wb"); +bool llama_v2_save_session_file(struct llama_v2_context * ctx, const char * path_session, const llama_v2_token * tokens, size_t n_token_count) { + llama_v2_file file(path_session, "wb"); - file.write_u32(LLAMA_SESSION_MAGIC); - file.write_u32(LLAMA_SESSION_VERSION); + file.write_u32(LLAMA_V2_SESSION_MAGIC); + file.write_u32(LLAMA_V2_SESSION_VERSION); - file.write_raw(&ctx->model.hparams, sizeof(llama_hparams)); + file.write_raw(&ctx->model.hparams, sizeof(llama_v2_hparams)); // save the prompt file.write_u32((uint32_t) n_token_count); - file.write_raw(tokens, sizeof(llama_token) * n_token_count); + file.write_raw(tokens, sizeof(llama_v2_token) * n_token_count); // save the context state { - const size_t n_state_size_max = llama_get_state_size(ctx); + const size_t n_state_size_max = llama_v2_get_state_size(ctx); std::vector state_data(n_state_size_max); - const size_t n_state_size_cur = llama_copy_state_data(ctx, state_data.data()); + const size_t n_state_size_cur = llama_v2_copy_state_data(ctx, state_data.data()); file.write_raw(state_data.data(), n_state_size_cur); } @@ -2866,13 +2866,13 @@ bool llama_save_session_file(struct llama_context * ctx, const char * path_sessi return true; } -int llama_eval( - struct llama_context * ctx, - const llama_token * tokens, +int llama_v2_eval( + struct llama_v2_context * ctx, + const llama_v2_token * tokens, int n_tokens, int n_past, int n_threads) { - if (!llama_eval_internal(*ctx, tokens, n_tokens, n_past, n_threads)) { + if (!llama_v2_eval_internal(*ctx, tokens, n_tokens, n_past, n_threads)) { fprintf(stderr, "%s: failed to eval\n", __func__); return 1; } @@ -2880,20 +2880,20 @@ int llama_eval( // get a more accurate load time, upon first eval // TODO: fix this if (!ctx->has_evaluated_once) { - ctx->t_load_us = ggml_time_us() - ctx->t_start_us; + ctx->t_load_us = ggml_v2_time_us() - ctx->t_start_us; ctx->has_evaluated_once = true; } return 0; } -int llama_tokenize( - struct llama_context * ctx, +int llama_v2_tokenize( + struct llama_v2_context * ctx, const char * text, - llama_token * tokens, + llama_v2_token * tokens, int n_max_tokens, bool add_bos) { - auto res = llama_tokenize(ctx->vocab, text, add_bos); + auto res = llama_v2_tokenize(ctx->vocab, text, add_bos); if (n_max_tokens < (int) res.size()) { fprintf(stderr, "%s: too many tokens\n", __func__); @@ -2907,49 +2907,49 @@ int llama_tokenize( return res.size(); } -int llama_n_vocab(const struct llama_context * ctx) { +int llama_v2_n_vocab(const struct llama_v2_context * ctx) { return ctx->vocab.id_to_token.size(); } -int llama_n_ctx(const struct llama_context * ctx) { +int llama_v2_n_ctx(const struct llama_v2_context * ctx) { return ctx->model.hparams.n_ctx; } -int llama_n_embd(const struct llama_context * ctx) { +int llama_v2_n_embd(const struct llama_v2_context * ctx) { return ctx->model.hparams.n_embd; } -float * llama_get_logits(struct llama_context * ctx) { +float * llama_v2_get_logits(struct llama_v2_context * ctx) { return 
ctx->logits.data(); } -float * llama_get_embeddings(struct llama_context * ctx) { +float * llama_v2_get_embeddings(struct llama_v2_context * ctx) { return ctx->embedding.data(); } -const char * llama_token_to_str(const struct llama_context * ctx, llama_token token) { - if (token >= llama_n_vocab(ctx)) { +const char * llama_v2_token_to_str(const struct llama_v2_context * ctx, llama_v2_token token) { + if (token >= llama_v2_n_vocab(ctx)) { return nullptr; } return ctx->vocab.id_to_token[token].tok.c_str(); } -llama_token llama_token_bos() { +llama_v2_token llama_v2_token_bos() { return 1; } -llama_token llama_token_eos() { +llama_v2_token llama_v2_token_eos() { return 2; } -llama_token llama_token_nl() { +llama_v2_token llama_v2_token_nl() { return 13; } -void llama_print_timings(struct llama_context * ctx) { - const int64_t t_end_us = ggml_time_us(); +void llama_v2_print_timings(struct llama_v2_context * ctx) { + const int64_t t_end_us = ggml_v2_time_us(); const int32_t n_sample = std::max(1, ctx->n_sample); const int32_t n_eval = std::max(1, ctx->n_eval); @@ -2963,36 +2963,119 @@ void llama_print_timings(struct llama_context * ctx) { fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0); } -void llama_reset_timings(struct llama_context * ctx) { - ctx->t_start_us = ggml_time_us(); +void llama_v2_reset_timings(struct llama_v2_context * ctx) { + ctx->t_start_us = ggml_v2_time_us(); ctx->t_sample_us = ctx->n_sample = 0; ctx->t_eval_us = ctx->n_eval = 0; ctx->t_p_eval_us = ctx->n_p_eval = 0; } -const char * llama_print_system_info(void) { +const char * llama_v2_print_system_info(void) { static std::string s; s = ""; - s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | "; - s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | "; - s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | "; - s += "AVX512_VBMI = " + std::to_string(ggml_cpu_has_avx512_vbmi()) + " | "; - s += "AVX512_VNNI = " + std::to_string(ggml_cpu_has_avx512_vnni()) + " | "; - s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | "; - s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | "; - s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | "; - s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | "; - s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | "; - s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | "; - s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | "; - s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | "; - s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | "; + s += "AVX = " + std::to_string(ggml_v2_cpu_has_avx()) + " | "; + s += "AVX2 = " + std::to_string(ggml_v2_cpu_has_avx2()) + " | "; + s += "AVX512 = " + std::to_string(ggml_v2_cpu_has_avx512()) + " | "; + s += "AVX512_VBMI = " + std::to_string(ggml_v2_cpu_has_avx512_vbmi()) + " | "; + s += "AVX512_VNNI = " + std::to_string(ggml_v2_cpu_has_avx512_vnni()) + " | "; + s += "FMA = " + std::to_string(ggml_v2_cpu_has_fma()) + " | "; + s += "NEON = " + std::to_string(ggml_v2_cpu_has_neon()) + " | "; + s += "ARM_FMA = " + std::to_string(ggml_v2_cpu_has_arm_fma()) + " | "; + s += "F16C = " + std::to_string(ggml_v2_cpu_has_f16c()) + " | "; + s += "FP16_VA = " + std::to_string(ggml_v2_cpu_has_fp16_va()) + " | "; + s += "WASM_SIMD = " + std::to_string(ggml_v2_cpu_has_wasm_simd()) + " | "; + s += "BLAS = " + std::to_string(ggml_v2_cpu_has_blas()) + " | "; + s += "SSE3 = " + std::to_string(ggml_v2_cpu_has_sse3()) + " | "; + s += 
"VSX = " + std::to_string(ggml_v2_cpu_has_vsx()) + " | "; return s.c_str(); } // For internal test use -std::vector>& llama_internal_get_tensor_map(struct llama_context * ctx) { +std::vector>& llama_v2_internal_get_tensor_map(struct llama_v2_context * ctx) { return ctx->model.tensors_by_name; } + + +// TODO: Calculate this constant from the vocabulary +#define MAX_TOKEN_LEN 18 +// SentencePiece implementation after https://guillaume-be.github.io/2020-05-30/sentence_piece +std::vector legacy_llama_v2_tokenize(const llama_v2_vocab & vocab, const std::string & text, bool bos) { + std::vector res; + std::vector score; + std::vector prev; + int len = text.length(); + + score.resize(len + 1); + prev.resize(len + 1); + + // Forward pass + for (int i = 0; i < len; i++) { + int max_len = std::min(len - i, MAX_TOKEN_LEN); + for (int sub_len = 1; sub_len <= max_len; sub_len++) { + auto sub = text.substr(i, sub_len); + auto token = vocab.token_to_id.find(sub); + if (token != vocab.token_to_id.end()) { + int token_score = sub.length() * sub.length(); + int local_score = score[i] + token_score; + int next = i + sub_len; + if (score[next] < local_score) { + score[next] = local_score; + prev[next] = (*token).second; + } + } + } + } + + // Backward pass + int i = len; + while (i > 0) { + llama_v2_token token_id = prev[i]; + if (token_id == 0) { + // TODO: Return error or something more meaningful + printf("failed to tokenize string!\n"); + break; + } + res.push_back(token_id); + auto token = vocab.id_to_token[token_id].tok; + i -= token.length(); + } + + if (bos) { + res.push_back(1); // TODO: replace with vocab.bos + } + + // Pieces are in reverse order so correct that + std::reverse(res.begin(), res.end()); + + return res; +} + +int legacy_llama_v2_tokenize( + struct llama_v2_context * ctx, + const char * text, + llama_v2_token * tokens, + int n_max_tokens, + bool add_bos) { + auto res = legacy_llama_v2_tokenize(ctx->vocab, text, add_bos); + + if (n_max_tokens < (int) res.size()) { + fprintf(stderr, "%s: too many tokens\n", __func__); + return -((int) res.size()); + } + + for (size_t i = 0; i < res.size(); i++) { + tokens[i] = res[i]; + } + + return res.size(); +} + +std::vector legacy_llama_v2_tokenize(struct llama_v2_context * ctx, const std::string & text, bool add_bos) { + std::vector res(8096); + int n = legacy_llama_v2_tokenize(ctx, text.c_str(), res.data(), res.size(), add_bos); + res.resize(n); + + return res; +} \ No newline at end of file diff --git a/llama.h b/otherarch/llama_v2.h similarity index 51% rename from llama.h rename to otherarch/llama_v2.h index 953a4d969..2b1cfc725 100644 --- a/llama.h +++ b/otherarch/llama_v2.h @@ -1,29 +1,29 @@ -#ifndef LLAMA_H -#define LLAMA_H +#ifndef LLAMA_V2_H +#define LLAMA_V2_H #include #include #include -#ifdef LLAMA_SHARED +#ifdef LLAMA_V2_SHARED # if defined(_WIN32) && !defined(__MINGW32__) -# ifdef LLAMA_BUILD -# define LLAMA_API __declspec(dllexport) +# ifdef LLAMA_V2_BUILD +# define LLAMA_V2_API __declspec(dllexport) # else -# define LLAMA_API __declspec(dllimport) +# define LLAMA_V2_API __declspec(dllimport) # endif # else -# define LLAMA_API __attribute__ ((visibility ("default"))) +# define LLAMA_V2_API __attribute__ ((visibility ("default"))) # endif #else -# define LLAMA_API +# define LLAMA_V2_API #endif -#define LLAMA_FILE_VERSION 3 -#define LLAMA_FILE_MAGIC 'ggjt' -#define LLAMA_FILE_MAGIC_UNVERSIONED 'ggml' -#define LLAMA_SESSION_MAGIC 'ggsn' -#define LLAMA_SESSION_VERSION 1 +#define LLAMA_V2_FILE_VERSION 3 +#define LLAMA_V2_FILE_MAGIC 'ggjt' 
+#define LLAMA_V2_FILE_MAGIC_UNVERSIONED 'ggml' +#define LLAMA_V2_SESSION_MAGIC 'ggsn' +#define LLAMA_V2_SESSION_VERSION 1 #ifdef __cplusplus extern "C" { @@ -35,78 +35,78 @@ extern "C" { // TODO: show sample usage // - struct llama_context; + struct llama_v2_context; - typedef int llama_token; + typedef int llama_v2_token; - typedef struct llama_token_data { - llama_token id; // token id + typedef struct llama_v2_token_data { + llama_v2_token id; // token id float logit; // log-odds of the token float p; // probability of the token - } llama_token_data; + } llama_v2_token_data; - typedef struct llama_token_data_array { - llama_token_data * data; + typedef struct llama_v2_token_data_array { + llama_v2_token_data * data; size_t size; bool sorted; - } llama_token_data_array; + } llama_v2_token_data_array; - typedef void (*llama_progress_callback)(float progress, void *ctx); + typedef void (*llama_v2_progress_callback)(float progress, void *ctx); - struct llama_context_params { + struct llama_v2_context_params { int n_ctx; // text context int n_gpu_layers; // number of layers to store in VRAM int seed; // RNG seed, -1 for random bool f16_kv; // use fp16 for KV cache - bool logits_all; // the llama_eval() call computes all logits, not just the last one + bool logits_all; // the llama_v2_eval() call computes all logits, not just the last one bool vocab_only; // only load the vocabulary, no weights bool use_mmap; // use mmap if possible bool use_mlock; // force system to keep model in RAM bool embedding; // embedding mode only // called with a progress value between 0 and 1, pass NULL to disable - llama_progress_callback progress_callback; + llama_v2_progress_callback progress_callback; // context pointer passed to the progress callback void * progress_callback_user_data; }; // model file types - enum llama_ftype { - LLAMA_FTYPE_ALL_F32 = 0, - LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors - LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors - LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors - LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16 - LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors - LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // except 1d tensors - LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors - LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors - LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors + enum llama_v2_ftype { + LLAMA_V2_FTYPE_ALL_F32 = 0, + LLAMA_V2_FTYPE_MOSTLY_F16 = 1, // except 1d tensors + LLAMA_V2_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors + LLAMA_V2_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors + LLAMA_V2_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16 + LLAMA_V2_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors + LLAMA_V2_FTYPE_MOSTLY_Q4_3 = 6, // except 1d tensors + LLAMA_V2_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors + LLAMA_V2_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors + LLAMA_V2_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors }; - LLAMA_API struct llama_context_params llama_context_default_params(); + LLAMA_V2_API struct llama_v2_context_params llama_v2_context_default_params(); - LLAMA_API bool llama_mmap_supported(); - LLAMA_API bool llama_mlock_supported(); + LLAMA_V2_API bool llama_v2_mmap_supported(); + LLAMA_V2_API bool llama_v2_mlock_supported(); // Various functions for loading a ggml llama model. // Allocate (almost) all memory needed for the model. 
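As a usage note for the renamed API above: callers start from llama_v2_context_default_params() and override only the fields they need. A sketch, assuming the header is reachable as "llama_v2.h"; the chosen values are illustrative and not prescribed by the patch.

// Sketch only: populate the renamed llama_v2_context_params before loading a model.
// The include path and the field values below are assumptions for illustration.
#include "llama_v2.h"

static struct llama_v2_context_params make_params() {
    struct llama_v2_context_params p = llama_v2_context_default_params();
    p.n_ctx        = 2048;   // text context size
    p.seed         = -1;     // -1 -> random RNG seed
    p.f16_kv       = true;   // fp16 KV cache
    p.use_mmap     = true;   // mmap the model file if possible
    p.use_mlock    = false;  // do not pin the model in RAM
    p.logits_all   = false;  // only the last token's logits from llama_v2_eval()
    p.n_gpu_layers = 0;      // CPU-only in this sketch
    return p;
}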
// Return NULL on failure - LLAMA_API struct llama_context * llama_init_from_file( + LLAMA_V2_API struct llama_v2_context * llama_v2_init_from_file( const char * path_model, - struct llama_context_params params); + struct llama_v2_context_params params); // Frees all allocated memory - LLAMA_API void llama_free(struct llama_context * ctx); + LLAMA_V2_API void llama_v2_free(struct llama_v2_context * ctx); // TODO: not great API - very likely to change // Returns 0 on success // nthread - how many threads to use. If <=0, will use std::thread::hardware_concurrency(), else the number given - LLAMA_API int llama_model_quantize( + LLAMA_V2_API int llama_v2_model_quantize( const char * fname_inp, const char * fname_out, - enum llama_ftype ftype, + enum llama_v2_ftype ftype, int nthread); // Apply a LoRA adapter to a loaded model @@ -115,42 +115,42 @@ extern "C" { // The model needs to be reloaded before applying a new adapter, otherwise the adapter // will be applied on top of the previous one // Returns 0 on success - LLAMA_API int llama_apply_lora_from_file( - struct llama_context * ctx, + LLAMA_V2_API int llama_v2_apply_lora_from_file( + struct llama_v2_context * ctx, const char * path_lora, const char * path_base_model, int n_threads); // Returns the number of tokens in the KV cache - LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx); + LLAMA_V2_API int llama_v2_get_kv_cache_token_count(const struct llama_v2_context * ctx); // Sets the current rng seed. - LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, int seed); + LLAMA_V2_API void llama_v2_set_rng_seed(struct llama_v2_context * ctx, int seed); // Returns the maximum size in bytes of the state (rng, logits, embedding // and kv_cache) - will often be smaller after compacting tokens - LLAMA_API size_t llama_get_state_size(const struct llama_context * ctx); + LLAMA_V2_API size_t llama_v2_get_state_size(const struct llama_v2_context * ctx); // Copies the state to the specified destination address. // Destination needs to have allocated enough memory. // Returns the number of bytes copied - LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst); + LLAMA_V2_API size_t llama_v2_copy_state_data(struct llama_v2_context * ctx, uint8_t * dst); // Set the state reading from the specified address // Returns the number of bytes read - LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src); + LLAMA_V2_API size_t llama_v2_set_state_data(struct llama_v2_context * ctx, const uint8_t * src); // Save/load session file - LLAMA_API bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out); - LLAMA_API bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count); + LLAMA_V2_API bool llama_v2_load_session_file(struct llama_v2_context * ctx, const char * path_session, llama_v2_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out); + LLAMA_V2_API bool llama_v2_save_session_file(struct llama_v2_context * ctx, const char * path_session, const llama_v2_token * tokens, size_t n_token_count); // Run the llama inference to obtain the logits and probabilities for the next token. 
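The three state functions renamed in this hunk combine into a simple snapshot/restore pattern: query an upper bound on the state size, copy the state out, and shrink to the bytes actually written. A sketch under the assumption that ctx came from a successful llama_v2_init_from_file call:

// Sketch: snapshot the full context state (rng, logits, embedding, KV cache)
// into a buffer and restore it later, using the renamed v2 entry points.
#include "llama_v2.h"
#include <cstdint>
#include <vector>

static std::vector<uint8_t> snapshot(struct llama_v2_context * ctx) {
    std::vector<uint8_t> buf(llama_v2_get_state_size(ctx)); // upper bound on the state size
    const size_t written = llama_v2_copy_state_data(ctx, buf.data());
    buf.resize(written); // the actual state is often smaller after compaction
    return buf;
}

static void restore(struct llama_v2_context * ctx, const std::vector<uint8_t> & buf) {
    llama_v2_set_state_data(ctx, buf.data());
}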
// tokens + n_tokens is the provided batch of new tokens to process // n_past is the number of tokens to use from previous eval calls // Returns 0 on success - LLAMA_API int llama_eval( - struct llama_context * ctx, - const llama_token * tokens, + LLAMA_V2_API int llama_v2_eval( + struct llama_v2_context * ctx, + const llama_v2_token * tokens, int n_tokens, int n_past, int n_threads); @@ -160,101 +160,104 @@ extern "C" { // Returns the number of tokens on success, no more than n_max_tokens // Returns a negative number on failure - the number of tokens that would have been returned // TODO: not sure if correct - LLAMA_API int llama_tokenize( - struct llama_context * ctx, + LLAMA_V2_API int llama_v2_tokenize( + struct llama_v2_context * ctx, const char * text, - llama_token * tokens, + llama_v2_token * tokens, int n_max_tokens, bool add_bos); - LLAMA_API int llama_n_vocab(const struct llama_context * ctx); - LLAMA_API int llama_n_ctx (const struct llama_context * ctx); - LLAMA_API int llama_n_embd (const struct llama_context * ctx); + + std::vector legacy_llama_v2_tokenize(struct llama_v2_context * ctx, const std::string & text, bool add_bos); - // Token logits obtained from the last call to llama_eval() + LLAMA_V2_API int llama_v2_n_vocab(const struct llama_v2_context * ctx); + LLAMA_V2_API int llama_v2_n_ctx (const struct llama_v2_context * ctx); + LLAMA_V2_API int llama_v2_n_embd (const struct llama_v2_context * ctx); + + // Token logits obtained from the last call to llama_v2_eval() // The logits for the last token are stored in the last row // Can be mutated in order to change the probabilities of the next token // Rows: n_tokens // Cols: n_vocab - LLAMA_API float * llama_get_logits(struct llama_context * ctx); + LLAMA_V2_API float * llama_v2_get_logits(struct llama_v2_context * ctx); // Get the embeddings for the input // shape: [n_embd] (1-dimensional) - LLAMA_API float * llama_get_embeddings(struct llama_context * ctx); + LLAMA_V2_API float * llama_v2_get_embeddings(struct llama_v2_context * ctx); // Token Id -> String. Uses the vocabulary in the provided context - LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token); + LLAMA_V2_API const char * llama_v2_token_to_str(const struct llama_v2_context * ctx, llama_v2_token token); // Special tokens - LLAMA_API llama_token llama_token_bos(); - LLAMA_API llama_token llama_token_eos(); - LLAMA_API llama_token llama_token_nl(); + LLAMA_V2_API llama_v2_token llama_v2_token_bos(); + LLAMA_V2_API llama_v2_token llama_v2_token_eos(); + LLAMA_V2_API llama_v2_token llama_v2_token_nl(); // Sampling functions /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix. - LLAMA_API void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty); + LLAMA_V2_API void llama_v2_sample_repetition_penalty(struct llama_v2_context * ctx, llama_v2_token_data_array * candidates, const llama_v2_token * last_tokens, size_t last_tokens_size, float penalty); /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details. 
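Taken together, the tokenize/eval/logits entry points in this hunk form the usual prompt-processing step. A minimal sketch with error handling trimmed; the helper name eval_prompt is not part of the patch.

// Sketch: tokenize a prompt, evaluate it, and read the last token's logits.
#include "llama_v2.h"
#include <cstdio>
#include <vector>

static void eval_prompt(struct llama_v2_context * ctx, const char * prompt, int n_threads) {
    std::vector<llama_v2_token> tokens(llama_v2_n_ctx(ctx));
    const int n = llama_v2_tokenize(ctx, prompt, tokens.data(), (int) tokens.size(), /*add_bos=*/true);
    if (n < 0) { std::fprintf(stderr, "prompt needs %d tokens\n", -n); return; }
    tokens.resize(n);

    // n_past = 0: nothing has been evaluated in this context yet.
    if (llama_v2_eval(ctx, tokens.data(), (int) tokens.size(), 0, n_threads) != 0) {
        std::fprintf(stderr, "eval failed\n");
        return;
    }

    // With logits_all == false, llama_v2_get_logits() points at the last token's row.
    const float * logits = llama_v2_get_logits(ctx);
    std::printf("n_vocab = %d, first logit = %f\n", llama_v2_n_vocab(ctx), logits[0]);
}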
- LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence); + LLAMA_V2_API void llama_v2_sample_frequency_and_presence_penalties(struct llama_v2_context * ctx, llama_v2_token_data_array * candidates, const llama_v2_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence); /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits. - LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates); + LLAMA_V2_API void llama_v2_sample_softmax(struct llama_v2_context * ctx, llama_v2_token_data_array * candidates); /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 - LLAMA_API void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep); + LLAMA_V2_API void llama_v2_sample_top_k(struct llama_v2_context * ctx, llama_v2_token_data_array * candidates, int k, size_t min_keep); /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 - LLAMA_API void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep); + LLAMA_V2_API void llama_v2_sample_top_p(struct llama_v2_context * ctx, llama_v2_token_data_array * candidates, float p, size_t min_keep); /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/. - LLAMA_API void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep); + LLAMA_V2_API void llama_v2_sample_tail_free(struct llama_v2_context * ctx, llama_v2_token_data_array * candidates, float z, size_t min_keep); /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666. - LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep); - LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp); + LLAMA_V2_API void llama_v2_sample_typical(struct llama_v2_context * ctx, llama_v2_token_data_array * candidates, float p, size_t min_keep); + LLAMA_V2_API void llama_v2_sample_temperature(struct llama_v2_context * ctx, llama_v2_token_data_array * candidates, float temp); /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. - /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. + /// @param candidates A vector of `llama_v2_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. 
A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. /// @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm. /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. - LLAMA_API llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu); + LLAMA_V2_API llama_v2_token llama_v2_sample_token_mirostat(struct llama_v2_context * ctx, llama_v2_token_data_array * candidates, float tau, float eta, int m, float * mu); /// @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. - /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. + /// @param candidates A vector of `llama_v2_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. - LLAMA_API llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu); + LLAMA_V2_API llama_v2_token llama_v2_sample_token_mirostat_v2(struct llama_v2_context * ctx, llama_v2_token_data_array * candidates, float tau, float eta, float * mu); /// @details Selects the token with the highest probability. - LLAMA_API llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates); + LLAMA_V2_API llama_v2_token llama_v2_sample_token_greedy(struct llama_v2_context * ctx, llama_v2_token_data_array * candidates); /// @details Randomly selects a token from the candidates based on their probabilities. 
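In practice the samplers declared above are chained: build a llama_v2_token_data_array from the logits, apply the penalty and truncation samplers, then draw a token. A sketch; the concrete penalty/k/p/temperature values are illustrative, not mandated by the patch.

// Sketch: one sampling step with the renamed v2 samplers. The parameter
// values below are example settings, not defaults from the patch.
#include "llama_v2.h"
#include <vector>

static llama_v2_token sample_next(struct llama_v2_context * ctx,
                                  const std::vector<llama_v2_token> & last_tokens) {
    const int n_vocab = llama_v2_n_vocab(ctx);
    const float * logits = llama_v2_get_logits(ctx);

    std::vector<llama_v2_token_data> candidates(n_vocab);
    for (llama_v2_token id = 0; id < n_vocab; id++) {
        candidates[id] = { id, logits[id], 0.0f };
    }
    llama_v2_token_data_array arr = { candidates.data(), candidates.size(), /*sorted=*/false };

    llama_v2_sample_repetition_penalty(ctx, &arr, last_tokens.data(), last_tokens.size(), 1.1f);
    llama_v2_sample_top_k      (ctx, &arr, 40,    1);
    llama_v2_sample_top_p      (ctx, &arr, 0.95f, 1);
    llama_v2_sample_temperature(ctx, &arr, 0.8f);

    return llama_v2_sample_token(ctx, &arr);
}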
- LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates); + LLAMA_V2_API llama_v2_token llama_v2_sample_token(struct llama_v2_context * ctx, llama_v2_token_data_array * candidates); // Performance information - LLAMA_API void llama_print_timings(struct llama_context * ctx); - LLAMA_API void llama_reset_timings(struct llama_context * ctx); + LLAMA_V2_API void llama_v2_print_timings(struct llama_v2_context * ctx); + LLAMA_V2_API void llama_v2_reset_timings(struct llama_v2_context * ctx); // Print system information - LLAMA_API const char * llama_print_system_info(void); + LLAMA_V2_API const char * llama_v2_print_system_info(void); #ifdef __cplusplus } #endif // Internal API to be implemented by llama.cpp and used by tests/benchmarks only -#ifdef LLAMA_API_INTERNAL +#ifdef LLAMA_V2_API_INTERNAL #include #include -struct ggml_tensor; +struct ggml_v2_tensor; -std::vector>& llama_internal_get_tensor_map(struct llama_context * ctx); +std::vector>& llama_v2_internal_get_tensor_map(struct llama_v2_context * ctx); #endif -#endif // LLAMA_H +#endif // LLAMA_V2_H diff --git a/otherarch/neox_v2.cpp b/otherarch/neox_v2.cpp index 5ebca4fd8..d6ac629b1 100644 --- a/otherarch/neox_v2.cpp +++ b/otherarch/neox_v2.cpp @@ -1,4 +1,4 @@ -#include "ggml.h" +#include "ggml_v2.h" #include "otherarch.h" #include "utils.h" @@ -55,7 +55,7 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model & } fin.read((char *) &hparams.ftype, sizeof(hparams.ftype)); - const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR; + const int32_t qntvr = hparams.ftype / GGML_V2_QNT_VERSION_FACTOR; printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); @@ -67,7 +67,7 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model & printf("%s: ftype = %d\n", __func__, hparams.ftype); printf("%s: qntvr = %d\n", __func__, qntvr); - hparams.ftype %= GGML_QNT_VERSION_FACTOR; + hparams.ftype %= GGML_V2_QNT_VERSION_FACTOR; } // load vocab @@ -89,8 +89,8 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model & // for the big tensors, we have the option to store the data in 16-bit floats or quantized // in order to save memory and also to speed up the computation - ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype)); - if (wtype == GGML_TYPE_COUNT) { + ggml_v2_type wtype = ggml_v2_ftype_to_ggml_v2_type((ggml_v2_ftype) (model.hparams.ftype)); + if (wtype == GGML_V2_TYPE_COUNT) { fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n", __func__, fname.c_str(), model.hparams.ftype); return ModelLoadResult::FAIL; @@ -108,34 +108,34 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model & const int n_ctx = hparams.n_ctx; const int n_vocab = hparams.n_vocab; - ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g - ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b + ctx_size += n_embd*ggml_v2_type_sizef(GGML_V2_TYPE_F32); // ln_f_g + ctx_size += n_embd*ggml_v2_type_sizef(GGML_V2_TYPE_F32); // ln_f_b - ctx_size += n_embd*n_vocab*ggml_type_sizef(wtype); // wte + ctx_size += n_embd*n_vocab*ggml_v2_type_sizef(wtype); // wte - ctx_size += n_embd*n_vocab*ggml_type_sizef(wtype); // lmh_g - //ctx_size += n_vocab*ggml_type_sizef(GGML_TYPE_F32); // lmh_b + ctx_size += n_embd*n_vocab*ggml_v2_type_sizef(wtype); // lmh_g + //ctx_size += n_vocab*ggml_v2_type_sizef(GGML_V2_TYPE_F32); // lmh_b - ctx_size += 
n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g - ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b + ctx_size += n_layer*(n_embd*ggml_v2_type_sizef(GGML_V2_TYPE_F32)); // ln_1_g + ctx_size += n_layer*(n_embd*ggml_v2_type_sizef(GGML_V2_TYPE_F32)); // ln_1_b - ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_attn_w - ctx_size += n_layer*( 3*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b + ctx_size += n_layer*(3*n_embd*n_embd*ggml_v2_type_sizef(wtype)); // c_attn_attn_w + ctx_size += n_layer*( 3*n_embd*ggml_v2_type_sizef(GGML_V2_TYPE_F32)); // c_attn_attn_b - ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w - ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_proj_b + ctx_size += n_layer*(n_embd*n_embd*ggml_v2_type_sizef(wtype)); // c_attn_proj_w + ctx_size += n_layer*(n_embd*n_embd*ggml_v2_type_sizef(GGML_V2_TYPE_F32)); // c_attn_proj_b - ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g - ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b + ctx_size += n_layer*(n_embd*ggml_v2_type_sizef(GGML_V2_TYPE_F32)); // ln_2_g + ctx_size += n_layer*(n_embd*ggml_v2_type_sizef(GGML_V2_TYPE_F32)); // ln_2_b - ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w - ctx_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b + ctx_size += n_layer*(4*n_embd*n_embd*ggml_v2_type_sizef(wtype)); // c_mlp_fc_w + ctx_size += n_layer*( 4*n_embd*ggml_v2_type_sizef(GGML_V2_TYPE_F32)); // c_mlp_fc_b - ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w - ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b + ctx_size += n_layer*(4*n_embd*n_embd*ggml_v2_type_sizef(wtype)); // c_mlp_proj_w + ctx_size += n_layer*( n_embd*ggml_v2_type_sizef(GGML_V2_TYPE_F32)); // c_mlp_proj_b - ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k - ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v + ctx_size += n_ctx*n_layer*n_embd*ggml_v2_type_sizef(GGML_V2_TYPE_F32); // memory_k + ctx_size += n_ctx*n_layer*n_embd*ggml_v2_type_sizef(GGML_V2_TYPE_F32); // memory_v ctx_size += (6 + 16*n_layer)*512; // object overhead @@ -144,14 +144,14 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model & // create the ggml context { - struct ggml_init_params params; + struct ggml_v2_init_params params; params.mem_size = ctx_size; params.mem_buffer = NULL; params.no_alloc = false; - model.ctx = ggml_init(params); + model.ctx = ggml_v2_init(params); if (!model.ctx) { - fprintf(stderr, "%s: ggml_init() failed\n", __func__); + fprintf(stderr, "%s: ggml_v2_init() failed\n", __func__); return ModelLoadResult::FAIL; } } @@ -166,13 +166,13 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model & model.layers.resize(n_layer); - model.wte = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); + model.wte = ggml_v2_new_tensor_2d(ctx, wtype, n_embd, n_vocab); - model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + model.ln_f_g = ggml_v2_new_tensor_1d(ctx, GGML_V2_TYPE_F32, n_embd); + model.ln_f_b = ggml_v2_new_tensor_1d(ctx, GGML_V2_TYPE_F32, n_embd); - model.lmh_g = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); - //model.lmh_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_vocab); + model.lmh_g = ggml_v2_new_tensor_2d(ctx, wtype, n_embd, 
n_vocab); + //model.lmh_b = ggml_v2_new_tensor_1d(ctx, GGML_V2_TYPE_F32, n_vocab); // map by name model.tensors["gpt_neox.embed_in.weight"] = model.wte; @@ -186,23 +186,23 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model & for (int i = 0; i < n_layer; ++i) { auto & layer = model.layers[i]; - layer.ln_1_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + layer.ln_1_g = ggml_v2_new_tensor_1d(ctx, GGML_V2_TYPE_F32, n_embd); + layer.ln_1_b = ggml_v2_new_tensor_1d(ctx, GGML_V2_TYPE_F32, n_embd); - layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 3*n_embd); - layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd); + layer.c_attn_attn_w = ggml_v2_new_tensor_2d(ctx, wtype, n_embd, 3*n_embd); + layer.c_attn_attn_b = ggml_v2_new_tensor_1d(ctx, GGML_V2_TYPE_F32, 3*n_embd); - layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); - layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + layer.c_attn_proj_w = ggml_v2_new_tensor_2d(ctx, wtype, n_embd, n_embd); + layer.c_attn_proj_b = ggml_v2_new_tensor_1d(ctx, GGML_V2_TYPE_F32, n_embd); - layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + layer.ln_2_g = ggml_v2_new_tensor_1d(ctx, GGML_V2_TYPE_F32, n_embd); + layer.ln_2_b = ggml_v2_new_tensor_1d(ctx, GGML_V2_TYPE_F32, n_embd); - layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd); - layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd); + layer.c_mlp_fc_w = ggml_v2_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd); + layer.c_mlp_fc_b = ggml_v2_new_tensor_1d(ctx, GGML_V2_TYPE_F32, 4*n_embd); - layer.c_mlp_proj_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd); - layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + layer.c_mlp_proj_w = ggml_v2_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd); + layer.c_mlp_proj_b = ggml_v2_new_tensor_1d(ctx, GGML_V2_TYPE_F32, n_embd); // map by name model.tensors["gpt_neox.layers." 
+ std::to_string(i) + ".input_layernorm.weight"] = layer.ln_1_g; @@ -236,10 +236,10 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model & const int64_t n_mem = n_layer*n_ctx; const int64_t n_elements = n_embd*n_mem; - model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); - model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); + model.memory_k = ggml_v2_new_tensor_1d(ctx, GGML_V2_TYPE_F16, n_elements); + model.memory_v = ggml_v2_new_tensor_1d(ctx, GGML_V2_TYPE_F16, n_elements); - const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v); + const size_t memory_size = ggml_v2_nbytes(model.memory_k) + ggml_v2_nbytes(model.memory_v); printf("%s: memory_size = %8.2f MB, n_mem = %" PRId64 "\n", __func__, memory_size/1024.0/1024.0, n_mem); } @@ -280,7 +280,7 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model & } auto tensor = model.tensors[name.data()]; - if (ggml_nelements(tensor) != nelements) { + if (ggml_v2_nelements(tensor) != nelements) { fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data()); return ModelLoadResult::FAIL; } @@ -293,20 +293,20 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model & // for debugging if (0) { - printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor)); + printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ggml_v2_type_name(ggml_v2_type(ttype)), ggml_v2_nbytes(tensor)/1024.0/1024.0, ggml_v2_nbytes(tensor)); } - size_t bpe = ggml_type_size(ggml_type(ttype)); + size_t bpe = ggml_v2_type_size(ggml_v2_type(ttype)); if(file_format==FileFormat::NEOX_1) { switch (ttype) { - case 0: bpe = ggml_type_size(GGML_TYPE_F32); break; - case 1: bpe = ggml_type_size(GGML_TYPE_F16); break; - case 2: bpe = ggml_type_size(GGML_TYPE_Q4_0); assert(ne[0] % 64 == 0); break; - case 3: bpe = ggml_type_size(GGML_TYPE_Q4_1); assert(ne[0] % 64 == 0); break; - case 5: bpe = ggml_type_size(GGML_TYPE_Q4_2); assert(ne[0] % 64 == 0); break; - case 6: bpe = ggml_type_size(GGML_TYPE_Q4_3); assert(ne[0] % 64 == 0); break; + case 0: bpe = ggml_v2_type_size(GGML_V2_TYPE_F32); break; + case 1: bpe = ggml_v2_type_size(GGML_V2_TYPE_F16); break; + case 2: bpe = ggml_v2_type_size(GGML_V2_TYPE_Q4_0); assert(ne[0] % 64 == 0); break; + case 3: bpe = ggml_v2_type_size(GGML_V2_TYPE_Q4_1); assert(ne[0] % 64 == 0); break; + case 5: bpe = ggml_v2_type_size(GGML_V2_TYPE_Q4_2); assert(ne[0] % 64 == 0); break; + case 6: bpe = ggml_v2_type_size(GGML_V2_TYPE_Q4_3); assert(ne[0] % 64 == 0); break; default: { fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ttype); @@ -315,16 +315,16 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model & }; } - if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) { + if ((nelements*bpe)/ggml_v2_blck_size(tensor->type) != ggml_v2_nbytes(tensor)) { fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n", - __func__, name.data(), ggml_nbytes(tensor), nelements*bpe); - ggml_free(ctx); + __func__, name.data(), ggml_v2_nbytes(tensor), nelements*bpe); + ggml_v2_free(ctx); return ModelLoadResult::RETRY_LOAD; } - fin.read(reinterpret_cast(tensor->data), ggml_nbytes(tensor)); + fin.read(reinterpret_cast(tensor->data), ggml_v2_nbytes(tensor)); - total_size += 
ggml_nbytes(tensor); + total_size += ggml_v2_nbytes(tensor); if (++n_tensors % 8 == 0) { printf("."); fflush(stdout); @@ -343,37 +343,37 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model & // feed-forward network -ggml_tensor * gpt_neox_ff( +ggml_v2_tensor * gpt_neox_ff( const gpt_neox_layer &layer, - ggml_context * ctx0, - ggml_tensor * inp) { - ggml_tensor * cur = ggml_norm(ctx0, inp); + ggml_v2_context * ctx0, + ggml_v2_tensor * inp) { + ggml_v2_tensor * cur = ggml_v2_norm(ctx0, inp); - cur = ggml_add(ctx0, - ggml_mul(ctx0, - ggml_repeat(ctx0, layer.ln_2_g, cur), + cur = ggml_v2_add(ctx0, + ggml_v2_mul(ctx0, + ggml_v2_repeat(ctx0, layer.ln_2_g, cur), cur), - ggml_repeat(ctx0, layer.ln_2_b, cur)); + ggml_v2_repeat(ctx0, layer.ln_2_b, cur)); - cur = ggml_mul_mat(ctx0, + cur = ggml_v2_mul_mat(ctx0, layer.c_mlp_fc_w, cur); - cur = ggml_add(ctx0, - ggml_repeat(ctx0, layer.c_mlp_fc_b, cur), + cur = ggml_v2_add(ctx0, + ggml_v2_repeat(ctx0, layer.c_mlp_fc_b, cur), cur); // GELU activation - cur = ggml_gelu(ctx0, cur); + cur = ggml_v2_gelu(ctx0, cur); // projection // cur = proj_w*cur + proj_b - cur = ggml_mul_mat(ctx0, + cur = ggml_v2_mul_mat(ctx0, layer.c_mlp_proj_w, cur); - cur = ggml_add(ctx0, - ggml_repeat(ctx0, layer.c_mlp_proj_b, cur), + cur = ggml_v2_add(ctx0, + ggml_v2_repeat(ctx0, layer.c_mlp_proj_b, cur), cur); return cur; } @@ -420,196 +420,196 @@ bool gpt_neox_eval( } } - struct ggml_init_params params; + struct ggml_v2_init_params params; params.mem_size = buf_size; params.mem_buffer = buf; params.no_alloc = false; - struct ggml_context * ctx0 = ggml_init(params); - struct ggml_cgraph gf = {}; + struct ggml_v2_context * ctx0 = ggml_v2_init(params); + struct ggml_v2_cgraph gf = {}; gf.n_threads = n_threads; - struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); - memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd)); + struct ggml_v2_tensor * embd = ggml_v2_new_tensor_1d(ctx0, GGML_V2_TYPE_I32, N); + memcpy(embd->data, embd_inp.data(), N*ggml_v2_element_size(embd)); // wte - struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.wte, embd); + struct ggml_v2_tensor * inpL = ggml_v2_get_rows(ctx0, model.wte, embd); for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * cur; + struct ggml_v2_tensor * cur; // self-attention { { - cur = ggml_norm(ctx0, inpL); + cur = ggml_v2_norm(ctx0, inpL); - cur = ggml_add(ctx0, - ggml_mul(ctx0, - ggml_repeat(ctx0, model.layers[il].ln_1_g, cur), + cur = ggml_v2_add(ctx0, + ggml_v2_mul(ctx0, + ggml_v2_repeat(ctx0, model.layers[il].ln_1_g, cur), cur), - ggml_repeat(ctx0, model.layers[il].ln_1_b, cur)); + ggml_v2_repeat(ctx0, model.layers[il].ln_1_b, cur)); } // compute QKV { - cur = ggml_mul_mat(ctx0, + cur = ggml_v2_mul_mat(ctx0, model.layers[il].c_attn_attn_w, cur); - cur = ggml_add(ctx0, - ggml_repeat(ctx0, model.layers[il].c_attn_attn_b, cur), + cur = ggml_v2_add(ctx0, + ggml_v2_repeat(ctx0, model.layers[il].c_attn_attn_b, cur), cur); } - struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd/n_head, n_head, N, cur->nb[1]/n_head, cur->nb[1], 0*sizeof(float)*n_embd/n_head)); - struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd/n_head, n_head, N, cur->nb[1]/n_head, cur->nb[1], 1*sizeof(float)*n_embd/n_head)); - struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd/n_head, n_head, N, cur->nb[1]/n_head, cur->nb[1], 2*sizeof(float)*n_embd/n_head)); + struct ggml_v2_tensor * Qcur = ggml_v2_cont(ctx0, ggml_v2_view_3d(ctx0, cur, 
n_embd/n_head, n_head, N, cur->nb[1]/n_head, cur->nb[1], 0*sizeof(float)*n_embd/n_head)); + struct ggml_v2_tensor * Kcur = ggml_v2_cont(ctx0, ggml_v2_view_3d(ctx0, cur, n_embd/n_head, n_head, N, cur->nb[1]/n_head, cur->nb[1], 1*sizeof(float)*n_embd/n_head)); + struct ggml_v2_tensor * Vcur = ggml_v2_cont(ctx0, ggml_v2_view_3d(ctx0, cur, n_embd/n_head, n_head, N, cur->nb[1]/n_head, cur->nb[1], 2*sizeof(float)*n_embd/n_head)); // using mode = 2 for GPT-NeoX mode - Qcur = ggml_rope_inplace(ctx0, Qcur, n_past, n_rot, 2); - Kcur = ggml_rope_inplace(ctx0, Kcur, n_past, n_rot, 2); + Qcur = ggml_v2_rope_inplace(ctx0, Qcur, n_past, n_rot, 2); + Kcur = ggml_v2_rope_inplace(ctx0, Kcur, n_past, n_rot, 2); // store key and value to memory { - Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd, N)); + Vcur = ggml_v2_transpose(ctx0, ggml_v2_reshape_2d(ctx0, Vcur, n_embd, N)); - struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past)); - struct ggml_tensor * v = ggml_view_2d(ctx0, model.memory_v, N, n_embd, - ( n_ctx)*ggml_element_size(model.memory_v), - (il*n_ctx)*ggml_element_size(model.memory_v)*n_embd + n_past*ggml_element_size(model.memory_v)); + struct ggml_v2_tensor * k = ggml_v2_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_v2_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past)); + struct ggml_v2_tensor * v = ggml_v2_view_2d(ctx0, model.memory_v, N, n_embd, + ( n_ctx)*ggml_v2_element_size(model.memory_v), + (il*n_ctx)*ggml_v2_element_size(model.memory_v)*n_embd + n_past*ggml_v2_element_size(model.memory_v)); - ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k)); - ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v)); + ggml_v2_build_forward_expand(&gf, ggml_v2_cpy(ctx0, Kcur, k)); + ggml_v2_build_forward_expand(&gf, ggml_v2_cpy(ctx0, Vcur, v)); } // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3) - struct ggml_tensor * Q = - ggml_permute(ctx0, + struct ggml_v2_tensor * Q = + ggml_v2_permute(ctx0, Qcur, 0, 2, 1, 3); // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3) - struct ggml_tensor * K = - ggml_permute(ctx0, - ggml_reshape_3d(ctx0, - ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd), + struct ggml_v2_tensor * K = + ggml_v2_permute(ctx0, + ggml_v2_reshape_3d(ctx0, + ggml_v2_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_v2_element_size(model.memory_k)*n_embd), n_embd/n_head, n_head, n_past + N), 0, 2, 1, 3); // K * Q - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + struct ggml_v2_tensor * KQ = ggml_v2_mul_mat(ctx0, K, Q); // KQ_scaled = KQ / sqrt(n_embd/n_head) - struct ggml_tensor * KQ_scaled = - ggml_scale_inplace(ctx0, + struct ggml_v2_tensor * KQ_scaled = + ggml_v2_scale_inplace(ctx0, KQ, - ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head)) + ggml_v2_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head)) ); // KQ_masked = mask_past(KQ_scaled) - struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); + struct ggml_v2_tensor * KQ_masked = ggml_v2_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); // KQ = soft_max(KQ_masked) - struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); + struct ggml_v2_tensor * KQ_soft_max = ggml_v2_soft_max_inplace(ctx0, KQ_masked); // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous() - struct ggml_tensor * V = - ggml_view_3d(ctx0, model.memory_v, + struct 
ggml_v2_tensor * V = + ggml_v2_view_3d(ctx0, model.memory_v, n_past + N, n_embd/n_head, n_head, - n_ctx*ggml_element_size(model.memory_v), - n_ctx*ggml_element_size(model.memory_v)*n_embd/n_head, - il*n_ctx*ggml_element_size(model.memory_v)*n_embd); + n_ctx*ggml_v2_element_size(model.memory_v), + n_ctx*ggml_v2_element_size(model.memory_v)*n_embd/n_head, + il*n_ctx*ggml_v2_element_size(model.memory_v)*n_embd); // KQV = transpose(V) * KQ_soft_max - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); + struct ggml_v2_tensor * KQV = ggml_v2_mul_mat(ctx0, V, KQ_soft_max); // KQV_merged = KQV.permute(0, 2, 1, 3) - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + struct ggml_v2_tensor * KQV_merged = ggml_v2_permute(ctx0, KQV, 0, 2, 1, 3); // cur = KQV_merged.contiguous().view(n_embd, N) - cur = ggml_cpy(ctx0, + cur = ggml_v2_cpy(ctx0, KQV_merged, - ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); + ggml_v2_new_tensor_2d(ctx0, GGML_V2_TYPE_F32, n_embd, N)); // projection { - cur = ggml_mul_mat(ctx0, + cur = ggml_v2_mul_mat(ctx0, model.layers[il].c_attn_proj_w, cur); - cur = ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].c_attn_proj_b, cur), cur); + cur = ggml_v2_add(ctx0, ggml_v2_repeat(ctx0, model.layers[il].c_attn_proj_b, cur), cur); } } if (hparams.par_res == 0) { - struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpL); + struct ggml_v2_tensor * inpFF = ggml_v2_add(ctx0, cur, inpL); cur = gpt_neox_ff(model.layers[il], ctx0, inpFF); // input for next layer - inpL = ggml_add(ctx0, cur, inpFF); + inpL = ggml_v2_add(ctx0, cur, inpFF); } else { - struct ggml_tensor * inpFF = cur; + struct ggml_v2_tensor * inpFF = cur; // this is independent of the self-attention result, so it could be done in parallel to the self-attention // note here we pass inpL instead of cur cur = gpt_neox_ff(model.layers[il], ctx0, inpL); // layer input + FF - cur = ggml_add(ctx0, cur, inpFF); + cur = ggml_v2_add(ctx0, cur, inpFF); // input for next layer - inpL = ggml_add(ctx0, cur, inpL); + inpL = ggml_v2_add(ctx0, cur, inpL); } } // norm { - inpL = ggml_norm(ctx0, inpL); + inpL = ggml_v2_norm(ctx0, inpL); // inpL = ln_f_g*inpL + ln_f_b - inpL = ggml_add(ctx0, - ggml_mul(ctx0, - ggml_repeat(ctx0, model.ln_f_g, inpL), + inpL = ggml_v2_add(ctx0, + ggml_v2_mul(ctx0, + ggml_v2_repeat(ctx0, model.ln_f_g, inpL), inpL), - ggml_repeat(ctx0, model.ln_f_b, inpL)); + ggml_v2_repeat(ctx0, model.ln_f_b, inpL)); } // lm_head { - inpL = ggml_mul_mat(ctx0, model.lmh_g, inpL); + inpL = ggml_v2_mul_mat(ctx0, model.lmh_g, inpL); - //inpL = ggml_add(ctx0, - // ggml_repeat(ctx0, model.lmh_b, inpL), + //inpL = ggml_v2_add(ctx0, + // ggml_v2_repeat(ctx0, model.lmh_b, inpL), // inpL); } // logits -> probs - //inpL = ggml_soft_max_inplace(ctx0, inpL); + //inpL = ggml_v2_soft_max_inplace(ctx0, inpL); // run the computation - ggml_build_forward_expand(&gf, inpL); - ggml_graph_compute (ctx0, &gf); + ggml_v2_build_forward_expand(&gf, inpL); + ggml_v2_graph_compute (ctx0, &gf); //if (n_past%100 == 0) { - // ggml_graph_print (&gf); - // ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot"); + // ggml_v2_graph_print (&gf); + // ggml_v2_graph_dump_dot(&gf, NULL, "gpt-2.dot"); //} //embd_w.resize(n_vocab*N); - //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N); + //memcpy(embd_w.data(), ggml_v2_get_data(inpL), sizeof(float)*n_vocab*N); // return result for just the last token embd_w.resize(n_vocab); - memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab); + 
memcpy(embd_w.data(), (float *) ggml_v2_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab); if (mem_per_token == 0) { - mem_per_token = ggml_used_mem(ctx0)/N; + mem_per_token = ggml_v2_used_mem(ctx0)/N; } - //printf("used_mem = %zu\n", ggml_used_mem(ctx0)); + //printf("used_mem = %zu\n", ggml_v2_used_mem(ctx0)); - ggml_free(ctx0); + ggml_v2_free(ctx0); return true; } \ No newline at end of file diff --git a/otherarch/otherarch.h b/otherarch/otherarch.h index c8399f911..3644dc006 100644 --- a/otherarch/otherarch.h +++ b/otherarch/otherarch.h @@ -46,6 +46,26 @@ struct gptj_layer { struct ggml_tensor * c_mlp_proj_w_trans; //for backwards compatibility struct ggml_tensor * c_mlp_proj_b; }; +struct gptj_layer_v2 { + // normalization + struct ggml_v2_tensor * ln_1_g; + struct ggml_v2_tensor * ln_1_b; + + // attention + struct ggml_v2_tensor * c_attn_q_proj_w; + struct ggml_v2_tensor * c_attn_k_proj_w; + struct ggml_v2_tensor * c_attn_v_proj_w; + + struct ggml_v2_tensor * c_attn_proj_w; + + // ff + struct ggml_v2_tensor * c_mlp_fc_w; + struct ggml_v2_tensor * c_mlp_fc_b; + + struct ggml_v2_tensor * c_mlp_proj_w; + struct ggml_v2_tensor * c_mlp_proj_w_trans; //for backwards compatibility + struct ggml_v2_tensor * c_mlp_proj_b; +}; struct gptj_layer_v1 { // normalization struct ggml_v1_tensor * ln_1_g; @@ -90,6 +110,29 @@ struct gptj_model_v1 { std::map tensors; }; +struct gptj_model_v2 { + gptj_hparams hparams; + + // normalization + struct ggml_v2_tensor * ln_f_g; + struct ggml_v2_tensor * ln_f_b; + + struct ggml_v2_tensor * wte; // position embedding + + struct ggml_v2_tensor * lmh_g; // language model head + struct ggml_v2_tensor * lmh_b; // language model bias + + std::vector layers; + + // key + value memory + struct ggml_v2_tensor * memory_k; + struct ggml_v2_tensor * memory_v; + + // + struct ggml_v2_context * ctx; + std::map tensors; +}; + struct gptj_model { gptj_hparams hparams; @@ -167,6 +210,50 @@ struct gpt2_v1_model { std::map tensors; }; +struct gpt2_layer_v2 { + // normalization + struct ggml_v2_tensor * ln_1_g; + struct ggml_v2_tensor * ln_1_b; + + struct ggml_v2_tensor * ln_2_g; + struct ggml_v2_tensor * ln_2_b; + + // attention + struct ggml_v2_tensor * c_attn_attn_w; + struct ggml_v2_tensor * c_attn_attn_b; + + struct ggml_v2_tensor * c_attn_proj_w; + struct ggml_v2_tensor * c_attn_proj_b; + + // mlp + struct ggml_v2_tensor * c_mlp_fc_w; + struct ggml_v2_tensor * c_mlp_fc_b; + + struct ggml_v2_tensor * c_mlp_proj_w; + struct ggml_v2_tensor * c_mlp_proj_b; +}; + +struct gpt2_v2_model { + gpt2_hparams hparams; + + // normalization + struct ggml_v2_tensor * ln_f_g; + struct ggml_v2_tensor * ln_f_b; + + struct ggml_v2_tensor * wte; // position embedding + struct ggml_v2_tensor * wpe; // token embedding + struct ggml_v2_tensor * lm_head; // language model head + + std::vector layers; + + // key + value memory + struct ggml_v2_tensor * memory_k; + struct ggml_v2_tensor * memory_v; + + // + struct ggml_v2_context * ctx; + std::map tensors; +}; struct gpt2_layer { // normalization @@ -225,6 +312,53 @@ struct gpt_neox_hparams { int32_t ftype = 1; }; +struct gpt_neox_layer_v2 { + // pre normalization + struct ggml_v2_tensor * ln_1_g; + struct ggml_v2_tensor * ln_1_b; + + // attention + struct ggml_v2_tensor * c_attn_attn_w; + struct ggml_v2_tensor * c_attn_attn_b; + + struct ggml_v2_tensor * c_attn_proj_w; + struct ggml_v2_tensor * c_attn_proj_b; + + // post normalization + struct ggml_v2_tensor * ln_2_g; + struct ggml_v2_tensor * ln_2_b; + + // ff + struct ggml_v2_tensor * 
c_mlp_fc_w; + struct ggml_v2_tensor * c_mlp_fc_b; + + struct ggml_v2_tensor * c_mlp_proj_w; + struct ggml_v2_tensor * c_mlp_proj_b; +}; + +struct gpt_neox_v2_model { + gpt_neox_hparams hparams; + + // normalization + struct ggml_v2_tensor * ln_f_g; + struct ggml_v2_tensor * ln_f_b; + + struct ggml_v2_tensor * wte; // position embedding + + struct ggml_v2_tensor * lmh_g; // language model head + //struct ggml_tensor * lmh_b; // language model bias + + std::vector layers; + + // key + value memory + struct ggml_v2_tensor * memory_k; + struct ggml_v2_tensor * memory_v; + + // + struct ggml_v2_context * ctx; + std::map tensors; +}; + struct gpt_neox_layer { // pre normalization struct ggml_tensor * ln_1_g; diff --git a/otherarch/rwkv_v2.cpp b/otherarch/rwkv_v2.cpp index 6aa25e9f1..a58830dce 100644 --- a/otherarch/rwkv_v2.cpp +++ b/otherarch/rwkv_v2.cpp @@ -4,7 +4,7 @@ #include "otherarch.h" #include "rwkv_v2.h" -#include "ggml.h" +#include "ggml_v2.h" #include #include @@ -48,21 +48,21 @@ bool read_int32(FILE * file, int32_t * dest) { return true; } -#define GGML_TYPE_UNKNOWN GGML_TYPE_COUNT +#define GGML_V2_TYPE_UNKNOWN GGML_V2_TYPE_COUNT #define FORMAT_TYPE_COUNT 10 -static const ggml_type FORMAT_TYPE_TO_GGML_TYPE[FORMAT_TYPE_COUNT] = { - GGML_TYPE_F32, - GGML_TYPE_F16, - GGML_TYPE_Q4_0, - GGML_TYPE_Q4_1, - GGML_TYPE_UNKNOWN, // Unused - GGML_TYPE_Q4_2, - GGML_TYPE_UNKNOWN, // Unused - GGML_TYPE_Q5_0, - GGML_TYPE_Q5_1, - GGML_TYPE_Q8_0 +static const ggml_v2_type FORMAT_TYPE_TO_GGML_V2_TYPE[FORMAT_TYPE_COUNT] = { + GGML_V2_TYPE_F32, + GGML_V2_TYPE_F16, + GGML_V2_TYPE_Q4_0, + GGML_V2_TYPE_Q4_1, + GGML_V2_TYPE_UNKNOWN, // Unused + GGML_V2_TYPE_Q4_2, + GGML_V2_TYPE_UNKNOWN, // Unused + GGML_V2_TYPE_Q5_0, + GGML_V2_TYPE_Q5_1, + GGML_V2_TYPE_Q8_0 }; static int32_t format_name_to_format_type(const char * format_name) { @@ -79,29 +79,29 @@ static int32_t format_name_to_format_type(const char * format_name) { // --- Model definition and loading utilities --- struct rwkv_layer { - struct ggml_tensor * ln1_weight; - struct ggml_tensor * ln1_bias; + struct ggml_v2_tensor * ln1_weight; + struct ggml_v2_tensor * ln1_bias; // RWKV, also called "attention" by the author. - struct ggml_tensor * att_time_mix_k; - struct ggml_tensor * att_time_mix_v; - struct ggml_tensor * att_time_mix_r; - struct ggml_tensor * att_time_first; - struct ggml_tensor * att_time_decay; - struct ggml_tensor * att_key; - struct ggml_tensor * att_value; - struct ggml_tensor * att_receptance; - struct ggml_tensor * att_output; + struct ggml_v2_tensor * att_time_mix_k; + struct ggml_v2_tensor * att_time_mix_v; + struct ggml_v2_tensor * att_time_mix_r; + struct ggml_v2_tensor * att_time_first; + struct ggml_v2_tensor * att_time_decay; + struct ggml_v2_tensor * att_key; + struct ggml_v2_tensor * att_value; + struct ggml_v2_tensor * att_receptance; + struct ggml_v2_tensor * att_output; - struct ggml_tensor * ln2_weight; - struct ggml_tensor * ln2_bias; + struct ggml_v2_tensor * ln2_weight; + struct ggml_v2_tensor * ln2_bias; // FFN. - struct ggml_tensor * ffn_time_mix_k; - struct ggml_tensor * ffn_time_mix_r; - struct ggml_tensor * ffn_key; - struct ggml_tensor * ffn_value; - struct ggml_tensor * ffn_receptance; + struct ggml_v2_tensor * ffn_time_mix_k; + struct ggml_v2_tensor * ffn_time_mix_r; + struct ggml_v2_tensor * ffn_key; + struct ggml_v2_tensor * ffn_value; + struct ggml_v2_tensor * ffn_receptance; }; struct rwkv_model { @@ -111,23 +111,23 @@ struct rwkv_model { // 0 for float32, 1 for float16. 
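The FORMAT_TYPE_TO_GGML_V2_TYPE table above maps the integer stored in an RWKV model file onto a ggml_v2 tensor type, with two unused slots. Restated as a bounds-checked lookup (a sketch of the validation the loader performs further down in this file; the helper name and include path are assumptions):

// Sketch: translate an on-disk RWKV data_type index into a ggml_v2_type,
// rejecting out-of-range indices and the two unused slots (4 and 6).
#include "ggml_v2.h"
#include <cstdint>

static bool rwkv_data_type_to_ggml_v2(int32_t data_type, enum ggml_v2_type * out) {
    static const enum ggml_v2_type table[10] = {
        GGML_V2_TYPE_F32,   // 0
        GGML_V2_TYPE_F16,   // 1
        GGML_V2_TYPE_Q4_0,  // 2
        GGML_V2_TYPE_Q4_1,  // 3
        GGML_V2_TYPE_COUNT, // 4: unused
        GGML_V2_TYPE_Q4_2,  // 5
        GGML_V2_TYPE_COUNT, // 6: unused
        GGML_V2_TYPE_Q5_0,  // 7
        GGML_V2_TYPE_Q5_1,  // 8
        GGML_V2_TYPE_Q8_0,  // 9
    };
    if (data_type < 0 || data_type >= 10)       return false; // unsupported index
    if (table[data_type] == GGML_V2_TYPE_COUNT) return false; // unused slot
    *out = table[data_type];
    return true;
}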
int32_t data_type; - struct ggml_tensor * emb; + struct ggml_v2_tensor * emb; - struct ggml_tensor * ln0_weight; - struct ggml_tensor * ln0_bias; + struct ggml_v2_tensor * ln0_weight; + struct ggml_v2_tensor * ln0_bias; std::vector layers; - struct ggml_tensor * ln_out_weight; - struct ggml_tensor * ln_out_bias; + struct ggml_v2_tensor * ln_out_weight; + struct ggml_v2_tensor * ln_out_bias; - struct ggml_tensor * head; + struct ggml_v2_tensor * head; }; // Finds model parameter by key and sets it into dest. // If the parameter was not found, returns false. -bool set_parameter(std::unordered_map * parameters, char * key, struct ggml_tensor ** dest) { - struct ggml_tensor * parameter = (*parameters)[key]; +bool set_parameter(std::unordered_map * parameters, char * key, struct ggml_v2_tensor ** dest) { + struct ggml_v2_tensor * parameter = (*parameters)[key]; RWKV_ASSERT_FALSE(parameter != NULL, "Parameter %s not found in model file", key); *dest = parameter; return true; @@ -135,7 +135,7 @@ bool set_parameter(std::unordered_map * param // Finds block parameter by block index and key and sets it into dest. // If the parameter was not found, returns false. -bool set_block_parameter(std::unordered_map * parameters, int32_t block_index, char * key, struct ggml_tensor ** dest) { +bool set_block_parameter(std::unordered_map * parameters, int32_t block_index, char * key, struct ggml_v2_tensor ** dest) { char full_key[128]; sprintf(full_key, "blocks.%d.%s", block_index, key); return set_parameter(parameters, full_key, dest); @@ -167,28 +167,28 @@ void rwkv_max_impl(const int n_cols, float * dest, const float * src0, const flo } } -struct ggml_tensor * rwkv_exp(ggml_context * ctx, struct ggml_tensor * x) { - return ggml_map_unary_f32(ctx, x, rwkv_exp_impl); +struct ggml_v2_tensor * rwkv_exp(ggml_v2_context * ctx, struct ggml_v2_tensor * x) { + return ggml_v2_map_unary_f32(ctx, x, rwkv_exp_impl); } -struct ggml_tensor * rwkv_1_minus_x(ggml_context * ctx, struct ggml_tensor * x) { - return ggml_map_unary_f32(ctx, x, rwkv_1_minus_x_impl); +struct ggml_v2_tensor * rwkv_1_minus_x(ggml_v2_context * ctx, struct ggml_v2_tensor * x) { + return ggml_v2_map_unary_f32(ctx, x, rwkv_1_minus_x_impl); } -struct ggml_tensor * rwkv_sigmoid(ggml_context * ctx, struct ggml_tensor * x) { - return ggml_map_unary_f32(ctx, x, rwkv_sigmoid_impl); +struct ggml_v2_tensor * rwkv_sigmoid(ggml_v2_context * ctx, struct ggml_v2_tensor * x) { + return ggml_v2_map_unary_f32(ctx, x, rwkv_sigmoid_impl); } -struct ggml_tensor * rwkv_max(ggml_context * ctx, struct ggml_tensor * x, struct ggml_tensor * y) { - return ggml_map_binary_f32(ctx, x, y, rwkv_max_impl); +struct ggml_v2_tensor * rwkv_max(ggml_v2_context * ctx, struct ggml_v2_tensor * x, struct ggml_v2_tensor * y) { + return ggml_v2_map_binary_f32(ctx, x, y, rwkv_max_impl); } -struct ggml_tensor * rwkv_layer_norm(ggml_context * ctx, struct ggml_tensor * x, struct ggml_tensor * weight, struct ggml_tensor * bias) { +struct ggml_v2_tensor * rwkv_layer_norm(ggml_v2_context * ctx, struct ggml_v2_tensor * x, struct ggml_v2_tensor * weight, struct ggml_v2_tensor * bias) { // LayerNorm in RWKV is `x = (x - mean(x)) / sqrt(variance(x) + 1e-5) * weight + bias` - // Looks like ggml_norm does the first part, we only need to apply weight & bias. - x = ggml_norm(ctx, x); - x = ggml_mul(ctx, x, weight); - x = ggml_add(ctx, x, bias); + // Looks like ggml_v2_norm does the first part, we only need to apply weight & bias. 
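The rwkv_exp / rwkv_sigmoid / rwkv_max wrappers above show the pattern this file uses for scalar kernels that ggml_v2 has no built-in op for: write a plain loop over the row and register it with ggml_v2_map_unary_f32 or ggml_v2_map_binary_f32. The softplus below is a hypothetical extra example of the same pattern, not an op the patch adds.

// Sketch: a hypothetical element-wise op wired into the graph the same way
// rwkv_exp / rwkv_sigmoid are, via the ggml_v2 map_unary extension point.
#include "ggml_v2.h"
#include <cmath>

static void softplus_impl(const int n_cols, float * dest, const float * src) {
    for (int i = 0; i < n_cols; i++) {
        dest[i] = std::log(1.0f + std::exp(src[i]));
    }
}

static struct ggml_v2_tensor * rwkv_softplus(struct ggml_v2_context * ctx, struct ggml_v2_tensor * x) {
    return ggml_v2_map_unary_f32(ctx, x, softplus_impl);
}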
+ x = ggml_v2_norm(ctx, x); + x = ggml_v2_mul(ctx, x, weight); + x = ggml_v2_add(ctx, x, bias); return x; } @@ -196,12 +196,12 @@ struct ggml_tensor * rwkv_layer_norm(ggml_context * ctx, struct ggml_tensor * x, struct rwkv_context { struct rwkv_model * model; - struct ggml_tensor * token_index; - struct ggml_tensor * state; - struct ggml_tensor ** state_parts; - struct ggml_tensor * logits; - struct ggml_context * ctx; - struct ggml_cgraph * graph; + struct ggml_v2_tensor * token_index; + struct ggml_v2_tensor * state; + struct ggml_v2_tensor ** state_parts; + struct ggml_v2_tensor * logits; + struct ggml_v2_context * ctx; + struct ggml_v2_cgraph * graph; bool freed; float * state_in = 0; //stores input state, or use null for a new state float * state_out = 0; //stores address of output state buffer @@ -267,13 +267,13 @@ struct rwkv_context * rwkv_init_from_file(const char * file_path, uint32_t n_thr size_t(256) * 1024 * 1024; // Initialize ggml - struct ggml_init_params params; + struct ggml_v2_init_params params; params.mem_size = memory_required; params.mem_buffer = NULL; params.no_alloc = false; - struct ggml_context * ctx = ggml_init(params); + struct ggml_v2_context * ctx = ggml_v2_init(params); - std::unordered_map parameters; + std::unordered_map parameters; while (true) { int32_t dim_count; @@ -294,22 +294,22 @@ struct rwkv_context * rwkv_init_from_file(const char * file_path, uint32_t n_thr read_int32(file, &data_type); RWKV_ASSERT_NULL(data_type >= 0 && data_type < FORMAT_TYPE_COUNT, "Unsupported parameter data type %d", data_type); - ggml_type ggml_data_type = FORMAT_TYPE_TO_GGML_TYPE[data_type]; + ggml_v2_type ggml_v2_data_type = FORMAT_TYPE_TO_GGML_V2_TYPE[data_type]; - RWKV_ASSERT_NULL(ggml_data_type != GGML_TYPE_UNKNOWN, "Unsupported parameter data type %d", data_type); + RWKV_ASSERT_NULL(ggml_v2_data_type != GGML_V2_TYPE_UNKNOWN, "Unsupported parameter data type %d", data_type); - struct ggml_tensor * tensor; + struct ggml_v2_tensor * tensor; int32_t x = -1; int32_t y = -1; if (dim_count == 1) { read_int32(file, &x); - tensor = ggml_new_tensor_1d(ctx, ggml_data_type, x); + tensor = ggml_v2_new_tensor_1d(ctx, ggml_v2_data_type, x); } else if (dim_count == 2) { read_int32(file, &x); read_int32(file, &y); - tensor = ggml_new_tensor_2d(ctx, ggml_data_type, x, y); + tensor = ggml_v2_new_tensor_2d(ctx, ggml_v2_data_type, x, y); } else { abort(); } @@ -317,7 +317,7 @@ struct rwkv_context * rwkv_init_from_file(const char * file_path, uint32_t n_thr std::string key(key_length, 0); RWKV_ASSERT_NULL(fread(&key[0], 1, key_length, file) == uint32_t(key_length), "Failed to read parameter key"); - RWKV_ASSERT_NULL(fread(tensor->data, 1, ggml_nbytes(tensor), file) == ggml_nbytes(tensor), "Failed to read parameter data"); + RWKV_ASSERT_NULL(fread(tensor->data, 1, ggml_v2_nbytes(tensor), file) == ggml_v2_nbytes(tensor), "Failed to read parameter data"); parameters[key] = tensor; } @@ -365,7 +365,7 @@ struct rwkv_context * rwkv_init_from_file(const char * file_path, uint32_t n_thr set_parameter(¶meters, "head.weight", &(model->head)); // Verify order of dimensions - struct ggml_tensor * emb = model->emb; + struct ggml_v2_tensor * emb = model->emb; RWKV_ASSERT_NULL(emb->n_dims == 2, "Unexpected dimension count of embedding matrix %d", emb->n_dims); RWKV_ASSERT_NULL(emb->ne[0] == model->n_embed, "Unexpected dimension of embedding matrix %lld", emb->ne[0]); RWKV_ASSERT_NULL(emb->ne[1] == model->n_vocab, "Unexpected dimension of embedding matrix %lld", emb->ne[1]); @@ -374,17 +374,17 @@ struct 
rwkv_context * rwkv_init_from_file(const char * file_path, uint32_t n_thr int32_t n_layer = model->n_layer; // Build graph - struct ggml_tensor * state = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_layer * 5 * n_embed); + struct ggml_v2_tensor * state = ggml_v2_new_tensor_1d(ctx, GGML_V2_TYPE_F32, n_layer * 5 * n_embed); // x = self.w.emb.weight[token] - struct ggml_tensor * token_index = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1); - struct ggml_tensor * x = ggml_get_rows(ctx, model->emb, token_index); + struct ggml_v2_tensor * token_index = ggml_v2_new_tensor_1d(ctx, GGML_V2_TYPE_I32, 1); + struct ggml_v2_tensor * x = ggml_v2_get_rows(ctx, model->emb, token_index); // x = self.layer_norm(x, self.w.blocks[0].ln0) x = rwkv_layer_norm(ctx, x, model->ln0_weight, model->ln0_bias); // We collect parts of new state here. Each part is (n_embed) vector. - struct ggml_tensor ** state_parts = new ggml_tensor * [n_layer * 5]; + struct ggml_v2_tensor ** state_parts = new ggml_v2_tensor * [n_layer * 5]; for (int i = 0; i < n_layer; i++) { auto layer = model->layers[i]; @@ -392,99 +392,99 @@ struct rwkv_context * rwkv_init_from_file(const char * file_path, uint32_t n_thr // RWKV/time mixing { // self.layer_norm(x, self.w.blocks[i].ln1) - struct ggml_tensor * x0 = rwkv_layer_norm(ctx, x, layer.ln1_weight, layer.ln1_bias); + struct ggml_v2_tensor * x0 = rwkv_layer_norm(ctx, x, layer.ln1_weight, layer.ln1_bias); // state[5 * i + 1] - struct ggml_tensor * x_prev = ggml_view_1d(ctx, state, n_embed, (5 * i + 1) * n_embed * sizeof(float)); + struct ggml_v2_tensor * x_prev = ggml_v2_view_1d(ctx, state, n_embed, (5 * i + 1) * n_embed * sizeof(float)); // xk = x * time_mix_k + state[5 * i + 1] * (1 - time_mix_k) // xv = x * time_mix_v + state[5 * i + 1] * (1 - time_mix_v) // xr = x * time_mix_r + state[5 * i + 1] * (1 - time_mix_r) - struct ggml_tensor * xk = ggml_add( + struct ggml_v2_tensor * xk = ggml_v2_add( ctx, - ggml_mul(ctx, x0, layer.att_time_mix_k), - ggml_mul(ctx, x_prev, rwkv_1_minus_x(ctx, layer.att_time_mix_k)) + ggml_v2_mul(ctx, x0, layer.att_time_mix_k), + ggml_v2_mul(ctx, x_prev, rwkv_1_minus_x(ctx, layer.att_time_mix_k)) ); - struct ggml_tensor * xv = ggml_add( + struct ggml_v2_tensor * xv = ggml_v2_add( ctx, - ggml_mul(ctx, x0, layer.att_time_mix_v), - ggml_mul(ctx, x_prev, rwkv_1_minus_x(ctx, layer.att_time_mix_v)) + ggml_v2_mul(ctx, x0, layer.att_time_mix_v), + ggml_v2_mul(ctx, x_prev, rwkv_1_minus_x(ctx, layer.att_time_mix_v)) ); - struct ggml_tensor * xr = ggml_add( + struct ggml_v2_tensor * xr = ggml_v2_add( ctx, - ggml_mul(ctx, x0, layer.att_time_mix_r), - ggml_mul(ctx, x_prev, rwkv_1_minus_x(ctx, layer.att_time_mix_r)) + ggml_v2_mul(ctx, x0, layer.att_time_mix_r), + ggml_v2_mul(ctx, x_prev, rwkv_1_minus_x(ctx, layer.att_time_mix_r)) ); // state[5 * i + 1] = x state_parts[5 * i + 1] = x0; // r = torch.sigmoid(rw @ xr) - struct ggml_tensor * r = rwkv_sigmoid( + struct ggml_v2_tensor * r = rwkv_sigmoid( ctx, - ggml_mul_mat(ctx, layer.att_receptance, xr) + ggml_v2_mul_mat(ctx, layer.att_receptance, xr) ); // k = kw @ xk - struct ggml_tensor * k = ggml_mul_mat(ctx, layer.att_key, xk); + struct ggml_v2_tensor * k = ggml_v2_mul_mat(ctx, layer.att_key, xk); // v = vw @ xv - struct ggml_tensor * v = ggml_mul_mat(ctx, layer.att_value, xv); + struct ggml_v2_tensor * v = ggml_v2_mul_mat(ctx, layer.att_value, xv); // aa = state[5 * i + 2] // bb = state[5 * i + 3] // pp = state[5 * i + 4] - struct ggml_tensor * aa = ggml_view_1d(ctx, state, n_embed, (5 * i + 2) * n_embed * sizeof(float)); - struct 
ggml_tensor * bb = ggml_view_1d(ctx, state, n_embed, (5 * i + 3) * n_embed * sizeof(float)); - struct ggml_tensor * pp = ggml_view_1d(ctx, state, n_embed, (5 * i + 4) * n_embed * sizeof(float)); + struct ggml_v2_tensor * aa = ggml_v2_view_1d(ctx, state, n_embed, (5 * i + 2) * n_embed * sizeof(float)); + struct ggml_v2_tensor * bb = ggml_v2_view_1d(ctx, state, n_embed, (5 * i + 3) * n_embed * sizeof(float)); + struct ggml_v2_tensor * pp = ggml_v2_view_1d(ctx, state, n_embed, (5 * i + 4) * n_embed * sizeof(float)); // ww = time_first + k - struct ggml_tensor * ww = ggml_add(ctx, layer.att_time_first, k); + struct ggml_v2_tensor * ww = ggml_v2_add(ctx, layer.att_time_first, k); // qq = torch.maximum(pp, ww) - struct ggml_tensor * qq = rwkv_max(ctx, pp, ww); + struct ggml_v2_tensor * qq = rwkv_max(ctx, pp, ww); // e1 = torch.exp(pp - qq) - struct ggml_tensor * e1 = rwkv_exp(ctx, ggml_sub(ctx, pp, qq)); + struct ggml_v2_tensor * e1 = rwkv_exp(ctx, ggml_v2_sub(ctx, pp, qq)); // e2 = torch.exp(ww - qq) - struct ggml_tensor * e2 = rwkv_exp(ctx, ggml_sub(ctx, ww, qq)); + struct ggml_v2_tensor * e2 = rwkv_exp(ctx, ggml_v2_sub(ctx, ww, qq)); // a = e1 * aa + e2 * v - struct ggml_tensor * a = ggml_add( + struct ggml_v2_tensor * a = ggml_v2_add( ctx, - ggml_mul(ctx, e1, aa), - ggml_mul(ctx, e2, v) + ggml_v2_mul(ctx, e1, aa), + ggml_v2_mul(ctx, e2, v) ); // b = e1 * bb + e2 - struct ggml_tensor * b = ggml_add( + struct ggml_v2_tensor * b = ggml_v2_add( ctx, - ggml_mul(ctx, e1, bb), + ggml_v2_mul(ctx, e1, bb), e2 ); // wkv = a / b - struct ggml_tensor * wkv = ggml_div(ctx, a, b); + struct ggml_v2_tensor * wkv = ggml_v2_div(ctx, a, b); // ww = pp + time_decay - ww = ggml_add(ctx, pp, layer.att_time_decay); + ww = ggml_v2_add(ctx, pp, layer.att_time_decay); // qq = torch.maximum(ww, k) qq = rwkv_max(ctx, ww, k); // e1 = torch.exp(ww - qq) - e1 = rwkv_exp(ctx, ggml_sub(ctx, ww, qq)); + e1 = rwkv_exp(ctx, ggml_v2_sub(ctx, ww, qq)); // e2 = torch.exp(k - qq) - e2 = rwkv_exp(ctx, ggml_sub(ctx, k, qq)); + e2 = rwkv_exp(ctx, ggml_v2_sub(ctx, k, qq)); // state[5 * i + 2] = e1 * aa + e2 * v - state_parts[5 * i + 2] = ggml_add( + state_parts[5 * i + 2] = ggml_v2_add( ctx, - ggml_mul(ctx, e1, aa), - ggml_mul(ctx, e2, v) + ggml_v2_mul(ctx, e1, aa), + ggml_v2_mul(ctx, e2, v) ); // state[5 * i + 3] = e1 * bb + e2 - state_parts[5 * i + 3] = ggml_add( + state_parts[5 * i + 3] = ggml_v2_add( ctx, - ggml_mul(ctx, e1, bb), + ggml_v2_mul(ctx, e1, bb), e2 ); // state[5 * i + 4] = qq state_parts[5 * i + 4] = qq; // ow @ (r * wkv) - x = ggml_add( + x = ggml_v2_add( ctx, x, - ggml_mul_mat( + ggml_v2_mul_mat( ctx, layer.att_output, - ggml_mul(ctx, r, wkv) + ggml_v2_mul(ctx, r, wkv) ) ); } @@ -492,42 +492,42 @@ struct rwkv_context * rwkv_init_from_file(const char * file_path, uint32_t n_thr // FFN/channel mixing { // self.layer_norm(x, self.w.blocks[i].ln2) - struct ggml_tensor * x0 = rwkv_layer_norm(ctx, x, layer.ln2_weight, layer.ln2_bias); + struct ggml_v2_tensor * x0 = rwkv_layer_norm(ctx, x, layer.ln2_weight, layer.ln2_bias); // state[5 * i + 0] - struct ggml_tensor * x_prev = ggml_view_1d(ctx, state, n_embed, (5 * i + 0) * n_embed * sizeof(float)); + struct ggml_v2_tensor * x_prev = ggml_v2_view_1d(ctx, state, n_embed, (5 * i + 0) * n_embed * sizeof(float)); // xk = x * time_mix_k + state[5 * i + 0] * (1 - time_mix_k) // xr = x * time_mix_r + state[5 * i + 0] * (1 - time_mix_r) - struct ggml_tensor * xk = ggml_add( + struct ggml_v2_tensor * xk = ggml_v2_add( ctx, - ggml_mul(ctx, x0, layer.ffn_time_mix_k), - ggml_mul(ctx, 
x_prev, rwkv_1_minus_x(ctx, layer.ffn_time_mix_k)) + ggml_v2_mul(ctx, x0, layer.ffn_time_mix_k), + ggml_v2_mul(ctx, x_prev, rwkv_1_minus_x(ctx, layer.ffn_time_mix_k)) ); - struct ggml_tensor * xr = ggml_add( + struct ggml_v2_tensor * xr = ggml_v2_add( ctx, - ggml_mul(ctx, x0, layer.ffn_time_mix_r), - ggml_mul(ctx, x_prev, rwkv_1_minus_x(ctx, layer.ffn_time_mix_r)) + ggml_v2_mul(ctx, x0, layer.ffn_time_mix_r), + ggml_v2_mul(ctx, x_prev, rwkv_1_minus_x(ctx, layer.ffn_time_mix_r)) ); // state[5 * i + 0] = x state_parts[5 * i + 0] = x0; // r = torch.sigmoid(rw @ xr) - struct ggml_tensor * r = rwkv_sigmoid( + struct ggml_v2_tensor * r = rwkv_sigmoid( ctx, - ggml_mul_mat(ctx, layer.ffn_receptance, xr) + ggml_v2_mul_mat(ctx, layer.ffn_receptance, xr) ); // k = torch.square(torch.relu(kw @ xk)) - struct ggml_tensor * k = ggml_sqr(ctx, ggml_relu( + struct ggml_v2_tensor * k = ggml_v2_sqr(ctx, ggml_v2_relu( ctx, - ggml_mul_mat(ctx, layer.ffn_key, xk) + ggml_v2_mul_mat(ctx, layer.ffn_key, xk) )); // r * (vw @ k) - x = ggml_add( + x = ggml_v2_add( ctx, x, - ggml_mul( + ggml_v2_mul( ctx, r, - ggml_mul_mat(ctx, layer.ffn_value, k) + ggml_v2_mul_mat(ctx, layer.ffn_value, k) ) ); } @@ -537,14 +537,14 @@ struct rwkv_context * rwkv_init_from_file(const char * file_path, uint32_t n_thr x = rwkv_layer_norm(ctx, x, model->ln_out_weight, model->ln_out_bias); // x = (self.w.head.weight @ x).float() - struct ggml_tensor * logits = ggml_mul_mat(ctx, model->head, x); + struct ggml_v2_tensor * logits = ggml_v2_mul_mat(ctx, model->head, x); - struct ggml_cgraph * graph = (struct ggml_cgraph *) calloc(1, sizeof(struct ggml_cgraph)); + struct ggml_v2_cgraph * graph = (struct ggml_v2_cgraph *) calloc(1, sizeof(struct ggml_v2_cgraph)); - *graph = ggml_build_forward(logits); + *graph = ggml_v2_build_forward(logits); for (int i = 0; i < n_layer * 5; i++) { - ggml_build_forward_expand(graph, state_parts[i]); + ggml_v2_build_forward_expand(graph, state_parts[i]); } graph->n_threads = n_threads; @@ -578,15 +578,15 @@ bool rwkv_eval(struct rwkv_context * ctx, int32_t token, float * state_in, float RWKV_ASSERT_FALSE(token >= 0 && token < n_vocab, "Token is out of range 0..%d", n_vocab - 1); - ggml_set_i32_1d(ctx->token_index, 0, token); + ggml_v2_set_i32_1d(ctx->token_index, 0, token); if (state_in == NULL) { - ggml_set_f32(ctx->state, 0.0F); + ggml_v2_set_f32(ctx->state, 0.0F); for (int i = 0; i < n_layer; i++) { // state[5 * i + 4] = -1e30 - ggml_set_f32( - ggml_view_1d(ctx->ctx, ctx->state, n_embed, (5 * i + 4) * n_embed * sizeof(float)), + ggml_v2_set_f32( + ggml_v2_view_1d(ctx->ctx, ctx->state, n_embed, (5 * i + 4) * n_embed * sizeof(float)), -1e30F ); } @@ -594,10 +594,10 @@ bool rwkv_eval(struct rwkv_context * ctx, int32_t token, float * state_in, float memcpy(ctx->state->data, state_in, ctx->state->ne[0] * sizeof(float)); } - ggml_graph_compute(ctx->ctx, ctx->graph); + ggml_v2_graph_compute(ctx->ctx, ctx->graph); for (size_t i = 0; i < size_t(n_layer * 5); i++) { - struct ggml_tensor * part = ctx->state_parts[i]; + struct ggml_v2_tensor * part = ctx->state_parts[i]; memcpy(state_out + i * n_embed, part->data, part->ne[0] * sizeof(float)); } @@ -611,7 +611,7 @@ void rwkv_free(struct rwkv_context * ctx) { ctx->model->layers.~vector(); free(ctx->model); delete[] ctx->state_parts; - ggml_free(ctx->ctx); + ggml_v2_free(ctx->ctx); free(ctx->graph); free(ctx); } @@ -621,15 +621,15 @@ bool rwkv_quantize_model_file(const char * model_file_path_in, const char * mode RWKV_ASSERT_FALSE(format_type != -1, "Unsupported format 
\"%s\"", format_name); - ggml_type type = FORMAT_TYPE_TO_GGML_TYPE[format_type]; + ggml_v2_type type = FORMAT_TYPE_TO_GGML_V2_TYPE[format_type]; - RWKV_ASSERT_FALSE(type != GGML_TYPE_UNKNOWN, "Unsupported format \"%s\"", format_name); + RWKV_ASSERT_FALSE(type != GGML_V2_TYPE_UNKNOWN, "Unsupported format \"%s\"", format_name); // Needed to initialize FP16 lookup table { - struct ggml_init_params params = { 0, NULL, false }; - struct ggml_context * ctx = ggml_init(params); - ggml_free(ctx); + struct ggml_v2_init_params params = { 0, NULL, false }; + struct ggml_v2_context * ctx = ggml_v2_init(params); + ggml_v2_free(ctx); } printf("Loading model from '%s'\n", model_file_path_in); @@ -680,7 +680,7 @@ bool rwkv_quantize_model_file(const char * model_file_path_in, const char * mode std::vector work; std::vector data_u8; - std::vector data_f16; + std::vector data_f16; std::vector data_f32; std::vector hist_all(1 << 4, 0); @@ -700,9 +700,9 @@ bool rwkv_quantize_model_file(const char * model_file_path_in, const char * mode RWKV_ASSERT_FALSE(parameter_data_type >= 0 && parameter_data_type < FORMAT_TYPE_COUNT, "Invalid parameter data type %d", parameter_data_type); - ggml_type parameter_ggml_type = FORMAT_TYPE_TO_GGML_TYPE[parameter_data_type]; + ggml_v2_type parameter_ggml_v2_type = FORMAT_TYPE_TO_GGML_V2_TYPE[parameter_data_type]; - RWKV_ASSERT_FALSE(parameter_ggml_type != GGML_TYPE_UNKNOWN, "Invalid parameter data type %d", parameter_data_type); + RWKV_ASSERT_FALSE(parameter_ggml_v2_type != GGML_V2_TYPE_UNKNOWN, "Invalid parameter data type %d", parameter_data_type); int32_t nelements = 1; int32_t ne[2] = { 1, 1 }; @@ -715,9 +715,9 @@ bool rwkv_quantize_model_file(const char * model_file_path_in, const char * mode finp.read(&name[0], key_length); { - printf("%48s - [%5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ggml_type_name(parameter_ggml_type)); + printf("%48s - [%5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ggml_v2_type_name(parameter_ggml_v2_type)); - total_size_orig += (size_t) (nelements * ggml_type_sizef(parameter_ggml_type)); + total_size_orig += (size_t) (nelements * ggml_v2_type_sizef(parameter_ggml_v2_type)); } // Quantize only 2D tensors, except embedding and head matrices. 
@@ -736,10 +736,10 @@ bool rwkv_quantize_model_file(const char * model_file_path_in, const char * mode
 
         if (parameter_data_type == 1) {
             data_f16.resize(nelements);
-            finp.read(reinterpret_cast<char *>(data_f16.data()), nelements * sizeof(ggml_fp16_t));
+            finp.read(reinterpret_cast<char *>(data_f16.data()), nelements * sizeof(ggml_v2_fp16_t));
             data_f32.resize(nelements);
             for (int i = 0; i < nelements; ++i) {
-                data_f32[i] = ggml_fp16_to_fp32(data_f16[i]);
+                data_f32[i] = ggml_v2_fp16_to_fp32(data_f16[i]);
             }
         } else {
             data_f32.resize(nelements);
@@ -772,23 +772,23 @@ bool rwkv_quantize_model_file(const char * model_file_path_in, const char * mode
             std::vector<int64_t> hist_cur(1 << 4, 0);
 
             switch (type) {
-                case GGML_TYPE_Q4_0:
-                    cur_size = ggml_quantize_q4_0_v2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
+                case GGML_V2_TYPE_Q4_0:
+                    cur_size = ggml_v2_quantize_q4_0_v2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
                     break;
-                case GGML_TYPE_Q4_1:
-                    cur_size = ggml_quantize_q4_1_v2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
+                case GGML_V2_TYPE_Q4_1:
+                    cur_size = ggml_v2_quantize_q4_1_v2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
                     break;
-                case GGML_TYPE_Q4_2:
-                    cur_size = ggml_quantize_q4_2_v2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
+                case GGML_V2_TYPE_Q4_2:
+                    cur_size = ggml_v2_quantize_q4_2_v2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
                     break;
-                case GGML_TYPE_Q5_0:
-                    cur_size = ggml_quantize_q5_0_v2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
+                case GGML_V2_TYPE_Q5_0:
+                    cur_size = ggml_v2_quantize_q5_0_v2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
                     break;
-                case GGML_TYPE_Q5_1:
-                    cur_size = ggml_quantize_q5_1_v2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
+                case GGML_V2_TYPE_Q5_1:
+                    cur_size = ggml_v2_quantize_q5_1_v2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
                     break;
-                case GGML_TYPE_Q8_0:
-                    cur_size = ggml_quantize_q8_0_v2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
+                case GGML_V2_TYPE_Q8_0:
+                    cur_size = ggml_v2_quantize_q8_0_v2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
                     break;
                 default: {
                     fprintf(stderr, "unsupported quantization type %d\n", type);
@@ -848,18 +848,18 @@ const char * rwkv_get_system_info_string(void) {
     static std::string s;
 
     s = "";
-    s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | ";
-    s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | ";
-    s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
-    s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | ";
-    s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
-    s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | ";
-    s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | ";
-    s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
-    s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
-    s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | ";
-    s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
-    s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
+    s += "AVX = " + std::to_string(ggml_v2_cpu_has_avx()) + " | ";
+    s += "AVX2 = " + std::to_string(ggml_v2_cpu_has_avx2()) + " | ";
+    s += "AVX512 = " + std::to_string(ggml_v2_cpu_has_avx512()) + " | ";
+    s += "FMA = " + std::to_string(ggml_v2_cpu_has_fma()) + " | ";
+    s += "NEON = " + std::to_string(ggml_v2_cpu_has_neon()) + " | ";
+    s += "ARM_FMA = " + std::to_string(ggml_v2_cpu_has_arm_fma()) + " | ";
+    s += "F16C = " + std::to_string(ggml_v2_cpu_has_f16c()) + " | ";
+    s += "FP16_VA = " + std::to_string(ggml_v2_cpu_has_fp16_va()) + " | ";
+    s += "WASM_SIMD = " + std::to_string(ggml_v2_cpu_has_wasm_simd()) + " | ";
+    s += "BLAS = " + std::to_string(ggml_v2_cpu_has_blas()) + " | ";
+    s += "SSE3 = " + std::to_string(ggml_v2_cpu_has_sse3()) + " | ";
+    s += "VSX = " + std::to_string(ggml_v2_cpu_has_vsx()) + " | ";
 
     return s.c_str();
 }
\ No newline at end of file