From 5790c8dac1a4f0aa80b4efee3b962d8c04c829e8 Mon Sep 17 00:00:00 2001 From: Pierrick Hymbert Date: Fri, 26 Apr 2024 09:26:16 +0200 Subject: [PATCH 01/12] bench: server add stop word for PHI-2 (#6916) --- examples/server/bench/script.js | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/server/bench/script.js b/examples/server/bench/script.js index c4c486cdf..bdf4f5abc 100644 --- a/examples/server/bench/script.js +++ b/examples/server/bench/script.js @@ -90,7 +90,8 @@ export default function () { "model": model, "stream": true, "seed": 42, - "max_tokens": max_tokens + "max_tokens": max_tokens, + "stop": ["<|im_end|>"] // This is temporary for phi-2 base (i.e. not instructed) since the server expects that the model always to emit BOS } const params = {method: 'POST', body: JSON.stringify(payload)}; From 7d641c26ac73956a54b204cabefe85c764823678 Mon Sep 17 00:00:00 2001 From: Pierrick Hymbert Date: Fri, 26 Apr 2024 09:26:59 +0200 Subject: [PATCH 02/12] ci: fix concurrency for pull_request_target (#6917) --- .github/workflows/bench.yml | 2 +- .github/workflows/server.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index d50af0b70..6dc91d27b 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -32,7 +32,7 @@ on: - cron: '04 2 * * *' concurrency: - group: ${{ github.workflow }}-${{ github.ref || github.run_id }}-${{ github.event.inputs.sha }} + group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}-${{ github.event.inputs.sha }} cancel-in-progress: true jobs: diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml index 3e68a3c8c..499611b5a 100644 --- a/.github/workflows/server.yml +++ b/.github/workflows/server.yml @@ -23,7 +23,7 @@ on: - cron: '2 4 * * *' concurrency: - group: ${{ github.workflow }}-${{ github.ref || github.run_id }} + group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }} cancel-in-progress: true jobs: From d4a9afc1009f0da88d04b2c5f672d81d5ae94675 Mon Sep 17 00:00:00 2001 From: Pierrick Hymbert Date: Fri, 26 Apr 2024 09:27:49 +0200 Subject: [PATCH 03/12] ci: server: fix python installation (#6918) --- .github/workflows/server.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml index 499611b5a..955fd9a5a 100644 --- a/.github/workflows/server.yml +++ b/.github/workflows/server.yml @@ -100,6 +100,12 @@ jobs: -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ; cmake --build . 
--config ${{ matrix.build_type }} -j $(nproc) --target server + - name: Python setup + id: setup_python + uses: actions/setup-python@v5 + with: + python-version: '3.11' + - name: Tests dependencies id: test_dependencies run: | From 83b72cb086ce46a33dececc86bfe4648b6120aa8 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 26 Apr 2024 10:41:53 +0300 Subject: [PATCH 04/12] Merge pull request from GHSA-p5mv-gjc5-mwqv * always use calloc clamp n_kv on failure to read a kv * ggml : alternative ctx->header.n_kv update --------- Co-authored-by: slaren --- ggml.c | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/ggml.c b/ggml.c index f574351f4..9f99a69d1 100644 --- a/ggml.c +++ b/ggml.c @@ -20614,7 +20614,7 @@ static void gguf_free_kv(struct gguf_kv * kv) { } struct gguf_context * gguf_init_empty(void) { - struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context)); + struct gguf_context * ctx = GGML_CALLOC(1, sizeof(struct gguf_context)); memcpy(ctx->header.magic, GGUF_MAGIC, sizeof(ctx->header.magic)); ctx->header.version = GGUF_VERSION; @@ -20659,7 +20659,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p bool ok = true; - struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context)); + struct gguf_context * ctx = GGML_CALLOC(1, sizeof(struct gguf_context)); // read the header { @@ -20696,9 +20696,13 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p // read the kv pairs { - ctx->kv = GGML_MALLOC(ctx->header.n_kv * sizeof(struct gguf_kv)); + const uint64_t n_kv = ctx->header.n_kv; - for (uint64_t i = 0; i < ctx->header.n_kv; ++i) { + // header.n_kv will hold the actual value of pairs that were successfully read in the loop below + ctx->header.n_kv = 0; + ctx->kv = GGML_CALLOC(n_kv, sizeof(struct gguf_kv)); + + for (uint64_t i = 0; i < n_kv; ++i) { struct gguf_kv * kv = &ctx->kv[i]; //fprintf(stderr, "%s: reading kv %d\n", __func__, i); @@ -20747,7 +20751,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p return NULL; } - kv->value.arr.data = GGML_MALLOC(kv->value.arr.n * gguf_type_size(kv->value.arr.type)); + kv->value.arr.data = GGML_CALLOC(kv->value.arr.n, gguf_type_size(kv->value.arr.type)); ok = ok && gguf_fread_el(file, kv->value.arr.data, kv->value.arr.n * gguf_type_size(kv->value.arr.type), &offset); } break; @@ -20761,7 +20765,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p return NULL; } - kv->value.arr.data = GGML_MALLOC(kv->value.arr.n * sizeof(struct gguf_str)); + kv->value.arr.data = GGML_CALLOC(kv->value.arr.n, sizeof(struct gguf_str)); for (uint64_t j = 0; j < kv->value.arr.n; ++j) { ok = ok && gguf_fread_str(file, &((struct gguf_str *) kv->value.arr.data)[j], &offset); @@ -20777,6 +20781,8 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p if (!ok) { break; } + + ctx->header.n_kv++; } if (!ok) { @@ -20789,7 +20795,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p // read the tensor infos { - ctx->infos = GGML_MALLOC(ctx->header.n_tensors * sizeof(struct gguf_tensor_info)); + ctx->infos = GGML_CALLOC(ctx->header.n_tensors, sizeof(struct gguf_tensor_info)); for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) { struct gguf_tensor_info * info = &ctx->infos[i]; @@ -20810,6 +20816,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p ok = ok && gguf_fread_el 
(file, &info->type, sizeof(info->type), &offset); ok = ok && gguf_fread_el (file, &info->offset, sizeof(info->offset), &offset); + // TODO: return an error instead of crashing with GGML_ASSERT gguf_tensor_info_sanitize(info); if (!ok) { @@ -21291,7 +21298,7 @@ void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_ty ctx->kv[idx].type = GGUF_TYPE_ARRAY; ctx->kv[idx].value.arr.type = type; ctx->kv[idx].value.arr.n = n; - ctx->kv[idx].value.arr.data = GGML_MALLOC(n*gguf_type_size(type)); + ctx->kv[idx].value.arr.data = GGML_CALLOC(n, gguf_type_size(type)); memcpy(ctx->kv[idx].value.arr.data, data, n*gguf_type_size(type)); } @@ -21301,7 +21308,7 @@ void gguf_set_arr_str(struct gguf_context * ctx, const char * key, const char ** ctx->kv[idx].type = GGUF_TYPE_ARRAY; ctx->kv[idx].value.arr.type = GGUF_TYPE_STRING; ctx->kv[idx].value.arr.n = n; - ctx->kv[idx].value.arr.data = GGML_MALLOC(n*sizeof(struct gguf_str)); + ctx->kv[idx].value.arr.data = GGML_CALLOC(n, sizeof(struct gguf_str)); for (int i = 0; i < n; i++) { struct gguf_str * str = &((struct gguf_str *)ctx->kv[idx].value.arr.data)[i]; str->n = strlen(data[i]); @@ -21328,7 +21335,7 @@ void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) { case GGUF_TYPE_ARRAY: { if (src->kv[i].value.arr.type == GGUF_TYPE_STRING) { - const char ** data = GGML_MALLOC(src->kv[i].value.arr.n*sizeof(char *)); + const char ** data = GGML_CALLOC(src->kv[i].value.arr.n, sizeof(char *)); for (uint32_t j = 0; j < src->kv[i].value.arr.n; j++) { data[j] = ((struct gguf_str *)src->kv[i].value.arr.data)[j].data; } @@ -21416,7 +21423,7 @@ struct gguf_buf { static struct gguf_buf gguf_buf_init(size_t size) { struct gguf_buf buf = { - /*buf.data =*/ size == 0 ? NULL : GGML_MALLOC(size), + /*buf.data =*/ size == 0 ? NULL : GGML_CALLOC(1, size), /*buf.size =*/ size, /*buf.offset =*/ 0, }; From 9e4e077ec50fde6049b128662c72d37a3c28e34b Mon Sep 17 00:00:00 2001 From: Pierrick Hymbert Date: Fri, 26 Apr 2024 11:11:51 +0200 Subject: [PATCH 05/12] ci: server: fix python installation (#6922) --- .github/workflows/server.yml | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml index 955fd9a5a..67bbe6cb7 100644 --- a/.github/workflows/server.yml +++ b/.github/workflows/server.yml @@ -58,6 +58,7 @@ jobs: git \ cmake \ python3-pip \ + python3-venv \ curl \ wget \ language-pack-en \ @@ -100,16 +101,13 @@ jobs: -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ; cmake --build . 
--config ${{ matrix.build_type }} -j $(nproc) --target server - - name: Python setup - id: setup_python - uses: actions/setup-python@v5 - with: - python-version: '3.11' - - - name: Tests dependencies - id: test_dependencies + - name: Setup python env + id: pipenv run: | - pip install -r examples/server/tests/requirements.txt + cd examples/server/tests + python3 -m venv venv + source venv/bin/activate + pip install -r requirements.txt - name: Tests id: server_integration_tests From 7f5ff558eed0f732af8f25c2ab0645610bdec80c Mon Sep 17 00:00:00 2001 From: Pierrick Hymbert Date: Fri, 26 Apr 2024 12:15:30 +0200 Subject: [PATCH 06/12] server: stop generation at `n_ctx_train` if `n_predict` is not set (#6638) * server: cap n_predict if not set to n_ctx_train * server: fix infinite loop * server: infinite loop, move in process_token server: infinite loop: set stop limit to true * minor: spaces * minor: spaces * server: include prompt tokens in the EOS limit --- examples/server/server.cpp | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 3acbd17df..48ef8ff2a 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1207,6 +1207,27 @@ struct server_context { LOG_VERBOSE("eos token found", {}); } + auto n_ctx_train = llama_n_ctx_train(model); + if (slot.params.n_predict < 1 && slot.ga_n == 1 + && slot.n_prompt_tokens + slot.n_decoded >= n_ctx_train) { + LOG_WARNING("n_predict is not set and self-context extend is disabled." + " Limiting generated tokens to n_ctx_train to avoid EOS-less generation infinite loop", { + { "id_slot", slot.id }, + { "params.n_predict", slot.params.n_predict }, + { "slot.n_prompt_tokens", slot.n_prompt_tokens }, + { "slot.n_decoded", slot.n_decoded }, + { "slot.n_predict", slot.n_predict }, + { "n_slots", params.n_parallel }, + { "slot.n_ctx", slot.n_ctx }, + { "n_ctx", n_ctx }, + { "n_ctx_train", n_ctx_train }, + { "ga_n", slot.ga_n }, + }); + slot.truncated = true; + slot.stopped_limit = true; + slot.has_next_token = false; // stop prediction + } + LOG_VERBOSE("next token", { {"id_slot", slot.id}, {"id_task", slot.id_task}, @@ -2141,7 +2162,7 @@ struct server_context { }); // process the created batch of tokens - for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) { + for (int32_t i = 0; i < batch.n_tokens; i += n_batch) { const int32_t n_tokens = std::min(n_batch, batch.n_tokens - i); for (auto & slot : slots) { From bbe3c6e76157a5d806fdc155451f0ca8936248ee Mon Sep 17 00:00:00 2001 From: Pierrick Hymbert Date: Fri, 26 Apr 2024 12:27:25 +0200 Subject: [PATCH 07/12] ci: server: fix python installation (#6925) --- .github/workflows/server.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml index 67bbe6cb7..a044d6044 100644 --- a/.github/workflows/server.yml +++ b/.github/workflows/server.yml @@ -106,7 +106,7 @@ jobs: run: | cd examples/server/tests python3 -m venv venv - source venv/bin/activate + . 
venv/bin/activate pip install -r requirements.txt - name: Tests From 4b1c3c98b442a4c84a788fff6323f6fa7e678a5b Mon Sep 17 00:00:00 2001 From: Justine Tunney Date: Fri, 26 Apr 2024 10:05:33 -0400 Subject: [PATCH 08/12] llamafile : use 64-bit integers in sgemm (#6928) --- sgemm.cpp | 170 ++++++++++++++++++++++++++---------------------------- sgemm.h | 6 +- 2 files changed, 87 insertions(+), 89 deletions(-) diff --git a/sgemm.cpp b/sgemm.cpp index 531e12af3..4e0159804 100644 --- a/sgemm.cpp +++ b/sgemm.cpp @@ -50,7 +50,6 @@ #pragma GCC diagnostic ignored "-Wignored-attributes" #include "sgemm.h" -#include #include "ggml-impl.h" #include "ggml-quants.h" @@ -243,23 +242,23 @@ template <> inline __m512 load(const ggml_fp16_t *p) { template class tinyBLAS { public: - tinyBLAS(int k, - const TA *A, int lda, - const TB *B, int ldb, - TC *C, int ldc, + tinyBLAS(int64_t k, + const TA *A, int64_t lda, + const TB *B, int64_t ldb, + TC *C, int64_t ldc, int ith, int nth) : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) { } - void matmul(int m, int n, int task) { + void matmul(int64_t m, int64_t n, int task) { if (task == GGML_TASK_TYPE_COMPUTE) mnpack(0, m, 0, n); } private: - NOINLINE void mnpack(int m0, int m, int n0, int n) { - int mc, nc, mp, np; - switch ((std::min(m - m0, 5) << 4) | std::min(n - n0, 5)) { + NOINLINE void mnpack(int64_t m0, int64_t m, int64_t n0, int64_t n) { + int64_t mc, nc, mp, np; + switch ((MIN(m - m0, 5) << 4) | MIN(n - n0, 5)) { #if VECTOR_REGISTERS == 32 case 0x55: mc = 5; @@ -409,27 +408,27 @@ class tinyBLAS { } template - NOINLINE void gemm(int m0, int m, int n0, int n) { - int ytiles = (m - m0) / RM; - int xtiles = (n - n0) / RN; - int tiles = xtiles * ytiles; - int duty = (tiles + nth - 1) / nth; - int start = duty * ith; - int end = start + duty; + NOINLINE void gemm(int64_t m0, int64_t m, int64_t n0, int64_t n) { + int64_t ytiles = (m - m0) / RM; + int64_t xtiles = (n - n0) / RN; + int64_t tiles = xtiles * ytiles; + int64_t duty = (tiles + nth - 1) / nth; + int64_t start = duty * ith; + int64_t end = start + duty; if (end > tiles) end = tiles; - for (int job = start; job < end; ++job) { - int ii = m0 + job / xtiles * RM; - int jj = n0 + job % xtiles * RN; + for (int64_t job = start; job < end; ++job) { + int64_t ii = m0 + job / xtiles * RM; + int64_t jj = n0 + job % xtiles * RN; D Cv[RN][RM] = {}; - for (int l = 0; l < k; l += KN) - for (int j = 0; j < RN; ++j) - for (int i = 0; i < RM; ++i) + for (int64_t l = 0; l < k; l += KN) + for (int64_t j = 0; j < RN; ++j) + for (int64_t i = 0; i < RM; ++i) Cv[j][i] = madd(load(A + lda * (ii + i) + l), load(B + ldb * (jj + j) + l), Cv[j][i]); - for (int j = 0; j < RN; ++j) - for (int i = 0; i < RM; ++i) + for (int64_t j = 0; j < RN; ++j) + for (int64_t i = 0; i < RM; ++i) C[ldc * (jj + j) + (ii + i)] = hsum(Cv[j][i]); } } @@ -437,10 +436,10 @@ class tinyBLAS { const TA *const A; const TB *const B; TC *const C; - const int k; - const int lda; - const int ldb; - const int ldc; + const int64_t k; + const int64_t lda; + const int64_t ldb; + const int64_t ldc; const int ith; const int nth; }; @@ -452,23 +451,23 @@ class tinyBLAS { template class tinyBLAS_Q0_ARM { public: - tinyBLAS_Q0_ARM(int k, - const TA *A, int lda, - const block_q8_0 *B, int ldb, - float *C, int ldc, + tinyBLAS_Q0_ARM(int64_t k, + const TA *A, int64_t lda, + const block_q8_0 *B, int64_t ldb, + float *C, int64_t ldc, int ith, int nth) : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) { } - void matmul(int m, int n, 
int task) { + void matmul(int64_t m, int64_t n, int task) { if (task == GGML_TASK_TYPE_COMPUTE) mnpack(0, m, 0, n); } private: - NOINLINE void mnpack(int m0, int m, int n0, int n) { - int mc, nc, mp, np; - switch ((std::min(m - m0, 3) << 4) | std::min(n - n0, 3)) { + NOINLINE void mnpack(int64_t m0, int64_t m, int64_t n0, int64_t n) { + int64_t mc, nc, mp, np; + switch ((MIN(m - m0, 3) << 4) | MIN(n - n0, 3ll)) { case 0x33: mc = 3; nc = 3; @@ -524,22 +523,22 @@ class tinyBLAS_Q0_ARM { } template - NOINLINE void gemm(int m0, int m, int n0, int n) { - int ytiles = (m - m0) / RM; - int xtiles = (n - n0) / RN; - int tiles = xtiles * ytiles; - int duty = (tiles + nth - 1) / nth; - int start = duty * ith; - int end = start + duty; + NOINLINE void gemm(int64_t m0, int64_t m, int64_t n0, int64_t n) { + int64_t ytiles = (m - m0) / RM; + int64_t xtiles = (n - n0) / RN; + int64_t tiles = xtiles * ytiles; + int64_t duty = (tiles + nth - 1) / nth; + int64_t start = duty * ith; + int64_t end = start + duty; if (end > tiles) end = tiles; - for (int job = start; job < end; ++job) { - int ii = m0 + job / xtiles * RM; - int jj = n0 + job % xtiles * RN; + for (int64_t job = start; job < end; ++job) { + int64_t ii = m0 + job / xtiles * RM; + int64_t jj = n0 + job % xtiles * RN; float32x4_t Cv[RN][RM] = {}; - for (int l = 0; l < k; ++l) - for (int j = 0; j < RN; ++j) - for (int i = 0; i < RM; ++i) + for (int64_t l = 0; l < k; ++l) + for (int64_t j = 0; j < RN; ++j) + for (int64_t i = 0; i < RM; ++i) Cv[j][i] = vmlaq_n_f32(Cv[j][i], vcvtq_f32_s32(vdotq_s32( vdotq_s32(vdupq_n_s32(0), @@ -549,8 +548,8 @@ class tinyBLAS_Q0_ARM { load_hi(B + ldb * (jj + j) + l))), unhalf(A[lda * (ii + i) + l].d) * unhalf(B[ldb * (jj + j) + l].d)); - for (int j = 0; j < RN; ++j) - for (int i = 0; i < RM; ++i) + for (int64_t j = 0; j < RN; ++j) + for (int64_t i = 0; i < RM; ++i) C[ldc * (jj + j) + (ii + i)] = hsum(Cv[j][i]); } } @@ -577,10 +576,10 @@ class tinyBLAS_Q0_ARM { const TA *const A; const block_q8_0 *const B; float *const C; - const int k; - const int lda; - const int ldb; - const int ldc; + const int64_t k; + const int64_t lda; + const int64_t ldb; + const int64_t ldc; const int ith; const int nth; }; @@ -590,23 +589,23 @@ class tinyBLAS_Q0_ARM { template class tinyBLAS_Q0_AVX2 { public: - tinyBLAS_Q0_AVX2(int k, - const TA *A, int lda, - const TB *B, int ldb, - TC *C, int ldc, + tinyBLAS_Q0_AVX2(int64_t k, + const TA *A, int64_t lda, + const TB *B, int64_t ldb, + TC *C, int64_t ldc, int ith, int nth) : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) { } - void matmul(int m, int n, int task) { + void matmul(int64_t m, int64_t n, int task) { if (task == GGML_TASK_TYPE_COMPUTE) mnpack(0, m, 0, n); } private: - void mnpack(int m0, int m, int n0, int n) { - int mc, nc, mp, np; - switch ((std::min(m - m0, 4) << 4) | std::min(n - n0, 4)) { + void mnpack(int64_t m0, int64_t m, int64_t n0, int64_t n) { + int64_t mc, nc, mp, np; + switch ((MIN(m - m0, 4) << 4) | MIN(n - n0, 4)) { #if VECTOR_REGISTERS == 32 case 0x44: mc = 4; @@ -714,22 +713,22 @@ class tinyBLAS_Q0_AVX2 { } template - NOINLINE void gemm(int m0, int m, int n0, int n) { - int ytiles = (m - m0) / RM; - int xtiles = (n - n0) / RN; - int tiles = xtiles * ytiles; - int duty = (tiles + nth - 1) / nth; - int start = duty * ith; - int end = start + duty; + NOINLINE void gemm(int64_t m0, int64_t m, int64_t n0, int64_t n) { + int64_t ytiles = (m - m0) / RM; + int64_t xtiles = (n - n0) / RN; + int64_t tiles = xtiles * ytiles; + int64_t duty = (tiles + 
nth - 1) / nth; + int64_t start = duty * ith; + int64_t end = start + duty; if (end > tiles) end = tiles; - for (int job = start; job < end; ++job) { - int ii = m0 + job / xtiles * RM; - int jj = n0 + job % xtiles * RN; + for (int64_t job = start; job < end; ++job) { + int64_t ii = m0 + job / xtiles * RM; + int64_t jj = n0 + job % xtiles * RN; __m256 Cv[RN][RM] = {}; - for (int l = 0; l < k; ++l) - for (int j = 0; j < RN; ++j) - for (int i = 0; i < RM; ++i) + for (int64_t l = 0; l < k; ++l) + for (int64_t j = 0; j < RN; ++j) + for (int64_t i = 0; i < RM; ++i) Cv[j][i] = madd(_mm256_set1_ps(unhalf(A[lda * (ii + i) + l].d) * unhalf(B[ldb * (jj + j) + l].d)), updot(_mm256_sign_epi8(load(A + lda * (ii + i) + l), @@ -737,8 +736,8 @@ class tinyBLAS_Q0_AVX2 { _mm256_sign_epi8(load(B + ldb * (jj + j) + l), load(A + lda * (ii + i) + l))), Cv[j][i]); - for (int j = 0; j < RN; ++j) - for (int i = 0; i < RM; ++i) + for (int64_t j = 0; j < RN; ++j) + for (int64_t i = 0; i < RM; ++i) C[ldc * (jj + j) + (ii + i)] = hsum(Cv[j][i]); } } @@ -771,10 +770,10 @@ class tinyBLAS_Q0_AVX2 { const TA *const A; const TB *const B; TC *const C; - const int k; - const int lda; - const int ldb; - const int ldc; + const int64_t k; + const int64_t lda; + const int64_t ldb; + const int64_t ldc; const int ith; const int nth; }; @@ -813,8 +812,8 @@ class tinyBLAS_Q0_AVX2 { * @param Ctype is GGML data type of `C` * @return true if this function was able to service the matmul request */ -bool llamafile_sgemm(int m, int n, int k, const void *A, int lda, const void *B, int ldb, void *C, - int ldc, int ith, int nth, int task, int Atype, int Btype, int Ctype) { +bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda, const void *B, int64_t ldb, void *C, + int64_t ldc, int ith, int nth, int task, int Atype, int Btype, int Ctype) { assert(m >= 0); assert(n >= 0); @@ -824,9 +823,6 @@ bool llamafile_sgemm(int m, int n, int k, const void *A, int lda, const void *B, assert(ldc >= m); assert(nth > 0); assert(ith < nth); - assert(1ll * lda * m <= 0x7fffffff); - assert(1ll * ldb * n <= 0x7fffffff); - assert(1ll * ldc * n <= 0x7fffffff); if (Ctype != GGML_TYPE_F32) return false; diff --git a/sgemm.h b/sgemm.h index da23b209c..f29747d0a 100644 --- a/sgemm.h +++ b/sgemm.h @@ -1,11 +1,13 @@ #pragma once +#include #include #ifdef __cplusplus extern "C" { #endif -bool llamafile_sgemm(int, int, int, const void *, int, const void *, int, - void *, int, int, int, int, int, int, int); +bool llamafile_sgemm(int64_t, int64_t, int64_t, const void *, int64_t, + const void *, int64_t, void *, int64_t, int, int, + int, int, int, int); #ifdef __cplusplus } From e2764cd7ca1112d9303eba9e81c9935ee67352ff Mon Sep 17 00:00:00 2001 From: slaren Date: Fri, 26 Apr 2024 17:07:42 +0200 Subject: [PATCH 09/12] gguf : fix mismatch between alloc and free functions (#6929) --- ggml.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml.c b/ggml.c index 9f99a69d1..34eef23fc 100644 --- a/ggml.c +++ b/ggml.c @@ -20987,7 +20987,7 @@ void gguf_free(struct gguf_context * ctx) { GGML_FREE(ctx->infos); } - GGML_ALIGNED_FREE(ctx); + GGML_FREE(ctx); } const char * gguf_type_name(enum gguf_type type) { From 017e6999b5184234370b22a2f868e1be911e8d88 Mon Sep 17 00:00:00 2001 From: slaren Date: Fri, 26 Apr 2024 18:39:58 +0200 Subject: [PATCH 10/12] add basic tensor data validation function (#6884) * add basic tensor data validation function * add --check-tensors command line argument tensor validation is disabled by default and can be enabled 
by adding `--check-tensors` to the command line arguments. quantize always validates tensors. --- common/common.cpp | 6 + common/common.h | 1 + ggml-quants.c | 284 ++++++++++++++++++++++++++++++++++++++++++++++ ggml.h | 2 + llama.cpp | 90 ++++++++++++--- llama.h | 7 +- 6 files changed, 371 insertions(+), 19 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 97f55b053..9ab8aef7e 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1089,6 +1089,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa params.n_print = std::stoi(argv[i]); return true; } + if (arg == "--check-tensors") { + params.check_tensors = true; + return true; + } if (arg == "--ppl-output-type") { if (++i >= argc) { invalid_param = true; @@ -1554,6 +1558,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n"); printf(" -ptc N, --print-token-count N\n"); printf(" print token count every N tokens (default: %d)\n", params.n_print); + printf(" --check-tensors check model tensor data for invalid values\n"); printf("\n"); #ifndef LOG_DISABLE_LOGS log_print_usage(); @@ -1774,6 +1779,7 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & mparams.tensor_split = params.tensor_split; mparams.use_mmap = params.use_mmap; mparams.use_mlock = params.use_mlock; + mparams.check_tensors = params.check_tensors; if (params.kv_overrides.empty()) { mparams.kv_overrides = NULL; } else { diff --git a/common/common.h b/common/common.h index 87361e8e9..0005f143e 100644 --- a/common/common.h +++ b/common/common.h @@ -161,6 +161,7 @@ struct gpt_params { bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes bool no_kv_offload = false; // disable KV offloading bool warmup = true; // warmup run + bool check_tensors = false; // validate tensor data std::string cache_type_k = "f16"; // KV cache data type for the K std::string cache_type_v = "f16"; // KV cache data type for the V diff --git a/ggml-quants.c b/ggml-quants.c index c147531df..444d1e55e 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -12383,3 +12383,287 @@ void quantize_row_iq2_s(const float * restrict x, void * restrict vy, int64_t k) block_iq2_s * restrict y = vy; quantize_row_iq2_s_reference(x, y, k); } + +static bool validate_float(float f, size_t i) { + if (isinf(f)) { + fprintf(stderr, "ggml_validate_row_data: found inf value at block %zu\n", i); + return false; + } + + if (isnan(f)) { + fprintf(stderr, "ggml_validate_row_data: found nan value at block %zu\n", i); + return false; + } + + return true; +} + +static bool isinf_fp16(ggml_fp16_t f) { + return (f & 0x7c00) == 0x7c00 && (f & 0x03ff) == 0; +} + +static bool isnan_fp16(ggml_fp16_t f) { + return (f & 0x7c00) == 0x7c00 && (f & 0x03ff) != 0; +} + +static bool validate_fp16(ggml_fp16_t f, size_t i) { + if (isinf_fp16(f)) { + fprintf(stderr, "ggml_validate_row_data: found inf value at block %zu\n", i); + return false; + } + + if (isnan_fp16(f)) { + fprintf(stderr, "ggml_validate_row_data: found nan value at block %zu\n", i); + return false; + } + + return true; +} + +#define VALIDATE_ROW_DATA_D_F16_IMPL(type, data, nb) \ + const type * q = (const type *) (data); \ + for (size_t i = 0; i < (nb); ++i) { \ + if (!validate_fp16(q[i].d, i)) { \ + return false; \ + } \ + } + +#define VALIDATE_ROW_DATA_DM_F16_IMPL(type, data, nb, d, m) \ + const type * q = (const type *) (data); \ + for (size_t i = 0; i 
< (nb); ++i) { \ + if (!validate_fp16(q[i].d, i) || !validate_fp16(q[i].m, i)) { \ + return false; \ + } \ + } + +bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbytes) { + if (type < 0 || type >= GGML_TYPE_COUNT) { + fprintf(stderr, "%s: invalid type %d\n", __func__, type); + return false; + } + + if (nbytes % ggml_type_size(type) != 0) { + fprintf(stderr, "%s: invalid size %zu for type %d\n", __func__, nbytes, type); + return false; + } + + const size_t nb = nbytes/ggml_type_size(type); + + switch (type) { + case GGML_TYPE_F16: + { + const ggml_fp16_t * f = (const ggml_fp16_t *) data; + size_t i = 0; +#if defined(__AVX2__) + for (; i + 15 < nb; i += 16) { + __m256i v = _mm256_loadu_si256((const __m256i *)(f + i)); + __m256i vexp = _mm256_and_si256(v, _mm256_set1_epi16(0x7c00)); + __m256i cmp = _mm256_cmpeq_epi16(vexp, _mm256_set1_epi16(0x7c00)); + int mask = _mm256_movemask_epi8(cmp); + if (mask) { + for (size_t j = 0; j < 16; ++j) { + if (!validate_fp16(f[i + j], i + j)) { + return false; + } + } + GGML_UNREACHABLE(); + } + } +#elif defined(__ARM_NEON) + for (; i + 7 < nb; i += 8) { + uint16x8_t v = vld1q_u16(f + i); + uint16x8_t vexp = vandq_u16(v, vdupq_n_u16(0x7c00)); + uint16x8_t cmp = vceqq_u16(vexp, vdupq_n_u16(0x7c00)); + uint64_t mask = vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(cmp, 4)), 0); + if (mask) { + for (size_t j = 0; j < 8; ++j) { + if (!validate_fp16(f[i + j], i + j)) { + return false; + } + } + GGML_UNREACHABLE(); + } + } +#endif + for (; i < nb; ++i) { + if (!validate_fp16(f[i], i)) { + return false; + } + } + } break; + case GGML_TYPE_F32: + { + const float * f = (const float *) data; + size_t i = 0; +#if defined(__AVX2__) + for (; i + 7 < nb; i += 8) { + __m256i v = _mm256_loadu_si256((const __m256i *)(f + i)); + __m256i vexp = _mm256_and_si256(v, _mm256_set1_epi32(0x7f800000)); + __m256i cmp = _mm256_cmpeq_epi32(vexp, _mm256_set1_epi32(0x7f800000)); + int mask = _mm256_movemask_epi8(cmp); + if (mask) { + for (size_t j = 0; j < 8; ++j) { + if (!validate_float(f[i + j], i + j)) { + return false; + } + } + GGML_UNREACHABLE(); + } + } +#elif defined(__ARM_NEON) + for (; i + 3 < nb; i += 4) { + uint32x4_t v = vld1q_u32((const uint32_t *)f + i); + uint32x4_t vexp = vandq_u32(v, vdupq_n_u32(0x7f800000)); + uint32x4_t cmp = vceqq_u32(vexp, vdupq_n_u32(0x7f800000)); + uint64_t mask = vget_lane_u64(vreinterpret_u64_u16(vshrn_n_u32(cmp, 8)), 0); + if (mask) { + for (size_t j = 0; j < 4; ++j) { + if (!validate_float(f[i + j], i + j)) { + return false; + } + } + GGML_UNREACHABLE(); + } + } +#endif + for (; i < nb; ++i) { + if (!validate_float(f[i], i)) { + return false; + } + } + } break; + case GGML_TYPE_F64: + { + const double * f = (const double *) data; + for (size_t i = 0; i < nb; ++i) { + if (!validate_float(f[i], i)) { + return false; + } + } + } break; + case GGML_TYPE_Q4_0: + { + VALIDATE_ROW_DATA_D_F16_IMPL(block_q4_0, data, nb); + } break; + case GGML_TYPE_Q4_1: + { + VALIDATE_ROW_DATA_DM_F16_IMPL(block_q4_1, data, nb, d, m); + } break; + case GGML_TYPE_Q5_0: + { + VALIDATE_ROW_DATA_D_F16_IMPL(block_q5_0, data, nb); + } break; + case GGML_TYPE_Q5_1: + { + VALIDATE_ROW_DATA_DM_F16_IMPL(block_q5_1, data, nb, d, m); + } break; + case GGML_TYPE_Q8_0: + { + VALIDATE_ROW_DATA_D_F16_IMPL(block_q8_0, data, nb); + } break; + case GGML_TYPE_Q2_K: + { + VALIDATE_ROW_DATA_DM_F16_IMPL(block_q2_K, data, nb, d, dmin); + } break; + case GGML_TYPE_Q3_K: + { + VALIDATE_ROW_DATA_D_F16_IMPL(block_q3_K, data, nb); + } break; + case GGML_TYPE_Q4_K: + { + 
#ifdef GGML_QKK_64 + VALIDATE_ROW_DATA_DM_F16_IMPL(block_q4_K, data, nb, d[0], d[1]); + #else + VALIDATE_ROW_DATA_DM_F16_IMPL(block_q4_K, data, nb, d, dmin); + #endif + } break; + case GGML_TYPE_Q5_K: + { + #ifdef GGML_QKK_64 + VALIDATE_ROW_DATA_D_F16_IMPL(block_q5_K, data, nb); + #else + VALIDATE_ROW_DATA_DM_F16_IMPL(block_q5_K, data, nb, d, dmin); + #endif + } break; + case GGML_TYPE_Q6_K: + { + VALIDATE_ROW_DATA_D_F16_IMPL(block_q6_K, data, nb); + } break; + case GGML_TYPE_Q8_K: + { + const block_q8_K * q = (const block_q8_K *) data; + for (size_t i = 0; i < nb; ++i) { + if (!validate_float(q[i].d, i)) { + return false; + } + } + } break; + case GGML_TYPE_IQ1_S: + { + VALIDATE_ROW_DATA_D_F16_IMPL(block_iq1_s, data, nb); + } break; + case GGML_TYPE_IQ1_M: + { + const block_iq1_m * q = (const block_iq1_m *) data; + for (size_t i = 0; i < nb; ++i) { + #if QK_K == 64 + if (!validate_fp16(q[i].d, i)) { + return false; + } + #else + iq1m_scale_t scale; + const uint16_t * sc = (const uint16_t *)q[i].scales; + scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000); + if (!validate_fp16(scale.f16, i)) { + return false; + } + #endif + } + } break; + case GGML_TYPE_IQ2_XXS: + { + VALIDATE_ROW_DATA_D_F16_IMPL(block_iq2_xxs, data, nb); + } break; + case GGML_TYPE_IQ2_XS: + { + VALIDATE_ROW_DATA_D_F16_IMPL(block_iq2_xs, data, nb); + } break; + case GGML_TYPE_IQ2_S: + { + VALIDATE_ROW_DATA_D_F16_IMPL(block_iq2_s, data, nb); + } break; + case GGML_TYPE_IQ3_XXS: + { + VALIDATE_ROW_DATA_D_F16_IMPL(block_iq3_xxs, data, nb); + } break; + + case GGML_TYPE_IQ3_S: + { + VALIDATE_ROW_DATA_D_F16_IMPL(block_iq3_s, data, nb); + } break; + case GGML_TYPE_IQ4_XS: + #if QK_K != 64 + { + VALIDATE_ROW_DATA_D_F16_IMPL(block_iq4_xs, data, nb); + } break; + #endif + // with QK_K == 64, iq4_xs is iq4_nl + case GGML_TYPE_IQ4_NL: + { + VALIDATE_ROW_DATA_D_F16_IMPL(block_iq4_nl, data, nb); + } break; + case GGML_TYPE_I8: + case GGML_TYPE_I16: + case GGML_TYPE_I32: + case GGML_TYPE_I64: + // nothing to validate + break; + default: + { + fprintf(stderr, "%s: invalid type %d\n", __func__, type); + return false; + } + } + + return true; +} diff --git a/ggml.h b/ggml.h index 4d1d77fe9..86e5a8dc5 100644 --- a/ggml.h +++ b/ggml.h @@ -762,6 +762,8 @@ extern "C" { // use this to compute the memory overhead of a tensor GGML_API size_t ggml_tensor_overhead(void); + GGML_API bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbytes); + // main GGML_API struct ggml_context * ggml_init(struct ggml_init_params params); diff --git a/llama.cpp b/llama.cpp index d728bd499..e5e640016 100644 --- a/llama.cpp +++ b/llama.cpp @@ -75,6 +75,7 @@ #include #include #include +#include #include #include #include @@ -2985,6 +2986,7 @@ struct llama_model_loader { size_t n_bytes = 0; bool use_mmap = false; + bool check_tensors; llama_files files; llama_ftype ftype; @@ -3018,7 +3020,7 @@ struct llama_model_loader { std::string arch_name; LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN); - llama_model_loader(const std::string & fname, bool use_mmap, const struct llama_model_kv_override * param_overrides_p) { + llama_model_loader(const std::string & fname, bool use_mmap, bool check_tensors, const struct llama_model_kv_override * param_overrides_p) { int trace = 0; if (getenv("LLAMA_TRACE")) { trace = atoi(getenv("LLAMA_TRACE")); @@ -3223,6 +3225,7 @@ struct llama_model_loader { } this->use_mmap = use_mmap; + this->check_tensors = check_tensors; } ~llama_model_loader() { @@ -3481,6 +3484,10 @@ struct 
llama_model_loader { file->seek(w.offs, SEEK_SET); file->read_raw(cur->data, ggml_nbytes(cur)); } + + if (check_tensors && !ggml_validate_row_data(cur->type, cur->data, ggml_nbytes(cur))) { + throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur))); + } } size_t size_done = 0; @@ -3497,6 +3504,8 @@ struct llama_model_loader { GGML_ASSERT(size_data != 0 && "call init_mappings() first"); std::vector> read_buf; + std::vector>> validation_result; + for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) { const auto * weight = get_weight(ggml_get_name(cur)); if (weight == nullptr) { @@ -3518,37 +3527,66 @@ struct llama_model_loader { if (bufs_mmap.count(weight->idx)) { buf_mmap = bufs_mmap.at(weight->idx); } + uint8_t * data = (uint8_t *) mapping->addr + weight->offs; + + if (check_tensors) { + validation_result.emplace_back(std::async(std::launch::async, [cur, data, n_size] { + return std::make_pair(cur, ggml_validate_row_data(cur->type, data, n_size)); + })); + } + GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated if (buf_mmap && cur->data == nullptr) { - ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *) mapping->addr + weight->offs); + ggml_backend_tensor_alloc(buf_mmap, cur, data); if (lmlocks) { const auto & lmlock = lmlocks->at(weight->idx); - lmlock->grow_to(weight->offs + ggml_nbytes(cur)); + lmlock->grow_to(weight->offs + n_size); } auto & mmap_used = mmaps_used[weight->idx]; mmap_used.first = std::min(mmap_used.first, weight->offs); mmap_used.second = std::max(mmap_used.second, weight->offs + n_size); } else { - ggml_backend_tensor_set(cur, (uint8_t *) mapping->addr + weight->offs, 0, n_size); + ggml_backend_tensor_set(cur, data, 0, n_size); } } else { GGML_ASSERT(weight->idx < files.size()); const auto & file = files.at(weight->idx); if (ggml_backend_buffer_is_host(cur->buffer)) { file->seek(weight->offs, SEEK_SET); - file->read_raw(cur->data, ggml_nbytes(cur)); + file->read_raw(cur->data, n_size); + if (check_tensors) { + validation_result.emplace_back(std::async(std::launch::async, [cur, n_size] { + return std::make_pair(cur, ggml_validate_row_data(cur->type, cur->data, n_size)); + })); + } } else { - read_buf.resize(ggml_nbytes(cur)); + read_buf.resize(n_size); file->seek(weight->offs, SEEK_SET); - file->read_raw(read_buf.data(), ggml_nbytes(cur)); + file->read_raw(read_buf.data(), n_size); ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size); + if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) { + throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur))); + } } } size_done += n_size; } + // check validation results + bool validation_failed = false; + for (auto & future : validation_result) { + auto result = future.get(); + if (!result.second) { + LLAMA_LOG_ERROR("%s: tensor '%s' has invalid data\n", __func__, ggml_get_name(result.first)); + validation_failed = true; + } + } + if (validation_failed) { + throw std::runtime_error("found tensors with invalid data"); + } + // check if this is the last call and do final cleanup if (size_done >= size_data) { // unmap offloaded tensors and metadata @@ -5975,7 +6013,7 @@ static bool llm_load_tensors( // Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) { try { - llama_model_loader 
ml(fname, params.use_mmap, params.kv_overrides); + llama_model_loader ml(fname, params.use_mmap, params.check_tensors, params.kv_overrides); model.hparams.vocab_only = params.vocab_only; @@ -14360,14 +14398,20 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n } static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int64_t chunk_size, int64_t nrows, int64_t n_per_row, const float * imatrix, std::vector & workers, const int nthread) { + if (nthread < 2) { + // single-thread + size_t new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, imatrix); + if (!ggml_validate_row_data(new_type, new_data, new_size)) { + throw std::runtime_error("quantized data validation failed"); + } + return new_size; + } + std::mutex mutex; int64_t counter = 0; size_t new_size = 0; - if (nthread < 2) { - // single-thread - return ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, imatrix); - } - auto compute = [&mutex, &counter, &new_size, new_type, f32_data, new_data, chunk_size, + bool valid = true; + auto compute = [&mutex, &counter, &new_size, &valid, new_type, f32_data, new_data, chunk_size, nrows, n_per_row, imatrix]() { const int64_t nrows_per_chunk = chunk_size / n_per_row; size_t local_size = 0; @@ -14382,7 +14426,17 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa } lock.unlock(); const int64_t this_nrow = std::min(nrows - first_row, nrows_per_chunk); - local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first_row * n_per_row, this_nrow, n_per_row, imatrix); + size_t this_size = ggml_quantize_chunk(new_type, f32_data, new_data, first_row * n_per_row, this_nrow, n_per_row, imatrix); + local_size += this_size; + + // validate the quantized data + const size_t row_size = ggml_row_size(new_type, n_per_row); + void * this_data = (char *) new_data + first_row * row_size; + if (!ggml_validate_row_data(new_type, this_data, this_size)) { + std::unique_lock lock(mutex); + valid = false; + break; + } } }; for (int it = 0; it < nthread - 1; ++it) { @@ -14391,6 +14445,9 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa compute(); for (auto & w : workers) { w.join(); } workers.clear(); + if (!valid) { + throw std::runtime_error("quantized data validation failed"); + } return new_size; } @@ -14453,7 +14510,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s auto v = (std::vector*)params->kv_overrides; kv_overrides = v->data(); } - llama_model_loader ml(fname_inp, use_mmap, kv_overrides); + llama_model_loader ml(fname_inp, use_mmap, /*check_tensors*/ true, kv_overrides); ml.init_mappings(false); // no prefetching llama_model model; @@ -14814,7 +14871,7 @@ static int llama_apply_lora_from_file_internal( std::unique_ptr ml; if (path_base_model) { LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model); - ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*kv_overrides*/ nullptr)); + ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*check_tensors*/ false, /*kv_overrides*/ nullptr)); ml->init_mappings(/*prefetch*/ false); // no prefetching } @@ -15073,6 +15130,7 @@ struct llama_model_params llama_model_default_params() { /*.vocab_only =*/ false, /*.use_mmap =*/ true, /*.use_mlock =*/ false, + /*.check_tensors =*/ false, }; #ifdef GGML_USE_METAL diff --git a/llama.h b/llama.h index 8aa763672..0a79fa765 100644 
--- a/llama.h +++ b/llama.h @@ -232,9 +232,10 @@ extern "C" { const struct llama_model_kv_override * kv_overrides; // Keep the booleans together to avoid misalignment during copy-by-value. - bool vocab_only; // only load the vocabulary, no weights - bool use_mmap; // use mmap if possible - bool use_mlock; // force system to keep model in RAM + bool vocab_only; // only load the vocabulary, no weights + bool use_mmap; // use mmap if possible + bool use_mlock; // force system to keep model in RAM + bool check_tensors; // validate model tensor data }; struct llama_context_params { From 0c4d489e29e53589bf13a801fe7c94b7b546d8f6 Mon Sep 17 00:00:00 2001 From: Pierrick Hymbert Date: Fri, 26 Apr 2024 20:06:33 +0200 Subject: [PATCH 11/12] quantize: add imatrix and dataset metadata in GGUF (#6658) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * imatrix: save the dataset file used in the output file * llama: support kv overrides type string string * common: factorize KV Overrides parsing between common and server * quantize: add imatrix n entries and dataset KV metadata quantize: factorize KV Overrides parsing between common #6656 * llama: remove kv override str_value initialization as it does not compile on some toolchain * quantize: add imatrix m_last_call as `quantize.imatrix.chunks_count` * quantize: add imatrix filename in KV * llama: add llama_model_kv_override_free * common: add llama_model_kv_override_free common: free kv override if used after model loading * llama: finally move the string KV override value to the stack * llama : minor * no need to add a NUL to the std::vector, std::string can be initialized from a pair of iterators. Co-authored-by: slaren * kv override: ensure string termination --------- Co-authored-by: Georgi Gerganov Co-authored-by: slaren --- Makefile | 2 +- common/common.cpp | 88 ++++++++++++++------------ common/common.h | 2 + examples/imatrix/imatrix.cpp | 77 +++++++++++++---------- examples/quantize/CMakeLists.txt | 2 +- examples/quantize/quantize.cpp | 103 ++++++++++++++++++------------- examples/server/server.cpp | 36 +---------- llama.cpp | 35 ++++++----- llama.h | 12 ++-- 9 files changed, 186 insertions(+), 171 deletions(-) diff --git a/Makefile b/Makefile index 24acb8013..9dc35410a 100644 --- a/Makefile +++ b/Makefile @@ -768,7 +768,7 @@ batched-bench: examples/batched-bench/batched-bench.cpp build-info.o ggml. 
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -quantize: examples/quantize/quantize.cpp build-info.o ggml.o llama.o $(OBJS) +quantize: examples/quantize/quantize.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) diff --git a/common/common.cpp b/common/common.cpp index 9ab8aef7e..d42fa131d 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -234,8 +234,54 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { return result; } +bool parse_kv_override(const char * data, std::vector & overrides) { + const char * sep = strchr(data, '='); + if (sep == nullptr || sep - data >= 128) { + fprintf(stderr, "%s: malformed KV override '%s'\n", __func__, data); + return false; + } + llama_model_kv_override kvo; + std::strncpy(kvo.key, data, sep - data); + kvo.key[sep - data] = 0; + sep++; + if (strncmp(sep, "int:", 4) == 0) { + sep += 4; + kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT; + kvo.val_i64 = std::atol(sep); + } else if (strncmp(sep, "float:", 6) == 0) { + sep += 6; + kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT; + kvo.val_f64 = std::atof(sep); + } else if (strncmp(sep, "bool:", 5) == 0) { + sep += 5; + kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL; + if (std::strcmp(sep, "true") == 0) { + kvo.val_bool = true; + } else if (std::strcmp(sep, "false") == 0) { + kvo.val_bool = false; + } else { + fprintf(stderr, "%s: invalid boolean value for KV override '%s'\n", __func__, data); + return false; + } + } else if (strncmp(sep, "str:", 4) == 0) { + sep += 4; + kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR; + if (strlen(sep) > 127) { + fprintf(stderr, "%s: malformed KV override '%s', value cannot exceed 127 chars\n", __func__, data); + return false; + } + strncpy(kvo.val_str, sep, 127); + kvo.val_str[127] = '\0'; + } else { + fprintf(stderr, "%s: invalid type for KV override '%s'\n", __func__, data); + return false; + } + overrides.emplace_back(std::move(kvo)); + return true; +} + bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param) { - llama_sampling_params& sparams = params.sparams; + llama_sampling_params & sparams = params.sparams; if (arg == "-s" || arg == "--seed") { if (++i >= argc) { @@ -1244,47 +1290,11 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa invalid_param = true; return true; } - char* sep = strchr(argv[i], '='); - if (sep == nullptr || sep - argv[i] >= 128) { - fprintf(stderr, "error: Malformed KV override: %s\n", argv[i]); - invalid_param = true; - return true; - } - struct llama_model_kv_override kvo; - std::strncpy(kvo.key, argv[i], sep - argv[i]); - kvo.key[sep - argv[i]] = 0; - sep++; - if (strncmp(sep, "int:", 4) == 0) { - sep += 4; - kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT; - kvo.int_value = std::atol(sep); - } - else if (strncmp(sep, "float:", 6) == 0) { - sep += 6; - kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT; - kvo.float_value = std::atof(sep); - } - else if (strncmp(sep, "bool:", 5) == 0) { - sep += 5; - kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL; - if (std::strcmp(sep, "true") == 0) { - kvo.bool_value = true; - } - else if (std::strcmp(sep, "false") == 0) { - kvo.bool_value = false; - } - else { - fprintf(stderr, "error: Invalid boolean value for KV override: %s\n", argv[i]); - invalid_param = true; - return true; - } - } - else { + if (!parse_kv_override(argv[i], 
params.kv_overrides)) { fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]); invalid_param = true; return true; } - params.kv_overrides.push_back(kvo); return true; } #ifndef LOG_DISABLE_LOGS @@ -1555,7 +1565,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" path to dynamic lookup cache to use for lookup decoding (updated by generation)\n"); printf(" --override-kv KEY=TYPE:VALUE\n"); printf(" advanced option to override model metadata by key. may be specified multiple times.\n"); - printf(" types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n"); + printf(" types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n"); printf(" -ptc N, --print-token-count N\n"); printf(" print token count every N tokens (default: %d)\n", params.n_print); printf(" --check-tensors check model tensor data for invalid values\n"); diff --git a/common/common.h b/common/common.h index 0005f143e..96a28a6ce 100644 --- a/common/common.h +++ b/common/common.h @@ -171,6 +171,8 @@ struct gpt_params { std::string image = ""; // path to an image file }; +bool parse_kv_override(const char * data, std::vector & overrides); + bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params); bool gpt_params_parse(int argc, char ** argv, gpt_params & params); diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp index 98c0e93e4..71e7a727f 100644 --- a/examples/imatrix/imatrix.cpp +++ b/examples/imatrix/imatrix.cpp @@ -23,6 +23,7 @@ struct Stats { }; struct StatParams { + std::string dataset; std::string ofile = "imatrix.dat"; int n_output_frequency = 10; int verbosity = 1; @@ -46,7 +47,7 @@ private: std::vector m_src1_data; std::vector m_ids; // the expert ids from ggml_mul_mat_id // - void save_imatrix(const char * file_name) const; + void save_imatrix(const char * file_name, const char * dataset) const; void keep_imatrix(int ncall) const; }; @@ -199,7 +200,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * } void IMatrixCollector::save_imatrix() const { - save_imatrix(m_params.ofile.empty() ? "imatrix.dat" : m_params.ofile.c_str()); + save_imatrix(m_params.ofile.empty() ? 
"imatrix.dat" : m_params.ofile.c_str(), m_params.dataset.c_str()); } void IMatrixCollector::keep_imatrix(int ncall) const { @@ -207,24 +208,33 @@ void IMatrixCollector::keep_imatrix(int ncall) const { if (file_name.empty()) file_name = "imatrix.dat"; file_name += ".at_"; file_name += std::to_string(ncall); - save_imatrix(file_name.c_str()); + save_imatrix(file_name.c_str(), m_params.dataset.c_str()); } -void IMatrixCollector::save_imatrix(const char * fname) const { +void IMatrixCollector::save_imatrix(const char * fname, const char * dataset) const { std::ofstream out(fname, std::ios::binary); int n_entries = m_stats.size(); - out.write((const char*)&n_entries, sizeof(n_entries)); - for (auto& p : m_stats) { + out.write((const char *) &n_entries, sizeof(n_entries)); + for (const auto & p : m_stats) { int len = p.first.size(); - out.write((const char*)&len, sizeof(len)); + out.write((const char *) &len, sizeof(len)); out.write(p.first.c_str(), len); - out.write((const char*)&p.second.ncall, sizeof(p.second.ncall)); + out.write((const char *) &p.second.ncall, sizeof(p.second.ncall)); int nval = p.second.values.size(); - out.write((const char*)&nval, sizeof(nval)); - if (nval > 0) out.write((const char*)p.second.values.data(), nval*sizeof(float)); + out.write((const char *) &nval, sizeof(nval)); + if (nval > 0) out.write((const char *) p.second.values.data(), nval * sizeof(float)); } + + // Write the number of call the matrix was computed with + out.write((const char *) &m_last_call, sizeof(m_last_call)); + + // Write the dataset name at the end of the file to later on specify it in quantize + int n_dataset = strlen(dataset); + out.write((const char *) &n_dataset, sizeof(n_dataset)); + out.write(dataset, n_dataset); + if (m_params.verbosity > 0) { - fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n",__func__,m_last_call,fname); + fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname); } } @@ -547,6 +557,29 @@ int main(int argc, char ** argv) { } } + gpt_params params; + params.n_batch = 512; + if (!gpt_params_parse(args.size(), args.data(), params)) { + return 1; + } + + params.logits_all = true; + params.n_batch = std::min(params.n_batch, params.n_ctx); + + print_build_info(); + + if (params.seed == LLAMA_DEFAULT_SEED) { + params.seed = time(NULL); + } + + fprintf(stderr, "%s: seed = %u\n", __func__, params.seed); + + std::mt19937 rng(params.seed); + if (params.random_prompt) { + params.prompt = gpt_random_prompt(rng); + } + + sparams.dataset = params.prompt_file; g_collector.set_parameters(std::move(sparams)); if (!combine_files.empty()) { @@ -585,28 +618,6 @@ int main(int argc, char ** argv) { } } - gpt_params params; - params.n_batch = 512; - if (!gpt_params_parse(args.size(), args.data(), params)) { - return 1; - } - - params.logits_all = true; - params.n_batch = std::min(params.n_batch, params.n_ctx); - - print_build_info(); - - if (params.seed == LLAMA_DEFAULT_SEED) { - params.seed = time(NULL); - } - - fprintf(stderr, "%s: seed = %u\n", __func__, params.seed); - - std::mt19937 rng(params.seed); - if (params.random_prompt) { - params.prompt = gpt_random_prompt(rng); - } - llama_backend_init(); llama_numa_init(params.numa); diff --git a/examples/quantize/CMakeLists.txt b/examples/quantize/CMakeLists.txt index 6f374a2bd..6b977fde8 100644 --- a/examples/quantize/CMakeLists.txt +++ b/examples/quantize/CMakeLists.txt @@ -1,6 +1,6 @@ set(TARGET quantize) add_executable(${TARGET} quantize.cpp) install(TARGETS ${TARGET} RUNTIME) 
-target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT}) target_include_directories(${TARGET} PRIVATE ../../common) target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index da1850dfd..432cc2b4f 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -8,7 +8,6 @@ #include #include #include -#include struct quant_option { std::string name; @@ -53,6 +52,10 @@ static const std::vector QUANT_OPTIONS = { { "COPY", LLAMA_FTYPE_ALL_F32, "only copy tensors, no quantizing", }, }; +static const char * const LLM_KV_QUANTIZE_IMATRIX_FILE = "quantize.imatrix.file"; +static const char * const LLM_KV_QUANTIZE_IMATRIX_DATASET = "quantize.imatrix.dataset"; +static const char * const LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES = "quantize.imatrix.entries_count"; +static const char * const LLM_KV_QUANTIZE_IMATRIX_N_CHUNKS = "quantize.imatrix.chunks_count"; static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std::string & ftype_str_out) { std::string ftype_str; @@ -113,7 +116,7 @@ static void usage(const char * executable) { exit(1); } -static void load_imatrix(const std::string & imatrix_file, std::unordered_map> & imatrix_data) { +static int load_imatrix(const std::string & imatrix_file, std::string & imatrix_dataset, std::unordered_map> & imatrix_data) { std::ifstream in(imatrix_file.c_str(), std::ios::binary); if (!in) { printf("%s: failed to open %s\n",__func__, imatrix_file.c_str()); @@ -160,18 +163,33 @@ static void load_imatrix(const std::string & imatrix_file, std::unordered_map dataset_as_vec(dataset_len); + in.read(dataset_as_vec.data(), dataset_len); + imatrix_dataset.assign(dataset_as_vec.begin(), dataset_as_vec.end()); + printf("%s: imatrix dataset='%s'\n", __func__, imatrix_dataset.c_str()); + } + printf("%s: loaded %d importance matrix entries from %s computed on %d chunks\n", __func__, int(imatrix_data.size()), imatrix_file.c_str(), m_last_call); + return m_last_call; } -static void prepare_imatrix(const std::string & imatrix_file, +static int prepare_imatrix(const std::string & imatrix_file, + std::string & imatrix_dataset, const std::vector & included_weights, const std::vector & excluded_weights, std::unordered_map> & imatrix_data) { + int m_last_call = -1; if (!imatrix_file.empty()) { - load_imatrix(imatrix_file, imatrix_data); + m_last_call = load_imatrix(imatrix_file, imatrix_dataset, imatrix_data); } if (imatrix_data.empty()) { - return; + return m_last_call; } if (!excluded_weights.empty()) { for (auto& name : excluded_weights) { @@ -197,6 +215,7 @@ static void prepare_imatrix(const std::string & imatrix_file, if (!imatrix_data.empty()) { printf("%s: have %d importance matrix entries\n", __func__, int(imatrix_data.size())); } + return m_last_call; } static ggml_type parse_ggml_type(const char * arg) { @@ -211,43 +230,6 @@ static ggml_type parse_ggml_type(const char * arg) { return result; } -static bool parse_kv_override(const char * data, std::vector & overrides) { - const char* sep = strchr(data, '='); - if (sep == nullptr || sep - data >= 128) { - fprintf(stderr, "%s: malformed KV override '%s'\n", __func__, data); - return false; - } - llama_model_kv_override kvo; - std::strncpy(kvo.key, data, sep - data); - kvo.key[sep - data] = 0; - sep++; - if (strncmp(sep, "int:", 4) == 0) { - sep += 4; - kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT; - kvo.int_value = 
std::atol(sep); - } else if (strncmp(sep, "float:", 6) == 0) { - sep += 6; - kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT; - kvo.float_value = std::atof(sep); - } else if (strncmp(sep, "bool:", 5) == 0) { - sep += 5; - kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL; - if (std::strcmp(sep, "true") == 0) { - kvo.bool_value = true; - } else if (std::strcmp(sep, "false") == 0) { - kvo.bool_value = false; - } else { - fprintf(stderr, "%s: invalid boolean value for KV override '%s'\n", __func__, data); - return false; - } - } else { - fprintf(stderr, "%s: invalid type for KV override '%s'\n", __func__, data); - return false; - } - overrides.emplace_back(std::move(kvo)); - return true; -} - int main(int argc, char ** argv) { if (argc < 3) { usage(argv[0]); @@ -316,10 +298,43 @@ int main(int argc, char ** argv) { usage(argv[0]); } + std::string imatrix_dataset; std::unordered_map> imatrix_data; - prepare_imatrix(imatrix_file, included_weights, excluded_weights, imatrix_data); + int m_last_call = prepare_imatrix(imatrix_file, imatrix_dataset, included_weights, excluded_weights, imatrix_data); if (!imatrix_data.empty()) { params.imatrix = &imatrix_data; + { + llama_model_kv_override kvo; + std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_FILE); + kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR; + strncpy(kvo.val_str, imatrix_file.c_str(), 127); + kvo.val_str[127] = '\0'; + kv_overrides.emplace_back(std::move(kvo)); + } + if (!imatrix_dataset.empty()) { + llama_model_kv_override kvo; + std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_DATASET); + kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR; + strncpy(kvo.val_str, imatrix_dataset.c_str(), 127); + kvo.val_str[127] = '\0'; + kv_overrides.emplace_back(std::move(kvo)); + } + + { + llama_model_kv_override kvo; + std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES); + kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT; + kvo.val_i64 = imatrix_data.size(); + kv_overrides.emplace_back(std::move(kvo)); + } + + if (m_last_call > 0) { + llama_model_kv_override kvo; + std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_N_CHUNKS); + kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT; + kvo.val_i64 = m_last_call; + kv_overrides.emplace_back(std::move(kvo)); + } } if (!kv_overrides.empty()) { kv_overrides.emplace_back(); diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 48ef8ff2a..6f8ba3fc6 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2392,7 +2392,7 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co printf(" -n, --n-predict maximum tokens to predict (default: %d)\n", params.n_predict); printf(" --override-kv KEY=TYPE:VALUE\n"); printf(" advanced option to override model metadata by key. may be specified multiple times.\n"); - printf(" types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n"); + printf(" types: int, float, bool, str. 
example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n"); printf(" -gan N, --grp-attn-n N set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`\n"); printf(" -gaw N, --grp-attn-w N set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`\n"); printf(" --chat-template JINJA_TEMPLATE\n"); @@ -2823,43 +2823,11 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams, invalid_param = true; break; } - char * sep = strchr(argv[i], '='); - if (sep == nullptr || sep - argv[i] >= 128) { - fprintf(stderr, "error: Malformed KV override: %s\n", argv[i]); - invalid_param = true; - break; - } - - struct llama_model_kv_override kvo; - std::strncpy(kvo.key, argv[i], sep - argv[i]); - kvo.key[sep - argv[i]] = 0; - sep++; - if (strncmp(sep, "int:", 4) == 0) { - sep += 4; - kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT; - kvo.int_value = std::atol(sep); - } else if (strncmp(sep, "float:", 6) == 0) { - sep += 6; - kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT; - kvo.float_value = std::atof(sep); - } else if (strncmp(sep, "bool:", 5) == 0) { - sep += 5; - kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL; - if (std::strcmp(sep, "true") == 0) { - kvo.bool_value = true; - } else if (std::strcmp(sep, "false") == 0) { - kvo.bool_value = false; - } else { - fprintf(stderr, "error: Invalid boolean value for KV override: %s\n", argv[i]); - invalid_param = true; - break; - } - } else { + if (!parse_kv_override(argv[i], params.kv_overrides)) { fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]); invalid_param = true; break; } - params.kv_overrides.push_back(kvo); } else { fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); server_print_usage(argv[0], default_params, default_sparams); diff --git a/llama.cpp b/llama.cpp index e5e640016..dd8b1f264 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2883,6 +2883,7 @@ namespace GGUFMeta { case LLAMA_KV_OVERRIDE_TYPE_BOOL: return "bool"; case LLAMA_KV_OVERRIDE_TYPE_INT: return "int"; case LLAMA_KV_OVERRIDE_TYPE_FLOAT: return "float"; + case LLAMA_KV_OVERRIDE_TYPE_STR: return "str"; } return "unknown"; } @@ -2894,13 +2895,16 @@ namespace GGUFMeta { __func__, override_type_to_str(ovrd->tag), ovrd->key); switch (ovrd->tag) { case LLAMA_KV_OVERRIDE_TYPE_BOOL: { - LLAMA_LOG_INFO("%s\n", ovrd->bool_value ? "true" : "false"); + LLAMA_LOG_INFO("%s\n", ovrd->val_bool ? "true" : "false"); } break; case LLAMA_KV_OVERRIDE_TYPE_INT: { - LLAMA_LOG_INFO("%" PRId64 "\n", ovrd->int_value); + LLAMA_LOG_INFO("%" PRId64 "\n", ovrd->val_i64); } break; case LLAMA_KV_OVERRIDE_TYPE_FLOAT: { - LLAMA_LOG_INFO("%.6f\n", ovrd->float_value); + LLAMA_LOG_INFO("%.6f\n", ovrd->val_f64); + } break; + case LLAMA_KV_OVERRIDE_TYPE_STR: { + LLAMA_LOG_INFO("%s\n", ovrd->val_str); } break; default: // Shouldn't be possible to end up here, but just in case... 
@@ -2919,7 +2923,7 @@ namespace GGUFMeta { static typename std::enable_if::value, bool>::type try_override(OT & target, const struct llama_model_kv_override * ovrd) { if (validate_override(LLAMA_KV_OVERRIDE_TYPE_BOOL, ovrd)) { - target = ovrd->bool_value; + target = ovrd->val_bool; return true; } return false; @@ -2929,7 +2933,7 @@ namespace GGUFMeta { static typename std::enable_if::value && std::is_integral::value, bool>::type try_override(OT & target, const struct llama_model_kv_override * ovrd) { if (validate_override(LLAMA_KV_OVERRIDE_TYPE_INT, ovrd)) { - target = ovrd->int_value; + target = ovrd->val_i64; return true; } return false; @@ -2939,7 +2943,7 @@ namespace GGUFMeta { static typename std::enable_if::value, bool>::type try_override(T & target, const struct llama_model_kv_override * ovrd) { if (validate_override(LLAMA_KV_OVERRIDE_TYPE_FLOAT, ovrd)) { - target = ovrd->float_value; + target = ovrd->val_f64; return true; } return false; @@ -2948,12 +2952,11 @@ namespace GGUFMeta { template static typename std::enable_if::value, bool>::type try_override(T & target, const struct llama_model_kv_override * ovrd) { - (void)target; - (void)ovrd; - if (!ovrd) { return false; } - // Currently, we should never end up here so it would be a bug if we do. - throw std::runtime_error(format("Unsupported attempt to override string type for metadata key %s\n", - ovrd ? ovrd->key : "NULL")); + if (validate_override(LLAMA_KV_OVERRIDE_TYPE_STR, ovrd)) { + target = ovrd->val_str; + return true; + } + return false; } static bool set(const gguf_context * ctx, const int k, T & target, const struct llama_model_kv_override * ovrd = nullptr) { @@ -14548,11 +14551,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s for (auto & o : overrides) { if (o.key[0] == 0) break; if (o.tag == LLAMA_KV_OVERRIDE_TYPE_FLOAT) { - gguf_set_val_f32(ctx_out, o.key, o.float_value); + gguf_set_val_f32(ctx_out, o.key, o.val_f64); } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_INT) { - gguf_set_val_i32(ctx_out, o.key, o.int_value); + gguf_set_val_i32(ctx_out, o.key, o.val_i64); } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_BOOL) { - gguf_set_val_bool(ctx_out, o.key, o.bool_value); + gguf_set_val_bool(ctx_out, o.key, o.val_bool); + } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_STR) { + gguf_set_val_str(ctx_out, o.key, o.val_str); } else { LLAMA_LOG_WARN("%s: unknown KV override type for key %s\n", __func__, o.key); } diff --git a/llama.h b/llama.h index 0a79fa765..8b1b15ed4 100644 --- a/llama.h +++ b/llama.h @@ -195,15 +195,19 @@ extern "C" { LLAMA_KV_OVERRIDE_TYPE_INT, LLAMA_KV_OVERRIDE_TYPE_FLOAT, LLAMA_KV_OVERRIDE_TYPE_BOOL, + LLAMA_KV_OVERRIDE_TYPE_STR, }; struct llama_model_kv_override { - char key[128]; enum llama_model_kv_override_type tag; + + char key[128]; + union { - int64_t int_value; - double float_value; - bool bool_value; + int64_t val_i64; + double val_f64; + bool val_bool; + char val_str[128]; }; }; From 928e0b7013c862cf10701957b3d654aa70f11bd8 Mon Sep 17 00:00:00 2001 From: agray3 Date: Fri, 26 Apr 2024 19:08:30 +0100 Subject: [PATCH 12/12] Reset schedule earlier to allow overlap with ggml graph computation on device (#6933) * Reset schedule earlier to allow overlap with graph computation on device --- ggml-backend.c | 12 +++++++----- llama.cpp | 4 ++++ 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/ggml-backend.c b/ggml-backend.c index e91d97cd9..f5bdcf078 100644 --- a/ggml-backend.c +++ b/ggml-backend.c @@ -1784,12 +1784,14 @@ void 
ggml_backend_sched_free(ggml_backend_sched_t sched) { void ggml_backend_sched_reset(ggml_backend_sched_t sched) { // reset state for the next run - size_t hash_size = sched->hash_set.size; - memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size); // NOLINT - memset(sched->tensor_backend_id, -1, sizeof(sched->tensor_backend_id[0]) * hash_size); - memset(sched->tensor_copies, 0, sizeof(sched->tensor_copies[0]) * hash_size); + if (!sched->is_reset) { + size_t hash_size = sched->hash_set.size; + memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size); // NOLINT + memset(sched->tensor_backend_id, -1, sizeof(sched->tensor_backend_id[0]) * hash_size); + memset(sched->tensor_copies, 0, sizeof(sched->tensor_copies[0]) * hash_size); - sched->is_reset = true; + sched->is_reset = true; + } sched->is_alloc = false; } diff --git a/llama.cpp b/llama.cpp index dd8b1f264..49f2b559e 100644 --- a/llama.cpp +++ b/llama.cpp @@ -11473,6 +11473,10 @@ static int llama_decode_internal( } } + // Reset state for the next token before backend sync, to allow the CPU activities in the reset to + // overlap with device computation. + ggml_backend_sched_reset(lctx.sched); + return 0; }
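
For reference, the string override type introduced in this series can also be populated directly by API users, in the same way quantize.cpp does above. Below is a minimal sketch, not part of the patch: it assumes the existing llama_model_params.kv_overrides pointer and uses an illustrative key/value pair ("general.name" = "my-model"); any real key must of course exist as a string-typed field in the model's GGUF metadata for the override to take effect.

    // sketch only: build a null-terminated list of KV overrides using the new
    // val_str union member and the LLAMA_KV_OVERRIDE_TYPE_STR tag
    #include <cstring>
    #include <vector>
    #include "llama.h"

    static std::vector<llama_model_kv_override> make_overrides() {
        std::vector<llama_model_kv_override> kv_overrides;

        llama_model_kv_override kvo;
        std::strcpy(kvo.key, "general.name");          // fits within key[128]
        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
        std::strncpy(kvo.val_str, "my-model", 127);    // val_str is char[128]
        kvo.val_str[127] = '\0';
        kv_overrides.emplace_back(kvo);

        // consumers stop at the first entry whose key is empty,
        // mirroring the "if (o.key[0] == 0) break;" check in llama.cpp
        kv_overrides.emplace_back();
        kv_overrides.back().key[0] = '\0';

        return kv_overrides;
    }

    // usage (the overrides vector must outlive the model load call):
    //   auto overrides = make_overrides();
    //   llama_model_params mparams = llama_model_default_params();
    //   mparams.kv_overrides = overrides.data();

The same pattern is what quantize.cpp relies on when it records the imatrix file name, dataset, entry count, and chunk count as "quantize.imatrix.*" metadata during quantization.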