From 1804adb0cfee4811eaf633741503d683a46e4c77 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 1 Nov 2024 12:58:45 +0200 Subject: [PATCH 1/3] ggml : remove ggml_scratch (#10121) ggml-ci --- ggml/include/ggml.h | 9 ------ ggml/src/ggml.c | 67 ++------------------------------------------- 2 files changed, 2 insertions(+), 74 deletions(-) diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index e5862246c..41df85557 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -655,14 +655,6 @@ extern "C" { void * abort_callback_data; }; - // scratch buffer - // TODO: deprecate and remove - struct ggml_scratch { - size_t offs; - size_t size; - void * data; - }; - struct ggml_init_params { // memory pool size_t mem_size; // bytes @@ -766,7 +758,6 @@ extern "C" { GGML_API size_t ggml_used_mem(const struct ggml_context * ctx); - GGML_API size_t ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch); GGML_API bool ggml_get_no_alloc(struct ggml_context * ctx); GGML_API void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc); diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 59f2ed043..84f2c766b 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -2018,15 +2018,11 @@ struct ggml_context { void * mem_buffer; bool mem_buffer_owned; bool no_alloc; - bool no_alloc_save; // this is used to save the no_alloc state when using scratch buffers int n_objects; struct ggml_object * objects_begin; struct ggml_object * objects_end; - - struct ggml_scratch scratch; - struct ggml_scratch scratch_save; }; struct ggml_context_container { @@ -3879,12 +3875,9 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { /*.mem_buffer =*/ params.mem_buffer ? params.mem_buffer : ggml_aligned_malloc(mem_size), /*.mem_buffer_owned =*/ params.mem_buffer ? false : true, /*.no_alloc =*/ params.no_alloc, - /*.no_alloc_save =*/ params.no_alloc, /*.n_objects =*/ 0, /*.objects_begin =*/ NULL, /*.objects_end =*/ NULL, - /*.scratch =*/ { 0, 0, NULL, }, - /*.scratch_save =*/ { 0, 0, NULL, }, }; GGML_ASSERT(ctx->mem_buffer != NULL); @@ -3904,8 +3897,6 @@ void ggml_reset(struct ggml_context * ctx) { ctx->n_objects = 0; ctx->objects_begin = NULL; ctx->objects_end = NULL; - ctx->scratch = (struct ggml_scratch) { 0, 0, NULL, }; - ctx->scratch_save = (struct ggml_scratch) { 0, 0, NULL, }; } void ggml_free(struct ggml_context * ctx) { @@ -3924,14 +3915,6 @@ size_t ggml_used_mem(const struct ggml_context * ctx) { return ctx->objects_end == NULL ? 0 : ctx->objects_end->offs + ctx->objects_end->size; } -size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch) { - const size_t result = ctx->scratch.data ? ctx->scratch.offs : 0; - - ctx->scratch = scratch; - - return result; -} - bool ggml_get_no_alloc(struct ggml_context * ctx) { return ctx->no_alloc; } @@ -3959,27 +3942,6 @@ size_t ggml_get_max_tensor_size(const struct ggml_context * ctx) { return max_size; } -// IMPORTANT: -// when creating "opt" tensors, always save and load the scratch buffer -// this is an error prone process, but it is necessary to support inplace -// operators when using scratch buffers -// TODO: implement a better way -static void ggml_scratch_save(struct ggml_context * ctx) { - // this is needed to allow opt tensors to store their data - // TODO: again, need to find a better way - ctx->no_alloc_save = ctx->no_alloc; - ctx->no_alloc = false; - - ctx->scratch_save = ctx->scratch; - ctx->scratch.data = NULL; -} - -static void ggml_scratch_load(struct ggml_context * ctx) { - ctx->no_alloc = ctx->no_alloc_save; - - ctx->scratch = ctx->scratch_save; -} - //////////////////////////////////////////////////////////////////////////////// static struct ggml_object * ggml_new_object(struct ggml_context * ctx, enum ggml_object_type type, size_t size) { @@ -4060,29 +4022,13 @@ static struct ggml_tensor * ggml_new_tensor_impl( size_t obj_alloc_size = 0; if (view_src == NULL && !ctx->no_alloc) { - if (ctx->scratch.data != NULL) { - // allocate tensor data in the scratch buffer - if (ctx->scratch.offs + data_size > ctx->scratch.size) { - GGML_LOG_WARN("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n", - __func__, ctx->scratch.offs + data_size, ctx->scratch.size); - assert(false); - return NULL; - } - - data = (char * const) ctx->scratch.data + ctx->scratch.offs; - - ctx->scratch.offs += data_size; - } else { - // allocate tensor data in the context's memory pool - obj_alloc_size = data_size; - } + // allocate tensor data in the context's memory pool + obj_alloc_size = data_size; } struct ggml_object * const obj_new = ggml_new_object(ctx, GGML_OBJECT_TYPE_TENSOR, GGML_TENSOR_SIZE + obj_alloc_size); GGML_ASSERT(obj_new); - // TODO: for recoverable errors, we would need to free the data allocated from the scratch buffer here - struct ggml_tensor * const result = (struct ggml_tensor *)((char *)ctx->mem_buffer + obj_new->offs); #ifdef __clang__ @@ -4178,24 +4124,16 @@ struct ggml_tensor * ggml_new_tensor_4d( } struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value) { - ggml_scratch_save(ctx); - struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1); - ggml_scratch_load(ctx); - ggml_set_i32(result, value); return result; } struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value) { - ggml_scratch_save(ctx); - struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1); - ggml_scratch_load(ctx); - ggml_set_f32(result, value); return result; @@ -20263,7 +20201,6 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) { uint64_t size_eval = 0; // compute size of intermediate results - // TODO: does not take into account scratch buffers !!!! for (int i = 0; i < cgraph->n_nodes; ++i) { size_eval += ggml_nbytes_pad(cgraph->nodes[i]); } From d865d1478cd4e403f82d793c2afcd0f943412f05 Mon Sep 17 00:00:00 2001 From: sasha0552 Date: Fri, 1 Nov 2024 13:33:14 +0000 Subject: [PATCH 2/3] server : fix smart selection of available slot (#10120) * Fix smart selection of available slot * minor fix * replace vectors of tokens with shorthands --- examples/server/server.cpp | 35 +++++++++---------------- examples/server/utils.hpp | 52 ++++++++++++++++++++++++++++++++++---- 2 files changed, 59 insertions(+), 28 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index f914ff88c..54cdb4b72 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -725,12 +725,12 @@ struct server_context { return nullptr; } - server_slot * get_available_slot(const std::string & prompt) { + server_slot * get_available_slot(const server_task & task) { server_slot * ret = nullptr; // find the slot that has at least n% prompt similarity - if (ret == nullptr && slot_prompt_similarity != 0.0f && !prompt.empty()) { - int max_lcp_len = 0; + if (ret == nullptr && slot_prompt_similarity != 0.0f) { + int max_lcs_len = 0; float similarity = 0; for (server_slot & slot : slots) { @@ -740,25 +740,25 @@ struct server_context { } // skip the slot if it does not contains cached tokens - if (slot.prompt_tokens.empty()) { + if (slot.cache_tokens.empty()) { continue; } - // length of the Longest Common Prefix between the current slot's prompt and the input prompt - int lcp_len = longest_common_prefix(slot.cache_tokens, slot.prompt_tokens); + // length of the Longest Common Subsequence between the current slot's prompt and the input prompt + int lcs_len = longest_common_subsequence(slot.cache_tokens, task.prompt_tokens); - // fraction of the common substring length compared to the current slot's prompt length - similarity = static_cast(lcp_len) / static_cast(slot.prompt_tokens.size()); + // fraction of the common subsequence length compared to the current slot's prompt length + similarity = static_cast(lcs_len) / static_cast(slot.cache_tokens.size()); // select the current slot if the criteria match - if (lcp_len > max_lcp_len && similarity > slot_prompt_similarity) { - max_lcp_len = lcp_len; + if (lcs_len > max_lcs_len && similarity > slot_prompt_similarity) { + max_lcs_len = lcs_len; ret = &slot; } } if (ret != nullptr) { - SLT_DBG(*ret, "selected slot by lcp similarity, max_lcp_len = %d, similarity = %f\n", max_lcp_len, similarity); + SLT_DBG(*ret, "selected slot by lcs similarity, max_lcs_len = %d, similarity = %f\n", max_lcs_len, similarity); } } @@ -1514,18 +1514,7 @@ struct server_context { { const int id_slot = json_value(task.data, "id_slot", -1); - server_slot * slot; - - if (id_slot != -1) { - slot = get_slot_by_id(id_slot); - } else { - std::string prompt; - if (task.data.contains("prompt") && task.data.at("prompt").is_string()) { - prompt = json_value(task.data, "prompt", std::string()); - } - - slot = get_available_slot(prompt); - } + server_slot * slot = id_slot != -1 ? get_slot_by_id(id_slot) : get_available_slot(task); if (slot == nullptr) { // if no slot is available, we defer this task for processing later diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index 58f5a5684..871a17a4f 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -439,18 +439,60 @@ static std::string gen_chatcmplid() { // other common utils // -static size_t longest_common_prefix(const std::vector & a, const std::vector & b) { +static size_t longest_common_prefix(const llama_tokens & a, const llama_tokens & b) { size_t i; for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {} return i; } -static size_t longest_common_prefix(const std::string & a, const std::string & b) { - size_t i; - for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {} +static size_t longest_common_subsequence(const llama_tokens & a, const llama_tokens & b) { + // check for empty sequences + if (a.empty() || b.empty()) { + return 0; + } - return i; + // get the lengths of the input sequences + int a_len = a.size(); + int b_len = b.size(); + + // initialize the maximum length of the longest common subsequence (LCS) + int max_length = 0; + + // use two rows instead of a 2D matrix to optimize space + std::vector prev_row(b_len + 1, 0); + std::vector curr_row(b_len + 1, 0); + + // iterate through the elements of a + for (int i = 1; i <= a_len; i++) { + // iterate through the elements of b + for (int j = 1; j <= b_len; j++) { + // if elements at the current positions match + if (a[i - 1] == b[j - 1]) { + // if it's the first element of either sequences, set LCS length to 1 + if (i == 1 || j == 1) { + curr_row[j] = 1; + } else { + // increment LCS length by 1 compared to the previous element + curr_row[j] = prev_row[j - 1] + 1; + } + + // update max_length if necessary + if (curr_row[j] > max_length) { + max_length = curr_row[j]; + } + } else { + // reset LCS length if elements don't match + curr_row[j] = 0; + } + } + + // update the previous row for the next iteration + prev_row = curr_row; + } + + // return the maximum length of the LCS + return max_length; } static bool ends_with(const std::string & str, const std::string & suffix) { From ba6f62eb793d6617892d252f5c04d7685d908a38 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 1 Nov 2024 17:31:51 +0200 Subject: [PATCH 3/3] readme : update hot topics --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 8fe1f4b4b..0378a674e 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,8 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) ## Hot topics -- **Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggerganov/llama.cpp/discussions/9669** +- **Introducing GGUF-my-LoRA** https://github.com/ggerganov/llama.cpp/discussions/10123 +- Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggerganov/llama.cpp/discussions/9669 - Hugging Face GGUF editor: [discussion](https://github.com/ggerganov/llama.cpp/discussions/9268) | [tool](https://huggingface.co/spaces/CISCai/gguf-editor) ----