diff --git a/third_party/ggml/llama.cc b/third_party/ggml/llama.cc index cf6f9f88d..fef8f09c5 100644 --- a/third_party/ggml/llama.cc +++ b/third_party/ggml/llama.cc @@ -27,6 +27,7 @@ │ │ ╚─────────────────────────────────────────────────────────────────────────────*/ #include "third_party/ggml/llama.h" +#include "libc/assert.h" #include "libc/intrin/bits.h" #include "third_party/ggml/ggml.h" #include "third_party/ggml/llama_util.h" @@ -2540,8 +2541,9 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) { if (kv_size) { const size_t elt_size = ggml_element_size(kv_self.k); - char buffer[4096]; - ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true }); + llama_buffer buffer; + buffer.resize(4096); + ggml_context * cpy_ctx = ggml_init({ buffer.size, buffer.addr, /* no_alloc */ true }); ggml_cgraph gf{}; gf.n_threads = 1; @@ -2644,8 +2646,9 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) { LLAMA_ASSERT(kv_self.buf.size == kv_size); const size_t elt_size = ggml_element_size(kv_self.k); - char buffer[4096]; - ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true }); + llama_buffer buffer; + buffer.resize(4096); + ggml_context * cpy_ctx = ggml_init({ buffer.size, buffer.addr, /* no_alloc */ true }); ggml_cgraph gf{}; gf.n_threads = 1; diff --git a/third_party/ggml/llama_util.h b/third_party/ggml/llama_util.h index 614fb20fa..05184945d 100755 --- a/third_party/ggml/llama_util.h +++ b/third_party/ggml/llama_util.h @@ -377,13 +377,13 @@ struct llama_buffer { size_t size = 0; void resize(size_t size) { - delete[] addr; - addr = new uint8_t[size]; + free(addr); + addr = (uint8_t *)memalign(32, size); this->size = size; } ~llama_buffer() { - delete[] addr; + free(addr); } }; #endif