From 807d1705db29dfad6dc6106814bb4874a9c3661c Mon Sep 17 00:00:00 2001
From: mudler
Date: Mon, 19 Jun 2023 14:49:54 +0200
Subject: [PATCH] Workaround struct misalignment during value-copy

Signed-off-by: mudler
---
 llama.cpp | 12 ++++++------
 llama.h   | 16 +++++++---------
 2 files changed, 13 insertions(+), 15 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 5401db00e..eea1cd99d 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -925,19 +925,19 @@ static bool kv_cache_init(
 
 struct llama_context_params llama_context_default_params() {
     struct llama_context_params result = {
-        /*.n_ctx                       =*/ 512,
-        /*.n_batch                     =*/ 512,
-        /*.gpu_layers                  =*/ 0,
-        /*.main_gpu                    =*/ 0,
-        /*.tensor_split                =*/ {0},
         /*.low_vram                    =*/ false,
-        /*.seed                        =*/ -1,
         /*.f16_kv                      =*/ true,
         /*.logits_all                  =*/ false,
         /*.vocab_only                  =*/ false,
         /*.use_mmap                    =*/ true,
         /*.use_mlock                   =*/ false,
         /*.embedding                   =*/ false,
+        /*.seed                        =*/ -1,
+        /*.n_ctx                       =*/ 512,
+        /*.n_batch                     =*/ 512,
+        /*.gpu_layers                  =*/ 0,
+        /*.main_gpu                    =*/ 0,
+        /*.tensor_split                =*/ {0},
         /*.progress_callback           =*/ nullptr,
         /*.progress_callback_user_data =*/ nullptr,
     };
diff --git a/llama.h b/llama.h
index 1241ba6c0..c41873ff0 100644
--- a/llama.h
+++ b/llama.h
@@ -71,15 +71,8 @@ extern "C" {
 
     typedef void (*llama_progress_callback)(float progress, void *ctx);
 
-    struct llama_context_params {
-        int n_ctx;        // text context
-        int n_batch;      // prompt processing batch size
-        int n_gpu_layers; // number of layers to store in VRAM
-        int main_gpu;     // the GPU that is used for scratch and small tensors
-        float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
+    struct llama_context_params {
         bool low_vram;   // if true, reduce VRAM usage at the cost of performance
-        int seed;        // RNG seed, -1 for random
-
         bool f16_kv;     // use fp16 for KV cache
         bool logits_all; // the llama_eval() call computes all logits, not just the last one
         bool vocab_only; // only load the vocabulary, no weights
@@ -87,12 +80,17 @@ extern "C" {
         bool use_mlock;  // force system to keep model in RAM
         bool embedding;  // embedding mode only
 
+        int seed;         // RNG seed, -1 for random
+        int n_ctx;        // text context
+        int n_batch;      // prompt processing batch size
+        int n_gpu_layers; // number of layers to store in VRAM
+        int main_gpu;     // the GPU that is used for scratch and small tensors
+        float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
         // called with a progress value between 0 and 1, pass NULL to disable
         llama_progress_callback progress_callback;
         // context pointer passed to the progress callback
        void * progress_callback_user_data;
     };
-
     // model file types
     enum llama_ftype {
         LLAMA_FTYPE_ALL_F32 = 0,
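
Note on why the reorder helps (a minimal sketch, not part of the patch to apply): llama_context_params is passed and returned by value across the C API, e.g. by llama_context_default_params(), so its byte layout must match between llama.cpp and any foreign-language mirror of the struct. When 1-byte bool members sit between 4-byte int/float members, the compiler inserts padding, and two sides that disagree on the member order compute different field offsets, so a value-copy lands fields on the wrong bytes. Grouping the bools together and moving the fixed-width members after them keeps the offsets predictable. The demo below only illustrates the padding effect; the struct and field names are hypothetical, not taken from llama.h:

    /* demo.c -- illustrative only; these mini-structs are hypothetical and
     * are not the real llama_context_params. They show how member order
     * changes offsets and padding on a typical ABI where bool is 1 byte
     * and int is 4-byte aligned. */
    #include <stdbool.h>
    #include <stddef.h>
    #include <stdio.h>

    struct interleaved {  /* bools mixed between wider members */
        bool low_vram;    /* offset 0 */
        int  n_ctx;       /* offset 4 -- 3 padding bytes after the bool */
        bool f16_kv;      /* offset 8 */
        int  seed;        /* offset 12 -- 3 more padding bytes */
    };                    /* sizeof == 16 */

    struct grouped {      /* bools first, wider members after, as in the patch */
        bool low_vram;    /* offset 0 */
        bool f16_kv;      /* offset 1 */
        int  n_ctx;       /* offset 4 -- one 2-byte pad, then no more gaps */
        int  seed;        /* offset 8 */
    };                    /* sizeof == 12 */

    int main(void) {
        /* The same logical field lands at a different byte offset in each
         * layout, so a by-value copy between two modules that disagree on
         * the member order would scramble the fields. */
        printf("interleaved: seed at %zu, sizeof %zu\n",
               offsetof(struct interleaved, seed), sizeof(struct interleaved));
        printf("grouped    : seed at %zu, sizeof %zu\n",
               offsetof(struct grouped, seed), sizeof(struct grouped));
        return 0;
    }

On such an ABI the demo (cc demo.c -o demo && ./demo) prints seed at offset 12 and size 16 for the interleaved order versus offset 8 and size 12 for the grouped one; the exact numbers are ABI-dependent, but keeping all bools contiguous is what makes the layout easy for bindings to mirror.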