From 1c545e51ed9c8f7ebef225ee5c35a68518f6ab5c Mon Sep 17 00:00:00 2001
From: Thomas Antony
Date: Sun, 19 Mar 2023 16:59:17 -0700
Subject: [PATCH] Update llama_model_load() from master branch

---
 llama.cpp | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 4cd69b7dd..bed40f13d 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -128,7 +128,8 @@ struct llama_context
 
 /* Original code by @ggerganov */
 // load the model's weights from a file
-bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab & vocab, int n_ctx) {
+bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab & vocab, int n_ctx, ggml_type memory_type = GGML_TYPE_F32) {
+
     fprintf(stderr, "%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
 
     std::vector<char> f_buf(1024*1024);
@@ -1071,9 +1072,12 @@ llama_context* llama_init_from_params(const gpt_params& params) {
     llama_model model{};
     gpt_vocab vocab{};
 
+    const ggml_type memory_type = params.memory_f16 ? GGML_TYPE_F16 : GGML_TYPE_F32;
+
     // Compute time taken to load model
     const int64_t t_start = ggml_time_us();
-    bool ret = llama_model_load(params.model, model, vocab, 1024);
+
+    bool ret = llama_model_load(params.model, model, vocab, params.n_ctx, memory_type);
     const int64_t t_end = ggml_time_us();
 
     if(!ret) {
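
Note (not part of the patch): the hunk in llama_init_from_params() derives the KV-cache element type from params.memory_f16 and forwards params.n_ctx instead of the hard-coded 1024. The standalone sketch below only mirrors that selection logic for illustration; the enum and struct names in it are simplified stand-ins, not the real ggml_type / gpt_params definitions from llama.cpp.

#include <cstdio>

// Simplified stand-ins for ggml_type and gpt_params (illustration only).
enum sketch_type { SKETCH_TYPE_F32, SKETCH_TYPE_F16 };

struct sketch_params {
    bool memory_f16 = false;   // mirrors the assumed gpt_params::memory_f16 flag
    int  n_ctx      = 512;     // mirrors the assumed gpt_params::n_ctx field
};

int main() {
    sketch_params params;
    params.memory_f16 = true;
    params.n_ctx      = 2048;

    // Same selection the patched llama_init_from_params() performs before
    // calling llama_model_load(): use an f16 KV cache when memory_f16 is set.
    const sketch_type memory_type =
        params.memory_f16 ? SKETCH_TYPE_F16 : SKETCH_TYPE_F32;

    std::printf("n_ctx = %d, kv cache type = %s\n",
                params.n_ctx,
                memory_type == SKETCH_TYPE_F16 ? "f16" : "f32");
    return 0;
}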