From 69dd1e859ae6991383e5c0950fc485525d3cf213 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Thu, 2 Jan 2025 21:57:46 +0200
Subject: [PATCH] llama : quant (cont)

ggml-ci
---
 src/llama-adapter.cpp | 17 ++++++++++++++++-
 src/llama-adapter.h   |  2 --
 src/llama-quant.cpp   | 20 +++++++++++++++++++-
 src/llama-quant.h     |  6 ------
 src/llama.cpp         | 37 ++-----------------------------------
 5 files changed, 37 insertions(+), 45 deletions(-)

diff --git a/src/llama-adapter.cpp b/src/llama-adapter.cpp
index 31dac843d..9fd7edea3 100644
--- a/src/llama-adapter.cpp
+++ b/src/llama-adapter.cpp
@@ -149,7 +149,7 @@ void llama_lora_adapter_free(struct llama_lora_adapter * adapter) {
     delete adapter;
 }
 
-void llama_lora_adapter_init_impl(struct llama_model & model, const char * path_lora, struct llama_lora_adapter & adapter) {
+static void llama_lora_adapter_init_impl(struct llama_model & model, const char * path_lora, struct llama_lora_adapter & adapter) {
     LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora);
 
     ggml_context * ctx_init;
@@ -317,3 +317,18 @@ void llama_lora_adapter_init_impl(struct llama_model & model, const char * path_
 
     LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2);
 }
+
+struct llama_lora_adapter * llama_lora_adapter_init(struct llama_model * model, const char * path_lora) {
+    struct llama_lora_adapter * adapter = new llama_lora_adapter();
+
+    try {
+        llama_lora_adapter_init_impl(*model, path_lora, *adapter);
+        return adapter;
+    } catch (const std::exception & err) {
+        LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
+
+        delete adapter;
+    }
+
+    return nullptr;
+}
diff --git a/src/llama-adapter.h b/src/llama-adapter.h
index 24f067db7..5f1870cc8 100644
--- a/src/llama-adapter.h
+++ b/src/llama-adapter.h
@@ -64,5 +64,3 @@ struct llama_lora_adapter {
 
     llama_lora_weight * get_weight(struct ggml_tensor * w);
 };
-
-void llama_lora_adapter_init_impl(struct llama_model & model, const char * path_lora, struct llama_lora_adapter & adapter);
diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 49aae4c22..42974f8f1 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -464,7 +464,7 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa
     return new_size;
 }
 
-void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
+static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
     ggml_type default_type;
     llama_ftype ftype = params->ftype;
 
@@ -892,6 +892,10 @@ void llama_model_quantize_internal(const std::string & fname_inp, const std::str
     }
 }
 
+//
+// interface implementation
+//
+
 struct llama_model_quantize_params llama_model_quantize_default_params() {
     struct llama_model_quantize_params result = {
         /*.nthread =*/ 0,
@@ -909,3 +913,17 @@
 
     return result;
 }
+
+uint32_t llama_model_quantize(
+        const char * fname_inp,
+        const char * fname_out,
+        const llama_model_quantize_params * params) {
+    try {
+        llama_model_quantize_internal(fname_inp, fname_out, params);
+    } catch (const std::exception & err) {
+        LLAMA_LOG_ERROR("%s: failed to quantize: %s\n", __func__, err.what());
+        return 1;
+    }
+
+    return 0;
+}
diff --git a/src/llama-quant.h b/src/llama-quant.h
index 7c4f37ad0..6f70f09be 100644
--- a/src/llama-quant.h
+++ b/src/llama-quant.h
@@ -1,7 +1 @@
 #pragma once
-
-#include <string>
-
-struct llama_model_quantize_params;
-
-void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params);
diff --git a/src/llama.cpp b/src/llama.cpp
index 85b504589..d7110b90b 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -41,17 +41,13 @@
 #endif
 
 //
-// helpers
+// tensor loading (TODO: add llama_tesor_loader?)
 //
 
 static int llama_get_device_count(const llama_model & model) {
     return (int) model.devices.size();
 }
 
-//
-// model loading and saving
-//
-
 // checks if the weight tensor can be used with the specified buffer type and device
 static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w, ggml_op op, ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev) {
     GGML_ASSERT(w != nullptr);
@@ -11319,21 +11315,6 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) {
     }
 }
 
-struct llama_lora_adapter * llama_lora_adapter_init(struct llama_model * model, const char * path_lora) {
-    struct llama_lora_adapter * adapter = new llama_lora_adapter();
-
-    try {
-        llama_lora_adapter_init_impl(*model, path_lora, *adapter);
-        return adapter;
-    } catch (const std::exception & err) {
-        LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
-
-        delete adapter;
-    }
-
-    return nullptr;
-}
-
 int32_t llama_lora_adapter_set(
         struct llama_context * ctx,
         struct llama_lora_adapter * adapter,
@@ -11585,6 +11566,7 @@ struct llama_model * llama_load_model_from_file(
         } else if (status == -2) {
             LLAMA_LOG_INFO("%s: cancelled model load\n", __func__);
         }
+
         llama_free_model(model);
         return nullptr;
     }
@@ -11943,20 +11925,6 @@ struct llama_context * llama_new_context_with_model(
     return ctx;
 }
 
-uint32_t llama_model_quantize(
-        const char * fname_inp,
-        const char * fname_out,
-        const llama_model_quantize_params * params) {
-    try {
-        llama_model_quantize_internal(fname_inp, fname_out, params);
-    } catch (const std::exception & err) {
-        LLAMA_LOG_ERROR("%s: failed to quantize: %s\n", __func__, err.what());
-        return 1;
-    }
-
-    return 0;
-}
-
 //
 // kv cache
 //
@@ -12343,4 +12311,3 @@ void llama_perf_context_reset(struct llama_context * ctx) {
     ctx->t_eval_us = ctx->n_eval = 0;
     ctx->t_p_eval_us = ctx->n_p_eval = 0;
 }
-
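
Not part of the patch: a minimal caller-side sketch of the two public entry points this change relocates next to their implementations (llama_model_quantize now defined in src/llama-quant.cpp, llama_lora_adapter_init in src/llama-adapter.cpp). The file names are placeholders, and the error checks simply mirror the return conventions visible in the hunks above (1 on a caught exception, nullptr on a failed adapter load).

// usage-sketch.cpp -- illustrative only, not included in the commit
#include "llama.h"
#include <cstdio>

int main() {
    // quantize an F16 GGUF to Q4_K_M via the wrapper that now lives in llama-quant.cpp
    llama_model_quantize_params qparams = llama_model_quantize_default_params();
    qparams.ftype = LLAMA_FTYPE_MOSTLY_Q4_K_M;   // target quantization type (example choice)
    if (llama_model_quantize("model-f16.gguf", "model-q4_k_m.gguf", &qparams) != 0) {
        fprintf(stderr, "quantization failed\n"); // wrapper returns 1 when the internal impl throws
        return 1;
    }

    // load the quantized model, then attach a LoRA adapter via the wrapper in llama-adapter.cpp
    llama_model * model = llama_load_model_from_file("model-q4_k_m.gguf", llama_model_default_params());
    if (model == nullptr) {
        return 1;
    }

    llama_lora_adapter * adapter = llama_lora_adapter_init(model, "adapter.gguf");
    if (adapter == nullptr) {                     // init returns nullptr if loading the adapter throws
        llama_free_model(model);
        return 1;
    }

    // ... create a context and call llama_lora_adapter_set(ctx, adapter, scale) to apply it ...

    llama_free_model(model);                      // adapters are released together with the model
    return 0;
}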