From 69dd1e859ae6991383e5c0950fc485525d3cf213 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Thu, 2 Jan 2025 21:57:46 +0200
Subject: [PATCH] llama : quant (cont)

ggml-ci
---
 src/llama-adapter.cpp | 17 ++++++++++++++++-
 src/llama-adapter.h   |  2 --
 src/llama-quant.cpp   | 20 +++++++++++++++++++-
 src/llama-quant.h     |  6 ------
 src/llama.cpp         | 37 ++-----------------------------------
 5 files changed, 37 insertions(+), 45 deletions(-)

diff --git a/src/llama-adapter.cpp b/src/llama-adapter.cpp
index 31dac843d..9fd7edea3 100644
--- a/src/llama-adapter.cpp
+++ b/src/llama-adapter.cpp
@@ -149,7 +149,7 @@ void llama_lora_adapter_free(struct llama_lora_adapter * adapter) {
     delete adapter;
 }
 
-void llama_lora_adapter_init_impl(struct llama_model & model, const char * path_lora, struct llama_lora_adapter & adapter) {
+static void llama_lora_adapter_init_impl(struct llama_model & model, const char * path_lora, struct llama_lora_adapter & adapter) {
     LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora);
 
     ggml_context * ctx_init;
@@ -317,3 +317,18 @@ void llama_lora_adapter_init_impl(struct llama_model & model, const char * path_
 
     LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2);
 }
+
+struct llama_lora_adapter * llama_lora_adapter_init(struct llama_model * model, const char * path_lora) {
+    struct llama_lora_adapter * adapter = new llama_lora_adapter();
+
+    try {
+        llama_lora_adapter_init_impl(*model, path_lora, *adapter);
+        return adapter;
+    } catch (const std::exception & err) {
+        LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
+
+        delete adapter;
+    }
+
+    return nullptr;
+}
diff --git a/src/llama-adapter.h b/src/llama-adapter.h
index 24f067db7..5f1870cc8 100644
--- a/src/llama-adapter.h
+++ b/src/llama-adapter.h
@@ -64,5 +64,3 @@ struct llama_lora_adapter {
 
     llama_lora_weight * get_weight(struct ggml_tensor * w);
 };
-
-void llama_lora_adapter_init_impl(struct llama_model & model, const char * path_lora, struct llama_lora_adapter & adapter);
diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 49aae4c22..42974f8f1 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -464,7 +464,7 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa
     return new_size;
 }
 
-void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
+static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
     ggml_type default_type;
     llama_ftype ftype = params->ftype;
 
@@ -892,6 +892,10 @@ void llama_model_quantize_internal(const std::string & fname_inp, const std::str
     }
 }
 
+//
+// interface implementation
+//
+
 struct llama_model_quantize_params llama_model_quantize_default_params() {
     struct llama_model_quantize_params result = {
         /*.nthread =*/ 0,
@@ -909,3 +913,17 @@
 
     return result;
 }
+
+uint32_t llama_model_quantize(
+        const char * fname_inp,
+        const char * fname_out,
+        const llama_model_quantize_params * params) {
+    try {
+        llama_model_quantize_internal(fname_inp, fname_out, params);
+    } catch (const std::exception & err) {
+        LLAMA_LOG_ERROR("%s: failed to quantize: %s\n", __func__, err.what());
+        return 1;
+    }
+
+    return 0;
+}
diff --git a/src/llama-quant.h b/src/llama-quant.h
index 7c4f37ad0..6f70f09be 100644
--- a/src/llama-quant.h
+++ b/src/llama-quant.h
@@ -1,7 +1 @@
 #pragma once
-
-#include <string>
-
-struct llama_model_quantize_params;
-
-void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params);
diff --git a/src/llama.cpp b/src/llama.cpp
index 85b504589..d7110b90b 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -41,17 +41,13 @@
 #endif
 
 //
-// helpers
+// tensor loading (TODO: add llama_tesor_loader?)
 //
 
 static int llama_get_device_count(const llama_model & model) {
     return (int) model.devices.size();
 }
 
-//
-// model loading and saving
-//
-
 // checks if the weight tensor can be used with the specified buffer type and device
 static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w, ggml_op op, ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev) {
     GGML_ASSERT(w != nullptr);
@@ -11319,21 +11315,6 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) {
     }
 }
 
-struct llama_lora_adapter * llama_lora_adapter_init(struct llama_model * model, const char * path_lora) {
-    struct llama_lora_adapter * adapter = new llama_lora_adapter();
-
-    try {
-        llama_lora_adapter_init_impl(*model, path_lora, *adapter);
-        return adapter;
-    } catch (const std::exception & err) {
-        LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
-
-        delete adapter;
-    }
-
-    return nullptr;
-}
-
 int32_t llama_lora_adapter_set(
         struct llama_context * ctx,
         struct llama_lora_adapter * adapter,
@@ -11585,6 +11566,7 @@ struct llama_model * llama_load_model_from_file(
         } else if (status == -2) {
             LLAMA_LOG_INFO("%s: cancelled model load\n", __func__);
         }
+
         llama_free_model(model);
         return nullptr;
     }
@@ -11943,20 +11925,6 @@ struct llama_context * llama_new_context_with_model(
     return ctx;
 }
 
-uint32_t llama_model_quantize(
-        const char * fname_inp,
-        const char * fname_out,
-        const llama_model_quantize_params * params) {
-    try {
-        llama_model_quantize_internal(fname_inp, fname_out, params);
-    } catch (const std::exception & err) {
-        LLAMA_LOG_ERROR("%s: failed to quantize: %s\n", __func__, err.what());
-        return 1;
-    }
-
-    return 0;
-}
-
 //
 // kv cache
 //
@@ -12343,4 +12311,3 @@ void llama_perf_context_reset(struct llama_context * ctx) {
     ctx->t_eval_us = ctx->n_eval = 0;
     ctx->t_p_eval_us = ctx->n_p_eval = 0;
 }
-
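
Not part of the patch: a minimal caller-side sketch of the two public entry points this change relocates next to their implementations (llama_model_quantize now defined in src/llama-quant.cpp, llama_lora_adapter_init in src/llama-adapter.cpp). The file names are placeholders, and the error checks simply mirror the return conventions visible in the hunks above (1 on a caught exception, nullptr on a failed adapter load).

// usage-sketch.cpp -- illustrative only, not included in the commit
#include "llama.h"
#include <cstdio>

int main() {
    // quantize an F16 GGUF to Q4_K_M via the wrapper that now lives in llama-quant.cpp
    llama_model_quantize_params qparams = llama_model_quantize_default_params();
    qparams.ftype = LLAMA_FTYPE_MOSTLY_Q4_K_M;   // target quantization type (example choice)
    if (llama_model_quantize("model-f16.gguf", "model-q4_k_m.gguf", &qparams) != 0) {
        fprintf(stderr, "quantization failed\n"); // wrapper returns 1 when the internal impl throws
        return 1;
    }

    // load the quantized model, then attach a LoRA adapter via the wrapper in llama-adapter.cpp
    llama_model * model = llama_load_model_from_file("model-q4_k_m.gguf", llama_model_default_params());
    if (model == nullptr) {
        return 1;
    }

    llama_lora_adapter * adapter = llama_lora_adapter_init(model, "adapter.gguf");
    if (adapter == nullptr) {                     // init returns nullptr if loading the adapter throws
        llama_free_model(model);
        return 1;
    }

    // ... create a context and call llama_lora_adapter_set(ctx, adapter, scale) to apply it ...

    llama_free_model(model);                      // adapters are released together with the model
    return 0;
}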