File load progress reporting

2023-03-22 13:13:29 -04:00 · 2023-03-22 13:13:29 -04:00 · e47924fd4b
commit e47924fd4b
parent a18c19259a
3 changed files with 32 additions and 5 deletions
--- a/llama.cpp
+++ b/llama.cpp
@ -2,6 +2,8 @@

 #include "ggml.h"

+#include <sys/stat.h>
+
 #include <cinttypes>
 #include <fstream>
 #include <random>
@ -127,7 +129,8 @@ static bool llama_model_load(
        int n_ctx,
        int n_parts,
        ggml_type memory_type,
-        bool vocab_only) {
+        bool vocab_only,
+        llama_progress_handler progress) {
    fprintf(stderr, "%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());

    const int64_t t_start_us = ggml_time_us();
@ -394,6 +397,10 @@ static bool llama_model_load(

    std::vector<uint8_t> tmp;

+    if (progress.handler) {
+        progress.handler(0, progress.ctx);
+    }
+
    for (int i = 0; i < n_parts; ++i) {
        const int part_id = i;
        //const int part_id = n_parts - i - 1;
@ -409,6 +416,11 @@ static bool llama_model_load(
        fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size());
        fin.seekg(file_offset);

+        // stat the file for file size
+        struct stat st;
+        stat(fname_part.c_str(), &st);
+        const size_t file_size = st.st_size;
+
        // load weights
        {
            int n_tensors = 0;
@ -579,6 +591,10 @@ static bool llama_model_load(

                //fprintf(stderr, "%42s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);
                if (++n_tensors % 8 == 0) {
+                    if (progress.handler) {
+                        double current_progress = (double(i) + (double(fin.tellg()) / double(file_size))) / double(n_parts);
+                        progress.handler(current_progress, progress.ctx);
+                    }
                    fprintf(stderr, ".");
                    fflush(stderr);
                }
@ -596,6 +612,10 @@ static bool llama_model_load(

    lctx.t_load_us = ggml_time_us() - t_start_us;

+    if (progress.handler) {
+        progress.handler(1, progress.ctx);
+    }
+
    return true;
 }

@ -1394,7 +1414,8 @@ bool llama_model_quantize_internal(const std::string & fname_inp, const std::str

 struct llama_context * llama_init_from_file(
                             const char * path_model,
-            struct llama_context_params   params) {
+            struct llama_context_params   params,
+                 llama_progress_handler   progress) {
    ggml_time_init();

    llama_context * ctx = new llama_context;
@ -1408,7 +1429,7 @@ struct llama_context * llama_init_from_file(

    ggml_type type_memory = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;

-    if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_parts, type_memory, params.vocab_only)) {
+    if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_parts, type_memory, params.vocab_only, progress)) {
        fprintf(stderr, "%s: failed to load model\n", __func__);
        delete ctx;
        return nullptr;
--- a/llama.h
+++ b/llama.h
@ -55,6 +55,11 @@ extern "C" {
        bool vocab_only; // only load the vocabulary, no weights
    };

+    struct llama_progress_handler {
+        void (*handler)(double progress, void *ctx);
+        void *ctx;
+    };
+
    LLAMA_API struct llama_context_params llama_context_default_params();

    // Various functions for loading a ggml llama model.
@ -62,7 +67,8 @@ extern "C" {
    // Return NULL on failure
    LLAMA_API struct llama_context * llama_init_from_file(
                             const char * path_model,
-            struct llama_context_params   params);
+            struct llama_context_params   params,
+          struct llama_progress_handler   progress);

    // Frees all allocated memory
    LLAMA_API void llama_free(struct llama_context * ctx);
--- a/tests/test-tokenizer-0.cpp
+++ b/tests/test-tokenizer-0.cpp
@ -32,7 +32,7 @@ int main(int argc, char **argv) {

        lparams.vocab_only = true;

-        ctx = llama_init_from_file(fname.c_str(), lparams);
+        ctx = llama_init_from_file(fname.c_str(), lparams, {NULL, NULL});

        if (ctx == NULL) {
            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());