diff --git a/llama.cpp b/llama.cpp
index 7de3c19c8..81b2e9e82 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2,6 +2,8 @@
 
 #include "ggml.h"
 
+#include <sys/stat.h>
+
 #include
 #include
 #include
@@ -127,7 +129,8 @@ static bool llama_model_load(
         int n_ctx,
         int n_parts,
         ggml_type memory_type,
-        bool vocab_only) {
+        bool vocab_only,
+        llama_progress_handler progress) {
     fprintf(stderr, "%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
 
     const int64_t t_start_us = ggml_time_us();
@@ -394,6 +397,10 @@ static bool llama_model_load(
 
     std::vector tmp;
 
+    if (progress.handler) {
+        progress.handler(0, progress.ctx);
+    }
+
     for (int i = 0; i < n_parts; ++i) {
         const int part_id = i;
         //const int part_id = n_parts - i - 1;
@@ -409,6 +416,11 @@
         fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size());
         fin.seekg(file_offset);
 
+        // stat the file for file size
+        struct stat st;
+        stat(fname_part.c_str(), &st);
+        const size_t file_size = st.st_size;
+
         // load weights
         {
             int n_tensors = 0;
@@ -579,6 +591,10 @@ static bool llama_model_load(
 
                 //fprintf(stderr, "%42s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);
                 if (++n_tensors % 8 == 0) {
+                    if (progress.handler) {
+                        double current_progress = (double(i) + (double(fin.tellg()) / double(file_size))) / double(n_parts);
+                        progress.handler(current_progress, progress.ctx);
+                    }
                     fprintf(stderr, ".");
                     fflush(stderr);
                 }
@@ -596,6 +612,10 @@ static bool llama_model_load(
 
     lctx.t_load_us = ggml_time_us() - t_start_us;
 
+    if (progress.handler) {
+        progress.handler(1, progress.ctx);
+    }
+
     return true;
 }
 
@@ -1394,7 +1414,8 @@ bool llama_model_quantize_internal(const std::string & fname_inp, const std::str
 
 struct llama_context * llama_init_from_file(
         const char * path_model,
-        struct llama_context_params params) {
+        struct llama_context_params params,
+        llama_progress_handler progress) {
     ggml_time_init();
 
     llama_context * ctx = new llama_context;
@@ -1408,7 +1429,7 @@ struct llama_context * llama_init_from_file(
 
     ggml_type type_memory = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
 
-    if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_parts, type_memory, params.vocab_only)) {
+    if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_parts, type_memory, params.vocab_only, progress)) {
         fprintf(stderr, "%s: failed to load model\n", __func__);
         delete ctx;
         return nullptr;
diff --git a/llama.h b/llama.h
index 3df9ed1fd..e8bae5181 100644
--- a/llama.h
+++ b/llama.h
@@ -55,6 +55,11 @@ extern "C" {
         bool vocab_only; // only load the vocabulary, no weights
     };
 
+    struct llama_progress_handler {
+        void (*handler)(double progress, void *ctx);
+        void *ctx;
+    };
+
     LLAMA_API struct llama_context_params llama_context_default_params();
 
     // Various functions for loading a ggml llama model.
@@ -62,7 +67,8 @@ extern "C" {
     // Return NULL on failure
     LLAMA_API struct llama_context * llama_init_from_file(
             const char * path_model,
-            struct llama_context_params params);
+            struct llama_context_params params,
+            struct llama_progress_handler progress);
 
     // Frees all allocated memory
     LLAMA_API void llama_free(struct llama_context * ctx);
diff --git a/tests/test-tokenizer-0.cpp b/tests/test-tokenizer-0.cpp
index 49bc232b6..96e4d0cef 100644
--- a/tests/test-tokenizer-0.cpp
+++ b/tests/test-tokenizer-0.cpp
@@ -32,7 +32,7 @@ int main(int argc, char **argv) {
 
     lparams.vocab_only = true;
 
-    ctx = llama_init_from_file(fname.c_str(), lparams);
+    ctx = llama_init_from_file(fname.c_str(), lparams, {NULL, NULL});
 
     if (ctx == NULL) {
         fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
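For reference, here is a minimal caller sketch (not part of the patch) showing how the `llama_progress_handler` introduced above might be used. The model path is a placeholder, and passing `{NULL, NULL}` disables progress reporting, as the updated test-tokenizer-0.cpp does.

```c
#include <stdio.h>

#include "llama.h"

// Print a simple percentage to stderr; progress runs from 0 to 1.
static void on_progress(double progress, void * ctx) {
    (void) ctx; // user data slot, unused in this sketch
    fprintf(stderr, "\rloading: %3.0f%%", progress * 100.0);
}

int main(void) {
    struct llama_context_params lparams = llama_context_default_params();
    struct llama_progress_handler progress = { on_progress, NULL };

    // "models/7B/ggml-model-q4_0.bin" is a placeholder path.
    struct llama_context * ctx = llama_init_from_file("models/7B/ggml-model-q4_0.bin", lparams, progress);
    fprintf(stderr, "\n");

    if (ctx == NULL) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    llama_free(ctx);
    return 0;
}
```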