From bd35cb0ae357185c173345f10dc89a4ff925fc25 Mon Sep 17 00:00:00 2001 From: "Gilad S." <7817232+giladgd@users.noreply.github.com> Date: Fri, 13 Sep 2024 04:54:49 +0300 Subject: [PATCH 01/51] feat: remove a sampler from a chain (#9445) * feat: remove a sampler from a chain * fix: return removed sampler * fix: safer casting --- include/llama.h | 3 +++ src/llama-sampling.cpp | 15 ++++++++++++++- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/include/llama.h b/include/llama.h index 405af912c..744ef9d90 100644 --- a/include/llama.h +++ b/include/llama.h @@ -1056,6 +1056,9 @@ extern "C" { LLAMA_API struct llama_sampler * llama_sampler_chain_get(const struct llama_sampler * chain, int32_t i); LLAMA_API int llama_sampler_chain_n (const struct llama_sampler * chain); + // after removing a sampler, the chain will no longer own it, and it will not be freed when the chain is freed + LLAMA_API struct llama_sampler * llama_sampler_chain_remove( struct llama_sampler * chain, int32_t i); + // available samplers: LLAMA_API struct llama_sampler * llama_sampler_init_greedy (void); diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index fd1b7f919..c828dc359 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -349,13 +349,26 @@ void llama_sampler_chain_add(struct llama_sampler * chain, struct llama_sampler struct llama_sampler * llama_sampler_chain_get(const struct llama_sampler * chain, int32_t i) { const auto * p = (const llama_sampler_chain *) chain->ctx; - if (i < 0 || i >= (int32_t) p->samplers.size()) { + if (i < 0 || (size_t) i >= p->samplers.size()) { return nullptr; } return p->samplers[i]; } +struct llama_sampler * llama_sampler_chain_remove(struct llama_sampler * chain, int32_t i) { + auto * p = (llama_sampler_chain *) chain->ctx; + + if (i < 0 || (size_t) i >= p->samplers.size()) { + return nullptr; + } + + auto * result = p->samplers[i]; + p->samplers.erase(p->samplers.begin() + i); + + return result; +} + int llama_sampler_chain_n(const struct llama_sampler * chain) { const auto * p = (const llama_sampler_chain *) chain->ctx; From 0abc6a2c25272d5cf01384dda8ee8bfec4ba8745 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 13 Sep 2024 09:53:38 +0300 Subject: [PATCH 02/51] llama : llama_perf + option to disable timings during decode (#9355) * llama : llama_perf + option to disable timings during decode ggml-ci * common : add llama_arg * Update src/llama.cpp Co-authored-by: Xuan Son Nguyen * perf : separate functions in the API ggml-ci * perf : safer pointer handling + naming update ggml-ci * minor : better local var name * perf : abort on invalid sampler pointer ggml-ci --------- Co-authored-by: Xuan Son Nguyen --- common/arg.cpp | 8 ++ common/common.cpp | 3 +- common/common.h | 2 + common/sampling.cpp | 6 +- examples/batched-bench/batched-bench.cpp | 2 +- examples/batched.swift/Sources/main.swift | 4 +- examples/batched/batched.cpp | 4 +- examples/embedding/embedding.cpp | 2 +- examples/eval-callback/eval-callback.cpp | 2 +- examples/imatrix/imatrix.cpp | 2 +- examples/llama-bench/llama-bench.cpp | 2 +- examples/llava/llava-cli.cpp | 4 +- examples/llava/minicpmv-cli.cpp | 2 +- examples/lookup/lookup.cpp | 3 +- examples/parallel/parallel.cpp | 2 +- examples/passkey/passkey.cpp | 2 +- examples/perplexity/perplexity.cpp | 2 +- examples/retrieval/retrieval.cpp | 2 +- examples/simple/simple.cpp | 4 +- examples/speculative/speculative.cpp | 2 +- include/llama.h | 29 ++++-- src/llama-sampling.cpp | 34 +++++++ src/llama.cpp | 103 +++++++++------------- 23 
files changed, 135 insertions(+), 91 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index ce6a27614..a1cd5830f 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -720,6 +720,14 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, params.prompt = value; } )); + add_opt(llama_arg( + {"--no-perf"}, + format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"), + [](gpt_params & params) { + params.no_perf = true; + params.sparams.no_perf = true; + } + ).set_env("LLAMA_ARG_NO_PERF")); add_opt(llama_arg( {"-f", "--file"}, "FNAME", "a file containing the prompt (default: none)", diff --git a/common/common.cpp b/common/common.cpp index c492ae0cc..f9a831ec7 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -820,7 +820,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) { } llama_kv_cache_clear(lctx); llama_synchronize(lctx); - llama_perf_reset(lctx, LLAMA_PERF_TYPE_CONTEXT); + llama_perf_context_reset(lctx); } iparams.model = model; @@ -916,6 +916,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param cparams.cb_eval_user_data = params.cb_eval_user_data; cparams.offload_kqv = !params.no_kv_offload; cparams.flash_attn = params.flash_attn; + cparams.no_perf = params.no_perf; cparams.type_k = kv_cache_type_from_str(params.cache_type_k); cparams.type_v = kv_cache_type_from_str(params.cache_type_v); diff --git a/common/common.h b/common/common.h index 23babdd09..e8025aeef 100644 --- a/common/common.h +++ b/common/common.h @@ -124,6 +124,7 @@ struct gpt_sampler_params { float mirostat_eta = 0.10f; // learning rate bool penalize_nl = false; // consider newlines as a repeatable token bool ignore_eos = false; + bool no_perf = false; // disable performance metrics std::vector samplers = { GPT_SAMPLER_TYPE_TOP_K, @@ -246,6 +247,7 @@ struct gpt_params { bool simple_io = false; // improves compatibility with subprocesses and limited consoles bool cont_batching = true; // insert new sequences for decoding on-the-fly bool flash_attn = false; // flash attention + bool no_perf = false; // disable performance metrics bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix bool logits_all = false; // return logits for all tokens in the batch diff --git a/common/sampling.cpp b/common/sampling.cpp index 4498feb11..c07b5e940 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -142,7 +142,7 @@ std::string gpt_sampler_params::print() const { struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const struct gpt_sampler_params & params) { llama_sampler_chain_params lparams = llama_sampler_chain_default_params(); - lparams.no_perf = false; // TODO: control via params + lparams.no_perf = params.no_perf; auto * result = new gpt_sampler { /* .params = */ params, @@ -257,10 +257,10 @@ void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler * // TODO: measure grammar performance if (gsmpl) { - llama_perf_print(gsmpl->chain, LLAMA_PERF_TYPE_SAMPLER_CHAIN); + llama_perf_sampler_print(gsmpl->chain); } if (ctx) { - llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); + llama_perf_context_print(ctx); } } diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp index 89a4566c4..ec00fcf78 100644 --- a/examples/batched-bench/batched-bench.cpp +++ b/examples/batched-bench/batched-bench.cpp @@ -187,7 +187,7 @@ int main(int argc, char ** argv) { } LOG_TEE("\n"); - 
llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); + llama_perf_context_print(ctx); llama_batch_free(batch); diff --git a/examples/batched.swift/Sources/main.swift b/examples/batched.swift/Sources/main.swift index 9f7c49492..10f2e7fd1 100644 --- a/examples/batched.swift/Sources/main.swift +++ b/examples/batched.swift/Sources/main.swift @@ -200,8 +200,8 @@ let t_main_end = ggml_time_us() print("decoded \(n_decode) tokens in \(String(format: "%.2f", Double(t_main_end - t_main_start) / 1_000_000.0)) s, speed: \(String(format: "%.2f", Double(n_decode) / (Double(t_main_end - t_main_start) / 1_000_000.0))) t/s\n\n") -llama_perf_print(UnsafeRawPointer(context), LLAMA_PERF_TYPE_CONTEXT) -llama_perf_print(UnsafeRawPointer(smpl), LLAMA_PERF_TYPE_SAMPLER_CHAIN) +llama_perf_sampler_print(smpl) +llama_perf_context_print(context) private func tokenize(text: String, add_bos: Bool) -> [llama_token] { let utf8Count = text.utf8.count diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp index 5d32153fe..f1df20c6e 100644 --- a/examples/batched/batched.cpp +++ b/examples/batched/batched.cpp @@ -229,8 +229,8 @@ int main(int argc, char ** argv) { __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f)); LOG_TEE("\n"); - llama_perf_print(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN); - llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); + llama_perf_sampler_print(smpl); + llama_perf_context_print(ctx); fprintf(stderr, "\n"); diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index db00c6363..e94ae2955 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -306,7 +306,7 @@ int main(int argc, char ** argv) { } LOG_TEE("\n"); - llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); + llama_perf_context_print(ctx); // clean up llama_batch_free(batch); diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp index bc7203143..af389abe1 100644 --- a/examples/eval-callback/eval-callback.cpp +++ b/examples/eval-callback/eval-callback.cpp @@ -182,7 +182,7 @@ int main(int argc, char ** argv) { } LOG_TEE("\n"); - llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); + llama_perf_context_print(ctx); llama_free(ctx); llama_free_model(model); diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp index 032a90136..73b54da7f 100644 --- a/examples/imatrix/imatrix.cpp +++ b/examples/imatrix/imatrix.cpp @@ -637,7 +637,7 @@ int main(int argc, char ** argv) { g_collector.save_imatrix(); LOG_TEE("\n"); - llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); + llama_perf_context_print(ctx); llama_free(ctx); llama_free_model(model); diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index d7db5af72..2d90f65a0 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -1630,7 +1630,7 @@ int main(int argc, char ** argv) { fflush(p_err->fout); } - llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); + llama_perf_context_print(ctx); llama_free(ctx); diff --git a/examples/llava/llava-cli.cpp b/examples/llava/llava-cli.cpp index e9108a9bd..12fe7345f 100644 --- a/examples/llava/llava-cli.cpp +++ b/examples/llava/llava-cli.cpp @@ -308,7 +308,7 @@ int main(int argc, char ** argv) { // process the prompt process_prompt(ctx_llava, image_embed, ¶ms, params.prompt); - llama_perf_print(ctx_llava->ctx_llama, LLAMA_PERF_TYPE_CONTEXT); + llama_perf_context_print(ctx_llava->ctx_llama); llava_image_embed_free(image_embed); 
ctx_llava->model = NULL; llava_free(ctx_llava); @@ -325,7 +325,7 @@ int main(int argc, char ** argv) { // process the prompt process_prompt(ctx_llava, image_embed, ¶ms, params.prompt); - llama_perf_print(ctx_llava->ctx_llama, LLAMA_PERF_TYPE_CONTEXT); + llama_perf_context_print(ctx_llava->ctx_llama); llava_image_embed_free(image_embed); ctx_llava->model = NULL; llava_free(ctx_llava); diff --git a/examples/llava/minicpmv-cli.cpp b/examples/llava/minicpmv-cli.cpp index afc74d279..3ac455e69 100644 --- a/examples/llava/minicpmv-cli.cpp +++ b/examples/llava/minicpmv-cli.cpp @@ -319,7 +319,7 @@ int main(int argc, char ** argv) { } } printf("\n"); - llama_perf_print(ctx_llava->ctx_llama, LLAMA_PERF_TYPE_CONTEXT); + llama_perf_context_print(ctx_llava->ctx_llama); ctx_llava->model = NULL; llava_free(ctx_llava); diff --git a/examples/lookup/lookup.cpp b/examples/lookup/lookup.cpp index fff44a499..be6f8d7d7 100644 --- a/examples/lookup/lookup.cpp +++ b/examples/lookup/lookup.cpp @@ -240,8 +240,7 @@ int main(int argc, char ** argv){ LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted); LOG_TEE("\ntarget:\n\n"); - llama_perf_print(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN); - llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); + gpt_perf_print(ctx, smpl); gpt_sampler_free(smpl); diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp index bc6301311..758393c3d 100644 --- a/examples/parallel/parallel.cpp +++ b/examples/parallel/parallel.cpp @@ -415,7 +415,7 @@ int main(int argc, char ** argv) { LOG_TEE("\n"); // TODO: print sampling/grammar timings for all clients - llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); + llama_perf_context_print(ctx); llama_batch_free(batch); diff --git a/examples/passkey/passkey.cpp b/examples/passkey/passkey.cpp index d3d5ab46f..52aa68bfc 100644 --- a/examples/passkey/passkey.cpp +++ b/examples/passkey/passkey.cpp @@ -256,7 +256,7 @@ int main(int argc, char ** argv) { __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f)); LOG_TEE("\n"); - llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); + llama_perf_context_print(ctx); fprintf(stderr, "\n"); diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index 04df65b0a..29ff86bbc 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -2047,7 +2047,7 @@ int main(int argc, char ** argv) { } LOG_TEE("\n"); - llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); + llama_perf_context_print(ctx); write_logfile(ctx, params, model, results); llama_free(ctx); diff --git a/examples/retrieval/retrieval.cpp b/examples/retrieval/retrieval.cpp index 7a360b731..d08679edb 100644 --- a/examples/retrieval/retrieval.cpp +++ b/examples/retrieval/retrieval.cpp @@ -292,7 +292,7 @@ int main(int argc, char ** argv) { } LOG_TEE("\n"); - llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); + llama_perf_context_print(ctx); // clean up llama_batch_free(query_batch); diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp index 3fdc04394..0c923d4ed 100644 --- a/examples/simple/simple.cpp +++ b/examples/simple/simple.cpp @@ -154,8 +154,8 @@ int main(int argc, char ** argv) { __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f)); LOG_TEE("\n"); - llama_perf_print(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN); - llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); + llama_perf_sampler_print(smpl); + llama_perf_context_print(ctx); fprintf(stderr, "\n"); diff --git 
a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index 214e4932b..843579acd 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -616,7 +616,7 @@ int main(int argc, char ** argv) { LOG_TEE("\ndraft:\n\n"); // TODO: print sampling/grammar timings for all drafts - llama_perf_print(ctx_dft, LLAMA_PERF_TYPE_CONTEXT); + llama_perf_context_print(ctx_dft); LOG_TEE("\ntarget:\n\n"); gpt_perf_print(ctx_tgt, smpl); diff --git a/include/llama.h b/include/llama.h index 744ef9d90..cfc8d85dc 100644 --- a/include/llama.h +++ b/include/llama.h @@ -343,7 +343,7 @@ extern "C" { bool embeddings; // if true, extract embeddings (together with logits) bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU bool flash_attn; // whether to use flash attention [EXPERIMENTAL] - //bool no_perf; // whether to measure performance timings, TODO: implement + bool no_perf; // whether to measure performance timings // Abort callback // if it returns true, execution of llama_decode() will be aborted @@ -1176,13 +1176,30 @@ extern "C" { // NOTE: Used by llama.cpp examples, avoid using in third-party apps. Instead, do your own performance measurements. // - enum llama_perf_type { - LLAMA_PERF_TYPE_CONTEXT = 0, - LLAMA_PERF_TYPE_SAMPLER_CHAIN = 1, + struct llama_perf_context_data { + double t_start_ms; + double t_load_ms; + double t_p_eval_ms; + double t_eval_ms; + + int32_t n_p_eval; + int32_t n_eval; }; - LLAMA_API void llama_perf_print(const void * ctx, enum llama_perf_type type); - LLAMA_API void llama_perf_reset( void * ctx, enum llama_perf_type type); + struct llama_perf_sampler_data { + double t_sample_ms; + + int32_t n_sample; + }; + + LLAMA_API struct llama_perf_context_data llama_perf_context (const struct llama_context * ctx); + LLAMA_API void llama_perf_context_print(const struct llama_context * ctx); + LLAMA_API void llama_perf_context_reset( struct llama_context * ctx); + + // NOTE: the following work only with samplers constructed via llama_sampler_chain_init + LLAMA_API struct llama_perf_sampler_data llama_perf_sampler (const struct llama_sampler * chain); + LLAMA_API void llama_perf_sampler_print(const struct llama_sampler * chain); + LLAMA_API void llama_perf_sampler_reset( struct llama_sampler * chain); LLAMA_API void llama_perf_dump_yaml(FILE * stream, const struct llama_context * ctx); diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index c828dc359..5275b1d60 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -1669,3 +1669,37 @@ uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl) { return LLAMA_DEFAULT_SEED; } + +// perf + +struct llama_perf_sampler_data llama_perf_sampler(const struct llama_sampler * chain) { + struct llama_perf_sampler_data data = {}; + + if (chain == nullptr || chain->iface != &llama_sampler_chain_i) { + GGML_ABORT("%s: invalid sampler passed - requires a sampler created with llama_sampler_chain_init()\n", __func__); + } + + const auto * ctx = (const struct llama_sampler_chain *) chain->ctx; + + data.t_sample_ms = 1e-3 * ctx->t_sample_us; + data.n_sample = std::max(0, ctx->n_sample); + + return data; +} + +void llama_perf_sampler_print(const struct llama_sampler * chain) { + const auto data = llama_perf_sampler(chain); + + LLAMA_LOG_INFO("%s: sampling time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", + __func__, data.t_sample_ms, data.n_sample, data.t_sample_ms / data.n_sample, 1e3 / data.t_sample_ms * 
data.n_sample); +} + +void llama_perf_sampler_reset(struct llama_sampler * chain) { + if (chain == nullptr || chain->iface != &llama_sampler_chain_i) { + GGML_ABORT("%s: invalid sampler passed - requires a sampler created with llama_sampler_chain_init()\n", __func__); + } + + auto * ctx = (struct llama_sampler_chain *) chain->ctx; + + ctx->t_sample_us = ctx->n_sample = 0; +} diff --git a/src/llama.cpp b/src/llama.cpp index cdc3f1856..65afcc84a 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -2486,6 +2486,7 @@ struct llama_cparams { bool causal_attn; bool offload_kqv; bool flash_attn; + bool no_perf; enum llama_pooling_type pooling_type; @@ -6661,8 +6662,6 @@ static bool llm_load_tensors( bool use_mlock, llama_progress_callback progress_callback, void * progress_callback_user_data) { - model.t_start_us = ggml_time_us(); - auto & hparams = model.hparams; model.split_mode = split_mode; @@ -8593,14 +8592,13 @@ static bool llm_load_tensors( } } - // loading time will be recalculate after the first eval, so - // we take page faults deferred by mmap() into consideration - model.t_load_us = ggml_time_us() - model.t_start_us; return true; } // Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) { + model.t_start_us = ggml_time_us(); + try { llama_model_loader ml(fname, params.use_mmap, params.check_tensors, params.kv_overrides); @@ -8662,6 +8660,10 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam return -1; } + // loading time will be recalculate after the first eval, so + // we take page faults deferred by mmap() into consideration + model.t_load_us = ggml_time_us() - model.t_start_us; + return 0; } @@ -17949,6 +17951,7 @@ struct llama_context_params llama_context_default_params() { /*.embeddings =*/ false, /*.offload_kqv =*/ true, /*.flash_attn =*/ false, + /*.no_perf =*/ true, /*.abort_callback =*/ nullptr, /*.abort_callback_data =*/ nullptr, }; @@ -18159,6 +18162,7 @@ struct llama_context * llama_new_context_with_model( cparams.embeddings = params.embeddings; cparams.offload_kqv = params.offload_kqv; cparams.flash_attn = params.flash_attn; + cparams.no_perf = params.no_perf; cparams.pooling_type = params.pooling_type; cparams.n_ctx = params.n_ctx == 0 ? 
hparams.n_ctx_train : params.n_ctx; @@ -20077,10 +20081,14 @@ void llama_synchronize(struct llama_context * ctx) { // add the evaluation to the stats if (ctx->n_queued_tokens == 1) { - ctx->t_eval_us += ggml_time_us() - ctx->t_compute_start_us; + if (!ctx->cparams.no_perf) { + ctx->t_eval_us += ggml_time_us() - ctx->t_compute_start_us; + } ctx->n_eval++; } else if (ctx->n_queued_tokens > 1) { - ctx->t_p_eval_us += ggml_time_us() - ctx->t_compute_start_us; + if (!ctx->cparams.no_perf) { + ctx->t_p_eval_us += ggml_time_us() - ctx->t_compute_start_us; + } ctx->n_p_eval += ctx->n_queued_tokens; } @@ -20688,65 +20696,40 @@ const char * llama_print_system_info(void) { return s.c_str(); } -void llama_perf_print(const void * ctx, enum llama_perf_type type) { - switch (type) { - case LLAMA_PERF_TYPE_CONTEXT: - { - const auto * p = (const struct llama_context *) ctx; +struct llama_perf_context_data llama_perf_context(const struct llama_context * ctx) { + struct llama_perf_context_data data = {}; - const double t_start_ms = 1e-3 * p->t_start_us; - const double t_end_ms = 1.00 * ggml_time_ms(); - const double t_load_ms = 1e-3 * p->t_load_us; - const double t_p_eval_ms = 1e-3 * p->t_p_eval_us; - const double t_eval_ms = 1e-3 * p->t_eval_us; - - const int32_t n_p_eval = std::max(0, p->n_p_eval); - const int32_t n_eval = std::max(1, p->n_eval); - - LLAMA_LOG_INFO("%s: load time = %10.2f ms\n", __func__, t_load_ms); - LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n", - __func__, t_p_eval_ms, n_p_eval, t_p_eval_ms / n_p_eval, 1e3 / t_p_eval_ms * n_p_eval); - LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", - __func__, t_eval_ms, n_eval, t_eval_ms / n_eval, 1e3 / t_eval_ms * n_eval); - LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - t_start_ms), (n_p_eval + n_eval)); - } break; - case LLAMA_PERF_TYPE_SAMPLER_CHAIN: - { - const auto * smpl = (const struct llama_sampler *) ctx; - const auto * p = (const struct llama_sampler_chain *) smpl->ctx; - - const double t_sampler_ms = 1e-3 * p->t_sample_us; - - const int32_t n_sampler = std::max(0, p->n_sample); - - LLAMA_LOG_INFO("%s: sampling time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", - __func__, t_sampler_ms, n_sampler, t_sampler_ms / n_sampler, 1e3 / t_sampler_ms * n_sampler); - } break; - default: - GGML_ABORT("invalid perf type"); + if (ctx == nullptr) { + return data; } + + data.t_start_ms = 1e-3 * ctx->t_start_us; + data.t_load_ms = 1e-3 * ctx->t_load_us; + data.t_p_eval_ms = 1e-3 * ctx->t_p_eval_us; + data.t_eval_ms = 1e-3 * ctx->t_eval_us; + data.n_p_eval = std::max(1, ctx->n_p_eval); + data.n_eval = std::max(1, ctx->n_eval); + + return data; } -void llama_perf_reset(void * ctx, enum llama_perf_type type) { - switch (type) { - case LLAMA_PERF_TYPE_CONTEXT: - { - auto * p = (struct llama_context *) ctx; +void llama_perf_context_print(const struct llama_context * ctx) { + const auto data = llama_perf_context(ctx); - p->t_start_us = ggml_time_us(); - p->t_eval_us = p->n_eval = 0; - p->t_p_eval_us = p->n_p_eval = 0; - } break; - case LLAMA_PERF_TYPE_SAMPLER_CHAIN: - { - auto * smpl = (struct llama_sampler *) ctx; - auto * p = (struct llama_sampler_chain *) smpl->ctx; + const double t_end_ms = 1e-3 * ggml_time_us(); - p->t_sample_us = p->n_sample = 0; - } break; - default: - GGML_ABORT("invalid perf type"); - } + LLAMA_LOG_INFO("%s: load time = %10.2f ms\n", __func__, 
data.t_load_ms); + LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n", + __func__, data.t_p_eval_ms, data.n_p_eval, data.t_p_eval_ms / data.n_p_eval, 1e3 / data.t_p_eval_ms * data.n_p_eval); + LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", + __func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval); + LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval)); +} + +void llama_perf_context_reset(struct llama_context * ctx) { + ctx->t_start_us = ggml_time_us(); + ctx->t_eval_us = ctx->n_eval = 0; + ctx->t_p_eval_us = ctx->n_p_eval = 0; } void llama_perf_dump_yaml(FILE * stream, const llama_context * ctx) { From feff4aa8461da7c432d144c11da4802e41fef3cf Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Fri, 13 Sep 2024 14:23:11 +0200 Subject: [PATCH 03/51] server : add loading html page while model is loading (#9468) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Adding loading page for '/' server requests * set content when model is loading * removed loading html file * updated cmakelist * updated makefile * cleaned up whitespace * cleanup for PR removed error * updated server test to handle 503 HTML * updated server test to handle 503 HTML * ca†ch 503 before parsing json * revert test * account for both api and web browser requests * precommit corrections * eol fix * revert changes to pre-commit * removed print statement * made loading message more descriptive * also support .html files --------- Co-authored-by: VJHack Co-authored-by: Vinesh Janarthanan <36610342+VJHack@users.noreply.github.com> --- Makefile | 1 + examples/server/CMakeLists.txt | 1 + examples/server/public/loading.html | 12 ++++++++++++ examples/server/server.cpp | 11 +++++++++-- 4 files changed, 23 insertions(+), 2 deletions(-) create mode 100644 examples/server/public/loading.html diff --git a/Makefile b/Makefile index 8d3fd3ee8..f41887a4d 100644 --- a/Makefile +++ b/Makefile @@ -1440,6 +1440,7 @@ llama-server: \ examples/server/system-prompts.js.hpp \ examples/server/prompt-formats.js.hpp \ examples/server/json-schema-to-grammar.mjs.hpp \ + examples/server/loading.html.hpp \ common/json.hpp \ common/stb_image.h \ $(OBJ_ALL) diff --git a/examples/server/CMakeLists.txt b/examples/server/CMakeLists.txt index dbe41f1fd..580f3a824 100644 --- a/examples/server/CMakeLists.txt +++ b/examples/server/CMakeLists.txt @@ -30,6 +30,7 @@ set(PUBLIC_ASSETS system-prompts.js prompt-formats.js json-schema-to-grammar.mjs + loading.html ) foreach(asset ${PUBLIC_ASSETS}) diff --git a/examples/server/public/loading.html b/examples/server/public/loading.html new file mode 100644 index 000000000..c3fd19a0f --- /dev/null +++ b/examples/server/public/loading.html @@ -0,0 +1,12 @@ + + + + + + +
+ The model is loading. Please wait.
+ The user interface will appear soon. +
+ + diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 5e4dffadf..73cd6aae7 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -28,6 +28,7 @@ #include "system-prompts.js.hpp" #include "prompt-formats.js.hpp" #include "json-schema-to-grammar.mjs.hpp" +#include "loading.html.hpp" #include #include @@ -2592,10 +2593,16 @@ int main(int argc, char ** argv) { return false; }; - auto middleware_server_state = [&res_error, &state](const httplib::Request &, httplib::Response & res) { + auto middleware_server_state = [&res_error, &state](const httplib::Request & req, httplib::Response & res) { server_state current_state = state.load(); if (current_state == SERVER_STATE_LOADING_MODEL) { - res_error(res, format_error_response("Loading model", ERROR_TYPE_UNAVAILABLE)); + auto tmp = string_split(req.path, '.'); + if (req.path == "/" || tmp.back() == "html") { + res.set_content(reinterpret_cast(loading_html), loading_html_len, "text/html; charset=utf-8"); + res.status = 503; + } else { + res_error(res, format_error_response("Loading model", ERROR_TYPE_UNAVAILABLE)); + } return false; } return true; From befaf1197fa447f61714de041828852a270659d2 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Sat, 14 Sep 2024 09:50:12 +0200 Subject: [PATCH 04/51] llama : make cell_id const in inp_s_mask block (#9470) This commit makes the cell_id variable const in the inp_s_mask block. The motivation for this change is consistency with the code in the inp_s_copy block. --- src/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama.cpp b/src/llama.cpp index 65afcc84a..1986a90fb 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -15826,7 +15826,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) { // clear unused states for (int i = 0; i < n_kv; ++i) { - uint32_t cell_id = i + kv_self.head; + const uint32_t cell_id = i + kv_self.head; llama_kv_cell & kv_cell = lctx.kv_self.cells[cell_id]; data[i] = (float) (kv_cell.src >= 0); From 1f4111e540bacec8d00ca9fd96417bf4c1339394 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 14 Sep 2024 10:55:05 +0300 Subject: [PATCH 05/51] cmake : use list(APPEND ...) instead of set() + dedup linker (#9463) * cmake : use list(APPEND ...) 
instead of set() + dedup linker ggml-ci * cmake : try fix sycl * cmake : try to fix sycl 2 * cmake : fix sycl build (#9469) * try fix sycl build * use CMAKE_CXX_FLAGS as a string variable --------- Co-authored-by: Georgi Gerganov * one more CMAKE_CXX_FLAGS fix (#9471) --------- Co-authored-by: Michael Podvitskiy --- ggml/src/CMakeLists.txt | 77 ++++++++++++++++++++++++----------------- 1 file changed, 45 insertions(+), 32 deletions(-) diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index cd2dcd066..506b6dc7b 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -26,6 +26,8 @@ if (NOT MSVC) endif() endif() +unset(GGML_EXTRA_LIBS) + if (APPLE AND GGML_ACCELERATE) find_library(ACCELERATE_FRAMEWORK Accelerate) if (ACCELERATE_FRAMEWORK) @@ -35,7 +37,7 @@ if (APPLE AND GGML_ACCELERATE) add_compile_definitions(ACCELERATE_NEW_LAPACK) add_compile_definitions(ACCELERATE_LAPACK_ILP64) - set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} ${ACCELERATE_FRAMEWORK}) + list(APPEND GGML_EXTRA_LIBS ${ACCELERATE_FRAMEWORK}) else() message(WARNING "Accelerate framework not found") endif() @@ -87,7 +89,7 @@ if (GGML_METAL) COMMENT "Generate assembly for embedded Metal library" ) - set(GGML_SOURCES_METAL ${GGML_SOURCES_METAL} ${METALLIB_EMBED_ASM}) + list(APPEND GGML_SOURCES_METAL ${METALLIB_EMBED_ASM}) else() if (GGML_METAL_SHADER_DEBUG) # custom command to do the following: @@ -132,7 +134,7 @@ if (GGML_METAL) ) endif() # GGML_METAL_EMBED_LIBRARY - set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} + list(APPEND GGML_EXTRA_LIBS ${FOUNDATION_LIBRARY} ${METAL_FRAMEWORK} ${METALKIT_FRAMEWORK} @@ -157,11 +159,11 @@ if (GGML_OPENMP) add_compile_definitions(GGML_USE_OPENMP) - set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} OpenMP::OpenMP_C OpenMP::OpenMP_CXX) + list(APPEND GGML_EXTRA_LIBS OpenMP::OpenMP_C OpenMP::OpenMP_CXX) if (GGML_MUSA) - set(GGML_EXTRA_INCLUDES ${GGML_EXTRA_INCLUDES} "/usr/lib/llvm-10/include/openmp") - set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} "/usr/lib/llvm-10/lib/libomp.so") + list(APPEND GGML_EXTRA_INCLUDES "/usr/lib/llvm-10/include/openmp") + list(APPEND GGML_EXTRA_LIBS "/usr/lib/llvm-10/lib/libomp.so") endif() else() message(WARNING "OpenMP not found") @@ -244,8 +246,8 @@ if (GGML_BLAS) set(GGML_HEADERS_BLAS ../include/ggml-blas.h) set(GGML_SOURCES_BLAS ggml-blas.cpp) - set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} ${BLAS_LIBRARIES}) - set(GGML_EXTRA_INCLUDES ${GGML_EXTRA_INCLUDES} ${BLAS_INCLUDE_DIRS}) + list(APPEND GGML_EXTRA_LIBS ${BLAS_LIBRARIES}) + list(APPEND GGML_EXTRA_INCLUDES ${BLAS_INCLUDE_DIRS}) else() message(WARNING "BLAS not found, please refer to " "https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors" @@ -368,19 +370,19 @@ if (GGML_CUDA) if (GGML_STATIC) if (WIN32) # As of 12.3.1 CUDA Toolkit for Windows does not offer a static cublas library - set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas CUDA::cublasLt) + list(APPEND GGML_EXTRA_LIBS CUDA::cudart_static CUDA::cublas CUDA::cublasLt) else () if (GGML_MUSA) - set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} MUSA::musart_static MUSA::mublas_static) + list(APPEND GGML_EXTRA_LIBS MUSA::musart_static MUSA::mublas_static) else() - set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static) + list(APPEND GGML_EXTRA_LIBS CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static) endif() endif() else() if (GGML_MUSA) - set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} MUSA::musart MUSA::mublas) + list(APPEND GGML_EXTRA_LIBS MUSA::musart MUSA::mublas) else() - 
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt) + list(APPEND GGML_EXTRA_LIBS CUDA::cudart CUDA::cublas CUDA::cublasLt) endif() endif() @@ -388,9 +390,9 @@ if (GGML_CUDA) # No VMM requested, no need to link directly with the cuda driver lib (libcuda.so) else() if (GGML_MUSA) - set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} MUSA::musa_driver) # required by muDeviceGetAttribute(), muMemGetAllocationGranularity(...), ... + list(APPEND GGML_EXTRA_LIBS MUSA::musa_driver) # required by muDeviceGetAttribute(), muMemGetAllocationGranularity(...), ... else() - set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cuda_driver) # required by cuDeviceGetAttribute(), cuMemGetAllocationGranularity(...), ... + list(APPEND GGML_EXTRA_LIBS CUDA::cuda_driver) # required by cuDeviceGetAttribute(), cuMemGetAllocationGranularity(...), ... endif() endif() else() @@ -495,7 +497,7 @@ if (GGML_HIPBLAS) if (CXX_IS_HIPCC) set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE CXX) - set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} hip::device) + list(APPEND GGML_EXTRA_LIBS hip::device) else() set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE HIP) endif() @@ -504,7 +506,8 @@ if (GGML_HIPBLAS) message(FATAL_ERROR "Static linking not supported for HIP/ROCm") endif() - set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} PUBLIC hip::host roc::rocblas roc::hipblas) + # TODO: this "PUBLIC" here seems wrong + list(APPEND GGML_EXTRA_LIBS PUBLIC hip::host roc::rocblas roc::hipblas) endif() if (GGML_SYCL) @@ -513,7 +516,8 @@ if (GGML_SYCL) endif() check_cxx_compiler_flag("-fsycl" SUPPORTS_SYCL) - if ( DEFINED ENV{ONEAPI_ROOT}) + + if (DEFINED ENV{ONEAPI_ROOT}) message(STATUS "Using oneAPI Release SYCL compiler (icpx).") elseif(SUPPORTS_SYCL) message(WARNING "Using open-source SYCL compiler (clang++). Didn't detect ENV {ONEAPI_ROOT}. 
@@ -551,21 +555,27 @@ if (GGML_SYCL) find_package(DNNL) message("-- DNNL found:" ${DNNL_FOUND}) + if (GGML_SYCL_TARGET STREQUAL "INTEL") add_compile_definitions(GGML_SYCL_DNNL=${DNNL_FOUND}) else() add_compile_definitions(GGML_SYCL_DNNL=0) endif() + + if (${DNNL_FOUND} AND GGML_SYCL_TARGET STREQUAL "INTEL") + list(APPEND GGML_EXTRA_LIBS DNNL::dnnl) + endif() + if (WIN32) find_package(IntelSYCL REQUIRED) find_package(MKL REQUIRED) - set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL) + list(APPEND GGML_EXTRA_LIBS IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL) else() if (GGML_SYCL_TARGET STREQUAL "INTEL") - set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} -fsycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread) + list(APPEND GGML_EXTRA_LIBS OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread) elseif (GGML_SYCL_TARGET STREQUAL "NVIDIA") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda") - set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} -fsycl pthread m dl onemkl) + list(APPEND GGML_EXTRA_LIBS pthread m dl onemkl) endif() endif() if (${DNNL_FOUND} AND GGML_SYCL_TARGET STREQUAL "INTEL") @@ -579,7 +589,7 @@ if (GGML_RPC) list(APPEND GGML_CDEF_PUBLIC GGML_USE_RPC) if (WIN32) - set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} ws2_32) + list(APPEND GGML_EXTRA_LIBS ws2_32) endif() set(GGML_HEADERS_RPC ../include/ggml-rpc.h) @@ -657,8 +667,8 @@ if (GGML_VULKAN) set(GGML_HEADERS_VULKAN ${CMAKE_CURRENT_SOURCE_DIR}/../include/ggml-vulkan.h ${_ggml_vk_header}) set(GGML_SOURCES_VULKAN ggml-vulkan.cpp ${_ggml_vk_source}) - set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} Vulkan::Vulkan) - set(GGML_EXTRA_INCLUDES ${GGML_EXTRA_INCLUDES} ${CMAKE_CURRENT_BINARY_DIR}) + list(APPEND GGML_EXTRA_LIBS Vulkan::Vulkan) + list(APPEND GGML_EXTRA_INCLUDES ${CMAKE_CURRENT_BINARY_DIR}) else() message(WARNING "Vulkan not found") endif() @@ -817,8 +827,8 @@ if (GGML_KOMPUTE) list(APPEND GGML_CDEF_PUBLIC GGML_USE_KOMPUTE) - set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} kompute) - set(GGML_EXTRA_INCLUDES ${GGML_EXTRA_INCLUDES} ${CMAKE_CURRENT_BINARY_DIR}) + list(APPEND GGML_EXTRA_LIBS kompute) + list(APPEND GGML_EXTRA_INCLUDES ${CMAKE_CURRENT_BINARY_DIR}) else() message(WARNING "Kompute not found") endif() @@ -883,9 +893,10 @@ if (GGML_CANN) message(STATUS "CANN: CANN_INCLUDE_DIRS = ${CANN_INCLUDE_DIRS}") message(STATUS "CANN: CANN_LIBRARIES = ${CANN_LIBRARIES}") - set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} ${CANN_LIBRARIES} ) - set(GGML_EXTRA_INCLUDES ${GGML_EXTRA_INCLUDES} ${CANN_INCLUDE_DIRS}) - set(GGML_EXTRA_LIBDIRS ${GGML_EXTRA_LIBDIRS} ${CANN_INSTALL_DIR}/lib64) + list(APPEND GGML_EXTRA_LIBS ${CANN_LIBRARIES} ) + list(APPEND GGML_EXTRA_INCLUDES ${CANN_INCLUDE_DIRS}) + list(APPEND GGML_EXTRA_LIBDIRS ${CANN_INSTALL_DIR}/lib64) + list(APPEND GGML_CDEF_PUBLIC GGML_USE_CANN) endif() else() @@ -1322,12 +1333,14 @@ if (EMSCRIPTEN) set_target_properties(ggml PROPERTIES COMPILE_FLAGS "-msimd128") endif() -target_compile_definitions(ggml PUBLIC ${GGML_CDEF_PUBLIC}) -target_include_directories(ggml PUBLIC ../include) +target_compile_definitions(ggml PUBLIC ${GGML_CDEF_PUBLIC}) +target_include_directories(ggml PUBLIC ../include) target_include_directories(ggml PRIVATE . 
${GGML_EXTRA_INCLUDES}) -target_link_directories(ggml PRIVATE ${GGML_EXTRA_LIBDIRS}) +target_link_directories (ggml PRIVATE ${GGML_EXTRA_LIBDIRS}) target_compile_features (ggml PRIVATE c_std_11) # don't bump +list(REMOVE_DUPLICATES GGML_EXTRA_LIBS) + target_link_libraries(ggml PRIVATE Threads::Threads ${GGML_EXTRA_LIBS}) find_library(MATH_LIBRARY m) From dcdcee3a744f39714503ee2b19c49b7c7b6209c9 Mon Sep 17 00:00:00 2001 From: VoidIsVoid <343750470@qq.com> Date: Sat, 14 Sep 2024 17:36:44 +0800 Subject: [PATCH 06/51] server: add data: [DONE] to /chat/completions stream response (#9459) --- examples/server/server.cpp | 2 ++ examples/server/tests/features/steps/steps.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 73cd6aae7..14c4af3d9 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2993,6 +2993,8 @@ int main(int argc, char ** argv) { }, [&](json error_data) { server_sent_event(sink, "error", error_data); }); + static const std::string ev_done = "data: [DONE]\n\n"; + sink.write(ev_done.data(), ev_done.size()); sink.done(); return true; }; diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py index 11587dd64..0f4249b13 100644 --- a/examples/server/tests/features/steps/steps.py +++ b/examples/server/tests/features/steps/steps.py @@ -1020,6 +1020,8 @@ async def oai_chat_completions(user_prompt, event_data = line.split(': ', 1) assert event_data[0] == 'data', f'Bad event code received: ```{event_data}```' chunk_raw = event_data[1] + if chunk_raw == '[DONE]': + break chunk = json.loads(chunk_raw) assert len(chunk['choices']) == 1, f"no choices provided, line ```{line}```" From 822b6322dea704110797a5671fc80ae39ee6ac97 Mon Sep 17 00:00:00 2001 From: Yuri Khrustalev Date: Sat, 14 Sep 2024 05:54:37 -0400 Subject: [PATCH 07/51] ggml : ggml_type_name return "NONE" for invalid values (#9458) When running on Windows, the quantization utility attempts to print the types that are not set which leads to a crash. --- ggml/src/ggml.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 493ff7fc0..490c8d602 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -3399,7 +3399,7 @@ double ggml_type_sizef(enum ggml_type type) { } GGML_CALL const char * ggml_type_name(enum ggml_type type) { - return type_traits[type].type_name; + return type < GGML_TYPE_COUNT ? 
type_traits[type].type_name : "NONE"; } GGML_CALL bool ggml_is_quantized(enum ggml_type type) { From 7596487bebd58eade3cd0133d42a9008aaaf9d09 Mon Sep 17 00:00:00 2001 From: Michael Podvitskiy Date: Sun, 15 Sep 2024 09:06:38 +0200 Subject: [PATCH 08/51] cmake : try to fix sycl+intel build (#9487) --- ggml/src/CMakeLists.txt | 77 +++++++++++++++++++++-------------------- 1 file changed, 39 insertions(+), 38 deletions(-) diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index 506b6dc7b..b25440769 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -26,7 +26,8 @@ if (NOT MSVC) endif() endif() -unset(GGML_EXTRA_LIBS) +unset(GGML_EXTRA_LIBS_PRIVATE) +unset(GGML_EXTRA_LIBS_PUBLIC) if (APPLE AND GGML_ACCELERATE) find_library(ACCELERATE_FRAMEWORK Accelerate) @@ -37,7 +38,7 @@ if (APPLE AND GGML_ACCELERATE) add_compile_definitions(ACCELERATE_NEW_LAPACK) add_compile_definitions(ACCELERATE_LAPACK_ILP64) - list(APPEND GGML_EXTRA_LIBS ${ACCELERATE_FRAMEWORK}) + list(APPEND GGML_EXTRA_LIBS_PRIVATE ${ACCELERATE_FRAMEWORK}) else() message(WARNING "Accelerate framework not found") endif() @@ -89,7 +90,7 @@ if (GGML_METAL) COMMENT "Generate assembly for embedded Metal library" ) - list(APPEND GGML_SOURCES_METAL ${METALLIB_EMBED_ASM}) + list(APPEND GGML_SOURCES_METAL ${METALLIB_EMBED_ASM}) else() if (GGML_METAL_SHADER_DEBUG) # custom command to do the following: @@ -134,7 +135,7 @@ if (GGML_METAL) ) endif() # GGML_METAL_EMBED_LIBRARY - list(APPEND GGML_EXTRA_LIBS + list(APPEND GGML_EXTRA_LIBS_PRIVATE ${FOUNDATION_LIBRARY} ${METAL_FRAMEWORK} ${METALKIT_FRAMEWORK} @@ -159,11 +160,11 @@ if (GGML_OPENMP) add_compile_definitions(GGML_USE_OPENMP) - list(APPEND GGML_EXTRA_LIBS OpenMP::OpenMP_C OpenMP::OpenMP_CXX) + list(APPEND GGML_EXTRA_LIBS_PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX) if (GGML_MUSA) - list(APPEND GGML_EXTRA_INCLUDES "/usr/lib/llvm-10/include/openmp") - list(APPEND GGML_EXTRA_LIBS "/usr/lib/llvm-10/lib/libomp.so") + list(APPEND GGML_EXTRA_INCLUDES "/usr/lib/llvm-10/include/openmp") + list(APPEND GGML_EXTRA_LIBS_PRIVATE "/usr/lib/llvm-10/lib/libomp.so") endif() else() message(WARNING "OpenMP not found") @@ -246,8 +247,8 @@ if (GGML_BLAS) set(GGML_HEADERS_BLAS ../include/ggml-blas.h) set(GGML_SOURCES_BLAS ggml-blas.cpp) - list(APPEND GGML_EXTRA_LIBS ${BLAS_LIBRARIES}) - list(APPEND GGML_EXTRA_INCLUDES ${BLAS_INCLUDE_DIRS}) + list(APPEND GGML_EXTRA_LIBS_PRIVATE ${BLAS_LIBRARIES}) + list(APPEND GGML_EXTRA_INCLUDES ${BLAS_INCLUDE_DIRS}) else() message(WARNING "BLAS not found, please refer to " "https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors" @@ -370,19 +371,19 @@ if (GGML_CUDA) if (GGML_STATIC) if (WIN32) # As of 12.3.1 CUDA Toolkit for Windows does not offer a static cublas library - list(APPEND GGML_EXTRA_LIBS CUDA::cudart_static CUDA::cublas CUDA::cublasLt) + list(APPEND GGML_EXTRA_LIBS_PRIVATE CUDA::cudart_static CUDA::cublas CUDA::cublasLt) else () if (GGML_MUSA) - list(APPEND GGML_EXTRA_LIBS MUSA::musart_static MUSA::mublas_static) + list(APPEND GGML_EXTRA_LIBS_PRIVATE MUSA::musart_static MUSA::mublas_static) else() - list(APPEND GGML_EXTRA_LIBS CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static) + list(APPEND GGML_EXTRA_LIBS_PRIVATE CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static) endif() endif() else() if (GGML_MUSA) - list(APPEND GGML_EXTRA_LIBS MUSA::musart MUSA::mublas) + list(APPEND GGML_EXTRA_LIBS_PRIVATE MUSA::musart MUSA::mublas) else() - list(APPEND GGML_EXTRA_LIBS CUDA::cudart CUDA::cublas 
CUDA::cublasLt) + list(APPEND GGML_EXTRA_LIBS_PRIVATE CUDA::cudart CUDA::cublas CUDA::cublasLt) endif() endif() @@ -390,9 +391,9 @@ if (GGML_CUDA) # No VMM requested, no need to link directly with the cuda driver lib (libcuda.so) else() if (GGML_MUSA) - list(APPEND GGML_EXTRA_LIBS MUSA::musa_driver) # required by muDeviceGetAttribute(), muMemGetAllocationGranularity(...), ... + list(APPEND GGML_EXTRA_LIBS_PRIVATE MUSA::musa_driver) # required by muDeviceGetAttribute(), muMemGetAllocationGranularity(...), ... else() - list(APPEND GGML_EXTRA_LIBS CUDA::cuda_driver) # required by cuDeviceGetAttribute(), cuMemGetAllocationGranularity(...), ... + list(APPEND GGML_EXTRA_LIBS_PRIVATE CUDA::cuda_driver) # required by cuDeviceGetAttribute(), cuMemGetAllocationGranularity(...), ... endif() endif() else() @@ -497,7 +498,7 @@ if (GGML_HIPBLAS) if (CXX_IS_HIPCC) set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE CXX) - list(APPEND GGML_EXTRA_LIBS hip::device) + list(APPEND GGML_EXTRA_LIBS_PRIVATE hip::device) else() set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE HIP) endif() @@ -506,8 +507,7 @@ if (GGML_HIPBLAS) message(FATAL_ERROR "Static linking not supported for HIP/ROCm") endif() - # TODO: this "PUBLIC" here seems wrong - list(APPEND GGML_EXTRA_LIBS PUBLIC hip::host roc::rocblas roc::hipblas) + list(APPEND GGML_EXTRA_LIBS_PUBLIC hip::host roc::rocblas roc::hipblas) endif() if (GGML_SYCL) @@ -563,24 +563,23 @@ if (GGML_SYCL) endif() if (${DNNL_FOUND} AND GGML_SYCL_TARGET STREQUAL "INTEL") - list(APPEND GGML_EXTRA_LIBS DNNL::dnnl) + list(APPEND GGML_EXTRA_LIBS_PRIVATE DNNL::dnnl) endif() if (WIN32) find_package(IntelSYCL REQUIRED) find_package(MKL REQUIRED) - list(APPEND GGML_EXTRA_LIBS IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL) + list(APPEND GGML_EXTRA_LIBS_PRIVATE IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL) else() if (GGML_SYCL_TARGET STREQUAL "INTEL") - list(APPEND GGML_EXTRA_LIBS OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread) + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fsycl") + list(APPEND GGML_EXTRA_LIBS_PRIVATE OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread) elseif (GGML_SYCL_TARGET STREQUAL "NVIDIA") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda") - list(APPEND GGML_EXTRA_LIBS pthread m dl onemkl) + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fsycl") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda") + list(APPEND GGML_EXTRA_LIBS_PRIVATE pthread m dl onemkl) endif() endif() - if (${DNNL_FOUND} AND GGML_SYCL_TARGET STREQUAL "INTEL") - list(APPEND GGML_EXTRA_LIBS DNNL::dnnl) - endif() endif() if (GGML_RPC) @@ -589,7 +588,7 @@ if (GGML_RPC) list(APPEND GGML_CDEF_PUBLIC GGML_USE_RPC) if (WIN32) - list(APPEND GGML_EXTRA_LIBS ws2_32) + list(APPEND GGML_EXTRA_LIBS_PRIVATE ws2_32) endif() set(GGML_HEADERS_RPC ../include/ggml-rpc.h) @@ -667,8 +666,8 @@ if (GGML_VULKAN) set(GGML_HEADERS_VULKAN ${CMAKE_CURRENT_SOURCE_DIR}/../include/ggml-vulkan.h ${_ggml_vk_header}) set(GGML_SOURCES_VULKAN ggml-vulkan.cpp ${_ggml_vk_source}) - list(APPEND GGML_EXTRA_LIBS Vulkan::Vulkan) - list(APPEND GGML_EXTRA_INCLUDES ${CMAKE_CURRENT_BINARY_DIR}) + list(APPEND GGML_EXTRA_LIBS_PRIVATE Vulkan::Vulkan) + list(APPEND GGML_EXTRA_INCLUDES ${CMAKE_CURRENT_BINARY_DIR}) else() message(WARNING "Vulkan not found") endif() @@ -827,8 +826,8 @@ if (GGML_KOMPUTE) list(APPEND GGML_CDEF_PUBLIC GGML_USE_KOMPUTE) - list(APPEND GGML_EXTRA_LIBS kompute) - 
list(APPEND GGML_EXTRA_INCLUDES ${CMAKE_CURRENT_BINARY_DIR}) + list(APPEND GGML_EXTRA_LIBS_PRIVATE kompute) + list(APPEND GGML_EXTRA_INCLUDES ${CMAKE_CURRENT_BINARY_DIR}) else() message(WARNING "Kompute not found") endif() @@ -893,9 +892,9 @@ if (GGML_CANN) message(STATUS "CANN: CANN_INCLUDE_DIRS = ${CANN_INCLUDE_DIRS}") message(STATUS "CANN: CANN_LIBRARIES = ${CANN_LIBRARIES}") - list(APPEND GGML_EXTRA_LIBS ${CANN_LIBRARIES} ) - list(APPEND GGML_EXTRA_INCLUDES ${CANN_INCLUDE_DIRS}) - list(APPEND GGML_EXTRA_LIBDIRS ${CANN_INSTALL_DIR}/lib64) + list(APPEND GGML_EXTRA_LIBS_PRIVATE ${CANN_LIBRARIES} ) + list(APPEND GGML_EXTRA_INCLUDES ${CANN_INCLUDE_DIRS}) + list(APPEND GGML_EXTRA_LIBDIRS ${CANN_INSTALL_DIR}/lib64) list(APPEND GGML_CDEF_PUBLIC GGML_USE_CANN) endif() @@ -1339,9 +1338,7 @@ target_include_directories(ggml PRIVATE . ${GGML_EXTRA_INCLUDES}) target_link_directories (ggml PRIVATE ${GGML_EXTRA_LIBDIRS}) target_compile_features (ggml PRIVATE c_std_11) # don't bump -list(REMOVE_DUPLICATES GGML_EXTRA_LIBS) - -target_link_libraries(ggml PRIVATE Threads::Threads ${GGML_EXTRA_LIBS}) +list(APPEND GGML_EXTRA_LIBS_PRIVATE Threads::Threads) find_library(MATH_LIBRARY m) if (MATH_LIBRARY) @@ -1350,6 +1347,10 @@ if (MATH_LIBRARY) endif() endif() +list(REMOVE_DUPLICATES GGML_EXTRA_LIBS_PRIVATE) +list(REMOVE_DUPLICATES GGML_EXTRA_LIBS_PUBLIC) +target_link_libraries(ggml PRIVATE ${GGML_EXTRA_LIBS_PRIVATE} PUBLIC ${GGML_EXTRA_LIBS_PUBLIC}) + if (BUILD_SHARED_LIBS) set_target_properties(ggml PROPERTIES POSITION_INDEPENDENT_CODE ON) target_compile_definitions(ggml PRIVATE GGML_SHARED GGML_BUILD) From d6b37c881f056bd32b681dcd7658a37ea6ec3a1e Mon Sep 17 00:00:00 2001 From: OSecret <135510162+OLSecret@users.noreply.github.com> Date: Sun, 15 Sep 2024 10:36:53 +0300 Subject: [PATCH 09/51] readme : update tools list (#9475) * Added link to proprietary wrapper for Unity3d into README.md Wrapper has prebuild library and was tested on iOS, Android, WebGL, PC, Mac platforms, has online demos like [this](https://d23myu0xfn2ttc.cloudfront.net/rich/index.html) and [that](https://d23myu0xfn2ttc.cloudfront.net/). 
* Update README.md Fixes upon review --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 73041b1a2..9a10ead83 100644 --- a/README.md +++ b/README.md @@ -173,6 +173,7 @@ Unless otherwise noted these projects are open-source with permissive licensing: - [akx/ggify](https://github.com/akx/ggify) – download PyTorch models from HuggingFace Hub and convert them to GGML - [crashr/gppm](https://github.com/crashr/gppm) – launch llama.cpp instances utilizing NVIDIA Tesla P40 or P100 GPUs with reduced idle power consumption - [gpustack/gguf-parser](https://github.com/gpustack/gguf-parser-go/tree/main/cmd/gguf-parser) - review/check the GGUF file and estimate the memory usage +- [Styled Lines](https://marketplace.unity.com/packages/tools/generative-ai/styled-lines-llama-cpp-model-292902) (proprietary licensed, async wrapper of inference part for game development in Unity3d with prebuild Mobile and Web platform wrappers and a model example) **Infrastructure:** From 3c7989fd29a2db2b75e28fd708cc441febe99a82 Mon Sep 17 00:00:00 2001 From: Csaba Kecskemeti Date: Sun, 15 Sep 2024 00:48:25 -0700 Subject: [PATCH 10/51] py : add "LLaMAForCausalLM" conversion support (#9485) Co-authored-by: Csaba Kecskemeti --- convert_hf_to_gguf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 01a8a50a2..2c6d5d95b 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -1487,7 +1487,7 @@ class StableLMModel(Model): raise ValueError(f"Unprocessed norms: {norms}") -@Model.register("LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM") +@Model.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM") class LlamaModel(Model): model_arch = gguf.MODEL_ARCH.LLAMA From 6988da94a261444859f78595899212eeedc5ff9d Mon Sep 17 00:00:00 2001 From: Michael Podvitskiy Date: Sun, 15 Sep 2024 18:55:52 +0200 Subject: [PATCH 11/51] cmake : correct order of sycl flags (#9497) --- ggml/src/CMakeLists.txt | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index b25440769..11b877e19 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -572,12 +572,10 @@ if (GGML_SYCL) list(APPEND GGML_EXTRA_LIBS_PRIVATE IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL) else() if (GGML_SYCL_TARGET STREQUAL "INTEL") - set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fsycl") - list(APPEND GGML_EXTRA_LIBS_PRIVATE OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread) + list(APPEND GGML_EXTRA_LIBS_PRIVATE sycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread) elseif (GGML_SYCL_TARGET STREQUAL "NVIDIA") - set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fsycl") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda") - list(APPEND GGML_EXTRA_LIBS_PRIVATE pthread m dl onemkl) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda") + list(APPEND GGML_EXTRA_LIBS_PRIVATE sycl pthread m dl onemkl) endif() endif() endif() From e6deac31f7e62db43b6afbc3be814f764fd5a187 Mon Sep 17 00:00:00 2001 From: slaren Date: Sun, 15 Sep 2024 19:02:27 +0200 Subject: [PATCH 12/51] gguf-split : add basic checks (#9499) * gguf-split : do not overwrite existing files when merging * gguf-split : error when too many arguments are passed --- examples/gguf-split/gguf-split.cpp | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git 
a/examples/gguf-split/gguf-split.cpp b/examples/gguf-split/gguf-split.cpp index 881f0451c..82c239b83 100644 --- a/examples/gguf-split/gguf-split.cpp +++ b/examples/gguf-split/gguf-split.cpp @@ -152,7 +152,7 @@ static void split_params_parse_ex(int argc, const char ** argv, split_params & p throw std::invalid_argument("error: invalid parameter for argument: " + arg); } - if (argc - arg_idx < 2) { + if (argc - arg_idx != 2) { throw std::invalid_argument("error: bad arguments"); } @@ -389,10 +389,17 @@ static void gguf_merge(const split_params & split_params) { int n_split = 1; int total_tensors = 0; - auto * ctx_out = gguf_init_empty(); + // avoid overwriting existing output file + if (std::ifstream(split_params.output.c_str())) { + fprintf(stderr, "%s: output file %s already exists\n", __func__, split_params.output.c_str()); + exit(EXIT_FAILURE); + } + std::ofstream fout(split_params.output.c_str(), std::ios::binary); fout.exceptions(std::ofstream::failbit); // fail fast on write errors + auto * ctx_out = gguf_init_empty(); + std::vector read_data; std::vector ctx_metas; std::vector ctx_ggufs; From 6262d13e0b2da91f230129a93a996609a2f5a2f2 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 15 Sep 2024 20:46:12 +0300 Subject: [PATCH 13/51] common : reimplement logging (#9418) https://github.com/ggerganov/llama.cpp/pull/9418 --- .github/workflows/build.yml | 3 + .github/workflows/server.yml | 6 + Makefile | 38 +- ci/run.sh | 3 + common/CMakeLists.txt | 24 +- common/arg.cpp | 115 ++- common/common.cpp | 210 +++-- common/common.h | 13 +- common/log.cpp | 401 +++++++++ common/log.h | 786 ++---------------- common/ngram-cache.cpp | 3 + common/sampling.cpp | 2 +- common/train.cpp | 2 + examples/batched-bench/batched-bench.cpp | 35 +- examples/batched/batched.cpp | 49 +- .../convert-llama2c-to-ggml.cpp | 91 +- .../cvector-generator/cvector-generator.cpp | 9 +- examples/embedding/embedding.cpp | 107 +-- examples/eval-callback/eval-callback.cpp | 50 +- examples/export-lora/export-lora.cpp | 2 +- examples/gritlm/gritlm.cpp | 2 + examples/imatrix/imatrix.cpp | 101 ++- examples/infill/infill.cpp | 182 ++-- examples/llava/clip.cpp | 144 ++-- examples/llava/llava-cli.cpp | 67 +- examples/llava/llava.cpp | 58 +- examples/llava/minicpmv-cli.cpp | 86 +- examples/lookahead/lookahead.cpp | 55 +- examples/lookup/lookup-stats.cpp | 29 +- examples/lookup/lookup.cpp | 55 +- examples/main/main.cpp | 282 +++---- examples/parallel/parallel.cpp | 66 +- examples/passkey/passkey.cpp | 59 +- examples/perplexity/perplexity.cpp | 343 ++++---- examples/retrieval/retrieval.cpp | 70 +- examples/server/CMakeLists.txt | 7 +- examples/server/README.md | 1 - examples/server/bench/README.md | 1 - examples/server/bench/bench.py | 1 - examples/server/server.cpp | 572 +++++-------- examples/server/tests/.gitignore | 1 + examples/server/tests/README.md | 1 - examples/server/tests/features/steps/steps.py | 2 - examples/server/utils.hpp | 110 +-- examples/simple/simple.cpp | 43 +- examples/speculative/speculative.cpp | 107 ++- examples/tokenize/tokenize.cpp | 50 +- ggml/include/ggml.h | 9 +- ggml/src/ggml-metal.m | 7 +- src/llama-impl.h | 1 + src/llama.cpp | 8 +- tests/CMakeLists.txt | 1 + tests/test-arg-parser.cpp | 2 +- tests/test-log.cpp | 39 + 54 files changed, 2092 insertions(+), 2419 deletions(-) create mode 100644 common/log.cpp create mode 100644 examples/server/tests/.gitignore create mode 100644 tests/test-log.cpp diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 181ef37e2..1777489ec 100644 --- 
a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -23,6 +23,9 @@ env: BRANCH_NAME: ${{ github.head_ref || github.ref_name }} GGML_NLOOP: 3 GGML_N_THREADS: 1 + LLAMA_LOG_COLORS: 1 + LLAMA_LOG_PREFIX: 1 + LLAMA_LOG_TIMESTAMPS: 1 jobs: macOS-latest-cmake-arm64: diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml index 29f8fd444..699ac095d 100644 --- a/.github/workflows/server.yml +++ b/.github/workflows/server.yml @@ -20,6 +20,12 @@ on: types: [opened, synchronize, reopened] paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*'] +env: + LLAMA_LOG_COLORS: 1 + LLAMA_LOG_PREFIX: 1 + LLAMA_LOG_TIMESTAMPS: 1 + LLAMA_LOG_VERBOSITY: 10 + concurrency: group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }} cancel-in-progress: true diff --git a/Makefile b/Makefile index f41887a4d..cb5ff9f9d 100644 --- a/Makefile +++ b/Makefile @@ -54,6 +54,7 @@ TEST_TARGETS = \ tests/test-grammar-parser \ tests/test-json-schema-to-grammar \ tests/test-llama-grammar \ + tests/test-log \ tests/test-model-load-cancel \ tests/test-opt \ tests/test-quantize-fns \ @@ -148,6 +149,14 @@ GGML_NO_METAL := 1 DEPRECATE_WARNING := 1 endif +ifdef LLAMA_DISABLE_LOGS +REMOVE_WARNING := 1 +endif + +ifdef LLAMA_SERVER_VERBOSE +REMOVE_WARNING := 1 +endif + ifndef UNAME_S UNAME_S := $(shell uname -s) endif @@ -351,19 +360,11 @@ ifdef LLAMA_SANITIZE_UNDEFINED MK_LDFLAGS += -fsanitize=undefined -g endif -ifdef LLAMA_SERVER_VERBOSE - MK_CPPFLAGS += -DSERVER_VERBOSE=$(LLAMA_SERVER_VERBOSE) -endif - ifdef LLAMA_SERVER_SSL MK_CPPFLAGS += -DCPPHTTPLIB_OPENSSL_SUPPORT MK_LDFLAGS += -lssl -lcrypto endif -ifdef LLAMA_DISABLE_LOGS - MK_CPPFLAGS += -DLOG_DISABLE_LOGS -endif # LLAMA_DISABLE_LOGS - # warnings WARN_FLAGS = \ -Wall \ @@ -931,6 +932,7 @@ OBJ_LLAMA = \ OBJ_COMMON = \ common/common.o \ common/arg.o \ + common/log.o \ common/console.o \ common/ngram-cache.o \ common/sampling.o \ @@ -1027,6 +1029,14 @@ $(info - LLAMA_NO_CCACHE) $(info ) endif +ifdef REMOVE_WARNING +$(info !!! REMOVAL WARNING !!!) 
+$(info The following LLAMA_ options have been removed and are no longer supported) +$(info - LLAMA_DISABLE_LOGS (https://github.com/ggerganov/llama.cpp/pull/9418)) +$(info - LLAMA_SERVER_VERBOSE (https://github.com/ggerganov/llama.cpp/pull/9418)) +$(info ) +endif + # # Build libraries # @@ -1168,6 +1178,11 @@ common/arg.o: \ common/arg.h $(CXX) $(CXXFLAGS) -c $< -o $@ +common/log.o: \ + common/log.cpp \ + common/log.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + common/sampling.o: \ common/sampling.cpp \ common/sampling.h \ @@ -1346,7 +1361,7 @@ llama-cvector-generator: examples/cvector-generator/cvector-generator.cpp \ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) llama-convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp \ - $(OBJ_GGML) $(OBJ_LLAMA) + $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) @@ -1528,6 +1543,11 @@ tests/test-llama-grammar: tests/test-llama-grammar.cpp \ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) +tests/test-log: tests/test-log.cpp \ + $(OBJ_ALL) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + tests/test-grammar-parser: tests/test-grammar-parser.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) diff --git a/ci/run.sh b/ci/run.sh index 751bb0a02..1ac08ee4e 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -737,6 +737,9 @@ function gg_sum_embd_bge_small { ## main +export LLAMA_LOG_PREFIX=1 +export LLAMA_LOG_TIMESTAMPS=1 + if [ -z ${GG_BUILD_LOW_PERF} ]; then # Create symlink: ./llama.cpp/models-mnt -> $MNT/models/models-mnt rm -rf ${SRC}/models-mnt diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index 22fd99689..042e895ad 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -51,21 +51,23 @@ endif() set(TARGET common) add_library(${TARGET} STATIC - base64.hpp - common.h - common.cpp - arg.h arg.cpp - sampling.h - sampling.cpp - console.h + arg.h + base64.hpp + common.cpp + common.h console.cpp - json.hpp + console.h json-schema-to-grammar.cpp - train.h - train.cpp - ngram-cache.h + json.hpp + log.cpp + log.h ngram-cache.cpp + ngram-cache.h + sampling.cpp + sampling.h + train.cpp + train.h ) if (BUILD_SHARED_LIBS) diff --git a/common/arg.cpp b/common/arg.cpp index a1cd5830f..8fcb8c25f 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1,15 +1,17 @@ #include "arg.h" +#include "log.h" #include "sampling.h" #include -#include -#include -#include +#include +#include #include #include -#include -#include +#include +#include +#include +#include #include "json-schema-to-grammar.h" @@ -383,20 +385,6 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, exit(0); } )); - add_opt(llama_arg( - {"-v", "--verbose"}, - "print verbose information", - [](gpt_params & params) { - params.verbosity = 1; - } - )); - add_opt(llama_arg( - {"--verbosity"}, "N", - format("set specific verbosity level (default: %d)", params.verbosity), - [](gpt_params & params, int value) { - params.verbosity = value; - } - )); add_opt(llama_arg( {"--verbose-prompt"}, format("print a verbose prompt before generation (default: %s)", params.verbose_prompt ? 
"true" : "false"), @@ -417,7 +405,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, [](gpt_params & params) { params.use_color = true; } - ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL})); + ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP})); add_opt(llama_arg( {"-t", "--threads"}, "N", format("number of threads to use during generation (default: %d)", params.cpuparams.n_threads), @@ -876,7 +864,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, params.input_prefix = value; params.enable_chat_template = false; } - ).set_examples({LLAMA_EXAMPLE_MAIN})); + ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL})); add_opt(llama_arg( {"--in-suffix"}, "STRING", "string to suffix after user inputs with (default: empty)", @@ -884,7 +872,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, params.input_suffix = value; params.enable_chat_template = false; } - ).set_examples({LLAMA_EXAMPLE_MAIN})); + ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL})); add_opt(llama_arg( {"--no-warmup"}, "skip warming up the model with an empty run", @@ -1824,19 +1812,6 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, params.system_prompt = system_prompt; } ).set_examples({LLAMA_EXAMPLE_SERVER})); - add_opt(llama_arg( - {"--log-format"}, "{text, json}", - "log output format: json or text (default: json)", - [](gpt_params & params, const std::string & value) { - if (value == "json") { - params.log_json = true; - } else if (value == "text") { - params.log_json = false; - } else { - throw std::invalid_argument("invalid value"); - } - } - ).set_examples({LLAMA_EXAMPLE_SERVER})); add_opt(llama_arg( {"--metrics"}, format("enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled"), @@ -1956,39 +1931,57 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, else { std::invalid_argument("invalid value"); } } ).set_examples({LLAMA_EXAMPLE_BENCH})); -#ifndef LOG_DISABLE_LOGS - // TODO: make this looks less weird - add_opt(llama_arg( - {"--log-test"}, - "Log test", - [](gpt_params &) { log_param_single_parse("--log-test"); } - )); add_opt(llama_arg( {"--log-disable"}, "Log disable", - [](gpt_params &) { log_param_single_parse("--log-disable"); } - )); - add_opt(llama_arg( - {"--log-enable"}, - "Log enable", - [](gpt_params &) { log_param_single_parse("--log-enable"); } - )); - add_opt(llama_arg( - {"--log-new"}, - "Log new", - [](gpt_params &) { log_param_single_parse("--log-new"); } - )); - add_opt(llama_arg( - {"--log-append"}, - "Log append", - [](gpt_params &) { log_param_single_parse("--log-append"); } + [](gpt_params &) { + gpt_log_pause(gpt_log_main()); + } )); add_opt(llama_arg( {"--log-file"}, "FNAME", - "Log file", - [](gpt_params &, const std::string & value) { log_param_pair_parse(false, "--log-file", value); } + "Log to file", + [](gpt_params &, const std::string & value) { + gpt_log_set_file(gpt_log_main(), value.c_str()); + } )); -#endif // LOG_DISABLE_LOGS + add_opt(llama_arg( + {"--log-colors"}, + "Enable colored logging", + [](gpt_params &) { + gpt_log_set_colors(gpt_log_main(), true); + } + ).set_env("LLAMA_LOG_COLORS")); + add_opt(llama_arg( + {"-v", "--verbose", "--log-verbose"}, + "Set verbosity level to infinity (i.e. 
log all messages, useful for debugging)", + [](gpt_params & params) { + params.verbosity = INT_MAX; + gpt_log_set_verbosity_thold(INT_MAX); + } + )); + add_opt(llama_arg( + {"-lv", "--verbosity", "--log-verbosity"}, "N", + "Set the verbosity threshold. Messages with a higher verbosity will be ignored.", + [](gpt_params & params, int value) { + params.verbosity = value; + gpt_log_set_verbosity_thold(value); + } + ).set_env("LLAMA_LOG_VERBOSITY")); + add_opt(llama_arg( + {"--log-prefix"}, + "Enable prefx in log messages", + [](gpt_params &) { + gpt_log_set_prefix(gpt_log_main(), true); + } + ).set_env("LLAMA_LOG_PREFIX")); + add_opt(llama_arg( + {"--log-timestamps"}, + "Enable timestamps in log messages", + [](gpt_params &) { + gpt_log_set_timestamps(gpt_log_main(), true); + } + ).set_env("LLAMA_LOG_TIMESTAMPS")); return ctx_arg; } diff --git a/common/common.cpp b/common/common.cpp index f9a831ec7..8d0ed4f95 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -3,6 +3,7 @@ #endif #include "common.h" +#include "log.h" // Change JSON_ASSERT from assert() to GGML_ASSERT: #define JSON_ASSERT GGML_ASSERT #include "json.hpp" @@ -25,6 +26,7 @@ #include #include #include +#include #if defined(__APPLE__) && defined(__MACH__) #include @@ -48,7 +50,6 @@ #if defined(LLAMA_USE_CURL) #include #include -#include #include #endif @@ -226,7 +227,7 @@ bool set_process_priority(enum ggml_sched_priority prio) { } if (!SetPriorityClass(GetCurrentProcess(), p)) { - fprintf(stderr, "warn: failed to set process priority class %d : (%d)\n", prio, (int) GetLastError()); + LOG_WRN("failed to set process priority class %d : (%d)\n", prio, (int) GetLastError()); return false; } @@ -251,7 +252,7 @@ bool set_process_priority(enum ggml_sched_priority prio) { } if (!setpriority(PRIO_PROCESS, 0, p)) { - fprintf(stderr, "warn: failed to set process priority %d : %s (%d)\n", prio, strerror(errno), errno); + LOG_WRN("failed to set process priority %d : %s (%d)\n", prio, strerror(errno), errno); return false; } return true; @@ -284,14 +285,14 @@ void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model) if (n_set && n_set < cpuparams.n_threads) { // Not enough set bits, may experience performance issues. - fprintf(stderr, "warn: Not enough set bits in CPU mask (%d) to satisfy requested thread count: %d\n", n_set, cpuparams.n_threads); + LOG_WRN("Not enough set bits in CPU mask (%d) to satisfy requested thread count: %d\n", n_set, cpuparams.n_threads); } } bool parse_cpu_range(const std::string & range, bool (&boolmask)[GGML_MAX_N_THREADS]) { size_t dash_loc = range.find('-'); if (dash_loc == std::string::npos) { - fprintf(stderr, "Format of CPU range is invalid! Expected []-[].\n"); + LOG_ERR("Format of CPU range is invalid! 
Expected []-[].\n"); return false; } @@ -303,7 +304,7 @@ bool parse_cpu_range(const std::string & range, bool (&boolmask)[GGML_MAX_N_THRE } else { start_i = std::stoull(range.substr(0, dash_loc)); if (start_i >= GGML_MAX_N_THREADS) { - fprintf(stderr, "Start index out of bounds!\n"); + LOG_ERR("Start index out of bounds!\n"); return false; } } @@ -313,7 +314,7 @@ bool parse_cpu_range(const std::string & range, bool (&boolmask)[GGML_MAX_N_THRE } else { end_i = std::stoull(range.substr(dash_loc + 1)); if (end_i >= GGML_MAX_N_THREADS) { - fprintf(stderr, "End index out of bounds!\n"); + LOG_ERR("End index out of bounds!\n"); return false; } } @@ -348,7 +349,7 @@ bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREAD } else if (c >= 'A' && c <= 'F') { id -= 'A' - 10; } else { - fprintf(stderr, "Invalid hex character '%c' at position %d\n", c, int32_t(i)); + LOG_ERR("Invalid hex character '%c' at position %d\n", c, int32_t(i)); return false; } @@ -361,6 +362,22 @@ bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREAD return true; } +void gpt_init() { + llama_log_set([](ggml_log_level level, const char * text, void * /*user_data*/) { + if (LOG_DEFAULT_LLAMA <= gpt_log_verbosity_thold) { + gpt_log_add(gpt_log_main(), level, "%s", text); + } + }, NULL); + +#ifdef NDEBUG + const char * build_type = ""; +#else + const char * build_type = " (debug)"; +#endif + + LOG_INF("build: %d (%s) with %s for %s%s\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT, LLAMA_COMPILER, LLAMA_BUILD_TARGET, build_type); +} + std::string gpt_params_get_system_info(const gpt_params & params) { std::ostringstream os; @@ -441,6 +458,94 @@ void string_replace_all(std::string & s, const std::string & search, const std:: s = std::move(builder); } +std::string string_from(bool value) { + return value ? 
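The gpt_init() helper above is what every example now calls once at startup: it installs a llama_log_set() callback that forwards libllama messages into the common logger and prints the build info line. A minimal usage sketch; the program itself is hypothetical, while gpt_init() and the LOG_* macros are the ones introduced by this patch.

    #include "common.h"
    #include "log.h"

    int main() {
        gpt_init();  // route llama.cpp internal logs through gpt_log and print build info

        LOG_INF("%s: starting up\n", __func__);          // info -> stdout, verbosity 0
        LOG_WRN("%s: something looks odd\n", __func__);  // warning -> stderr
        LOG_DBG("%s: hidden unless -v or --verbosity >= %d\n",
                __func__, LOG_DEFAULT_DEBUG);            // debug -> stderr, gated

        return 0;
    }
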
"true" : "false"; +} + +std::string string_from(const std::vector & values) { + std::stringstream buf; + + buf << "[ "; + bool first = true; + for (auto e : values) { + if (first) { + first = false; + } else { + buf << ", "; + } + buf << std::to_string(e); + } + buf << " ]"; + + return buf.str(); +} + +std::string string_from(const struct llama_context * ctx, const std::vector & tokens) { + std::stringstream buf; + + buf << "[ "; + + bool first = true; + for (const auto & token : tokens) { + if (!first) { + buf << ", "; + } else { + first = false; + } + + auto detokenized = llama_token_to_piece(ctx, token); + + detokenized.erase( + std::remove_if( + detokenized.begin(), + detokenized.end(), + [](const unsigned char c) { return !std::isprint(c); }), + detokenized.end()); + + buf << "'" << detokenized << "'" + << ":" << std::to_string(token); + } + + buf << " ]"; + + return buf.str(); +} + +std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch) { + std::stringstream buf; + + buf << "[ "; + + bool first = true; + for (int i = 0; i < batch.n_tokens; ++i) { + if (!first) { + buf << ", "; + } else { + first = false; + } + + auto detokenized = llama_token_to_piece(ctx, batch.token[i]); + + detokenized.erase( + std::remove_if( + detokenized.begin(), + detokenized.end(), + [](const unsigned char c) { return !std::isprint(c); }), + detokenized.end()); + + buf << "\n" << std::to_string(i) + << ":token '" << detokenized << "'" + << ":pos " << std::to_string(batch.pos[i]) + << ":n_seq_id " << std::to_string(batch.n_seq_id[i]) + << ":seq_id " << std::to_string(batch.seq_id[i][0]) + << ":logits " << std::to_string(batch.logits[i]); + } + + buf << " ]"; + + return buf.str(); +} + void string_process_escapes(std::string & input) { std::size_t input_len = input.length(); std::size_t output_idx = 0; @@ -481,7 +586,7 @@ void string_process_escapes(std::string & input) { bool string_parse_kv_override(const char * data, std::vector & overrides) { const char * sep = strchr(data, '='); if (sep == nullptr || sep - data >= 128) { - fprintf(stderr, "%s: malformed KV override '%s'\n", __func__, data); + LOG_ERR("%s: malformed KV override '%s'\n", __func__, data); return false; } llama_model_kv_override kvo; @@ -504,20 +609,20 @@ bool string_parse_kv_override(const char * data, std::vector 127) { - fprintf(stderr, "%s: malformed KV override '%s', value cannot exceed 127 chars\n", __func__, data); + LOG_ERR("%s: malformed KV override '%s', value cannot exceed 127 chars\n", __func__, data); return false; } strncpy(kvo.val_str, sep, 127); kvo.val_str[127] = '\0'; } else { - fprintf(stderr, "%s: invalid type for KV override '%s'\n", __func__, data); + LOG_ERR("%s: invalid type for KV override '%s'\n", __func__, data); return false; } overrides.emplace_back(std::move(kvo)); @@ -729,7 +834,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) { } if (model == NULL) { - fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str()); + LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.c_str()); return iparams; } @@ -737,7 +842,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) { llama_context * lctx = llama_new_context_with_model(model, cparams); if (lctx == NULL) { - fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str()); + LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.c_str()); llama_free_model(model); return 
iparams; } @@ -773,7 +878,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) { loaded_la.scale = la.scale; loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str()); if (loaded_la.adapter == nullptr) { - fprintf(stderr, "%s: error: failed to apply lora adapter '%s'\n", __func__, la.path.c_str()); + LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str()); llama_free(lctx); llama_free_model(model); return iparams; @@ -785,12 +890,12 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) { } if (params.sparams.ignore_eos && llama_token_eos(model) == -1) { - fprintf(stderr, "%s: warning: model does not have an EOS token, ignoring --ignore-eos\n", __func__); + LOG_WRN("%s: warning: model does not have an EOS token, ignoring --ignore-eos\n", __func__); params.sparams.ignore_eos = false; } if (params.warmup) { - LOG("warming up the model with an empty run\n"); + LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__); std::vector tmp; llama_token bos = llama_token_bos(model); @@ -955,7 +1060,7 @@ static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_ int remaining_attempts = max_attempts; while (remaining_attempts > 0) { - fprintf(stderr, "%s: Trying to download from %s (attempt %d of %d)...\n", __func__ , url.c_str(), max_attempts - remaining_attempts + 1, max_attempts); + LOG_INF("%s: Trying to download from %s (attempt %d of %d)...\n", __func__ , url.c_str(), max_attempts - remaining_attempts + 1, max_attempts); CURLcode res = curl_easy_perform(curl); if (res == CURLE_OK) { @@ -963,13 +1068,14 @@ static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_ } int exponential_backoff_delay = std::pow(retry_delay_seconds, max_attempts - remaining_attempts) * 1000; - fprintf(stderr, "%s: curl_easy_perform() failed: %s, retrying after %d milliseconds...\n", __func__, curl_easy_strerror(res), exponential_backoff_delay); + LOG_WRN("%s: curl_easy_perform() failed: %s, retrying after %d milliseconds...\n", __func__, curl_easy_strerror(res), exponential_backoff_delay); remaining_attempts--; std::this_thread::sleep_for(std::chrono::milliseconds(exponential_backoff_delay)); } - fprintf(stderr, "%s: curl_easy_perform() failed after %d attempts\n", __func__, max_attempts); + LOG_ERR("%s: curl_easy_perform() failed after %d attempts\n", __func__, max_attempts); + return false; } @@ -978,7 +1084,7 @@ static bool llama_download_file(const std::string & url, const std::string & pat // Initialize libcurl std::unique_ptr curl(curl_easy_init(), &curl_easy_cleanup); if (!curl) { - fprintf(stderr, "%s: error initializing libcurl\n", __func__); + LOG_ERR("%s: error initializing libcurl\n", __func__); return false; } @@ -1019,11 +1125,11 @@ static bool llama_download_file(const std::string & url, const std::string & pat if (metadata_in.good()) { try { metadata_in >> metadata; - fprintf(stderr, "%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str()); + LOG_INF("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str()); if (metadata.contains("url") && metadata.at("url").is_string()) { auto previous_url = metadata.at("url").get(); if (previous_url != url) { - fprintf(stderr, "%s: Model URL mismatch: %s != %s\n", __func__, url.c_str(), previous_url.c_str()); + LOG_ERR("%s: Model URL mismatch: %s != %s\n", __func__, url.c_str(), previous_url.c_str()); return 
false; } } @@ -1034,12 +1140,12 @@ static bool llama_download_file(const std::string & url, const std::string & pat last_modified = metadata.at("lastModified"); } } catch (const nlohmann::json::exception & e) { - fprintf(stderr, "%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what()); + LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what()); return false; } } } else { - fprintf(stderr, "%s: no previous model file found %s\n", __func__, path.c_str()); + LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str()); } // Send a HEAD request to retrieve the etag and last-modified headers @@ -1087,26 +1193,26 @@ static bool llama_download_file(const std::string & url, const std::string & pat // HEAD not supported, we don't know if the file has changed // force trigger downloading force_download = true; - fprintf(stderr, "%s: HEAD invalid http status code received: %ld\n", __func__, http_code); + LOG_ERR("%s: HEAD invalid http status code received: %ld\n", __func__, http_code); } } bool should_download = !file_exists || force_download; if (!should_download) { if (!etag.empty() && etag != headers.etag) { - fprintf(stderr, "%s: ETag header is different (%s != %s): triggering a new download\n", __func__, etag.c_str(), headers.etag.c_str()); + LOG_WRN("%s: ETag header is different (%s != %s): triggering a new download\n", __func__, etag.c_str(), headers.etag.c_str()); should_download = true; } else if (!last_modified.empty() && last_modified != headers.last_modified) { - fprintf(stderr, "%s: Last-Modified header is different (%s != %s): triggering a new download\n", __func__, last_modified.c_str(), headers.last_modified.c_str()); + LOG_WRN("%s: Last-Modified header is different (%s != %s): triggering a new download\n", __func__, last_modified.c_str(), headers.last_modified.c_str()); should_download = true; } } if (should_download) { std::string path_temporary = path + ".downloadInProgress"; if (file_exists) { - fprintf(stderr, "%s: deleting previous downloaded file: %s\n", __func__, path.c_str()); + LOG_WRN("%s: deleting previous downloaded file: %s\n", __func__, path.c_str()); if (remove(path.c_str()) != 0) { - fprintf(stderr, "%s: unable to delete file: %s\n", __func__, path.c_str()); + LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str()); return false; } } @@ -1121,7 +1227,7 @@ static bool llama_download_file(const std::string & url, const std::string & pat std::unique_ptr outfile(fopen(path_temporary.c_str(), "wb")); if (!outfile) { - fprintf(stderr, "%s: error opening local file for writing: %s\n", __func__, path.c_str()); + LOG_ERR("%s: error opening local file for writing: %s\n", __func__, path.c_str()); return false; } @@ -1152,7 +1258,7 @@ static bool llama_download_file(const std::string & url, const std::string & pat }; // start the download - fprintf(stderr, "%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__, + LOG_INF("%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__, llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str()); bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS); if (!was_perform_successful) { @@ -1162,7 +1268,7 @@ static bool llama_download_file(const std::string & url, const std::string & pat long http_code = 0; curl_easy_getinfo (curl.get(), 
CURLINFO_RESPONSE_CODE, &http_code); if (http_code < 200 || http_code >= 400) { - fprintf(stderr, "%s: invalid http status code received: %ld\n", __func__, http_code); + LOG_ERR("%s: invalid http status code received: %ld\n", __func__, http_code); return false; } @@ -1176,10 +1282,10 @@ static bool llama_download_file(const std::string & url, const std::string & pat {"lastModified", headers.last_modified} }); std::ofstream(metadata_path) << metadata.dump(4); - fprintf(stderr, "%s: file metadata saved: %s\n", __func__, metadata_path.c_str()); + LOG_INF("%s: file metadata saved: %s\n", __func__, metadata_path.c_str()); if (rename(path_temporary.c_str(), path.c_str()) != 0) { - fprintf(stderr, "%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str()); + LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str()); return false; } } @@ -1194,7 +1300,7 @@ struct llama_model * llama_load_model_from_url( const struct llama_model_params & params) { // Basic validation of the model_url if (!model_url || strlen(model_url) == 0) { - fprintf(stderr, "%s: invalid model_url\n", __func__); + LOG_ERR("%s: invalid model_url\n", __func__); return NULL; } @@ -1211,7 +1317,7 @@ struct llama_model * llama_load_model_from_url( }; auto * ctx_gguf = gguf_init_from_file(path_model, gguf_params); if (!ctx_gguf) { - fprintf(stderr, "\n%s: failed to load input GGUF from %s\n", __func__, path_model); + LOG_ERR("\n%s: failed to load input GGUF from %s\n", __func__, path_model); return NULL; } @@ -1231,14 +1337,12 @@ struct llama_model * llama_load_model_from_url( // and extract split URL and PATH prefixes { if (!llama_split_prefix(split_prefix, sizeof(split_prefix), path_model, 0, n_split)) { - fprintf(stderr, "\n%s: unexpected model file name: %s" - " n_split=%d\n", __func__, path_model, n_split); + LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, path_model, n_split); return NULL; } if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url, 0, n_split)) { - fprintf(stderr, "\n%s: unexpected model url: %s" - " n_split=%d\n", __func__, model_url, n_split); + LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model_url, n_split); return NULL; } } @@ -1298,7 +1402,7 @@ struct llama_model * llama_load_model_from_url( const char * /*path_model*/, const char * /*hf_token*/, const struct llama_model_params & /*params*/) { - fprintf(stderr, "%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__); + LOG_WRN("%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__); return nullptr; } @@ -1308,7 +1412,7 @@ struct llama_model * llama_load_model_from_hf( const char * /*path_model*/, const char * /*hf_token*/, const struct llama_model_params & /*params*/) { - fprintf(stderr, "%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__); + LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__); return nullptr; } @@ -1636,13 +1740,13 @@ static llama_control_vector_data llama_control_vector_load_one(const llama_contr }; struct gguf_context * ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), meta_gguf_params); if (!ctx_gguf) { - fprintf(stderr, "%s: failed to load control vector file from %s\n", __func__, load_info.fname.c_str()); + LOG_ERR("%s: failed to load control vector file from %s\n", __func__, load_info.fname.c_str()); return result; } int32_t 
n_tensors = gguf_get_n_tensors(ctx_gguf); if (n_tensors == 0) { - fprintf(stderr, "%s: no direction tensors found in %s\n", __func__, load_info.fname.c_str()); + LOG_WRN("%s: no direction tensors found in %s\n", __func__, load_info.fname.c_str()); } for (int i = 0; i < n_tensors; i++) { @@ -1660,23 +1764,23 @@ static llama_control_vector_data llama_control_vector_load_one(const llama_contr } } if (layer_idx < 0) { - fprintf(stderr, "%s: invalid/unparsable direction tensor layer index in %s\n", __func__, load_info.fname.c_str()); + LOG_ERR("%s: invalid/unparsable direction tensor layer index in %s\n", __func__, load_info.fname.c_str()); result.n_embd = -1; break; } else if (layer_idx == 0) { - fprintf(stderr, "%s: invalid (zero) direction tensor layer index in %s\n", __func__, load_info.fname.c_str()); + LOG_ERR("%s: invalid (zero) direction tensor layer index in %s\n", __func__, load_info.fname.c_str()); result.n_embd = -1; break; } struct ggml_tensor * tensor = ggml_get_tensor(ctx, name.c_str()); if (tensor->type != GGML_TYPE_F32) { - fprintf(stderr, "%s: invalid (non-F32) direction tensor type in %s\n", __func__, load_info.fname.c_str()); + LOG_ERR("%s: invalid (non-F32) direction tensor type in %s\n", __func__, load_info.fname.c_str()); result.n_embd = -1; break; } if (ggml_n_dims(tensor) != 1) { - fprintf(stderr, "%s: invalid (non-1D) direction tensor shape in %s\n", __func__, load_info.fname.c_str()); + LOG_ERR("%s: invalid (non-1D) direction tensor shape in %s\n", __func__, load_info.fname.c_str()); result.n_embd = -1; break; } @@ -1684,7 +1788,7 @@ static llama_control_vector_data llama_control_vector_load_one(const llama_contr if (result.n_embd == -1) { result.n_embd = ggml_nelements(tensor); } else if (ggml_nelements(tensor) != result.n_embd) { - fprintf(stderr, "%s: direction tensor in %s does not match previous dimensions\n", __func__, load_info.fname.c_str()); + LOG_ERR("%s: direction tensor in %s does not match previous dimensions\n", __func__, load_info.fname.c_str()); result.n_embd = -1; break; } @@ -1701,7 +1805,7 @@ static llama_control_vector_data llama_control_vector_load_one(const llama_contr } if (result.n_embd == -1) { - fprintf(stderr, "%s: skipping %s due to invalid direction tensors\n", __func__, load_info.fname.c_str()); + LOG_WRN("%s: skipping %s due to invalid direction tensors\n", __func__, load_info.fname.c_str()); result.data.clear(); } @@ -1722,7 +1826,7 @@ llama_control_vector_data llama_control_vector_load(const std::vector #include +#include #ifdef _WIN32 #define DIRECTORY_SEPARATOR '\\' @@ -343,6 +341,10 @@ struct gpt_params { bool batched_bench_output_jsonl = false; }; +// call once at the start of a program if it uses libcommon +// initializes the logging system and prints info about the build +void gpt_init(); + std::string gpt_params_get_system_info(const gpt_params & params); bool parse_cpu_range(const std::string& range, bool(&boolmask)[GGML_MAX_N_THREADS]); @@ -378,6 +380,11 @@ static std::vector string_split(const std::string & str, char delim) { bool string_parse_kv_override(const char * data, std::vector & overrides); void string_process_escapes(std::string & input); +std::string string_from(bool value); +std::string string_from(const std::vector & values); +std::string string_from(const struct llama_context * ctx, const std::vector & tokens); +std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch); + // // Filesystem utils // diff --git a/common/log.cpp b/common/log.cpp new file mode 100644 index 
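The string_from() overloads declared above take over the role of the old LOG_TOKENS_TOSTR_PRETTY / LOG_BATCH_TOSTR_PRETTY templates removed from common/log.h further down. A hedged usage sketch: the dump_inputs() helper is invented, and the token vector's element type is assumed to be llama_token as elsewhere in common.

    #include "common.h"
    #include "llama.h"
    #include "log.h"

    #include <vector>

    // Dump a tokenized prompt and a prepared batch at debug verbosity.
    static void dump_inputs(llama_context * ctx,
                            const std::vector<llama_token> & prompt,
                            const llama_batch & batch) {
        LOG_DBG("add_bos: %s\n", string_from(true).c_str());
        LOG_DBG("prompt:  %s\n", string_from(ctx, prompt).c_str());
        LOG_DBG("batch:   %s\n", string_from(ctx, batch).c_str());
    }
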
000000000..2825a227e --- /dev/null +++ b/common/log.cpp @@ -0,0 +1,401 @@ +#include "log.h" + +#include +#include +#include +#include +#include +#include +#include + +int gpt_log_verbosity_thold = LOG_DEFAULT_LLAMA; + +void gpt_log_set_verbosity_thold(int verbosity) { + gpt_log_verbosity_thold = verbosity; +} + +#define LOG_COL_DEFAULT "\033[0m" +#define LOG_COL_BOLD "\033[1m" +#define LOG_COL_RED "\033[31m" +#define LOG_COL_GREEN "\033[32m" +#define LOG_COL_YELLOW "\033[33m" +#define LOG_COL_BLUE "\033[34m" +#define LOG_COL_MAGENTA "\033[35m" +#define LOG_COL_CYAN "\033[36m" +#define LOG_COL_WHITE "\033[37m" + +static int64_t t_us() { + return std::chrono::duration_cast(std::chrono::system_clock::now().time_since_epoch()).count(); +} + +// colors +enum gpt_log_col : int { + GPT_LOG_COL_DEFAULT = 0, + GPT_LOG_COL_BOLD, + GPT_LOG_COL_RED, + GPT_LOG_COL_GREEN, + GPT_LOG_COL_YELLOW, + GPT_LOG_COL_BLUE, + GPT_LOG_COL_MAGENTA, + GPT_LOG_COL_CYAN, + GPT_LOG_COL_WHITE, +}; + +// disable colors by default +static std::vector g_col = { + "", + "", + "", + "", + "", + "", + "", + "", + "", +}; + +struct gpt_log_entry { + enum ggml_log_level level; + + bool prefix; + + int64_t timestamp; + + std::vector msg; + + // signals the worker thread to stop + bool is_end; + + void print(FILE * file = nullptr) const { + FILE * fcur = file; + if (!fcur) { + // stderr displays DBG messages only when their verbosity level is not higher than the threshold + // these messages will still be logged to a file + if (level == GGML_LOG_LEVEL_DEBUG && gpt_log_verbosity_thold < LOG_DEFAULT_DEBUG) { + return; + } + + fcur = stdout; + + if (level != GGML_LOG_LEVEL_NONE) { + fcur = stderr; + } + } + + if (level != GGML_LOG_LEVEL_NONE && prefix) { + if (timestamp) { + // [M.s.ms.us] + fprintf(fcur, "%s%d.%02d.%03d.%03d%s ", + g_col[GPT_LOG_COL_BLUE], + (int) (timestamp / 1000000 / 60), + (int) (timestamp / 1000000 % 60), + (int) (timestamp / 1000 % 1000), + (int) (timestamp % 1000), + g_col[GPT_LOG_COL_DEFAULT]); + } + + switch (level) { + case GGML_LOG_LEVEL_INFO: fprintf(fcur, "%sI %s", g_col[GPT_LOG_COL_GREEN], g_col[GPT_LOG_COL_DEFAULT]); break; + case GGML_LOG_LEVEL_WARN: fprintf(fcur, "%sW %s", g_col[GPT_LOG_COL_MAGENTA], "" ); break; + case GGML_LOG_LEVEL_ERROR: fprintf(fcur, "%sE %s", g_col[GPT_LOG_COL_RED], "" ); break; + case GGML_LOG_LEVEL_DEBUG: fprintf(fcur, "%sD %s", g_col[GPT_LOG_COL_YELLOW], "" ); break; + default: + break; + } + } + + fprintf(fcur, "%s", msg.data()); + + if (level == GGML_LOG_LEVEL_WARN || level == GGML_LOG_LEVEL_ERROR || level == GGML_LOG_LEVEL_DEBUG) { + fprintf(fcur, "%s", g_col[GPT_LOG_COL_DEFAULT]); + } + + fflush(fcur); + } +}; + +struct gpt_log { + // default capacity - will be expanded if needed + gpt_log() : gpt_log(256) {} + + gpt_log(size_t capacity) { + file = nullptr; + prefix = false; + timestamps = false; + running = false; + t_start = t_us(); + + // initial message size - will be expanded if longer messages arrive + entries.resize(capacity); + for (auto & entry : entries) { + entry.msg.resize(256); + } + + head = 0; + tail = 0; + + resume(); + } + + ~gpt_log() { + pause(); + if (file) { + fclose(file); + } + } + +private: + std::mutex mtx; + std::thread thrd; + std::condition_variable cv; + + FILE * file; + + bool prefix; + bool timestamps; + bool running; + + int64_t t_start; + + // ring buffer of entries + std::vector entries; + size_t head; + size_t tail; + + // worker thread copies into this + gpt_log_entry cur; + +public: + void add(enum ggml_log_level level, const char * 
fmt, va_list args) { + std::lock_guard lock(mtx); + + if (!running) { + // discard messages while the worker thread is paused + return; + } + + auto & entry = entries[tail]; + + { + // cannot use args twice, so make a copy in case we need to expand the buffer + va_list args_copy; + va_copy(args_copy, args); + +#if 1 + const size_t n = vsnprintf(entry.msg.data(), entry.msg.size(), fmt, args); + if (n >= entry.msg.size()) { + entry.msg.resize(n + 1); + vsnprintf(entry.msg.data(), entry.msg.size(), fmt, args_copy); + } +#else + // hack for bolding arguments + + std::stringstream ss; + for (int i = 0; fmt[i] != 0; i++) { + if (fmt[i] == '%') { + ss << LOG_COL_BOLD; + while (fmt[i] != ' ' && fmt[i] != ')' && fmt[i] != ']' && fmt[i] != 0) ss << fmt[i++]; + ss << LOG_COL_DEFAULT; + if (fmt[i] == 0) break; + } + ss << fmt[i]; + } + const size_t n = vsnprintf(entry.msg.data(), entry.msg.size(), ss.str().c_str(), args); + if (n >= entry.msg.size()) { + entry.msg.resize(n + 1); + vsnprintf(entry.msg.data(), entry.msg.size(), ss.str().c_str(), args_copy); + } +#endif + } + + entry.level = level; + entry.prefix = prefix; + entry.timestamp = 0; + if (timestamps) { + entry.timestamp = t_us() - t_start; + } + entry.is_end = false; + + tail = (tail + 1) % entries.size(); + if (tail == head) { + // expand the buffer + std::vector new_entries(2*entries.size()); + + size_t new_tail = 0; + + do { + new_entries[new_tail] = std::move(entries[head]); + + head = (head + 1) % entries.size(); + new_tail = (new_tail + 1); + } while (head != tail); + + head = 0; + tail = new_tail; + + for (size_t i = tail; i < new_entries.size(); i++) { + new_entries[i].msg.resize(256); + } + + entries = std::move(new_entries); + } + + cv.notify_one(); + } + + void resume() { + std::lock_guard lock(mtx); + + if (running) { + return; + } + + running = true; + + thrd = std::thread([this]() { + while (true) { + { + std::unique_lock lock(mtx); + cv.wait(lock, [this]() { return head != tail; }); + + cur = entries[head]; + + head = (head + 1) % entries.size(); + } + + if (cur.is_end) { + break; + } + + cur.print(); // stdout and stderr + + if (file) { + cur.print(file); + } + } + }); + } + + void pause() { + { + std::lock_guard lock(mtx); + + if (!running) { + return; + } + + running = false; + + // push an entry to signal the worker thread to stop + { + auto & entry = entries[tail]; + entry.is_end = true; + + tail = (tail + 1) % entries.size(); + } + + cv.notify_one(); + } + + thrd.join(); + } + + void set_file(const char * path) { + pause(); + + if (file) { + fclose(file); + } + + if (path) { + file = fopen(path, "w"); + } else { + file = nullptr; + } + + resume(); + } + + void set_colors(bool colors) { + pause(); + + if (colors) { + g_col[GPT_LOG_COL_DEFAULT] = LOG_COL_DEFAULT; + g_col[GPT_LOG_COL_BOLD] = LOG_COL_BOLD; + g_col[GPT_LOG_COL_RED] = LOG_COL_RED; + g_col[GPT_LOG_COL_GREEN] = LOG_COL_GREEN; + g_col[GPT_LOG_COL_YELLOW] = LOG_COL_YELLOW; + g_col[GPT_LOG_COL_BLUE] = LOG_COL_BLUE; + g_col[GPT_LOG_COL_MAGENTA] = LOG_COL_MAGENTA; + g_col[GPT_LOG_COL_CYAN] = LOG_COL_CYAN; + g_col[GPT_LOG_COL_WHITE] = LOG_COL_WHITE; + } else { + for (size_t i = 0; i < g_col.size(); i++) { + g_col[i] = ""; + } + } + + resume(); + } + + void set_prefix(bool prefix) { + std::lock_guard lock(mtx); + + this->prefix = prefix; + } + + void set_timestamps(bool timestamps) { + std::lock_guard lock(mtx); + + this->timestamps = timestamps; + } +}; + +// +// public API +// + +struct gpt_log * gpt_log_init() { + return new gpt_log; +} + +struct gpt_log * 
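Most code talks to the gpt_log_main() singleton through the LOG_* macros, but the class above can also back a private logger via the C-style wrappers that follow. A sketch under that assumption; the demo function, file name and message are made up, the gpt_log_* calls are the ones defined here.

    #include "log.h"

    static void private_logger_demo() {
        gpt_log * log = gpt_log_init();          // independent instance with its own worker thread

        gpt_log_set_file      (log, "tool.log"); // also mirror messages into a file
        gpt_log_set_prefix    (log, true);
        gpt_log_set_timestamps(log, true);

        gpt_log_add(log, GGML_LOG_LEVEL_INFO, "private logger says hi (%d)\n", 42);

        gpt_log_free(log);                       // stops the worker and closes the file
    }
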
gpt_log_main() { + static struct gpt_log log; + + return &log; +} + +void gpt_log_pause(struct gpt_log * log) { + log->pause(); +} + +void gpt_log_resume(struct gpt_log * log) { + log->resume(); +} + +void gpt_log_free(struct gpt_log * log) { + delete log; +} + +void gpt_log_add(struct gpt_log * log, enum ggml_log_level level, const char * fmt, ...) { + va_list args; + va_start(args, fmt); + log->add(level, fmt, args); + va_end(args); +} + +void gpt_log_set_file(struct gpt_log * log, const char * file) { + log->set_file(file); +} + +void gpt_log_set_colors(struct gpt_log * log, bool colors) { + log->set_colors(colors); +} + +void gpt_log_set_prefix(struct gpt_log * log, bool prefix) { + log->set_prefix(prefix); +} + +void gpt_log_set_timestamps(struct gpt_log * log, bool timestamps) { + log->set_timestamps(timestamps); +} diff --git a/common/log.h b/common/log.h index 1bc5328ce..d13f72d89 100644 --- a/common/log.h +++ b/common/log.h @@ -1,724 +1,90 @@ #pragma once -#include -#include -#include -#include -#include -#include -#include -#include +#include "ggml.h" // for ggml_log_level -// -------------------------------- -// -// Basic usage: -// -// -------- -// -// The LOG() and LOG_TEE() macros are ready to go by default -// they do not require any initialization. -// -// LOGLN() and LOG_TEELN() are variants which automatically -// include \n character at the end of the log string. -// -// LOG() behaves exactly like printf, by default writing to a logfile. -// LOG_TEE() additionally, prints to the screen too ( mimics Unix tee command ). -// -// Default logfile is named -// "llama..log" -// Default LOG_TEE() secondary output target is -// stderr -// -// Logs can be dynamically disabled or enabled using functions: -// log_disable() -// and -// log_enable() -// -// A log target can be changed with: -// log_set_target( string ) -// creating and opening, or re-opening a file by string filename -// or -// log_set_target( FILE* ) -// allowing to point at stderr, stdout, or any valid FILE* file handler. -// -// -------- -// -// End of Basic usage. -// -// -------------------------------- - -// Specifies a log target. -// default uses log_handler() with "llama.log" log file -// this can be changed, by defining LOG_TARGET -// like so: -// -// #define LOG_TARGET (a valid FILE*) -// #include "log.h" -// -// or it can be simply redirected to stdout or stderr -// like so: -// -// #define LOG_TARGET stderr -// #include "log.h" -// -// The log target can also be redirected to a different function -// like so: -// -// #define LOG_TARGET log_handler_different() -// #include "log.h" -// -// FILE* log_handler_different() -// { -// return stderr; -// } -// -// or: -// -// #define LOG_TARGET log_handler_another_one("somelog.log") -// #include "log.h" -// -// FILE* log_handler_another_one(char*filename) -// { -// static FILE* logfile = nullptr; -// (...) -// if( !logfile ) -// { -// fopen(...) -// } -// (...) -// return logfile -// } -// -#ifndef LOG_TARGET - #define LOG_TARGET log_handler() -#endif - -#ifndef LOG_TEE_TARGET - #define LOG_TEE_TARGET stderr -#endif - -// Utility for synchronizing log configuration state -// since std::optional was introduced only in c++17 -enum LogTriState -{ - LogTriStateSame, - LogTriStateFalse, - LogTriStateTrue -}; - -// Utility to obtain "pid" like unique process id and use it when creating log files. 
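For reference, this is essentially what the --log-disable and --log-file options added to common/arg.cpp earlier hook into; the surrounding function and the file name are hypothetical, the calls are the public API defined just above.

    #include "log.h"

    static void configure_main_logger() {
        // --log-disable: pause the worker thread; messages sent while paused are discarded
        gpt_log_pause(gpt_log_main());

        // ... a section whose logging we want to suppress ...

        gpt_log_resume(gpt_log_main());

        // --log-file FNAME: mirror all subsequent messages into a file
        // (set_file() pauses and resumes the worker internally around the fopen)
        gpt_log_set_file(gpt_log_main(), "llama.run.log");

        LOG_INF("this line goes to stdout and to llama.run.log\n");
    }
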
-inline std::string log_get_pid() -{ - static std::string pid; - if (pid.empty()) - { - // std::this_thread::get_id() is the most portable way of obtaining a "process id" - // it's not the same as "pid" but is unique enough to solve multiple instances - // trying to write to the same log. - std::stringstream ss; - ss << std::this_thread::get_id(); - pid = ss.str(); - } - - return pid; -} - -// Utility function for generating log file names with unique id based on thread id. -// invocation with log_filename_generator( "llama", "log" ) creates a string "llama..log" -// where the number is a runtime id of the current thread. - -#define log_filename_generator(log_file_basename, log_file_extension) log_filename_generator_impl(LogTriStateSame, log_file_basename, log_file_extension) - -// INTERNAL, DO NOT USE -inline std::string log_filename_generator_impl(LogTriState multilog, const std::string & log_file_basename, const std::string & log_file_extension) -{ - static bool _multilog = false; - - if (multilog != LogTriStateSame) - { - _multilog = multilog == LogTriStateTrue; - } - - std::stringstream buf; - - buf << log_file_basename; - if (_multilog) - { - buf << "."; - buf << log_get_pid(); - } - buf << "."; - buf << log_file_extension; - - return buf.str(); -} - -#ifndef LOG_DEFAULT_FILE_NAME - #define LOG_DEFAULT_FILE_NAME log_filename_generator("llama", "log") -#endif - -// Utility for turning #define values into string literals -// so we can have a define for stderr and -// we can print "stderr" instead of literal stderr, etc. -#define LOG_STRINGIZE1(s) #s -#define LOG_STRINGIZE(s) LOG_STRINGIZE1(s) - -#define LOG_TEE_TARGET_STRING LOG_STRINGIZE(LOG_TEE_TARGET) - -// Allows disabling timestamps. -// in order to disable, define LOG_NO_TIMESTAMPS -// like so: -// -// #define LOG_NO_TIMESTAMPS -// #include "log.h" -// -#ifndef LOG_NO_TIMESTAMPS - #ifndef _MSC_VER - #define LOG_TIMESTAMP_FMT "[%" PRIu64 "] " - #define LOG_TIMESTAMP_VAL , (std::chrono::duration_cast>(std::chrono::system_clock::now().time_since_epoch())).count() - #else - #define LOG_TIMESTAMP_FMT "[%" PRIu64 "] " - #define LOG_TIMESTAMP_VAL , (std::chrono::duration_cast>(std::chrono::system_clock::now().time_since_epoch())).count() - #endif +#ifndef __GNUC__ +# define LOG_ATTRIBUTE_FORMAT(...) +#elif defined(__MINGW32__) +# define LOG_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__))) #else - #define LOG_TIMESTAMP_FMT "%s" - #define LOG_TIMESTAMP_VAL ,"" +# define LOG_ATTRIBUTE_FORMAT(...) 
__attribute__((format(printf, __VA_ARGS__))) #endif -#ifdef LOG_TEE_TIMESTAMPS - #ifndef _MSC_VER - #define LOG_TEE_TIMESTAMP_FMT "[%" PRIu64 "] " - #define LOG_TEE_TIMESTAMP_VAL , (std::chrono::duration_cast>(std::chrono::system_clock::now().time_since_epoch())).count() - #else - #define LOG_TEE_TIMESTAMP_FMT "[%" PRIu64 "] " - #define LOG_TEE_TIMESTAMP_VAL , (std::chrono::duration_cast>(std::chrono::system_clock::now().time_since_epoch())).count() - #endif -#else - #define LOG_TEE_TIMESTAMP_FMT "%s" - #define LOG_TEE_TIMESTAMP_VAL ,"" -#endif +#define LOG_DEFAULT_DEBUG 1 +#define LOG_DEFAULT_LLAMA 0 -// Allows disabling file/line/function prefix -// in order to disable, define LOG_NO_FILE_LINE_FUNCTION -// like so: +// needed by the LOG_TMPL macro to avoid computing log arguments if the verbosity lower +// set via gpt_log_set_verbosity() +extern int gpt_log_verbosity_thold; + +void gpt_log_set_verbosity_thold(int verbosity); // not thread-safe + +// the gpt_log uses an internal worker thread to print/write log messages +// when the worker thread is paused, incoming log messages are discarded +struct gpt_log; + +struct gpt_log * gpt_log_init(); +struct gpt_log * gpt_log_main(); // singleton, automatically destroys itself on exit +void gpt_log_pause (struct gpt_log * log); // pause the worker thread, not thread-safe +void gpt_log_resume(struct gpt_log * log); // resume the worker thread, not thread-safe +void gpt_log_free (struct gpt_log * log); + +LOG_ATTRIBUTE_FORMAT(3, 4) +void gpt_log_add(struct gpt_log * log, enum ggml_log_level level, const char * fmt, ...); + +// defaults: file = NULL, colors = false, prefix = false, timestamps = false // -// #define LOG_NO_FILE_LINE_FUNCTION -// #include "log.h" +// regular log output: // -#ifndef LOG_NO_FILE_LINE_FUNCTION - #ifndef _MSC_VER - #define LOG_FLF_FMT "[%24s:%5d][%24s] " - #define LOG_FLF_VAL , __FILE__, __LINE__, __FUNCTION__ - #else - #define LOG_FLF_FMT "[%24s:%5ld][%24s] " - #define LOG_FLF_VAL , __FILE__, (long)__LINE__, __FUNCTION__ - #endif -#else - #define LOG_FLF_FMT "%s" - #define LOG_FLF_VAL ,"" -#endif - -#ifdef LOG_TEE_FILE_LINE_FUNCTION - #ifndef _MSC_VER - #define LOG_TEE_FLF_FMT "[%24s:%5d][%24s] " - #define LOG_TEE_FLF_VAL , __FILE__, __LINE__, __FUNCTION__ - #else - #define LOG_TEE_FLF_FMT "[%24s:%5ld][%24s] " - #define LOG_TEE_FLF_VAL , __FILE__, (long)__LINE__, __FUNCTION__ - #endif -#else - #define LOG_TEE_FLF_FMT "%s" - #define LOG_TEE_FLF_VAL ,"" -#endif - -// INTERNAL, DO NOT USE -// USE LOG() INSTEAD +// ggml_backend_metal_log_allocated_size: allocated buffer, size = 6695.84 MiB, ( 6695.91 / 21845.34) +// llm_load_tensors: ggml ctx size = 0.27 MiB +// llm_load_tensors: offloading 32 repeating layers to GPU +// llm_load_tensors: offloading non-repeating layers to GPU // -#if !defined(_MSC_VER) || defined(__INTEL_LLVM_COMPILER) || defined(__clang__) - #define LOG_IMPL(str, ...) 
\ - do { \ - if (LOG_TARGET != nullptr) \ - { \ - fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL, __VA_ARGS__); \ - fflush(LOG_TARGET); \ - } \ +// with prefix = true, timestamps = true, the log output will look like this: +// +// 0.00.035.060 D ggml_backend_metal_log_allocated_size: allocated buffer, size = 6695.84 MiB, ( 6695.91 / 21845.34) +// 0.00.035.064 I llm_load_tensors: ggml ctx size = 0.27 MiB +// 0.00.090.578 I llm_load_tensors: offloading 32 repeating layers to GPU +// 0.00.090.579 I llm_load_tensors: offloading non-repeating layers to GPU +// +// I - info (stdout, V = 0) +// W - warning (stderr, V = 0) +// E - error (stderr, V = 0) +// D - debug (stderr, V = LOG_DEFAULT_DEBUG) +// + +void gpt_log_set_file (struct gpt_log * log, const char * file); // not thread-safe +void gpt_log_set_colors (struct gpt_log * log, bool colors); // not thread-safe +void gpt_log_set_prefix (struct gpt_log * log, bool prefix); // whether to output prefix to each log +void gpt_log_set_timestamps(struct gpt_log * log, bool timestamps); // whether to output timestamps in the prefix + +// helper macros for logging +// use these to avoid computing log arguments if the verbosity of the log is higher than the threshold +// +// for example: +// +// LOG_DBG("this is a debug message: %d\n", expensive_function()); +// +// this will avoid calling expensive_function() if LOG_DEFAULT_DEBUG > gpt_log_verbosity_thold +// + +#define LOG_TMPL(level, verbosity, ...) \ + do { \ + if ((verbosity) <= gpt_log_verbosity_thold) { \ + gpt_log_add(gpt_log_main(), (level), __VA_ARGS__); \ + } \ } while (0) -#else - #define LOG_IMPL(str, ...) \ - do { \ - if (LOG_TARGET != nullptr) \ - { \ - fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL "", ##__VA_ARGS__); \ - fflush(LOG_TARGET); \ - } \ - } while (0) -#endif -// INTERNAL, DO NOT USE -// USE LOG_TEE() INSTEAD -// -#if !defined(_MSC_VER) || defined(__INTEL_LLVM_COMPILER) || defined(__clang__) - #define LOG_TEE_IMPL(str, ...) \ - do { \ - if (LOG_TARGET != nullptr) \ - { \ - fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL, __VA_ARGS__); \ - fflush(LOG_TARGET); \ - } \ - if (LOG_TARGET != nullptr && LOG_TARGET != stdout && LOG_TARGET != stderr && LOG_TEE_TARGET != nullptr) \ - { \ - fprintf(LOG_TEE_TARGET, LOG_TEE_TIMESTAMP_FMT LOG_TEE_FLF_FMT str "%s" LOG_TEE_TIMESTAMP_VAL LOG_TEE_FLF_VAL, __VA_ARGS__); \ - fflush(LOG_TEE_TARGET); \ - } \ - } while (0) -#else - #define LOG_TEE_IMPL(str, ...) \ - do { \ - if (LOG_TARGET != nullptr) \ - { \ - fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL "", ##__VA_ARGS__); \ - fflush(LOG_TARGET); \ - } \ - if (LOG_TARGET != nullptr && LOG_TARGET != stdout && LOG_TARGET != stderr && LOG_TEE_TARGET != nullptr) \ - { \ - fprintf(LOG_TEE_TARGET, LOG_TEE_TIMESTAMP_FMT LOG_TEE_FLF_FMT str "%s" LOG_TEE_TIMESTAMP_VAL LOG_TEE_FLF_VAL "", ##__VA_ARGS__); \ - fflush(LOG_TEE_TARGET); \ - } \ - } while (0) -#endif +#define LOG(...) LOG_TMPL(GGML_LOG_LEVEL_NONE, 0, __VA_ARGS__) +#define LOGV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_NONE, verbosity, __VA_ARGS__) -// The '\0' as a last argument, is a trick to bypass the silly -// "warning: ISO C++11 requires at least one argument for the "..." in a variadic macro" -// so we can have a single macro which can be called just like printf. +#define LOG_INF(...) LOG_TMPL(GGML_LOG_LEVEL_INFO, 0, __VA_ARGS__) +#define LOG_WRN(...) 
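The LOG_TMPL machinery above means a message's format arguments are only evaluated when its verbosity passes the threshold, so debug-level dumps cost nothing in normal runs. A short sketch of the gated variants; report_progress() and its parameters are invented, the macros are the ones defined in this header.

    #include "log.h"

    static void report_progress(int step, double loss) {
        LOG_INF("step %d done\n", step);                  // always shown (V = 0)
        LOG_INFV(2, "step %d: loss = %f\n", step, loss);  // only with --verbosity >= 2
        LOGV(3, "raw dump follows\n");                    // no level prefix, verbosity 3
        LOG_DBG("debug detail for step %d\n", step);      // needs -v or --verbosity >= 1
    }
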
LOG_TMPL(GGML_LOG_LEVEL_WARN, 0, __VA_ARGS__) +#define LOG_ERR(...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, 0, __VA_ARGS__) +#define LOG_DBG(...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, LOG_DEFAULT_DEBUG, __VA_ARGS__) -// Main LOG macro. -// behaves like printf, and supports arguments the exact same way. -// -#if !defined(_MSC_VER) || defined(__clang__) - #define LOG(...) LOG_IMPL(__VA_ARGS__, "") -#else - #define LOG(str, ...) LOG_IMPL("%s" str, "", ##__VA_ARGS__, "") -#endif - -// Main TEE macro. -// does the same as LOG -// and -// simultaneously writes stderr. -// -// Secondary target can be changed just like LOG_TARGET -// by defining LOG_TEE_TARGET -// -#if !defined(_MSC_VER) || defined(__clang__) - #define LOG_TEE(...) LOG_TEE_IMPL(__VA_ARGS__, "") -#else - #define LOG_TEE(str, ...) LOG_TEE_IMPL("%s" str, "", ##__VA_ARGS__, "") -#endif - -// LOG macro variants with auto endline. -#if !defined(_MSC_VER) || defined(__clang__) - #define LOGLN(...) LOG_IMPL(__VA_ARGS__, "\n") - #define LOG_TEELN(...) LOG_TEE_IMPL(__VA_ARGS__, "\n") -#else - #define LOGLN(str, ...) LOG_IMPL("%s" str, "", ##__VA_ARGS__, "\n") - #define LOG_TEELN(str, ...) LOG_TEE_IMPL("%s" str, "", ##__VA_ARGS__, "\n") -#endif - -// INTERNAL, DO NOT USE -inline FILE *log_handler1_impl(bool change = false, LogTriState append = LogTriStateSame, LogTriState disable = LogTriStateSame, const std::string & filename = LOG_DEFAULT_FILE_NAME, FILE *target = nullptr) -{ - static bool _initialized = false; - static bool _append = false; - static bool _disabled = filename.empty() && target == nullptr; - static std::string log_current_filename{filename}; - static FILE *log_current_target{target}; - static FILE *logfile = nullptr; - - if (change) - { - if (append != LogTriStateSame) - { - _append = append == LogTriStateTrue; - return logfile; - } - - if (disable == LogTriStateTrue) - { - // Disable primary target - _disabled = true; - } - // If previously disabled, only enable, and keep previous target - else if (disable == LogTriStateFalse) - { - _disabled = false; - } - // Otherwise, process the arguments - else if (log_current_filename != filename || log_current_target != target) - { - _initialized = false; - } - } - - if (_disabled) - { - // Log is disabled - return nullptr; - } - - if (_initialized) - { - // with fallback in case something went wrong - return logfile ? logfile : stderr; - } - - // do the (re)initialization - if (target != nullptr) - { - if (logfile != nullptr && logfile != stdout && logfile != stderr) - { - fclose(logfile); - } - - log_current_filename = LOG_DEFAULT_FILE_NAME; - log_current_target = target; - - logfile = target; - } - else - { - if (log_current_filename != filename) - { - if (logfile != nullptr && logfile != stdout && logfile != stderr) - { - fclose(logfile); - } - } - - logfile = fopen(filename.c_str(), _append ? "a" : "w"); - } - - if (!logfile) - { - // Verify whether the file was opened, otherwise fallback to stderr - logfile = stderr; - - fprintf(stderr, "Failed to open logfile '%s' with error '%s'\n", filename.c_str(), std::strerror(errno)); - fflush(stderr); - - // At this point we let the init flag be to true below, and let the target fallback to stderr - // otherwise we would repeatedly fopen() which was already unsuccessful - } - - _initialized = true; - - return logfile ? 
logfile : stderr; -} - -// INTERNAL, DO NOT USE -inline FILE *log_handler2_impl(bool change = false, LogTriState append = LogTriStateSame, LogTriState disable = LogTriStateSame, FILE *target = nullptr, const std::string & filename = LOG_DEFAULT_FILE_NAME) -{ - return log_handler1_impl(change, append, disable, filename, target); -} - -// Disables logs entirely at runtime. -// Makes LOG() and LOG_TEE() produce no output, -// until enabled back. -#define log_disable() log_disable_impl() - -// INTERNAL, DO NOT USE -inline FILE *log_disable_impl() -{ - return log_handler1_impl(true, LogTriStateSame, LogTriStateTrue); -} - -// Enables logs at runtime. -#define log_enable() log_enable_impl() - -// INTERNAL, DO NOT USE -inline FILE *log_enable_impl() -{ - return log_handler1_impl(true, LogTriStateSame, LogTriStateFalse); -} - -// Sets target fir logs, either by a file name or FILE* pointer (stdout, stderr, or any valid FILE*) -#define log_set_target(target) log_set_target_impl(target) - -// INTERNAL, DO NOT USE -inline FILE *log_set_target_impl(const std::string & filename) { return log_handler1_impl(true, LogTriStateSame, LogTriStateSame, filename); } -inline FILE *log_set_target_impl(FILE *target) { return log_handler2_impl(true, LogTriStateSame, LogTriStateSame, target); } - -// INTERNAL, DO NOT USE -inline FILE *log_handler() { return log_handler1_impl(); } - -// Enable or disable creating separate log files for each run. -// can ONLY be invoked BEFORE first log use. -#define log_multilog(enable) log_filename_generator_impl((enable) ? LogTriStateTrue : LogTriStateFalse, "", "") -// Enable or disable append mode for log file. -// can ONLY be invoked BEFORE first log use. -#define log_append(enable) log_append_impl(enable) -// INTERNAL, DO NOT USE -inline FILE *log_append_impl(bool enable) -{ - return log_handler1_impl(true, enable ? LogTriStateTrue : LogTriStateFalse, LogTriStateSame); -} - -inline void log_test() -{ - log_disable(); - LOG("01 Hello World to nobody, because logs are disabled!\n"); - log_enable(); - LOG("02 Hello World to default output, which is \"%s\" ( Yaaay, arguments! 
)!\n", LOG_STRINGIZE(LOG_TARGET)); - LOG_TEE("03 Hello World to **both** default output and " LOG_TEE_TARGET_STRING "!\n"); - log_set_target(stderr); - LOG("04 Hello World to stderr!\n"); - LOG_TEE("05 Hello World TEE with double printing to stderr prevented!\n"); - log_set_target(LOG_DEFAULT_FILE_NAME); - LOG("06 Hello World to default log file!\n"); - log_set_target(stdout); - LOG("07 Hello World to stdout!\n"); - log_set_target(LOG_DEFAULT_FILE_NAME); - LOG("08 Hello World to default log file again!\n"); - log_disable(); - LOG("09 Hello World _1_ into the void!\n"); - log_enable(); - LOG("10 Hello World back from the void ( you should not see _1_ in the log or the output )!\n"); - log_disable(); - log_set_target("llama.anotherlog.log"); - LOG("11 Hello World _2_ to nobody, new target was selected but logs are still disabled!\n"); - log_enable(); - LOG("12 Hello World this time in a new file ( you should not see _2_ in the log or the output )?\n"); - log_set_target("llama.yetanotherlog.log"); - LOG("13 Hello World this time in yet new file?\n"); - log_set_target(log_filename_generator("llama_autonamed", "log")); - LOG("14 Hello World in log with generated filename!\n"); -#ifdef _MSC_VER - LOG_TEE("15 Hello msvc TEE without arguments\n"); - LOG_TEE("16 Hello msvc TEE with (%d)(%s) arguments\n", 1, "test"); - LOG_TEELN("17 Hello msvc TEELN without arguments\n"); - LOG_TEELN("18 Hello msvc TEELN with (%d)(%s) arguments\n", 1, "test"); - LOG("19 Hello msvc LOG without arguments\n"); - LOG("20 Hello msvc LOG with (%d)(%s) arguments\n", 1, "test"); - LOGLN("21 Hello msvc LOGLN without arguments\n"); - LOGLN("22 Hello msvc LOGLN with (%d)(%s) arguments\n", 1, "test"); -#endif -} - -inline bool log_param_single_parse(const std::string & param) -{ - if ( param == "--log-test") - { - log_test(); - return true; - } - - if ( param == "--log-disable") - { - log_disable(); - return true; - } - - if ( param == "--log-enable") - { - log_enable(); - return true; - } - - if (param == "--log-new") - { - log_multilog(true); - return true; - } - - if (param == "--log-append") - { - log_append(true); - return true; - } - - return false; -} - -inline bool log_param_pair_parse(bool check_but_dont_parse, const std::string & param, const std::string & next = std::string()) -{ - if ( param == "--log-file") - { - if (!check_but_dont_parse) - { - log_set_target(log_filename_generator(next.empty() ? "unnamed" : next, "log")); - } - - return true; - } - - return false; -} - -inline void log_print_usage() -{ - printf("log options:\n"); - /* format - printf(" -h, --help show this help message and exit\n");*/ - /* spacing - printf("__-param----------------Description\n");*/ - printf(" --log-test Run simple logging test\n"); - printf(" --log-disable Disable trace logs\n"); - printf(" --log-enable Enable trace logs\n"); - printf(" --log-file Specify a log filename (without extension)\n"); - printf(" --log-new Create a separate new log file on start. 
" - "Each log file will have unique name: \"..log\"\n"); - printf(" --log-append Don't truncate the old log file.\n"); - printf("\n"); -} - -#define log_dump_cmdline(argc, argv) log_dump_cmdline_impl(argc, argv) - -// INTERNAL, DO NOT USE -inline void log_dump_cmdline_impl(int argc, char **argv) -{ - std::stringstream buf; - for (int i = 0; i < argc; ++i) - { - if (std::string(argv[i]).find(' ') != std::string::npos) - { - buf << " \"" << argv[i] <<"\""; - } - else - { - buf << " " << argv[i]; - } - } - LOGLN("Cmd:%s", buf.str().c_str()); -} - -#define log_tostr(var) log_var_to_string_impl(var).c_str() - -inline std::string log_var_to_string_impl(bool var) -{ - return var ? "true" : "false"; -} - -inline std::string log_var_to_string_impl(std::string var) -{ - return var; -} - -inline std::string log_var_to_string_impl(const std::vector & var) -{ - std::stringstream buf; - buf << "[ "; - bool first = true; - for (auto e : var) - { - if (first) - { - first = false; - } - else - { - buf << ", "; - } - buf << std::to_string(e); - } - buf << " ]"; - - return buf.str(); -} - -template -inline std::string LOG_TOKENS_TOSTR_PRETTY(const C & ctx, const T & tokens) -{ - std::stringstream buf; - buf << "[ "; - - bool first = true; - for (const auto & token : tokens) - { - if (!first) { - buf << ", "; - } else { - first = false; - } - - auto detokenized = llama_token_to_piece(ctx, token); - - detokenized.erase( - std::remove_if( - detokenized.begin(), - detokenized.end(), - [](const unsigned char c) { return !std::isprint(c); }), - detokenized.end()); - - buf - << "'" << detokenized << "'" - << ":" << std::to_string(token); - } - buf << " ]"; - - return buf.str(); -} - -template -inline std::string LOG_BATCH_TOSTR_PRETTY(const C & ctx, const B & batch) -{ - std::stringstream buf; - buf << "[ "; - - bool first = true; - for (int i = 0; i < batch.n_tokens; ++i) - { - if (!first) { - buf << ", "; - } else { - first = false; - } - - auto detokenized = llama_token_to_piece(ctx, batch.token[i]); - - detokenized.erase( - std::remove_if( - detokenized.begin(), - detokenized.end(), - [](const unsigned char c) { return !std::isprint(c); }), - detokenized.end()); - - buf - << "\n" << std::to_string(i) - << ":token '" << detokenized << "'" - << ":pos " << std::to_string(batch.pos[i]) - << ":n_seq_id " << std::to_string(batch.n_seq_id[i]) - << ":seq_id " << std::to_string(batch.seq_id[i][0]) - << ":logits " << std::to_string(batch.logits[i]); - } - buf << " ]"; - - return buf.str(); -} - -#ifdef LOG_DISABLE_LOGS - -#undef LOG -#define LOG(...) // dummy stub -#undef LOGLN -#define LOGLN(...) // dummy stub - -#undef LOG_TEE -#define LOG_TEE(...) fprintf(stderr, __VA_ARGS__) // convert to normal fprintf - -#undef LOG_TEELN -#define LOG_TEELN(...) fprintf(stderr, __VA_ARGS__) // convert to normal fprintf - -#undef LOG_DISABLE -#define LOG_DISABLE() // dummy stub - -#undef LOG_ENABLE -#define LOG_ENABLE() // dummy stub - -#undef LOG_ENABLE -#define LOG_ENABLE() // dummy stub - -#undef LOG_SET_TARGET -#define LOG_SET_TARGET(...) // dummy stub - -#undef LOG_DUMP_CMDLINE -#define LOG_DUMP_CMDLINE(...) // dummy stub - -#endif // LOG_DISABLE_LOGS +#define LOG_INFV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_INFO, verbosity, __VA_ARGS__) +#define LOG_WRNV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_WARN, verbosity, __VA_ARGS__) +#define LOG_ERRV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, verbosity, __VA_ARGS__) +#define LOG_DBGV(verbosity, ...) 
LOG_TMPL(GGML_LOG_LEVEL_DEBUG, verbosity, __VA_ARGS__) diff --git a/common/ngram-cache.cpp b/common/ngram-cache.cpp index 3ca112ef1..7953c723e 100644 --- a/common/ngram-cache.cpp +++ b/common/ngram-cache.cpp @@ -2,8 +2,11 @@ #include "common.h" #include "log.h" +#include #include +#include #include +#include void llama_ngram_cache_update(llama_ngram_cache & ngram_cache, int ngram_min, int ngram_max, std::vector & inp, int nnew, bool print_progress) { diff --git a/common/sampling.cpp b/common/sampling.cpp index c07b5e940..e51d07611 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -325,7 +325,7 @@ llama_token gpt_sampler_last(const struct gpt_sampler * gsmpl) { } std::string gpt_sampler_print(const struct gpt_sampler * gsmpl) { - std::string result = "\tlogits "; + std::string result = "logits "; for (int i = 0; i < llama_sampler_chain_n(gsmpl->chain); i++) { const auto * smpl = llama_sampler_chain_get(gsmpl->chain, i); diff --git a/common/train.cpp b/common/train.cpp index fef1e57c9..661ad8382 100644 --- a/common/train.cpp +++ b/common/train.cpp @@ -1,9 +1,11 @@ #include "train.h" #include "common.h" +#include #include #include #include +#include struct random_normal_distribution { std::mt19937 gen; diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp index ec00fcf78..4a15941f1 100644 --- a/examples/batched-bench/batched-bench.cpp +++ b/examples/batched-bench/batched-bench.cpp @@ -1,5 +1,6 @@ #include "arg.h" #include "common.h" +#include "log.h" #include "llama.h" #include @@ -8,9 +9,9 @@ #include static void print_usage(int, char ** argv) { - LOG_TEE("\nexample usage:\n"); - LOG_TEE("\n %s -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]\n", argv[0]); - LOG_TEE("\n"); + LOG("\nexample usage:\n"); + LOG("\n %s -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]\n", argv[0]); + LOG("\n"); } int main(int argc, char ** argv) { @@ -20,6 +21,8 @@ int main(int argc, char ** argv) { return 1; } + gpt_init(); + int is_pp_shared = params.is_pp_shared; std::vector n_pp = params.n_pp; @@ -76,7 +79,7 @@ int main(int argc, char ** argv) { const int ret = llama_decode(ctx, batch_view); if (ret != 0) { - LOG_TEE("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret); + LOG_ERR("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret); return false; } @@ -93,17 +96,17 @@ int main(int argc, char ** argv) { } if (!decode_helper(ctx, batch, ctx_params.n_batch)) { - LOG_TEE("%s: llama_decode() failed\n", __func__); + LOG_ERR("%s: llama_decode() failed\n", __func__); return 1; } } if (!params.batched_bench_output_jsonl) { - LOG_TEE("\n"); - LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch); - LOG_TEE("\n"); - LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s"); - LOG_TEE("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------"); + LOG("\n"); + LOG("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads 
= %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch); + LOG("\n"); + LOG("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s"); + LOG("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------"); } for ( int i_pp = 0; i_pp < (int) n_pp.size(); ++i_pp) { @@ -133,7 +136,7 @@ int main(int argc, char ** argv) { llama_kv_cache_clear(ctx); if (!decode_helper(ctx, batch, ctx_params.n_batch)) { - LOG_TEE("%s: llama_decode() failed\n", __func__); + LOG_ERR("%s: llama_decode() failed\n", __func__); return 1; } @@ -155,7 +158,7 @@ int main(int argc, char ** argv) { } if (!decode_helper(ctx, batch, ctx_params.n_batch)) { - LOG_TEE("%s: llama_decode() failed\n", __func__); + LOG_ERR("%s: llama_decode() failed\n", __func__); return 1; } } @@ -173,20 +176,20 @@ int main(int argc, char ** argv) { const float speed = n_kv / t; if(params.batched_bench_output_jsonl) { - LOG_TEE( + LOG( "{\"n_kv_max\": %d, \"n_batch\": %d, \"n_ubatch\": %d, \"flash_attn\": %d, \"is_pp_shared\": %d, \"n_gpu_layers\": %d, \"n_threads\": %u, \"n_threads_batch\": %u, " "\"pp\": %d, \"tg\": %d, \"pl\": %d, \"n_kv\": %d, \"t_pp\": %f, \"speed_pp\": %f, \"t_tg\": %f, \"speed_tg\": %f, \"t\": %f, \"speed\": %f}\n", n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch, pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed ); } else { - LOG_TEE("|%6d | %6d | %4d | %6d | %8.3f | %8.2f | %8.3f | %8.2f | %8.3f | %8.2f |\n", pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed); + LOG("|%6d | %6d | %4d | %6d | %8.3f | %8.2f | %8.3f | %8.2f | %8.3f | %8.2f |\n", pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed); } } } } - LOG_TEE("\n"); + LOG("\n"); llama_perf_context_print(ctx); llama_batch_free(batch); @@ -196,7 +199,7 @@ int main(int argc, char ** argv) { llama_backend_free(); - fprintf(stderr, "\n\n"); + LOG("\n\n"); return 0; } diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp index f1df20c6e..7887a43d6 100644 --- a/examples/batched/batched.cpp +++ b/examples/batched/batched.cpp @@ -1,5 +1,6 @@ #include "arg.h" #include "common.h" +#include "log.h" #include "llama.h" #include @@ -8,9 +9,9 @@ #include static void print_usage(int, char ** argv) { - LOG_TEE("\nexample usage:\n"); - LOG_TEE("\n %s -m model.gguf -p \"Hello my name is\" -n 32 -np 4\n", argv[0]); - LOG_TEE("\n"); + LOG("\nexample usage:\n"); + LOG("\n %s -m model.gguf -p \"Hello my name is\" -n 32 -np 4\n", argv[0]); + LOG("\n"); } int main(int argc, char ** argv) { @@ -23,6 +24,7 @@ int main(int argc, char ** argv) { return 1; } + gpt_init(); // number of parallel batches int n_parallel = params.n_parallel; @@ -42,7 +44,7 @@ int main(int argc, char ** argv) { llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params); if (model == NULL) { - fprintf(stderr , "%s: error: unable to load model\n" , __func__); + LOG_ERR("%s: error: unable to load model\n" , __func__); return 1; } @@ -72,31 +74,29 @@ int main(int argc, char ** argv) { llama_sampler_chain_add(smpl, llama_sampler_init_dist (params.sparams.seed)); if (ctx == NULL) { - fprintf(stderr , "%s: error: failed 
to create the llama_context\n" , __func__); + LOG_ERR("%s: error: failed to create the llama_context\n" , __func__); return 1; } const int n_ctx = llama_n_ctx(ctx); - LOG_TEE("\n%s: n_predict = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req); + LOG_INF("\n%s: n_predict = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req); // make sure the KV cache is big enough to hold all the prompt and generated tokens if (n_kv_req > n_ctx) { - LOG_TEE("%s: error: n_kv_req (%d) > n_ctx, the required KV cache size is not big enough\n", __func__, n_kv_req); - LOG_TEE("%s: either reduce n_parallel or increase n_ctx\n", __func__); + LOG_ERR("%s: error: n_kv_req (%d) > n_ctx, the required KV cache size is not big enough\n", __func__, n_kv_req); + LOG_ERR("%s: either reduce n_parallel or increase n_ctx\n", __func__); return 1; } // print the prompt token-by-token - fprintf(stderr, "\n"); + LOG("\n"); for (auto id : tokens_list) { - fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str()); + LOG("%s", llama_token_to_piece(ctx, id).c_str()); } - fflush(stderr); - // create a llama_batch // we use this object to submit token data for decoding llama_batch batch = llama_batch_init(std::max(tokens_list.size(), (size_t) n_parallel), 0, n_parallel); @@ -114,7 +114,7 @@ int main(int argc, char ** argv) { if (llama_model_has_encoder(model)) { if (llama_encode(ctx, batch)) { - LOG_TEE("%s : failed to eval\n", __func__); + LOG_ERR("%s : failed to eval\n", __func__); return 1; } @@ -131,7 +131,7 @@ int main(int argc, char ** argv) { batch.logits[batch.n_tokens - 1] = true; if (llama_decode(ctx, batch) != 0) { - LOG_TEE("%s: llama_decode() failed\n", __func__); + LOG_ERR("%s: llama_decode() failed\n", __func__); return 1; } @@ -142,7 +142,7 @@ int main(int argc, char ** argv) { //} if (n_parallel > 1) { - LOG_TEE("\n\n%s: generating %d sequences ...\n", __func__, n_parallel); + LOG("\n\n%s: generating %d sequences ...\n", __func__, n_parallel); } // main loop @@ -175,9 +175,9 @@ int main(int argc, char ** argv) { // is it an end of generation? 
-> mark the stream as finished if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) { i_batch[i] = -1; - LOG_TEE("\n"); + LOG("\n"); if (n_parallel > 1) { - LOG_TEE("%s: stream %d finished at n_cur = %d", __func__, i, n_cur); + LOG_INF("%s: stream %d finished at n_cur = %d", __func__, i, n_cur); } continue; @@ -185,8 +185,7 @@ int main(int argc, char ** argv) { // if there is only one stream, we print immediately to stdout if (n_parallel == 1) { - LOG_TEE("%s", llama_token_to_piece(ctx, new_token_id).c_str()); - fflush(stdout); + LOG("%s", llama_token_to_piece(ctx, new_token_id).c_str()); } streams[i] += llama_token_to_piece(ctx, new_token_id); @@ -208,27 +207,25 @@ int main(int argc, char ** argv) { // evaluate the current batch with the transformer model if (llama_decode(ctx, batch)) { - fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1); + LOG_ERR("%s : failed to eval, return code %d\n", __func__, 1); return 1; } } - LOG_TEE("\n"); - if (n_parallel > 1) { - LOG_TEE("\n"); + LOG("\n"); for (int32_t i = 0; i < n_parallel; ++i) { - LOG_TEE("sequence %d:\n\n%s%s\n\n", i, params.prompt.c_str(), streams[i].c_str()); + LOG("sequence %d:\n\n%s%s\n\n", i, params.prompt.c_str(), streams[i].c_str()); } } const auto t_main_end = ggml_time_us(); - LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n", + LOG_INF("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n", __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f)); - LOG_TEE("\n"); + LOG("\n"); llama_perf_sampler_print(smpl); llama_perf_context_print(ctx); diff --git a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp index 8ca9f8915..ecff95f9a 100644 --- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -105,43 +106,43 @@ static void alloc_weights(TransformerWeights * w, const Config * p, bool shared_ const int n_multiqueries = p->n_kv_heads <= 0 || p->n_kv_heads >= p->n_heads ? 
1 : p->n_heads / p->n_kv_heads; try { w->token_embedding_table.resize(p->vocab_size * p->dim); - LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim); + LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim); w->rms_att_weight.resize(p->n_layers * p->dim); - LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_att_weight\n",__func__,p->n_layers, p->dim, p->n_layers * p->dim); + LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_att_weight\n",__func__,p->n_layers, p->dim, p->n_layers * p->dim); w->rms_ffn_weight.resize(p->n_layers * p->dim); - LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_ffn_weight\n",__func__,p->n_layers , p->dim, p->n_layers * p->dim); + LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_ffn_weight\n",__func__,p->n_layers , p->dim, p->n_layers * p->dim); w->wq.resize(p->n_layers * p->dim * p->dim); - LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wq\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim); + LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wq\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim); w->wk.resize(p->n_layers * p->dim * p->dim / n_multiqueries); - LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wk\n",__func__,p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries); + LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wk\n",__func__,p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries); w->wv.resize(p->n_layers * p->dim * p->dim / n_multiqueries); - LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wv\n",__func__, p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries); + LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wv\n",__func__, p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries); w->wo.resize(p->n_layers * p->dim * p->dim); - LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wo\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim); + LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wo\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim); w->w1.resize(p->n_layers * p->hidden_dim * p->dim); - LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w1\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim); + LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w1\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim); w->w2.resize(p->n_layers * p->hidden_dim * p->dim); - LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w2\n",__func__,p->n_layers, p->dim, p->hidden_dim, p->n_layers * p->hidden_dim * p->dim); + LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w2\n",__func__,p->n_layers, p->dim, p->hidden_dim, p->n_layers * p->hidden_dim * p->dim); w->w3.resize(p->n_layers * p->hidden_dim * p->dim); - LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w3\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim); + LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for 
w->w3\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim); w->rms_final_weight.resize(p->dim); - LOG("%s: Allocating [%d] float space for w->rms_final_weight\n",__func__,p->dim); + LOG_INF("%s: Allocating [%d] float space for w->rms_final_weight\n",__func__,p->dim); if (shared_weights) { w->wcls = {}; } else { w->wcls.resize(p->vocab_size * p->dim); - LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->wcls\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim); + LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->wcls\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim); } } catch (std::length_error &) { @@ -173,7 +174,7 @@ static int checkpoint_init_weights(TransformerWeights * w, const Config * p, FIL fseek(f, 0, SEEK_END); auto end = ftell(f); if (curr != end) { - LOG("%s: Error: failed to read the checkpoint file to the end (curr = %ld, end = %ld)\n", __func__, curr, end); + LOG_ERR("%s: Error: failed to read the checkpoint file to the end (curr = %ld, end = %ld)\n", __func__, curr, end); return 1; } @@ -181,20 +182,20 @@ static int checkpoint_init_weights(TransformerWeights * w, const Config * p, FIL } static void print_sample_weights(TransformerWeights *w){ - LOG("----- Quick print of first of the weight vales of all the variables\n"); - LOG("%f\n", w->token_embedding_table[0]); - LOG("%f\n", w->rms_att_weight[0]); - LOG("%f\n", w->rms_ffn_weight[0]); + LOG_INF("----- Quick print of first of the weight vales of all the variables\n"); + LOG_INF("%f\n", w->token_embedding_table[0]); + LOG_INF("%f\n", w->rms_att_weight[0]); + LOG_INF("%f\n", w->rms_ffn_weight[0]); - LOG("%f\n", w->wq[0]); - LOG("%f\n", w->wk[0]); - LOG("%f\n", w->wv[0]); - LOG("%f\n", w->wo[0]); - LOG("%f\n", w->w1[0]); - LOG("%f\n", w->w2[0]); - LOG("%f\n", w->w3[0]); - LOG("%f\n", w->rms_att_weight[0]); - if (!w->wcls.empty()) LOG("%f\n", w->wcls[0]); + LOG_INF("%f\n", w->wq[0]); + LOG_INF("%f\n", w->wk[0]); + LOG_INF("%f\n", w->wv[0]); + LOG_INF("%f\n", w->wo[0]); + LOG_INF("%f\n", w->w1[0]); + LOG_INF("%f\n", w->w2[0]); + LOG_INF("%f\n", w->w3[0]); + LOG_INF("%f\n", w->rms_att_weight[0]); + if (!w->wcls.empty()) LOG_INF("%f\n", w->wcls[0]); } //////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -318,20 +319,20 @@ struct train_params { }; static void print_params(struct my_llama_hparams * params) { - LOG("%s: n_vocab: %u\n", __func__, params->n_vocab); - LOG("%s: n_ctx: %u\n", __func__, params->n_ctx); - LOG("%s: n_embd: %u\n", __func__, params->n_embd); - LOG("%s: n_mult: %u\n", __func__, params->n_mult); - LOG("%s: n_head: %u\n", __func__, params->n_head); - LOG("%s: n_head_kv: %u\n", __func__, params->n_head_kv); - LOG("%s: n_ff: %u\n", __func__, params->n_ff); - LOG("%s: n_layer: %u\n", __func__, params->n_layer); - LOG("%s: n_rot: %u\n", __func__, params->n_rot); + LOG_INF("%s: n_vocab: %u\n", __func__, params->n_vocab); + LOG_INF("%s: n_ctx: %u\n", __func__, params->n_ctx); + LOG_INF("%s: n_embd: %u\n", __func__, params->n_embd); + LOG_INF("%s: n_mult: %u\n", __func__, params->n_mult); + LOG_INF("%s: n_head: %u\n", __func__, params->n_head); + LOG_INF("%s: n_head_kv: %u\n", __func__, params->n_head_kv); + LOG_INF("%s: n_ff: %u\n", __func__, params->n_ff); + LOG_INF("%s: n_layer: %u\n", __func__, params->n_layer); + LOG_INF("%s: n_rot: %u\n", __func__, params->n_rot); } static void print_tensor_info(const struct ggml_context * ctx) { for (auto t = ggml_get_first_tensor(ctx); t != NULL; t = 
ggml_get_next_tensor(ctx, t)) { - LOG("%s: Allocating ", __func__); + LOG_INF("%s: Allocating ", __func__); int64_t total = 1; int i = 0; for (; i < ggml_n_dims(t); ++i) { @@ -526,7 +527,7 @@ static std::string llama_escape_whitespaces(const std::string & text) { static void load_vocab(const char * filename, const Config * config, struct llama_vocab * vocab) { if (is_ggml_file(filename)) { - LOG("%s: Loading vocabulary from gguf file %s\n", __func__, filename); + LOG_INF("%s: Loading vocabulary from gguf file %s\n", __func__, filename); struct ggml_context * ctx_data = NULL; struct gguf_init_params params = { @@ -574,7 +575,7 @@ static void load_vocab(const char * filename, const Config * config, struct llam gguf_free(ctx); } else { // assume llama2.c vocabulary - LOG("%s: Assuming llama2.c vocabulary since %s is not a gguf file\n", __func__, filename); + LOG_INF("%s: Assuming llama2.c vocabulary since %s is not a gguf file\n", __func__, filename); llama_file file(filename, "rb"); if (!file.fp) { die_fmt("%s: %s", strerror(errno), filename); @@ -871,23 +872,25 @@ static std::string basename(const std::string &path) { } int main(int argc, char ** argv) { + gpt_init(); + struct train_params params = get_default_train_params(); if (!params_parse(argc, argv, ¶ms)) { return 1; } - log_set_target(stdout); + Config config; TransformerWeights weights = {}; { - LOG("%s: Loading llama2c model from %s\n", __func__, params.fn_llama2c_model); + LOG_INF("%s: Loading llama2c model from %s\n", __func__, params.fn_llama2c_model); FILE * file = fopen(params.fn_llama2c_model, "rb"); if (!file) { - LOG("%s: Unable to open the checkpoint file %s!\n", __func__, params.fn_llama2c_model); + LOG_ERR("%s: Unable to open the checkpoint file %s!\n", __func__, params.fn_llama2c_model); return 1; } // read in the config header if (fread(&config, sizeof(Config), 1, file) != 1) { - LOG("%s: Unable to read llama2c config from %s!\n",__func__,params.fn_llama2c_model); + LOG_ERR("%s: Unable to read llama2c config from %s!\n",__func__,params.fn_llama2c_model); return 1; } auto shared_weights = config.vocab_size > 0; @@ -896,7 +899,7 @@ int main(int argc, char ** argv) { // read in the Transformer weights alloc_weights(&weights, &config, shared_weights); if (checkpoint_init_weights(&weights, &config, file, shared_weights)) { - LOG("%s: Unable to initialize transformer weights from %s!",__func__,params.fn_llama2c_model); + LOG_ERR("%s: Unable to initialize transformer weights from %s!",__func__,params.fn_llama2c_model); return 1; } fclose(file); @@ -929,7 +932,7 @@ int main(int argc, char ** argv) { model.name = basename(params.fn_llama2c_model); save_as_llama_model(&vocab, &model, &weights, params.fn_llama2c_output_model); - LOG("%s: Saving llama.c model file %s in ggml format at %s\n", __func__, params.fn_llama2c_model, params.fn_llama2c_output_model); + LOG_INF("%s: Saving llama.c model file %s in ggml format at %s\n", __func__, params.fn_llama2c_model, params.fn_llama2c_output_model); ggml_free(model.ctx); return 0; diff --git a/examples/cvector-generator/cvector-generator.cpp b/examples/cvector-generator/cvector-generator.cpp index 569b6c38f..41bf4eb2a 100644 --- a/examples/cvector-generator/cvector-generator.cpp +++ b/examples/cvector-generator/cvector-generator.cpp @@ -13,14 +13,15 @@ #include "ggml-metal.h" #endif +#include +#include #include +#include +#include +#include #include #include #include -#include -#include -#include -#include ////////////////////////////////////////////////// diff --git 
a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index e94ae2955..a438dcb5a 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -1,5 +1,6 @@ #include "arg.h" #include "common.h" +#include "log.h" #include "llama.h" #include @@ -39,16 +40,16 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu llama_kv_cache_clear(ctx); // run model - fprintf(stderr, "%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq); + LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq); if (llama_model_has_encoder(model) && !llama_model_has_decoder(model)) { // encoder-only model if (llama_encode(ctx, batch) < 0) { - fprintf(stderr, "%s : failed to encode\n", __func__); + LOG_ERR("%s : failed to encode\n", __func__); } } else if (!llama_model_has_encoder(model) && llama_model_has_decoder(model)) { // decoder-only model if (llama_decode(ctx, batch) < 0) { - fprintf(stderr, "%s : failed to decode\n", __func__); + LOG_ERR("%s : failed to decode\n", __func__); } } @@ -84,12 +85,12 @@ int main(int argc, char ** argv) { return 1; } + gpt_init(); + params.embedding = true; // For non-causal models, batch size must be equal to ubatch size params.n_ubatch = params.n_batch; - print_build_info(); - llama_backend_init(); llama_numa_init(params.numa); @@ -99,7 +100,7 @@ int main(int argc, char ** argv) { llama_model * model = llama_init.model; llama_context * ctx = llama_init.context; if (model == NULL) { - fprintf(stderr, "%s: error: unable to load model\n", __func__); + LOG_ERR("%s: unable to load model\n", __func__); return 1; } @@ -109,19 +110,19 @@ int main(int argc, char ** argv) { const enum llama_pooling_type pooling_type = llama_pooling_type(ctx); if (llama_model_has_encoder(model) && llama_model_has_decoder(model)) { - fprintf(stderr, "%s: error: computing embeddings in encoder-decoder models is not supported\n", __func__); + LOG_ERR("%s: computing embeddings in encoder-decoder models is not supported\n", __func__); return 1; } if (n_ctx > n_ctx_train) { - fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n", + LOG_WRN("%s: warning: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, n_ctx); } // print system information { - fprintf(stderr, "\n"); - fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str()); + LOG_INF("\n"); + LOG_INF("%s\n", gpt_params_get_system_info(params).c_str()); } // split the prompt into lines @@ -136,7 +137,7 @@ int main(int argc, char ** argv) { for (const auto & prompt : prompts) { auto inp = ::llama_tokenize(ctx, prompt, true, false); if (inp.size() > n_batch) { - fprintf(stderr, "%s: error: number of tokens in input line (%lld) exceeds batch size (%lld), increase batch size and re-run\n", + LOG_ERR("%s: number of tokens in input line (%lld) exceeds batch size (%lld), increase batch size and re-run\n", __func__, (long long int) inp.size(), (long long int) n_batch); return 1; } @@ -147,20 +148,20 @@ int main(int argc, char ** argv) { // it should be automatically added by the tokenizer when 'tokenizer.ggml.add_eos_token' is set to 'true' for (auto & inp : inputs) { if (inp.empty() || inp.back() != llama_token_sep(model)) { - fprintf(stderr, "%s: warning: last token in the prompt is not SEP\n", __func__); - fprintf(stderr, "%s: 'tokenizer.ggml.add_eos_token' should be set to 'true' in the GGUF header\n", __func__); + LOG_WRN("%s: last token in the prompt is not SEP\n", __func__); + 
LOG_WRN("%s: 'tokenizer.ggml.add_eos_token' should be set to 'true' in the GGUF header\n", __func__); } } // tokenization stats if (params.verbose_prompt) { for (int i = 0; i < (int) inputs.size(); i++) { - fprintf(stderr, "%s: prompt %d: '%s'\n", __func__, i, prompts[i].c_str()); - fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, inputs[i].size()); + LOG_INF("%s: prompt %d: '%s'\n", __func__, i, prompts[i].c_str()); + LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, inputs[i].size()); for (int j = 0; j < (int) inputs[i].size(); j++) { - fprintf(stderr, "%6d -> '%s'\n", inputs[i][j], llama_token_to_piece(ctx, inputs[i][j]).c_str()); + LOG("%6d -> '%s'\n", inputs[i][j], llama_token_to_piece(ctx, inputs[i][j]).c_str()); } - fprintf(stderr, "\n\n"); + LOG("\n\n"); } } @@ -211,57 +212,57 @@ int main(int argc, char ** argv) { batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize); if (params.embd_out.empty()) { - fprintf(stdout, "\n"); + LOG("\n"); if (pooling_type == LLAMA_POOLING_TYPE_NONE) { for (int j = 0; j < n_embd_count; j++) { - fprintf(stdout, "embedding %d: ", j); + LOG("embedding %d: ", j); for (int i = 0; i < std::min(3, n_embd); i++) { if (params.embd_normalize == 0) { - fprintf(stdout, "%6.0f ", emb[j * n_embd + i]); + LOG("%6.0f ", emb[j * n_embd + i]); } else { - fprintf(stdout, "%9.6f ", emb[j * n_embd + i]); + LOG("%9.6f ", emb[j * n_embd + i]); } } - fprintf(stdout, " ... "); + LOG(" ... "); for (int i = n_embd - 3; i < n_embd; i++) { if (params.embd_normalize == 0) { - fprintf(stdout, "%6.0f ", emb[j * n_embd + i]); + LOG("%6.0f ", emb[j * n_embd + i]); } else { - fprintf(stdout, "%9.6f ", emb[j * n_embd + i]); + LOG("%9.6f ", emb[j * n_embd + i]); } } - fprintf(stdout, "\n"); + LOG("\n"); } } else { // print the first part of the embeddings or for a single prompt, the full embedding for (int j = 0; j < n_prompts; j++) { - fprintf(stdout, "embedding %d: ", j); + LOG("embedding %d: ", j); for (int i = 0; i < (n_prompts > 1 ? std::min(16, n_embd) : n_embd); i++) { if (params.embd_normalize == 0) { - fprintf(stdout, "%6.0f ", emb[j * n_embd + i]); + LOG("%6.0f ", emb[j * n_embd + i]); } else { - fprintf(stdout, "%9.6f ", emb[j * n_embd + i]); + LOG("%9.6f ", emb[j * n_embd + i]); } } - fprintf(stdout, "\n"); + LOG("\n"); } // print cosine similarity matrix if (n_prompts > 1) { - fprintf(stdout, "\n"); - printf("cosine similarity matrix:\n\n"); + LOG("\n"); + LOG("cosine similarity matrix:\n\n"); for (int i = 0; i < n_prompts; i++) { - fprintf(stdout, "%6.6s ", prompts[i].c_str()); + LOG("%6.6s ", prompts[i].c_str()); } - fprintf(stdout, "\n"); + LOG("\n"); for (int i = 0; i < n_prompts; i++) { for (int j = 0; j < n_prompts; j++) { float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd); - fprintf(stdout, "%6.2f ", sim); + LOG("%6.2f ", sim); } - fprintf(stdout, "%1.10s", prompts[i].c_str()); - fprintf(stdout, "\n"); + LOG("%1.10s", prompts[i].c_str()); + LOG("\n"); } } } @@ -270,42 +271,42 @@ int main(int argc, char ** argv) { if (params.embd_out == "json" || params.embd_out == "json+" || params.embd_out == "array") { const bool notArray = params.embd_out != "array"; - fprintf(stdout, notArray ? "{\n \"object\": \"list\",\n \"data\": [\n" : "["); + LOG(notArray ? 
"{\n \"object\": \"list\",\n \"data\": [\n" : "["); for (int j = 0;;) { // at least one iteration (one prompt) - if (notArray) fprintf(stdout, " {\n \"object\": \"embedding\",\n \"index\": %d,\n \"embedding\": ",j); - fprintf(stdout, "["); + if (notArray) LOG(" {\n \"object\": \"embedding\",\n \"index\": %d,\n \"embedding\": ",j); + LOG("["); for (int i = 0;;) { // at least one iteration (n_embd > 0) - fprintf(stdout, params.embd_normalize == 0 ? "%1.0f" : "%1.7f", emb[j * n_embd + i]); + LOG(params.embd_normalize == 0 ? "%1.0f" : "%1.7f", emb[j * n_embd + i]); i++; - if (i < n_embd) fprintf(stdout, ","); else break; + if (i < n_embd) LOG(","); else break; } - fprintf(stdout, notArray ? "]\n }" : "]"); + LOG(notArray ? "]\n }" : "]"); j++; - if (j < n_embd_count) fprintf(stdout, notArray ? ",\n" : ","); else break; + if (j < n_embd_count) LOG(notArray ? ",\n" : ","); else break; } - fprintf(stdout, notArray ? "\n ]" : "]\n"); + LOG(notArray ? "\n ]" : "]\n"); if (params.embd_out == "json+" && n_prompts > 1) { - fprintf(stdout, ",\n \"cosineSimilarity\": [\n"); + LOG(",\n \"cosineSimilarity\": [\n"); for (int i = 0;;) { // at least two iteration (n_embd_count > 1) - fprintf(stdout, " ["); + LOG(" ["); for (int j = 0;;) { // at least two iteration (n_embd_count > 1) float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd); - fprintf(stdout, "%6.2f", sim); + LOG("%6.2f", sim); j++; - if (j < n_embd_count) fprintf(stdout, ", "); else break; + if (j < n_embd_count) LOG(", "); else break; } - fprintf(stdout, " ]"); + LOG(" ]"); i++; - if (i < n_embd_count) fprintf(stdout, ",\n"); else break; + if (i < n_embd_count) LOG(",\n"); else break; } - fprintf(stdout, "\n ]"); + LOG("\n ]"); } - if (notArray) fprintf(stdout, "\n}\n"); + if (notArray) LOG("\n}\n"); } - LOG_TEE("\n"); + LOG("\n"); llama_perf_context_print(ctx); // clean up diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp index af389abe1..6d629fe4e 100644 --- a/examples/eval-callback/eval-callback.cpp +++ b/examples/eval-callback/eval-callback.cpp @@ -1,12 +1,11 @@ #include "arg.h" #include "common.h" +#include "log.h" #include "llama.h" #include "ggml.h" #include -#include #include -#include #include /** @@ -32,22 +31,22 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne GGML_ASSERT(n > 0); float sum = 0; for (int64_t i3 = 0; i3 < ne[3]; i3++) { - printf(" [\n"); + LOG(" [\n"); for (int64_t i2 = 0; i2 < ne[2]; i2++) { if (i2 == n && ne[2] > 2*n) { - printf(" ..., \n"); + LOG(" ..., \n"); i2 = ne[2] - n; } - printf(" [\n"); + LOG(" [\n"); for (int64_t i1 = 0; i1 < ne[1]; i1++) { if (i1 == n && ne[1] > 2*n) { - printf(" ..., \n"); + LOG(" ..., \n"); i1 = ne[1] - n; } - printf(" ["); + LOG(" ["); for (int64_t i0 = 0; i0 < ne[0]; i0++) { if (i0 == n && ne[0] > 2*n) { - printf("..., "); + LOG("..., "); i0 = ne[0] - n; } size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0]; @@ -65,16 +64,16 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne } else { GGML_ABORT("fatal error"); } - printf("%12.4f", v); + LOG("%12.4f", v); sum += v; - if (i0 < ne[0] - 1) printf(", "); + if (i0 < ne[0] - 1) LOG(", "); } - printf("],\n"); + LOG("],\n"); } - printf(" ],\n"); + LOG(" ],\n"); } - printf(" ]\n"); - printf(" sum = %f\n", sum); + LOG(" ]\n"); + LOG(" sum = %f\n", sum); } } @@ -103,11 +102,11 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) { snprintf(src1_str, sizeof(src1_str), 
"%s{%s}", src1->name, ggml_ne_string(src1).c_str()); } - printf("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__, - t->name, ggml_type_name(t->type), ggml_op_desc(t), - src0->name, ggml_ne_string(src0).c_str(), - src1 ? src1_str : "", - ggml_ne_string(t).c_str()); + LOG("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__, + t->name, ggml_type_name(t->type), ggml_op_desc(t), + src0->name, ggml_ne_string(src0).c_str(), + src1 ? src1_str : "", + ggml_ne_string(t).c_str()); // copy the data from the GPU memory if needed @@ -133,7 +132,7 @@ static bool run(llama_context * ctx, const gpt_params & params) { std::vector tokens = ::llama_tokenize(ctx, params.prompt, add_bos); if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) { - fprintf(stderr, "%s : failed to eval\n", __func__); + LOG_ERR("%s : failed to eval\n", __func__); return false; } @@ -149,7 +148,7 @@ int main(int argc, char ** argv) { return 1; } - print_build_info(); + gpt_init(); llama_backend_init(); llama_numa_init(params.numa); @@ -166,14 +165,15 @@ int main(int argc, char ** argv) { llama_model * model = llama_init.model; llama_context * ctx = llama_init.context; if (model == nullptr || ctx == nullptr) { - fprintf(stderr, "%s : failed to init\n", __func__); + LOG_ERR("%s : failed to init\n", __func__); return 1; } // print system information { - fprintf(stderr, "\n"); - fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str()); + LOG_INF("\n"); + LOG_INF("%s\n", gpt_params_get_system_info(params).c_str()); + LOG_INF("\n"); } bool OK = run(ctx, params); @@ -181,7 +181,7 @@ int main(int argc, char ** argv) { return 1; } - LOG_TEE("\n"); + LOG("\n"); llama_perf_context_print(ctx); llama_free(ctx); diff --git a/examples/export-lora/export-lora.cpp b/examples/export-lora/export-lora.cpp index 90126ad1e..0051a5eb6 100644 --- a/examples/export-lora/export-lora.cpp +++ b/examples/export-lora/export-lora.cpp @@ -406,7 +406,7 @@ int main(int argc, char ** argv) { return 1; } - g_verbose = (params.verbosity == 1); + g_verbose = (params.verbosity > 1); try { lora_merge_ctx ctx(params.model, params.lora_adapters, params.lora_outfile, params.cpuparams.n_threads); ctx.run_merge(); diff --git a/examples/gritlm/gritlm.cpp b/examples/gritlm/gritlm.cpp index 14c715202..20b99a4fd 100644 --- a/examples/gritlm/gritlm.cpp +++ b/examples/gritlm/gritlm.cpp @@ -158,6 +158,8 @@ int main(int argc, char * argv[]) { return 1; } + gpt_init(); + llama_model_params mparams = llama_model_params_from_gpt_params(params); llama_context_params cparams = llama_context_params_from_gpt_params(params); diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp index 73b54da7f..265281699 100644 --- a/examples/imatrix/imatrix.cpp +++ b/examples/imatrix/imatrix.cpp @@ -1,5 +1,6 @@ #include "arg.h" #include "common.h" +#include "log.h" #include "llama.h" #include @@ -19,12 +20,12 @@ #endif static void print_usage(int, char ** argv) { - LOG_TEE("\nexample usage:\n"); - LOG_TEE("\n %s \\\n" - " -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] [--verbosity 1] \\\n" + LOG("\nexample usage:\n"); + LOG("\n %s \\\n" + " -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] \\\n" " [--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \\\n" " [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...]\n" , argv[0]); - LOG_TEE("\n"); + LOG("\n"); } struct Stats { @@ -125,12 +126,10 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * 
e.counts.resize(src1->ne[0]*n_as, 0); } else if (e.values.size() != (size_t)src1->ne[0]*n_as) { - fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as); + LOG_ERR("%s: inconsistent size for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as); exit(1); //GGML_ABORT("fatal error"); } - if (m_params.verbosity > 1) { - printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type); - } + LOG_DBGV(2, "%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type); // loop over all possible experts, regardless if they are used or not in the batch for (int ex = 0; ex < n_as; ++ex) { size_t e_start = ex*src1->ne[0]; @@ -151,7 +150,8 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * e.values[e_start + j] += x[j]*x[j]; e.counts[e_start + j]++; if (!std::isfinite(e.values[e_start + j])) { - fprintf(stderr, "%f detected in %s\n", e.values[e_start + j], wname.c_str()); + LOG("\n"); + LOG_ERR("%f detected in %s\n", e.values[e_start + j], wname.c_str()); exit(1); } } @@ -174,20 +174,18 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * e.counts.resize(src1->ne[0], 0); } else if (e.values.size() != (size_t)src1->ne[0]) { - fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]); + LOG_ERR("%s: inconsistent size for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.values.size(), (int)src1->ne[0]); exit(1); //GGML_ABORT("fatal error"); } ++e.ncall; - if (m_params.verbosity > 1) { - printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type); - } + LOG_DBGV(2, "%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type); for (int row = 0; row < (int)src1->ne[1]; ++row) { const float * x = data + row * src1->ne[0]; for (int j = 0; j < (int)src1->ne[0]; ++j) { e.values[j] += x[j]*x[j]; e.counts[j]++; if (!std::isfinite(e.values[j])) { - fprintf(stderr, "%f detected in %s\n", e.values[j], wname.c_str()); + LOG_ERR("%f detected in %s\n", e.values[j], wname.c_str()); exit(1); } } @@ -239,17 +237,17 @@ void IMatrixCollector::save_imatrix(int ncall) const { } if (n_zeros != 0 && is_first) { - fprintf(stderr, "\n"); + LOG_INF("\n"); is_first = false; } if (n_zeros == n_all) { - fprintf(stderr, "%s: entry '%40s' has no data - skipping\n", __func__, kv.first.c_str()); + LOG_WRN("%s: entry '%40s' has no data - skipping\n", __func__, kv.first.c_str()); continue; } if (n_zeros > 0) { - fprintf(stderr, "%s: entry '%40s' has partial data (%.2f%%) - skipping\n", __func__, kv.first.c_str(), 100.0f * (n_all - n_zeros) / n_all); + LOG_WRN("%s: entry '%40s' has partial data (%.2f%%) - skipping\n", __func__, kv.first.c_str(), 100.0f * (n_all - n_zeros) / n_all); continue; } @@ -258,7 +256,7 @@ void IMatrixCollector::save_imatrix(int ncall) const { } if (to_store.size() < m_stats.size()) { - fprintf(stderr, "%s: warning: storing only %zu out of %zu entries\n", __func__, to_store.size(), m_stats.size()); + LOG_WRN("%s: storing only %zu out of %zu entries\n", __func__, to_store.size(), m_stats.size()); } std::ofstream out(fname, std::ios::binary); 
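The hunks above replace call-site verbosity checks such as "if (m_params.verbosity > 1) { printf(...); }" with the verbosity-suffixed macros introduced earlier (LOG_DBGV, LOGV), which take the required verbosity as their first argument. Below is a minimal, self-contained sketch of that gating pattern; the EX_-prefixed names are hypothetical stand-ins, not the actual common/log.h implementation:

    // Sketch only: EX_LOG_DBGV and ex_log_verbosity are hypothetical names.
    // The macro compares the message verbosity against a global threshold,
    // so call sites stay one-liners.
    #include <cstdio>

    static int ex_log_verbosity = 1; // assumed global threshold

    #define EX_LOG_DBGV(verbosity, ...) \
        do { \
            if ((verbosity) <= ex_log_verbosity) { \
                fprintf(stderr, __VA_ARGS__); \
            } \
        } while (0)

    int main() {
        EX_LOG_DBGV(1, "printed at the default threshold\n");
        EX_LOG_DBGV(2, "suppressed unless ex_log_verbosity is raised to 2\n");
        return 0;
    }

Keeping the threshold check inside the macro is what lets the patch delete the per-call verbosity conditionals while still gating the same messages.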
@@ -290,21 +288,20 @@ void IMatrixCollector::save_imatrix(int ncall) const { out.write(m_params.prompt_file.c_str(), len); } - if (m_params.verbosity > 0) { - fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname.c_str()); - } + LOGV(1, "\n"); + LOG_DBGV(1, "%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname.c_str()); } bool IMatrixCollector::load_imatrix(const char * fname) { std::ifstream in(fname, std::ios::binary); if (!in) { - printf("%s: failed to open %s\n",__func__, fname); + LOG_ERR("%s: failed to open %s\n",__func__, fname); return false; } int n_entries; in.read((char*)&n_entries, sizeof(n_entries)); if (in.fail() || n_entries < 1) { - printf("%s: no data in file %s\n", __func__, fname); + LOG_ERR("%s: no data in file %s\n", __func__, fname); return false; } for (int i = 0; i < n_entries; ++i) { @@ -312,7 +309,7 @@ bool IMatrixCollector::load_imatrix(const char * fname) { std::vector name_as_vec(len+1); in.read((char *)name_as_vec.data(), len); if (in.fail()) { - printf("%s: failed reading name for entry %d from %s\n",__func__,i+1, fname); + LOG_ERR("%s: failed reading name for entry %d from %s\n",__func__,i+1, fname); return false; } name_as_vec[len] = 0; @@ -323,7 +320,7 @@ bool IMatrixCollector::load_imatrix(const char * fname) { int nval; in.read((char *)&nval, sizeof(nval)); if (in.fail() || nval < 1) { - printf("%s: failed reading number of values for entry %d\n",__func__,i); + LOG_ERR("%s: failed reading number of values for entry %d\n",__func__,i); m_stats = {}; return false; } @@ -336,7 +333,7 @@ bool IMatrixCollector::load_imatrix(const char * fname) { std::vector tmp(nval); in.read((char*)tmp.data(), nval*sizeof(float)); if (in.fail()) { - printf("%s: failed reading data for entry %d\n",__func__,i); + LOG_ERR("%s: failed reading data for entry %d\n",__func__,i); m_stats = {}; return false; } @@ -437,26 +434,25 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) { const int n_ctx = llama_n_ctx(ctx); auto tim1 = std::chrono::high_resolution_clock::now(); - fprintf(stderr, "%s: tokenizing the input ..\n", __func__); + LOG_INF("%s: tokenizing the input ..\n", __func__); std::vector tokens = ::llama_tokenize(ctx, params.prompt, true); auto tim2 = std::chrono::high_resolution_clock::now(); - fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast(tim2-tim1).count()); + LOG_INF("%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast(tim2-tim1).count()); if (params.i_chunk > 0) { if (size_t((params.i_chunk + 2)*n_ctx) >= tokens.size()) { - fprintf(stderr, "%s: there will be not enough tokens left after removing %d chunks\n", __func__, params.i_chunk); + LOG_ERR("%s: there will be not enough tokens left after removing %d chunks\n", __func__, params.i_chunk); return false; } - fprintf(stderr, "%s: removing initial %d chunks (%d tokens)\n", __func__, params.i_chunk, params.i_chunk*n_ctx); + LOG_INF("%s: removing initial %d chunks (%d tokens)\n", __func__, params.i_chunk, params.i_chunk*n_ctx); tokens.erase(tokens.begin(), tokens.begin() + params.i_chunk*n_ctx); } if (int(tokens.size()) < 2*n_ctx) { - fprintf(stderr, "%s: you need at least %d tokens for a context of %d tokens\n",__func__,2*n_ctx, - n_ctx); - fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size()); + LOG_ERR("%s: you need at least %d tokens for a context of %d tokens\n", __func__, 2*n_ctx, n_ctx); + LOG_ERR("%s: the 
data file you provided tokenizes to only %zu tokens\n", __func__, tokens.size()); return false; } @@ -478,7 +474,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) { double nll = 0.0; double nll2 = 0.0; - fprintf(stderr, "%s: computing over %d chunks with batch_size %d\n", __func__, n_chunk, n_batch); + LOG_INF("%s: computing over %d chunks with batch_size %d\n", __func__, n_chunk, n_batch); std::vector workers(std::thread::hardware_concurrency() - 1); @@ -514,7 +510,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) { // TODO: use batch.logits to save computations instead of relying on logits_all == true if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) { - fprintf(stderr, "%s : failed to eval\n", __func__); + LOG_ERR("%s : failed to eval\n", __func__); return false; } @@ -531,29 +527,29 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) { if (i == 0) { const float t_total = std::chrono::duration(t_end - t_start).count(); - fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total); + LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total); int total_seconds = (int)(t_total * n_chunk); if (total_seconds >= 60*60) { - fprintf(stderr, "%d hours ", total_seconds / (60*60)); + LOG("%d hours ", total_seconds / (60*60)); total_seconds = total_seconds % (60*60); } - fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0); + LOG("%.2f minutes\n", total_seconds / 60.0); } if (params.compute_ppl) { const int first = n_ctx/2; - const auto all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx); + const auto * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx); process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first, workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first); count += n_ctx - first - 1; - printf("[%d]%.4lf,", i + 1, std::exp(nll / count)); + LOG("[%d]%.4lf,", i + 1, std::exp(nll / count)); fflush(stdout); logits.clear(); } } - printf("\n"); + LOG("\n"); if (params.compute_ppl) { nll2 /= count; @@ -562,9 +558,9 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) { nll2 -= nll * nll; if (nll2 > 0) { nll2 = sqrt(nll2/(count-1)); - printf("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl); + LOG("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl); } else { - printf("Unexpected negative standard deviation of log(prob)\n"); + LOG("Unexpected negative standard deviation of log(prob)\n"); } } @@ -576,26 +572,27 @@ int main(int argc, char ** argv) { params.n_ctx = 512; params.logits_all = true; - params.verbosity = 1; if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_IMATRIX, print_usage)) { return 1; } + gpt_init(); + params.n_batch = std::min(params.n_batch, params.n_ctx); g_collector.set_params(params); for (const auto & in_file : params.in_files) { - printf("%s : loading imatrix from '%s'\n", __func__, in_file.c_str()); + LOG_INF("%s : loading imatrix from '%s'\n", __func__, in_file.c_str()); if (!g_collector.load_imatrix(in_file.c_str())) { - fprintf(stderr, "%s : failed to load %s\n", __func__, in_file.c_str()); + LOG_ERR("%s : failed to load %s\n", __func__, in_file.c_str()); return 1; } } if (params.in_files.size() > 1) { - printf("%s : saving combined imatrix to '%s'\n", __func__, params.out_file.c_str()); + LOG_INF("%s : saving combined imatrix to '%s'\n", __func__, 
params.out_file.c_str()); g_collector.save_imatrix(); } @@ -614,20 +611,20 @@ int main(int argc, char ** argv) { llama_model * model = llama_init.model; llama_context * ctx = llama_init.context; if (model == nullptr || ctx == nullptr) { - fprintf(stderr, "%s : failed to init\n", __func__); + LOG_ERR("%s : failed to init\n", __func__); return 1; } const int n_ctx_train = llama_n_ctx_train(model); if (params.n_ctx > n_ctx_train) { - fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n", + LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, params.n_ctx); } // print system information { - fprintf(stderr, "\n"); - fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str()); + LOG_INF("\n"); + LOG_INF("%s\n", gpt_params_get_system_info(params).c_str()); } if (!compute_imatrix(ctx, params)) { @@ -636,7 +633,7 @@ int main(int argc, char ** argv) { g_collector.save_imatrix(); - LOG_TEE("\n"); + LOG("\n"); llama_perf_context_print(ctx); llama_free(ctx); diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp index 7e252ce09..b77b876cc 100644 --- a/examples/infill/infill.cpp +++ b/examples/infill/infill.cpp @@ -2,6 +2,7 @@ #include "common.h" #include "console.h" #include "sampling.h" +#include "log.h" #include "llama.h" #include @@ -55,7 +56,7 @@ static void write_logfile( const bool success = fs_create_directory_with_parents(params.logdir); if (!success) { - fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n", + LOG_ERR("%s: warning: failed to create logdir %s, cannot write logfile\n", __func__, params.logdir.c_str()); return; } @@ -64,7 +65,7 @@ static void write_logfile( FILE * logfile = fopen(logfile_path.c_str(), "w"); if (logfile == NULL) { - fprintf(stderr, "%s: failed to open logfile %s\n", __func__, logfile_path.c_str()); + LOG_ERR("%s: failed to open logfile %s\n", __func__, logfile_path.c_str()); return; } @@ -93,7 +94,7 @@ static void sigint_handler(int signo) { is_interacting = true; } else { console::cleanup(); - printf("\n"); + LOG("\n"); gpt_perf_print(*g_ctx, *g_smpl); write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens); _exit(130); @@ -110,56 +111,51 @@ int main(int argc, char ** argv) { return 1; } - auto & sparams = params.sparams; + gpt_init(); -#ifndef LOG_DISABLE_LOGS - log_set_target(log_filename_generator("infill", "log")); - LOG_TEE("Log start\n"); - log_dump_cmdline(argc, argv); -#endif // LOG_DISABLE_LOGS + auto & sparams = params.sparams; console::init(params.simple_io, params.use_color); atexit([]() { console::cleanup(); }); if (params.logits_all) { - printf("\n************\n"); - printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__); - printf("************\n\n"); + LOG_ERR("\n************\n"); + LOG_ERR("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__); + LOG_ERR("************\n\n"); return 0; } if (params.embedding) { - printf("\n************\n"); - printf("%s: please use the 'embedding' tool for embedding calculations\n", __func__); - printf("************\n\n"); + LOG_ERR("\n************\n"); + LOG_ERR("%s: please use the 'embedding' tool for embedding calculations\n", __func__); + LOG_ERR("************\n\n"); return 0; } if (params.n_ctx != 0 && params.n_ctx < 8) { - LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__); + LOG_WRN("%s: minimum context size is 8, using minimum size.\n", __func__); 
params.n_ctx = 8; } + if (!params.interactive_first && (params.input_prefix.empty() && params.input_suffix.empty())) { - printf("\n************\n"); - printf("%s: please use '--interactive_first' or specify '--in_prefix' and/or '--in_suffix'\n", __func__); - printf("************\n\n"); + LOG_ERR("\n************\n"); + LOG_ERR("%s: please use '--interactive_first' or specify '--in_prefix' and/or '--in_suffix'\n", __func__); + LOG_ERR("************\n\n"); return 0; } if (params.rope_freq_base != 0.0) { - LOG_TEE("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base); + LOG_WRN("%s: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base); } if (params.rope_freq_scale != 0.0) { - LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale); + LOG_WRN("%s: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale); } - print_build_info(); - - LOG("%s: llama backend init\n", __func__); + LOG_INF("%s: llama backend init\n", __func__); llama_backend_init(); llama_numa_init(params.numa); @@ -172,34 +168,32 @@ int main(int argc, char ** argv) { g_smpl = &smpl; // load the model and apply lora adapter, if any - LOG("%s: load the model and apply lora adapter, if any\n", __func__); + LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__); llama_init_result llama_init = llama_init_from_gpt_params(params); model = llama_init.model; ctx = llama_init.context; if (model == NULL) { - LOG_TEE("%s: error: unable to load model\n", __func__); + LOG_ERR("%s: unable to load model\n", __func__); return 1; } const int n_ctx_train = llama_n_ctx_train(model); const int n_ctx = llama_n_ctx(ctx); - LOG("n_ctx: %d\n", n_ctx); + LOG_DBG("n_ctx: %d\n", n_ctx); if (n_ctx > n_ctx_train) { - LOG_TEE("%s: warning: model was trained on only %d context tokens (%d specified)\n", - __func__, n_ctx_train, n_ctx); + LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, n_ctx); } // print system information { - LOG_TEE("\n"); - LOG_TEE("%s\n", gpt_params_get_system_info(params).c_str()); + LOG_INF("\n"); + LOG_INF("%s\n", gpt_params_get_system_info(params).c_str()); } const bool add_bos = llama_add_bos_token(model); GGML_ASSERT(!llama_add_eos_token(model)); - LOG("add_bos: %d\n", add_bos); std::vector embd_inp; std::vector embd_end; @@ -224,18 +218,19 @@ int main(int argc, char ** argv) { embd_inp.push_back(middle_token); } - LOG("prefix: \"%s\"\n", log_tostr(params.input_prefix)); - LOG("suffix: \"%s\"\n", log_tostr(params.input_suffix)); - LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str()); + LOG_DBG("add_bos: %d\n", add_bos); + LOG_DBG("prefix: \"%s\"\n", params.input_prefix.c_str()); + LOG_DBG("suffix: \"%s\"\n", params.input_suffix.c_str()); + LOG_DBG("tokens: %s\n", string_from(ctx, embd_inp).c_str()); // Should not run without any tokens if (embd_inp.empty()) { embd_inp.push_back(llama_token_bos(model)); - LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str()); + LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str()); } if ((int) embd_inp.size() > n_ctx - 4) { - LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4); + LOG_ERR("%s: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4); return 1; } @@ -244,9 +239,8 @@ int main(int argc, char ** argv) { params.n_keep = 
(int)embd_inp.size(); } - LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx).c_str()); - LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx).c_str()); - + LOG_INF("inp_pfx: %s\n", string_from(ctx, inp_pfx).c_str()); + LOG_INF("inp_sfx: %s\n", string_from(ctx, inp_sfx).c_str()); // enable interactive mode if interactive start is specified if (params.interactive_first) { @@ -254,21 +248,21 @@ int main(int argc, char ** argv) { } if (params.verbose_prompt) { - LOG_TEE("\n"); - LOG_TEE("%s: prompt: '%s'\n", __func__, params.prompt.c_str()); - LOG_TEE("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); + LOG_INF("\n"); + LOG_INF("%s: prompt: '%s'\n", __func__, params.prompt.c_str()); + LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); for (int i = 0; i < (int) embd_inp.size(); i++) { - LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str()); + LOG_INF("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str()); } if (params.n_keep > 0) { - LOG_TEE("%s: static prompt based on n_keep: '", __func__); + LOG_INF("%s: static prompt based on n_keep: '", __func__); for (int i = 0; i < params.n_keep; i++) { - LOG_TEE("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str()); + LOG("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str()); } - LOG_TEE("'\n"); + LOG("'\n"); } - LOG_TEE("\n"); + LOG_INF("\n"); } if (params.interactive) { @@ -285,28 +279,30 @@ int main(int argc, char ** argv) { SetConsoleCtrlHandler(reinterpret_cast(console_ctrl_handler), true); #endif - LOG_TEE("%s: interactive mode on.\n", __func__); + LOG_INF("%s: interactive mode on.\n", __func__); if (params.input_prefix_bos) { - LOG_TEE("Input prefix with BOS\n"); + LOG_INF("Input prefix with BOS\n"); } if (!params.input_prefix.empty()) { - LOG_TEE("Input prefix: '%s'\n", params.input_prefix.c_str()); + LOG_INF("Input prefix: '%s'\n", params.input_prefix.c_str()); } if (!params.input_suffix.empty()) { - LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str()); + LOG_INF("Input suffix: '%s'\n", params.input_suffix.c_str()); } } smpl = gpt_sampler_init(model, sparams); - LOG_TEE("sampling seed: %u\n", gpt_sampler_get_seed(smpl)); - LOG_TEE("sampling: \n%s\n", sparams.print().c_str()); - LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep); - LOG_TEE("\n\n"); + LOG_INF("sampler seed: %u\n", gpt_sampler_get_seed(smpl)); + LOG_INF("sampler params: \n%s\n", sparams.print().c_str()); + LOG_INF("sampler chain: %s\n", gpt_sampler_print(smpl).c_str()); - LOG_TEE("\n##### Infill mode #####\n\n"); + LOG_INF("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep); + + LOG("\n"); + LOG("\n##### Infill mode #####\n\n"); if (params.interactive) { const char *control_message; if (params.multiline_input) { @@ -317,11 +313,11 @@ int main(int argc, char ** argv) { " - To return control without starting a new line, end your input with '/'.\n" " - If you want to submit another line, end your input with '\\'.\n"; } - LOG_TEE("== Running in interactive mode. ==\n"); + LOG("== Running in interactive mode. 
==\n"); #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) - LOG_TEE( " - Press Ctrl+C to interject at any time.\n"); + LOG( " - Press Ctrl+C to interject at any time.\n"); #endif - LOG_TEE( "%s\n", control_message); + LOG( "%s\n", control_message); is_interacting = params.interactive_first; } @@ -354,9 +350,8 @@ int main(int argc, char ** argv) { embd.resize(max_embd_size); console::set_display(console::error); - printf("<>", skipped_tokens, skipped_tokens != 1 ? "s" : ""); + LOG_WRN("<>", skipped_tokens, skipped_tokens != 1 ? "s" : ""); console::set_display(console::reset); - fflush(stdout); } // infinite text generation via context swapping @@ -365,14 +360,14 @@ int main(int argc, char ** argv) { // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches if (n_past + (int) embd.size() > n_ctx) { if (params.n_predict == -2) { - LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict); + LOG_DBG("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict); break; } const int n_left = n_past - params.n_keep - 1; const int n_discard = n_left/2; - LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n", + LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n", n_past, n_left, n_ctx, params.n_keep, n_discard); llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1); @@ -380,9 +375,9 @@ int main(int argc, char ** argv) { n_past -= n_discard; - LOG("after swap: n_past = %d\n", n_past); + LOG_DBG("after swap: n_past = %d\n", n_past); - LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str()); + LOG_DBG("embd: %s\n", string_from(ctx, embd).c_str()); } @@ -394,16 +389,16 @@ int main(int argc, char ** argv) { n_eval = params.n_batch; } - LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str()); + LOG_DBG("eval: %s\n", string_from(ctx, embd).c_str()); if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) { - LOG_TEE("%s : failed to eval\n", __func__); + LOG_ERR("%s : failed to eval\n", __func__); return 1; } n_past += n_eval; - LOG("n_past = %d\n", n_past); + LOG_DBG("n_past = %d\n", n_past); } } @@ -415,7 +410,7 @@ int main(int argc, char ** argv) { gpt_sampler_accept(smpl, id, true); - // LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, smpl->prev.to_vector()).c_str()); + // LOG_DBG("last: %s\n", string_from(ctx, smpl->prev.to_vector()).c_str()); embd.push_back(id); @@ -425,10 +420,10 @@ int main(int argc, char ** argv) { // decrement remaining sampling budget --n_remain; - LOG("n_remain: %d\n", n_remain); + LOG_DBG("n_remain: %d\n", n_remain); } else { // some user input remains from prompt or interaction, forward it to processing - LOG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed); + LOG_DBG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed); while ((int) embd_inp.size() > n_consumed) { embd.push_back(embd_inp[n_consumed]); @@ -447,7 +442,7 @@ int main(int argc, char ** argv) { if (input_echo) { for (auto id : embd) { const std::string token_str = llama_token_to_piece(ctx, id); - printf("%s", token_str.c_str()); + LOG("%s", token_str.c_str()); if (embd.size() > 1) { input_tokens.push_back(id); @@ -456,7 +451,6 @@ int main(int argc, char ** argv) { output_ss << token_str; } } - fflush(stdout); } // reset color to default if we there is no pending user input if (input_echo && 
(int) embd_inp.size() == n_consumed) { @@ -469,10 +463,9 @@ int main(int argc, char ** argv) { if ((gpt_sampler_last(smpl) == llama_token_eot(model) || is_interacting) && params.interactive){ if (is_interacting && !params.interactive_first) { // print an eot token - printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str()); + LOG("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str()); } - fflush(stdout); - printf("\n"); + LOG("\n"); console::set_display(console::user_input); std::string buffer; std::string line; @@ -528,35 +521,33 @@ int main(int argc, char ** argv) { n_remain = params.n_predict; n_past = 0; n_consumed = 0; - // LOG_TEE("took new input\n"); is_interacting = false; } // deal with end of generation tokens in interactive mode else if (llama_token_is_eog(model, gpt_sampler_last(smpl))) { - LOG("found EOS token\n"); + LOG_DBG("found EOS token\n"); if (params.interactive) { is_interacting = true; - printf("\n"); + LOG("\n"); console::set_display(console::user_input); - fflush(stdout); } } if (n_past > 0 && is_interacting && !params.interactive) { - LOG("waiting for user input\n"); + LOG_DBG("waiting for user input\n"); if (params.input_prefix_bos) { - LOG("adding input prefix BOS token\n"); + LOG_DBG("adding input prefix BOS token\n"); embd_inp.push_back(llama_token_bos(model)); } std::string buffer; if (!params.input_prefix.empty()) { - LOG("appending input prefix: '%s'\n", params.input_prefix.c_str()); + LOG_DBG("appending input prefix: '%s'\n", params.input_prefix.c_str()); buffer += params.input_prefix; - printf("%s", buffer.c_str()); + LOG("%s", buffer.c_str()); } std::string line; @@ -574,17 +565,17 @@ int main(int argc, char ** argv) { if (buffer.length() > 1) { // append input suffix if any if (!params.input_suffix.empty()) { - LOG("appending input suffix: '%s'\n", params.input_suffix.c_str()); + LOG_DBG("appending input suffix: '%s'\n", params.input_suffix.c_str()); buffer += params.input_suffix; - printf("%s", params.input_suffix.c_str()); + LOG("%s", params.input_suffix.c_str()); } - LOG("buffer: '%s'\n", buffer.c_str()); + LOG_DBG("buffer: '%s'\n", buffer.c_str()); const size_t original_size = embd_inp.size(); const auto line_inp = ::llama_tokenize(ctx, buffer, false); - LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str()); + LOG_DBG("input tokens: %s\n", string_from(ctx, line_inp).c_str()); embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end()); @@ -595,9 +586,9 @@ int main(int argc, char ** argv) { } n_remain -= line_inp.size(); - LOG("n_remain: %d\n", n_remain); + LOG_DBG("n_remain: %d\n", n_remain); } else { - LOG("empty line, passing control back\n"); + LOG_DBG("empty line, passing control back\n"); } input_echo = false; // do not echo this again @@ -624,11 +615,10 @@ int main(int argc, char ** argv) { } } if (!params.interactive && n_remain <= 0) { - printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str()); - fflush(stdout); + LOG("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str()); } - LOG_TEE("\n"); + LOG("\n"); gpt_perf_print(ctx, smpl); write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens); @@ -638,9 +628,5 @@ int main(int argc, char ** argv) { gpt_sampler_free(smpl); llama_backend_free(); -#ifndef LOG_DISABLE_LOGS - LOG_TEE("Log end\n"); -#endif // LOG_DISABLE_LOGS - return 0; } diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index 5dfb333d1..8aa7b0750 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -3,7 +3,6 @@ 
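/*
 * The local LOG_INF/LOG_WRN/LOG_ERR/LOG_DBG shims added just below wrap fprintf
 * in do { ... } while (0). A minimal sketch of why that idiom is used (plain
 * C/C++ only; the macro names here are illustrative, nothing codebase-specific
 * is assumed):
 *
 *     #define LOG_BRACES(...)  { fprintf(stderr, __VA_ARGS__); }             // braces only
 *     #define LOG_DOWHILE(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
 *
 *     if (err)
 *         LOG_BRACES("fail\n");   // expands to "{ ... };" - the stray ';' breaks a following else
 *     else
 *         ...
 *
 *     if (err)
 *         LOG_DOWHILE("fail\n");  // expands to a single statement, so if/else chains stay valid
 *     else
 *         ...
 */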
// I'll gradually clean and extend it // Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch #include "clip.h" -#include "log.h" #include "ggml.h" #include "ggml-alloc.h" #include "ggml-backend.h" @@ -40,6 +39,11 @@ #include #include +#define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0) +#define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0) +#define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0) +#define LOG_DBG(...) do { fprintf(stderr, __VA_ARGS__); } while (0) + //#define CLIP_DEBUG_FUNCTIONS // RGB uint8 image @@ -165,7 +169,7 @@ static std::map PROJECTOR_TYPE_NAMES = { static int get_key_idx(const gguf_context * ctx, const char * key) { int i = gguf_find_key(ctx, key); if (i == -1) { - LOG_TEE("key %s not found in file\n", key); + LOG_ERR("key %s not found in file\n", key); throw std::runtime_error(format("Missing required key: %s", key)); } @@ -270,7 +274,7 @@ static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) { static void print_tensor_info(const ggml_tensor * tensor, const char * prefix = "") { size_t tensor_size = ggml_nbytes(tensor); - LOG_TEE("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "], type = %s\n", + LOG_INF("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "], type = %s\n", prefix, ggml_n_dims(tensor), tensor->name, tensor_size, tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], ggml_type_name(tensor->type)); } @@ -288,7 +292,7 @@ static projector_type clip_projector_type_from_string(const std::string & name) static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::string& filename) { std::ofstream file(filename, std::ios::binary); if (!file.is_open()) { - LOG_TEE("Failed to open file for writing: %s\n", filename.c_str()); + LOG_ERR("Failed to open file for writing: %s\n", filename.c_str()); return; } @@ -307,7 +311,7 @@ static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::s static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& filename) { std::ofstream file(filename, std::ios::binary); if (!file.is_open()) { - LOG_TEE("Failed to open file for writing: %s\n", filename.c_str()); + LOG_ERR("Failed to open file for writing: %s\n", filename.c_str()); return; } @@ -568,7 +572,7 @@ struct clip_ctx { static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) { if (!ctx->has_vision_encoder) { - LOG_TEE("This gguf file seems to have no vision encoder\n"); + LOG_ERR("This gguf file seems to have no vision encoder\n"); return nullptr; } @@ -582,7 +586,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 if (load_image_size == nullptr) { load_image_size = clip_image_size_init(); } - LOG_TEE("%s: %d %d\n", __func__, load_image_size->width, load_image_size->height); + LOG_DBG("%s: %d %d\n", __func__, load_image_size->width, load_image_size->height); image_size_width = load_image_size->width; image_size_height = load_image_size->height; if (is_inf) { @@ -1047,21 +1051,21 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { const int idx_name = gguf_find_key(ctx, KEY_NAME); if (idx_name != -1) { // make name optional temporarily as some of the uploaded 
models missing it due to a bug const std::string name = gguf_get_val_str(ctx, idx_name); - LOG_TEE("%s: model name: %s\n", __func__, name.c_str()); + LOG_INF("%s: model name: %s\n", __func__, name.c_str()); } - LOG_TEE("%s: description: %s\n", __func__, description.c_str()); - LOG_TEE("%s: GGUF version: %d\n", __func__, gguf_get_version(ctx)); - LOG_TEE("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx)); - LOG_TEE("%s: n_tensors: %d\n", __func__, n_tensors); - LOG_TEE("%s: n_kv: %d\n", __func__, n_kv); - LOG_TEE("%s: ftype: %s\n", __func__, ftype_str.c_str()); - LOG_TEE("\n"); + LOG_INF("%s: description: %s\n", __func__, description.c_str()); + LOG_INF("%s: GGUF version: %d\n", __func__, gguf_get_version(ctx)); + LOG_INF("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx)); + LOG_INF("%s: n_tensors: %d\n", __func__, n_tensors); + LOG_INF("%s: n_kv: %d\n", __func__, n_kv); + LOG_INF("%s: ftype: %s\n", __func__, ftype_str.c_str()); + LOG_INF("\n"); } const int n_tensors = gguf_get_n_tensors(ctx); // kv const int n_kv = gguf_get_n_kv(ctx); - LOG_TEE("%s: loaded meta data with %d key-value pairs and %d tensors from %s\n", + LOG_INF("%s: loaded meta data with %d key-value pairs and %d tensors from %s\n", __func__, n_kv, n_tensors, fname); { std::map n_type; @@ -1072,7 +1076,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { n_type[type]++; } - LOG_TEE("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__); + LOG_INF("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__); for (int i = 0; i < n_kv; i++) { const char * name = gguf_get_key(ctx, i); const enum gguf_type type = gguf_get_kv_type(ctx, i); @@ -1088,7 +1092,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { } replace_all(value, "\n", "\\n"); - LOG_TEE("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str()); + LOG_INF("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str()); } // print type counts @@ -1097,7 +1101,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { continue; } - LOG_TEE("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second); + LOG_INF("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second); } } @@ -1112,7 +1116,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { size_t tensor_size = ggml_nbytes(cur); model_size += tensor_size; if (verbosity >= 3) { - LOG_TEE("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n", + LOG_INF("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n", __func__, i, ggml_n_dims(cur), cur->name, tensor_size, offset, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3], ggml_type_name(type)); } } @@ -1139,27 +1143,27 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { #ifdef GGML_USE_CUDA new_clip->backend = ggml_backend_cuda_init(0); - LOG_TEE("%s: CLIP using CUDA backend\n", __func__); + LOG_INF("%s: CLIP using CUDA backend\n", __func__); #endif #ifdef GGML_USE_METAL new_clip->backend = ggml_backend_metal_init(); - LOG_TEE("%s: CLIP using Metal backend\n", __func__); + LOG_INF("%s: CLIP using Metal backend\n", __func__); #endif #ifdef GGML_USE_CANN 
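// note: when several GGML_USE_* backends are compiled in, the last assignment in this
// chain wins; if none is enabled (or its init returns null), the code falls back to
// ggml_backend_cpu_init() a few lines below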
new_clip->backend = ggml_backend_cann_init(0); - LOG_TEE("%s: CLIP using CANN backend\n", __func__); + LOG_INF("%s: CLIP using CANN backend\n", __func__); #endif #ifdef GGML_USE_VULKAN new_clip->backend = ggml_backend_vk_init(0); - LOG_TEE("%s: CLIP using Vulkan backend\n", __func__); + LOG_INF("%s: CLIP using Vulkan backend\n", __func__); #endif if (!new_clip->backend) { new_clip->backend = ggml_backend_cpu_init(); - LOG_TEE("%s: CLIP using CPU backend\n", __func__); + LOG_INF("%s: CLIP using CPU backend\n", __func__); } // model size and capabilities @@ -1194,16 +1198,16 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { new_clip->use_gelu = gguf_get_val_bool(ctx, idx); if (verbosity >= 1) { - LOG_TEE("%s: text_encoder: %d\n", __func__, new_clip->has_text_encoder); - LOG_TEE("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder); - LOG_TEE("%s: llava_projector: %d\n", __func__, new_clip->has_llava_projector); - LOG_TEE("%s: minicpmv_projector: %d\n", __func__, new_clip->has_minicpmv_projector); - LOG_TEE("%s: model size: %.2f MB\n", __func__, model_size / 1024.0 / 1024.0); - LOG_TEE("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0); + LOG_INF("%s: text_encoder: %d\n", __func__, new_clip->has_text_encoder); + LOG_INF("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder); + LOG_INF("%s: llava_projector: %d\n", __func__, new_clip->has_llava_projector); + LOG_INF("%s: minicpmv_projector: %d\n", __func__, new_clip->has_minicpmv_projector); + LOG_INF("%s: model size: %.2f MB\n", __func__, model_size / 1024.0 / 1024.0); + LOG_INF("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0); } } - LOG_TEE("%s: params backend buffer size = % 6.2f MB (%i tensors)\n", __func__, model_size / (1024.0 * 1024.0), n_tensors); + LOG_INF("%s: params backend buffer size = % 6.2f MB (%i tensors)\n", __func__, model_size / (1024.0 * 1024.0), n_tensors); // load tensors { @@ -1216,7 +1220,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { new_clip->ctx_data = ggml_init(params); if (!new_clip->ctx_data) { - LOG_TEE("%s: ggml_init() failed\n", __func__); + LOG_ERR("%s: ggml_init() failed\n", __func__); clip_free(new_clip); gguf_free(ctx); return nullptr; @@ -1224,7 +1228,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { auto fin = std::ifstream(fname, std::ios::binary); if (!fin) { - LOG_TEE("cannot open model file for loading tensors\n"); + LOG_ERR("cannot open model file for loading tensors\n"); clip_free(new_clip); gguf_free(ctx); return nullptr; @@ -1246,7 +1250,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i); fin.seekg(offset, std::ios::beg); if (!fin) { - LOG_TEE("%s: failed to seek for tensor %s\n", __func__, name); + LOG_ERR("%s: failed to seek for tensor %s\n", __func__, name); clip_free(new_clip); gguf_free(ctx); return nullptr; @@ -1317,23 +1321,23 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { } if (verbosity >= 2) { - LOG_TEE("\n%s: vision model hparams\n", __func__); - LOG_TEE("image_size %d\n", hparams.image_size); - LOG_TEE("patch_size %d\n", hparams.patch_size); - LOG_TEE("v_hidden_size %d\n", hparams.hidden_size); - LOG_TEE("v_n_intermediate %d\n", hparams.n_intermediate); - LOG_TEE("v_projection_dim %d\n", hparams.projection_dim); - LOG_TEE("v_n_head 
%d\n", hparams.n_head); - LOG_TEE("v_n_layer %d\n", hparams.n_layer); - LOG_TEE("v_eps %f\n", hparams.eps); - LOG_TEE("v_image_mean %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]); - LOG_TEE("v_image_std %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]); - LOG_TEE("v_image_grid_pinpoints: "); + LOG_INF("\n%s: vision model hparams\n", __func__); + LOG_INF("image_size %d\n", hparams.image_size); + LOG_INF("patch_size %d\n", hparams.patch_size); + LOG_INF("v_hidden_size %d\n", hparams.hidden_size); + LOG_INF("v_n_intermediate %d\n", hparams.n_intermediate); + LOG_INF("v_projection_dim %d\n", hparams.projection_dim); + LOG_INF("v_n_head %d\n", hparams.n_head); + LOG_INF("v_n_layer %d\n", hparams.n_layer); + LOG_INF("v_eps %f\n", hparams.eps); + LOG_INF("v_image_mean %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]); + LOG_INF("v_image_std %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]); + LOG_INF("v_image_grid_pinpoints: "); for (int i = 0; i < 32 && (hparams.image_grid_pinpoints[i] != 0); ++i) { - LOG_TEE("%d ", hparams.image_grid_pinpoints[i]); + LOG_INF("%d ", hparams.image_grid_pinpoints[i]); } - LOG_TEE("\n"); - LOG_TEE("v_mm_patch_merge_type: %s\n", hparams.mm_patch_merge_type); + LOG_INF("\n"); + LOG_INF("v_mm_patch_merge_type: %s\n", hparams.mm_patch_merge_type); } @@ -1371,7 +1375,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { vision_model.patch_embeddings = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD); vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v")); } catch(const std::exception& /*e*/) { - LOG_TEE("%s: failed to load vision model tensors\n", __func__); + LOG_ERR("%s: failed to load vision model tensors\n", __func__); } // LLaVA projection @@ -1400,7 +1404,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { } catch (std::runtime_error & /*e*/) { } try { vision_model.image_newline = get_tensor(new_clip->ctx_data, TN_IMAGE_NEWLINE); - // LOG_TEE("%s: image_newline tensor (llava-1.6) found\n", __func__); + // LOG_INF("%s: image_newline tensor (llava-1.6) found\n", __func__); } catch (std::runtime_error & /*e*/) { } } else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) { // MobileVLM projection @@ -1501,7 +1505,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch, nullptr, false); ggml_gallocr_reserve(new_clip->compute_alloc, gf); size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0); - LOG_TEE("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0); + LOG_INF("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0); } return new_clip; @@ -1552,7 +1556,7 @@ bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) { int nx, ny, nc; auto * data = stbi_load(fname, &nx, &ny, &nc, 3); if (!data) { - LOG_TEE("%s: failed to load image '%s'\n", __func__, fname); + LOG_ERR("%s: failed to load image '%s'\n", __func__, fname); return false; } build_clip_img_from_data(data, nx, ny, img); @@ -1564,7 +1568,7 @@ bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length int nx, ny, nc; auto * data = stbi_load_from_memory(bytes, bytes_length, &nx, &ny, &nc, 3); if (!data) { - LOG_TEE("%s: failed to 
decode image bytes\n", __func__); + LOG_ERR("%s: failed to decode image bytes\n", __func__); return false; } build_clip_img_from_data(data, nx, ny, img); @@ -1754,7 +1758,7 @@ static std::pair select_best_resolution(const std::pair & or int downscaled_height = static_cast(original_height * scale); int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height); int wasted_resolution = (width * height) - effective_resolution; - // LOG_TEE("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution); + // LOG_INF("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution); if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) { max_effective_resolution = effective_resolution; min_wasted_resolution = wasted_resolution; @@ -1872,7 +1876,7 @@ static std::vector> uhd_slice_image(const clip_imag const int multiple = fmin(ceil(ratio), max_slice_nums); std::vector> images; - LOG_TEE("%s: multiple %d\n", __func__, multiple); + LOG_INF("%s: multiple %d\n", __func__, multiple); images.push_back(std::vector()); if (multiple <= 1) { @@ -1887,17 +1891,17 @@ static std::vector> uhd_slice_image(const clip_imag clip_image_u8 * source_image = clip_image_u8_init(); bicubic_resize(*img, *source_image, best_size.first, best_size.second); // source_image = image.copy().resize(best_resize, Image.Resampling.BICUBIC) - LOG_TEE("%s: image_size: %d %d; source_image size: %d %d\n", __func__, img->nx, img->ny, best_size.first, best_size.second); + LOG_INF("%s: image_size: %d %d; source_image size: %d %d\n", __func__, img->nx, img->ny, best_size.first, best_size.second); images[images.size()-1].push_back(source_image); std::pair best_grid = uhd_best_grid(max_slice_nums, multiple, log_ratio); - LOG_TEE("%s: image_size: %d %d; best_grid: %d %d\n", __func__, img->nx, img->ny, best_grid.first, best_grid.second); + LOG_INF("%s: image_size: %d %d; best_grid: %d %d\n", __func__, img->nx, img->ny, best_grid.first, best_grid.second); auto refine_size = uhd_get_refine_size(original_size, best_grid, scale_resolution, patch_size, true); clip_image_u8 * refine_image = clip_image_u8_init(); bicubic_resize(*img, *refine_image, refine_size.first, refine_size.second); - LOG_TEE("%s: refine_image_size: %d %d; refine_size: %d %d\n", __func__, refine_image->nx, refine_image->ny, refine_size.first, refine_size.second); + LOG_INF("%s: refine_image_size: %d %d; refine_size: %d %d\n", __func__, refine_image->nx, refine_image->ny, refine_size.first, refine_size.second); // split_to_patches int width = refine_image->nx; @@ -1954,7 +1958,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli int idx = 0; for (size_t i = 0; i < imgs.size(); ++i) { for (size_t j = 0; j < imgs[i].size(); ++j) { - LOG_TEE("%s: %d %d\n", __func__,imgs[i][j]->nx,imgs[i][j]->ny); + LOG_DBG("%s: %d %d\n", __func__,imgs[i][j]->nx,imgs[i][j]->ny); clip_image_f32 * res = clip_image_f32_init(); normalize_image_u8_to_f32(imgs[i][j], res, ctx->image_mean, ctx->image_std); res_imgs->data[idx++] = *res; @@ -1966,7 +1970,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli bool pad_to_square = true; if (!ctx->has_vision_encoder) { - 
LOG_TEE("This gguf file seems to have no vision encoder\n"); + LOG_ERR("This gguf file seems to have no vision encoder\n"); return false; } auto & params = ctx->vision_model.hparams; @@ -2043,7 +2047,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli } for (size_t i = 0; i < patches.size(); i++) { - // LOG_TEE("patch %d: %d %d\n", i, patches[i]->nx, patches[i]->ny); + // LOG_DBG("patch %d: %d %d\n", i, patches[i]->nx, patches[i]->ny); clip_image_u8_free(patches[i]); } @@ -2279,7 +2283,7 @@ static std::vector> get_2d_sincos_pos_embed(int embed_dim, co bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) { if (!ctx->has_vision_encoder) { - LOG_TEE("This gguf file seems to have no vision encoder\n"); + LOG_ERR("This gguf file seems to have no vision encoder\n"); return false; } @@ -2291,7 +2295,7 @@ bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f3 bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec) { if (!ctx->has_vision_encoder) { - LOG_TEE("This gguf file seems to have no vision encoder\n"); + LOG_ERR("This gguf file seems to have no vision encoder\n"); return false; } @@ -2521,7 +2525,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i new_type = type; if (new_type >= GGML_TYPE_Q2_K && name.find("embd") != std::string::npos) { new_type = GGML_TYPE_Q8_0; // ggml_get_rows needs non K type - // LOG_TEE("%s: quantizing %s to %s\n", __func__, name.c_str(), ggml_type_name(new_type)); + // LOG_ERR("%s: quantizing %s to %s\n", __func__, name.c_str(), ggml_type_name(new_type)); } const size_t n_elms = ggml_nelements(cur); float * f32_data; @@ -2540,7 +2544,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i f32_data = (float *)conv_buf.data(); break; default: - LOG_TEE("Please use an input file in f32 or f16\n"); + LOG_ERR("Please use an input file in f32 or f16\n"); gguf_free(ctx_out); return false; } @@ -2567,7 +2571,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i fout.put(0); } - LOG_TEE("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), ggml_n_dims(cur), quantize, + LOG_INF("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), ggml_n_dims(cur), quantize, orig_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0); } @@ -2583,8 +2587,8 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i gguf_free(ctx_out); { - LOG_TEE("%s: original size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0); - LOG_TEE("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0); + LOG_INF("%s: original size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0); + LOG_INF("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0); } return true; diff --git a/examples/llava/llava-cli.cpp b/examples/llava/llava-cli.cpp index 12fe7345f..8f437863f 100644 --- a/examples/llava/llava-cli.cpp +++ b/examples/llava/llava-cli.cpp @@ -10,6 +10,7 @@ #include #include +#include #include static bool eval_tokens(struct llama_context * ctx_llama, std::vector tokens, int n_batch, int * n_past) { @@ -20,7 +21,7 @@ static bool eval_tokens(struct llama_context * ctx_llama, std::vector%s\n", __func__, IMG_BASE64_TAG_BEGIN, IMG_BASE64_TAG_END); + LOG_ERR("%s: invalid base64 image tag. 
must be %s%s\n", __func__, IMG_BASE64_TAG_BEGIN, IMG_BASE64_TAG_END); return NULL; } @@ -89,7 +90,7 @@ static llava_image_embed * llava_image_embed_make_with_prompt_base64(struct clip auto embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, img_bytes.data(), img_bytes.size()); if (!embed) { - LOG_TEE("%s: could not load image from base64 string.\n", __func__); + LOG_ERR("%s: could not load image from base64 string.\n", __func__); return NULL; } @@ -114,9 +115,9 @@ struct llava_context { }; static void print_usage(int, char ** argv) { - LOG_TEE("\n example usage:\n"); - LOG_TEE("\n %s -m --mmproj --image --image [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]); - LOG_TEE("\n note: a lower temperature value like 0.1 is recommended for better quality.\n"); + LOG("\n example usage:\n"); + LOG("\n %s -m --mmproj --image --image [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]); + LOG("\n note: a lower temperature value like 0.1 is recommended for better quality.\n"); } static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_params * params, const std::string & fname) { @@ -126,11 +127,11 @@ static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_para auto prompt = params->prompt; if (prompt_contains_image(prompt)) { if (!params->image.empty()) { - LOG_TEE("using base64 encoded image instead of command line image path\n"); + LOG_INF("using base64 encoded image instead of command line image path\n"); } embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->cpuparams.n_threads, prompt); if (!embed) { - LOG_TEE("%s: can't load image from prompt\n", __func__); + LOG_ERR("%s: can't load image from prompt\n", __func__); return NULL; } params->prompt = remove_image_from_prompt(prompt); @@ -156,18 +157,18 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_ // new templating mode: Provide the full prompt including system message and use as a placeholder for the image system_prompt = prompt.substr(0, image_pos); user_prompt = prompt.substr(image_pos + std::string("").length()); - LOG_TEE("system_prompt: %s\n", system_prompt.c_str()); + LOG_INF("system_prompt: %s\n", system_prompt.c_str()); if (params->verbose_prompt) { auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, system_prompt, true, true); for (int i = 0; i < (int) tmp.size(); i++) { - LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str()); + LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str()); } } - LOG_TEE("user_prompt: %s\n", user_prompt.c_str()); + LOG_INF("user_prompt: %s\n", user_prompt.c_str()); if (params->verbose_prompt) { auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true); for (int i = 0; i < (int) tmp.size(); i++) { - LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str()); + LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str()); } } } else { @@ -177,7 +178,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_ if (params->verbose_prompt) { auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true); for (int i = 0; i < (int) tmp.size(); i++) { - LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str()); + LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str()); } } } @@ -188,11 +189,11 @@ static void 
process_prompt(struct llava_context * ctx_llava, struct llava_image_ // generate the response - LOG_TEE("\n"); + LOG("\n"); struct gpt_sampler * smpl = gpt_sampler_init(ctx_llava->model, params->sparams); if (!smpl) { - fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__); + LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__); exit(1); } @@ -202,7 +203,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_ response += tmp; if (strcmp(tmp, "") == 0) break; if (strstr(tmp, "###")) break; // Yi-VL behavior - printf("%s", tmp); + LOG("%s", tmp); if (strstr(response.c_str(), "<|im_end|>")) break; // Yi-34B llava-1.6 - for some reason those decode not as the correct token (tokenizer works) if (strstr(response.c_str(), "<|im_start|>")) break; // Yi-34B llava-1.6 if (strstr(response.c_str(), "USER:")) break; // mistral llava-1.6 @@ -211,7 +212,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_ } gpt_sampler_free(smpl); - printf("\n"); + LOG("\n"); } static struct llama_model * llava_init(gpt_params * params) { @@ -222,7 +223,7 @@ static struct llama_model * llava_init(gpt_params * params) { llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params); if (model == NULL) { - LOG_TEE("%s: error: unable to load model\n" , __func__); + LOG_ERR("%s: unable to load model\n" , __func__); return NULL; } return model; @@ -245,11 +246,11 @@ static struct llava_context * llava_init_context(gpt_params * params, llama_mode llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params); if (ctx_llama == NULL) { - LOG_TEE("%s: error: failed to create the llama_context\n" , __func__); + LOG_ERR("%s: failed to create the llama_context\n" , __func__); return NULL; } - auto ctx_llava = (struct llava_context *)malloc(sizeof(llava_context)); + auto * ctx_llava = (struct llava_context *)malloc(sizeof(llava_context)); ctx_llava->ctx_llama = ctx_llama; ctx_llava->ctx_clip = ctx_clip; @@ -268,12 +269,6 @@ static void llava_free(struct llava_context * ctx_llava) { llama_backend_free(); } -static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) { - (void) level; - (void) user_data; - LOG_TEE("%s", text); -} - int main(int argc, char ** argv) { ggml_time_init(); @@ -283,27 +278,23 @@ int main(int argc, char ** argv) { return 1; } -#ifndef LOG_DISABLE_LOGS - log_set_target(log_filename_generator("llava", "log")); - LOG_TEE("Log start\n"); - log_dump_cmdline(argc, argv); - llama_log_set(llama_log_callback_logTee, nullptr); -#endif // LOG_DISABLE_LOGS + gpt_init(); if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) { print_usage(argc, argv); return 1; } - auto model = llava_init(¶ms); + + auto * model = llava_init(¶ms); if (model == NULL) { fprintf(stderr, "%s: error: failed to init llava model\n", __func__); return 1; } if (prompt_contains_image(params.prompt)) { - auto ctx_llava = llava_init_context(¶ms, model); + auto * ctx_llava = llava_init_context(¶ms, model); - auto image_embed = load_image(ctx_llava, ¶ms, ""); + auto * image_embed = load_image(ctx_llava, ¶ms, ""); // process the prompt process_prompt(ctx_llava, image_embed, ¶ms, params.prompt); @@ -314,11 +305,11 @@ int main(int argc, char ** argv) { llava_free(ctx_llava); } else { for (auto & image : params.image) { - auto ctx_llava = llava_init_context(¶ms, model); + auto * ctx_llava = llava_init_context(¶ms, model); - auto image_embed = 
load_image(ctx_llava, ¶ms, image); + auto * image_embed = load_image(ctx_llava, ¶ms, image); if (!image_embed) { - std::cerr << "error: failed to load image " << image << ". Terminating\n\n"; + LOG_ERR("%s: failed to load image %s. Terminating\n\n", __func__, image.c_str()); return 1; } diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp index e162586ed..8558c6bdc 100644 --- a/examples/llava/llava.cpp +++ b/examples/llava/llava.cpp @@ -1,13 +1,23 @@ #include "clip.h" -#include "common.h" -#include "llama.h" #include "llava.h" -#include "base64.hpp" +#include "llama.h" + +#include +#include #include #include +#include +#include #include -#include + +#define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0) +#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0) + +#define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0) +#define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0) +#define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0) +#define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0) // RGB uint8 image struct clip_image_u8 { @@ -54,7 +64,7 @@ static std::pair select_best_resolution(const std::pair& ori int downscaled_height = static_cast(original_height * scale); int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height); int wasted_resolution = (width * height) - effective_resolution; - // LOG_TEE("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution); + // LOG_DBG("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution); if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) { max_effective_resolution = effective_resolution; min_wasted_resolution = wasted_resolution; @@ -236,7 +246,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli img_res_v.size = 0; img_res_v.data = nullptr; if (!clip_image_preprocess(ctx_clip, img, &img_res_v)) { - LOG_TEE("%s: unable to preprocess image\n", __func__); + LOG_ERR("%s: unable to preprocess image\n", __func__); delete[] img_res_v.data; return false; } @@ -265,14 +275,14 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]); } if (!encoded) { - LOG_TEE("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size); + LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size); return false; } const int64_t t_img_enc_steop_batch_us = ggml_time_us(); - LOG_TEE("%s: step %d of %d encoded in %8.2f ms\n", __func__, (int)i+1, (int)img_res_v.size, (t_img_enc_steop_batch_us - t_img_enc_step_start_us) / 1000.0); + LOG_INF("%s: step %d of %d encoded in %8.2f ms\n", __func__, (int)i+1, (int)img_res_v.size, (t_img_enc_steop_batch_us - t_img_enc_step_start_us) / 1000.0); } const int64_t t_img_enc_batch_us = ggml_time_us(); - LOG_TEE("%s: all %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0); + LOG_INF("%s: all %d segments encoded in %8.2f ms\n", __func__, 
(int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0); int n_img_pos_out = 0; for (size_t i = 0; i < image_embd_v.size(); i++) { @@ -287,7 +297,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli load_image_size->width = img->nx; load_image_size->height = img->ny; clip_add_load_image_size(ctx_clip, load_image_size); - LOG_TEE("%s: load_image_size %d %d\n", __func__, load_image_size->width, load_image_size->height); + LOG_INF("%s: load_image_size %d %d\n", __func__, load_image_size->width, load_image_size->height); } else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) { // flat / default llava-1.5 type embedding @@ -295,7 +305,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd); // image_embd shape is 576 x 4096 delete[] img_res_v.data; if (!encoded) { - LOG_TEE("Unable to encode image\n"); + LOG_ERR("Unable to encode image\n"); return false; } @@ -309,12 +319,12 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip)); // 576 patches * 4096 embeddings * 4 bytes = 9437184 const bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]); // image data is in 3x336x336 format and will be converted to 336x336x3 inside if (!encoded) { - LOG_TEE("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size); + LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size); return false; } } const int64_t t_img_enc_batch_us = ggml_time_us(); - LOG_TEE("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0); + LOG_INF("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0); const int32_t * image_grid = clip_image_grid(ctx_clip); @@ -347,12 +357,12 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli // clip_image_save_to_bmp(*tmp, "image_feature.bmp"); } - LOG_TEE("%s: image embedding created: %d tokens\n", __func__, *n_img_pos); + LOG_INF("%s: image embedding created: %d tokens\n", __func__, *n_img_pos); const int64_t t_img_enc_end_us = ggml_time_us(); float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0; - LOG_TEE("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / *n_img_pos); + LOG_INF("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / *n_img_pos); return true; } @@ -362,7 +372,7 @@ bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * int n_llama_embd = llama_n_embd(llama_get_model(ctx_llama)); auto n_image_embd = clip_n_mmproj_embd(ctx_clip); if (n_image_embd != n_llama_embd) { - LOG_TEE("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_image_embd, n_llama_embd); + LOG_ERR("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). 
Make sure that you use the correct mmproj file.\n", __func__, n_image_embd, n_llama_embd); return false; } return true; @@ -375,13 +385,13 @@ bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, co } float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*num_max_patches); // TODO: base on gridsize/llava model if (!image_embd) { - LOG_TEE("Unable to allocate memory for image embeddings\n"); + LOG_ERR("Unable to allocate memory for image embeddings\n"); return false; } int n_img_pos; if (!encode_image_with_clip(ctx_clip, n_threads, img, image_embd, &n_img_pos)) { - LOG_TEE("%s: cannot encode image, aborting\n", __func__); + LOG_ERR("%s: cannot encode image, aborting\n", __func__); free(image_embd); return false; } @@ -401,7 +411,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_ } llama_batch batch = {int32_t(n_eval), nullptr, (image_embed->embed+i*n_embd), nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, }; if (llama_decode(ctx_llama, batch)) { - LOG_TEE("%s : failed to eval\n", __func__); + LOG_ERR("%s : failed to eval\n", __func__); return false; } *n_past += n_eval; @@ -413,7 +423,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c clip_image_u8 * img = clip_image_u8_init(); if (!clip_image_load_from_bytes(image_bytes, image_bytes_length, img)) { clip_image_u8_free(img); - LOG_TEE("%s: can't load image from bytes, is it a valid image?", __func__); + LOG_ERR("%s: can't load image from bytes, is it a valid image?", __func__); return NULL; } @@ -422,7 +432,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c bool image_embed_result = llava_image_embed_make_with_clip_img(ctx_clip, n_threads, img, &image_embed, &n_image_pos); if (!image_embed_result) { clip_image_u8_free(img); - LOG_TEE("%s: coulnd't embed the image\n", __func__); + LOG_ERR("%s: coulnd't embed the image\n", __func__); return NULL; } @@ -436,7 +446,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long *sizeOut) { auto file = fopen(path, "rb"); if (file == NULL) { - LOG_TEE("%s: can't read file %s\n", __func__, path); + LOG_ERR("%s: can't read file %s\n", __func__, path); return false; } @@ -446,7 +456,7 @@ static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long auto buffer = (unsigned char *)malloc(fileSize); // Allocate memory to hold the file data if (buffer == NULL) { - LOG_TEE("%s: failed to alloc %ld bytes for file %s\n", __func__, fileSize, path); + LOG_ERR("%s: failed to alloc %ld bytes for file %s\n", __func__, fileSize, path); perror("Memory allocation error"); fclose(file); return false; @@ -471,7 +481,7 @@ struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx long image_bytes_length; auto loaded = load_file_to_bytes(image_path, &image_bytes, &image_bytes_length); if (!loaded) { - LOG_TEE("%s: failed to load %s\n", __func__, image_path); + LOG_ERR("%s: failed to load %s\n", __func__, image_path); return NULL; } diff --git a/examples/llava/minicpmv-cli.cpp b/examples/llava/minicpmv-cli.cpp index 3ac455e69..c5156c35b 100644 --- a/examples/llava/minicpmv-cli.cpp +++ b/examples/llava/minicpmv-cli.cpp @@ -7,9 +7,12 @@ #include "llama.h" #include "ggml.h" +#include #include #include +#include #include +#include // TODO: remove me struct llava_context { struct clip_ctx * ctx_clip = NULL; @@ -18,14 +21,8 @@ struct llava_context { 
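    // groups the handles one minicpmv-cli run works with: the CLIP vision context,
    // the llama text context, and the loaded model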
}; static void show_additional_info(int /*argc*/, char ** argv) { - LOG_TEE("\nexample usage:\n\n%s -m --mmproj --image --image [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]); - LOG_TEE("\nnote: a lower temperature value like 0.1 is recommended for better quality.\n"); -} - -static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) { - (void) level; - (void) user_data; - LOG_TEE("%s", text); + LOG("\nexample usage:\n\n%s -m --mmproj --image --image [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]); + LOG("\nnote: a lower temperature value like 0.1 is recommended for better quality.\n"); } static struct llama_model * llava_init(gpt_params * params) { @@ -36,7 +33,7 @@ static struct llama_model * llava_init(gpt_params * params) { llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params); if (model == NULL) { - LOG_TEE("%s: error: unable to load model\n" , __func__); + LOG_ERR("%s: unable to load model\n" , __func__); return NULL; } return model; @@ -51,7 +48,7 @@ static struct llava_context * llava_init_context(gpt_params * params, llama_mode llama_context_params ctx_params = llama_context_params_from_gpt_params(*params); if (params->n_ctx < 2048) { // warn user here, "Image processing requires at least 2048 context, setting context to 2048" - LOG_TEE("%s: warn: Image processing requires at least 2048 context, setting context to 2048\n" , __func__); + LOG_WRN("%s: Image processing requires at least 2048 context, setting context to 2048\n" , __func__); ctx_params.n_ctx = 2048; } else { ctx_params.n_ctx = params->n_ctx; @@ -60,11 +57,11 @@ static struct llava_context * llava_init_context(gpt_params * params, llama_mode llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params); if (ctx_llama == NULL) { - LOG_TEE("%s: error: failed to create the llama_context\n" , __func__); + LOG_ERR("%s: failed to create the llama_context\n" , __func__); return NULL; } - auto ctx_llava = (struct llava_context *)malloc(sizeof(llava_context)); + auto * ctx_llava = (struct llava_context *)malloc(sizeof(llava_context)); ctx_llava->ctx_llama = ctx_llama; ctx_llava->model = model; @@ -89,7 +86,7 @@ static struct clip_ctx * clip_init_context(gpt_params * params) { if (prompt.empty()) { prompt = "describe the image in detail."; } - auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1); + auto * ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1); return ctx_clip; } @@ -101,7 +98,7 @@ static bool eval_tokens(struct llama_context * ctx_llama, std::vectorctx_clip)); std::memcpy(image_embed, embeds->embed + idx * clip_n_patches(ctx_llava->ctx_clip) * clip_n_mmproj_embd(ctx_llava->ctx_clip), clip_embd_nbytes(ctx_llava->ctx_clip)); - auto slice_embed = (llava_image_embed*)malloc(sizeof(llava_image_embed)); + auto * slice_embed = (llava_image_embed*)malloc(sizeof(llava_image_embed)); slice_embed->embed = image_embed; slice_embed->n_image_pos = clip_n_patches(ctx_llava->ctx_clip); llava_eval_image_embed(ctx_llava->ctx_llama, slice_embed, n_batch, n_past); @@ -143,7 +140,7 @@ static void process_image(struct llava_context * ctx_llava, struct llava_image_e else if (has_minicpmv_projector == 3) { system_prompt = "<|im_start|>user\n"; } - LOG_TEE("%s: image token past: %d\n", __func__, n_past); + LOG_INF("%s: image token past: %d\n", __func__, n_past); eval_string(ctx_llava->ctx_llama, (system_prompt+"").c_str(), params->n_batch, &n_past, false); process_eval_image_embed(ctx_llava, embeds, 
params->n_batch, &n_past, idx++); eval_string(ctx_llava->ctx_llama, std::string("").c_str(), params->n_batch, &n_past, false); @@ -162,7 +159,7 @@ static void process_image(struct llava_context * ctx_llava, struct llava_image_e } eval_string(ctx_llava->ctx_llama, std::string("").c_str(), params->n_batch, &n_past, false); } - LOG_TEE("%s: image token past: %d\n", __func__, n_past); + LOG_INF("%s: image token past: %d\n", __func__, n_past); } static const char * sample(struct gpt_sampler * smpl, @@ -181,42 +178,42 @@ static const char * sample(struct gpt_sampler * smpl, } static struct llava_context * minicpmv_init(gpt_params * params, const std::string & fname, int &n_past){ - auto ctx_clip = clip_init_context(params); - auto embeds = llava_image_embed_make_with_filename(ctx_clip, params->cpuparams.n_threads, fname.c_str()); + auto * ctx_clip = clip_init_context(params); + auto * embeds = llava_image_embed_make_with_filename(ctx_clip, params->cpuparams.n_threads, fname.c_str()); if (!embeds) { - std::cerr << "error: failed to load image " << fname << ". Terminating\n\n"; + LOG_ERR("failed to load image %s. Terminating\n\n", fname.c_str()); return NULL; } // process the prompt if (params->prompt.empty() && params->interactive == false) { - LOG_TEE("prompt should be given or interactive mode should be on"); + LOG_ERR("prompt should be given or interactive mode should be on"); return NULL; } - auto model = llava_init(params); + auto * model = llava_init(params); if (model == NULL) { fprintf(stderr, "%s: error: failed to init minicpmv model\n", __func__); return NULL; } const int64_t t_llava_init_start_us = ggml_time_us(); - auto ctx_llava = llava_init_context(params, model); + auto * ctx_llava = llava_init_context(params, model); ctx_llava->ctx_clip = ctx_clip; const int64_t t_llava_init_end_us = ggml_time_us(); float t_llava_init_ms = (t_llava_init_end_us - t_llava_init_start_us) / 1000.0; - LOG_TEE("\n%s: llava init in %8.2f ms.\n", __func__, t_llava_init_ms); + LOG_INF("%s: llava init in %8.2f ms.\n", __func__, t_llava_init_ms); const int64_t t_process_image_start_us = ggml_time_us(); process_image(ctx_llava, embeds, params, n_past); const int64_t t_process_image_end_us = ggml_time_us(); float t_process_image_ms = (t_process_image_end_us - t_process_image_start_us) / 1000.0; - LOG_TEE("\n%s: llama process image in %8.2f ms.\n", __func__, t_process_image_ms); + LOG_INF("%s: llama process image in %8.2f ms.\n", __func__, t_process_image_ms); llava_image_embed_free(embeds); return ctx_llava; } -static struct gpt_sampler * llama_init(struct llava_context * ctx_llava, gpt_params * params, std::string prompt, int &n_past, bool is_first = false){ +static struct gpt_sampler * llama_init(struct llava_context * ctx_llava, gpt_params * params, const std::string & prompt, int & n_past, bool is_first = false){ std::string user_prompt = prompt; int has_minicpmv_projector = clip_is_minicpmv(ctx_llava->ctx_clip); if (!is_first) { @@ -238,7 +235,7 @@ static struct gpt_sampler * llama_init(struct llava_context * ctx_llava, gpt_par // generate the response - LOG_TEE("\n"); + LOG_INF("\n"); struct gpt_sampler * smpl = gpt_sampler_init(ctx_llava->model, params->sparams); return smpl; @@ -259,12 +256,7 @@ int main(int argc, char ** argv) { return 1; } -#ifndef LOG_DISABLE_LOGS - log_set_target(log_filename_generator("llava", "log")); - LOG_TEE("Log start\n"); - log_dump_cmdline(argc, argv); - llama_log_set(llama_log_callback_logTee, nullptr); -#endif // LOG_DISABLE_LOGS + gpt_init(); if (params.mmproj.empty() || 
(params.image.empty())) { show_additional_info(argc, argv); @@ -273,21 +265,23 @@ int main(int argc, char ** argv) { for (auto & image : params.image) { int n_past = 0; - auto ctx_llava = minicpmv_init(¶ms, image, n_past); + auto * ctx_llava = minicpmv_init(¶ms, image, n_past); if (!params.prompt.empty()) { - LOG_TEE("%s\n", params.prompt.c_str()); - LOG_TEE(""); - auto smpl = llama_init(ctx_llava, ¶ms, params.prompt.c_str(), n_past, true); + LOG("%s\n", params.prompt.c_str()); + LOG(""); + auto * smpl = llama_init(ctx_llava, ¶ms, params.prompt, n_past, true); const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict; - std::string response = ""; + std::string response; bool have_tmp = false; for (int i = 0; i < max_tgt_len; i++) { - auto tmp = llama_loop(ctx_llava, smpl, n_past); + const auto * tmp = llama_loop(ctx_llava, smpl, n_past); response += tmp; if (strcmp(tmp, "") == 0){ - if(!have_tmp)continue; - else break; + if (!have_tmp) { + continue; + } + break; } if (strstr(tmp, "###")) break; // Yi-VL behavior have_tmp = true; @@ -299,15 +293,15 @@ int main(int argc, char ** argv) { gpt_sampler_free(smpl); }else { while (true) { - LOG_TEE(""); + LOG(""); std::string prompt; std::getline(std::cin, prompt); - LOG_TEE(""); - auto smpl = llama_init(ctx_llava, ¶ms, prompt, n_past, true); + LOG(""); + auto * smpl = llama_init(ctx_llava, ¶ms, prompt, n_past, true); const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict; - std::string response = ""; + std::string response; for (int i = 0; i < max_tgt_len; i++) { - auto tmp = llama_loop(ctx_llava, smpl, n_past); + const auto * tmp = llama_loop(ctx_llava, smpl, n_past); response += tmp; if (strcmp(tmp, "") == 0) break; if (strstr(tmp, "###")) break; // Yi-VL behavior diff --git a/examples/lookahead/lookahead.cpp b/examples/lookahead/lookahead.cpp index de8b792f2..49870b4a4 100644 --- a/examples/lookahead/lookahead.cpp +++ b/examples/lookahead/lookahead.cpp @@ -1,6 +1,7 @@ #include "arg.h" #include "common.h" #include "sampling.h" +#include "log.h" #include "llama.h" #include @@ -42,18 +43,14 @@ int main(int argc, char ** argv) { return 1; } + gpt_init(); + const int W = 15; // lookahead window const int N = 5; // n-gram size const int G = 15; // max verification n-grams const bool dump_kv_cache = params.dump_kv_cache; -#ifndef LOG_DISABLE_LOGS - log_set_target(log_filename_generator("lookahead", "log")); - LOG_TEE("Log start\n"); - log_dump_cmdline(argc, argv); -#endif // LOG_DISABLE_LOGS - // init llama.cpp llama_backend_init(); llama_numa_init(params.numa); @@ -75,14 +72,14 @@ int main(int argc, char ** argv) { const int max_tokens_list_size = max_context_size - 4; if ((int) inp.size() > max_tokens_list_size) { - fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size); + LOG_ERR("%s: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size); return 1; } - fprintf(stderr, "\n\n"); + LOG("\n\n"); for (auto id : inp) { - fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str()); + LOG("%s", llama_token_to_piece(ctx, id).c_str()); } fflush(stderr); @@ -166,7 +163,7 @@ int main(int argc, char ** argv) { { const std::string token_str = llama_token_to_piece(ctx, id); - printf("%s", token_str.c_str()); + LOG("%s", token_str.c_str()); fflush(stdout); } } @@ -256,7 +253,7 @@ int main(int argc, char ** argv) { } if (llama_decode(ctx, batch) != 0) { - fprintf(stderr, "\n\n%s: error: llama_decode failed - increase KV cache size\n", 
__func__); + LOG_ERR("\n\n%s: llama_decode failed - increase KV cache size\n", __func__); return 1; } @@ -293,10 +290,10 @@ int main(int argc, char ** argv) { const std::string token_str = llama_token_to_piece(ctx, id); if (v == 0) { - printf("%s", token_str.c_str()); + LOG("%s", token_str.c_str()); } else { // print light cyan - printf("\033[0;96m%s\033[0m", token_str.c_str()); + LOG("\033[0;96m%s\033[0m", token_str.c_str()); } fflush(stdout); @@ -330,21 +327,21 @@ int main(int argc, char ** argv) { // print known n-grams starting with token id (debug) if (0 && v == 0) { if (ngrams_observed.cnt[id] > 0) { - printf("\n - %d n-grams starting with '%s'\n", ngrams_observed.cnt[id], llama_token_to_piece(ctx, id).c_str()); + LOG("\n - %d n-grams starting with '%s'\n", ngrams_observed.cnt[id], llama_token_to_piece(ctx, id).c_str()); } for (int i = 0; i < ngrams_observed.cnt[id]; i++) { - printf(" - ngram %2d: ", i); + LOG(" - ngram %2d: ", i); const int idx = id*(N - 1)*G + i*(N - 1); for (int j = 0; j < N - 1; j++) { const std::string token_str = llama_token_to_piece(ctx, ngrams_observed.tokens[idx + j]); - printf("%s", token_str.c_str()); + LOG("%s", token_str.c_str()); } - printf("\n"); + LOG("\n"); } } @@ -455,20 +452,20 @@ int main(int argc, char ** argv) { auto t_dec_end = ggml_time_us(); - LOG_TEE("\n\n"); + LOG("\n\n"); - LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f)); - LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f)); + LOG_INF("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f)); + LOG_INF("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f)); - LOG_TEE("\n"); - LOG_TEE("W = %2d\n", W); - LOG_TEE("N = %2d\n", N); - LOG_TEE("G = %2d\n", G); - LOG_TEE("\n"); - LOG_TEE("n_predict = %d\n", n_predict); - LOG_TEE("n_accept = %d\n", n_accept); + LOG_INF("\n"); + LOG_INF("W = %2d\n", W); + LOG_INF("N = %2d\n", N); + LOG_INF("G = %2d\n", G); + LOG_INF("\n"); + LOG_INF("n_predict = %d\n", n_predict); + LOG_INF("n_accept = %d\n", n_accept); - LOG_TEE("\n"); + LOG_INF("\n"); gpt_perf_print(ctx, smpl); gpt_sampler_free(smpl); @@ -482,7 +479,7 @@ int main(int argc, char ** argv) { llama_backend_free(); - fprintf(stderr, "\n\n"); + LOG("\n\n"); return 0; } diff --git a/examples/lookup/lookup-stats.cpp b/examples/lookup/lookup-stats.cpp index f299d68a9..6d1e1ceb9 100644 --- a/examples/lookup/lookup-stats.cpp +++ b/examples/lookup/lookup-stats.cpp @@ -5,13 +5,12 @@ #include "llama.h" #include "ggml.h" -#include #include #include +#include #include #include #include -#include int main(int argc, char ** argv){ gpt_params params; @@ -20,6 +19,8 @@ int main(int argc, char ** argv){ return 1; } + gpt_init(); + const int n_draft = params.n_draft; // init llama.cpp @@ -49,7 +50,7 @@ int main(int argc, char ** argv){ try { ngram_cache_static = llama_ngram_cache_load(params.lookup_cache_static); } catch (std::ifstream::failure const &) { - fprintf(stderr, "error: failed to open static lookup cache: %s", params.lookup_cache_static.c_str()); + LOG_ERR("failed to open static lookup cache: %s", params.lookup_cache_static.c_str()); exit(1); } } @@ -128,7 +129,7 @@ int main(int argc, char ** argv){ const 
int64_t eta_min = eta_ms / (60*1000); const int64_t eta_s = (eta_ms - 60*1000*eta_min) / 1000; - LOG_TEE("lookup-stats: %d/%d done, ETA: %02" PRId64 ":%02" PRId64 "\n", i_start, n_input, eta_min, eta_s); + LOG_INF("lookup-stats: %d/%d done, ETA: %02" PRId64 ":%02" PRId64 "\n", i_start, n_input, eta_min, eta_s); } // After each chunk, update the dynamic ngram cache with the context ngram cache: @@ -136,24 +137,24 @@ int main(int argc, char ** argv){ ngram_cache_context.clear(); } - LOG_TEE("\n"); + LOG("\n"); - LOG_TEE("\n"); - LOG_TEE("n_draft = %d\n", n_draft); - LOG_TEE("n_predict = %d\n", n_input - n_input % n_ctx); - LOG_TEE("n_drafted = %d\n", n_drafted); - LOG_TEE("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3); - LOG_TEE("t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n", + LOG_INF("\n"); + LOG_INF("n_draft = %d\n", n_draft); + LOG_INF("n_predict = %d\n", n_input - n_input % n_ctx); + LOG_INF("n_drafted = %d\n", n_drafted); + LOG_INF("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3); + LOG_INF("t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n", t_draft_us*1e-3, 1.0f*t_draft_us/n_drafted, n_drafted/(1e-6*t_draft_us)); - LOG_TEE("n_accept = %d\n", n_accept); - LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted); + LOG_INF("n_accept = %d\n", n_accept); + LOG_INF("accept = %.3f%%\n", 100.0f * n_accept / n_drafted); llama_free(ctx); llama_free_model(model); llama_backend_free(); - fprintf(stderr, "\n\n"); + LOG("\n\n"); return 0; } diff --git a/examples/lookup/lookup.cpp b/examples/lookup/lookup.cpp index be6f8d7d7..2ccd0e6c1 100644 --- a/examples/lookup/lookup.cpp +++ b/examples/lookup/lookup.cpp @@ -3,6 +3,7 @@ #include "common.h" #include "ngram-cache.h" #include "sampling.h" +#include "log.h" #include "llama.h" #include @@ -18,17 +19,13 @@ int main(int argc, char ** argv){ return 1; } + gpt_init(); + // max. 
number of additional tokens to draft if match is found const int n_draft = params.n_draft; const bool dump_kv_cache = params.dump_kv_cache; -#ifndef LOG_DISABLE_LOGS - log_set_target(log_filename_generator("lookup", "log")); - LOG_TEE("Log start\n"); - log_dump_cmdline(argc, argv); -#endif // LOG_DISABLE_LOGS - // init llama.cpp llama_backend_init(); llama_numa_init(params.numa); @@ -58,7 +55,7 @@ int main(int argc, char ** argv){ try { ngram_cache_static = llama_ngram_cache_load(params.lookup_cache_static); } catch (std::ifstream::failure const &) { - fprintf(stderr, "error: failed to open static lookup cache: %s", params.lookup_cache_static.c_str()); + LOG_ERR("failed to open static lookup cache: %s", params.lookup_cache_static.c_str()); exit(1); } } @@ -76,14 +73,14 @@ int main(int argc, char ** argv){ const int max_tokens_list_size = max_context_size - 4; if ((int) inp.size() > max_tokens_list_size) { - fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size); + LOG_ERR("%s: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size); return 1; } - fprintf(stderr, "\n\n"); + LOG("\n\n"); for (auto id : inp) { - fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str()); + LOG("%s", llama_token_to_piece(ctx, id).c_str()); } fflush(stderr); @@ -124,7 +121,7 @@ int main(int argc, char ** argv){ } // print current draft sequence - LOG("drafted %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, draft).c_str()); + LOG_DBG("drafted %s\n", string_from(ctx, draft).c_str()); int i_dft = 0; while (true) { @@ -136,7 +133,7 @@ int main(int argc, char ** argv){ const std::string token_str = llama_token_to_piece(ctx, id); if (!params.use_color) { - printf("%s", token_str.c_str()); + LOG("%s", token_str.c_str()); } if (llama_token_is_eog(model, id)) { @@ -147,7 +144,7 @@ int main(int argc, char ** argv){ // check if the target token matches the draft if (i_dft < (int) draft.size() && id == draft[i_dft]) { - LOG("the sampled target token matches the %dth drafted token (%d, '%s') - accepted\n", i_dft, id, token_str.c_str()); + LOG_DBG("the sampled target token matches the %dth drafted token (%d, '%s') - accepted\n", i_dft, id, token_str.c_str()); ++n_accept; ++n_past; ++i_dft; @@ -161,19 +158,19 @@ int main(int argc, char ** argv){ if (params.use_color) { // color accepted draft token - printf("\033[34m%s\033[0m", token_str.c_str()); + LOG("\033[34m%s\033[0m", token_str.c_str()); fflush(stdout); } continue; } if (params.use_color) { - printf("%s", token_str.c_str()); + LOG("%s", token_str.c_str()); } fflush(stdout); - LOG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", id, token_str.c_str()); + LOG_DBG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", id, token_str.c_str()); draft.clear(); draft.push_back(id); @@ -224,22 +221,22 @@ int main(int argc, char ** argv){ llama_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context); llama_ngram_cache_save(ngram_cache_dynamic, params.lookup_cache_dynamic); - LOG_TEE("\n\n"); + LOG("\n\n"); - LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f)); - LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f)); + LOG_INF("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - 
t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f)); + LOG_INF("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f)); - LOG_TEE("\n"); - LOG_TEE("n_draft = %d\n", n_draft); - LOG_TEE("n_predict = %d\n", n_predict); - LOG_TEE("n_drafted = %d\n", n_drafted); - LOG_TEE("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3); - LOG_TEE("t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n", + LOG_INF("\n"); + LOG_INF("n_draft = %d\n", n_draft); + LOG_INF("n_predict = %d\n", n_predict); + LOG_INF("n_drafted = %d\n", n_drafted); + LOG_INF("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3); + LOG_INF("t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n", t_draft_us*1e-3, 1.0f*t_draft_us/n_drafted, n_drafted/(1e-6*t_draft_us)); - LOG_TEE("n_accept = %d\n", n_accept); - LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted); + LOG_INF("n_accept = %d\n", n_accept); + LOG_INF("accept = %.3f%%\n", 100.0f * n_accept / n_drafted); - LOG_TEE("\ntarget:\n\n"); + LOG_INF("\ntarget:\n\n"); gpt_perf_print(ctx, smpl); gpt_sampler_free(smpl); @@ -251,7 +248,7 @@ int main(int argc, char ** argv){ llama_backend_free(); - fprintf(stderr, "\n\n"); + LOG("\n\n"); return 0; } diff --git a/examples/main/main.cpp b/examples/main/main.cpp index f41be5308..d9e45ce2f 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -1,12 +1,11 @@ #include "arg.h" #include "common.h" #include "console.h" +#include "log.h" #include "sampling.h" #include "llama.h" #include -#include -#include #include #include #include @@ -42,11 +41,13 @@ static std::vector * g_output_tokens; static bool is_interacting = false; static bool need_insert_eot = false; -static void print_usage(int, char ** argv) { - printf("\nexample usage:\n"); - printf("\n text generation: %s -m your_model.gguf -p \"I believe the meaning of life is\" -n 128\n", argv[0]); - printf("\n chat (conversation): %s -m your_model.gguf -p \"You are a helpful assistant\" -cnv\n", argv[0]); - printf("\n"); +static void print_usage(int argc, char ** argv) { + (void) argc; + + LOG("\nexample usage:\n"); + LOG("\n text generation: %s -m your_model.gguf -p \"I believe the meaning of life is\" -n 128\n", argv[0]); + LOG("\n chat (conversation): %s -m your_model.gguf -p \"You are a helpful assistant\" -cnv\n", argv[0]); + LOG("\n"); } static bool file_exists(const std::string & path) { @@ -74,8 +75,7 @@ static void write_logfile( const bool success = fs_create_directory_with_parents(params.logdir); if (!success) { - fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n", - __func__, params.logdir.c_str()); + LOG_ERR("%s: failed to create logdir %s, cannot write logfile\n", __func__, params.logdir.c_str()); return; } @@ -83,7 +83,7 @@ static void write_logfile( FILE * logfile = fopen(logfile_path.c_str(), "w"); if (logfile == NULL) { - fprintf(stderr, "%s: failed to open logfile %s\n", __func__, logfile_path.c_str()); + LOG_ERR("%s: failed to open logfile %s\n", __func__, logfile_path.c_str()); return; } @@ -113,7 +113,7 @@ static void sigint_handler(int signo) { need_insert_eot = true; } else { console::cleanup(); - printf("\n"); + LOG("\n"); gpt_perf_print(*g_ctx, *g_smpl); write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens); _exit(130); @@ -122,17 +122,11 @@ static void sigint_handler(int signo) { } #endif -static void llama_log_callback_logTee(ggml_log_level level, 
const char * text, void * user_data) { - (void) level; - (void) user_data; - LOG_TEE("%s", text); -} - -static std::string chat_add_and_format(struct llama_model * model, std::vector & chat_msgs, std::string role, std::string content) { +static std::string chat_add_and_format(struct llama_model * model, std::vector & chat_msgs, const std::string & role, const std::string & content) { llama_chat_msg new_msg{role, content}; auto formatted = llama_chat_format_single(model, g_params->chat_template, chat_msgs, new_msg, role == "user"); chat_msgs.push_back({role, content}); - LOG("formatted: %s\n", formatted.c_str()); + LOG_DBG("formatted: '%s'\n", formatted.c_str()); return formatted; } @@ -143,55 +137,46 @@ int main(int argc, char ** argv) { return 1; } + gpt_init(); + auto & sparams = params.sparams; -#ifndef LOG_DISABLE_LOGS - log_set_target(log_filename_generator("main", "log")); - LOG_TEE("Log start\n"); - log_dump_cmdline(argc, argv); - llama_log_set(llama_log_callback_logTee, nullptr); -#endif // LOG_DISABLE_LOGS - - // TODO: Dump params ? - //LOG("Params perplexity: %s\n", LOG_TOSTR(params.perplexity)); - // save choice to use color for later // (note for later: this is a slightly awkward choice) console::init(params.simple_io, params.use_color); atexit([]() { console::cleanup(); }); if (params.logits_all) { - printf("\n************\n"); - printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__); - printf("************\n\n"); + LOG_ERR("************\n"); + LOG_ERR("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__); + LOG_ERR("************\n\n"); return 0; } if (params.embedding) { - printf("\n************\n"); - printf("%s: please use the 'embedding' tool for embedding calculations\n", __func__); - printf("************\n\n"); + LOG_ERR("************\n"); + LOG_ERR("%s: please use the 'embedding' tool for embedding calculations\n", __func__); + LOG_ERR("************\n\n"); return 0; } if (params.n_ctx != 0 && params.n_ctx < 8) { - LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__); + LOG_WRN("%s: warning: minimum context size is 8, using minimum size.\n", __func__); params.n_ctx = 8; } if (params.rope_freq_base != 0.0) { - LOG_TEE("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base); + LOG_WRN("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base); } if (params.rope_freq_scale != 0.0) { - LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale); + LOG_WRN("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale); } - print_build_info(); + LOG_INF("%s: llama backend init\n", __func__); - LOG("%s: llama backend init\n", __func__); llama_backend_init(); llama_numa_init(params.numa); @@ -206,21 +191,19 @@ int main(int argc, char ** argv) { g_smpl = &smpl; // load the model and apply lora adapter, if any - LOG("%s: load the model and apply lora adapter, if any\n", __func__); + LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__); llama_init_result llama_init = llama_init_from_gpt_params(params); model = llama_init.model; ctx = llama_init.context; if (model == NULL) { - LOG_TEE("%s: error: unable to load model\n", __func__); + LOG_ERR("%s: error: unable to load model\n", __func__); return 1; } - LOG("%s: llama threadpool init = n_threads = %d\n", - __func__, - (int) params.cpuparams.n_threads - ); + LOG_INF("%s: llama threadpool init, n_threads = %d\n", __func__, 
(int) params.cpuparams.n_threads); + struct ggml_threadpool_params tpp_batch = ggml_threadpool_params_from_cpu_params(params.cpuparams_batch); struct ggml_threadpool_params tpp = @@ -232,8 +215,8 @@ int main(int argc, char ** argv) { if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) { threadpool_batch = ggml_threadpool_new(&tpp_batch); if (!threadpool_batch) { - LOG_TEE("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads); - exit(1); + LOG_ERR("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads); + return 1; } // Start the non-batch threadpool in the paused state @@ -242,55 +225,54 @@ int main(int argc, char ** argv) { struct ggml_threadpool * threadpool = ggml_threadpool_new(&tpp); if (!threadpool) { - LOG_TEE("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads); - exit(1); + LOG_ERR("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads); + return 1; } llama_attach_threadpool(ctx, threadpool, threadpool_batch); const int n_ctx_train = llama_n_ctx_train(model); const int n_ctx = llama_n_ctx(ctx); - LOG("n_ctx: %d\n", n_ctx); if (n_ctx > n_ctx_train) { - LOG_TEE("%s: warning: model was trained on only %d context tokens (%d specified)\n", - __func__, n_ctx_train, n_ctx); + LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, n_ctx); } // print chat template example in conversation mode if (params.conversation) { if (params.enable_chat_template) { - LOG_TEE("%s: chat template example: %s\n", __func__, llama_chat_format_example(model, params.chat_template).c_str()); + LOG_INF("%s: chat template example:\n%s\n", __func__, llama_chat_format_example(model, params.chat_template).c_str()); } else { - LOG_TEE("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__); + LOG_INF("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__); } } // print system information { - LOG_TEE("\n"); - LOG_TEE("%s\n", gpt_params_get_system_info(params).c_str()); + LOG_INF("\n"); + LOG_INF("%s\n", gpt_params_get_system_info(params).c_str()); + LOG_INF("\n"); } std::string path_session = params.path_prompt_cache; std::vector session_tokens; if (!path_session.empty()) { - LOG_TEE("%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str()); + LOG_INF("%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str()); if (!file_exists(path_session)) { - LOG_TEE("%s: session file does not exist, will create.\n", __func__); + LOG_INF("%s: session file does not exist, will create.\n", __func__); } else if (file_is_empty(path_session)) { - LOG_TEE("%s: The session file is empty. A new session will be initialized.\n", __func__); + LOG_INF("%s: The session file is empty. 
A new session will be initialized.\n", __func__); } else { // The file exists and is not empty session_tokens.resize(n_ctx); size_t n_token_count_out = 0; if (!llama_state_load_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) { - LOG_TEE("%s: error: failed to load session file '%s'\n", __func__, path_session.c_str()); + LOG_ERR("%s: failed to load session file '%s'\n", __func__, path_session.c_str()); return 1; } session_tokens.resize(n_token_count_out); - LOG_TEE("%s: loaded a session with prompt size of %d tokens\n", __func__, (int)session_tokens.size()); + LOG_INF("%s: loaded a session with prompt size of %d tokens\n", __func__, (int)session_tokens.size()); } } @@ -298,7 +280,8 @@ int main(int argc, char ** argv) { if (!llama_model_has_encoder(model)) { GGML_ASSERT(!llama_add_eos_token(model)); } - LOG("add_bos: %d\n", add_bos); + + LOG_DBG("n_ctx: %d, add_bos: %d\n", n_ctx, add_bos); std::vector embd_inp; @@ -307,31 +290,31 @@ int main(int argc, char ** argv) { ? chat_add_and_format(model, chat_msgs, "system", params.prompt) // format the system prompt in conversation mode : params.prompt; if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) { - LOG("tokenize the prompt\n"); + LOG_DBG("tokenize the prompt\n"); embd_inp = ::llama_tokenize(ctx, prompt, true, true); } else { - LOG("use session tokens\n"); + LOG_DBG("use session tokens\n"); embd_inp = session_tokens; } - LOG("prompt: \"%s\"\n", log_tostr(prompt)); - LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str()); + LOG_DBG("prompt: \"%s\"\n", prompt.c_str()); + LOG_DBG("tokens: %s\n", string_from(ctx, embd_inp).c_str()); } // Should not run without any tokens if (embd_inp.empty()) { if (add_bos) { embd_inp.push_back(llama_token_bos(model)); - LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str()); + LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str()); } else { - LOG_TEE("error: input is empty\n"); + LOG_ERR("input is empty\n"); return -1; } } // Tokenize negative prompt if ((int) embd_inp.size() > n_ctx - 4) { - LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4); + LOG_ERR("%s: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4); return 1; } @@ -345,29 +328,28 @@ int main(int argc, char ** argv) { n_matching_session_tokens++; } if (params.prompt.empty() && n_matching_session_tokens == embd_inp.size()) { - LOG_TEE("%s: using full prompt from session file\n", __func__); + LOG_INF("%s: using full prompt from session file\n", __func__); } else if (n_matching_session_tokens >= embd_inp.size()) { - LOG_TEE("%s: session file has exact match for prompt!\n", __func__); + LOG_INF("%s: session file has exact match for prompt!\n", __func__); } else if (n_matching_session_tokens < (embd_inp.size() / 2)) { - LOG_TEE("%s: warning: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n", - __func__, n_matching_session_tokens, embd_inp.size()); + LOG_WRN("%s: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n", + __func__, n_matching_session_tokens, embd_inp.size()); } else { - LOG_TEE("%s: session file matches %zu / %zu tokens of prompt\n", - __func__, n_matching_session_tokens, embd_inp.size()); + LOG_INF("%s: session file matches %zu / %zu tokens of prompt\n", + __func__, 
n_matching_session_tokens, embd_inp.size()); } // remove any "future" tokens that we might have inherited from the previous session llama_kv_cache_seq_rm(ctx, -1, n_matching_session_tokens, -1); } - LOGLN( - "recalculate the cached logits (check): embd_inp.empty() %s, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu", - log_tostr(embd_inp.empty()), n_matching_session_tokens, embd_inp.size(), session_tokens.size()); + LOG_DBG("recalculate the cached logits (check): embd_inp.size() %zu, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu\n", + embd_inp.size(), n_matching_session_tokens, embd_inp.size(), session_tokens.size()); // if we will use the cache for the full prompt without reaching the end of the cache, force // reevaluation of the last token to recalculate the cached logits if (!embd_inp.empty() && n_matching_session_tokens == embd_inp.size() && session_tokens.size() > embd_inp.size()) { - LOGLN("recalculate the cached logits (do): session_tokens.resize( %zu )", embd_inp.size() - 1); + LOG_DBG("recalculate the cached logits (do): session_tokens.resize( %zu )\n", embd_inp.size() - 1); session_tokens.resize(embd_inp.size() - 1); } @@ -389,21 +371,20 @@ int main(int argc, char ** argv) { } if (params.verbose_prompt) { - LOG_TEE("\n"); - LOG_TEE("%s: prompt: '%s'\n", __func__, params.prompt.c_str()); - LOG_TEE("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); + LOG_INF("%s: prompt: '%s'\n", __func__, params.prompt.c_str()); + LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); for (int i = 0; i < (int) embd_inp.size(); i++) { - LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str()); + LOG_INF("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str()); } if (params.n_keep > add_bos) { - LOG_TEE("%s: static prompt based on n_keep: '", __func__); + LOG_INF("%s: static prompt based on n_keep: '", __func__); for (int i = 0; i < params.n_keep; i++) { - LOG_TEE("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str()); + LOG("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str()); } - LOG_TEE("'\n"); + LOG("'\n"); } - LOG_TEE("\n"); + LOG_INF("\n"); } // ctrl+C handling @@ -423,40 +404,40 @@ int main(int argc, char ** argv) { } if (params.interactive) { - LOG_TEE("%s: interactive mode on.\n", __func__); + LOG("%s: interactive mode on.\n", __func__); if (!params.antiprompt.empty()) { for (const auto & antiprompt : params.antiprompt) { - LOG_TEE("Reverse prompt: '%s'\n", antiprompt.c_str()); + LOG("Reverse prompt: '%s'\n", antiprompt.c_str()); if (params.verbose_prompt) { auto tmp = ::llama_tokenize(ctx, antiprompt, false, true); for (int i = 0; i < (int) tmp.size(); i++) { - LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str()); + LOG("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str()); } } } } if (params.input_prefix_bos) { - LOG_TEE("Input prefix with BOS\n"); + LOG("Input prefix with BOS\n"); } if (!params.input_prefix.empty()) { - LOG_TEE("Input prefix: '%s'\n", params.input_prefix.c_str()); + LOG("Input prefix: '%s'\n", params.input_prefix.c_str()); if (params.verbose_prompt) { auto tmp = ::llama_tokenize(ctx, params.input_prefix, true, true); for (int i = 0; i < (int) tmp.size(); i++) { - LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str()); + LOG("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str()); } } } if (!params.input_suffix.empty()) { - LOG_TEE("Input suffix: 
'%s'\n", params.input_suffix.c_str()); + LOG("Input suffix: '%s'\n", params.input_suffix.c_str()); if (params.verbose_prompt) { auto tmp = ::llama_tokenize(ctx, params.input_suffix, false, true); for (int i = 0; i < (int) tmp.size(); i++) { - LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str()); + LOG("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str()); } } } @@ -464,15 +445,15 @@ int main(int argc, char ** argv) { smpl = gpt_sampler_init(model, sparams); if (!smpl) { - fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__); - exit(1); + LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__); + return 1; } - LOG_TEE("sampling seed: %u\n", gpt_sampler_get_seed(smpl)); - LOG_TEE("sampling params: \n%s\n", sparams.print().c_str()); - LOG_TEE("sampler constr: \n%s\n", gpt_sampler_print(smpl).c_str()); + LOG_INF("sampler seed: %u\n", gpt_sampler_get_seed(smpl)); + LOG_INF("sampler params: \n%s\n", sparams.print().c_str()); + LOG_INF("sampler chain: %s\n", gpt_sampler_print(smpl).c_str()); - LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep); + LOG_INF("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep); // group-attention state // number of grouped KV tokens so far (used only if params.grp_attn_n > 1) @@ -486,9 +467,9 @@ int main(int argc, char ** argv) { GGML_ASSERT(ga_w % ga_n == 0 && "grp_attn_w must be a multiple of grp_attn_n"); // NOLINT //GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of grp_attn_w"); // NOLINT //GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * grp_attn_n"); // NOLINT - LOG_TEE("self-extend: n_ctx_train = %d, grp_attn_n = %d, grp_attn_w = %d\n", n_ctx_train, ga_n, ga_w); + LOG_INF("self-extend: n_ctx_train = %d, grp_attn_n = %d, grp_attn_w = %d\n", n_ctx_train, ga_n, ga_w); } - LOG_TEE("\n\n"); + LOG("\n"); if (params.interactive) { const char * control_message; @@ -500,11 +481,11 @@ int main(int argc, char ** argv) { " - To return control without starting a new line, end your input with '/'.\n" " - If you want to submit another line, end your input with '\\'.\n"; } - LOG_TEE("== Running in interactive mode. ==\n"); + LOG("== Running in interactive mode. ==\n"); #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) - LOG_TEE( " - Press Ctrl+C to interject at any time.\n"); + LOG( " - Press Ctrl+C to interject at any time.\n"); #endif - LOG_TEE( "%s\n", control_message); + LOG( "%s\n", control_message); is_interacting = params.interactive_first; } @@ -543,7 +524,7 @@ int main(int argc, char ** argv) { llama_token * enc_input_buf = embd_inp.data(); if (llama_encode(ctx, llama_batch_get_one(enc_input_buf, enc_input_size, 0, 0))) { - LOG_TEE("%s : failed to eval\n", __func__); + LOG_ERR("%s : failed to eval\n", __func__); return 1; } @@ -569,9 +550,8 @@ int main(int argc, char ** argv) { embd.resize(max_embd_size); console::set_display(console::error); - printf("<>", skipped_tokens, skipped_tokens != 1 ? "s" : ""); + LOG_WRN("<>", skipped_tokens, skipped_tokens != 1 ? 
"s" : ""); console::set_display(console::reset); - fflush(stdout); } if (ga_n == 1) { @@ -581,14 +561,14 @@ int main(int argc, char ** argv) { // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches if (n_past + (int) embd.size() >= n_ctx) { if (params.n_predict == -2) { - LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict); + LOG_DBG("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict); break; } const int n_left = n_past - params.n_keep; const int n_discard = n_left/2; - LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n", + LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n", n_past, n_left, n_ctx, params.n_keep, n_discard); llama_kv_cache_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard); @@ -596,11 +576,11 @@ int main(int argc, char ** argv) { n_past -= n_discard; - LOG("after swap: n_past = %d\n", n_past); + LOG_DBG("after swap: n_past = %d\n", n_past); - LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str()); + LOG_DBG("embd: %s\n", string_from(ctx, embd).c_str()); - LOG("clear session path\n"); + LOG_DBG("clear session path\n"); path_session.clear(); } } else { @@ -610,10 +590,10 @@ int main(int argc, char ** argv) { const int bd = (ga_w/ga_n)*(ga_n - 1); const int dd = (ga_w/ga_n) - ib*bd - ga_w; - LOG("\n"); - LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i, n_past, ib*bd, ga_i + ib*bd, n_past + ib*bd); - LOG("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n, (ga_i + ib*bd)/ga_n, (ga_i + ib*bd + ga_w)/ga_n); - LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i + ib*bd + ga_w, n_past + ib*bd, dd, ga_i + ib*bd + ga_w + dd, n_past + ib*bd + dd); + LOG_DBG("\n"); + LOG_DBG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i, n_past, ib*bd, ga_i + ib*bd, n_past + ib*bd); + LOG_DBG("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n, (ga_i + ib*bd)/ga_n, (ga_i + ib*bd + ga_w)/ga_n); + LOG_DBG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i + ib*bd + ga_w, n_past + ib*bd, dd, ga_i + ib*bd + ga_w + dd, n_past + ib*bd + dd); llama_kv_cache_seq_add(ctx, 0, ga_i, n_past, ib*bd); llama_kv_cache_seq_div(ctx, 0, ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n); @@ -623,7 +603,7 @@ int main(int argc, char ** argv) { ga_i += ga_w/ga_n; - LOG("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", n_past + bd, n_past, ga_i); + LOG_DBG("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", n_past + bd, n_past, ga_i); } } @@ -655,19 +635,19 @@ int main(int argc, char ** argv) { n_eval = params.n_batch; } - LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str()); + LOG_DBG("eval: %s\n", string_from(ctx, embd).c_str()); if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) { - LOG_TEE("%s : failed to eval\n", __func__); + LOG_ERR("%s : failed to eval\n", __func__); return 1; } n_past += n_eval; - LOG("n_past = %d\n", n_past); + LOG_DBG("n_past = %d\n", n_past); // Display total tokens alongside total time if (params.n_print > 0 && n_past % params.n_print == 0) { - LOG_TEE("\n\033[31mTokens consumed so far = %d / %d \033[0m\n", n_past, n_ctx); + LOG_DBG("\n\033[31mTokens consumed so far = %d / %d \033[0m\n", n_past, n_ctx); } } @@ -685,14 +665,14 @@ int main(int argc, char ** argv) { need_to_save_session = false; llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size()); - LOG("saved 
session to %s\n", path_session.c_str()); + LOG_DBG("saved session to %s\n", path_session.c_str()); } const llama_token id = gpt_sampler_sample(smpl, ctx, -1); - gpt_sampler_accept(smpl, id, /* apply_grammar= */ true); + gpt_sampler_accept(smpl, id, /* accept_grammar= */ true); - // LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, smpl->prev.to_vector()).c_str()); + // LOG_DBG("last: %s\n", string_from(ctx, smpl->prev.to_vector()).c_str()); embd.push_back(id); @@ -702,16 +682,16 @@ int main(int argc, char ** argv) { // decrement remaining sampling budget --n_remain; - LOG("n_remain: %d\n", n_remain); + LOG_DBG("n_remain: %d\n", n_remain); } else { // some user input remains from prompt or interaction, forward it to processing - LOG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed); + LOG_DBG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed); while ((int) embd_inp.size() > n_consumed) { embd.push_back(embd_inp[n_consumed]); // push the prompt in the sampling context in order to apply repetition penalties later // for the prompt, we don't apply grammar rules - gpt_sampler_accept(smpl, embd_inp[n_consumed], /* apply_grammar= */ false); + gpt_sampler_accept(smpl, embd_inp[n_consumed], /* accept_grammar= */ false); ++n_consumed; if ((int) embd.size() >= params.n_batch) { @@ -726,7 +706,7 @@ int main(int argc, char ** argv) { const std::string token_str = llama_token_to_piece(ctx, id, params.special); // Console/Stream Output - fprintf(stdout, "%s", token_str.c_str()); + LOG("%s", token_str.c_str()); // Record Displayed Tokens To Log // Note: Generated tokens are created one by one hence this check @@ -738,8 +718,6 @@ int main(int argc, char ** argv) { output_tokens.push_back(id); output_ss << token_str; } - - fflush(stdout); } } @@ -788,13 +766,13 @@ int main(int argc, char ** argv) { } if (is_antiprompt) { - LOG("found antiprompt: %s\n", last_output.c_str()); + LOG_DBG("found antiprompt: %s\n", last_output.c_str()); } } // deal with end of generation tokens in interactive mode if (llama_token_is_eog(model, gpt_sampler_last(smpl))) { - LOG("found an EOG token\n"); + LOG_DBG("found an EOG token\n"); if (params.interactive) { if (!params.antiprompt.empty()) { @@ -808,7 +786,7 @@ int main(int argc, char ** argv) { chat_add_and_format(model, chat_msgs, "assistant", assistant_ss.str()); } is_interacting = true; - printf("\n"); + LOG("\n"); } } @@ -819,21 +797,21 @@ int main(int argc, char ** argv) { } if (n_past > 0 && is_interacting) { - LOG("waiting for user input\n"); + LOG_DBG("waiting for user input\n"); if (params.conversation) { - printf("\n> "); + LOG("\n> "); } if (params.input_prefix_bos) { - LOG("adding input prefix BOS token\n"); + LOG_DBG("adding input prefix BOS token\n"); embd_inp.push_back(llama_token_bos(model)); } std::string buffer; if (!params.input_prefix.empty() && !params.conversation) { - LOG("appending input prefix: '%s'\n", params.input_prefix.c_str()); - printf("%s", params.input_prefix.c_str()); + LOG_DBG("appending input prefix: '%s'\n", params.input_prefix.c_str()); + LOG("%s", params.input_prefix.c_str()); } // color user input only @@ -856,11 +834,11 @@ int main(int argc, char ** argv) { if (buffer.length() > 1) { // append input suffix if any if (!params.input_suffix.empty() && !params.conversation) { - LOG("appending input suffix: '%s'\n", params.input_suffix.c_str()); - printf("%s", params.input_suffix.c_str()); + LOG_DBG("appending input suffix: '%s'\n", params.input_suffix.c_str()); + LOG("%s", 
params.input_suffix.c_str()); } - LOG("buffer: '%s'\n", buffer.c_str()); + LOG_DBG("buffer: '%s'\n", buffer.c_str()); const size_t original_size = embd_inp.size(); @@ -877,7 +855,7 @@ int main(int argc, char ** argv) { const auto line_inp = ::llama_tokenize(ctx, user_inp, false, format_chat); const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true); - LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str()); + LOG_DBG("input tokens: %s\n", string_from(ctx, line_inp).c_str()); // if user stop generation mid-way, we must add EOT to finish model's last response if (need_insert_eot && format_chat) { @@ -900,9 +878,9 @@ int main(int argc, char ** argv) { assistant_ss.str(""); n_remain -= line_inp.size(); - LOG("n_remain: %d\n", n_remain); + LOG_DBG("n_remain: %d\n", n_remain); } else { - LOG("empty line, passing control back\n"); + LOG_DBG("empty line, passing control back\n"); } input_echo = false; // do not echo this again @@ -918,7 +896,7 @@ int main(int argc, char ** argv) { // end of generation if (!embd.empty() && llama_token_is_eog(model, embd.back()) && !(params.interactive)) { - LOG_TEE(" [end of text]\n"); + LOG(" [end of text]\n"); break; } @@ -931,11 +909,11 @@ int main(int argc, char ** argv) { } if (!path_session.empty() && params.prompt_cache_all && !params.prompt_cache_ro) { - LOG_TEE("\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str()); + LOG("\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str()); llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size()); } - LOG_TEE("\n"); + LOG("\n\n"); gpt_perf_print(ctx, smpl); write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens); @@ -949,9 +927,5 @@ int main(int argc, char ** argv) { ggml_threadpool_free(threadpool); ggml_threadpool_free(threadpool_batch); -#ifndef LOG_DISABLE_LOGS - LOG_TEE("Log end\n"); -#endif // LOG_DISABLE_LOGS - return 0; } diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp index 758393c3d..81e2f7ed7 100644 --- a/examples/parallel/parallel.cpp +++ b/examples/parallel/parallel.cpp @@ -4,6 +4,7 @@ #include "arg.h" #include "common.h" #include "sampling.h" +#include "log.h" #include "llama.h" #include @@ -83,7 +84,9 @@ static void print_date_time() { char buffer[80]; strftime(buffer, sizeof(buffer), "%Y-%m-%d %H:%M:%S", local_time); - printf("\n\033[35mrun parameters as at %s\033[0m\n", buffer); + LOG_INF("\n"); + LOG_INF("\033[35mrun parameters as of %s\033[0m\n", buffer); + LOG_INF("\n"); } // Define a split string function to ... 
@@ -106,6 +109,8 @@ int main(int argc, char ** argv) { return 1; } + gpt_init(); + // number of simultaneous "clients" to simulate const int32_t n_clients = params.n_parallel; @@ -120,12 +125,6 @@ int main(int argc, char ** argv) { const bool dump_kv_cache = params.dump_kv_cache; -#ifndef LOG_DISABLE_LOGS - log_set_target(log_filename_generator("parallel", "log")); - LOG_TEE("Log start\n"); - log_dump_cmdline(argc, argv); -#endif // LOG_DISABLE_LOGS - // init llama.cpp llama_backend_init(); llama_numa_init(params.numa); @@ -138,23 +137,22 @@ int main(int argc, char ** argv) { // load the prompts from an external file if there are any if (params.prompt.empty()) { - printf("\n\033[32mNo new questions so proceed with build-in defaults.\033[0m\n"); + LOG_INF("\033[32mNo new questions so proceed with build-in defaults.\033[0m\n"); } else { // Output each line of the input params.prompts vector and copy to k_prompts int index = 0; - printf("\n\033[32mNow printing the external prompt file %s\033[0m\n\n", params.prompt_file.c_str()); + LOG_INF("\033[32mNow printing the external prompt file %s\033[0m\n\n", params.prompt_file.c_str()); std::vector prompts = split_string(params.prompt, '\n'); for (const auto& prompt : prompts) { k_prompts.resize(index + 1); k_prompts[index] = prompt; index++; - printf("%3d prompt: %s\n", index, prompt.c_str()); + LOG_INF("%3d prompt: %s\n", index, prompt.c_str()); } } - fprintf(stderr, "\n\n"); - fflush(stderr); + LOG_INF("\n\n"); const int n_ctx = llama_n_ctx(ctx); @@ -183,19 +181,19 @@ int main(int argc, char ** argv) { const auto t_main_start = ggml_time_us(); - LOG_TEE("%s: Simulating parallel requests from clients:\n", __func__); - LOG_TEE("%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system); - LOG_TEE("\n"); + LOG_INF("%s: Simulating parallel requests from clients:\n", __func__); + LOG_INF("%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system); + LOG_INF("\n"); { - LOG_TEE("%s: Evaluating the system prompt ...\n", __func__); + LOG_INF("%s: Evaluating the system prompt ...\n", __func__); for (int32_t i = 0; i < n_tokens_system; ++i) { llama_batch_add(batch, tokens_system[i], i, { 0 }, false); } if (llama_decode(ctx, batch) != 0) { - LOG_TEE("%s: llama_decode() failed\n", __func__); + LOG_ERR("%s: llama_decode() failed\n", __func__); return 1; } @@ -204,10 +202,10 @@ int main(int argc, char ** argv) { llama_kv_cache_seq_cp(ctx, 0, i, -1, -1); } - LOG_TEE("\n"); + LOG_INF("\n"); } - LOG_TEE("Processing requests ...\n\n"); + LOG_INF("Processing requests ...\n\n"); while (true) { if (dump_kv_cache) { @@ -238,7 +236,7 @@ int main(int argc, char ** argv) { llama_kv_cache_seq_cp(ctx, 0, i, -1, -1); } - LOG_TEE("%s: clearing the KV cache\n", __func__); + LOG_INF("%s: clearing the KV cache\n", __func__); } // insert new sequences for decoding @@ -273,7 +271,7 @@ int main(int argc, char ** argv) { client.n_decoded = 0; client.i_batch = batch.n_tokens - 1; - LOG_TEE("\033[31mClient %3d, seq %4d, started decoding ...\033[0m\n", client.id, client.seq_id); + LOG_INF("\033[31mClient %3d, seq %4d, started decoding ...\033[0m\n", client.id, client.seq_id); g_seq_id += 1; @@ -317,11 +315,11 @@ int main(int argc, char ** argv) { if (ret != 0) { if (n_batch == 1 || ret < 0) { // if you get here, it means the KV cache is full - try increasing it via the context size - LOG_TEE("%s : failed to decode the 
batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret); + LOG_ERR("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret); return 1; } - LOG("%s : failed to decode the batch, retrying with n_batch = %d\n", __func__, n_batch / 2); + LOG_ERR("%s : failed to decode the batch, retrying with n_batch = %d\n", __func__, n_batch / 2); n_cache_miss += 1; @@ -332,7 +330,7 @@ int main(int argc, char ** argv) { continue; } - LOG("%s : decoded batch of %d tokens\n", __func__, n_tokens); + LOG_DBG("%s : decoded batch of %d tokens\n", __func__, n_tokens); for (auto & client : clients) { if (client.i_batch < (int) i || client.i_batch >= (int) (i + n_tokens)) { @@ -377,7 +375,7 @@ int main(int argc, char ** argv) { const auto t_main_end = ggml_time_us(); - LOG_TEE("\033[31mClient %3d, seq %3d/%3d, prompt %4d t, response %4d t, time %5.2f s, speed %5.2f t/s, cache miss %d \033[0m \nInput: %s\n\033[35mResponse: %s\033[0m\n\n", + LOG_INF("\033[31mClient %3d, seq %3d/%3d, prompt %4d t, response %4d t, time %5.2f s, speed %5.2f t/s, cache miss %d \033[0m \n\nInput: %s\n\033[35mResponse: %s\033[0m\n\n", client.id, client.seq_id, n_seq, client.n_prompt, client.n_decoded, (t_main_end - client.t_start_prompt) / 1e6, (double) (client.n_prompt + client.n_decoded) / (t_main_end - client.t_start_prompt) * 1e6, @@ -400,19 +398,19 @@ int main(int argc, char ** argv) { print_date_time(); - LOG_TEE("\n%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system); + LOG_INF("%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system); if (params.prompt_file.empty()) { params.prompt_file = "used built-in defaults"; } - LOG_TEE("External prompt file: \033[32m%s\033[0m\n", params.prompt_file.c_str()); - LOG_TEE("Model and path used: \033[32m%s\033[0m\n\n", params.model.c_str()); + LOG_INF("External prompt file: \033[32m%s\033[0m\n", params.prompt_file.c_str()); + LOG_INF("Model and path used: \033[32m%s\033[0m\n\n", params.model.c_str()); - LOG_TEE("Total prompt tokens: %6d, speed: %5.2f t/s\n", n_total_prompt, (double) (n_total_prompt ) / (t_main_end - t_main_start) * 1e6); - LOG_TEE("Total gen tokens: %6d, speed: %5.2f t/s\n", n_total_gen, (double) (n_total_gen ) / (t_main_end - t_main_start) * 1e6); - LOG_TEE("Total speed (AVG): %6s speed: %5.2f t/s\n", "", (double) (n_total_prompt + n_total_gen) / (t_main_end - t_main_start) * 1e6); - LOG_TEE("Cache misses: %6d\n", n_cache_miss); + LOG_INF("Total prompt tokens: %6d, speed: %5.2f t/s\n", n_total_prompt, (double) (n_total_prompt ) / (t_main_end - t_main_start) * 1e6); + LOG_INF("Total gen tokens: %6d, speed: %5.2f t/s\n", n_total_gen, (double) (n_total_gen ) / (t_main_end - t_main_start) * 1e6); + LOG_INF("Total speed (AVG): %6s speed: %5.2f t/s\n", "", (double) (n_total_prompt + n_total_gen) / (t_main_end - t_main_start) * 1e6); + LOG_INF("Cache misses: %6d\n", n_cache_miss); - LOG_TEE("\n"); + LOG_INF("\n"); // TODO: print sampling/grammar timings for all clients llama_perf_context_print(ctx); @@ -424,7 +422,7 @@ int main(int argc, char ** argv) { llama_backend_free(); - fprintf(stderr, "\n\n"); + LOG("\n\n"); return 0; } diff --git a/examples/passkey/passkey.cpp b/examples/passkey/passkey.cpp index 52aa68bfc..7ef8d14f3 100644 --- a/examples/passkey/passkey.cpp +++ b/examples/passkey/passkey.cpp @@ -1,5 +1,6 @@ #include "arg.h" #include "common.h" +#include "log.h" #include 
"llama.h" #include @@ -8,9 +9,9 @@ #include static void print_usage(int, char ** argv) { - LOG_TEE("\nexample usage:\n"); - LOG_TEE("\n %s -m model.gguf --junk 250 --pos 90 --keep 32 --grp-attn-n 2 [--seed 1234]\n", argv[0]); - LOG_TEE("\n"); + LOG("\nexample usage:\n"); + LOG("\n %s -m model.gguf --junk 250 --pos 90 --keep 32 --grp-attn-n 2 [--seed 1234]\n", argv[0]); + LOG("\n"); } int main(int argc, char ** argv) { @@ -24,6 +25,8 @@ int main(int argc, char ** argv) { return 1; } + gpt_init(); + int n_junk = params.n_junk; int n_keep = params.n_keep; int n_grp = params.grp_attn_n; @@ -63,7 +66,7 @@ int main(int argc, char ** argv) { llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params); if (model == NULL) { - fprintf(stderr , "%s: error: unable to load model\n" , __func__); + LOG_ERR("%s: unable to load model\n" , __func__); return 1; } @@ -77,7 +80,7 @@ int main(int argc, char ** argv) { llama_context * ctx = llama_new_context_with_model(model, ctx_params); if (ctx == NULL) { - fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__); + LOG_ERR("%s: failed to create the llama_context\n" , __func__); return 1; } @@ -107,14 +110,14 @@ int main(int argc, char ** argv) { const int n_batch = ctx_params.n_batch; const int n_batch_grp = ctx_params.n_batch/n_grp; - LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d, n_grp = %d, n_batch = %d, n_junk = %d, i_pos = %d\n", __func__, n_len, n_ctx, n_kv_req, n_grp, n_batch, n_junk, i_pos); + LOG_INF("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d, n_grp = %d, n_batch = %d, n_junk = %d, i_pos = %d\n", __func__, n_len, n_ctx, n_kv_req, n_grp, n_batch, n_junk, i_pos); // print the prompt token-by-token - LOG_TEE("\n"); - LOG_TEE("prefix tokens: %d\n", n_tokens_prefix); - LOG_TEE("prompt tokens: %d\n", n_tokens_all); - //LOG_TEE("prompt: %s\n", params.prompt.c_str()); + LOG_INF("\n"); + LOG_INF("prefix tokens: %d\n", n_tokens_prefix); + LOG_INF("prompt tokens: %d\n", n_tokens_all); + //LOG_INF("prompt: %s\n", params.prompt.c_str()); llama_batch batch = llama_batch_init(params.n_batch, 0, 1); @@ -145,11 +148,11 @@ int main(int argc, char ** argv) { } if (llama_decode(ctx, batch) != 0) { - LOG_TEE("%s: llama_decode() failed\n", __func__); + LOG_INF("%s: llama_decode() failed\n", __func__); return 1; } - LOG_TEE("%s: processed: [%6d, %6d)\n", __func__, i, std::min(i + n_batch, n_tokens_all)); + LOG_INF("%s: processed: [%6d, %6d)\n", __func__, i, std::min(i + n_batch, n_tokens_all)); if (i + n_batch >= n_tokens_all) { break; @@ -159,7 +162,7 @@ int main(int argc, char ** argv) { for (int i = n_ctx; i < n_tokens_all; i += n_batch) { const int n_discard = n_batch; - LOG_TEE("%s: shifting KV cache with %d\n", __func__, n_discard); + LOG_INF("%s: shifting KV cache with %d\n", __func__, n_discard); llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard); llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard); @@ -179,18 +182,18 @@ int main(int argc, char ** argv) { } if (llama_decode(ctx, batch) != 0) { - LOG_TEE("%s: llama_decode() failed\n", __func__); + LOG_ERR("%s: llama_decode() failed\n", __func__); return 1; } - LOG_TEE("%s: processed: [%6d, %6d)\n", __func__, i, std::min(i + n_batch, n_tokens_all)); + LOG_INF("%s: processed: [%6d, %6d)\n", __func__, i, std::min(i + n_batch, n_tokens_all)); } { const int n_discard = n_past - n_ctx + n_predict; if (n_discard > 0) { - LOG_TEE("%s: shifting KV cache with %d to free space for the answer\n", __func__, n_discard); + LOG_INF("%s: shifting KV 
cache with %d to free space for the answer\n", __func__, n_discard); llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard); llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard); @@ -201,17 +204,16 @@ int main(int argc, char ** argv) { } } - LOG_TEE("\n"); - LOG_TEE("%s: passkey = %d, inserted at position %d / %d (token pos: ~%d)\n", __func__, passkey, i_pos, n_junk, (i_pos * n_tokens_all) / n_junk); - LOG_TEE("\n"); + LOG_INF("\n"); + LOG_INF("%s: passkey = %d, inserted at position %d / %d (token pos: ~%d)\n", __func__, passkey, i_pos, n_junk, (i_pos * n_tokens_all) / n_junk); + LOG_INF("\n"); // main loop int n_cur = n_tokens_all; int n_decode = 0; - LOG_TEE("%s", prompt_suffix.c_str()); - fflush(stdout); + LOG_INF("%s", prompt_suffix.c_str()); const auto t_main_start = ggml_time_us(); @@ -222,13 +224,12 @@ int main(int argc, char ** argv) { // is it an end of generation? if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) { - LOG_TEE("\n"); + LOG("\n"); break; } - LOG_TEE("%s", llama_token_to_piece(ctx, new_token_id).c_str()); - fflush(stdout); + LOG("%s", llama_token_to_piece(ctx, new_token_id).c_str()); n_decode += 1; @@ -243,22 +244,22 @@ int main(int argc, char ** argv) { // evaluate the current batch with the transformer model if (llama_decode(ctx, batch)) { - fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1); + LOG_ERR("%s : failed to eval, return code %d\n", __func__, 1); return 1; } } - LOG_TEE("\n"); + LOG("\n"); const auto t_main_end = ggml_time_us(); - LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n", + LOG_INF("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n", __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f)); - LOG_TEE("\n"); + LOG("\n"); llama_perf_context_print(ctx); - fprintf(stderr, "\n"); + LOG("\n"); llama_sampler_free(smpl); diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index 29ff86bbc..18e75a7a2 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -1,7 +1,9 @@ #include "arg.h" #include "common.h" +#include "log.h" #include "llama.h" +#include #include #include #include @@ -41,7 +43,7 @@ static void write_logfile( } if (params.hellaswag) { - fprintf(stderr, "%s: warning: logging results is not implemented for HellaSwag. No files will be written.\n", __func__); + LOG_WRN("%s: logging results is not implemented for HellaSwag. 
No files will be written.\n", __func__); return; } @@ -49,7 +51,7 @@ static void write_logfile( const bool success = fs_create_directory_with_parents(params.logdir); if (!success) { - fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n", + LOG_WRN("%s: failed to create logdir %s, cannot write logfile\n", __func__, params.logdir.c_str()); return; } @@ -58,7 +60,7 @@ static void write_logfile( FILE * logfile = fopen(logfile_path.c_str(), "w"); if (logfile == NULL) { - fprintf(stderr, "%s: failed to open logfile %s\n", __func__, logfile_path.c_str()); + LOG_ERR("%s: failed to open logfile %s\n", __func__, logfile_path.c_str()); return; } @@ -344,16 +346,16 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & const bool add_bos = llama_add_bos_token(llama_get_model(ctx)); GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx))); - fprintf(stderr, "%s: tokenizing the input ..\n", __func__); + LOG_INF("%s: tokenizing the input ..\n", __func__); std::vector tokens = ::llama_tokenize(ctx, params.prompt, true); const int n_ctx = llama_n_ctx(ctx); if (int(tokens.size()) < 2*n_ctx) { - fprintf(stderr, "%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx, + LOG_ERR("%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx, n_ctx); - fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size()); + LOG_ERR("%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size()); return {std::move(tokens), 0., {}, {}}; } @@ -364,16 +366,16 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & prob_history.resize(tokens.size()); if (params.ppl_stride <= 0) { - fprintf(stderr, "%s: stride is %d but must be greater than zero!\n",__func__,params.ppl_stride); + LOG_ERR("%s: stride is %d but must be greater than zero!\n",__func__,params.ppl_stride); return {tokens, -1, logit_history, prob_history}; } const int calc_chunk = n_ctx; - fprintf(stderr, "%s: have %zu tokens. Calculation chunk = %d\n", __func__, tokens.size(), calc_chunk); + LOG_INF("%s: have %zu tokens. 
Calculation chunk = %d\n", __func__, tokens.size(), calc_chunk); if (int(tokens.size()) <= calc_chunk) { - fprintf(stderr, "%s: there are only %zu tokens, this is not enough for a context size of %d and stride %d\n",__func__, + LOG_ERR("%s: there are only %zu tokens, this is not enough for a context size of %d and stride %d\n",__func__, tokens.size(), n_ctx, params.ppl_stride); return {tokens, -1, logit_history, prob_history}; } @@ -387,14 +389,14 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & int count = 0; double nll = 0.0; - fprintf(stderr, "%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch); + LOG_INF("%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch); for (int i = 0; i < n_chunk; ++i) { const int start = i * params.ppl_stride; const int end = start + calc_chunk; const int num_batches = (calc_chunk + n_batch - 1) / n_batch; - //fprintf(stderr, "%s: evaluating %d...%d using %d batches\n", __func__, start, end, num_batches); + //LOG_DBG("%s: evaluating %d...%d using %d batches\n", __func__, start, end, num_batches); std::vector logits; @@ -407,10 +409,10 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & const int batch_start = start + j * n_batch; const int batch_size = std::min(end - batch_start, n_batch); - //fprintf(stderr, " Batch %d: starts at %d, size is %d, n_past is %d\n",j,batch_start,batch_size,j * n_batch); + //LOG_DBG(" Batch %d: starts at %d, size is %d, n_past is %d\n",j,batch_start,batch_size,j * n_batch); // TODO: use llama_batch.logits instead of relying on logits_all == true if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) { - //fprintf(stderr, "%s : failed to eval\n", __func__); + //LOG_ERR("%s : failed to eval\n", __func__); return {tokens, -1, logit_history, prob_history}; } @@ -434,16 +436,17 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & if (i == 0) { const float t_total = std::chrono::duration(t_end - t_start).count(); - fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total); + LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total); int total_seconds = (int)(t_total * n_chunk); if (total_seconds >= 60*60) { - fprintf(stderr, "%d hours ", total_seconds / (60*60)); + LOG("%d hours ", total_seconds / (60*60)); total_seconds = total_seconds % (60*60); } - fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0); + LOG("%.2f minutes\n", total_seconds / 60.0); } + LOG("\n"); - //fprintf(stderr, "%s: using tokens %d...%d\n",__func__,params.n_ctx - params.ppl_stride + start, params.n_ctx + start); + //LOG_DBG("%s: using tokens %d...%d\n",__func__,params.n_ctx - params.ppl_stride + start, params.n_ctx + start); for (int j = n_ctx - params.ppl_stride - 1; j < n_ctx - 1; ++j) { // Calculate probability of next token, given the previous ones. 
@@ -460,13 +463,12 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & } // perplexity is e^(average negative log-likelihood) if (params.ppl_output_type == 0) { - printf("[%d]%.4lf,", i + 1, std::exp(nll / count)); + LOG("[%d]%.4lf,", i + 1, std::exp(nll / count)); } else { - printf("%8d %.4lf\n", i*params.ppl_stride, std::exp(nll / count)); + LOG("%8d %.4lf\n", i*params.ppl_stride, std::exp(nll / count)); } - fflush(stdout); } - printf("\n"); + LOG("\n"); return {tokens, std::exp(nll / count), logit_history, prob_history}; } @@ -488,26 +490,26 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par if (!params.logits_file.empty()) { logits_stream.open(params.logits_file.c_str(), std::ios::binary); if (!logits_stream.is_open()) { - fprintf(stderr, "%s: failed to open %s for writing\n", __func__, params.logits_file.c_str()); + LOG_ERR("%s: failed to open %s for writing\n", __func__, params.logits_file.c_str()); return {}; } - fprintf(stderr, "%s: saving all logits to %s\n", __func__, params.logits_file.c_str()); + LOG_INF("%s: saving all logits to %s\n", __func__, params.logits_file.c_str()); logits_stream.write("_logits_", 8); logits_stream.write(reinterpret_cast(&n_ctx), sizeof(n_ctx)); } auto tim1 = std::chrono::high_resolution_clock::now(); - fprintf(stderr, "%s: tokenizing the input ..\n", __func__); + LOG_INF("%s: tokenizing the input ..\n", __func__); std::vector tokens = ::llama_tokenize(ctx, params.prompt, true); auto tim2 = std::chrono::high_resolution_clock::now(); - fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast(tim2-tim1).count()); + LOG_INF("%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast(tim2-tim1).count()); if (int(tokens.size()) < 2*n_ctx) { - fprintf(stderr, "%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx, + LOG_ERR("%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx, n_ctx); - fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size()); + LOG_ERR("%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size()); return {std::move(tokens), 0., {}, {}}; } @@ -540,7 +542,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par logits.reserve((size_t)n_ctx * n_vocab); } - fprintf(stderr, "%s: calculating perplexity over %d chunks, n_ctx=%d, batch_size=%d, n_seq=%d\n", __func__, n_chunk, n_ctx, n_batch, n_seq); + LOG_INF("%s: calculating perplexity over %d chunks, n_ctx=%d, batch_size=%d, n_seq=%d\n", __func__, n_chunk, n_ctx, n_batch, n_seq); std::vector workers(std::thread::hardware_concurrency() - 1); @@ -613,7 +615,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par } if (llama_decode(ctx, batch)) { - fprintf(stderr, "%s : failed to eval\n", __func__); + LOG_INF("%s : failed to eval\n", __func__); return {tokens, -1, logit_history, prob_history}; } @@ -628,14 +630,15 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par llama_synchronize(ctx); const auto t_end = std::chrono::high_resolution_clock::now(); const float t_total = std::chrono::duration(t_end - t_start).count(); - fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total); + LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total); int total_seconds = (int)(t_total*n_chunk/n_seq); if (total_seconds >= 60*60) { - 
fprintf(stderr, "%d hours ", total_seconds / (60*60)); + LOG("%d hours ", total_seconds / (60*60)); total_seconds = total_seconds % (60*60); } - fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0); + LOG("%.2f minutes\n", total_seconds / 60.0); } + LOG("\n"); for (int seq = 0; seq < n_seq_batch; seq++) { const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits_ith(ctx, seq*n_ctx + first); @@ -656,19 +659,18 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par // perplexity is e^(average negative log-likelihood) if (params.ppl_output_type == 0) { - printf("[%d]%.4lf,", i + seq + 1, std::exp(nll / count)); + LOG("[%d]%.4lf,", i + seq + 1, std::exp(nll / count)); } else { double av = nll/count; double av2 = nll2/count - av*av; if (av2 > 0) av2 = sqrt(av2/(count-1)); - printf("%8d %.4lf %4lf %4lf\n", i*n_ctx, std::exp(nll / count), av, av2); + LOG("%8d %.4lf %4lf %4lf\n", i*n_ctx, std::exp(nll / count), av, av2); } } - fflush(stdout); logits.clear(); } - printf("\n"); + LOG("\n"); nll2 /= count; nll /= count; @@ -676,9 +678,9 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par nll2 -= nll * nll; if (nll2 > 0) { nll2 = sqrt(nll2/(count-1)); - printf("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl); + LOG_INF("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl); } else { - printf("Unexpected negative standard deviation of log(prob)\n"); + LOG_ERR("Unexpected negative standard deviation of log(prob)\n"); } llama_batch_free(batch); @@ -704,7 +706,7 @@ static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector< const int ret = llama_decode(ctx, batch_view); if (ret != 0) { - LOG_TEE("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret); + LOG_ERR("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret); return false; } @@ -790,15 +792,15 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) { } if (prompt_lines.size() % 6 != 0) { - fprintf(stderr, "%s : number of lines in prompt not a multiple of 6.\n", __func__); + LOG_ERR("%s : number of lines in prompt not a multiple of 6.\n", __func__); return; } size_t hs_task_count = prompt_lines.size()/6; - fprintf(stderr, "%s : loaded %zu tasks from prompt.\n", __func__, hs_task_count); + LOG_INF("%s : loaded %zu tasks from prompt.\n", __func__, hs_task_count); const bool is_spm = llama_vocab_type(llama_get_model(ctx)) == LLAMA_VOCAB_TYPE_SPM; - fprintf(stderr, "================================= is_spm = %d\n", is_spm); + LOG_INF("================================= is_spm = %d\n", is_spm); // The tasks should be randomized so the score stabilizes quickly. 
bool randomize_tasks = true; @@ -825,7 +827,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) { std::vector seq_tokens[4]; }; - fprintf(stderr, "%s : selecting %zu %s tasks.\n", __func__, hs_task_count, (randomize_tasks?"randomized":"the first") ); + LOG_INF("%s : selecting %zu %s tasks.\n", __func__, hs_task_count, (randomize_tasks?"randomized":"the first") ); // Select and read data from prompt lines std::vector hs_data(hs_task_count); @@ -871,9 +873,9 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) { } } - fprintf(stderr, "%s : calculating hellaswag score over selected tasks.\n", __func__); + LOG_INF("%s : calculating hellaswag score over selected tasks.\n", __func__); - printf("\ntask\tacc_norm\n"); + LOG("\ntask\tacc_norm\n"); double acc = 0.0f; @@ -941,7 +943,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) { } if (i0 == i1) { - fprintf(stderr, "%s : task %zu does not fit in the context window\n", __func__, i0); + LOG_ERR("%s : task %zu does not fit in the context window\n", __func__, i0); return; } @@ -949,7 +951,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) { // decode all tasks [i0, i1) if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) { - fprintf(stderr, "%s: llama_decode() failed\n", __func__); + LOG_ERR("%s: llama_decode() failed\n", __func__); return; } @@ -999,7 +1001,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) { } } - //printf("max logprob ending idx %lu, gold ending idx %lu\n", ending_logprob_max_idx, hs_cur.gold_ending_idx); + //LOG("max logprob ending idx %lu, gold ending idx %lu\n", ending_logprob_max_idx, hs_cur.gold_ending_idx); // If the gold ending got the maximum logprobe add one accuracy point if (ending_logprob_max_idx == hs_cur.gold_ending_idx) { @@ -1007,8 +1009,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) { } // Print the accumulated accuracy mean x 100 - printf("%zu\t%.8lf\n", i + 1, acc/double(i + 1)*100.0); - fflush(stdout); + LOG("%zu\t%.8lf\n", i + 1, acc/double(i + 1)*100.0); } i0 = i1 - 1; @@ -1016,7 +1017,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) { llama_batch_free(batch); - printf("\n"); + LOG("\n"); } struct winogrande_entry { @@ -1060,7 +1061,7 @@ static std::vector load_winogrande_from_csv(const std::string } } if (ipos != 4) { - printf("%s: failed to find comma separators in <%s>\n", __func__, line.c_str()); + LOG_ERR("%s: failed to find comma separators in <%s>\n", __func__, line.c_str()); continue; } auto sentence = line[comma_pos[0]+1] == '"' ? 
line.substr(comma_pos[0]+2, comma_pos[1] - comma_pos[0] - 3) @@ -1074,13 +1075,13 @@ static std::vector load_winogrande_from_csv(const std::string if (sentence[where] == '_') break; } if (where == int(sentence.size())) { - printf("%s: no _ in <%s>\n", __func__, sentence.c_str()); + LOG_ERR("%s: no _ in <%s>\n", __func__, sentence.c_str()); continue; } std::istringstream stream(answer.c_str()); int i_answer; stream >> i_answer; if (stream.fail() || i_answer < 1 || i_answer > 2) { - printf("%s: failed to parse answer <%s>\n", __func__, answer.c_str()); + LOG_ERR("%s: failed to parse answer <%s>\n", __func__, answer.c_str()); continue; } result.emplace_back(); @@ -1109,14 +1110,14 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) { auto data = load_winogrande_from_csv(params.prompt); if (data.empty()) { - fprintf(stderr, "%s: no tasks\n", __func__); + LOG_ERR("%s: no tasks\n", __func__); return; } - fprintf(stderr, "%s : loaded %zu tasks from prompt.\n", __func__, data.size()); + LOG_INF("%s : loaded %zu tasks from prompt.\n", __func__, data.size()); if (params.winogrande_tasks > 0 && params.winogrande_tasks < data.size()) { - fprintf(stderr, "%s : selecting %zu random tasks\n", __func__, params.winogrande_tasks); + LOG_INF("%s : selecting %zu random tasks\n", __func__, params.winogrande_tasks); std::mt19937 rng(1); std::vector aux(data.size()); for (int i = 0; i < int(data.size()); ++i) { @@ -1134,7 +1135,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) { data = std::move(selected); } - fprintf(stderr, "%s : tokenizing selected tasks\n", __func__); + LOG_INF("%s : tokenizing selected tasks\n", __func__); for (auto & task : data) { task.seq_tokens[0] = ::llama_tokenize(ctx, task.first + task.choices[0] + task.second, true); @@ -1157,7 +1158,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) { task.n_base2 = ::llama_tokenize(ctx, task.first + task.choices[1], true).size(); } - fprintf(stderr, "%s : calculating winogrande score over selected tasks.\n", __func__); + LOG_INF("%s : calculating winogrande score over selected tasks.\n", __func__); const int n_vocab = llama_n_vocab(llama_get_model(ctx)); const int n_ctx = llama_n_ctx(ctx); @@ -1218,7 +1219,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) { } if (i0 == i1) { - fprintf(stderr, "%s : task %zu does not fit in the context window\n", __func__, i0); + LOG_ERR("%s : task %zu does not fit in the context window\n", __func__, i0); return; } @@ -1226,7 +1227,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) { // decode all tasks [i0, i1) if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) { - fprintf(stderr, "%s: llama_decode() failed\n", __func__); + LOG_ERR("%s: llama_decode() failed\n", __func__); return; } @@ -1286,20 +1287,20 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) { ++n_done; // print the accumulated accuracy mean x 100 - printf("%zu\t%.4lf\t%10.6f %10.6f %d %d\n", i+1, 100.0 * n_correct/n_done, score_1st, score_2nd, result, task.answer); - fflush(stdout); + LOG("%zu\t%.4lf\t%10.6f %10.6f %d %d\n", i+1, 100.0 * n_correct/n_done, score_1st, score_2nd, result, task.answer); } i0 = i1 - 1; } - printf("\n"); + LOG("\n"); if (n_done < 100) return; const float p = 1.f*n_correct/n_done; const float sigma = 100.f*sqrt(p*(1-p)/(n_done-1)); - printf("Final Winogrande score(%d tasks): %.4lf +/- %.4lf\n", n_done, 100*p, sigma); + + 
LOG_INF("Final Winogrande score(%d tasks): %.4lf +/- %.4lf\n", n_done, 100*p, sigma); } static bool deserialize_string(std::istream & in, std::string & str) { @@ -1348,7 +1349,7 @@ struct multiple_choice_task { static bool multiple_choice_prepare_one_task(llama_context * ctx, multiple_choice_task& task, bool log_error) { if (task.question.empty() || task.mc1.answers.empty()) { if (log_error) { - printf("%s: found bad task with empty question and/or answers\n", __func__); + LOG_ERR("%s: found bad task with empty question and/or answers\n", __func__); } return false; } @@ -1356,7 +1357,7 @@ static bool multiple_choice_prepare_one_task(llama_context * ctx, multiple_choic for (auto& answer : task.mc1.answers) { if (answer.empty()) { if (log_error) { - printf("%s: found empty answer\n", __func__); + LOG_ERR("%s: found empty answer\n", __func__); } return false; } @@ -1410,14 +1411,14 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params uint32_t n_task; strstream.read((char *)&n_task, sizeof(n_task)); if (strstream.fail() || n_task == 0) { - printf("%s: no tasks\n", __func__); + LOG_ERR("%s: no tasks\n", __func__); return; } - printf("%s: there are %u tasks in prompt\n", __func__, n_task); + LOG_INF("%s: there are %u tasks in prompt\n", __func__, n_task); std::vector task_pos(n_task); strstream.read((char *)task_pos.data(), task_pos.size()*sizeof(uint32_t)); if (strstream.fail()) { - printf("%s: failed to read task positions from prompt\n", __func__); + LOG_ERR("%s: failed to read task positions from prompt\n", __func__); return; } @@ -1425,21 +1426,21 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params if (params.multiple_choice_tasks == 0 || params.multiple_choice_tasks >= (size_t)n_task) { // Use all tasks tasks.resize(n_task); - printf("%s: reading tasks", __func__); + LOG_INF("%s: reading tasks", __func__); int n_dot = std::max((int) n_task/100, 1); int i = 0; for (auto& task : tasks) { ++i; if (!task.deserialize(strstream)) { - printf("%s: failed to read task %d of %u\n", __func__, i, n_task); + LOG_ERR("%s: failed to read task %d of %u\n", __func__, i, n_task); return; } - if (i%n_dot == 0) printf("."); + if (i%n_dot == 0) LOG("."); } - printf("done\n"); + LOG("done\n"); } else { - printf("%s: selecting %zu random tasks from %u tasks available\n", __func__, params.multiple_choice_tasks, n_task); + LOG_INF("%s: selecting %zu random tasks from %u tasks available\n", __func__, params.multiple_choice_tasks, n_task); std::mt19937 rng(1); std::vector aux(n_task); for (uint32_t i = 0; i < n_task; ++i) aux[i] = i; @@ -1452,18 +1453,16 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params aux.pop_back(); strstream.seekg(task_pos[idx], std::ios::beg); if (!task.deserialize(strstream)) { - printf("%s: failed to read task %d at position %u\n", __func__, idx, task_pos[idx]); + LOG_ERR("%s: failed to read task %d at position %u\n", __func__, idx, task_pos[idx]); return; } } n_task = params.multiple_choice_tasks; } - printf("%s: preparing task data", __func__); - fflush(stdout); + LOG_INF("%s: preparing task data", __func__); if (n_task > 500) { - printf("..."); - fflush(stdout); + LOG("..."); std::atomic counter(0); std::atomic n_bad(0); auto prepare = [&counter, &n_bad, &tasks, ctx] () { @@ -1487,11 +1486,10 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params for (auto& w : workers) w = std::thread(prepare); prepare(); for (auto& w : workers) w.join(); - printf("done\n"); - 
fflush(stdout); + LOG("done\n"); int nbad = n_bad; if (nbad > 0) { - printf("%s: found %d malformed tasks\n", __func__, nbad); + LOG_ERR("%s: found %d malformed tasks\n", __func__, nbad); return; } } else { @@ -1503,16 +1501,15 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params return; } if (i_task%n_dot == 0) { - printf("."); - fflush(stdout); + LOG("."); } } - printf("done\n"); + LOG("done\n"); } - printf("%s : calculating TruthfulQA score over %zu tasks.\n", __func__, tasks.size()); + LOG_INF("%s : calculating TruthfulQA score over %zu tasks.\n", __func__, tasks.size()); - printf("\ntask\tacc_norm\n"); + LOG("\ntask\tacc_norm\n"); const int n_vocab = llama_n_vocab(llama_get_model(ctx)); const int n_ctx = llama_n_ctx(ctx); @@ -1591,7 +1588,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params } if (i0 == i1) { - fprintf(stderr, "%s : task %zu does not fit in the context window\n", __func__, i0); + LOG_ERR("%s : task %zu does not fit in the context window\n", __func__, i0); return; } @@ -1599,7 +1596,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params // decode all tasks [i0, i1) if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) { - fprintf(stderr, "%s: llama_decode() failed\n", __func__); + LOG_ERR("%s: llama_decode() failed\n", __func__); return; } @@ -1623,13 +1620,13 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params // compute the logprobs for each ending of the decoded tasks for (size_t i = i0; i < i1; ++i) { auto & cur_task = tasks[i]; - //printf("==== Evaluating <%s> with correct answer ", cur_task.question.c_str()); + //LOG("==== Evaluating <%s> with correct answer ", cur_task.question.c_str()); //for (int j = 0; j < int(cur_task.mc1.labels.size()); ++j) { // if (cur_task.mc1.labels[j] == 1) { - // printf("%d", j+1); + // LOG("%d", j+1); // } //} - //printf("\n common_prefix: %zu\n", cur_task.common_prefix); + //LOG("\n common_prefix: %zu\n", cur_task.common_prefix); // get the logits of the last token of the common prefix std::memcpy(tok_logits.data(), batch_logits.data() + n_vocab*cur_task.i_logits, n_vocab*sizeof(float)); @@ -1641,13 +1638,13 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params size_t count = 1; float log_prob = std::log(first_probs[cur_task.seq_tokens[s][cur_task.common_prefix]]); for (size_t j = cur_task.common_prefix; j < cur_task.seq_tokens[s].size() - 1; j++) { - //printf(" %zu %g\n", ir, eval_results[ir]); + //LOG(" %zu %g\n", ir, eval_results[ir]); ++count; log_prob += eval_results[ir++]; } cur_task.log_probs[s] = log_prob / count; - //printf(" Final: %g\n", log_prob / count); - //printf(" <%s> : %g\n", cur_task.mc1.answers[s].c_str(), log_prob/count); + //LOG(" Final: %g\n", log_prob / count); + //LOG(" <%s> : %g\n", cur_task.mc1.answers[s].c_str(), log_prob/count); } // Find the ending with maximum logprob @@ -1667,8 +1664,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params ++n_done; // Print the accumulated accuracy mean x 100 - printf("%d\t%.8lf\n", n_done, 100.*n_correct/n_done); - fflush(stdout); + LOG("%d\t%.8lf\n", n_done, 100.*n_correct/n_done); } i0 = i1 - 1; @@ -1680,29 +1676,30 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params float p = 1.f*n_correct/n_done; float sigma = sqrt(p*(1-p)/(n_done-1)); - printf("\n Final result: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma); + LOG("\n"); + LOG_INF("Final result: 
%.4f +/- %.4f\n", 100.f*p, 100.f*sigma); p = 1.f*n_done/n_tot_answers; sigma = sqrt(p*(1-p)/(n_done-1)); - printf("Random chance: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma); + LOG_INF("Random chance: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma); - printf("\n"); + LOG_INF("\n"); } static void kl_divergence(llama_context * ctx, const gpt_params & params) { if (params.logits_file.empty()) { - fprintf(stderr, "%s: you must provide a name of a file containing the log probabilities of the base model\n", __func__); + LOG_ERR("%s: you must provide a name of a file containing the log probabilities of the base model\n", __func__); return; } std::ifstream in(params.logits_file.c_str(), std::ios::binary); if (!in) { - fprintf(stderr, "%s: failed to open %s\n", __func__, params.logits_file.c_str()); + LOG_ERR("%s: failed to open %s\n", __func__, params.logits_file.c_str()); return; } { char check[9]; check[8] = 0; in.read(check, 8); if (in.fail() || strncmp("_logits_", check, 8) != 0) { - fprintf(stderr, "%s: %s does not look like a file containing log-probabilities\n", __func__, params.logits_file.c_str()); + LOG_ERR("%s: %s does not look like a file containing log-probabilities\n", __func__, params.logits_file.c_str()); return; } } @@ -1710,7 +1707,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) { uint32_t n_ctx; in.read((char *)&n_ctx, sizeof(n_ctx)); if (n_ctx > llama_n_ctx(ctx)) { - fprintf(stderr, "%s: %s has been computed with %u, while the current context is %d. Increase it with -c and retry\n", + LOG_ERR("%s: %s has been computed with %u, while the current context is %d. Increase it with -c and retry\n", __func__, params.logits_file.c_str(), n_ctx, params.n_ctx); } @@ -1718,16 +1715,16 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) { in.read((char *)&n_vocab, sizeof(n_vocab)); in.read((char *)&n_chunk, sizeof(n_chunk)); if (in.fail()) { - fprintf(stderr, "%s: failed reading n_vocab, n_chunk from %s\n", __func__, params.logits_file.c_str()); + LOG_ERR("%s: failed reading n_vocab, n_chunk from %s\n", __func__, params.logits_file.c_str()); return; } if (n_vocab != llama_n_vocab(llama_get_model(ctx))) { - fprintf(stderr, "%s: inconsistent vocabulary (%d vs %d)\n", __func__, n_vocab, llama_n_vocab(llama_get_model(ctx))); + LOG_ERR("%s: inconsistent vocabulary (%d vs %d)\n", __func__, n_vocab, llama_n_vocab(llama_get_model(ctx))); } std::vector tokens(n_ctx * n_chunk); if (in.read((char *)tokens.data(), tokens.size()*sizeof(tokens[0])).fail()) { - fprintf(stderr, "%s: failed reading evaluation tokens from %s\n", __func__, params.logits_file.c_str()); + LOG_ERR("%s: failed reading evaluation tokens from %s\n", __func__, params.logits_file.c_str()); return; } @@ -1776,7 +1773,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) { const auto t_start = std::chrono::high_resolution_clock::now(); if (in.read((char *)log_probs_uint16.data(), log_probs_uint16.size()*sizeof(uint16_t)).fail()) { - fprintf(stderr, "%s: failed reading log-probs for chunk %d\n", __func__, i); + LOG_ERR("%s: failed reading log-probs for chunk %d\n", __func__, i); return; } @@ -1797,7 +1794,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) { // TODO: use llama_batch.logits instead of relying on logits_all == true if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) { - fprintf(stderr, "%s : failed to eval\n", __func__); + LOG_ERR("%s : failed to eval\n", __func__); return; } @@ 
-1814,16 +1811,16 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) { if (i == 0) { const float t_total = std::chrono::duration(t_end - t_start).count(); - fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total); + LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total); int total_seconds = (int)(t_total * n_chunk); if (total_seconds >= 60*60) { - fprintf(stderr, "%d hours ", total_seconds / (60*60)); + LOG("%d hours ", total_seconds / (60*60)); total_seconds = total_seconds % (60*60); } - fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0); - - printf("\nchunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p\n"); + LOG("%.2f minutes\n", total_seconds / 60.0); } + LOG("\n"); + LOG("chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p\n"); const int first = n_ctx/2; const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx); @@ -1832,79 +1829,77 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) { p_diff_ptr += n_ctx - 1 - first; kld_ptr += n_ctx - 1 - first; - printf("%4d", i+1); + LOG("%4d", i+1); auto log_ppl = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count); const double ppl_val = exp(log_ppl.first); const double ppl_unc = ppl_val * log_ppl.second; // ppl_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl.second ** 2 ) - printf(" %9.4lf ± %9.4lf", ppl_val, ppl_unc); + LOG(" %9.4lf ± %9.4lf", ppl_val, ppl_unc); auto log_ppl_base = mean_and_uncertainty(kld.sum_nll_base, kld.sum_nll_base2, kld.count); const double log_ppl_cov = covariance(kld.sum_nll, kld.sum_nll_base, kld.sum_nll_nll_base, kld.count); const double log_ppl_ratio_val = log_ppl.first - log_ppl_base.first; const double log_ppl_ratio_unc = sqrt(log_ppl.second*log_ppl.second + log_ppl_base.second*log_ppl_base.second - 2.0*log_ppl_cov); - printf(" %10.5lf ± %10.5lf", log_ppl_ratio_val, log_ppl_ratio_unc); + LOG(" %10.5lf ± %10.5lf", log_ppl_ratio_val, log_ppl_ratio_unc); auto kl_div = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count); - printf(" %10.5lf ± %10.5lf", kl_div.first, kl_div.second); + LOG(" %10.5lf ± %10.5lf", kl_div.first, kl_div.second); auto p_diff_mse = mean_and_uncertainty(kld.sum_p_diff2, kld.sum_p_diff4, kld.count); const double p_diff_rms_val = sqrt(p_diff_mse.first); const double p_diff_rms_unc = 0.5/p_diff_rms_val * p_diff_mse.second; - printf(" %6.3lf ± %6.3lf %%", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc); + LOG(" %6.3lf ± %6.3lf %%", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc); double p_top_val = 1.*kld.n_same_top/kld.count; double p_top_unc = sqrt(p_top_val*(1 - p_top_val)/(kld.count - 1)); - printf(" %6.3lf ± %6.3lf %%", 100.0*p_top_val, 100.0*p_top_unc); + LOG(" %6.3lf ± %6.3lf %%", 100.0*p_top_val, 100.0*p_top_unc); - printf("\n"); - - fflush(stdout); + LOG("\n"); logits.clear(); } - printf("\n"); + LOG("\n"); if (kld.count < 100) return; // we do not wish to do statistics on so few values std::sort(kld_values.begin(), kld_values.end()); std::sort(p_diff_values.begin(), p_diff_values.end()); - printf("====== Perplexity statistics ======\n"); + LOG("====== Perplexity statistics ======\n"); auto log_ppl = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count); const double ppl_val = exp(log_ppl.first); const double ppl_unc = ppl_val * log_ppl.second; // ppl_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl.second ** 2 ) - printf("Mean PPL(Q) : %10.6lf ± %10.6lf\n", ppl_val, ppl_unc); + LOG("Mean PPL(Q) : %10.6lf ± %10.6lf\n", ppl_val, ppl_unc); auto log_ppl_base = 
mean_and_uncertainty(kld.sum_nll_base, kld.sum_nll_base2, kld.count); const double ppl_base_val = exp(log_ppl_base.first); const double ppl_base_unc = ppl_base_val * log_ppl_base.second; // ppl_base_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl_base.second ** 2 ) - printf("Mean PPL(base) : %10.6lf ± %10.6lf\n", ppl_base_val, ppl_base_unc); + LOG("Mean PPL(base) : %10.6lf ± %10.6lf\n", ppl_base_val, ppl_base_unc); const double log_ppl_cov = covariance(kld.sum_nll, kld.sum_nll_base, kld.sum_nll_nll_base, kld.count); - // printf("Cov(ln(PPL(Q)), ln(PPL(base))): %10.6lf\n", log_ppl_cov); + // LOG("Cov(ln(PPL(Q)), ln(PPL(base))): %10.6lf\n", log_ppl_cov); const double log_ppl_cor = log_ppl_cov / (log_ppl.second*log_ppl_base.second); - printf("Cor(ln(PPL(Q)), ln(PPL(base))): %6.2lf%%\n", 100.0*log_ppl_cor); + LOG("Cor(ln(PPL(Q)), ln(PPL(base))): %6.2lf%%\n", 100.0*log_ppl_cor); const double log_ppl_ratio_val = log_ppl.first - log_ppl_base.first; const double log_ppl_ratio_unc = sqrt(log_ppl.second*log_ppl.second + log_ppl_base.second*log_ppl_base.second - 2.0*log_ppl_cov); - printf("Mean ln(PPL(Q)/PPL(base)) : %10.6lf ± %10.6lf\n", log_ppl_ratio_val, log_ppl_ratio_unc); + LOG("Mean ln(PPL(Q)/PPL(base)) : %10.6lf ± %10.6lf\n", log_ppl_ratio_val, log_ppl_ratio_unc); const double ppl_ratio_val = exp(log_ppl_ratio_val); const double ppl_ratio_unc = ppl_ratio_val * log_ppl_ratio_unc; // ppl_ratio_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl_ratio.second ** 2 ) - printf("Mean PPL(Q)/PPL(base) : %10.6lf ± %10.6lf\n", ppl_ratio_val, ppl_ratio_unc); + LOG("Mean PPL(Q)/PPL(base) : %10.6lf ± %10.6lf\n", ppl_ratio_val, ppl_ratio_unc); const double ppl_cov = ppl_val * ppl_base_val * log_ppl_cov; const double ppl_diff_val = ppl_val - ppl_base_val; const double ppl_diff_unc = sqrt(ppl_unc*ppl_unc + ppl_base_unc*ppl_base_unc - 2.0*ppl_cov); - printf("Mean PPL(Q)-PPL(base) : %10.6lf ± %10.6lf\n", ppl_diff_val, ppl_diff_unc); + LOG("Mean PPL(Q)-PPL(base) : %10.6lf ± %10.6lf\n", ppl_diff_val, ppl_diff_unc); - printf("\n"); + LOG("\n"); - printf("====== KL divergence statistics ======\n"); + LOG("====== KL divergence statistics ======\n"); auto kl_div = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count); - printf("Mean KLD: %10.6lf ± %10.6lf\n", kl_div.first, kl_div.second); + LOG("Mean KLD: %10.6lf ± %10.6lf\n", kl_div.first, kl_div.second); auto kld_median = kld_values.size()%2 == 0 ? 
0.5f*(kld_values[kld_values.size()/2] + kld_values[kld_values.size()/2-1]) : kld_values[kld_values.size()/2]; @@ -1916,50 +1911,49 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) { return (1 - p)*values[ip] + p*values[std::min(ip+1, values.size()-1)]; }; - printf("Maximum KLD: %10.6f\n", kld_values.back()); - printf("99.9%% KLD: %10.6f\n", percentile(kld_values, 0.999f)); - printf("99.0%% KLD: %10.6f\n", percentile(kld_values, 0.990f)); - printf("99.0%% KLD: %10.6f\n", percentile(kld_values, 0.990f)); - printf("Median KLD: %10.6f\n", kld_median); - printf("10.0%% KLD: %10.6f\n", percentile(kld_values, 0.100f)); - printf(" 5.0%% KLD: %10.6f\n", percentile(kld_values, 0.050f)); - printf(" 1.0%% KLD: %10.6f\n", percentile(kld_values, 0.010f)); - printf("Minimum KLD: %10.6f\n", kld_values.front()); + LOG("Maximum KLD: %10.6f\n", kld_values.back()); + LOG("99.9%% KLD: %10.6f\n", percentile(kld_values, 0.999f)); + LOG("99.0%% KLD: %10.6f\n", percentile(kld_values, 0.990f)); + LOG("99.0%% KLD: %10.6f\n", percentile(kld_values, 0.990f)); + LOG("Median KLD: %10.6f\n", kld_median); + LOG("10.0%% KLD: %10.6f\n", percentile(kld_values, 0.100f)); + LOG(" 5.0%% KLD: %10.6f\n", percentile(kld_values, 0.050f)); + LOG(" 1.0%% KLD: %10.6f\n", percentile(kld_values, 0.010f)); + LOG("Minimum KLD: %10.6f\n", kld_values.front()); - printf("\n"); + LOG("\n"); - printf("====== Token probability statistics ======\n"); + LOG("====== Token probability statistics ======\n"); auto p_diff = mean_and_uncertainty(kld.sum_p_diff, kld.sum_p_diff2, kld.count); - printf("Mean Δp: %6.3lf ± %5.3lf %%\n", 100.0*p_diff.first, 100.0*p_diff.second); + LOG("Mean Δp: %6.3lf ± %5.3lf %%\n", 100.0*p_diff.first, 100.0*p_diff.second); auto p_diff_median = p_diff_values.size()%2 == 0 ? 
0.5f*(p_diff_values[p_diff_values.size()/2] + p_diff_values[p_diff_values.size()/2-1]) : p_diff_values[p_diff_values.size()/2]; - printf("Maximum Δp: %6.3lf%%\n", 100.0*p_diff_values.back()); - printf("99.9%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.999f)); - printf("99.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.990f)); - printf("95.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.950f)); - printf("90.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.900f)); - printf("75.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.750f)); - printf("Median Δp: %6.3lf%%\n", 100.0*p_diff_median); - printf("25.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.250f)); - printf("10.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.100f)); - printf(" 5.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.050f)); - printf(" 1.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.010f)); - printf(" 0.1%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.001f)); - printf("Minimum Δp: %6.3lf%%\n", 100.0*p_diff_values.front()); + LOG("Maximum Δp: %6.3lf%%\n", 100.0*p_diff_values.back()); + LOG("99.9%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.999f)); + LOG("99.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.990f)); + LOG("95.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.950f)); + LOG("90.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.900f)); + LOG("75.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.750f)); + LOG("Median Δp: %6.3lf%%\n", 100.0*p_diff_median); + LOG("25.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.250f)); + LOG("10.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.100f)); + LOG(" 5.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.050f)); + LOG(" 1.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.010f)); + LOG(" 0.1%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.001f)); + LOG("Minimum Δp: %6.3lf%%\n", 100.0*p_diff_values.front()); auto p_diff_mse = mean_and_uncertainty(kld.sum_p_diff2, kld.sum_p_diff4, kld.count); - // printf("MSE Δp : %10.6lf ± %10.6lf\n", p_diff_mse.first, p_diff_mse.second); + // LOG("MSE Δp : %10.6lf ± %10.6lf\n", p_diff_mse.first, p_diff_mse.second); const double p_diff_rms_val = sqrt(p_diff_mse.first); const double p_diff_rms_unc = 0.5/p_diff_rms_val * p_diff_mse.second; - printf("RMS Δp : %6.3lf ± %5.3lf %%\n", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc); + LOG("RMS Δp : %6.3lf ± %5.3lf %%\n", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc); const double same_top_p = 1.0*kld.n_same_top/kld.count; - printf("Same top p: %6.3lf ± %5.3lf %%\n", 100.0*same_top_p, 100.0*sqrt(same_top_p*(1.0 - same_top_p)/(kld.count - 1))); - + LOG("Same top p: %6.3lf ± %5.3lf %%\n", 100.0*same_top_p, 100.0*sqrt(same_top_p*(1.0 - same_top_p)/(kld.count - 1))); } int main(int argc, char ** argv) { @@ -1972,10 +1966,12 @@ int main(int argc, char ** argv) { return 1; } + gpt_init(); + const int32_t n_ctx = params.n_ctx; if (n_ctx <= 0) { - fprintf(stderr, "%s: perplexity tool requires '--ctx-size' > 0\n", __func__); + LOG_ERR("%s: perplexity tool requires '--ctx-size' > 0\n", __func__); return 1; } @@ -2000,13 +1996,11 @@ int main(int argc, char ** argv) { } if (params.ppl_stride > 0) { - fprintf(stderr, "Will perform strided perplexity calculation -> adjusting context size from %d to %d\n", + LOG_INF("Will perform strided perplexity calculation -> adjusting context size from %d to %d\n", params.n_ctx, params.n_ctx + params.ppl_stride/2); params.n_ctx += params.ppl_stride/2; } - 
print_build_info(); - llama_backend_init(); llama_numa_init(params.numa); @@ -2016,21 +2010,21 @@ int main(int argc, char ** argv) { llama_model * model = llama_init.model; llama_context * ctx = llama_init.context; if (model == NULL) { - fprintf(stderr, "%s: error: unable to load model\n", __func__); + LOG_ERR("%s: unable to load model\n", __func__); return 1; } const int n_ctx_train = llama_n_ctx_train(model); if (params.n_ctx > n_ctx_train) { - fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n", + LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, params.n_ctx); } // print system information { - fprintf(stderr, "\n"); - fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str()); + LOG_INF("\n"); + LOG_INF("%s\n", gpt_params_get_system_info(params).c_str()); } struct results_perplexity results; @@ -2046,8 +2040,9 @@ int main(int argc, char ** argv) { results = perplexity(ctx, params, n_ctx); } - LOG_TEE("\n"); + LOG("\n"); llama_perf_context_print(ctx); + write_logfile(ctx, params, model, results); llama_free(ctx); diff --git a/examples/retrieval/retrieval.cpp b/examples/retrieval/retrieval.cpp index d08679edb..5971690f1 100644 --- a/examples/retrieval/retrieval.cpp +++ b/examples/retrieval/retrieval.cpp @@ -1,14 +1,16 @@ #include "arg.h" #include "common.h" +#include "log.h" #include "llama.h" #include #include +#include // TODO: remove me static void print_usage(int, char ** argv) { - LOG_TEE("\nexample usage:\n"); - LOG_TEE("\n %s --model ./models/bge-base-en-v1.5-f16.gguf --top-k 3 --context-file README.md --context-file License --chunk-size 100 --chunk-separator .\n", argv[0]); - LOG_TEE("\n"); + LOG("\nexample usage:\n"); + LOG("\n %s --model ./models/bge-base-en-v1.5-f16.gguf --top-k 3 --context-file README.md --context-file License --chunk-size 100 --chunk-separator .\n", argv[0]); + LOG("\n"); } struct chunk { @@ -17,7 +19,7 @@ struct chunk { // original file position size_t filepos; // original text data - std::string textdata = ""; + std::string textdata; // tokenized text data std::vector tokens; // embedding @@ -31,14 +33,14 @@ static std::vector chunk_file(const std::string & filename, int chunk_siz std::ifstream f(filename.c_str()); if (!f.is_open()) { - fprintf(stderr, "Error: could not open file %s\n", filename.c_str()); + LOG_ERR("could not open file %s\n", filename.c_str()); return chunks; } chunk current_chunk; char buffer[1024]; int64_t filepos = 0; - std::string current = ""; + std::string current; while (f.read(buffer, 1024)) { current += std::string(buffer, f.gcount()); size_t pos; @@ -84,9 +86,9 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu llama_kv_cache_clear(ctx); // run model - fprintf(stderr, "%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq); + LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq); if (llama_decode(ctx, batch) < 0) { - fprintf(stderr, "%s : failed to decode\n", __func__); + LOG_ERR("%s : failed to decode\n", __func__); } for (int i = 0; i < batch.n_tokens; i++) { @@ -99,7 +101,7 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu if (embd == NULL) { embd = llama_get_embeddings_ith(ctx, i); if (embd == NULL) { - fprintf(stderr, "%s: failed to get embeddings for token %d\n", __func__, i); + LOG_ERR("%s: failed to get embeddings for token %d\n", __func__, i); continue; } } @@ -116,24 +118,24 @@ int main(int argc, char ** argv) { return 1; } 
+ gpt_init(); + // For BERT models, batch size must be equal to ubatch size params.n_ubatch = params.n_batch; params.embedding = true; if (params.chunk_size <= 0) { - fprintf(stderr, "chunk_size must be positive\n"); + LOG_ERR("chunk_size must be positive\n"); return 1; } if (params.context_files.empty()) { - fprintf(stderr, "context_files must be specified\n"); + LOG_ERR("context_files must be specified\n"); return 1; } - print_build_info(); - - printf("processing files:\n"); + LOG_INF("processing files:\n"); for (auto & context_file : params.context_files) { - printf("%s\n", context_file.c_str()); + LOG_INF("%s\n", context_file.c_str()); } std::vector chunks; @@ -141,7 +143,7 @@ int main(int argc, char ** argv) { std::vector file_chunk = chunk_file(context_file, params.chunk_size, params.chunk_separator); chunks.insert(chunks.end(), file_chunk.begin(), file_chunk.end()); } - printf("Number of chunks: %ld\n", chunks.size()); + LOG_INF("Number of chunks: %ld\n", chunks.size()); llama_backend_init(); llama_numa_init(params.numa); @@ -153,7 +155,7 @@ int main(int argc, char ** argv) { llama_context * ctx = llama_init.context; if (model == NULL) { - fprintf(stderr, "%s: error: unable to load model\n", __func__); + LOG_ERR("%s: unable to load model\n", __func__); return 1; } @@ -162,19 +164,19 @@ int main(int argc, char ** argv) { const enum llama_pooling_type pooling_type = llama_pooling_type(ctx); if (pooling_type == LLAMA_POOLING_TYPE_NONE) { - fprintf(stderr, "%s: error: pooling type NONE not supported\n", __func__); + LOG_ERR("%s: pooling type NONE not supported\n", __func__); return 1; } if (n_ctx > n_ctx_train) { - fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n", + LOG_WRN("%s: warning: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, n_ctx); } // print system information { - fprintf(stderr, "\n"); - fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str()); + LOG_INF("\n"); + LOG_INF("%s\n", gpt_params_get_system_info(params).c_str()); } // max batch size @@ -185,7 +187,7 @@ int main(int argc, char ** argv) { for (auto & chunk : chunks) { auto inp = ::llama_tokenize(ctx, chunk.textdata, true, false); if (inp.size() > n_batch) { - fprintf(stderr, "%s: error: chunk size (%lld) exceeds batch size (%lld), increase batch size and re-run\n", + LOG_ERR("%s: chunk size (%lld) exceeds batch size (%lld), increase batch size and re-run\n", __func__, (long long int) inp.size(), (long long int) n_batch); return 1; } @@ -199,12 +201,12 @@ int main(int argc, char ** argv) { // tokenization stats if (params.verbose_prompt) { for (int i = 0; i < (int) chunks.size(); i++) { - fprintf(stderr, "%s: prompt %d: '%s'\n", __func__, i, chunks[i].textdata.c_str()); - fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, chunks[i].tokens.size()); + LOG_INF("%s: prompt %d: '%s'\n", __func__, i, chunks[i].textdata.c_str()); + LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, chunks[i].tokens.size()); for (int j = 0; j < (int) chunks[i].tokens.size(); j++) { - fprintf(stderr, "%6d -> '%s'\n", chunks[i].tokens[j], llama_token_to_piece(ctx, chunks[i].tokens[j]).c_str()); + LOG_INF("%6d -> '%s'\n", chunks[i].tokens[j], llama_token_to_piece(ctx, chunks[i].tokens[j]).c_str()); } - fprintf(stderr, "\n\n"); + LOG_INF("\n\n"); } } @@ -256,7 +258,7 @@ int main(int argc, char ** argv) { // start loop, receive query and return top k similar chunks based on cosine similarity std::string query; while (true) { - 
printf("Enter query: "); + LOG("Enter query: "); std::getline(std::cin, query); std::vector query_tokens = llama_tokenize(ctx, query, true); @@ -280,18 +282,18 @@ int main(int argc, char ** argv) { return a.second > b.second; }); - printf("Top %d similar chunks:\n", params.sparams.top_k); + LOG("Top %d similar chunks:\n", params.sparams.top_k); for (int i = 0; i < std::min(params.sparams.top_k, (int) chunks.size()); i++) { - printf("filename: %s\n", chunks[similarities[i].first].filename.c_str()); - printf("filepos: %lld\n", (long long int) chunks[similarities[i].first].filepos); - printf("similarity: %f\n", similarities[i].second); - printf("textdata:\n%s\n", chunks[similarities[i].first].textdata.c_str()); - printf("--------------------\n"); + LOG("filename: %s\n", chunks[similarities[i].first].filename.c_str()); + LOG("filepos: %lld\n", (long long int) chunks[similarities[i].first].filepos); + LOG("similarity: %f\n", similarities[i].second); + LOG("textdata:\n%s\n", chunks[similarities[i].first].textdata.c_str()); + LOG("--------------------\n"); } } } - LOG_TEE("\n"); + LOG("\n"); llama_perf_context_print(ctx); // clean up diff --git a/examples/server/CMakeLists.txt b/examples/server/CMakeLists.txt index 580f3a824..3e717e882 100644 --- a/examples/server/CMakeLists.txt +++ b/examples/server/CMakeLists.txt @@ -1,6 +1,6 @@ set(TARGET llama-server) -option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON) -option(LLAMA_SERVER_SSL "Build SSL support for the server" OFF) + +option(LLAMA_SERVER_SSL "Build SSL support for the server" OFF) include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR}) @@ -46,9 +46,6 @@ endforeach() add_executable(${TARGET} ${TARGET_SRCS}) install(TARGETS ${TARGET} RUNTIME) -target_compile_definitions(${TARGET} PRIVATE - SERVER_VERBOSE=$ -) target_link_libraries(${TARGET} PRIVATE common ${CMAKE_THREAD_LIBS_INIT}) diff --git a/examples/server/README.md b/examples/server/README.md index 44a73ca0a..168e14a99 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -121,7 +121,6 @@ The project is under active development, and we are [looking for feedback and co | `-to, --timeout N` | server read/write timeout in seconds (default: 600) | | `--threads-http N` | number of threads used to process HTTP requests (default: -1)
(env: LLAMA_ARG_THREADS_HTTP) | | `-spf, --system-prompt-file FNAME` | set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications | -| `--log-format {text, json}` | log output format: json or text (default: json) | | `--metrics` | enable prometheus compatible metrics endpoint (default: disabled)
(env: LLAMA_ARG_ENDPOINT_METRICS) | | `--no-slots` | disables slots monitoring endpoint (default: enabled)
(env: LLAMA_ARG_NO_ENDPOINT_SLOTS) | | `--slot-save-path PATH` | path to save slot kv cache (default: disabled) | diff --git a/examples/server/bench/README.md b/examples/server/bench/README.md index 0f18ca396..353368e13 100644 --- a/examples/server/bench/README.md +++ b/examples/server/bench/README.md @@ -40,7 +40,6 @@ server --host localhost --port 8080 \ --parallel 8 \ --batch-size 512 \ --ctx-size 4096 \ - --log-format text \ -ngl 33 ``` diff --git a/examples/server/bench/bench.py b/examples/server/bench/bench.py index 2daac0884..a9ed747f5 100644 --- a/examples/server/bench/bench.py +++ b/examples/server/bench/bench.py @@ -272,7 +272,6 @@ def start_server_background(args): server_args.append('--cont-batching') server_args.append('--metrics') server_args.append('--flash-attn') - server_args.extend(['--log-format', "text"]) args = [str(arg) for arg in [server_path, *server_args]] print(f"bench: starting server with: {' '.join(args)}") pkwargs = { diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 14c4af3d9..b5f264ff1 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2,6 +2,7 @@ #include "arg.h" #include "common.h" +#include "log.h" #include "sampling.h" #include "json-schema-to-grammar.h" #include "llama.h" @@ -31,22 +32,34 @@ #include "loading.html.hpp" #include -#include #include #include -#include -#include -#include -#include -#include -#include +#include #include +#include +#include +#include +#include +#include +#include + +#define SLT_INF(slot, fmt, ...) LOG_INF("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__) +#define SLT_WRN(slot, fmt, ...) LOG_WRN("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__) +#define SLT_ERR(slot, fmt, ...) LOG_ERR("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__) +#define SLT_DBG(slot, fmt, ...) LOG_DBG("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__) + +#define SRV_INF(fmt, ...) LOG_INF("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__) +#define SRV_WRN(fmt, ...) LOG_WRN("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__) +#define SRV_ERR(fmt, ...) LOG_ERR("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__) +#define SRV_DBG(fmt, ...) LOG_DBG("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__) + +#define QUE_INF(fmt, ...) LOG_INF("que %12.*s: " fmt, 12, __func__, __VA_ARGS__) +#define QUE_WRN(fmt, ...) LOG_WRN("que %12.*s: " fmt, 12, __func__, __VA_ARGS__) +#define QUE_ERR(fmt, ...) LOG_ERR("que %12.*s: " fmt, 12, __func__, __VA_ARGS__) +#define QUE_DBG(fmt, ...) 
LOG_DBG("que %12.*s: " fmt, 12, __func__, __VA_ARGS__) using json = nlohmann::ordered_json; -bool server_verbose = false; -bool server_log_json = true; - enum stop_type { STOP_TYPE_FULL, STOP_TYPE_PARTIAL, @@ -197,6 +210,8 @@ struct server_slot { std::function callback_on_release; void reset() { + SLT_DBG(*this, "%s", "\n"); + n_prompt_tokens = 0; generated_text = ""; truncated = false; @@ -234,8 +249,9 @@ struct server_slot { return state != SLOT_STATE_IDLE; } - void add_token_string(const completion_token_output & token) { + void add_token(const completion_token_output & token) { if (!is_processing()) { + SLT_WRN(*this, "%s", "slot is not processing\n"); return; } generated_token_probs.push_back(token); @@ -243,14 +259,10 @@ struct server_slot { void release() { if (is_processing()) { + SLT_INF(*this, "stop processing: n_past = %d, truncated = %d\n", n_past, truncated); + t_token_generation = (ggml_time_us() - t_start_generation) / 1e3; state = SLOT_STATE_IDLE; - LOG_INFO("slot released", { - {"id_slot", id}, - {"id_task", id_task}, - {"n_past", n_past}, - {"truncated", truncated}, - }); callback_on_release(id); } } @@ -298,49 +310,20 @@ struct server_slot { } void print_timings() const { - char buffer[512]; + const double t_prompt = t_prompt_processing / n_prompt_tokens_processed; + const double n_prompt_second = 1e3 / t_prompt_processing * n_prompt_tokens_processed; - double t_token = t_prompt_processing / n_prompt_tokens_processed; - double n_tokens_second = 1e3 / t_prompt_processing * n_prompt_tokens_processed; + const double t_gen = t_token_generation / n_decoded; + const double n_gen_second = 1e3 / t_token_generation * n_decoded; - snprintf(buffer, 512, "prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)", - t_prompt_processing, n_prompt_tokens_processed, - t_token, n_tokens_second); - - LOG_INFO(buffer, { - {"id_slot", id}, - {"id_task", id_task}, - {"t_prompt_processing", t_prompt_processing}, - {"n_prompt_tokens_processed", n_prompt_tokens_processed}, - {"t_token", t_token}, - {"n_tokens_second", n_tokens_second}, - }); - - t_token = t_token_generation / n_decoded; - n_tokens_second = 1e3 / t_token_generation * n_decoded; - - snprintf(buffer, 512, "generation eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)", - t_token_generation, n_decoded, - t_token, n_tokens_second); - - LOG_INFO(buffer, { - {"id_slot", id}, - {"id_task", id_task}, - {"t_token_generation", t_token_generation}, - {"n_decoded", n_decoded}, - {"t_token", t_token}, - {"n_tokens_second", n_tokens_second}, - }); - - snprintf(buffer, 512, " total time = %10.2f ms", t_prompt_processing + t_token_generation); - - LOG_INFO(buffer, { - {"id_slot", id}, - {"id_task", id_task}, - {"t_prompt_processing", t_prompt_processing}, - {"t_token_generation", t_token_generation}, - {"t_total", t_prompt_processing + t_token_generation}, - }); + SLT_INF(*this, + "\n" + "\rprompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n" + "\r eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n" + "\r total time = %10.2f ms / %5d tokens\n", + t_prompt_processing, n_prompt_tokens_processed, t_prompt, n_prompt_second, + t_token_generation, n_decoded, t_gen, n_gen_second, + t_prompt_processing + t_token_generation, n_prompt_tokens_processed + n_decoded); } }; @@ -416,8 +399,8 @@ struct server_queue { std::unique_lock lock(mutex_tasks); if (task.id == -1) { task.id = id++; - LOG_VERBOSE("new task id", {{"new_id", 
task.id}}); } + QUE_DBG("new task, id = %d, front = %d\n", task.id, front); if (front) { queue_tasks.push_front(std::move(task)); } else { @@ -433,8 +416,8 @@ struct server_queue { for (auto & task : tasks) { if (task.id == -1) { task.id = id++; - LOG_VERBOSE("new task id", {{"new_id", task.id}}); } + QUE_DBG("new task, id = %d/%d, front = %d\n", task.id, (int) tasks.size(), front); if (front) { queue_tasks.push_front(std::move(task)); } else { @@ -448,6 +431,7 @@ struct server_queue { // Add a new task, but defer until one slot is available void defer(server_task task) { std::unique_lock lock(mutex_tasks); + QUE_DBG("defer task, id = %d\n", task.id); queue_tasks_deferred.push_back(std::move(task)); condition_tasks.notify_one(); } @@ -456,7 +440,6 @@ struct server_queue { int get_new_id() { std::unique_lock lock(mutex_tasks); int new_id = id++; - LOG_VERBOSE("new task id", {{"new_id", new_id}}); return new_id; } @@ -498,7 +481,7 @@ struct server_queue { running = true; while (true) { - LOG_VERBOSE("new task may arrive", {}); + QUE_DBG("%s", "processing new tasks\n"); while (true) { std::unique_lock lock(mutex_tasks); @@ -509,21 +492,22 @@ struct server_queue { server_task task = queue_tasks.front(); queue_tasks.pop_front(); lock.unlock(); - LOG_VERBOSE("callback_new_task", {{"id_task", task.id}}); + + QUE_DBG("processing task, id = %d\n", task.id); callback_new_task(task); } // all tasks in the current loop is processed, slots data is now ready - LOG_VERBOSE("callback_update_slots", {}); + QUE_DBG("%s", "update slots\n"); callback_update_slots(); - LOG_VERBOSE("wait for new task", {}); + QUE_DBG("%s", "waiting for new tasks\n"); { std::unique_lock lock(mutex_tasks); if (queue_tasks.empty()) { if (!running) { - LOG_VERBOSE("ending start_loop", {}); + QUE_DBG("%s", "terminate\n"); return; } condition_tasks.wait(lock, [&]{ @@ -547,7 +531,7 @@ struct server_response { // add the id_task to the list of tasks waiting for response void add_waiting_task_id(int id_task) { - LOG_VERBOSE("waiting for task id", {{"id_task", id_task}}); + SRV_DBG("waiting for task id = %d\n", id_task); std::unique_lock lock(mutex_results); waiting_task_ids.insert(id_task); @@ -561,7 +545,7 @@ struct server_response { // when the request is finished, we can remove task associated with it void remove_waiting_task_id(int id_task) { - LOG_VERBOSE("remove waiting for task id", {{"id_task", id_task}}); + SRV_DBG("task id = %d is done\n", id_task); std::unique_lock lock(mutex_results); waiting_task_ids.erase(id_task); @@ -595,12 +579,13 @@ struct server_response { // Send a new result to a waiting id_task void send(server_task_result & result) { - LOG_VERBOSE("send new result", {{"id_task", result.id}}); + SRV_DBG("sending result for task id = %d\n", result.id); std::unique_lock lock(mutex_results); for (const auto & id_task : waiting_task_ids) { if (result.id == id_task) { - LOG_VERBOSE("queue_results.push_back", {{"id_task", id_task}}); + SRV_DBG("task id = %d moved to result queue\n", result.id); + queue_results.push_back(std::move(result)); condition_results.notify_all(); return; @@ -612,7 +597,7 @@ struct server_response { struct server_context { llama_model * model = nullptr; llama_context * ctx = nullptr; - std::vector lora_adapters; + std::vector loras; gpt_params params; @@ -672,11 +657,13 @@ struct server_context { llama_init_result llama_init = llama_init_from_gpt_params(params); model = llama_init.model; - ctx = llama_init.context; - lora_adapters = llama_init.lora_adapters; + ctx = llama_init.context; + loras = 
llama_init.lora_adapters; + params.n_parallel -= 1; // but be sneaky about it + if (model == nullptr) { - LOG_ERROR("unable to load model", {{"model", params.model}}); + SRV_ERR("failed to load model, '%s'\n", params.model.c_str()); return false; } @@ -699,7 +686,7 @@ struct server_context { void init() { const int32_t n_ctx_slot = n_ctx / params.n_parallel; - LOG_INFO("initializing slots", {{"n_slots", params.n_parallel}}); + SRV_INF("initializing slots, n_slots = %d\n", params.n_parallel); for (int i = 0; i < params.n_parallel; i++) { server_slot slot; @@ -708,10 +695,7 @@ struct server_context { slot.n_ctx = n_ctx_slot; slot.n_predict = params.n_predict; - LOG_INFO("new slot", { - {"id_slot", slot.id}, - {"n_ctx_slot", slot.n_ctx} - }); + SLT_INF(slot, "new slot n_ctx_slot = %d\n", slot.n_ctx); const int ga_n = params.grp_attn_n; const int ga_w = params.grp_attn_w; @@ -722,11 +706,7 @@ struct server_context { //GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of ga_w"); // NOLINT //GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * ga_n"); // NOLINT - LOG_INFO("slot self-extend", { - {"id_slot", slot.id}, - {"ga_n", ga_n}, - {"ga_w", ga_w} - }); + SLT_INF(slot, "slot self-extend: ga_n = %d, ga_w = %d\n", ga_n, ga_w); } slot.ga_i = 0; @@ -849,11 +829,7 @@ struct server_context { } if (ret != nullptr) { - LOG_VERBOSE("selected slot by lcp similarity", { - {"id_slot", ret->id}, - {"max_lcp_len", max_lcp_len}, - {"similarity", similarity}, - }); + SLT_DBG(*ret, "selected slot by lcp similarity, max_lcp_len = %d, similarity = %f\n", max_lcp_len, similarity); } } @@ -874,10 +850,7 @@ struct server_context { } if (ret != nullptr) { - LOG_VERBOSE("selected slot by lru", { - {"id_slot", ret->id}, - {"t_last", t_last}, - }); + SLT_DBG(*ret, "selected slot by lru, t_last = %" PRId64 "\n", t_last); } } @@ -941,17 +914,14 @@ struct server_context { } if (slot.params.cache_prompt && slot.ga_n != 1) { - LOG_WARNING("cache_prompt is not supported with group-attention", {}); slot.params.cache_prompt = false; + SLT_WRN(slot, "%s", "group-attention is not supported with prompt caching. disabling cache\n"); } if (slot.n_predict > 0 && slot.params.n_predict > slot.n_predict) { // Might be better to reject the request with a 400 ? 
- LOG_WARNING("Max tokens to predict exceeds server configuration", { - {"params.n_predict", slot.params.n_predict}, - {"slot.n_predict", slot.n_predict}, - }); slot.params.n_predict = slot.n_predict; + SLT_WRN(slot, "n_predict = %d exceeds server configuration, setting to %d", slot.n_predict, slot.n_predict); } // infill @@ -1060,16 +1030,13 @@ struct server_context { slot.state = SLOT_STATE_PROCESSING_PROMPT; slot.prompt_tokens.clear(); - LOG_INFO("slot is processing task", { - {"id_slot", slot.id}, - {"id_task", slot.id_task}, - }); + SLT_INF(slot, "%s", "processing task\n"); return true; } void kv_cache_clear() { - LOG_VERBOSE("clearing KV cache", {}); + SRV_DBG("%s", "clearing KV cache\n"); // clear the entire KV cache llama_kv_cache_clear(ctx); @@ -1077,9 +1044,7 @@ struct server_context { } void system_prompt_update() { - LOG_VERBOSE("system prompt update", { - {"system_prompt", system_prompt}, - }); + SRV_DBG("updating system prompt: '%s'\n", system_prompt.c_str()); kv_cache_clear(); system_tokens.clear(); @@ -1100,7 +1065,7 @@ struct server_context { } if (llama_decode(ctx, batch) != 0) { - LOG_ERROR("llama_decode() failed", {}); + SRV_ERR("%s", "llama_decode() failed\n"); return; } } @@ -1115,11 +1080,9 @@ struct server_context { } bool system_prompt_set(const std::string & sys_prompt) { - system_prompt = sys_prompt; + SRV_DBG("system prompt set: '%s'\n", system_prompt.c_str()); - LOG_VERBOSE("system prompt process", { - {"system_prompt", system_prompt}, - }); + system_prompt = sys_prompt; // release all slots for (server_slot & slot : slots) { @@ -1187,7 +1150,7 @@ struct server_context { // add the token to slot queue and cache } - slot.add_token_string(result); + slot.add_token(result); if (slot.params.stream) { send_partial_response(slot, result); } @@ -1202,55 +1165,30 @@ struct server_context { slot.stopped_limit = true; slot.has_next_token = false; - LOG_VERBOSE("stopped by limit", { - {"id_slot", slot.id}, - {"id_task", slot.id_task}, - {"n_decoded", slot.n_decoded}, - {"n_predict", slot.params.n_predict}, - }); + SLT_DBG(slot, "stopped by limit, n_decoded = %d, n_predict = %d\n", slot.n_decoded, slot.params.n_predict); } if (llama_token_is_eog(model, result.tok)) { slot.stopped_eos = true; slot.has_next_token = false; - LOG_VERBOSE("eos token found", {}); + SLT_DBG(slot, "%s", "stopped by EOS\n"); } - auto n_ctx_train = llama_n_ctx_train(model); - if (slot.params.n_predict < 1 && slot.n_predict < 1 && slot.ga_n == 1 - && slot.n_prompt_tokens + slot.n_decoded >= n_ctx_train) { - LOG_WARNING("n_predict is not set and self-context extend is disabled." - " Limiting generated tokens to n_ctx_train to avoid EOS-less generation infinite loop", { - { "id_slot", slot.id }, - { "params.n_predict", slot.params.n_predict }, - { "slot.n_prompt_tokens", slot.n_prompt_tokens }, - { "slot.n_decoded", slot.n_decoded }, - { "slot.n_predict", slot.n_predict }, - { "n_slots", params.n_parallel }, - { "slot.n_ctx", slot.n_ctx }, - { "n_ctx", n_ctx }, - { "n_ctx_train", n_ctx_train }, - { "ga_n", slot.ga_n }, - }); + const auto n_ctx_train = llama_n_ctx_train(model); + + if (slot.params.n_predict < 1 && slot.n_predict < 1 && slot.ga_n == 1 && slot.n_prompt_tokens + slot.n_decoded >= n_ctx_train) { slot.truncated = true; slot.stopped_limit = true; slot.has_next_token = false; // stop prediction + + SLT_WRN(slot, + "n_predict (%d) is not set and self-context extend is disabled. 
" + "Limiting generated tokens to n_ctx_train (%d) to avoid EOS-less generation infinite loop\n", + slot.params.n_predict, n_ctx_train); } - LOG_VERBOSE("next token", { - {"id_slot", slot.id}, - {"id_task", slot.id_task}, - {"token", result.tok}, - {"token_text", tokens_to_output_formatted_string(ctx, result.tok)}, - {"has_next_token", slot.has_next_token}, - {"n_remain", slot.n_remaining}, - {"n_decoded", slot.n_decoded}, - {"stopped_eos", slot.stopped_eos}, - {"stopped_word", slot.stopped_word}, - {"stopped_limit", slot.stopped_limit}, - {"stopping_word", slot.stopping_word}, - }); + SLT_DBG(slot, "n_decoded = %d, n_remaining = %d, next token: '%s'\n", slot.n_decoded, slot.n_remaining, token_str.c_str()); return slot.has_next_token; // continue } @@ -1307,10 +1245,7 @@ struct server_context { } void send_error(const int id_task, const std::string & error, const enum error_type type = ERROR_TYPE_SERVER) { - LOG_ERROR("task error", { - {"id_task", id_task}, - {"error", error}, - }); + SRV_ERR("task id = %d, error: %s\n", id_task, error.c_str()); server_task_result res; res.id = id_task; @@ -1429,10 +1364,7 @@ struct server_context { } if (embd == NULL) { - LOG_ERROR("failed to get embeddings", { - {"token", batch.token [i]}, - {"seq_id", batch.seq_id[i][0]} - }); + SLT_ERR(slot, "failed to get embeddings, token = %d, seq_id = %d\n", batch.token[i], batch.seq_id[i][0]); res.data = json { {"embedding", std::vector(n_embd, 0.0f)}, @@ -1449,6 +1381,8 @@ struct server_context { }; } + SLT_DBG(slot, "%s", "sending embeddings\n"); + queue_results.send(res); } @@ -1465,7 +1399,7 @@ struct server_context { task.type = SERVER_TASK_TYPE_COMPLETION; if (replace_prompt) { task.data = task_data; - task.data["prompt"] = prompt; + task.data["prompt"] = std::move(prompt); } else { task.data = std::move(task_data); } @@ -1509,7 +1443,8 @@ struct server_context { std::vector cancel_tasks; cancel_tasks.reserve(id_tasks.size()); for (const auto & id_task : id_tasks) { - LOG_VERBOSE("cancel task", {{"id_task", id_task}}); + SRV_WRN("cancel task, id_task = %d\n", id_task); + server_task task; task.type = SERVER_TASK_TYPE_CANCEL; task.id_target = id_task; @@ -1521,7 +1456,10 @@ struct server_context { } // receive the results from task(s) created by create_tasks_cmpl - void receive_cmpl_results(const std::unordered_set & id_tasks, std::function&)> result_handler, std::function error_handler) { + void receive_cmpl_results( + const std::unordered_set & id_tasks, + const std::function&)> & result_handler, + const std::function & error_handler) { // TODO: currently, there is no way to detect the client has cancelled the request std::vector results(id_tasks.size()); for (size_t i = 0; i < id_tasks.size(); i++) { @@ -1540,7 +1478,10 @@ struct server_context { } // receive the results from task(s) created by create_tasks_cmpl, in stream mode - void receive_cmpl_results_stream(const std::unordered_set & id_tasks, std::function result_handler, std::function error_handler) { + void receive_cmpl_results_stream( + const std::unordered_set & id_tasks, const + std::function & result_handler, const + std::function & error_handler) { size_t n_finished = 0; while (true) { server_task_result result = queue_results.recv(id_tasks); @@ -1588,13 +1529,13 @@ struct server_context { if (slot == nullptr) { // if no slot is available, we defer this task for processing later - LOG_VERBOSE("no slot is available", {{"id_task", task.id}}); + SRV_DBG("no slot is available, defer task, id_task = %d\n", task.id); queue_tasks.defer(task); break; 
} if (slot->is_processing()) { // if requested slot is unavailable, we defer this task for processing later - LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}}); + SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id); queue_tasks.defer(task); break; } @@ -1616,7 +1557,7 @@ struct server_context { slot->index = json_value(task.data, "index", 0); if (!launch_slot_with_task(*slot, task)) { - LOG_ERROR("error while launching slot", task.data); + SRV_ERR("failed to launch slot with task, id_task = %d\n", task.id); break; } } break; @@ -1665,18 +1606,7 @@ struct server_context { slots_data.push_back(slot_data); } - LOG_INFO("slot data", { - {"id_task", task.id}, - {"n_idle_slots", n_idle_slots}, - {"n_processing_slots", n_processing_slots} - }); - - LOG_VERBOSE("slot data", { - {"id_task", task.id}, - {"n_idle_slots", n_idle_slots}, - {"n_processing_slots", n_processing_slots}, - {"slots", slots_data} - }); + SRV_DBG("n_idle_slots = %d, n_processing_slots = %d\n", n_idle_slots, n_processing_slots); server_task_result res; res.id = task.id; @@ -1722,7 +1652,7 @@ struct server_context { } if (slot->is_processing()) { // if requested slot is unavailable, we defer this task for processing later - LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}}); + SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id); queue_tasks.defer(task); break; } @@ -1763,7 +1693,7 @@ struct server_context { } if (slot->is_processing()) { // if requested slot is unavailable, we defer this task for processing later - LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}}); + SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id); queue_tasks.defer(task); break; } @@ -1811,7 +1741,7 @@ struct server_context { } if (slot->is_processing()) { // if requested slot is unavailable, we defer this task for processing later - LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}}); + SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id); queue_tasks.defer(task); break; } @@ -1833,7 +1763,7 @@ struct server_context { } break; case SERVER_TASK_TYPE_SET_LORA: { - llama_lora_adapters_apply(ctx, lora_adapters); + llama_lora_adapters_apply(ctx, loras); server_task_result result; result.id = task.id; result.stop = true; @@ -1861,7 +1791,7 @@ struct server_context { } if (all_idle) { - LOG_INFO("all slots are idle", {}); + SRV_INF("%s", "all slots are idle\n"); if (system_prompt.empty() && clean_kv_cache) { kv_cache_clear(); } @@ -1871,7 +1801,7 @@ struct server_context { } { - LOG_VERBOSE("posting NEXT_RESPONSE", {}); + SRV_DBG("%s", "posting NEXT_RESPONSE\n"); server_task task; task.type = SERVER_TASK_TYPE_NEXT_RESPONSE; @@ -1890,17 +1820,7 @@ struct server_context { const int n_left = (int) system_tokens.size() + slot.n_past - n_keep; const int n_discard = slot.params.n_discard ? 
slot.params.n_discard : (n_left / 2); - LOG_INFO("slot context shift", { - {"id_slot", slot.id}, - {"id_task", slot.id_task}, - {"n_keep", n_keep}, - {"n_left", n_left}, - {"n_discard", n_discard}, - {"n_ctx", n_ctx}, - {"n_past", slot.n_past}, - {"n_system_tokens", system_tokens.size()}, - {"n_cache_tokens", slot.cache_tokens.size()} - }); + SLT_WRN(slot, "slot context shift, n_keep = %d, n_left = %d, n_discard = %d\n", n_keep, n_left, n_discard); llama_kv_cache_seq_rm (ctx, slot.id + 1, n_keep , n_keep + n_discard); llama_kv_cache_seq_add(ctx, slot.id + 1, n_keep + n_discard, system_tokens.size() + slot.n_past, -n_discard); @@ -1943,15 +1863,8 @@ struct server_context { slot.cache_tokens.push_back(slot.sampled); } - LOG_VERBOSE("slot decode token", { - {"id_slot", slot.id}, - {"id_task", slot.id_task}, - {"n_ctx", n_ctx}, - {"n_past", slot.n_past}, - {"n_system_tokens", system_tokens.size()}, - {"n_cache_tokens", slot.cache_tokens.size()}, - {"truncated", slot.truncated} - }); + SLT_DBG(slot, "slot decode token, n_ctx = %d, n_past = %d, n_system_tokens = %d, n_cache_tokens = %d, truncated = %d\n", + slot.n_ctx, slot.n_past, (int) system_tokens.size(), (int) slot.cache_tokens.size(), slot.truncated); } // process in chunks of params.n_batch @@ -1972,10 +1885,7 @@ struct server_context { // we haven't tokenized the prompt yet - do it now: if (prompt_tokens.empty()) { - LOG_VERBOSE("tokenizing prompt", { - {"id_slot", slot.id}, - {"id_task", slot.id_task} - }); + SLT_INF(slot, "tokenizing prompt, len = %d\n", (int) slot.prompt.size()); slot.t_start_process_prompt = ggml_time_us(); slot.t_start_generation = 0; @@ -2019,21 +1929,11 @@ struct server_context { slot.n_past = 0; slot.n_prompt_tokens = prompt_tokens.size(); - LOG_VERBOSE("prompt tokenized", { - {"id_slot", slot.id}, - {"id_task", slot.id_task}, - {"n_ctx", slot.n_ctx}, - {"n_keep", slot.params.n_keep}, - {"n_prompt_tokens", slot.n_prompt_tokens}, - {"prompt_tokens", tokens_to_str(ctx, prompt_tokens.cbegin(), prompt_tokens.cend())}, - }); + SLT_INF(slot, "prompt tokenized, n_ctx_slot = %d, n_keep = %d, n_prompt_tokens = %d\n", slot.n_ctx, slot.params.n_keep, slot.n_prompt_tokens); // empty prompt passed -> release the slot and send empty response if (prompt_tokens.empty()) { - LOG_INFO("empty prompt - releasing slot", { - {"id_slot", slot.id}, - {"id_task", slot.id_task} - }); + SLT_WRN(slot, "%s", "empty prompt - releasing slot\n"); slot.release(); slot.print_timings(); @@ -2075,15 +1975,7 @@ struct server_context { slot.truncated = true; slot.n_prompt_tokens = prompt_tokens.size(); - LOG_VERBOSE("input truncated", { - {"id_slot", slot.id}, - {"id_task", slot.id_task}, - {"n_ctx", slot.n_ctx}, - {"n_keep", slot.params.n_keep}, - {"n_left", n_left}, - {"n_prompt_tokens", slot.n_prompt_tokens}, - {"prompt_tokens", tokens_to_str(ctx, prompt_tokens.cbegin(), prompt_tokens.cend())}, - }); + SLT_WRN(slot, "input truncated, n_ctx = %d, n_keep = %d, n_left = %d, n_prompt_tokens = %d\n", slot.n_ctx, slot.params.n_keep, n_left, slot.n_prompt_tokens); GGML_ASSERT(slot.n_prompt_tokens < slot.n_ctx); } @@ -2108,10 +2000,7 @@ struct server_context { if (slot.n_past == slot.n_prompt_tokens && slot.n_past > 0) { // we have to evaluate at least 1 token to generate logits. 
- LOG_INFO("we have to evaluate at least 1 token to generate logits", { - { "id_slot", slot.id }, - { "id_task", slot.id_task } - }); + SLT_WRN(slot, "need to evaluate at least 1 token to generate logits, n_past = %d, n_prompt_tokens = %d\n", slot.n_past, slot.n_prompt_tokens); slot.n_past--; if (slot.ga_i > 0) { @@ -2160,11 +2049,7 @@ struct server_context { // remove the non-common part from the cache slot.cache_tokens.resize(slot.n_past); - LOG_INFO("kv cache rm [p0, end)", { - { "id_slot", slot.id }, - { "id_task", slot.id_task }, - { "p0", p0 } - }); + SLT_INF(slot, "kv cache rm [%d, end)\n", p0); int32_t slot_npast = slot.n_past_se > 0 ? slot.n_past_se : slot.n_past; @@ -2193,13 +2078,7 @@ struct server_context { slot_npast++; } - LOG_VERBOSE("prompt processing progress", { - {"id_slot", slot.id}, - {"n_past", slot.n_past}, - {"n_ctx", n_ctx}, - {"n_tokens", batch.n_tokens}, - {"progress", (float) slot.n_prompt_tokens_processed / slot.n_prompt_tokens}, - }); + SLT_INF(slot, "prompt processing progress, n_past = %d, n_tokens = %d, progress = %f\n", slot.n_past, batch.n_tokens, (float) slot.n_prompt_tokens_processed / slot.n_prompt_tokens); // entire prompt has been processed if (slot.n_past == slot.n_prompt_tokens) { @@ -2213,12 +2092,7 @@ struct server_context { slot.n_decoded = 0; slot.i_batch = batch.n_tokens - 1; - LOG_VERBOSE("prompt done", { - {"id_slot", slot.id}, - {"n_past", slot.n_past}, - {"n_ctx", n_ctx}, - {"n_tokens", batch.n_tokens}, - }); + SLT_INF(slot, "prompt done, n_past = %d, n_tokens = %d\n", slot.n_past, batch.n_tokens); } } @@ -2229,13 +2103,11 @@ struct server_context { } if (batch.n_tokens == 0) { - LOG_VERBOSE("no tokens to decode", {}); + SRV_WRN("%s", "no tokens to decode\n"); return; } - LOG_VERBOSE("decoding batch", { - {"n_tokens", batch.n_tokens}, - }); + SRV_DBG("decoding batch, n_tokens = %d\n", batch.n_tokens); // make sure we're in the right embedding mode llama_set_embeddings(ctx, batch_type == 1); @@ -2253,10 +2125,9 @@ struct server_context { const int bd = (slot.ga_w / slot.ga_n) * (slot.ga_n - 1); const int dd = (slot.ga_w / slot.ga_n) - ib * bd - slot.ga_w; - LOG_TEE("\n"); - LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i, slot.n_past_se, ib * bd, slot.ga_i + ib * bd, slot.n_past_se + ib * bd); - LOG_TEE("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n); - LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd); + SLT_DBG(slot, "shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i, slot.n_past_se, ib * bd, slot.ga_i + ib * bd, slot.n_past_se + ib * bd); + SLT_DBG(slot, "div: [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n); + SLT_DBG(slot, "shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd); llama_kv_cache_seq_add(ctx, slot.id + 1, slot.ga_i, slot.n_past_se, ib * bd); llama_kv_cache_seq_div(ctx, slot.id + 1, slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n); @@ -2266,7 +2137,7 @@ struct server_context { slot.ga_i += slot.ga_w / slot.ga_n; - LOG_TEE("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", 
slot.n_past_se + bd, slot.n_past_se, slot.ga_i); + SLT_DBG(slot, "\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", slot.n_past_se + bd, slot.n_past_se, slot.ga_i); } slot.n_past_se += n_tokens; @@ -2290,11 +2161,7 @@ struct server_context { if (ret != 0) { if (n_batch == 1 || ret < 0) { // if you get here, it means the KV cache is full - try increasing it via the context size - LOG_ERROR("failed to decode the batch: KV cache is full - try increasing it via the context size", { - {"i", i}, - {"n_batch", n_batch}, - {"ret", ret}, - }); + SRV_ERR("failed to decode the batch: KV cache is full - try increasing it via the context size, i = %d, n_batch = %d, ret = %d\n", i, n_batch, ret); for (auto & slot : slots) { slot.release(); send_error(slot, "Input prompt is too big compared to KV size. Please try increasing KV size."); @@ -2306,11 +2173,7 @@ struct server_context { n_batch /= 2; i -= n_batch; - LOG_WARNING("failed to find free space in the KV cache, retrying with smaller batch size - try increasing it via the context size or enable defragmentation", { - {"i", i}, - {"n_batch", n_batch}, - {"ret", ret}, - }); + SRV_WRN("failed to find free space in the KV cache, retrying with smaller batch size - try increasing it via the context size or enable defragmentation, i = %d, n_batch = %d, ret = %d\n", i, n_batch, ret); continue; // continue loop of n_batch } @@ -2370,7 +2233,7 @@ struct server_context { } } - LOG_VERBOSE("run slots completed", {}); + SRV_DBG("%s", "run slots completed\n"); } json model_meta() const { @@ -2391,19 +2254,18 @@ static void log_server_request(const httplib::Request & req, const httplib::Resp return; } - LOG_INFO("request", { - {"remote_addr", req.remote_addr}, - {"remote_port", req.remote_port}, - {"status", res.status}, - {"method", req.method}, - {"path", req.path}, - {"params", req.params}, - }); + //LOG_INFO("request", { + // {"remote_addr", req.remote_addr}, + // {"remote_port", req.remote_port}, + // {"status", res.status}, + // {"method", req.method}, + // {"path", req.path}, + // {"params", req.params}, + //}); + LOG_INF("request: %s %s %s %d\n", req.method.c_str(), req.path.c_str(), req.remote_addr.c_str(), res.status); - LOG_VERBOSE("request", { - {"request", req.body}, - {"response", res.body}, - }); + LOG_DBG("request: %s\n", req.body.c_str()); + LOG_DBG("response: %s\n", res.body.c_str()); } std::function shutdown_handler; @@ -2421,9 +2283,6 @@ inline void signal_handler(int signal) { } int main(int argc, char ** argv) { -#if SERVER_VERBOSE != 1 - log_disable(); -#endif // own arguments required by this example gpt_params params; @@ -2431,9 +2290,11 @@ int main(int argc, char ** argv) { return 1; } - // TODO: not great to use extern vars - server_log_json = params.log_json; - server_verbose = params.verbosity > 0; + gpt_init(); + + // enabling this will output extra debug information in the HTTP responses from the server + // see format_final_response_oaicompat() + const bool verbose = params.verbosity > 9; // struct that contains llama context and inference server_context ctx_server; @@ -2449,17 +2310,10 @@ int main(int argc, char ** argv) { llama_backend_init(); llama_numa_init(params.numa); - LOG_INFO("build info", { - {"build", LLAMA_BUILD_NUMBER}, - {"commit", LLAMA_COMMIT} - }); - - LOG_INFO("system info", { - {"n_threads", params.cpuparams.n_threads}, - {"n_threads_batch", params.cpuparams_batch.n_threads}, - {"total_threads", std::thread::hardware_concurrency()}, - {"system_info", llama_print_system_info()}, - }); + LOG_INF("system info: n_threads 
= %d, n_threads_batch = %d, total_threads = %d\n", params.cpuparams.n_threads, params.cpuparams_batch.n_threads, std::thread::hardware_concurrency()); + LOG_INF("\n"); + LOG_INF("%s\n", gpt_params_get_system_info(params).c_str()); + LOG_INF("\n"); std::unique_ptr svr; #ifdef CPPHTTPLIB_OPENSSL_SUPPORT @@ -2491,13 +2345,13 @@ int main(int argc, char ** argv) { svr->set_logger(log_server_request); - auto res_error = [](httplib::Response & res, json error_data) { + auto res_error = [](httplib::Response & res, const json & error_data) { json final_response {{"error", error_data}}; res.set_content(final_response.dump(-1, ' ', false, json::error_handler_t::replace), MIMETYPE_JSON); res.status = json_value(error_data, "code", 500); }; - auto res_ok = [](httplib::Response & res, json data) { + auto res_ok = [](httplib::Response & res, const json & data) { res.set_content(data.dump(-1, ' ', false, json::error_handler_t::replace), MIMETYPE_JSON); res.status = 200; }; @@ -2505,7 +2359,7 @@ int main(int argc, char ** argv) { svr->set_exception_handler([&res_error](const httplib::Request &, httplib::Response & res, std::exception_ptr ep) { std::string message; try { - std::rethrow_exception(std::move(ep)); + std::rethrow_exception(ep); } catch (std::exception & e) { message = e.what(); } catch (...) { @@ -2513,7 +2367,7 @@ int main(int argc, char ** argv) { } json formatted_error = format_error_response(message, ERROR_TYPE_SERVER); - LOG_VERBOSE("Got exception", formatted_error); + LOG_WRN("got exception: %s\n", formatted_error.dump().c_str()); res_error(res, formatted_error); }); @@ -2588,7 +2442,7 @@ int main(int argc, char ** argv) { // API key is invalid or not provided res_error(res, format_error_response("Invalid API Key", ERROR_TYPE_AUTHENTICATION)); - LOG_WARNING("Unauthorized: Invalid API Key", {}); + LOG_WRN("Unauthorized: Invalid API Key\n"); return false; }; @@ -2925,14 +2779,14 @@ int main(int argc, char ** argv) { } res_ok(res, arr); } - }, [&](json error_data) { + }, [&](const json & error_data) { res_error(res, error_data); }); } else { const auto chunked_content_provider = [task_ids, &ctx_server](size_t, httplib::DataSink & sink) { - ctx_server.receive_cmpl_results_stream(task_ids, [&](server_task_result result) -> bool { + ctx_server.receive_cmpl_results_stream(task_ids, [&](const server_task_result & result) -> bool { return server_sent_event(sink, "data", result.data); - }, [&](json error_data) { + }, [&](const json & error_data) { server_sent_event(sink, "error", error_data); }); sink.done(); @@ -2953,7 +2807,7 @@ int main(int argc, char ** argv) { }; // TODO: maybe merge this function with "handle_completions_generic" - const auto handle_chat_completions = [&ctx_server, ¶ms, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) { + const auto handle_chat_completions = [&ctx_server, ¶ms, &res_error, &res_ok, verbose](const httplib::Request & req, httplib::Response & res) { if (ctx_server.params.embedding) { res_error(res, format_error_response("This server does not support completions. 
Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED)); return; @@ -2970,16 +2824,16 @@ int main(int argc, char ** argv) { const auto completion_id = gen_chatcmplid(); if (!stream) { - ctx_server.receive_cmpl_results(task_ids, [&](std::vector & results) { + ctx_server.receive_cmpl_results(task_ids, [&](const std::vector & results) { // multitask is never support in chat completion, there is only one result - json result_oai = format_final_response_oaicompat(data, results[0].data, completion_id); + json result_oai = format_final_response_oaicompat(data, results[0].data, completion_id, /*.streaming =*/ false, verbose); res_ok(res, result_oai); - }, [&](json error_data) { + }, [&](const json & error_data) { res_error(res, error_data); }); } else { const auto chunked_content_provider = [task_ids, &ctx_server, completion_id](size_t, httplib::DataSink & sink) { - ctx_server.receive_cmpl_results_stream(task_ids, [&](server_task_result result) -> bool { + ctx_server.receive_cmpl_results_stream(task_ids, [&](const server_task_result & result) -> bool { std::vector result_array = format_partial_response_oaicompat(result.data, completion_id); for (auto & event_data : result_array) { if (event_data.empty()) { @@ -2990,7 +2844,7 @@ int main(int argc, char ** argv) { } } return true; // ok - }, [&](json error_data) { + }, [&](const json & error_data) { server_sent_event(sink, "error", error_data); }); static const std::string ev_done = "data: [DONE]\n\n"; @@ -3103,7 +2957,7 @@ int main(int argc, char ** argv) { for (const auto & res : results) { responses.push_back(res.data); } - }, [&](json error_data) { + }, [&](const json & error_data) { res_error(res, error_data); error = true; }); @@ -3122,12 +2976,12 @@ int main(int argc, char ** argv) { const auto handle_lora_adapters_list = [&](const httplib::Request &, httplib::Response & res) { json result = json::array(); - for (size_t i = 0; i < ctx_server.lora_adapters.size(); ++i) { - auto & la = ctx_server.lora_adapters[i]; + for (size_t i = 0; i < ctx_server.loras.size(); ++i) { + auto & lora = ctx_server.loras[i]; result.push_back({ {"id", i}, - {"path", la.path}, - {"scale", la.scale}, + {"path", lora.path}, + {"scale", lora.scale}, }); } res_ok(res, result); @@ -3136,11 +2990,11 @@ int main(int argc, char ** argv) { const auto handle_lora_adapters_apply = [&](const httplib::Request & req, httplib::Response & res) { const std::vector body = json::parse(req.body); - int max_idx = ctx_server.lora_adapters.size(); + int max_idx = ctx_server.loras.size(); // clear existing value - for (auto & la : ctx_server.lora_adapters) { - la.scale = 0.0f; + for (auto & lora : ctx_server.loras) { + lora.scale = 0.0f; } // set value @@ -3148,7 +3002,7 @@ int main(int argc, char ** argv) { int id = entry.at("id"); float scale = entry.at("scale"); if (0 <= id && id < max_idx) { - ctx_server.lora_adapters[id].scale = scale; + ctx_server.loras[id].scale = scale; } else { throw std::runtime_error("invalid adapter id"); } @@ -3243,59 +3097,59 @@ int main(int argc, char ** argv) { // bind HTTP listen port, run the HTTP server in a thread if (!svr->bind_to_port(params.hostname, params.port)) { - LOG_ERROR("couldn't bind HTTP server socket", { - {"hostname", params.hostname}, - {"port", params.port}, - }); + //LOG_ERROR("couldn't bind HTTP server socket", { + // {"hostname", params.hostname}, + // {"port", params.port}, + //}); + LOG_ERR("%s: couldn't bind HTTP server socket, hostname: %s, port: %d\n", __func__, params.hostname.c_str(), params.port); clean_up(); - 
LOG_ERROR("exiting due to HTTP server error", {}); return 1; } std::thread t([&]() { svr->listen_after_bind(); }); svr->wait_until_ready(); - LOG_INFO("HTTP server is listening", log_data); + //LOG_INFO("HTTP server is listening", log_data); + LOG_INF("%s: HTTP server is listening, hostname: %s, port: %d, http threads: %d\n", __func__, params.hostname.c_str(), params.port, params.n_threads_http); // load the model - LOG_INFO("loading model", log_data); + LOG_INF("%s: loading model\n", __func__); + if (!ctx_server.load_model(params)) { clean_up(); t.join(); - LOG_ERROR("exiting due to model loading error", {}); + LOG_ERR("%s: exiting due to model loading error\n", __func__); return 1; - } else { - ctx_server.init(); - state.store(SERVER_STATE_READY); - - LOG_INFO("model loaded", {}); - - // if a custom chat template is not supplied, we will use the one that comes with the model (if any) - if (params.chat_template.empty()) { - if (!ctx_server.validate_model_chat_template()) { - LOG_WARNING("The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", {}); - params.chat_template = "chatml"; - } - } - - // print sample chat example to make it clear which template is used - { - LOG_INFO("chat template", { - {"chat_example", llama_chat_format_example(ctx_server.model, params.chat_template)}, - {"built_in", params.chat_template.empty()}, - }); - } - - ctx_server.queue_tasks.on_new_task(std::bind( - &server_context::process_single_task, &ctx_server, std::placeholders::_1)); - ctx_server.queue_tasks.on_update_slots(std::bind( - &server_context::update_slots, &ctx_server)); - - shutdown_handler = [&](int) { - ctx_server.queue_tasks.terminate(); - }; - ctx_server.queue_tasks.start_loop(); } + ctx_server.init(); + state.store(SERVER_STATE_READY); + + LOG_INF("%s: model loaded\n", __func__); + + // if a custom chat template is not supplied, we will use the one that comes with the model (if any) + if (params.chat_template.empty()) { + if (!ctx_server.validate_model_chat_template()) { + LOG_WRN("%s: The chat template that comes with this model is not yet supported, falling back to chatml. 
This may cause the model to output suboptimal responses\n", __func__); + params.chat_template = "chatml"; + } + } + + // print sample chat example to make it clear which template is used + LOG_INF("%s: chat template, built_in: %d, chat_example: '%s\n'", __func__, params.chat_template.empty(), llama_chat_format_example(ctx_server.model, params.chat_template).c_str()); + + ctx_server.queue_tasks.on_new_task(std::bind( + &server_context::process_single_task, &ctx_server, std::placeholders::_1)); + ctx_server.queue_tasks.on_update_slots(std::bind( + &server_context::update_slots, &ctx_server)); + + shutdown_handler = [&](int) { + ctx_server.queue_tasks.terminate(); + }; + + LOG_INF("%s: server is listening on %s:%d - starting the main loop\n", __func__, params.hostname.c_str(), params.port); + + ctx_server.queue_tasks.start_loop(); + #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) struct sigaction sigint_action; sigint_action.sa_handler = signal_handler; diff --git a/examples/server/tests/.gitignore b/examples/server/tests/.gitignore new file mode 100644 index 000000000..1d17dae13 --- /dev/null +++ b/examples/server/tests/.gitignore @@ -0,0 +1 @@ +.venv diff --git a/examples/server/tests/README.md b/examples/server/tests/README.md index 5e6cb277b..10f22c447 100644 --- a/examples/server/tests/README.md +++ b/examples/server/tests/README.md @@ -40,7 +40,6 @@ It's possible to override some scenario steps values with environment variables: | `PORT` | `context.server_port` to set the listening port of the server during scenario, default: `8080` | | `LLAMA_SERVER_BIN_PATH` | to change the server binary path, default: `../../../build/bin/llama-server` | | `DEBUG` | "ON" to enable steps and server verbose mode `--verbose` | -| `SERVER_LOG_FORMAT_JSON` | if set switch server logs to json format | | `N_GPU_LAYERS` | number of model layers to offload to VRAM `-ngl --n-gpu-layers` | ### Run @bug, @wip or @wrong_usage annotated scenario diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py index 0f4249b13..062f084be 100644 --- a/examples/server/tests/features/steps/steps.py +++ b/examples/server/tests/features/steps/steps.py @@ -1372,8 +1372,6 @@ def start_server_background(context): server_args.append('--verbose') if context.lora_file: server_args.extend(['--lora', context.lora_file]) - if 'SERVER_LOG_FORMAT_JSON' not in os.environ: - server_args.extend(['--log-format', "text"]) args = [str(arg) for arg in [context.server_path, *server_args]] print(f"bench: starting server with: {' '.join(args)}") diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index adb1a1cb9..537c8a223 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -1,7 +1,8 @@ #pragma once -#include "llama.h" #include "common.h" +#include "log.h" +#include "llama.h" #ifndef NDEBUG // crash the server in debug mode, otherwise send an http 500 error @@ -15,10 +16,10 @@ #define JSON_ASSERT GGML_ASSERT #include "json.hpp" +#include +#include #include #include -#include -#include #define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613" @@ -35,32 +36,6 @@ enum error_type { ERROR_TYPE_NOT_SUPPORTED, // custom error }; -extern bool server_verbose; -extern bool server_log_json; - -#ifndef SERVER_VERBOSE -#define SERVER_VERBOSE 1 -#endif - -#if SERVER_VERBOSE != 1 -#define LOG_VERBOSE(MSG, ...) -#else -#define LOG_VERBOSE(MSG, ...) 
\ - do \ - { \ - if (server_verbose) \ - { \ - server_log("VERB", __func__, __LINE__, MSG, __VA_ARGS__); \ - } \ - } while (0) -#endif - -#define LOG_ERROR( MSG, ...) server_log("ERR", __func__, __LINE__, MSG, __VA_ARGS__) -#define LOG_WARNING(MSG, ...) server_log("WARN", __func__, __LINE__, MSG, __VA_ARGS__) -#define LOG_INFO( MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__) - -static inline void server_log(const char * level, const char * function, int line, const char * message, const json & extra); - template static T json_value(const json & body, const std::string & key, const T & default_value) { // Fallback null to default value @@ -68,9 +43,7 @@ static T json_value(const json & body, const std::string & key, const T & defaul try { return body.at(key); } catch (NLOHMANN_JSON_NAMESPACE::detail::type_error const &) { - std::stringstream ss; - ss << "Wrong type supplied for parameter '" << key << "'. Expected '" << json(default_value).type_name() << "', using default value."; - LOG_WARNING(ss.str().c_str(), body); + LOG_WRN("Wrong type supplied for parameter '%s'. Expected '%s', using default value\n", key.c_str(), json(default_value).type_name()); return default_value; } } else { @@ -78,48 +51,6 @@ static T json_value(const json & body, const std::string & key, const T & defaul } } -static inline void server_log(const char * level, const char * function, int line, const char * message, const json & extra) { - std::stringstream ss_tid; - ss_tid << std::this_thread::get_id(); - json log = json{ - {"tid", ss_tid.str()}, - {"timestamp", time(nullptr)}, - }; - - if (server_log_json) { - log.merge_patch({ - {"level", level}, - {"function", function}, - {"line", line}, - {"msg", message}, - }); - - if (!extra.empty()) { - log.merge_patch(extra); - } - - printf("%s\n", log.dump(-1, ' ', false, json::error_handler_t::replace).c_str()); - } else { - char buf[1024]; - snprintf(buf, 1024, "%4s [%24s] %s", level, function, message); - - if (!extra.empty()) { - log.merge_patch(extra); - } - std::stringstream ss; - ss << buf << " |"; - for (const auto & el : log.items()) - { - const std::string value = el.value().dump(-1, ' ', false, json::error_handler_t::replace); - ss << " " << el.key() << "=" << value; - } - - const std::string str = ss.str(); - printf("%.*s\n", (int)str.size(), str.data()); - } - fflush(stdout); -} - // // chat template utils // @@ -153,8 +84,9 @@ inline std::string format_chat(const struct llama_model * model, const std::stri chat.push_back({role, content}); } - auto formatted_chat = llama_chat_apply_template(model, tmpl, chat, true); - LOG_VERBOSE("formatted_chat", {{"text", formatted_chat.c_str()}}); + const auto formatted_chat = llama_chat_apply_template(model, tmpl, chat, true); + LOG_DBG("formatted_chat: '%s'\n", formatted_chat.c_str()); + return formatted_chat; } @@ -243,10 +175,7 @@ static std::string random_string() { } static std::string gen_chatcmplid() { - std::stringstream chatcmplid; - chatcmplid << "chatcmpl-" << random_string(); - - return chatcmplid.str(); + return "chatcmpl-" + random_string(); } // @@ -287,7 +216,7 @@ static size_t find_partial_stop_string(const std::string &stop, const std::strin return std::string::npos; } -static bool json_is_array_of_numbers(json data) { +static bool json_is_array_of_numbers(const json & data) { if (data.is_array()) { for (const auto & e : data) { if (!e.is_number()) { @@ -363,15 +292,13 @@ static json probs_vector_to_json(const llama_context * ctx, const std::vector unsupported_params { "tools", 
"tool_choice" }; - for (auto & param : unsupported_params) { + for (const auto & param : unsupported_params) { if (body.contains(param)) { throw std::runtime_error("Unsupported param: " + param); } @@ -444,7 +371,7 @@ static json oaicompat_completion_params_parse( return llama_params; } -static json format_final_response_oaicompat(const json & request, json result, const std::string & completion_id, bool streaming = false) { +static json format_final_response_oaicompat(const json & request, const json & result, const std::string & completion_id, bool streaming = false, bool verbose = false) { bool stopped_word = result.count("stopped_word") != 0; bool stopped_eos = json_value(result, "stopped_eos", false); int num_tokens_predicted = json_value(result, "tokens_predicted", 0); @@ -481,7 +408,8 @@ static json format_final_response_oaicompat(const json & request, json result, c {"id", completion_id} }; - if (server_verbose) { + // extra fields for debugging purposes + if (verbose) { res["__verbose"] = result; } @@ -493,7 +421,7 @@ static json format_final_response_oaicompat(const json & request, json result, c } // return value is vector as there is one case where we might need to generate two responses -static std::vector format_partial_response_oaicompat(json result, const std::string & completion_id) { +static std::vector format_partial_response_oaicompat(const json & result, const std::string & completion_id) { if (!result.contains("model") || !result.contains("oaicompat_token_ctr")) { return std::vector({result}); } @@ -595,7 +523,7 @@ static std::vector format_partial_response_oaicompat(json result, const st static json format_embeddings_response_oaicompat(const json & request, const json & embeddings) { json data = json::array(); int i = 0; - for (auto & elem : embeddings) { + for (const auto & elem : embeddings) { data.push_back(json{ {"embedding", json_value(elem, "embedding", json::array())}, {"index", i++}, diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp index 0c923d4ed..c2b7267c8 100644 --- a/examples/simple/simple.cpp +++ b/examples/simple/simple.cpp @@ -1,16 +1,14 @@ #include "arg.h" #include "common.h" +#include "log.h" #include "llama.h" -#include -#include -#include #include static void print_usage(int, char ** argv) { - LOG_TEE("\nexample usage:\n"); - LOG_TEE("\n %s -m model.gguf -p \"Hello my name is\" -n 32\n", argv[0]); - LOG_TEE("\n"); + LOG("\nexample usage:\n"); + LOG("\n %s -m model.gguf -p \"Hello my name is\" -n 32\n", argv[0]); + LOG("\n"); } int main(int argc, char ** argv) { @@ -23,6 +21,8 @@ int main(int argc, char ** argv) { return 1; } + gpt_init(); + // total length of the sequence including the prompt const int n_predict = params.n_predict; @@ -69,25 +69,24 @@ int main(int argc, char ** argv) { const int n_ctx = llama_n_ctx(ctx); const int n_kv_req = tokens_list.size() + (n_predict - tokens_list.size()); - LOG_TEE("\n%s: n_predict = %d, n_ctx = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, n_kv_req); + LOG("\n"); + LOG_INF("%s: n_predict = %d, n_ctx = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, n_kv_req); // make sure the KV cache is big enough to hold all the prompt and generated tokens if (n_kv_req > n_ctx) { - LOG_TEE("%s: error: n_kv_req > n_ctx, the required KV cache size is not big enough\n", __func__); - LOG_TEE("%s: either reduce n_predict or increase n_ctx\n", __func__); + LOG_ERR("%s: error: n_kv_req > n_ctx, the required KV cache size is not big enough\n", __func__); + LOG_ERR("%s: either reduce n_predict or increase 
n_ctx\n", __func__); return 1; } // print the prompt token-by-token - fprintf(stderr, "\n"); + LOG("\n"); for (auto id : tokens_list) { - fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str()); + LOG("%s", llama_token_to_piece(ctx, id).c_str()); } - fflush(stderr); - // create a llama_batch with size 512 // we use this object to submit token data for decoding @@ -102,7 +101,7 @@ int main(int argc, char ** argv) { batch.logits[batch.n_tokens - 1] = true; if (llama_decode(ctx, batch) != 0) { - LOG_TEE("%s: llama_decode() failed\n", __func__); + LOG("%s: llama_decode() failed\n", __func__); return 1; } @@ -116,16 +115,16 @@ int main(int argc, char ** argv) { while (n_cur <= n_predict) { // sample the next token { - const llama_token new_token_id = llama_sampler_sample(smpl, ctx, batch.n_tokens - 1); + const llama_token new_token_id = llama_sampler_sample(smpl, ctx, -1); // is it an end of generation? if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) { - LOG_TEE("\n"); + LOG("\n"); break; } - LOG_TEE("%s", llama_token_to_piece(ctx, new_token_id).c_str()); + LOG("%s", llama_token_to_piece(ctx, new_token_id).c_str()); fflush(stdout); // prepare the next batch @@ -141,23 +140,23 @@ int main(int argc, char ** argv) { // evaluate the current batch with the transformer model if (llama_decode(ctx, batch)) { - fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1); + LOG_ERR("%s : failed to eval, return code %d\n", __func__, 1); return 1; } } - LOG_TEE("\n"); + LOG("\n"); const auto t_main_end = ggml_time_us(); - LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n", + LOG_INF("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n", __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f)); - LOG_TEE("\n"); + LOG("\n"); llama_perf_sampler_print(smpl); llama_perf_context_print(ctx); - fprintf(stderr, "\n"); + LOG("\n"); llama_batch_free(batch); llama_sampler_free(smpl); diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index 843579acd..fbac21811 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -1,13 +1,16 @@ #include "arg.h" #include "common.h" #include "sampling.h" +#include "log.h" #include "llama.h" +#include #include +#include +#include +#include #include #include -#include -#include #define SPEC_VOCAB_MAX_SIZE_DIFFERENCE 100 #define SPEC_VOCAB_CHECK_START_TOKEN_ID 5 @@ -33,8 +36,10 @@ int main(int argc, char ** argv) { return 1; } + gpt_init(); + if (params.model_draft.empty()) { - fprintf(stderr, "%s: error: --model-draft is required\n", __func__); + LOG_ERR("%s: --model-draft is required\n", __func__); return 1; } @@ -47,12 +52,6 @@ int main(int argc, char ** argv) { std::default_random_engine rng(params.sparams.seed); std::uniform_real_distribution<> u_dist; -#ifndef LOG_DISABLE_LOGS - log_set_target(log_filename_generator("speculative", "log")); - LOG_TEE("Log start\n"); - log_dump_cmdline(argc, argv); -#endif // LOG_DISABLE_LOGS - // init llama.cpp llama_backend_init(); llama_numa_init(params.numa); @@ -81,14 +80,14 @@ int main(int argc, char ** argv) { ctx_dft = llama_init_dft.context; const bool vocab_type_tgt = llama_vocab_type(model_tgt); - LOG("vocab_type tgt: %d\n", vocab_type_tgt); + LOG_DBG("vocab_type tgt: %d\n", vocab_type_tgt); const bool vocab_type_dft = llama_vocab_type(model_dft); - LOG("vocab_type dft: %d\n", vocab_type_dft); + LOG_DBG("vocab_type dft: %d\n", vocab_type_dft); if (vocab_type_tgt 
!= vocab_type_dft) { - fprintf(stderr, "%s: error: draft model vocab type must match target model to use speculation but ", __func__); - fprintf(stderr, "vocab_type_dft = %d while vocab_type_tgt = %d\n", vocab_type_dft, vocab_type_tgt); + LOG_ERR("%s: draft model vocab type must match target model to use speculation but ", __func__); + LOG_ERR("vocab_type_dft = %d while vocab_type_tgt = %d\n", vocab_type_dft, vocab_type_tgt); return 1; } @@ -98,7 +97,7 @@ int main(int argc, char ** argv) { llama_token_bos(model_tgt) != llama_token_bos(model_dft) || llama_token_eos(model_tgt) != llama_token_eos(model_dft) ) { - fprintf(stderr, "%s: error: draft model special tokens must match target model to use speculation\n", __func__); + LOG_ERR("%s: draft model special tokens must match target model to use speculation\n", __func__); return 1; } @@ -110,8 +109,8 @@ int main(int argc, char ** argv) { : n_vocab_dft - n_vocab_tgt; if (vocab_diff > SPEC_VOCAB_MAX_SIZE_DIFFERENCE) { - fprintf(stderr, "%s: error: draft model vocab must closely match target model to use speculation but ", __func__); - fprintf(stderr, "target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n", + LOG_ERR("%s: draft model vocab must closely match target model to use speculation but ", __func__); + LOG_ERR("target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n", n_vocab_tgt, llama_n_vocab(model_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE); return 1; } @@ -120,8 +119,8 @@ int main(int argc, char ** argv) { const char * token_text_tgt = llama_token_get_text(model_tgt, i); const char * token_text_dft = llama_token_get_text(model_dft, i); if (std::strcmp(token_text_tgt, token_text_dft) != 0) { - fprintf(stderr, "%s: error: draft model vocab must match target model to use speculation but ", __func__); - fprintf(stderr, "token %d content differs - target '%s', draft '%s'\n", i, + LOG_ERR("%s: draft model vocab must match target model to use speculation but ", __func__); + LOG_ERR("token %d content differs - target '%s', draft '%s'\n", i, llama_token_to_piece(ctx_tgt, i).c_str(), llama_token_to_piece(ctx_dft, i).c_str()); return 1; @@ -138,18 +137,16 @@ int main(int argc, char ** argv) { const int max_tokens_list_size = max_context_size - 4; if ((int) inp.size() > max_tokens_list_size) { - fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size); + LOG_ERR("%s: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size); return 1; } - fprintf(stderr, "\n\n"); + LOG("\n\n"); for (auto id : inp) { - fprintf(stderr, "%s", llama_token_to_piece(ctx_tgt, id).c_str()); + LOG("%s", llama_token_to_piece(ctx_tgt, id).c_str()); } - fflush(stderr); - const int n_input = inp.size(); const auto t_enc_start = ggml_time_us(); @@ -211,7 +208,7 @@ int main(int argc, char ** argv) { active_seqs.insert(s); const auto & tokens = drafts[s].tokens; - LOG("draft %d: %s\n", s, LOG_TOKENS_TOSTR_PRETTY(ctx_dft, tokens).c_str()); + LOG_DBG("draft %d: %s\n", s, string_from(ctx_dft, tokens).c_str()); } int i_dft = 0; @@ -254,7 +251,7 @@ int main(int argc, char ** argv) { continue; } - LOG("verifying sequence #%d at pos #%d from %d active sequence(s)\n", s, i_dft, (int) active_seqs.size()); + LOG_DBG("verifying sequence #%d at pos #%d from %d active sequence(s)\n", s, i_dft, (int) active_seqs.size()); float r = u_dist(rng); llama_token_data_array dist_dft = { drafts[s].dists[i_dft].data() , 
drafts[s].dists[i_dft].size(), LLAMA_TOKEN_NULL, true }; @@ -272,7 +269,7 @@ int main(int argc, char ** argv) { break; } } - LOG("r = %f, p_dft = %f, p_tgt = %f\n", r, p_dft, p_tgt); + LOG_DBG("r = %f, p_dft = %f, p_tgt = %f\n", r, p_dft, p_tgt); if (r <= p_tgt / p_dft) { s_keep = s; accept = true; @@ -280,10 +277,10 @@ int main(int argc, char ** argv) { token_str = llama_token_to_piece(ctx_tgt, token_id); gpt_sampler_accept(smpl, token_id, true); - LOG("draft token %d of sequence %d (%d, '%s') accepted\n", i_dft, s, token_id, token_str.c_str()); + LOG_DBG("draft token %d of sequence %d (%d, '%s') accepted\n", i_dft, s, token_id, token_str.c_str()); break; } else { - LOG("draft token %d of sequence %d (%d, '%s') rejected\n", i_dft, s, drafts[s].tokens[i_dft], llama_token_to_piece(ctx_tgt, drafts[s].tokens[i_dft]).c_str()); + LOG_DBG("draft token %d of sequence %d (%d, '%s') rejected\n", i_dft, s, drafts[s].tokens[i_dft], llama_token_to_piece(ctx_tgt, drafts[s].tokens[i_dft]).c_str()); drafts[s].active = false; // calculate residual probability @@ -338,7 +335,7 @@ int main(int argc, char ** argv) { if (!accept) { // all drafted tokens were rejected // sample from the target model - LOG("all drafted tokens were rejected, sampling from residual distribution\n"); + LOG_DBG("all drafted tokens were rejected, sampling from residual distribution\n"); std::vector probs(dist_tgt.size); for (size_t i = 0; i < dist_tgt.size; ++i) { probs[i] = dist_tgt.data[i].p; @@ -356,13 +353,11 @@ int main(int argc, char ** argv) { // greedy verification // sample from the target model - LOG("sampling target: s_keep = %3d, i_dft = %3d, i_batch_tgt = %3d\n", s_keep, i_dft, drafts[s_keep].i_batch_tgt[i_dft]); + LOG_DBG("sampling target: s_keep = %3d, i_dft = %3d, i_batch_tgt = %3d\n", s_keep, i_dft, drafts[s_keep].i_batch_tgt[i_dft]); token_id = gpt_sampler_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft]); gpt_sampler_accept(smpl, token_id, true); - //LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, smpl->prev).c_str()); - token_str = llama_token_to_piece(ctx_tgt, token_id); for (int s = 0; s < n_seq_dft; ++s) { @@ -371,7 +366,7 @@ int main(int argc, char ** argv) { } if (i_dft < (int) drafts[s].tokens.size() && token_id == drafts[s].tokens[i_dft]) { - LOG("the sampled target token matches the %dth drafted token of sequence %d (%d, '%s') - accepted\n", i_dft, s, token_id, token_str.c_str()); + LOG_DBG("the sampled target token matches the %dth drafted token of sequence %d (%d, '%s') - accepted\n", i_dft, s, token_id, token_str.c_str()); s_keep = s; accept = true; @@ -393,26 +388,24 @@ int main(int argc, char ** argv) { ++i_dft; if (params.use_color) { // Color token according to its origin sequence - printf("\u001b[%dm%s\u001b[37m", (36 - s_keep % 6), token_str.c_str()); + LOG("\u001b[%dm%s\u001b[37m", (36 - s_keep % 6), token_str.c_str()); } else { - printf("%s", token_str.c_str()); + LOG("%s", token_str.c_str()); } - fflush(stdout); continue; } else { - printf("%s", token_str.c_str()); - fflush(stdout); + LOG("%s", token_str.c_str()); break; } } } { - LOG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", token_id, token_str.c_str()); + LOG_DBG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", token_id, token_str.c_str()); // TODO: simplify { - LOG("keeping sequence %d, n_past_tgt = %d, n_past_dft = %d\n", s_keep, n_past_tgt, n_past_dft); + LOG_DBG("keeping sequence %d, n_past_tgt = %d, n_past_dft = %d\n", s_keep, n_past_tgt, 
n_past_dft); llama_kv_cache_seq_keep(ctx_dft, s_keep); llama_kv_cache_seq_cp (ctx_dft, s_keep, 0, -1, -1); @@ -439,7 +432,7 @@ int main(int argc, char ** argv) { llama_batch_add (batch_dft, token_id, n_past_dft, { 0 }, true); llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, -1); - // LOG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str()); + // LOG_DBG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str()); llama_decode(ctx_dft, batch_dft); ++n_past_dft; @@ -486,7 +479,7 @@ int main(int argc, char ** argv) { const auto * cur_p = gpt_sampler_get_candidates(drafts[s].smpl); for (int k = 0; k < std::min(n_seq_dft + 3, (int) cur_p->size); ++k) { - LOG(" - draft candidate %3d for seq %3d, pos %3d: %6d (%8.3f) '%s'\n", + LOG_DBG(" - draft candidate %3d for seq %3d, pos %3d: %6d (%8.3f) '%s'\n", k, s, i, cur_p->data[k].id, cur_p->data[k].p, llama_token_to_piece(ctx_dft, cur_p->data[k].id).c_str()); } @@ -495,7 +488,7 @@ int main(int argc, char ** argv) { // attempt to split the branch if the probability is high enough for (int f = 1; f < 8; ++f) { if (n_seq_cur < n_seq_dft && cur_p->data[f].p > p_split) { - LOG("splitting seq %3d into %3d\n", s, n_seq_cur); + LOG_DBG("splitting seq %3d into %3d\n", s, n_seq_cur); llama_kv_cache_seq_rm(ctx_dft, n_seq_cur, -1, -1); llama_kv_cache_seq_cp(ctx_dft, s, n_seq_cur, -1, -1); @@ -584,7 +577,7 @@ int main(int argc, char ** argv) { llama_kv_cache_seq_cp(ctx_tgt, 0, s, -1, -1); } - // LOG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str()); + // LOG_DBG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str()); llama_decode(ctx_tgt, batch_tgt); ++n_past_tgt; } @@ -602,23 +595,25 @@ int main(int argc, char ** argv) { auto t_dec_end = ggml_time_us(); - LOG_TEE("\n\n"); + LOG("\n\n"); - LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f)); - LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f)); + LOG_INF("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f)); + LOG_INF("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f)); - LOG_TEE("\n"); - LOG_TEE("n_draft = %d\n", n_draft); - LOG_TEE("n_predict = %d\n", n_predict); - LOG_TEE("n_drafted = %d\n", n_drafted); - LOG_TEE("n_accept = %d\n", n_accept); - LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted); + LOG_INF("\n"); + LOG_INF("n_draft = %d\n", n_draft); + LOG_INF("n_predict = %d\n", n_predict); + LOG_INF("n_drafted = %d\n", n_drafted); + LOG_INF("n_accept = %d\n", n_accept); + LOG_INF("accept = %.3f%%\n", 100.0f * n_accept / n_drafted); - LOG_TEE("\ndraft:\n\n"); + LOG_INF("\n"); + LOG_INF("draft:\n\n"); // TODO: print sampling/grammar timings for all drafts llama_perf_context_print(ctx_dft); - LOG_TEE("\ntarget:\n\n"); + LOG_INF("\n"); + LOG_INF("target:\n\n"); gpt_perf_print(ctx_tgt, smpl); gpt_sampler_free(smpl); @@ -637,7 +632,7 @@ int main(int argc, char ** argv) { llama_backend_free(); - fprintf(stderr, "\n\n"); + LOG("\n\n"); return 0; } diff --git a/examples/tokenize/tokenize.cpp b/examples/tokenize/tokenize.cpp index c817be566..a9af6471f 100644 --- a/examples/tokenize/tokenize.cpp +++ b/examples/tokenize/tokenize.cpp @@ 
-1,11 +1,13 @@ #include "common.h" +//#include "log.h" // TODO: start using log.h #include "llama.h" -#include #include +#include #include #include #include +#include // TODO: remove me #if defined(_WIN32) #define WIN32_LEAN_AND_MEAN @@ -13,25 +15,25 @@ #include // For CommandLineToArgvW #endif -static void print_usage_information(const char * argv0, FILE * stream) { - fprintf(stream, "usage: %s [options]\n\n", argv0); - fprintf(stream, "The tokenize program tokenizes a prompt using a given model,\n"); - fprintf(stream, "and prints the resulting tokens to standard output.\n\n"); - fprintf(stream, "It needs a model file, a prompt, and optionally other flags\n"); - fprintf(stream, "to control the behavior of the tokenizer.\n\n"); - fprintf(stream, " The possible options are:\n"); - fprintf(stream, "\n"); - fprintf(stream, " -h, --help print this help and exit\n"); - fprintf(stream, " -m MODEL_PATH, --model MODEL_PATH path to model.\n"); - fprintf(stream, " --ids if given, only print numerical token IDs, and not token strings.\n"); - fprintf(stream, " The output format looks like [1, 2, 3], i.e. parseable by Python.\n"); - fprintf(stream, " -f PROMPT_FNAME, --file PROMPT_FNAME read prompt from a file.\n"); - fprintf(stream, " -p PROMPT, --prompt PROMPT read prompt from the argument.\n"); - fprintf(stream, " --stdin read prompt from standard input.\n"); - fprintf(stream, " --no-bos do not ever add a BOS token to the prompt, even if normally the model uses a BOS token.\n"); - fprintf(stream, " --no-parse-special do not parse control tokens.\n"); - fprintf(stream, " --log-disable disable logs. Makes stderr quiet when loading the model.\n"); - fprintf(stream, " --show-count print the total number of tokens.\n"); +static void print_usage_information(const char * argv0) { + printf("usage: %s [options]\n\n", argv0); + printf("The tokenize program tokenizes a prompt using a given model,\n"); + printf("and prints the resulting tokens to standard output.\n\n"); + printf("It needs a model file, a prompt, and optionally other flags\n"); + printf("to control the behavior of the tokenizer.\n\n"); + printf(" The possible options are:\n"); + printf("\n"); + printf(" -h, --help print this help and exit\n"); + printf(" -m MODEL_PATH, --model MODEL_PATH path to model.\n"); + printf(" --ids if given, only print numerical token IDs, and not token strings.\n"); + printf(" The output format looks like [1, 2, 3], i.e. parseable by Python.\n"); + printf(" -f PROMPT_FNAME, --file PROMPT_FNAME read prompt from a file.\n"); + printf(" -p PROMPT, --prompt PROMPT read prompt from the argument.\n"); + printf(" --stdin read prompt from standard input.\n"); + printf(" --no-bos do not ever add a BOS token to the prompt, even if normally the model uses a BOS token.\n"); + printf(" --no-parse-special do not parse control tokens.\n"); + printf(" --log-disable disable logs. 
Makes stderr quiet when loading the model.\n"); + printf(" --show-count print the total number of tokens.\n"); } static void llama_log_callback_null(ggml_log_level level, const char * text, void * user_data) { @@ -185,7 +187,7 @@ int main(int raw_argc, char ** raw_argv) { const int argc = argv.size(); if (argc <= 1) { - print_usage_information(argv[0].c_str(), stderr); + print_usage_information(argv[0].c_str()); return 1; } @@ -214,7 +216,7 @@ int main(int raw_argc, char ** raw_argv) { for (; iarg < argc; ++iarg) { std::string arg{argv[iarg]}; if (arg == "-h" || arg == "--help") { - print_usage_information(argv[0].c_str(), stdout); + print_usage_information(argv[0].c_str()); return 0; } else if (arg == "--ids") { @@ -323,10 +325,6 @@ int main(int raw_argc, char ** raw_argv) { // Start actually doing the tokenizing stuff. ////// -#ifdef LOG_DISABLE_LOGS - disable_logging = true; -#endif - if (disable_logging) { llama_log_set(llama_log_callback_null, NULL); } diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 13026ab32..a413df357 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -564,10 +564,11 @@ extern "C" { }; enum ggml_log_level { - GGML_LOG_LEVEL_ERROR = 2, - GGML_LOG_LEVEL_WARN = 3, - GGML_LOG_LEVEL_INFO = 4, - GGML_LOG_LEVEL_DEBUG = 5 + GGML_LOG_LEVEL_NONE = 0, + GGML_LOG_LEVEL_INFO = 1, + GGML_LOG_LEVEL_WARN = 2, + GGML_LOG_LEVEL_ERROR = 3, + GGML_LOG_LEVEL_DEBUG = 4, }; enum ggml_tensor_flag { diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m index 6c85acfec..7f85dc30d 100644 --- a/ggml/src/ggml-metal.m +++ b/ggml/src/ggml-metal.m @@ -13,13 +13,16 @@ #define MAX(a, b) ((a) > (b) ? (a) : (b)) #ifdef GGML_METAL_NDEBUG +#define GGML_METAL_LOG(...) #define GGML_METAL_LOG_INFO(...) #define GGML_METAL_LOG_WARN(...) #define GGML_METAL_LOG_ERROR(...) #else +#define GGML_METAL_LOG(...) ggml_metal_log(GGML_LOG_LEVEL_NONE, __VA_ARGS__) #define GGML_METAL_LOG_INFO(...) ggml_metal_log(GGML_LOG_LEVEL_INFO, __VA_ARGS__) #define GGML_METAL_LOG_WARN(...) ggml_metal_log(GGML_LOG_LEVEL_WARN, __VA_ARGS__) #define GGML_METAL_LOG_ERROR(...) ggml_metal_log(GGML_LOG_LEVEL_ERROR, __VA_ARGS__) +#define GGML_METAL_LOG_DEBUG(...) ggml_metal_log(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__) #endif #define UNUSED(x) (void)(x) @@ -3183,7 +3186,7 @@ static void ggml_backend_metal_log_allocated_size(id device, size_t s #ifndef GGML_METAL_NDEBUG #if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15) if (@available(macOS 10.12, iOS 16.0, *)) { - GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB, (%8.2f / %8.2f)", + GGML_METAL_LOG_DEBUG("%s: allocated buffer, size = %8.2f MiB, (%8.2f / %8.2f)\n", __func__, size_aligned / 1024.0 / 1024.0, device.currentAllocatedSize / 1024.0 / 1024.0, @@ -3191,8 +3194,6 @@ static void ggml_backend_metal_log_allocated_size(id device, size_t s if (device.currentAllocatedSize > device.recommendedMaxWorkingSetSize) { GGML_METAL_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__); - } else { - GGML_METAL_LOG_INFO("\n"); } } else { GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB, (%8.2f)\n", diff --git a/src/llama-impl.h b/src/llama-impl.h index 87012617f..2bde75ec1 100644 --- a/src/llama-impl.h +++ b/src/llama-impl.h @@ -24,6 +24,7 @@ LLAMA_ATTRIBUTE_FORMAT(2, 3) void llama_log_internal (ggml_log_level level, const char * format, ...); void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data); +#define LLAMA_LOG(...) 
llama_log_internal(GGML_LOG_LEVEL_NONE , __VA_ARGS__) #define LLAMA_LOG_INFO(...) llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__) #define LLAMA_LOG_WARN(...) llama_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__) #define LLAMA_LOG_ERROR(...) llama_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__) diff --git a/src/llama.cpp b/src/llama.cpp index 1986a90fb..c917d1c7b 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -18074,9 +18074,9 @@ struct llama_model * llama_load_model_from_file( unsigned percentage = (unsigned) (100 * progress); while (percentage > *cur_percentage_p) { *cur_percentage_p = percentage; - LLAMA_LOG_INFO("."); + LLAMA_LOG("."); if (percentage >= 100) { - LLAMA_LOG_INFO("\n"); + LLAMA_LOG("\n"); } } return true; @@ -20781,8 +20781,8 @@ static void llama_log_internal_v(ggml_log_level level, const char * format, va_l if (len < 128) { g_state.log_callback(level, buffer, g_state.log_callback_user_data); } else { - char* buffer2 = new char[len+1]; - vsnprintf(buffer2, len+1, format, args_copy); + char * buffer2 = new char[len + 1]; + vsnprintf(buffer2, len + 1, format, args_copy); buffer2[len] = 0; g_state.log_callback(level, buffer2, g_state.log_callback_user_data); delete[] buffer2; diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 30e71cfd4..7dcd3fce8 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -108,6 +108,7 @@ llama_test(test-tokenizer-1-spm NAME test-tokenizer-1-llama-spm ARGS ${CMAKE_CU #llama_test(test-tokenizer-1-spm NAME test-tokenizer-1-baichuan ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf) # llama_target_and_test(test-double-float.cpp) # SLOW +llama_target_and_test(test-log.cpp) llama_target_and_test(test-arg-parser.cpp) llama_target_and_test(test-quantize-fns.cpp) llama_target_and_test(test-quantize-perf.cpp) diff --git a/tests/test-arg-parser.cpp b/tests/test-arg-parser.cpp index f26707910..e07d09733 100644 --- a/tests/test-arg-parser.cpp +++ b/tests/test-arg-parser.cpp @@ -85,7 +85,7 @@ int main(void) { argv = {"binary_name", "--verbose"}; assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); - assert(params.verbosity == 1); + assert(params.verbosity > 1); argv = {"binary_name", "-m", "abc.gguf", "--predict", "6789", "--batch-size", "9090"}; assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); diff --git a/tests/test-log.cpp b/tests/test-log.cpp new file mode 100644 index 000000000..211222369 --- /dev/null +++ b/tests/test-log.cpp @@ -0,0 +1,39 @@ +#include "log.h" + +#include +#include + +int main() { + const int n_thread = 8; + + std::thread threads[n_thread]; + for (int i = 0; i < n_thread; i++) { + threads[i] = std::thread([i]() { + const int n_msg = 1000; + + for (int j = 0; j < n_msg; j++) { + const int log_type = std::rand() % 4; + + switch (log_type) { + case 0: LOG_INF("Thread %d: %d\n", i, j); break; + case 1: LOG_WRN("Thread %d: %d\n", i, j); break; + case 2: LOG_ERR("Thread %d: %d\n", i, j); break; + case 3: LOG_DBG("Thread %d: %d\n", i, j); break; + default: + break; + } + + if (rand () % 10 < 5) { + gpt_log_set_timestamps(gpt_log_main(), rand() % 2); + gpt_log_set_prefix (gpt_log_main(), rand() % 2); + } + } + }); + } + + for (int i = 0; i < n_thread; i++) { + threads[i].join(); + } + + return 0; +} From 90a2fff0e7f80c6fea3fc6cf9a7b482744f3f164 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 16 Sep 2024 05:14:23 +0300 Subject: [PATCH 14/51] flake.lock: Update (#9488) --- 
flake.lock | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/flake.lock b/flake.lock index e9382ff3d..0db5ff92a 100644 --- a/flake.lock +++ b/flake.lock @@ -5,11 +5,11 @@ "nixpkgs-lib": "nixpkgs-lib" }, "locked": { - "lastModified": 1725234343, - "narHash": "sha256-+ebgonl3NbiKD2UD0x4BszCZQ6sTfL4xioaM49o5B3Y=", + "lastModified": 1726153070, + "narHash": "sha256-HO4zgY0ekfwO5bX0QH/3kJ/h4KvUDFZg8YpkNwIbg1U=", "owner": "hercules-ci", "repo": "flake-parts", - "rev": "567b938d64d4b4112ee253b9274472dc3a346eb6", + "rev": "bcef6817a8b2aa20a5a6dbb19b43e63c5bf8619a", "type": "github" }, "original": { @@ -20,11 +20,11 @@ }, "nixpkgs": { "locked": { - "lastModified": 1725634671, - "narHash": "sha256-v3rIhsJBOMLR8e/RNWxr828tB+WywYIoajrZKFM+0Gg=", + "lastModified": 1726062873, + "narHash": "sha256-IiA3jfbR7K/B5+9byVi9BZGWTD4VSbWe8VLpp9B/iYk=", "owner": "NixOS", "repo": "nixpkgs", - "rev": "574d1eac1c200690e27b8eb4e24887f8df7ac27c", + "rev": "4f807e8940284ad7925ebd0a0993d2a1791acb2f", "type": "github" }, "original": { From c4965a64f72ac9434c21cf0e1c3421d13e889155 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 16 Sep 2024 09:05:56 +0300 Subject: [PATCH 15/51] metal : handle zero-sized allocs (#9466) --- ggml/src/ggml-metal.m | 48 ++++++++++++++++++++++++++----------------- 1 file changed, 29 insertions(+), 19 deletions(-) diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m index 7f85dc30d..f87181d19 100644 --- a/ggml/src/ggml-metal.m +++ b/ggml/src/ggml-metal.m @@ -3225,15 +3225,19 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buff ctx->n_buffers = 1; if (ctx->all_data != NULL) { - ctx->buffers[0].data = ctx->all_data; - ctx->buffers[0].size = size; - ctx->buffers[0].metal = [device newBufferWithBytesNoCopy:ctx->all_data - length:size_aligned - options:MTLResourceStorageModeShared - deallocator:nil]; + ctx->buffers[0].data = ctx->all_data; + ctx->buffers[0].size = size; + ctx->buffers[0].metal = nil; + + if (size_aligned > 0) { + ctx->buffers[0].metal = [device newBufferWithBytesNoCopy:ctx->all_data + length:size_aligned + options:MTLResourceStorageModeShared + deallocator:nil]; + } } - if (ctx->all_data == NULL || ctx->buffers[0].metal == nil) { + if (size_aligned > 0 && (ctx->all_data == NULL || ctx->buffers[0].metal == nil)) { GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0); free(ctx); ggml_backend_metal_free_device(); @@ -3310,14 +3314,17 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, // the buffer fits into the max buffer size allowed by the device if (size_aligned <= device.maxBufferLength) { - ctx->buffers[ctx->n_buffers].data = data; - ctx->buffers[ctx->n_buffers].size = size; + ctx->buffers[ctx->n_buffers].data = data; + ctx->buffers[ctx->n_buffers].size = size; + ctx->buffers[ctx->n_buffers].metal = nil; - ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil]; + if (size_aligned > 0) { + ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil]; - if (ctx->buffers[ctx->n_buffers].metal == nil) { - GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0); - return false; + if (ctx->buffers[ctx->n_buffers].metal == nil) { + GGML_METAL_LOG_ERROR("%s: error: failed 
to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0); + return false; + } } ggml_backend_metal_log_allocated_size(device, size_aligned); @@ -3333,14 +3340,17 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, for (size_t i = 0; i < size; i += size_step) { const size_t size_step_aligned = (i + size_view <= size) ? size_view : (size_aligned - i); - ctx->buffers[ctx->n_buffers].data = (void *) ((uint8_t *) data + i); - ctx->buffers[ctx->n_buffers].size = size_step_aligned; + ctx->buffers[ctx->n_buffers].data = (void *) ((uint8_t *) data + i); + ctx->buffers[ctx->n_buffers].size = size_step_aligned; + ctx->buffers[ctx->n_buffers].metal = nil; - ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil]; + if (size_step_aligned > 0) { + ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil]; - if (ctx->buffers[ctx->n_buffers].metal == nil) { - GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_step_aligned / 1024.0 / 1024.0); - return false; + if (ctx->buffers[ctx->n_buffers].metal == nil) { + GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_step_aligned / 1024.0 / 1024.0); + return false; + } } ggml_backend_metal_log_allocated_size(device, size_step_aligned); From 441b72b91f818fe69497e5816f87969e90c73c43 Mon Sep 17 00:00:00 2001 From: Vinesh Janarthanan <36610342+VJHack@users.noreply.github.com> Date: Mon, 16 Sep 2024 01:20:01 -0500 Subject: [PATCH 16/51] main : option to disable context shift (#9484) * added cli arg to disable context shift * reverted precommit * updated README.md for main * white space * allow disabling context shift in the server * Update common/arg.cpp no-context-shift only works for main example Co-authored-by: Georgi Gerganov * added server example to --no-context-shift args * removed server changes * white space --------- Co-authored-by: Georgi Gerganov --- common/arg.cpp | 8 ++++++- common/common.h | 1 + examples/main/README.md | 2 ++ examples/main/main.cpp | 46 +++++++++++++++++++++++------------------ 4 files changed, 36 insertions(+), 21 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 8fcb8c25f..60e37a89a 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -685,6 +685,13 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, params.n_keep = value; } )); + add_opt(llama_arg( + {"--no-context-shift"}, + format("disables context shift on inifinite text generation (default: %s)", params.ctx_shift ? 
"disabled" : "enabled"), + [](gpt_params & params) { + params.ctx_shift = false; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(llama_arg( {"--chunks"}, "N", format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks), @@ -1985,4 +1992,3 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, return ctx_arg; } - diff --git a/common/common.h b/common/common.h index e100c8fa7..cb87c4479 100644 --- a/common/common.h +++ b/common/common.h @@ -246,6 +246,7 @@ struct gpt_params { bool cont_batching = true; // insert new sequences for decoding on-the-fly bool flash_attn = false; // flash attention bool no_perf = false; // disable performance metrics + bool ctx_shift = true; // context shift on inifinite text generation bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix bool logits_all = false; // return logits for all tokens in the batch diff --git a/examples/main/README.md b/examples/main/README.md index 9396a34fa..6730effdf 100644 --- a/examples/main/README.md +++ b/examples/main/README.md @@ -161,6 +161,8 @@ A value of -1 will enable infinite text generation, even though we have a finite If the pause is undesirable, a value of -2 will stop generation immediately when the context is filled. +The `--no-context-shift` option allows you to stop the infinite text generation once the finite context window is full. + It is important to note that the generated text may be shorter than the specified number of tokens if an End-of-Sequence (EOS) token or a reverse prompt is encountered. In interactive mode, text generation will pause and control will be returned to the user. In non-interactive mode, the program will end. In both cases, the text generation may stop before reaching the specified `--predict` value. If you want the model to keep going without ever producing End-of-Sequence on its own, you can use the `--ignore-eos` parameter. 
### Temperature diff --git a/examples/main/main.cpp b/examples/main/main.cpp index d9e45ce2f..91fea9326 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -559,29 +559,35 @@ int main(int argc, char ** argv) { // if we run out of context: // - take the n_keep first tokens from the original prompt (via n_past) // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches + if (n_past + (int) embd.size() >= n_ctx) { - if (params.n_predict == -2) { - LOG_DBG("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict); + if (!params.ctx_shift){ + LOG_DBG("\n\n%s: context full and context shift is disabled => stopping\n", __func__); break; + } else { + if (params.n_predict == -2) { + LOG_DBG("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict); + break; + } + + const int n_left = n_past - params.n_keep; + const int n_discard = n_left/2; + + LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n", + n_past, n_left, n_ctx, params.n_keep, n_discard); + + llama_kv_cache_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard); + llama_kv_cache_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard); + + n_past -= n_discard; + + LOG_DBG("after swap: n_past = %d\n", n_past); + + LOG_DBG("embd: %s\n", string_from(ctx, embd).c_str()); + + LOG_DBG("clear session path\n"); + path_session.clear(); } - - const int n_left = n_past - params.n_keep; - const int n_discard = n_left/2; - - LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n", - n_past, n_left, n_ctx, params.n_keep, n_discard); - - llama_kv_cache_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard); - llama_kv_cache_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard); - - n_past -= n_discard; - - LOG_DBG("after swap: n_past = %d\n", n_past); - - LOG_DBG("embd: %s\n", string_from(ctx, embd).c_str()); - - LOG_DBG("clear session path\n"); - path_session.clear(); } } else { // context extension via Self-Extend From 95ca85168b5b089b80a811b59528ce0a2f1bd1dd Mon Sep 17 00:00:00 2001 From: CarryFun <76023481+CarryFun@users.noreply.github.com> Date: Mon, 16 Sep 2024 14:45:20 +0800 Subject: [PATCH 17/51] llama : support MiniCPM3 (#9322) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: 范睿凯 --- convert_hf_to_gguf.py | 54 +++++++ gguf-py/gguf/constants.py | 19 +++ src/llama.cpp | 298 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 371 insertions(+) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 2c6d5d95b..d995ed764 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -1841,6 +1841,60 @@ class MiniCPMModel(Model): return [(self.map_tensor_name(name), data_torch)] +@Model.register("MiniCPM3ForCausalLM") +class MiniCPM3Model(Model): + model_arch = gguf.MODEL_ARCH.MINICPM3 + + def set_gguf_parameters(self): + hparams = self.hparams + + rope_dims = hparams["qk_rope_head_dim"] + + self.gguf_writer.add_file_type(self.ftype) + self.gguf_writer.add_context_length(hparams["max_position_embeddings"]) + self.gguf_writer.add_embedding_length(hparams["hidden_size"]) + self.gguf_writer.add_block_count(self.block_count) + self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) + self.gguf_writer.add_head_count(hparams["num_attention_heads"]) + self.gguf_writer.add_head_count_kv(hparams["num_key_value_heads"]) + 
self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"]) + self.gguf_writer.add_vocab_size(hparams["vocab_size"]) + if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None: + self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"]) + self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"]) + self.gguf_writer.add_key_length(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"]) + self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"]) + + rope_scaling = self.find_hparam(['rope_scaling'], True) + if rope_scaling is None: + return + + long_factors = rope_scaling.get('long_factor', None) + short_factors = rope_scaling.get('short_factor', None) + + if long_factors is None or short_factors is None: + raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor') + + if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2: + raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}') + + self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_LONG] + ".weight", np.array(long_factors, dtype=np.float32)) + self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT] + ".weight", np.array(short_factors, dtype=np.float32)) + + def set_vocab(self): + self._set_vocab_llama_hf() + + def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor: + if n_kv_head is not None and n_head != n_kv_head: + n_head //= n_kv_head + + return ( + weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) + .swapaxes(1, 2) + .reshape(weights.shape) + ) + + @Model.register("QWenLMHeadModel") class QwenModel(Model): model_arch = gguf.MODEL_ARCH.QWEN diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index c87d08782..2c8545455 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -210,6 +210,7 @@ class MODEL_ARCH(IntEnum): ORION = auto() INTERNLM2 = auto() MINICPM = auto() + MINICPM3 = auto() GEMMA = auto() GEMMA2 = auto() STARCODER2 = auto() @@ -364,6 +365,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { MODEL_ARCH.ORION: "orion", MODEL_ARCH.INTERNLM2: "internlm2", MODEL_ARCH.MINICPM: "minicpm", + MODEL_ARCH.MINICPM3: "minicpm3", MODEL_ARCH.GEMMA: "gemma", MODEL_ARCH.GEMMA2: "gemma2", MODEL_ARCH.STARCODER2: "starcoder2", @@ -867,6 +869,23 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.FFN_DOWN_EXP, MODEL_TENSOR.FFN_UP_EXP, ], + MODEL_ARCH.MINICPM3: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q_A, + MODEL_TENSOR.ATTN_Q_B, + MODEL_TENSOR.ATTN_KV_A_MQA, + MODEL_TENSOR.ATTN_KV_B, + MODEL_TENSOR.ATTN_Q_A_NORM, + MODEL_TENSOR.ATTN_KV_A_NORM, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + ], MODEL_ARCH.GEMMA: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, diff --git a/src/llama.cpp b/src/llama.cpp index c917d1c7b..aa565723b 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -193,6 +193,7 @@ enum llm_arch { LLM_ARCH_ORION, LLM_ARCH_INTERNLM2, LLM_ARCH_MINICPM, + LLM_ARCH_MINICPM3, LLM_ARCH_GEMMA, LLM_ARCH_GEMMA2, LLM_ARCH_STARCODER2, @@ -241,6 +242,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_ORION, "orion" }, { LLM_ARCH_INTERNLM2, "internlm2" }, { LLM_ARCH_MINICPM, "minicpm" }, + { LLM_ARCH_MINICPM3, "minicpm3" }, { LLM_ARCH_GEMMA, "gemma" }, { LLM_ARCH_GEMMA2, 
"gemma2" }, { LLM_ARCH_STARCODER2, "starcoder2" }, @@ -1034,6 +1036,29 @@ static const std::map> LLM_TENSOR_NA { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" }, }, }, + { + LLM_ARCH_MINICPM3, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ROPE_FACTORS_LONG, "rope_factors_long" }, + { LLM_TENSOR_ROPE_FACTORS_SHORT, "rope_factors_short" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q_A_NORM, "blk.%d.attn_q_a_norm" }, + { LLM_TENSOR_ATTN_KV_A_NORM, "blk.%d.attn_kv_a_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_Q_A, "blk.%d.attn_q_a" }, + { LLM_TENSOR_ATTN_Q_B, "blk.%d.attn_q_b" }, + { LLM_TENSOR_ATTN_KV_A_MQA, "blk.%d.attn_kv_a_mqa" }, + { LLM_TENSOR_ATTN_KV_B, "blk.%d.attn_kv_b" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + }, + }, { LLM_ARCH_GEMMA, { @@ -5390,6 +5415,17 @@ static void llm_load_hparams( default: model.type = e_model::MODEL_UNKNOWN; } } break; + case LLM_ARCH_MINICPM3: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q); + ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv); + + switch (hparams.n_layer) { + case 62: model.type = e_model::MODEL_4B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; case LLM_ARCH_GROK: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); @@ -6897,6 +6933,54 @@ static bool llm_load_tensors( } } } break; + case LLM_ARCH_MINICPM3: + { + const int64_t n_embd_head_qk_rope = hparams.n_rot; + const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot; + + const int64_t q_lora_rank = hparams.n_lora_q; + const int64_t kv_lora_rank = hparams.n_lora_kv; + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + + // output + { + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED); + + // if output is NULL, init from the input tok embed + if (model.output == NULL) { + model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); + } + } + + for (int i = 0; i < n_layer; ++i) { + ggml_context * ctx_layer = ctx_for_layer(i); + ggml_context * ctx_split = ctx_for_layer_split(i); + + auto & layer = model.layers[i]; + + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + layer.attn_q_a_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}); + + layer.attn_kv_a_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}); + + layer.wq_a = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}); + layer.wq_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k}); + + layer.wkv_a_mqa = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}); + layer.wkv_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_KV_B, "weight", i), 
{kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_head * ( n_embd_head_v), n_embd}); + + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); + + layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}); + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); + + layer.rope_long = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight"), { n_embd_head_qk_rope/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0)); + layer.rope_short = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight"), { n_embd_head_qk_rope/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0)); + } + } break; case LLM_ARCH_GROK: { if (n_expert == 0) { @@ -12843,6 +12927,215 @@ struct llm_build_context { return gf; } + struct ggml_cgraph * build_minicpm3() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); + + //TODO: if the model varies, these parameters need to be read from the model + const int64_t n_embd_base = 256; + const float scale_embd = 12.0f; + const float scale_depth = 1.4f; + const float kq_scale = 1.0f / sqrtf(float(hparams.n_embd_head_k)); + + const uint32_t n_embd_head_qk_rope = hparams.n_rot; + const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot; + const uint32_t kv_lora_rank = hparams.n_lora_kv; + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); + + // scale the input embeddings + inpL = ggml_scale(ctx0, inpL, scale_embd); + cb(inpL, "inp_scaled", -1); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + struct ggml_tensor * rope_factors = build_rope_factors(il); + // norm + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "attn_norm", il); + + // self_attention + { + struct ggml_tensor * q = NULL; + // {n_embd, q_lora_rank} * {n_embd, n_tokens} -> {q_lora_rank, n_tokens} + q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur); + cb(q, "q", il); + + q = llm_build_norm(ctx0, q, hparams, + model.layers[il].attn_q_a_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(q, "q", il); + + // {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens} + q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q); + cb(q, "q", il); + + // split into {n_head * n_embd_head_qk_nope, n_tokens} + struct ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens, + ggml_row_size(q->type, hparams.n_embd_head_k), + ggml_row_size(q->type, hparams.n_embd_head_k * n_head), + 0); + cb(q_nope, "q_nope", il); + + // and {n_head * n_embd_head_qk_rope, n_tokens} + struct ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens, + ggml_row_size(q->type, hparams.n_embd_head_k), + ggml_row_size(q->type, hparams.n_embd_head_k * n_head), + 
ggml_row_size(q->type, n_embd_head_qk_nope)); + cb(q_pe, "q_pe", il); + + // {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens} + struct ggml_tensor * kv_pe_compresseed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur); + cb(kv_pe_compresseed, "kv_pe_compresseed", il); + + // split into {kv_lora_rank, n_tokens} + struct ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_pe_compresseed, kv_lora_rank, n_tokens, + kv_pe_compresseed->nb[1], + 0); + cb(kv_compressed, "kv_compressed", il); + + // and {n_embd_head_qk_rope, n_tokens} + struct ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compresseed, n_embd_head_qk_rope, 1, n_tokens, + kv_pe_compresseed->nb[1], + kv_pe_compresseed->nb[1], + ggml_row_size(kv_pe_compresseed->type, kv_lora_rank)); + cb(k_pe, "k_pe", il); + + kv_compressed = ggml_cont(ctx0, kv_compressed); // TODO: the CUDA backend does not support non-contiguous norm + kv_compressed = llm_build_norm(ctx0, kv_compressed, hparams, + model.layers[il].attn_kv_a_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(kv_compressed, "kv_compressed", il); + + // {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens} + struct ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed); + cb(kv, "kv", il); + + // split into {n_head * n_embd_head_qk_nope, n_tokens} + struct ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens, + ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v), + ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)), + 0); + cb(k_nope, "k_nope", il); + + // and {n_head * n_embd_head_v, n_tokens} + struct ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens, + ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)), + ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head), + ggml_row_size(kv->type, (n_embd_head_qk_nope))); + cb(v_states, "v_states", il); + + v_states = ggml_cont(ctx0, v_states); + cb(v_states, "v_states", il); + + v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens, + ggml_row_size(kv->type, hparams.n_embd_head_v * n_head), + 0); + cb(v_states, "v_states", il); + + q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend does not support non-contiguous RoPE + q_pe = ggml_rope_ext( + ctx0, q_pe, inp_pos, rope_factors, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(q_pe, "q_pe", il); + + // shared RoPE key + k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend does not support non-contiguous RoPE + k_pe = ggml_rope_ext( + ctx0, k_pe, inp_pos, rope_factors, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(k_pe, "k_pe", il); + + struct ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_pe, 0); + cb(q_states, "q_states", il); + + struct ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0); + cb(k_states, "k_states", il); + + cur = llm_build_kv(ctx0, lctx, kv_self, gf, + model.layers[il].wo, NULL, + k_states, v_states, q_states, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + 
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + // scale_res - scale the hidden states for residual connection + const float scale_res = scale_depth/sqrtf(float(n_layer)); + cur = ggml_scale(ctx0, cur, scale_res); + cb(cur, "hidden_scaled", il); + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + { + cur = llm_build_norm(ctx0, ffn_inp, hparams, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "ffn_norm", il); + + cur = llm_build_ffn(ctx0, lctx, cur, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, cb, il); + cb(cur, "ffn_out", il); + } + + // scale the hidden states for residual connection + cur = ggml_scale(ctx0, cur, scale_res); + cb(cur, "hidden_scaled_ffn", il); + + cur = ggml_add(ctx0, cur, ffn_inp); + cur = lctx.cvec.apply_to(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = llm_build_norm(ctx0, cur, hparams, + model.output_norm, NULL, + LLM_NORM_RMS, cb, -1); + cb(cur, "result_norm", -1); + + // lm_head scaling + const float scale_lmhead = float(n_embd_base)/float(n_embd); + cur = ggml_scale(ctx0, cur, scale_lmhead); + cb(cur, "lmhead_scaling", -1); + + // lm_head + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + struct ggml_cgraph * build_gemma() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); @@ -15383,6 +15676,10 @@ static struct ggml_cgraph * llama_build_graph( { result = llm.build_minicpm(); } break; + case LLM_ARCH_MINICPM3: + { + result = llm.build_minicpm3(); + } break; case LLM_ARCH_GEMMA: { result = llm.build_gemma(); @@ -18609,6 +18906,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) { case LLM_ARCH_CODESHELL: case LLM_ARCH_NEMOTRON: case LLM_ARCH_EXAONE: + case LLM_ARCH_MINICPM3: return LLAMA_ROPE_TYPE_NEOX; // all model arches should be listed explicitly here From 0aadac10c7dd704f8285ddf5a63d6f764cb340aa Mon Sep 17 00:00:00 2001 From: Shane A Date: Sun, 15 Sep 2024 23:47:37 -0700 Subject: [PATCH 18/51] llama : support OLMoE (#9462) --- README.md | 1 + convert_hf_to_gguf.py | 60 ++++++++++ gguf-py/gguf/constants.py | 19 +++ gguf-py/gguf/tensor_mapping.py | 30 ++--- src/llama.cpp | 203 +++++++++++++++++++++++++++++++++ 5 files changed, 298 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index 9a10ead83..4d24dd591 100644 --- a/README.md +++ b/README.md @@ -77,6 +77,7 @@ Typically finetunes of the base models below are supported as well. 
- [x] [SEA-LION](https://huggingface.co/models?search=sea-lion) - [x] [GritLM-7B](https://huggingface.co/GritLM/GritLM-7B) + [GritLM-8x7B](https://huggingface.co/GritLM/GritLM-8x7B) - [x] [OLMo](https://allenai.org/olmo) +- [x] [OLMoE](https://huggingface.co/allenai/OLMoE-1B-7B-0924) - [x] [Granite models](https://huggingface.co/collections/ibm-granite/granite-code-models-6624c5cec322e4c148c8b330) - [x] [GPT-NeoX](https://github.com/EleutherAI/gpt-neox) + [Pythia](https://github.com/EleutherAI/pythia) - [x] [Snowflake-Arctic MoE](https://huggingface.co/collections/Snowflake/arctic-66290090abe542894a5ac520) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index d995ed764..f026977e9 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -2998,6 +2998,66 @@ class OlmoModel(Model): return [(self.map_tensor_name(name), data_torch)] +@Model.register("OlmoeForCausalLM") +class OlmoeModel(Model): + model_arch = gguf.MODEL_ARCH.OLMOE + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_layer_norm_rms_eps(1e-5) + if (n_experts := self.hparams.get("num_experts")) is not None: + self.gguf_writer.add_expert_count(n_experts) + + _experts: list[dict[str, Tensor]] | None = None + + # Copied from: Qwen2MoeModel + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # process the experts separately + if name.find("experts") != -1: + n_experts = self.hparams["num_experts"] + assert bid is not None + + if self._experts is None: + self._experts = [{} for _ in range(self.block_count)] + + self._experts[bid][name] = data_torch + + if len(self._experts[bid]) >= n_experts * 3: + tensors: list[tuple[str, Tensor]] = [] + + # merge the experts into a single 3d tensor + for w_name in ["down_proj", "gate_proj", "up_proj"]: + datas: list[Tensor] = [] + + for xid in range(n_experts): + ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" + datas.append(self._experts[bid][ename]) + del self._experts[bid][ename] + + data_torch = torch.stack(datas, dim=0) + + merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" + + new_name = self.map_tensor_name(merged_name) + + tensors.append((new_name, data_torch)) + return tensors + else: + return [] + + return [(self.map_tensor_name(name), data_torch)] + + # Copied from: Qwen2MoeModel + def prepare_tensors(self): + super().prepare_tensors() + + if self._experts is not None: + # flatten `list[dict[str, Tensor]]` into `list[str]` + experts = [k for d in self._experts for k in d.keys()] + if len(experts) > 0: + raise ValueError(f"Unprocessed experts: {experts}") + + @Model.register("JinaBertModel", "JinaBertForMaskedLM") class JinaBertV2Model(BertModel): model_arch = gguf.MODEL_ARCH.JINA_BERT_V2 diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 2c8545455..0d88649d8 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -220,6 +220,7 @@ class MODEL_ARCH(IntEnum): COMMAND_R = auto() DBRX = auto() OLMO = auto() + OLMOE = auto() OPENELM = auto() ARCTIC = auto() DEEPSEEK2 = auto() @@ -375,6 +376,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { MODEL_ARCH.COMMAND_R: "command-r", MODEL_ARCH.DBRX: "dbrx", MODEL_ARCH.OLMO: "olmo", + MODEL_ARCH.OLMOE: "olmoe", MODEL_ARCH.OPENELM: "openelm", MODEL_ARCH.ARCTIC: "arctic", MODEL_ARCH.DEEPSEEK2: "deepseek2", @@ -1027,6 +1029,23 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_UP, ], + MODEL_ARCH.OLMOE: [ + MODEL_TENSOR.TOKEN_EMBD, 
+ MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q_NORM, + MODEL_TENSOR.ATTN_K_NORM, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE_INP, + MODEL_TENSOR.FFN_GATE_EXP, + MODEL_TENSOR.FFN_UP_EXP, + MODEL_TENSOR.FFN_DOWN_EXP, + ], MODEL_ARCH.OPENELM: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index bc9a13ee5..2ebfa2b43 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -13,7 +13,7 @@ class TensorNameMap: "transformer.wte", # gpt2 gpt-j mpt refact qwen dbrx jais exaone "transformer.word_embeddings", # falcon "word_embeddings", # bloom - "model.embed_tokens", # llama-hf nemotron + "model.embed_tokens", # llama-hf nemotron olmoe "tok_embeddings", # llama-pth "embeddings.word_embeddings", # bert nomic-bert "language_model.embedding.word_embeddings", # persimmon @@ -54,7 +54,7 @@ class TensorNameMap: # Output MODEL_TENSOR.OUTPUT: ( "embed_out", # gptneox - "lm_head", # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais nemotron exaone + "lm_head", # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais nemotron exaone olmoe "output", # llama-pth bloom internlm2 "word_embeddings_for_head", # persimmon "lm_head.linear", # phi2 @@ -66,7 +66,7 @@ class TensorNameMap: MODEL_TENSOR.OUTPUT_NORM: ( "gpt_neox.final_layer_norm", # gptneox "transformer.ln_f", # gpt2 gpt-j falcon jais exaone - "model.norm", # llama-hf baichuan internlm2 + "model.norm", # llama-hf baichuan internlm2 olmoe "norm", # llama-pth "transformer.norm_f", # mpt dbrx "ln_f", # refact bloom qwen gpt2 @@ -98,7 +98,7 @@ class TensorNameMap: "transformer.h.{bid}.input_layernorm", # falcon7b "h.{bid}.input_layernorm", # bloom "transformer.h.{bid}.ln_mlp", # falcon40b - "model.layers.{bid}.input_layernorm", # llama-hf nemotron + "model.layers.{bid}.input_layernorm", # llama-hf nemotron olmoe "layers.{bid}.attention_norm", # llama-pth "language_model.encoder.layers.{bid}.input_layernorm", # persimmon "model.layers.{bid}.ln1", # yi @@ -142,7 +142,7 @@ class TensorNameMap: # Attention query MODEL_TENSOR.ATTN_Q: ( - "model.layers.{bid}.self_attn.q_proj", # llama-hf nemotron + "model.layers.{bid}.self_attn.q_proj", # llama-hf nemotron olmoe "layers.{bid}.attention.wq", # llama-pth "encoder.layer.{bid}.attention.self.query", # bert "transformer.h.{bid}.attn.q_proj", # gpt-j @@ -154,7 +154,7 @@ class TensorNameMap: # Attention key MODEL_TENSOR.ATTN_K: ( - "model.layers.{bid}.self_attn.k_proj", # llama-hf nemotron + "model.layers.{bid}.self_attn.k_proj", # llama-hf nemotron olmoe "layers.{bid}.attention.wk", # llama-pth "encoder.layer.{bid}.attention.self.key", # bert "transformer.h.{bid}.attn.k_proj", # gpt-j @@ -167,7 +167,7 @@ class TensorNameMap: # Attention value MODEL_TENSOR.ATTN_V: ( - "model.layers.{bid}.self_attn.v_proj", # llama-hf nemotron + "model.layers.{bid}.self_attn.v_proj", # llama-hf nemotron olmoe "layers.{bid}.attention.wv", # llama-pth "encoder.layer.{bid}.attention.self.value", # bert "transformer.h.{bid}.attn.v_proj", # gpt-j @@ -185,7 +185,7 @@ class TensorNameMap: "transformer.blocks.{bid}.attn.out_proj", # mpt "transformer.h.{bid}.self_attention.dense", # falcon "h.{bid}.self_attention.dense", # bloom - "model.layers.{bid}.self_attn.o_proj", # llama-hf nemotron + "model.layers.{bid}.self_attn.o_proj", # llama-hf nemotron olmoe "layers.{bid}.attention.wo", # 
llama-pth "encoder.layer.{bid}.attention.output.dense", # bert "transformer.h.{bid}.attn.out_proj", # gpt-j @@ -229,7 +229,7 @@ class TensorNameMap: "transformer.h.{bid}.ln_2", # gpt2 refact qwen jais exaone "h.{bid}.post_attention_layernorm", # bloom "transformer.blocks.{bid}.norm_2", # mpt - "model.layers.{bid}.post_attention_layernorm", # llama-hf nemotron + "model.layers.{bid}.post_attention_layernorm", # llama-hf nemotron olmoe "layers.{bid}.ffn_norm", # llama-pth "language_model.encoder.layers.{bid}.post_attention_layernorm", # persimmon "model.layers.{bid}.ln2", # yi @@ -253,7 +253,7 @@ class TensorNameMap: MODEL_TENSOR.FFN_GATE_INP: ( "layers.{bid}.feed_forward.gate", # mixtral "model.layers.{bid}.block_sparse_moe.gate", # mixtral - "model.layers.{bid}.mlp.gate", # qwen2moe + "model.layers.{bid}.mlp.gate", # qwen2moe olmoe "transformer.decoder_layer.{bid}.router", # Grok "transformer.blocks.{bid}.ffn.router.layer", # dbrx ), @@ -295,7 +295,7 @@ class TensorNameMap: "layers.{bid}.feed_forward.experts.w3", # mixtral (merged) "transformer.decoder_layer.{bid}.moe.linear_v", # Grok (merged) "transformer.blocks.{bid}.ffn.experts.mlp.v1", # dbrx - "model.layers.{bid}.mlp.experts.up_proj", # qwen2moe (merged) + "model.layers.{bid}.mlp.experts.up_proj", # qwen2moe olmoe (merged) ), MODEL_TENSOR.FFN_UP_SHEXP: ( @@ -327,7 +327,7 @@ class TensorNameMap: "layers.{bid}.feed_forward.experts.w1", # mixtral (merged) "transformer.decoder_layer.{bid}.moe.linear", # Grok (merged) "transformer.blocks.{bid}.ffn.experts.mlp.w1", # dbrx - "model.layers.{bid}.mlp.experts.gate_proj", # qwen2moe (merged) + "model.layers.{bid}.mlp.experts.gate_proj", # qwen2moe olmoe (merged) ), MODEL_TENSOR.FFN_GATE_SHEXP: ( @@ -367,7 +367,7 @@ class TensorNameMap: "layers.{bid}.feed_forward.experts.w2", # mixtral (merged) "transformer.decoder_layer.{bid}.moe.linear_1", # Grok (merged) "transformer.blocks.{bid}.ffn.experts.mlp.w2", # dbrx - "model.layers.{bid}.mlp.experts.down_proj", # qwen2moe (merged) + "model.layers.{bid}.mlp.experts.down_proj", # qwen2moe olmoe (merged) ), MODEL_TENSOR.FFN_DOWN_SHEXP: ( @@ -378,7 +378,7 @@ class TensorNameMap: MODEL_TENSOR.ATTN_Q_NORM: ( "language_model.encoder.layers.{bid}.self_attention.q_layernorm", "model.layers.{bid}.self_attn.q_layernorm", # persimmon - "model.layers.{bid}.self_attn.q_norm", # cohere + "model.layers.{bid}.self_attn.q_norm", # cohere olmoe "transformer.blocks.{bid}.attn.q_ln", # sea-lion "encoder.layer.{bid}.attention.self.layer_norm_q", # jina-bert-v2 "transformer.layers.{bid}.attn.q_norm", # openelm @@ -387,7 +387,7 @@ class TensorNameMap: MODEL_TENSOR.ATTN_K_NORM: ( "language_model.encoder.layers.{bid}.self_attention.k_layernorm", "model.layers.{bid}.self_attn.k_layernorm", # persimmon - "model.layers.{bid}.self_attn.k_norm", # cohere + "model.layers.{bid}.self_attn.k_norm", # cohere olmoe "transformer.blocks.{bid}.attn.k_ln", # sea-lion "encoder.layer.{bid}.attention.self.layer_norm_k", # jina-bert-v2 "transformer.layers.{bid}.attn.k_norm", # openelm diff --git a/src/llama.cpp b/src/llama.cpp index aa565723b..30997bf15 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -202,6 +202,7 @@ enum llm_arch { LLM_ARCH_COMMAND_R, LLM_ARCH_DBRX, LLM_ARCH_OLMO, + LLM_ARCH_OLMOE, LLM_ARCH_OPENELM, LLM_ARCH_ARCTIC, LLM_ARCH_DEEPSEEK2, @@ -251,6 +252,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_COMMAND_R, "command-r" }, { LLM_ARCH_DBRX, "dbrx" }, { LLM_ARCH_OLMO, "olmo" }, + { LLM_ARCH_OLMOE, "olmoe" }, { LLM_ARCH_OPENELM, "openelm" }, { LLM_ARCH_ARCTIC, "arctic" 
}, { LLM_ARCH_DEEPSEEK2, "deepseek2" }, @@ -1193,6 +1195,26 @@ static const std::map> LLM_TENSOR_NA { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, }, }, + { + LLM_ARCH_OLMOE, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" }, + { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, + { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" }, + { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, + { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, + }, + }, { LLM_ARCH_OPENELM, { @@ -2277,6 +2299,7 @@ enum e_model { MODEL_MEDIUM, MODEL_LARGE, MODEL_XL, + MODEL_A1_7B, MODEL_A2_7B, MODEL_8x7B, MODEL_8x22B, @@ -5241,6 +5264,7 @@ static const char * llama_model_type_name(e_model type) { case MODEL_MEDIUM: return "0.4B"; case MODEL_LARGE: return "0.8B"; case MODEL_XL: return "1.5B"; + case MODEL_A1_7B: return "A1.7B"; case MODEL_A2_7B: return "A2.7B"; case MODEL_8x7B: return "8x7B"; case MODEL_8x22B: return "8x22B"; @@ -5791,6 +5815,14 @@ static void llm_load_hparams( default: model.type = e_model::MODEL_UNKNOWN; } } break; + case LLM_ARCH_OLMOE: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + switch (hparams.n_layer) { + case 16: model.type = e_model::MODEL_A1_7B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; case LLM_ARCH_OPENELM: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); @@ -8018,6 +8050,44 @@ static bool llm_load_tensors( layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); } } break; + case LLM_ARCH_OLMOE: + { + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + + // output + { + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); + } + + for (int i = 0; i < n_layer; ++i) { + ggml_context * ctx_layer = ctx_for_layer(i); + ggml_context * ctx_split = ctx_for_layer_split(i); + + auto & layer = model.layers[i]; + + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + + layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}); + layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}); + layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); + layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}); + layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}); + + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); + + layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}); + + GGML_ASSERT(n_expert > 0); + GGML_ASSERT(n_expert_used > 0); + + // MoE branch + layer.ffn_gate_exps = ml.create_tensor(ctx_split, 
tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}); + layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}); + layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}); + } + } break; case LLM_ARCH_OPENELM: { model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); @@ -13832,6 +13902,134 @@ struct llm_build_context { return gf; } + // based on the build_qwen2moe() function, changes: + // * removed shared experts + // * removed bias + // * added q, k norm + struct ggml_cgraph * build_olmoe() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); + + // mutable variable, needed during the last layer of the computation to skip unused tokens + int32_t n_tokens = this->n_tokens; + + const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + // norm + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "attn_norm", il); + + // self_attention + { + // compute Q and K and RoPE them + struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + + struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + + struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + + Qcur = llm_build_norm(ctx0, Qcur, hparams, model.layers[il].attn_q_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(Qcur, "Qcur_normed", il); + + Kcur = llm_build_norm(ctx0, Kcur, hparams, model.layers[il].attn_k_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(Kcur, "Kcur_normed", il); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + + Qcur = ggml_rope_ext( + ctx0, Qcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur_rope", il); + + Kcur = ggml_rope_ext( + ctx0, Kcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur_rope", il); + + cur = llm_build_kv(ctx0, lctx, kv_self, gf, + model.layers[il].wo, NULL, + Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + n_tokens = n_outputs; + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // MoE branch + cur = llm_build_norm(ctx0, ffn_inp, hparams, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "ffn_norm", il); + + cur = 
llm_build_moe_ffn(ctx0, lctx, cur, + model.layers[il].ffn_gate_inp, + model.layers[il].ffn_up_exps, + model.layers[il].ffn_gate_exps, + model.layers[il].ffn_down_exps, + n_expert, n_expert_used, + LLM_FFN_SILU, false, + false, 0.0, + cb, il); + cb(cur, "ffn_moe_out", il); + + cur = ggml_add(ctx0, cur, ffn_inp); + cur = lctx.cvec.apply_to(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = llm_build_norm(ctx0, cur, hparams, + model.output_norm, NULL, + LLM_NORM_RMS, cb, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + struct ggml_cgraph * build_openelm() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); @@ -15712,6 +15910,10 @@ static struct ggml_cgraph * llama_build_graph( { result = llm.build_olmo(); } break; + case LLM_ARCH_OLMOE: + { + result = llm.build_olmoe(); + } break; case LLM_ARCH_OPENELM: { result = llm.build_openelm(); @@ -18896,6 +19098,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) { case LLM_ARCH_QWEN: case LLM_ARCH_QWEN2: case LLM_ARCH_QWEN2MOE: + case LLM_ARCH_OLMOE: case LLM_ARCH_PHI2: case LLM_ARCH_PHI3: case LLM_ARCH_GEMMA: From 5c3d0f1824714e9a97fc9b06e046eefcb6ecc721 Mon Sep 17 00:00:00 2001 From: Eve <139727413+netrunnereve@users.noreply.github.com> Date: Mon, 16 Sep 2024 06:48:24 +0000 Subject: [PATCH 19/51] ggml : IQ4_NL sgemm + Q4_0 AVX optimization (#9422) * squashed readd my iq4_nl sgemm PR https://github.com/ggerganov/llama.cpp/pull/8049 have ggml_vec_dot_q4_0 do two blocks per loop for avx try out f16c ggml_vec_dot_iq4_nl, but it's not really faster. 
as per https://github.com/ggerganov/llama.cpp/pull/8549 we can calculate several blocks at a time with no issue * shuffle * remove f16c iq4_nl as i cant make it faster than before --- ggml/src/ggml-quants.c | 69 +++++++++++++++++------------------- ggml/src/llamafile/sgemm.cpp | 38 ++++++++++++++++++++ 2 files changed, 71 insertions(+), 36 deletions(-) diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 322c85d2a..7c1ec8d54 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -230,6 +230,12 @@ static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 ) return _mm_packus_epi16( bytes1, bytes2); } + +static inline __m128i mul_add_epi8_sse(const __m128i x, const __m128i y) { + const __m128i ax = _mm_sign_epi8(x, x); + const __m128i sy = _mm_sign_epi8(y, x); + return _mm_maddubs_epi16(ax, sy); +} #endif #elif defined(__SSSE3__) // horizontally add 4x4 floats @@ -4206,37 +4212,37 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r sumf = hsum_float_8(acc); #elif defined(__AVX__) - // Initialize accumulator with zeros - __m256 acc = _mm256_setzero_ps(); + const __m128i mone = _mm_set1_epi16(1); - // Main loop - for (; ib < nb; ++ib) { - // Compute combined scale for the block - const __m256 d = _mm256_set1_ps( GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d) ); + __m256 accum1 = _mm256_setzero_ps(); + __m256 accum2 = _mm256_setzero_ps(); + for (; ib + 1 < nb; ib += 2) { + const __m128i q4bits_1 = _mm_loadu_si128((const __m128i *)x[ib + 0].qs); + const __m128i q4bits_2 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs); + const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)y[ib + 0].qs); + const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)y[ib + 0].qs + 1); + const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs); + const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs + 1); - const __m128i lowMask = _mm_set1_epi8(0xF); - const __m128i off = _mm_set1_epi8(8); - - const __m128i tmp = _mm_loadu_si128((const __m128i *)x[ib].qs); - - __m128i bx_0 = _mm_and_si128(lowMask, tmp); - __m128i by_0 = _mm_loadu_si128((const __m128i *)y[ib].qs); - bx_0 = _mm_sub_epi8(bx_0, off); - const __m128i i32_0 = mul_sum_i8_pairs(bx_0, by_0); - - bx_0 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp, 4)); - by_0 = _mm_loadu_si128((const __m128i *)(y[ib].qs + 16)); - bx_0 = _mm_sub_epi8(bx_0, off); - const __m128i i32_1 = mul_sum_i8_pairs(bx_0, by_0); - - // Convert int32_t to float - __m256 p = _mm256_cvtepi32_ps(MM256_SET_M128I(i32_0, i32_1)); - - // Apply the scale, and accumulate - acc = _mm256_add_ps(_mm256_mul_ps( d, p ), acc); + const __m128i q4b_1_0 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), q4bits_1), _mm_set1_epi8(8)); + const __m128i q4b_1_1 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(q4bits_1, 4)), _mm_set1_epi8(8)); + const __m128i q4b_2_0 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), q4bits_2), _mm_set1_epi8(8)); + const __m128i q4b_2_1 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(q4bits_2, 4)), _mm_set1_epi8(8)); + const __m128i p16_1_0 = mul_add_epi8_sse(q4b_1_0, q8b_1_0); + const __m128i p16_1_1 = mul_add_epi8_sse(q4b_1_1, q8b_1_1); + const __m128i p16_2_0 = mul_add_epi8_sse(q4b_2_0, q8b_2_0); + const __m128i p16_2_1 = mul_add_epi8_sse(q4b_2_1, q8b_2_1); + const __m128i p_1_0 = _mm_madd_epi16(p16_1_0, mone); + const __m128i p_1_1 = _mm_madd_epi16(p16_1_1, mone); + const __m128i p_2_0 = _mm_madd_epi16(p16_2_0, mone); + const __m128i p_2_1 = 
_mm_madd_epi16(p16_2_1, mone); + accum1 = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[ib + 0].d)*GGML_FP16_TO_FP32(x[ib + 0].d)), + _mm256_cvtepi32_ps(MM256_SET_M128I(p_1_1, p_1_0))), accum1); + accum2 = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[ib + 1].d)*GGML_FP16_TO_FP32(x[ib + 1].d)), + _mm256_cvtepi32_ps(MM256_SET_M128I(p_2_1, p_2_0))), accum2); } - sumf = hsum_float_8(acc); + sumf = hsum_float_8(_mm256_add_ps(accum1, accum2)); #elif defined(__SSSE3__) // set constants const __m128i lowMask = _mm_set1_epi8(0xF); @@ -11819,15 +11825,6 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void * #endif } - -#if defined(__AVX__) -static inline __m128i mul_add_epi8_sse(const __m128i x, const __m128i y) { - const __m128i ax = _mm_sign_epi8(x, x); - const __m128i sy = _mm_sign_epi8(y, x); - return _mm_maddubs_epi16(ax, sy); -} -#endif - #if defined(__AVX2__) static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) { const __m256i ax = _mm256_sign_epi8(x, x); diff --git a/ggml/src/llamafile/sgemm.cpp b/ggml/src/llamafile/sgemm.cpp index d0c2bb284..1be6fdb57 100644 --- a/ggml/src/llamafile/sgemm.cpp +++ b/ggml/src/llamafile/sgemm.cpp @@ -235,6 +235,14 @@ template <> inline __m512 load(const ggml_fp16_t *p) { } #endif // __AVX512F__ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// CONSTANTS + +#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) +static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113}; +static const __m128i iq4nlt = _mm_loadu_si128((const __m128i *) kvalues_iq4nl); +#endif + //////////////////////////////////////////////////////////////////////////////////////////////////// // FLOATING POINT MATRIX MULTIPLICATION @@ -933,6 +941,20 @@ class tinyBLAS_Q0_AVX { return _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(x, 4)), _mm_set1_epi8(8)); } + inline __m256i load(const block_iq4_nl *b) { + return MM256_SET_M128I(load1(b), load0(b)); + } + + inline __m128i load0(const block_iq4_nl *b) { + const __m128i x = _mm_loadu_si128((const __m128i *)(b->qs)); + return _mm_shuffle_epi8(iq4nlt, _mm_and_si128(_mm_set1_epi8(15), x)); + } + + inline __m128i load1(const block_iq4_nl *b) { + const __m128i x = _mm_loadu_si128((const __m128i *)(b->qs)); + return _mm_shuffle_epi8(iq4nlt, _mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(x, 4))); + } + inline __m256 updot(__m256i u, __m256i s) { __m256i res; #if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__)) @@ -1159,6 +1181,22 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda #endif } + case GGML_TYPE_IQ4_NL: { + if (Btype != GGML_TYPE_Q8_0) + return false; +#if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__) + tinyBLAS_Q0_AVX tb{ + k, (const block_iq4_nl *)A, lda, + (const block_q8_0 *)B, ldb, + (float *)C, ldc, + ith, nth}; + tb.matmul(m, n); + return true; +#else + return false; +#endif + } + default: return false; } From 19514d632e5274bc0c27c2269e8f2ad88b526d62 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 16 Sep 2024 10:27:50 +0300 Subject: [PATCH 20/51] cmake : do not hide GGML options + rename option (#9465) * cmake : do not hide GGML options ggml-ci * build : rename flag GGML_CUDA_USE_GRAPHS -> GGML_CUDA_GRAPHS for consistency ggml-ci --- CMakeLists.txt | 6 +++--- Makefile | 2 +- ggml/CMakeLists.txt | 13 +++++++++++-- ggml/src/CMakeLists.txt | 2 
+- 4 files changed, 16 insertions(+), 7 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 244019313..973907819 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -82,11 +82,11 @@ set(GGML_FATAL_WARNINGS ${LLAMA_FATAL_WARNINGS}) # change the default for these ggml options if (NOT DEFINED GGML_LLAMAFILE) - set(GGML_LLAMAFILE ON) + set(GGML_LLAMAFILE_DEFAULT ON) endif() -if (NOT DEFINED GGML_CUDA_USE_GRAPHS) - set(GGML_CUDA_USE_GRAPHS ON) +if (NOT DEFINED GGML_CUDA_GRAPHS) + set(GGML_CUDA_GRAPHS_DEFAULT ON) endif() # transition helpers diff --git a/Makefile b/Makefile index cb5ff9f9d..f922f7083 100644 --- a/Makefile +++ b/Makefile @@ -619,7 +619,7 @@ ifdef GGML_CUDA CUDA_PATH ?= /usr/local/cuda endif - MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include -DGGML_CUDA_USE_GRAPHS + MK_CPPFLAGS += -DGGML_USE_CUDA -DGGML_CUDA_USE_GRAPHS -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L$(CUDA_PATH)/lib64/stubs -L/usr/lib/wsl/lib MK_NVCCFLAGS += -use_fast_math endif # GGML_MUSA diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 532534bcb..89fdf9d1c 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -56,6 +56,15 @@ else() set(GGML_NATIVE_DEFAULT ON) endif() +# defaults +if (NOT GGML_LLAMAFILE_DEFAULT) + set(GGML_LLAMAFILE_DEFAULT OFF) +endif() + +if (NOT GGML_CUDA_GRAPHS_DEFAULT) + set(GGML_CUDA_GRAPHS_DEFAULT OFF) +endif() + # general option(GGML_STATIC "ggml: static link libraries" OFF) option(GGML_NATIVE "ggml: enable -march=native flag" ${GGML_NATIVE_DEFAULT}) @@ -110,7 +119,7 @@ option(GGML_ACCELERATE "ggml: enable Accelerate framework" option(GGML_BLAS "ggml: use BLAS" ${GGML_BLAS_DEFAULT}) set(GGML_BLAS_VENDOR ${GGML_BLAS_VENDOR_DEFAULT} CACHE STRING "ggml: BLAS library vendor") -option(GGML_LLAMAFILE "ggml: use LLAMAFILE" OFF) +option(GGML_LLAMAFILE "ggml: use LLAMAFILE" ${GGML_LLAMAFILE_DEFAULT}) option(GGML_CUDA "ggml: use CUDA" OFF) option(GGML_MUSA "ggml: use MUSA" OFF) @@ -127,7 +136,7 @@ set (GGML_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING option(GGML_CUDA_NO_PEER_COPY "ggml: do not use peer to peer copies" OFF) option(GGML_CUDA_NO_VMM "ggml: do not try to use CUDA VMM" OFF) option(GGML_CUDA_FA_ALL_QUANTS "ggml: compile all quants for FlashAttention" OFF) -option(GGML_CUDA_USE_GRAPHS "ggml: use CUDA graphs (llama.cpp only)" OFF) +option(GGML_CUDA_GRAPHS "ggml: use CUDA graphs (llama.cpp only)" ${GGML_CUDA_GRAPHS_DEFAULT}) option(GGML_HIPBLAS "ggml: use hipBLAS" OFF) option(GGML_HIP_UMA "ggml: use HIP unified memory architecture" OFF) diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index 11b877e19..042ea9b77 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -329,7 +329,7 @@ if (GGML_CUDA) add_compile_definitions(K_QUANTS_PER_ITERATION=${GGML_CUDA_KQUANTS_ITER}) add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${GGML_CUDA_PEER_MAX_BATCH_SIZE}) - if (GGML_CUDA_USE_GRAPHS) + if (GGML_CUDA_GRAPHS) add_compile_definitions(GGML_CUDA_USE_GRAPHS) endif() From d54c21df7e2669c6cd7492713479d1aeb5846883 Mon Sep 17 00:00:00 2001 From: compilade Date: Mon, 16 Sep 2024 03:30:22 -0400 Subject: [PATCH 21/51] convert : identify missing model files (#9397) --- convert_hf_to_gguf.py | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/convert_hf_to_gguf.py 
b/convert_hf_to_gguf.py index f026977e9..c7e6ae0ca 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -132,12 +132,14 @@ class Model: def get_tensors(self) -> Iterator[tuple[str, Tensor]]: tensor_names_from_parts: set[str] = set() - if len(self.part_names) > 1: + index_name = "model.safetensors" if self.is_safetensors else "pytorch_model.bin" + index_name += ".index.json" + index_file = self.dir_model / index_name + + if index_file.is_file(): self.tensor_names = set() - index_name = "model.safetensors" if self.is_safetensors else "pytorch_model.bin" - index_name += ".index.json" logger.info(f"gguf: loading model weight map from '{index_name}'") - with open(self.dir_model / index_name, "r", encoding="utf-8") as f: + with open(index_file, "r", encoding="utf-8") as f: index: dict[str, Any] = json.load(f) weight_map = index.get("weight_map") if weight_map is None or not isinstance(weight_map, dict): @@ -145,6 +147,7 @@ class Model: self.tensor_names.update(weight_map.keys()) else: self.tensor_names = tensor_names_from_parts + weight_map = {} for part_name in self.part_names: logger.info(f"gguf: loading model part '{part_name}'") @@ -171,9 +174,17 @@ class Model: data = LazyTorchTensor.from_eager(data) yield name, data - # only verify tensor name presence; it doesn't matter if they are not in the right files - if len(sym_diff := tensor_names_from_parts.symmetric_difference(self.tensor_names)) > 0: - raise ValueError(f"Mismatch between weight map and model parts for tensor names: {sym_diff}") + # verify tensor name presence and identify potentially missing files + if len(tensor_names_from_parts.symmetric_difference(self.tensor_names)) > 0: + missing = sorted(self.tensor_names.difference(tensor_names_from_parts)) + extra = sorted(tensor_names_from_parts.difference(self.tensor_names)) + missing_files = sorted(set(weight_map[n] for n in missing if n in weight_map)) + if len(extra) == 0 and len(missing_files) > 0: + raise ValueError(f"Missing or incomplete model files: {missing_files}") + else: + raise ValueError("Mismatch between weight map and model parts for tensor names:\n" + f"Missing tensors: {missing}\n" + f"Extra tensors: {extra}") def format_tensor_name(self, key: gguf.MODEL_TENSOR, bid: int | None = None, suffix: str = ".weight") -> str: if key not in gguf.MODEL_TENSORS[self.model_arch]: From a6a3a5c531c73aef85750a847d21e7d4671e723d Mon Sep 17 00:00:00 2001 From: Michael Podvitskiy Date: Mon, 16 Sep 2024 13:06:50 +0200 Subject: [PATCH 22/51] ggml : link MATH_LIBRARY not by its full path (#9339) --- ggml/src/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index 042ea9b77..527c22c68 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -1341,7 +1341,7 @@ list(APPEND GGML_EXTRA_LIBS_PRIVATE Threads::Threads) find_library(MATH_LIBRARY m) if (MATH_LIBRARY) if (NOT WIN32 OR NOT GGML_SYCL) - target_link_libraries(ggml PRIVATE ${MATH_LIBRARY}) + list(APPEND GGML_EXTRA_LIBS_PRIVATE m) endif() endif() From acb2c32c336ce60d765bb189563cc216e57e9fc2 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Mon, 16 Sep 2024 13:07:13 +0200 Subject: [PATCH 23/51] llama : rename n_embed to n_embd in rwkv6_time_mix (#9504) This commit renames n_embed to n_embd in llm_build_rwkv6_time_mix. The motivation for this change is consistency with the other rwkv6 functions like build_rwkv6 (and other parts of the code base). 
--- src/llama.cpp | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 30997bf15..0da764f9d 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -9571,7 +9571,7 @@ static struct ggml_tensor * llm_build_rwkv6_time_mix( struct ggml_tensor * cur, struct ggml_tensor * x_prev, struct ggml_tensor ** wkv_state) { - size_t n_embed = cur->ne[0]; + size_t n_embd = cur->ne[0]; size_t n_seq_tokens = cur->ne[1]; size_t n_seqs = cur->ne[2]; @@ -9582,8 +9582,8 @@ static struct ggml_tensor * llm_build_rwkv6_time_mix( struct ggml_tensor * sx = ggml_sub(ctx, x_prev, cur); - sx = ggml_reshape_2d(ctx, sx, n_embed, n_tokens); - cur = ggml_reshape_2d(ctx, cur, n_embed, n_tokens); + sx = ggml_reshape_2d(ctx, sx, n_embd, n_tokens); + cur = ggml_reshape_2d(ctx, cur, n_embd, n_tokens); struct ggml_tensor * xxx = ggml_add(ctx, ggml_mul(ctx, sx, layer->time_mix_lerp_x), cur); @@ -9608,11 +9608,11 @@ static struct ggml_tensor * llm_build_rwkv6_time_mix( xxx ); - struct ggml_tensor *mw = ggml_view_2d(ctx, xxx, n_embed, n_tokens, xxx->nb[1], 0); - struct ggml_tensor *mk = ggml_view_2d(ctx, xxx, n_embed, n_tokens, xxx->nb[1], n_embed * n_tokens * sizeof(float)); - struct ggml_tensor *mv = ggml_view_2d(ctx, xxx, n_embed, n_tokens, xxx->nb[1], n_embed * n_tokens * 2 * sizeof(float)); - struct ggml_tensor *mr = ggml_view_2d(ctx, xxx, n_embed, n_tokens, xxx->nb[1], n_embed * n_tokens * 3 * sizeof(float)); - struct ggml_tensor *mg = ggml_view_2d(ctx, xxx, n_embed, n_tokens, xxx->nb[1], n_embed * n_tokens * 4 * sizeof(float)); + struct ggml_tensor *mw = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], 0); + struct ggml_tensor *mk = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float)); + struct ggml_tensor *mv = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float)); + struct ggml_tensor *mr = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float)); + struct ggml_tensor *mg = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float)); struct ggml_tensor * xw = ggml_add( ctx, @@ -9681,7 +9681,7 @@ static struct ggml_tensor * llm_build_rwkv6_time_mix( ) ); - w = ggml_add(ctx, w, ggml_reshape_1d(ctx, layer->time_mix_decay, n_embed)); + w = ggml_add(ctx, w, ggml_reshape_1d(ctx, layer->time_mix_decay, n_embd)); w = ggml_exp(ctx, ggml_neg(ctx, ggml_exp(ctx, w))); w = ggml_reshape_4d(ctx, w, 1, head_size, head_count, n_tokens); @@ -9690,21 +9690,21 @@ static struct ggml_tensor * llm_build_rwkv6_time_mix( r = ggml_transpose(ctx, r); struct ggml_tensor * wkv_output = ggml_rwkv_wkv(ctx, k, v, r, layer->time_mix_first, w, *wkv_state); - cur = ggml_view_1d(ctx, wkv_output, n_embed * n_tokens, 0); - *wkv_state = ggml_view_1d(ctx, wkv_output, n_embed * head_size * n_seqs, n_embed * n_tokens * sizeof(float)); + cur = ggml_view_1d(ctx, wkv_output, n_embd * n_tokens, 0); + *wkv_state = ggml_view_1d(ctx, wkv_output, n_embd * head_size * n_seqs, n_embd * n_tokens * sizeof(float)); // group norm with head_count groups - cur = ggml_reshape_3d(ctx, cur, n_embed / head_count, head_count, n_tokens); + cur = ggml_reshape_3d(ctx, cur, n_embd / head_count, head_count, n_tokens); cur = ggml_norm(ctx, cur, 64e-5f); // Convert back to regular vectors. 
- cur = ggml_reshape_2d(ctx, cur, n_embed, n_tokens); + cur = ggml_reshape_2d(ctx, cur, n_embd, n_tokens); cur = ggml_add(ctx, ggml_mul(ctx, cur, layer->time_mix_ln), layer->time_mix_ln_b); cur = ggml_mul(ctx, cur, g); cur = llm_build_lora_mm(lctx, ctx, layer->time_mix_output, cur); - return ggml_reshape_3d(ctx, cur, n_embed, n_seq_tokens, n_seqs); + return ggml_reshape_3d(ctx, cur, n_embd, n_seq_tokens, n_seqs); } static struct ggml_tensor * llm_build_rwkv6_channel_mix( From 23e0d70bacaaca1429d365a44aa9e7434f17823b Mon Sep 17 00:00:00 2001 From: slaren Date: Mon, 16 Sep 2024 16:22:07 +0200 Subject: [PATCH 24/51] ggml : move common CPU backend impl to new header (#9509) --- ggml/src/ggml-aarch64.c | 1 + ggml/src/ggml-cpu-impl.h | 614 ++++++++++++++++++++++++++++++++++ ggml/src/ggml-impl.h | 622 +---------------------------------- ggml/src/ggml-quants.c | 1 + ggml/src/ggml.c | 1 + ggml/src/llamafile/sgemm.cpp | 1 + 6 files changed, 631 insertions(+), 609 deletions(-) create mode 100644 ggml/src/ggml-cpu-impl.h diff --git a/ggml/src/ggml-aarch64.c b/ggml/src/ggml-aarch64.c index 72cb83c9b..27375d0d7 100644 --- a/ggml/src/ggml-aarch64.c +++ b/ggml/src/ggml-aarch64.c @@ -4,6 +4,7 @@ #include "ggml-quants.h" #include "ggml-impl.h" +#include "ggml-cpu-impl.h" #include #include diff --git a/ggml/src/ggml-cpu-impl.h b/ggml/src/ggml-cpu-impl.h new file mode 100644 index 000000000..5b45155b0 --- /dev/null +++ b/ggml/src/ggml-cpu-impl.h @@ -0,0 +1,614 @@ +#pragma once + +// GGML CPU internal header + +#include "ggml.h" +#include "ggml-impl.h" +#include // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/ +//#include +#include +#include // memcpy +#include // fabsf + + +#ifdef __cplusplus +extern "C" { +#endif + +#if defined(_MSC_VER) + +#define m512bh(p) p +#define m512i(p) p + +#else + +#define m512bh(p) (__m512bh)(p) +#define m512i(p) (__m512i)(p) + +#endif + +/** + * Converts brain16 to float32. + * + * The bfloat16 floating point format has the following structure: + * + * ┌sign + * │ + * │ ┌exponent + * │ │ + * │ │ ┌mantissa + * │ │ │ + * │┌──┴───┐┌─┴───┐ + * 0b0000000000000000 brain16 + * + * Since bf16 has the same number of exponent bits as a 32bit float, + * encoding and decoding numbers becomes relatively straightforward. + * + * ┌sign + * │ + * │ ┌exponent + * │ │ + * │ │ ┌mantissa + * │ │ │ + * │┌──┴───┐┌─┴───────────────────┐ + * 0b00000000000000000000000000000000 IEEE binary32 + * + * For comparison, the standard fp16 format has fewer exponent bits. + * + * ┌sign + * │ + * │ ┌exponent + * │ │ + * │ │ ┌mantissa + * │ │ │ + * │┌─┴─┐┌─┴──────┐ + * 0b0000000000000000 IEEE binary16 + * + * @see IEEE 754-2008 + */ +static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) { + union { + float f; + uint32_t i; + } u; + u.i = (uint32_t)h.bits << 16; + return u.f; +} + +/** + * Converts float32 to brain16. + * + * This is binary identical with Google Brain float conversion. + * Floats shall round to nearest even, and NANs shall be quiet. + * Subnormals aren't flushed to zero, except perhaps when used. + * This code should vectorize nicely if using modern compilers. 
+ */ +static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) { + ggml_bf16_t h; + union { + float f; + uint32_t i; + } u; + u.f = s; + if ((u.i & 0x7fffffff) > 0x7f800000) { /* nan */ + h.bits = (u.i >> 16) | 64; /* force to quiet */ + return h; + } + h.bits = (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16; + return h; +} + +#define GGML_FP32_TO_BF16(x) ggml_compute_fp32_to_bf16(x) +#define GGML_BF16_TO_FP32(x) ggml_compute_bf16_to_fp32(x) + +// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512 +#if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__)) +#ifndef __FMA__ +#define __FMA__ +#endif +#ifndef __F16C__ +#define __F16C__ +#endif +#endif + +// __SSE3__ and __SSSE3__ are not defined in MSVC, but SSE3/SSSE3 are present when AVX/AVX2/AVX512 are available +#if defined(_MSC_VER) && (defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)) +#ifndef __SSE3__ +#define __SSE3__ +#endif +#ifndef __SSSE3__ +#define __SSSE3__ +#endif +#endif + +#if defined(__ARM_FEATURE_SVE) +#include +#include +#endif + +// 16-bit float +// on Arm, we use __fp16 +// on x86, we use uint16_t +#if defined(__ARM_NEON) + +// if YCM cannot find , make a symbolic link to it, for example: +// +// $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/ +// +#include + +#ifdef _MSC_VER + +typedef uint16_t ggml_fp16_internal_t; + +#define ggml_vld1q_u32(w,x,y,z) { ((w) + ((uint64_t)(x) << 32)), ((y) + ((uint64_t)(z) << 32)) } + +#else + +typedef __fp16 ggml_fp16_internal_t; + +#define ggml_vld1q_u32(w,x,y,z) { (w), (x), (y), (z) } + +#endif // _MSC_VER + +#if !defined(__aarch64__) + +// 32-bit ARM compatibility + +// vaddlvq_s16 +// vpaddq_s16 +// vpaddq_s32 +// vaddvq_s32 +// vaddvq_f32 +// vmaxvq_f32 +// vcvtnq_s32_f32 +// vzip1_u8 +// vzip2_u8 + +inline static int32_t vaddlvq_s16(int16x8_t v) { + int32x4_t v0 = vreinterpretq_s32_s64(vpaddlq_s32(vpaddlq_s16(v))); + return vgetq_lane_s32(v0, 0) + vgetq_lane_s32(v0, 2); +} + +inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) { + int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a)); + int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b)); + return vcombine_s16(a0, b0); +} + +inline static int32x4_t vpaddq_s32(int32x4_t a, int32x4_t b) { + int32x2_t a0 = vpadd_s32(vget_low_s32(a), vget_high_s32(a)); + int32x2_t b0 = vpadd_s32(vget_low_s32(b), vget_high_s32(b)); + return vcombine_s32(a0, b0); +} + +inline static int32_t vaddvq_s32(int32x4_t v) { + return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3); +} + +inline static float vaddvq_f32(float32x4_t v) { + return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3); +} + +inline static float vmaxvq_f32(float32x4_t v) { + return + MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)), + MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3))); +} + +inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) { + int32x4_t res; + + res[0] = roundf(vgetq_lane_f32(v, 0)); + res[1] = roundf(vgetq_lane_f32(v, 1)); + res[2] = roundf(vgetq_lane_f32(v, 2)); + res[3] = roundf(vgetq_lane_f32(v, 3)); + + return res; +} + +inline static uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) { + uint8x8_t res; + + res[0] = a[0]; res[1] = b[0]; + res[2] = a[1]; res[3] = b[1]; + res[4] = a[2]; res[5] = b[2]; + res[6] = a[3]; res[7] = b[3]; + + return res; +} + +inline static uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) { + uint8x8_t res; + + res[0] = a[4]; res[1] = b[4]; 
+ res[2] = a[5]; res[3] = b[5]; + res[4] = a[6]; res[5] = b[6]; + res[6] = a[7]; res[7] = b[7]; + + return res; +} + +// vld1q_s16_x2 +// vld1q_u8_x2 +// vld1q_u8_x4 +// vld1q_s8_x2 +// vld1q_s8_x4 +// TODO: double-check these work correctly + +typedef struct ggml_int16x8x2_t { + int16x8_t val[2]; +} ggml_int16x8x2_t; + +inline static ggml_int16x8x2_t ggml_vld1q_s16_x2(const int16_t * ptr) { + ggml_int16x8x2_t res; + + res.val[0] = vld1q_s16(ptr + 0); + res.val[1] = vld1q_s16(ptr + 8); + + return res; +} + +typedef struct ggml_uint8x16x2_t { + uint8x16_t val[2]; +} ggml_uint8x16x2_t; + +inline static ggml_uint8x16x2_t ggml_vld1q_u8_x2(const uint8_t * ptr) { + ggml_uint8x16x2_t res; + + res.val[0] = vld1q_u8(ptr + 0); + res.val[1] = vld1q_u8(ptr + 16); + + return res; +} + +typedef struct ggml_uint8x16x4_t { + uint8x16_t val[4]; +} ggml_uint8x16x4_t; + +inline static ggml_uint8x16x4_t ggml_vld1q_u8_x4(const uint8_t * ptr) { + ggml_uint8x16x4_t res; + + res.val[0] = vld1q_u8(ptr + 0); + res.val[1] = vld1q_u8(ptr + 16); + res.val[2] = vld1q_u8(ptr + 32); + res.val[3] = vld1q_u8(ptr + 48); + + return res; +} + +typedef struct ggml_int8x16x2_t { + int8x16_t val[2]; +} ggml_int8x16x2_t; + +inline static ggml_int8x16x2_t ggml_vld1q_s8_x2(const int8_t * ptr) { + ggml_int8x16x2_t res; + + res.val[0] = vld1q_s8(ptr + 0); + res.val[1] = vld1q_s8(ptr + 16); + + return res; +} + +typedef struct ggml_int8x16x4_t { + int8x16_t val[4]; +} ggml_int8x16x4_t; + +inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) { + ggml_int8x16x4_t res; + + res.val[0] = vld1q_s8(ptr + 0); + res.val[1] = vld1q_s8(ptr + 16); + res.val[2] = vld1q_s8(ptr + 32); + res.val[3] = vld1q_s8(ptr + 48); + + return res; +} + +// NOTE: not tested +inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) { + int8x16_t res; + + res[ 0] = a[b[ 0]]; + res[ 1] = a[b[ 1]]; + res[ 2] = a[b[ 2]]; + res[ 3] = a[b[ 3]]; + res[ 4] = a[b[ 4]]; + res[ 5] = a[b[ 5]]; + res[ 6] = a[b[ 6]]; + res[ 7] = a[b[ 7]]; + res[ 8] = a[b[ 8]]; + res[ 9] = a[b[ 9]]; + res[10] = a[b[10]]; + res[11] = a[b[11]]; + res[12] = a[b[12]]; + res[13] = a[b[13]]; + res[14] = a[b[14]]; + res[15] = a[b[15]]; + + return res; +} + +// NOTE: not tested +inline static uint8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) { + uint8x16_t res; + + res[ 0] = a[b[ 0]]; + res[ 1] = a[b[ 1]]; + res[ 2] = a[b[ 2]]; + res[ 3] = a[b[ 3]]; + res[ 4] = a[b[ 4]]; + res[ 5] = a[b[ 5]]; + res[ 6] = a[b[ 6]]; + res[ 7] = a[b[ 7]]; + res[ 8] = a[b[ 8]]; + res[ 9] = a[b[ 9]]; + res[10] = a[b[10]]; + res[11] = a[b[11]]; + res[12] = a[b[12]]; + res[13] = a[b[13]]; + res[14] = a[b[14]]; + res[15] = a[b[15]]; + + return res; +} + +#else + +#define ggml_int16x8x2_t int16x8x2_t +#define ggml_uint8x16x2_t uint8x16x2_t +#define ggml_uint8x16x4_t uint8x16x4_t +#define ggml_int8x16x2_t int8x16x2_t +#define ggml_int8x16x4_t int8x16x4_t + +#define ggml_vld1q_s16_x2 vld1q_s16_x2 +#define ggml_vld1q_u8_x2 vld1q_u8_x2 +#define ggml_vld1q_u8_x4 vld1q_u8_x4 +#define ggml_vld1q_s8_x2 vld1q_s8_x2 +#define ggml_vld1q_s8_x4 vld1q_s8_x4 +#define ggml_vqtbl1q_s8 vqtbl1q_s8 +#define ggml_vqtbl1q_u8 vqtbl1q_u8 + +#endif // !defined(__aarch64__) + +#if !defined(__ARM_FEATURE_DOTPROD) + +inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) { + const int16x8_t p0 = vmull_s8(vget_low_s8 (a), vget_low_s8 (b)); + const int16x8_t p1 = vmull_s8(vget_high_s8(a), vget_high_s8(b)); + + return vaddq_s32(acc, vaddq_s32(vpaddlq_s16(p0), vpaddlq_s16(p1))); +} + +#else + +#define 
ggml_vdotq_s32(a, b, c) vdotq_s32(a, b, c) + +#endif // !defined(__ARM_FEATURE_DOTPROD) + +#endif // defined(__ARM_NEON) + +#if defined(__ARM_NEON) && !defined(_MSC_VER) + +#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) +#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) + +#define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) + +static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { + ggml_fp16_internal_t tmp; + memcpy(&tmp, &h, sizeof(ggml_fp16_t)); + return (float)tmp; +} + +static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { + ggml_fp16_t res; + ggml_fp16_internal_t tmp = f; + memcpy(&res, &tmp, sizeof(ggml_fp16_t)); + return res; +} + +#else + +#ifdef __wasm_simd128__ +#include +#else +#ifdef __POWER9_VECTOR__ +#include +#undef bool +#define bool _Bool +#else +#if defined(_MSC_VER) || defined(__MINGW32__) +#include +#else +#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__) || defined(__SSE__) +#if !defined(__riscv) +#include +#endif +#endif +#endif +#endif +#endif + +#ifdef __riscv_v_intrinsic +#include +#endif + +#if defined(__loongarch64) +#if defined(__loongarch_asx) +#include +#endif +#if defined(__loongarch_sx) +#include +#endif +#endif + +#if defined(__loongarch_asx) + +typedef union { + int32_t i; + float f; +} ft_union; + +/* float type data load instructions */ +static __m128 __lsx_vreplfr2vr_s(float val) { + ft_union fi_tmpval = {.f = val}; + return (__m128)__lsx_vreplgr2vr_w(fi_tmpval.i); +} + +static __m256 __lasx_xvreplfr2vr_s(float val) { + ft_union fi_tmpval = {.f = val}; + return (__m256)__lasx_xvreplgr2vr_w(fi_tmpval.i); +} +#endif + +#ifdef __F16C__ + +#ifdef _MSC_VER +#define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x))) +#define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0) +#else +#define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x) +#define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0) +#endif + +#elif defined(__POWER9_VECTOR__) + +#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) +#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) +/* the inline asm below is about 12% faster than the lookup method */ +#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x) +#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) + +static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { + register float f; + register double d; + __asm__( + "mtfprd %0,%2\n" + "xscvhpdp %0,%0\n" + "frsp %1,%0\n" : + /* temp */ "=d"(d), + /* out */ "=f"(f): + /* in */ "r"(h)); + return f; +} + +static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { + register double d; + register ggml_fp16_t r; + __asm__( /* xscvdphp can work on double or single precision */ + "xscvdphp %0,%2\n" + "mffprd %1,%0\n" : + /* temp */ "=d"(d), + /* out */ "=r"(r): + /* in */ "f"(f)); + return r; +} + +#else + +// FP16 <-> FP32 +// ref: https://github.com/Maratyszcza/FP16 + +static inline float fp32_from_bits(uint32_t w) { + union { + uint32_t as_bits; + float as_value; + } fp32; + fp32.as_bits = w; + return fp32.as_value; +} + +static inline uint32_t fp32_to_bits(float f) { + union { + float as_value; + uint32_t as_bits; + } fp32; + fp32.as_value = f; + return fp32.as_bits; +} + +static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { + const uint32_t w = (uint32_t) h << 16; + const uint32_t sign = w & UINT32_C(0x80000000); + const uint32_t two_w = w + w; + + const uint32_t exp_offset = 
UINT32_C(0xE0) << 23; +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__) + const float exp_scale = 0x1.0p-112f; +#else + const float exp_scale = fp32_from_bits(UINT32_C(0x7800000)); +#endif + const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale; + + const uint32_t magic_mask = UINT32_C(126) << 23; + const float magic_bias = 0.5f; + const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias; + + const uint32_t denormalized_cutoff = UINT32_C(1) << 27; + const uint32_t result = sign | + (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value)); + return fp32_from_bits(result); +} + +static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__) + const float scale_to_inf = 0x1.0p+112f; + const float scale_to_zero = 0x1.0p-110f; +#else + const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000)); + const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000)); +#endif + float base = (fabsf(f) * scale_to_inf) * scale_to_zero; + + const uint32_t w = fp32_to_bits(f); + const uint32_t shl1_w = w + w; + const uint32_t sign = w & UINT32_C(0x80000000); + uint32_t bias = shl1_w & UINT32_C(0xFF000000); + if (bias < UINT32_C(0x71000000)) { + bias = UINT32_C(0x71000000); + } + + base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base; + const uint32_t bits = fp32_to_bits(base); + const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00); + const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF); + const uint32_t nonsign = exp_bits + mantissa_bits; + return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign); +} + +#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) +#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) + +#endif // __F16C__ + +#endif // defined(__ARM_NEON) && (!defined(__MSC_VER) + +#ifdef __ARM_FEATURE_SVE +#include +#endif // __ARM_FEATURE_SVE + +// precomputed f32 table for f16 (256 KB) +// defined in ggml.c, initialized in ggml_init() +extern float ggml_table_f32_f16[1 << 16]; + +// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32, +// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON. +// This is also true for POWER9. +#if !defined(GGML_FP16_TO_FP32) +inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) { + uint16_t s; + memcpy(&s, &f, sizeof(uint16_t)); + return ggml_table_f32_f16[s]; +} + +#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x) +#endif + +#if !defined(GGML_FP32_TO_FP16) +#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) +#endif + +#ifdef __cplusplus +} +#endif diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h index cb7f7728b..833984190 100644 --- a/ggml/src/ggml-impl.h +++ b/ggml/src/ggml-impl.h @@ -1,15 +1,17 @@ #pragma once -#include "ggml.h" - // GGML internal header +#include "ggml.h" + #include #include // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/ -#include #include -#include // memcpy -#include // fabsf +#include + +#ifdef __cplusplus +extern "C" { +#endif #undef MIN #undef MAX @@ -17,96 +19,6 @@ #define MIN(a, b) ((a) < (b) ? (a) : (b)) #define MAX(a, b) ((a) > (b) ? 
(a) : (b)) -#if defined(_MSC_VER) - -#define m512bh(p) p -#define m512i(p) p - -#else - -#define m512bh(p) (__m512bh)(p) -#define m512i(p) (__m512i)(p) - -#endif - -/** - * Converts brain16 to float32. - * - * The bfloat16 floating point format has the following structure: - * - * ┌sign - * │ - * │ ┌exponent - * │ │ - * │ │ ┌mantissa - * │ │ │ - * │┌──┴───┐┌─┴───┐ - * 0b0000000000000000 brain16 - * - * Since bf16 has the same number of exponent bits as a 32bit float, - * encoding and decoding numbers becomes relatively straightforward. - * - * ┌sign - * │ - * │ ┌exponent - * │ │ - * │ │ ┌mantissa - * │ │ │ - * │┌──┴───┐┌─┴───────────────────┐ - * 0b00000000000000000000000000000000 IEEE binary32 - * - * For comparison, the standard fp16 format has fewer exponent bits. - * - * ┌sign - * │ - * │ ┌exponent - * │ │ - * │ │ ┌mantissa - * │ │ │ - * │┌─┴─┐┌─┴──────┐ - * 0b0000000000000000 IEEE binary16 - * - * @see IEEE 754-2008 - */ -static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) { - union { - float f; - uint32_t i; - } u; - u.i = (uint32_t)h.bits << 16; - return u.f; -} - -/** - * Converts float32 to brain16. - * - * This is binary identical with Google Brain float conversion. - * Floats shall round to nearest even, and NANs shall be quiet. - * Subnormals aren't flushed to zero, except perhaps when used. - * This code should vectorize nicely if using modern compilers. - */ -static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) { - ggml_bf16_t h; - union { - float f; - uint32_t i; - } u; - u.f = s; - if ((u.i & 0x7fffffff) > 0x7f800000) { /* nan */ - h.bits = (u.i >> 16) | 64; /* force to quiet */ - return h; - } - h.bits = (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16; - return h; -} - -#define GGML_FP32_TO_BF16(x) ggml_compute_fp32_to_bf16(x) -#define GGML_BF16_TO_FP32(x) ggml_compute_bf16_to_fp32(x) - -#ifdef __cplusplus -extern "C" { -#endif - // static_assert should be a #define, but if it's not, // fall back to the _Static_assert C11 keyword. 
// if C99 - static_assert is noop @@ -121,520 +33,6 @@ extern "C" { #endif #endif -// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512 -#if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__)) -#ifndef __FMA__ -#define __FMA__ -#endif -#ifndef __F16C__ -#define __F16C__ -#endif -#endif - -// __SSE3__ and __SSSE3__ are not defined in MSVC, but SSE3/SSSE3 are present when AVX/AVX2/AVX512 are available -#if defined(_MSC_VER) && (defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)) -#ifndef __SSE3__ -#define __SSE3__ -#endif -#ifndef __SSSE3__ -#define __SSSE3__ -#endif -#endif - -#if defined(__ARM_FEATURE_SVE) -#include -#include -#endif - -// 16-bit float -// on Arm, we use __fp16 -// on x86, we use uint16_t -#if defined(__ARM_NEON) - -// if YCM cannot find , make a symbolic link to it, for example: -// -// $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/ -// -#include - -#ifdef _MSC_VER - -typedef uint16_t ggml_fp16_internal_t; - -#define ggml_vld1q_u32(w,x,y,z) { ((w) + ((uint64_t)(x) << 32)), ((y) + ((uint64_t)(z) << 32)) } - -#else - -typedef __fp16 ggml_fp16_internal_t; - -#define ggml_vld1q_u32(w,x,y,z) { (w), (x), (y), (z) } - -#endif // _MSC_VER - -#if !defined(__aarch64__) - -// 32-bit ARM compatibility - -// vaddlvq_s16 -// vpaddq_s16 -// vpaddq_s32 -// vaddvq_s32 -// vaddvq_f32 -// vmaxvq_f32 -// vcvtnq_s32_f32 -// vzip1_u8 -// vzip2_u8 - -inline static int32_t vaddlvq_s16(int16x8_t v) { - int32x4_t v0 = vreinterpretq_s32_s64(vpaddlq_s32(vpaddlq_s16(v))); - return vgetq_lane_s32(v0, 0) + vgetq_lane_s32(v0, 2); -} - -inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) { - int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a)); - int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b)); - return vcombine_s16(a0, b0); -} - -inline static int32x4_t vpaddq_s32(int32x4_t a, int32x4_t b) { - int32x2_t a0 = vpadd_s32(vget_low_s32(a), vget_high_s32(a)); - int32x2_t b0 = vpadd_s32(vget_low_s32(b), vget_high_s32(b)); - return vcombine_s32(a0, b0); -} - -inline static int32_t vaddvq_s32(int32x4_t v) { - return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3); -} - -inline static float vaddvq_f32(float32x4_t v) { - return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3); -} - -inline static float vmaxvq_f32(float32x4_t v) { - return - MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)), - MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3))); -} - -inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) { - int32x4_t res; - - res[0] = roundf(vgetq_lane_f32(v, 0)); - res[1] = roundf(vgetq_lane_f32(v, 1)); - res[2] = roundf(vgetq_lane_f32(v, 2)); - res[3] = roundf(vgetq_lane_f32(v, 3)); - - return res; -} - -inline static uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) { - uint8x8_t res; - - res[0] = a[0]; res[1] = b[0]; - res[2] = a[1]; res[3] = b[1]; - res[4] = a[2]; res[5] = b[2]; - res[6] = a[3]; res[7] = b[3]; - - return res; -} - -inline static uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) { - uint8x8_t res; - - res[0] = a[4]; res[1] = b[4]; - res[2] = a[5]; res[3] = b[5]; - res[4] = a[6]; res[5] = b[6]; - res[6] = a[7]; res[7] = b[7]; - - return res; -} - -// vld1q_s16_x2 -// vld1q_u8_x2 -// vld1q_u8_x4 -// vld1q_s8_x2 -// vld1q_s8_x4 -// TODO: double-check these work correctly - -typedef struct ggml_int16x8x2_t { - int16x8_t val[2]; -} ggml_int16x8x2_t; - -inline static ggml_int16x8x2_t 
ggml_vld1q_s16_x2(const int16_t * ptr) { - ggml_int16x8x2_t res; - - res.val[0] = vld1q_s16(ptr + 0); - res.val[1] = vld1q_s16(ptr + 8); - - return res; -} - -typedef struct ggml_uint8x16x2_t { - uint8x16_t val[2]; -} ggml_uint8x16x2_t; - -inline static ggml_uint8x16x2_t ggml_vld1q_u8_x2(const uint8_t * ptr) { - ggml_uint8x16x2_t res; - - res.val[0] = vld1q_u8(ptr + 0); - res.val[1] = vld1q_u8(ptr + 16); - - return res; -} - -typedef struct ggml_uint8x16x4_t { - uint8x16_t val[4]; -} ggml_uint8x16x4_t; - -inline static ggml_uint8x16x4_t ggml_vld1q_u8_x4(const uint8_t * ptr) { - ggml_uint8x16x4_t res; - - res.val[0] = vld1q_u8(ptr + 0); - res.val[1] = vld1q_u8(ptr + 16); - res.val[2] = vld1q_u8(ptr + 32); - res.val[3] = vld1q_u8(ptr + 48); - - return res; -} - -typedef struct ggml_int8x16x2_t { - int8x16_t val[2]; -} ggml_int8x16x2_t; - -inline static ggml_int8x16x2_t ggml_vld1q_s8_x2(const int8_t * ptr) { - ggml_int8x16x2_t res; - - res.val[0] = vld1q_s8(ptr + 0); - res.val[1] = vld1q_s8(ptr + 16); - - return res; -} - -typedef struct ggml_int8x16x4_t { - int8x16_t val[4]; -} ggml_int8x16x4_t; - -inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) { - ggml_int8x16x4_t res; - - res.val[0] = vld1q_s8(ptr + 0); - res.val[1] = vld1q_s8(ptr + 16); - res.val[2] = vld1q_s8(ptr + 32); - res.val[3] = vld1q_s8(ptr + 48); - - return res; -} - -// NOTE: not tested -inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) { - int8x16_t res; - - res[ 0] = a[b[ 0]]; - res[ 1] = a[b[ 1]]; - res[ 2] = a[b[ 2]]; - res[ 3] = a[b[ 3]]; - res[ 4] = a[b[ 4]]; - res[ 5] = a[b[ 5]]; - res[ 6] = a[b[ 6]]; - res[ 7] = a[b[ 7]]; - res[ 8] = a[b[ 8]]; - res[ 9] = a[b[ 9]]; - res[10] = a[b[10]]; - res[11] = a[b[11]]; - res[12] = a[b[12]]; - res[13] = a[b[13]]; - res[14] = a[b[14]]; - res[15] = a[b[15]]; - - return res; -} - -// NOTE: not tested -inline static uint8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) { - uint8x16_t res; - - res[ 0] = a[b[ 0]]; - res[ 1] = a[b[ 1]]; - res[ 2] = a[b[ 2]]; - res[ 3] = a[b[ 3]]; - res[ 4] = a[b[ 4]]; - res[ 5] = a[b[ 5]]; - res[ 6] = a[b[ 6]]; - res[ 7] = a[b[ 7]]; - res[ 8] = a[b[ 8]]; - res[ 9] = a[b[ 9]]; - res[10] = a[b[10]]; - res[11] = a[b[11]]; - res[12] = a[b[12]]; - res[13] = a[b[13]]; - res[14] = a[b[14]]; - res[15] = a[b[15]]; - - return res; -} - -#else - -#define ggml_int16x8x2_t int16x8x2_t -#define ggml_uint8x16x2_t uint8x16x2_t -#define ggml_uint8x16x4_t uint8x16x4_t -#define ggml_int8x16x2_t int8x16x2_t -#define ggml_int8x16x4_t int8x16x4_t - -#define ggml_vld1q_s16_x2 vld1q_s16_x2 -#define ggml_vld1q_u8_x2 vld1q_u8_x2 -#define ggml_vld1q_u8_x4 vld1q_u8_x4 -#define ggml_vld1q_s8_x2 vld1q_s8_x2 -#define ggml_vld1q_s8_x4 vld1q_s8_x4 -#define ggml_vqtbl1q_s8 vqtbl1q_s8 -#define ggml_vqtbl1q_u8 vqtbl1q_u8 - -#endif // !defined(__aarch64__) - -#if !defined(__ARM_FEATURE_DOTPROD) - -inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) { - const int16x8_t p0 = vmull_s8(vget_low_s8 (a), vget_low_s8 (b)); - const int16x8_t p1 = vmull_s8(vget_high_s8(a), vget_high_s8(b)); - - return vaddq_s32(acc, vaddq_s32(vpaddlq_s16(p0), vpaddlq_s16(p1))); -} - -#else - -#define ggml_vdotq_s32(a, b, c) vdotq_s32(a, b, c) - -#endif // !defined(__ARM_FEATURE_DOTPROD) - -#endif // defined(__ARM_NEON) - -#if defined(__ARM_NEON) && !defined(_MSC_VER) - -#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) -#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) - -#define GGML_FP16_TO_FP32(x) 
ggml_compute_fp16_to_fp32(x) - -static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { - ggml_fp16_internal_t tmp; - memcpy(&tmp, &h, sizeof(ggml_fp16_t)); - return (float)tmp; -} - -static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { - ggml_fp16_t res; - ggml_fp16_internal_t tmp = f; - memcpy(&res, &tmp, sizeof(ggml_fp16_t)); - return res; -} - -#else - -#ifdef __wasm_simd128__ -#include -#else -#ifdef __POWER9_VECTOR__ -#include -#undef bool -#define bool _Bool -#else -#if defined(_MSC_VER) || defined(__MINGW32__) -#include -#else -#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__) || defined(__SSE__) -#if !defined(__riscv) -#include -#endif -#endif -#endif -#endif -#endif - -#ifdef __riscv_v_intrinsic -#include -#endif - -#if defined(__loongarch64) -#if defined(__loongarch_asx) -#include -#endif -#if defined(__loongarch_sx) -#include -#endif -#endif - -#if defined(__loongarch_asx) - -typedef union { - int32_t i; - float f; -} ft_union; - -/* float type data load instructions */ -static __m128 __lsx_vreplfr2vr_s(float val) { - ft_union fi_tmpval = {.f = val}; - return (__m128)__lsx_vreplgr2vr_w(fi_tmpval.i); -} - -static __m256 __lasx_xvreplfr2vr_s(float val) { - ft_union fi_tmpval = {.f = val}; - return (__m256)__lasx_xvreplgr2vr_w(fi_tmpval.i); -} -#endif - -#ifdef __F16C__ - -#ifdef _MSC_VER -#define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x))) -#define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0) -#else -#define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x) -#define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0) -#endif - -#elif defined(__POWER9_VECTOR__) - -#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) -#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) -/* the inline asm below is about 12% faster than the lookup method */ -#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x) -#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) - -static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { - register float f; - register double d; - __asm__( - "mtfprd %0,%2\n" - "xscvhpdp %0,%0\n" - "frsp %1,%0\n" : - /* temp */ "=d"(d), - /* out */ "=f"(f): - /* in */ "r"(h)); - return f; -} - -static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { - register double d; - register ggml_fp16_t r; - __asm__( /* xscvdphp can work on double or single precision */ - "xscvdphp %0,%2\n" - "mffprd %1,%0\n" : - /* temp */ "=d"(d), - /* out */ "=r"(r): - /* in */ "f"(f)); - return r; -} - -#else - -// FP16 <-> FP32 -// ref: https://github.com/Maratyszcza/FP16 - -static inline float fp32_from_bits(uint32_t w) { - union { - uint32_t as_bits; - float as_value; - } fp32; - fp32.as_bits = w; - return fp32.as_value; -} - -static inline uint32_t fp32_to_bits(float f) { - union { - float as_value; - uint32_t as_bits; - } fp32; - fp32.as_value = f; - return fp32.as_bits; -} - -static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { - const uint32_t w = (uint32_t) h << 16; - const uint32_t sign = w & UINT32_C(0x80000000); - const uint32_t two_w = w + w; - - const uint32_t exp_offset = UINT32_C(0xE0) << 23; -#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__) - const float exp_scale = 0x1.0p-112f; -#else - const float exp_scale = fp32_from_bits(UINT32_C(0x7800000)); -#endif - const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * 
exp_scale; - - const uint32_t magic_mask = UINT32_C(126) << 23; - const float magic_bias = 0.5f; - const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias; - - const uint32_t denormalized_cutoff = UINT32_C(1) << 27; - const uint32_t result = sign | - (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value)); - return fp32_from_bits(result); -} - -static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { -#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__) - const float scale_to_inf = 0x1.0p+112f; - const float scale_to_zero = 0x1.0p-110f; -#else - const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000)); - const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000)); -#endif - float base = (fabsf(f) * scale_to_inf) * scale_to_zero; - - const uint32_t w = fp32_to_bits(f); - const uint32_t shl1_w = w + w; - const uint32_t sign = w & UINT32_C(0x80000000); - uint32_t bias = shl1_w & UINT32_C(0xFF000000); - if (bias < UINT32_C(0x71000000)) { - bias = UINT32_C(0x71000000); - } - - base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base; - const uint32_t bits = fp32_to_bits(base); - const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00); - const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF); - const uint32_t nonsign = exp_bits + mantissa_bits; - return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign); -} - -#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) -#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) - -#endif // __F16C__ - -#endif // defined(__ARM_NEON) && (!defined(__MSC_VER) - -#ifdef __ARM_FEATURE_SVE -#include -#endif // __ARM_FEATURE_SVE - -// precomputed f32 table for f16 (256 KB) -// defined in ggml.c, initialized in ggml_init() -extern float ggml_table_f32_f16[1 << 16]; - -// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32, -// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON. -// This is also true for POWER9. 
-#if !defined(GGML_FP16_TO_FP32) -inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) { - uint16_t s; - memcpy(&s, &f, sizeof(uint16_t)); - return ggml_table_f32_f16[s]; -} - -#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x) -#endif - -#if !defined(GGML_FP32_TO_FP16) -#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) -#endif - -enum ggml_cgraph_eval_order { - GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0, - GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT, - GGML_CGRAPH_EVAL_ORDER_COUNT -}; - // bitset typedef uint32_t ggml_bitset_t; @@ -761,6 +159,12 @@ static size_t ggml_hash_find_or_insert(struct ggml_hash_set * hash_set, struct g // computation graph +enum ggml_cgraph_eval_order { + GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0, + GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT, + GGML_CGRAPH_EVAL_ORDER_COUNT +}; + struct ggml_cgraph { int size; int n_nodes; diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 7c1ec8d54..8bffce860 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -3,6 +3,7 @@ #include "ggml-quants.h" #include "ggml-impl.h" +#include "ggml-cpu-impl.h" #include diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 490c8d602..3a8aadae8 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -2,6 +2,7 @@ #define _USE_MATH_DEFINES // For M_PI on MSVC #include "ggml-impl.h" +#include "ggml-cpu-impl.h" #include "ggml-quants.h" #include "ggml.h" #include "ggml-aarch64.h" diff --git a/ggml/src/llamafile/sgemm.cpp b/ggml/src/llamafile/sgemm.cpp index 1be6fdb57..0193a463a 100644 --- a/ggml/src/llamafile/sgemm.cpp +++ b/ggml/src/llamafile/sgemm.cpp @@ -50,6 +50,7 @@ #include "sgemm.h" #include "ggml-impl.h" +#include "ggml-cpu-impl.h" #include "ggml-quants.h" #ifdef _MSC_VER From 37f3a3810e545be6ac835c726f97956c1403ea02 Mon Sep 17 00:00:00 2001 From: Michael Podvitskiy Date: Tue, 17 Sep 2024 08:23:30 +0200 Subject: [PATCH 25/51] llama : add llama_n_head() (#9512) --- include/llama.h | 1 + src/llama.cpp | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/include/llama.h b/include/llama.h index cfc8d85dc..f316a87ba 100644 --- a/include/llama.h +++ b/include/llama.h @@ -441,6 +441,7 @@ extern "C" { LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model); LLAMA_API int32_t llama_n_embd (const struct llama_model * model); LLAMA_API int32_t llama_n_layer (const struct llama_model * model); + LLAMA_API int32_t llama_n_head (const struct llama_model * model); LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx); diff --git a/src/llama.cpp b/src/llama.cpp index 0da764f9d..146b327d0 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -19047,6 +19047,10 @@ int32_t llama_n_layer(const struct llama_model * model) { return model->hparams.n_layer; } +int32_t llama_n_head(const struct llama_model * model) { + return model->hparams.n_head(); +} + const struct llama_model * llama_get_model(const struct llama_context * ctx) { return &ctx->model; } From 0d2ec438330271d201c2e9224aca23d0d5c908bf Mon Sep 17 00:00:00 2001 From: Gabe Goodhart Date: Tue, 17 Sep 2024 00:44:58 -0600 Subject: [PATCH 26/51] llama : support IBM Granite architecture (#9412) * feat(gguf-py): Add Granite model and params to gguf-py Branch: GraniteLM Signed-off-by: Gabe Goodhart * feat(convert_hf_to_gguf): Add registration and param setup for Granite Branch: GraniteLM Signed-off-by: Gabe Goodhart * feat(llama.cpp): Add config parsing for Granite multiplier params Branch: GraniteLM Signed-off-by: Gabe Goodhart * feat(llama.cpp): First pass at full port of granite 
deviations from llama Something is still not working right since the results are mostly terrible, but on occasion it's producing relevant results at this point, so _something_ is working. Branch: GraniteLM Signed-off-by: Gabe Goodhart * fix(llama.cpp): Determine granite language 3b instruct by vocab size Branch: GraniteLM Signed-off-by: Gabe Goodhart * fix(convert_hf_to_gguf): Use LlamaModel as base for GraniteModel The defaults in LlamaModel are needed for Granite as well Branch: GraniteLM Signed-off-by: Gabe Goodhart * fix(llama.cpp): Switch Granite param names to use _scale for consistency Other scalar multipliers are called *_scale, so this provides a more consistent naming convention. Branch: GraniteLM Signed-off-by: Gabe Goodhart * fix(convert_hf_to_gguf/gguf-py): _multiplier -> _scale The transformers names with _multiplier will now be converted to the _scale equivalent during conversion. Branch: GraniteLM Signed-off-by: Gabe Goodhart * fix(llama.cpp): Use separate switch clause for granite in llm_load_hparams Branch: GraniteLM Signed-off-by: Gabe Goodhart --------- Signed-off-by: Gabe Goodhart --- convert_hf_to_gguf.py | 30 ++++++++++++++ gguf-py/gguf/constants.py | 18 +++++++++ gguf-py/gguf/gguf_writer.py | 9 +++++ src/llama.cpp | 79 ++++++++++++++++++++++++++++++++++++- 4 files changed, 135 insertions(+), 1 deletion(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index c7e6ae0ca..ff4c9226f 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -4080,6 +4080,36 @@ class ExaoneModel(Model): super().prepare_tensors() +@Model.register("GraniteForCausalLM") +class GraniteModel(LlamaModel): + """Conversion for IBM's GraniteForCausalLM""" + model_arch = gguf.MODEL_ARCH.GRANITE + + def set_gguf_parameters(self): + """Granite uses standard llama parameters with the following differences: + + - No head_dim support + - New multiplier params: + - attention_scale + - embedding_scale + - residual_scale + - logits_scaling + """ + if head_dim := self.hparams.pop("head_dim", None): + logger.warning("Ignoring head_dim (%s) from config for Granite", head_dim) + super().set_gguf_parameters() + # NOTE: Convert _multiplier params to _scale params for naming + # consistency + if attention_scale := self.hparams.get("attention_multiplier"): + self.gguf_writer.add_attention_scale(attention_scale) + if embedding_scale := self.hparams.get("embedding_multiplier"): + self.gguf_writer.add_embedding_scale(embedding_scale) + if residual_scale := self.hparams.get("residual_multiplier"): + self.gguf_writer.add_residual_scale(residual_scale) + if logits_scaling := self.hparams.get("logits_scaling"): + self.gguf_writer.add_logit_scale(logits_scaling) + + ###### CONVERSION LOGIC ###### # tree of lazy tensors diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 0d88649d8..b36a60d49 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -97,6 +97,8 @@ class Keys: RESCALE_EVERY_N_LAYERS = "{arch}.rescale_every_n_layers" TIME_MIX_EXTRA_DIM = "{arch}.time_mix_extra_dim" TIME_DECAY_EXTRA_DIM = "{arch}.time_decay_extra_dim" + RESIDUAL_SCALE = "{arch}.residual_scale" + EMBEDDING_SCALE = "{arch}.embedding_scale" class Attention: HEAD_COUNT = "{arch}.attention.head_count" @@ -112,6 +114,7 @@ class Keys: KV_LORA_RANK = "{arch}.attention.kv_lora_rank" REL_BUCKETS_COUNT = "{arch}.attention.relative_buckets_count" SLIDING_WINDOW = "{arch}.attention.sliding_window" + SCALE = "{arch}.attention.scale" class Rope: DIMENSION_COUNT = "{arch}.rope.dimension_count" @@ -231,6 
+234,7 @@ class MODEL_ARCH(IntEnum): JAIS = auto() NEMOTRON = auto() EXAONE = auto() + GRANITE = auto() class MODEL_TENSOR(IntEnum): @@ -387,6 +391,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { MODEL_ARCH.JAIS: "jais", MODEL_ARCH.NEMOTRON: "nemotron", MODEL_ARCH.EXAONE: "exaone", + MODEL_ARCH.GRANITE: "granite", } TENSOR_NAMES: dict[MODEL_TENSOR, str] = { @@ -1224,6 +1229,19 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_UP, ], + MODEL_ARCH.GRANITE: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + ], # TODO } diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 3c95c2673..bd059b45c 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -679,6 +679,12 @@ class GGUFWriter: def add_time_decay_extra_dim(self, dim: int) -> None: self.add_uint32(Keys.LLM.TIME_DECAY_EXTRA_DIM.format(arch=self.arch), dim) + def add_residual_scale(self, value: float) -> None: + self.add_float32(Keys.LLM.RESIDUAL_SCALE.format(arch=self.arch), value) + + def add_embedding_scale(self, value: float) -> None: + self.add_float32(Keys.LLM.EMBEDDING_SCALE.format(arch=self.arch), value) + def add_wkv_head_size(self, size: int) -> None: self.add_uint32(Keys.WKV.HEAD_SIZE.format(arch=self.arch), size) @@ -703,6 +709,9 @@ class GGUFWriter: def add_sliding_window(self, value: int) -> None: self.add_uint32(Keys.Attention.SLIDING_WINDOW.format(arch=self.arch), value) + def add_attention_scale(self, value: float) -> None: + self.add_float32(Keys.Attention.SCALE.format(arch=self.arch), value) + def add_pooling_type(self, value: PoolingType) -> None: self.add_uint32(Keys.LLM.POOLING_TYPE.format(arch=self.arch), value.value) diff --git a/src/llama.cpp b/src/llama.cpp index 146b327d0..27e3dfdfa 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -214,6 +214,7 @@ enum llm_arch { LLM_ARCH_NEMOTRON, LLM_ARCH_EXAONE, LLM_ARCH_RWKV6, + LLM_ARCH_GRANITE, LLM_ARCH_UNKNOWN, }; @@ -264,6 +265,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_NEMOTRON, "nemotron" }, { LLM_ARCH_EXAONE, "exaone" }, { LLM_ARCH_RWKV6, "rwkv6" }, + { LLM_ARCH_GRANITE, "granite" }, { LLM_ARCH_UNKNOWN, "(unknown)" }, }; @@ -303,6 +305,8 @@ enum llm_kv { LLM_KV_RESCALE_EVERY_N_LAYERS, LLM_KV_TIME_MIX_EXTRA_DIM, LLM_KV_TIME_DECAY_EXTRA_DIM, + LLM_KV_RESIDUAL_SCALE, + LLM_KV_EMBEDDING_SCALE, LLM_KV_ATTENTION_HEAD_COUNT, LLM_KV_ATTENTION_HEAD_COUNT_KV, @@ -317,6 +321,7 @@ enum llm_kv { LLM_KV_ATTENTION_KV_LORA_RANK, LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, LLM_KV_ATTENTION_SLIDING_WINDOW, + LLM_KV_ATTENTION_SCALE, LLM_KV_ROPE_DIMENSION_COUNT, LLM_KV_ROPE_FREQ_BASE, @@ -407,6 +412,8 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_RESCALE_EVERY_N_LAYERS, "%s.rescale_every_n_layers" }, { LLM_KV_TIME_MIX_EXTRA_DIM, "%s.time_mix_extra_dim" }, { LLM_KV_TIME_DECAY_EXTRA_DIM, "%s.time_decay_extra_dim" }, + { LLM_KV_RESIDUAL_SCALE, "%s.residual_scale" }, + { LLM_KV_EMBEDDING_SCALE, "%s.embedding_scale" }, { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" }, { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" }, @@ -421,6 +428,7 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_ATTENTION_KV_LORA_RANK, "%s.attention.kv_lora_rank" }, { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" }, { 
LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" }, + { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" }, { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" }, { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" }, @@ -1454,6 +1462,22 @@ static const std::map> LLM_TENSOR_NA { LLM_TENSOR_CHANNEL_MIX_RECEPTANCE, "blk.%d.channel_mix_receptance" }, }, }, + { + LLM_ARCH_GRANITE, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + }, + }, { LLM_ARCH_UNKNOWN, { @@ -2372,6 +2396,11 @@ struct llama_hparams { float f_max_alibi_bias = 0.0f; float f_logit_scale = 0.0f; + // Additional scale factors (Granite) + float f_residual_scale = 0.0f; + float f_embedding_scale = 0.0f; + float f_attention_scale = 0.0f; + bool causal_attn = true; bool use_alibi = false; bool attn_soft_cap = false; @@ -2434,6 +2463,9 @@ struct llama_hparams { if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true; if (!is_float_close(this->expert_weights_scale, other.expert_weights_scale, EPSILON)) return true; if (!is_float_close(this->rope_yarn_log_mul, other.rope_yarn_log_mul, EPSILON)) return true; + if (!is_float_close(this->f_residual_scale, other.f_residual_scale, EPSILON)) return true; + if (!is_float_close(this->f_embedding_scale, other.f_embedding_scale, EPSILON)) return true; + if (!is_float_close(this->f_attention_scale, other.f_attention_scale, EPSILON)) return true; return false; } @@ -6019,6 +6051,20 @@ static void llm_load_hparams( default: model.type = e_model::MODEL_UNKNOWN; } } break; + case LLM_ARCH_GRANITE: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale); + ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale); + ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale); + ml.get_key(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale); + + switch (hparams.n_layer) { + case 40: model.type = e_model::MODEL_3B; break; + // Add additional layer/vocab/etc checks here for other model sizes + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; default: (void)0; } @@ -6717,6 +6763,12 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp); LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp); } + + if (model.arch == LLM_ARCH_GRANITE) { + LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale); + LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale); + LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale); + } } // Returns false if cancelled by progress_callback @@ -6885,6 +6937,7 @@ static bool llm_load_tensors( case LLM_ARCH_LLAMA: case LLM_ARCH_REFACT: case LLM_ARCH_MINICPM: + case LLM_ARCH_GRANITE: { model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); @@ -8868,6 +8921,11 @@ static struct ggml_tensor * llm_build_inp_embd( ggml_set_input(lctx.inp_embd); } + // For Granite 
architecture + if (hparams.f_embedding_scale != 0.0f) { + inpL = ggml_scale(ctx, inpL, hparams.f_embedding_scale); + } + cb(inpL, "inp_embd", -1); return inpL; @@ -10146,6 +10204,7 @@ struct llm_build_context { // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -10198,7 +10257,7 @@ struct llm_build_context { cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il); } if (il == n_layer - 1) { @@ -10209,6 +10268,11 @@ struct llm_build_context { inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } + // For Granite architecture + if (hparams.f_residual_scale) { + cur = ggml_scale(ctx0, cur, hparams.f_residual_scale); + } + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); cb(ffn_inp, "ffn_inp", il); @@ -10245,6 +10309,11 @@ struct llm_build_context { cb(cur, "ffn_moe_out", il); } + // For Granite architecture + if (hparams.f_residual_scale) { + cur = ggml_scale(ctx0, cur, hparams.f_residual_scale); + } + cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); @@ -10264,6 +10333,12 @@ struct llm_build_context { // lm_head cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + + // For Granite architecture + if (hparams.f_logit_scale) { + cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale); + } + cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -15789,6 +15864,7 @@ static struct ggml_cgraph * llama_build_graph( switch (model.arch) { case LLM_ARCH_LLAMA: + case LLM_ARCH_GRANITE: { result = llm.build_llama(); } break; @@ -19089,6 +19165,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) { case LLM_ARCH_ARCTIC: case LLM_ARCH_DEEPSEEK2: case LLM_ARCH_CHATGLM: + case LLM_ARCH_GRANITE: return LLAMA_ROPE_TYPE_NORM; // the pairs of head values are offset by n_rot/2 From 503147a9f9d195d6a14e7c998df23b6eb61f2bae Mon Sep 17 00:00:00 2001 From: Yuri Khrustalev Date: Tue, 17 Sep 2024 02:51:15 -0400 Subject: [PATCH 27/51] unicode : add (#9508) --- src/unicode.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/unicode.cpp b/src/unicode.cpp index 46650bff0..f4e941cd1 100644 --- a/src/unicode.cpp +++ b/src/unicode.cpp @@ -5,6 +5,7 @@ #include "unicode.h" #include "unicode-data.h" +#include #include #include #include From 0226613853133c081b55bb892a41bb5eacc0bc94 Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Tue, 17 Sep 2024 01:19:46 -0700 Subject: [PATCH 28/51] threadpool : skip polling for unused threads (#9461) * threadpool: skip polling for unused threads Currently all threads do N polling rounds even if only 1 thread is active (n_threads_cur == 1). This commit adds a check to skip the polling for unused threads (ith >= n_threads_cur). n_threads_cur is now an atomic_int to explicitly tell thread sanitizer that it is written from one thread and read from other threads (not a race conditions). * threadpool: further simplify and improve ggml_barrier Avoid using strict memory order while polling, yet make sure that all threads go through full memory barrier (memory fence) on ggml_barrier entrace and exit. 
* threads: add simple barrier test This test does lots of small, parallel matmul ops where the barriers in between dominate the overhead. * threadpool: improve thread sync for new-graphs Using the same tricks as ggml_barrier. All the polling is done with relaxed memory order to keep it efficient, once the new graph is detected we do full fence using read-modify-write with strict memory order. * threadpool: improve abort handling Do not use threadpool->ec (exit code) to decide whether to exit the compute loop. threadpool->ec is not atomic which makes thread-sanitizer rightfully unhappy about it. Instead introduce atomic threadpool->abort flag used for this. This is consistent with how we handle threadpool->stop or pause. While at it add an explicit atomic_load for n_threads_cur for consistency. * test-barrier: release threadpool before releasing the context fixes use-after-free detected by gcc thread-sanitizer on x86-64 for some reason llvm sanitizer is not detecting this issue. --- ggml/src/ggml.c | 126 +++++++++++++++++++++++++---------------- tests/CMakeLists.txt | 1 + tests/test-barrier.cpp | 93 ++++++++++++++++++++++++++++++ 3 files changed, 170 insertions(+), 50 deletions(-) create mode 100644 tests/test-barrier.cpp diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 3a8aadae8..bccb62377 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -2013,10 +2013,11 @@ struct ggml_threadpool { // these are atomic as an annotation for thread-sanitizer atomic_bool stop; // Used for stopping the threadpool altogether atomic_bool pause; // Used for pausing the threadpool or individual threads + atomic_bool abort; // Used for aborting processing of a graph struct ggml_compute_state * workers; // per thread state int n_threads_max; // number of threads in the pool - int n_threads_cur; // number of threads used in the current graph + atomic_int n_threads_cur; // number of threads used in the current graph int32_t prio; // Scheduling priority uint32_t poll; // Polling level (0 - no polling) @@ -3178,41 +3179,36 @@ inline static void ggml_critical_section_start(void) { } } +static void ggml_barrier(struct ggml_threadpool * tp) { + int n_threads = atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed); + if (n_threads == 1) { + return; + } + #ifdef GGML_USE_OPENMP -static void ggml_barrier(struct ggml_threadpool * threadpool) { - if (threadpool->n_threads_cur == 1) { - return; - } - #pragma omp barrier -} #else -static void ggml_barrier(struct ggml_threadpool * threadpool) { - if (threadpool->n_threads_cur == 1) { - return; - } + int n_passed = atomic_load_explicit(&tp->n_barrier_passed, memory_order_relaxed); - atomic_int * n_barrier = &threadpool->n_barrier; - atomic_int * n_barrier_passed = &threadpool->n_barrier_passed; + // enter barrier (full seq-cst fence) + int n_barrier = atomic_fetch_add_explicit(&tp->n_barrier, 1, memory_order_seq_cst); - int n_threads = threadpool->n_threads_cur; - int passed_old = atomic_load_explicit(n_barrier_passed, memory_order_relaxed); - - if (atomic_fetch_add(n_barrier, 1) == n_threads - 1) { + int last = 0; + if (n_barrier == (n_threads - 1)) { // last thread - atomic_store(n_barrier, 0); - atomic_fetch_add_explicit(n_barrier_passed, 1, memory_order_relaxed); + atomic_store_explicit(&tp->n_barrier, 0, memory_order_relaxed); + last = 1; } else { // wait for other threads - while (true) { - if (atomic_load_explicit(n_barrier_passed, memory_order_relaxed) != passed_old) { - return; - } + while (atomic_load_explicit(&tp->n_barrier_passed, memory_order_relaxed) 
== n_passed) { ggml_thread_cpu_relax(); } } -} + + // exit barrier (full seq-cst fence) + atomic_fetch_add_explicit(&tp->n_barrier_passed, last, memory_order_seq_cst); #endif +} // TODO: make this somehow automatically executed // some sort of "sentry" mechanism @@ -19933,34 +19929,33 @@ struct ggml_cplan ggml_graph_plan( static thread_ret_t ggml_graph_compute_thread(void * data) { struct ggml_compute_state * state = (struct ggml_compute_state *) data; + struct ggml_threadpool * tp = state->threadpool; - const struct ggml_cgraph * cgraph = state->threadpool->cgraph; - const struct ggml_cplan * cplan = state->threadpool->cplan; + const struct ggml_cgraph * cgraph = tp->cgraph; + const struct ggml_cplan * cplan = tp->cplan; set_numa_thread_affinity(state->ith); struct ggml_compute_params params = { /*.ith =*/ state->ith, - /*.nth =*/ state->threadpool->n_threads_cur, + /*.nth =*/ atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed), /*.wsize =*/ cplan->work_size, /*.wdata =*/ cplan->work_data, - /*.threadpool=*/ state->threadpool, + /*.threadpool=*/ tp, }; - for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) { + for (int node_n = 0; node_n < cgraph->n_nodes && !tp->abort; node_n++) { struct ggml_tensor * node = cgraph->nodes[node_n]; ggml_compute_forward(¶ms, node); - if (state->ith == 0 && cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) { - state->threadpool->ec = GGML_STATUS_ABORTED; + if (state->ith == 0 && cplan->abort_callback && + cplan->abort_callback(cplan->abort_callback_data)) { + tp->abort = true; + tp->ec = GGML_STATUS_ABORTED; } ggml_barrier(state->threadpool); - - if (state->threadpool->ec != GGML_STATUS_SUCCESS) { - break; - } } return 0; @@ -19968,7 +19963,15 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { #ifndef GGML_USE_OPENMP -static inline bool ggml_graph_compute_ready(struct ggml_compute_state * state) { +// check if thread is active +static inline bool ggml_graph_compute_thread_active(struct ggml_compute_state * state) { + struct ggml_threadpool * threadpool = state->threadpool; + int n_threads = atomic_load_explicit(&threadpool->n_threads_cur, memory_order_relaxed); + return (state->ith < n_threads); +} + +// check if thread is ready to proceed (exit from polling or sleeping) +static inline bool ggml_graph_compute_thread_ready(struct ggml_compute_state * state) { struct ggml_threadpool * threadpool = state->threadpool; if (state->pending || threadpool->stop || threadpool->pause) { return true; } @@ -19976,21 +19979,34 @@ static inline bool ggml_graph_compute_ready(struct ggml_compute_state * state) { // check for new graph/work int new_graph = atomic_load_explicit(&threadpool->n_graph, memory_order_relaxed); if (new_graph != state->last_graph) { - state->pending = (state->ith < threadpool->n_threads_cur); + state->pending = ggml_graph_compute_thread_active(state); state->last_graph = new_graph; } return state->pending; } +// sync thread state after polling +static inline void ggml_graph_compute_thread_sync(struct ggml_compute_state * state) { + struct ggml_threadpool * threadpool = state->threadpool; + // this should just be atomic_thread_fence(seq_cst) but it confuses thread-sanitizer + // so instead we just use a dummy read-modify-write + atomic_fetch_add_explicit(&threadpool->n_graph, 0, memory_order_seq_cst); +} + static inline bool ggml_graph_compute_poll_for_work(struct ggml_compute_state * state) { struct ggml_threadpool * threadpool = state->threadpool; + // Skip polling for unused threads + if 
(!ggml_graph_compute_thread_active(state)) { + return state->pending; + } + // This seems to make 0 ... 100 a decent range for polling level across modern processors. // Perhaps, we can adjust it dynamically based on load and things. const uint64_t n_rounds = 1024UL * 128 * threadpool->poll; - for (uint64_t i=0; !ggml_graph_compute_ready(state) && ithreadpool; if (ggml_graph_compute_poll_for_work(state)) { + ggml_graph_compute_thread_sync(state); return state->pending; } ggml_mutex_lock_shared(&threadpool->mutex); - while (!ggml_graph_compute_ready(state)) { + while (!ggml_graph_compute_thread_ready(state)) { // No new work. Wait for the signal. - GGML_PRINT_DEBUG("thread #%d waiting for work\n", state->ith); + GGML_PRINT_DEBUG("thread #%d waiting for work (sleeping)\n", state->ith); ggml_cond_wait(&threadpool->cond, &threadpool->mutex); } ggml_mutex_unlock_shared(&threadpool->mutex); @@ -20055,13 +20072,20 @@ static thread_ret_t ggml_graph_compute_secondary_thread(void* data) { } // Start processing new graph -static void ggml_graph_compute_kickoff(struct ggml_threadpool * threadpool) +static void ggml_graph_compute_kickoff(struct ggml_threadpool * threadpool, int n_threads) { - // always take the mutex here because the worker threads are doing hybrid poll/wait + // Always take the mutex here because the worker threads are doing hybrid poll/wait ggml_mutex_lock(&threadpool->mutex); - atomic_fetch_add_explicit(&threadpool->n_graph, 1, memory_order_relaxed); + GGML_PRINT_DEBUG("threadpool: n_threads_cur %d n_threads %d\n", threadpool->n_threads_cur, n_threads); + + // Update the number of active threads + atomic_store_explicit(&threadpool->n_threads_cur, n_threads, memory_order_relaxed); + + // Indicate the graph is ready to be processed + // We need the full seq-cst fence here because of the polling threads (used in thread_sync) + atomic_fetch_add_explicit(&threadpool->n_graph, 1, memory_order_seq_cst); if (threadpool->pause) { // Update main thread prio and affinity to match the threadpool settings @@ -20120,6 +20144,7 @@ static struct ggml_threadpool * ggml_threadpool_new_impl( threadpool->current_chunk = 0; threadpool->stop = false; threadpool->pause = tpp->paused; + threadpool->abort = false; threadpool->workers = NULL; threadpool->n_threads_max = tpp->n_threads; threadpool->n_threads_cur = tpp->n_threads; @@ -20195,15 +20220,11 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl // No worker threads should be accessing the parameters below at this stage threadpool->cgraph = cgraph; threadpool->cplan = cplan; - threadpool->n_threads_cur = n_threads; threadpool->current_chunk = 0; + threadpool->abort = false; threadpool->ec = GGML_STATUS_SUCCESS; } - if (n_threads > threadpool->n_threads_max) { - GGML_PRINT("WARNING: cplan is requesting more threads than the threadpool contains. 
Expect a bad time!\n"); - } - #ifdef GGML_USE_OPENMP if (n_threads > 1) { #pragma omp parallel num_threads(n_threads) @@ -20212,7 +20233,7 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl { // update the number of threads from the actual number of threads that we got from OpenMP n_threads = omp_get_num_threads(); - threadpool->n_threads_cur = n_threads; + atomic_store_explicit(&threadpool->n_threads_cur, n_threads, memory_order_relaxed); } ggml_graph_compute_thread(&threadpool->workers[omp_get_thread_num()]); @@ -20221,8 +20242,13 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl ggml_graph_compute_thread(&threadpool->workers[0]); } #else + if (n_threads > threadpool->n_threads_max) { + GGML_PRINT("WARNING: cplan requested more threads (%d) than available (%d)\n", n_threads, threadpool->n_threads_max); + n_threads = threadpool->n_threads_max; + } + // Kick all threads to start the new graph - ggml_graph_compute_kickoff(threadpool); + ggml_graph_compute_kickoff(threadpool, n_threads); // This is a work thread too ggml_graph_compute_thread(&threadpool->workers[0]); diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 7dcd3fce8..08ad66b49 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -119,6 +119,7 @@ llama_target_and_test(test-grammar-parser.cpp) llama_target_and_test(test-llama-grammar.cpp) llama_target_and_test(test-grammar-integration.cpp) llama_target_and_test(test-grad0.cpp) +llama_target_and_test(test-barrier.cpp) # llama_target_and_test(test-opt.cpp) # SLOW llama_target_and_test(test-backend-ops.cpp) diff --git a/tests/test-barrier.cpp b/tests/test-barrier.cpp new file mode 100644 index 000000000..cf54237db --- /dev/null +++ b/tests/test-barrier.cpp @@ -0,0 +1,93 @@ +#include "ggml.h" +#include "ggml-backend.h" + +#include +#include +#include +#include +#include +#include + +#define MAX_NARGS 2 + +int main(int argc, char *argv[]) { + + int n_threads = 4; + int n_rounds = 100; + + if (argc > 1) { + n_threads = std::atoi(argv[1]); + } + + if (argc > 2) { + n_rounds = std::atoi(argv[2]); + } + + struct ggml_init_params params = { + /* .mem_size = */ 1024*1024*1024, + /* .mem_buffer = */ NULL, + /* .no_alloc = */ false, + }; + + struct ggml_context * ctx = ggml_init(params); + + // Create graph + struct ggml_cgraph * gf = ggml_new_graph(ctx); + + // Lots of small, parallel ops where barriers in between will dominate + struct ggml_tensor * out = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 64); + for (int i = 0; i < 1000; i++) { + struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 64, 128); + out = ggml_mul_mat(ctx, a, out); + + struct ggml_tensor * d = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 128, 64); + out = ggml_mul_mat(ctx, d, out); + } + + ggml_build_forward_expand(gf, out); + int n_nodes = ggml_graph_n_nodes(gf); + + // Create threadpool + struct ggml_threadpool_params tpp = ggml_threadpool_params_default(n_threads); + struct ggml_threadpool* threadpool = ggml_threadpool_new(&tpp); + if (!threadpool) { + fprintf(stderr, "threadpool create failed : n_threads %d\n", n_threads); + exit(1); + } + + // Create compute plan + struct ggml_cplan cplan = ggml_graph_plan(gf, n_threads, threadpool); + + std::vector work_data(cplan.work_size); + cplan.work_data = work_data.data(); + + std::cerr << "graph-compute with" + << "\n n_threads: " << n_threads + << "\n n_nodes: " << n_nodes + << "\n n_rounds: " << n_rounds + << "\n"; + // ggml_graph_print(gf); + + // Warmup + ggml_graph_compute(gf, 
&cplan); + + auto t0 = std::chrono::high_resolution_clock::now(); + + for (int i=0; i < n_rounds; i++) { + ggml_graph_compute(gf, &cplan); + } + + auto t1 = std::chrono::high_resolution_clock::now(); + + auto usec = std::chrono::duration_cast(t1-t0).count(); + auto nsec = std::chrono::duration_cast(t1-t0).count(); + std::cerr << "graph-compute took " << usec << " usec " + << "\n " << (float) usec / n_rounds << " usec per-iter" + << "\n " << (float) nsec / (n_rounds * n_nodes) << " nsec per-node" + << "\n"; + + ggml_threadpool_free(threadpool); + ggml_free(ctx); + + return 0; +} From 8344ef58f8cdb8ebd6faf4463ca32ae91c374c81 Mon Sep 17 00:00:00 2001 From: Michael Podvitskiy Date: Tue, 17 Sep 2024 12:18:22 +0200 Subject: [PATCH 29/51] llama : fix n_vocab init for 'no_vocab' case (#9511) * llama: fixed n_vocab for `no_vocab` models * llama: updated error output for `llama_decode_internal` and `llama_encode_internal` * llama: log warning if there's no vocab_size in metadata * llama: correct vocab size for logging Co-authored-by: Georgi Gerganov --------- Co-authored-by: Georgi Gerganov --- src/llama.cpp | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 27e3dfdfa..af8afd845 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -6107,8 +6107,15 @@ static void llm_load_vocab( vocab.special_mask_id = -1; vocab.linefeed_id = -1; + // read vocab size from metadata + if (!ml.get_key(LLM_KV_VOCAB_SIZE, vocab.n_vocab, false)) { + vocab.n_vocab = 0; + LLAMA_LOG_WARN("%s: there is no vocab_size in metadata, vocab.n_vocab will be set to %u\n", __func__, vocab.n_vocab); + } return; - } else if (tokenizer_model == "llama") { + } + + if (tokenizer_model == "llama") { vocab.type = LLAMA_VOCAB_TYPE_SPM; // default special tokens @@ -16653,7 +16660,7 @@ static int llama_decode_internal( const uint32_t n_tokens_all = batch_all.n_tokens; if (n_tokens_all == 0) { - LLAMA_LOG_ERROR("%s: n_tokens == 0", __func__); + LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__); return -1; } @@ -16666,7 +16673,7 @@ static int llama_decode_internal( if (batch_all.token) { for (uint32_t i = 0; i < n_tokens_all; ++i) { if (batch_all.token[i] < 0 || (uint32_t)batch_all.token[i] >= model.vocab.n_vocab) { - LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch_all.token[i]); + LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch_all.token[i]); return -1; } } @@ -16954,7 +16961,7 @@ static int llama_encode_internal( const uint32_t n_tokens = batch.n_tokens; if (n_tokens == 0) { - LLAMA_LOG_ERROR("%s: n_tokens == 0", __func__); + LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__); return -1; } @@ -16967,7 +16974,7 @@ static int llama_encode_internal( if (batch.token) { for (uint32_t i = 0; i < n_tokens; ++i) { if (batch.token[i] < 0 || (uint32_t)batch.token[i] >= model.vocab.n_vocab) { - LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch.token[i]); + LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]); return -1; } } From 8b836ae731bbb2c5640bc47df5b0a78ffcb129cb Mon Sep 17 00:00:00 2001 From: Bert Wagner Date: Tue, 17 Sep 2024 09:35:38 -0400 Subject: [PATCH 30/51] arg : add env variable for parallel (#9513) * add env variable for parallel * Update README.md with env: LLAMA_ARG_N_PARALLEL --- common/arg.cpp | 2 +- examples/server/README.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 60e37a89a..922391069 100644 --- a/common/arg.cpp +++ b/common/arg.cpp 
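Stepping back to the threadpool synchronization changes above: the commit relies on polling shared state with relaxed atomic loads and then issuing a full fence through a sequentially-consistent read-modify-write once the new state is observed. The standalone C++ sketch below illustrates that idea with std::atomic; it is an illustration only (the names, the busy-wait loop and the use of std::thread are assumptions made for the example), not the ggml implementation.

    #include <atomic>
    #include <cstdio>
    #include <thread>
    #include <vector>

    // Reusable spinning barrier: arrivals are counted in n_arrived, completed
    // generations in n_passed. Waiters poll n_passed with relaxed loads; the
    // seq_cst read-modify-write operations provide the full fences.
    struct spin_barrier {
        std::atomic<int> n_arrived{0};
        std::atomic<int> n_passed{0};
        const int n_threads;

        explicit spin_barrier(int n) : n_threads(n) {}

        void wait() {
            const int passed_old = n_passed.load(std::memory_order_relaxed);

            // entering the barrier: a seq_cst RMW doubles as a full fence
            if (n_arrived.fetch_add(1, std::memory_order_seq_cst) == n_threads - 1) {
                // last thread to arrive: reset the counter and release the others
                n_arrived.store(0, std::memory_order_relaxed);
                n_passed.fetch_add(1, std::memory_order_seq_cst);
                return;
            }

            // cheap polling with relaxed loads while the generation has not changed
            while (n_passed.load(std::memory_order_relaxed) == passed_old) {
                // spin (ggml additionally issues a cpu-relax hint here)
            }

            // exiting the barrier: a dummy seq_cst RMW instead of a bare fence
            n_passed.fetch_add(0, std::memory_order_seq_cst);
        }
    };

    int main() {
        const int n_threads = 4;
        spin_barrier barrier(n_threads);

        std::vector<std::thread> workers;
        for (int i = 0; i < n_threads; i++) {
            workers.emplace_back([&barrier, i] {
                for (int round = 0; round < 3; round++) {
                    // per-thread work for this round would go here
                    barrier.wait();
                }
                std::printf("thread %d finished\n", i);
            });
        }
        for (auto & w : workers) {
            w.join();
        }
        return 0;
    }

The dummy read-modify-write on exit mirrors the rationale given for ggml_graph_compute_thread_sync above: it has the effect of a sequentially-consistent fence while remaining easy for thread-sanitizer to reason about.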
@@ -1312,7 +1312,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
         [](gpt_params & params, int value) {
             params.n_parallel = value;
         }
-    ));
+    ).set_env("LLAMA_ARG_N_PARALLEL"));
     add_opt(llama_arg(
         {"-ns", "--sequences"}, "N",
         format("number of sequences to decode (default: %d)", params.n_sequences),
diff --git a/examples/server/README.md b/examples/server/README.md
index 168e14a99..7a5d26ca0 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -87,7 +87,7 @@ The project is under active development, and we are [looking for feedback and co
 | `-ctk, --cache-type-k TYPE` | KV cache data type for K (default: f16) |
 | `-ctv, --cache-type-v TYPE` | KV cache data type for V (default: f16) |
 | `-dt, --defrag-thold N` | KV cache defragmentation threshold (default: -1.0, < 0 - disabled)<br/>(env: LLAMA_ARG_DEFRAG_THOLD) |
-| `-np, --parallel N` | number of parallel sequences to decode (default: 1) |
+| `-np, --parallel N` | number of parallel sequences to decode (default: 1)<br/>(env: LLAMA_ARG_N_PARALLEL) |
 | `-cb, --cont-batching` | enable continuous batching (a.k.a dynamic batching) (default: enabled)<br/>(env: LLAMA_ARG_CONT_BATCHING) |
 | `-nocb, --no-cont-batching` | disable continuous batching
(env: LLAMA_ARG_NO_CONT_BATCHING) | | `--mlock` | force system to keep model in RAM rather than swapping or compressing | From 7be099fa817e9c53ffb4c3ed7d063e1cffcd675a Mon Sep 17 00:00:00 2001 From: Michael Podvitskiy Date: Tue, 17 Sep 2024 22:41:38 +0200 Subject: [PATCH 31/51] llama-bench: correct argument parsing error message (#9524) --- examples/llama-bench/llama-bench.cpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 2d90f65a0..fb1d387b2 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -439,6 +439,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { } types.push_back(gt); } + if (invalid_param) { + break; + } params.type_k.insert(params.type_k.end(), types.begin(), types.end()); } else if (arg == "-ctv" || arg == "--cache-type-v") { if (++i >= argc) { @@ -455,6 +458,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { } types.push_back(gt); } + if (invalid_param) { + break; + } params.type_v.insert(params.type_v.end(), types.begin(), types.end()); } else if (arg == "-t" || arg == "--threads") { if (++i >= argc) { @@ -520,6 +526,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { } modes.push_back(mode); } + if (invalid_param) { + break; + } params.split_mode.insert(params.split_mode.end(), modes.begin(), modes.end()); } else if (arg == "-mg" || arg == "--main-gpu") { if (++i >= argc) { From faf67b3de4688f47c3b1019c89df255df2fd59b4 Mon Sep 17 00:00:00 2001 From: Neo Zhang Jianyu Date: Wed, 18 Sep 2024 08:30:31 +0800 Subject: [PATCH 32/51] [SYCL]set context default value to avoid memory issue, update guide (#9476) * set context default to avoid memory issue, update guide * Update docs/backend/SYCL.md Co-authored-by: Meng, Hengyu --------- Co-authored-by: arthw <14088817+arthw@users.noreply.github.com> Co-authored-by: Meng, Hengyu --- docs/backend/SYCL.md | 8 ++++++++ examples/sycl/run-llama2.sh | 7 ++++--- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/docs/backend/SYCL.md b/docs/backend/SYCL.md index e3b9572cc..bc266f7d8 100644 --- a/docs/backend/SYCL.md +++ b/docs/backend/SYCL.md @@ -636,6 +636,14 @@ use 1 SYCL GPUs: [0] with Max compute units:512 It's same for other projects including llama.cpp SYCL backend. +- Meet issue: `Native API failed. Native API returns: -6 (PI_ERROR_OUT_OF_HOST_MEMORY) -6 (PI_ERROR_OUT_OF_HOST_MEMORY) -999 (UNKNOWN PI error)` or `failed to allocate SYCL0 buffer` + + Device Memory is not enough. + + |Reason|Solution| + |-|-| + |Default Context is too big. It leads to more memory usage.|Set `-c 8192` or smaller value.| + |Model is big and require more memory than device's.|Choose smaller quantized model, like Q5 -> Q4;
Use more than one devices to load model.| ### **GitHub contribution**: Please add the **[SYCL]** prefix/tag in issues/PRs titles to help the SYCL-team check/address them without delay. diff --git a/examples/sycl/run-llama2.sh b/examples/sycl/run-llama2.sh index a8cf0aa64..3b9ba3b2d 100755 --- a/examples/sycl/run-llama2.sh +++ b/examples/sycl/run-llama2.sh @@ -11,16 +11,17 @@ source /opt/intel/oneapi/setvars.sh #ZES_ENABLE_SYSMAN=1, Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory. Recommended to use when --split-mode = layer. INPUT_PROMPT="Building a website can be done in 10 simple steps:\nStep 1:" -MODEL_FILE=llama-2-7b.Q4_0.gguf +MODEL_FILE=models/llama-2-7b.Q4_0.gguf NGL=33 +CONEXT=8192 if [ $# -gt 0 ]; then GGML_SYCL_DEVICE=$1 echo "use $GGML_SYCL_DEVICE as main GPU" #use signle GPU only - ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -mg $GGML_SYCL_DEVICE -sm none + ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONEXT} -mg $GGML_SYCL_DEVICE -sm none else #use multiple GPUs with same max compute units - ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 + ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONEXT} fi From f799155ab8279cfa668086ce584aa52ecc05f5e5 Mon Sep 17 00:00:00 2001 From: Eric Zhang <34133756+EZForever@users.noreply.github.com> Date: Wed, 18 Sep 2024 14:28:20 +0800 Subject: [PATCH 33/51] server : fix OpenSSL build (remove obsolete `LOG_INFO`) (#9529) --- examples/server/server.cpp | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index b5f264ff1..dce69f832 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2254,14 +2254,6 @@ static void log_server_request(const httplib::Request & req, const httplib::Resp return; } - //LOG_INFO("request", { - // {"remote_addr", req.remote_addr}, - // {"remote_port", req.remote_port}, - // {"status", res.status}, - // {"method", req.method}, - // {"path", req.path}, - // {"params", req.params}, - //}); LOG_INF("request: %s %s %s %d\n", req.method.c_str(), req.path.c_str(), req.remote_addr.c_str(), res.status); LOG_DBG("request: %s\n", req.body.c_str()); @@ -2318,12 +2310,12 @@ int main(int argc, char ** argv) { std::unique_ptr svr; #ifdef CPPHTTPLIB_OPENSSL_SUPPORT if (params.ssl_file_key != "" && params.ssl_file_cert != "") { - LOG_INFO("Running with SSL", {{"key", params.ssl_file_key}, {"cert", params.ssl_file_cert}}); + LOG_INF("Running with SSL: key = %s, cert = %s\n", params.ssl_file_key.c_str(), params.ssl_file_cert.c_str()); svr.reset( new httplib::SSLServer(params.ssl_file_cert.c_str(), params.ssl_file_key.c_str()) ); } else { - LOG_INFO("Running without SSL", {}); + LOG_INF("Running without SSL\n"); svr.reset(new httplib::Server()); } #else @@ -3108,7 +3100,6 @@ int main(int argc, char ** argv) { std::thread t([&]() { svr->listen_after_bind(); }); svr->wait_until_ready(); - //LOG_INFO("HTTP server is listening", log_data); LOG_INF("%s: HTTP server is listening, hostname: %s, port: %d, http threads: %d\n", __func__, params.hostname.c_str(), params.port, params.n_threads_http); // load the model From 8a308354f6520df6bea851b435bd8054ee5617b4 Mon Sep 17 00:00:00 2001 From: Vinesh Janarthanan <36610342+VJHack@users.noreply.github.com> Date: 
Wed, 18 Sep 2024 01:50:34 -0500 Subject: [PATCH 34/51] server : match OAI structured output response (#9527) --- examples/server/README.md | 2 +- examples/server/utils.hpp | 3 +++ grammars/README.md | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/examples/server/README.md b/examples/server/README.md index 7a5d26ca0..326e05e1e 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -501,7 +501,7 @@ Given a ChatML-formatted json description in `messages`, it returns the predicte See [OpenAI Chat Completions API documentation](https://platform.openai.com/docs/api-reference/chat). While some OpenAI-specific features such as function calling aren't supported, llama.cpp `/completion`-specific features such as `mirostat` are supported. - The `response_format` parameter supports both plain JSON output (e.g. `{"type": "json_object"}`) and schema-constrained JSON (e.g. `{"type": "json_object", "schema": {"type": "string", "minLength": 10, "maxLength": 100}}`), similar to other OpenAI-inspired API providers. + The `response_format` parameter supports both plain JSON output (e.g. `{"type": "json_object"}`) and schema-constrained JSON (e.g. `{"type": "json_object", "schema": {"type": "string", "minLength": 10, "maxLength": 100}}` or `{"type": "json_schema", "schema": {"properties": { "name": { "title": "Name", "type": "string" }, "date": { "title": "Date", "type": "string" }, "participants": { "items": {"type: "string" }, "title": "Participants", "type": "string" } } } }`), similar to other OpenAI-inspired API providers. *Examples:* diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index 537c8a223..f093f547f 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -331,6 +331,9 @@ static json oaicompat_completion_params_parse( std::string response_type = json_value(response_format, "type", std::string()); if (response_type == "json_object") { llama_params["json_schema"] = json_value(response_format, "schema", json::object()); + } else if (response_type == "json_schema") { + json json_schema = json_value(response_format, "json_schema", json::object()); + llama_params["json_schema"] = json_value(json_schema, "schema", json::object()); } else if (!response_type.empty() && response_type != "text") { throw std::runtime_error("response_format type must be one of \"text\" or \"json_object\", but got: " + response_type); } diff --git a/grammars/README.md b/grammars/README.md index 7ec815471..4e8b4e2fc 100644 --- a/grammars/README.md +++ b/grammars/README.md @@ -120,7 +120,7 @@ You can use GBNF grammars: - In [llama-server](../examples/server): - For any completion endpoints, passed as the `json_schema` body field - - For the `/chat/completions` endpoint, passed inside the `response_format` body field (e.g. `{"type", "json_object", "schema": {"items": {}}}`) + - For the `/chat/completions` endpoint, passed inside the `response_format` body field (e.g. 
`{"type", "json_object", "schema": {"items": {}}}` or `{ type: "json_schema", json_schema: {"schema": ...} }`) - In [llama-cli](../examples/main), passed as the `--json` / `-j` flag - To convert to a grammar ahead of time: - in CLI, with [examples/json_schema_to_grammar.py](../examples/json_schema_to_grammar.py) From 6443ddd98576a9da904ef9f07df4e4398bb6a01a Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Wed, 18 Sep 2024 13:42:36 +0200 Subject: [PATCH 35/51] llama : use reserve/emplace_back in sampler_sample (#9534) This commit updates the llama_sampler_sample function to use reserve and emplace_back for the vector of llama_token_data structs. The motivation for this change is to avoid the creation of n_vocab default-constructed llama_token_data structs which are then immediately overwritten. --- src/llama-sampling.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 5275b1d60..5299f5116 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -236,9 +236,10 @@ llama_token llama_sampler_sample(struct llama_sampler * smpl, struct llama_conte const int n_vocab = llama_n_vocab(llama_get_model(ctx)); // TODO: do not allocate each time - std::vector cur(n_vocab); + std::vector cur; + cur.reserve(n_vocab); for (llama_token token_id = 0; token_id < n_vocab; token_id++) { - cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f}; + cur.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f}); } llama_token_data_array cur_p = { From 0d2f22e45c3c3b6f8222acb6284d0c8c93443ba1 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 18 Sep 2024 18:34:32 +0300 Subject: [PATCH 36/51] scripts : verify py deps at the start of compare (#9520) --- scripts/compare-commits.sh | 3 +++ scripts/compare-llama-bench.py | 5 +++++ 2 files changed, 8 insertions(+) diff --git a/scripts/compare-commits.sh b/scripts/compare-commits.sh index 70679f4e5..8b9b1ad39 100755 --- a/scripts/compare-commits.sh +++ b/scripts/compare-commits.sh @@ -8,6 +8,9 @@ fi set -e set -x +# verify at the start that the compare script has all the necessary dependencies installed +./scripts/compare-llama-bench.py --check + bench_args="${@:3}" rm -f llama-bench.sqlite > /dev/null diff --git a/scripts/compare-llama-bench.py b/scripts/compare-llama-bench.py index 92b9e682a..e45e83ce8 100755 --- a/scripts/compare-llama-bench.py +++ b/scripts/compare-llama-bench.py @@ -92,6 +92,7 @@ help_s = ( "If the columns are manually specified, then the results for each unique combination of the " "specified values are averaged WITHOUT weighing by the --repetitions parameter of llama-bench." ) +parser.add_argument("--check", action="store_true", help="check if all required Python libraries are installed") parser.add_argument("-s", "--show", help=help_s) parser.add_argument("--verbose", action="store_true", help="increase output verbosity") @@ -99,6 +100,10 @@ known_args, unknown_args = parser.parse_known_args() logging.basicConfig(level=logging.DEBUG if known_args.verbose else logging.INFO) +if known_args.check: + # Check if all required Python libraries are installed. Would have failed earlier if not. 
+ sys.exit(0) + if unknown_args: logger.error(f"Received unknown args: {unknown_args}.\n") parser.print_help() From 64c6af3195c3cd4aa3328a1282d29cd2635c34c9 Mon Sep 17 00:00:00 2001 From: slaren Date: Wed, 18 Sep 2024 19:13:08 +0200 Subject: [PATCH 37/51] ggml : fix n_threads_cur initialization with one thread (#9538) * ggml : fix n_threads_cur initialization with one thread * Update ggml/src/ggml.c --------- Co-authored-by: Max Krasnyansky --- ggml/src/ggml.c | 1 + 1 file changed, 1 insertion(+) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index bccb62377..c5c98dbe4 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -20239,6 +20239,7 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl ggml_graph_compute_thread(&threadpool->workers[omp_get_thread_num()]); } } else { + atomic_store_explicit(&threadpool->n_threads_cur, 1, memory_order_relaxed); ggml_graph_compute_thread(&threadpool->workers[0]); } #else From eca0fab44eb449ed34e17a9b651ae7398b494117 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Thu, 19 Sep 2024 09:58:14 +0200 Subject: [PATCH 38/51] imatrix : disable prompt escape by default (#9543) --- examples/imatrix/imatrix.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp index 265281699..c8e273529 100644 --- a/examples/imatrix/imatrix.cpp +++ b/examples/imatrix/imatrix.cpp @@ -572,6 +572,7 @@ int main(int argc, char ** argv) { params.n_ctx = 512; params.logits_all = true; + params.escape = false; if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_IMATRIX, print_usage)) { return 1; From 6026da52d6942b253df835070619775d849d0258 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 19 Sep 2024 12:44:53 +0300 Subject: [PATCH 39/51] server : clean-up completed tasks from waiting list (#9531) ggml-ci --- examples/server/server.cpp | 40 ++++++++++++++++++++++++++++++++------ 1 file changed, 34 insertions(+), 6 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index dce69f832..0ca999994 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -531,26 +531,38 @@ struct server_response { // add the id_task to the list of tasks waiting for response void add_waiting_task_id(int id_task) { - SRV_DBG("waiting for task id = %d\n", id_task); + SRV_DBG("add task %d to waiting list. current waiting = %d (before add)\n", id_task, (int) waiting_task_ids.size()); std::unique_lock lock(mutex_results); waiting_task_ids.insert(id_task); } void add_waiting_tasks(const std::vector & tasks) { - for (const auto & t : tasks) { - add_waiting_task_id(t.id); + std::unique_lock lock(mutex_results); + + for (const auto & task : tasks) { + SRV_DBG("add task %d to waiting list. current waiting = %d (before add)\n", task.id, (int) waiting_task_ids.size()); + waiting_task_ids.insert(task.id); } } // when the request is finished, we can remove task associated with it void remove_waiting_task_id(int id_task) { - SRV_DBG("task id = %d is done\n", id_task); + SRV_DBG("remove task %d from waiting list. current waiting = %d (before remove)\n", id_task, (int) waiting_task_ids.size()); std::unique_lock lock(mutex_results); waiting_task_ids.erase(id_task); } + void remove_waiting_task_ids(const std::unordered_set & id_tasks) { + std::unique_lock lock(mutex_results); + + for (const auto & id_task : id_tasks) { + SRV_DBG("remove task %d from waiting list. 
current waiting = %d (before remove)\n", id_task, (int) waiting_task_ids.size()); + waiting_task_ids.erase(id_task); + } + } + // This function blocks the thread until there is a response for one of the id_tasks server_task_result recv(const std::unordered_set & id_tasks) { while (true) { @@ -2774,6 +2786,8 @@ int main(int argc, char ** argv) { }, [&](const json & error_data) { res_error(res, error_data); }); + + ctx_server.queue_results.remove_waiting_task_ids(task_ids); } else { const auto chunked_content_provider = [task_ids, &ctx_server](size_t, httplib::DataSink & sink) { ctx_server.receive_cmpl_results_stream(task_ids, [&](const server_task_result & result) -> bool { @@ -2784,7 +2798,12 @@ int main(int argc, char ** argv) { sink.done(); return false; }; - res.set_chunked_content_provider("text/event-stream", chunked_content_provider); + + auto on_complete = [task_ids, &ctx_server] (bool) { + ctx_server.queue_results.remove_waiting_task_ids(task_ids); + }; + + res.set_chunked_content_provider("text/event-stream", chunked_content_provider, on_complete); } }; @@ -2823,6 +2842,8 @@ int main(int argc, char ** argv) { }, [&](const json & error_data) { res_error(res, error_data); }); + + ctx_server.queue_results.remove_waiting_task_ids(task_ids); } else { const auto chunked_content_provider = [task_ids, &ctx_server, completion_id](size_t, httplib::DataSink & sink) { ctx_server.receive_cmpl_results_stream(task_ids, [&](const server_task_result & result) -> bool { @@ -2844,7 +2865,12 @@ int main(int argc, char ** argv) { sink.done(); return true; }; - res.set_chunked_content_provider("text/event-stream", chunked_content_provider); + + auto on_complete = [task_ids, &ctx_server] (bool) { + ctx_server.queue_results.remove_waiting_task_ids(task_ids); + }; + + res.set_chunked_content_provider("text/event-stream", chunked_content_provider, on_complete); } }; @@ -2953,6 +2979,8 @@ int main(int argc, char ** argv) { res_error(res, error_data); error = true; }); + + ctx_server.queue_results.remove_waiting_task_ids(task_ids); } if (error) { From 722ec1eb51ed53be2ab1ef31c6a1da8261803c71 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Fri, 20 Sep 2024 08:38:10 +0200 Subject: [PATCH 40/51] perplexity : do not escape input data by default (#9548) --- examples/perplexity/perplexity.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index 18e75a7a2..cbd466656 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -1961,6 +1961,7 @@ int main(int argc, char ** argv) { params.n_ctx = 512; params.logits_all = true; + params.escape = false; if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_PERPLEXITY)) { return 1; From d39e26741f9f02340651dbc640c9776e1a1128ef Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 20 Sep 2024 11:46:56 +0300 Subject: [PATCH 41/51] examples : flush log upon ctrl+c (#9559) --- examples/infill/infill.cpp | 5 +++++ examples/main/main.cpp | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp index b77b876cc..35607276a 100644 --- a/examples/infill/infill.cpp +++ b/examples/infill/infill.cpp @@ -97,6 +97,11 @@ static void sigint_handler(int signo) { LOG("\n"); gpt_perf_print(*g_ctx, *g_smpl); write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens); + + // make sure all logs are flushed + LOG("Interrupted by user\n"); + 
gpt_log_pause(gpt_log_main()); + _exit(130); } } diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 91fea9326..c3041f1fb 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -116,6 +116,11 @@ static void sigint_handler(int signo) { LOG("\n"); gpt_perf_print(*g_ctx, *g_smpl); write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens); + + // make sure all logs are flushed + LOG("Interrupted by user\n"); + gpt_log_pause(gpt_log_main()); + _exit(130); } } From 5cb12f68395a5ec00b357e408883ce124a11dcdb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Fri, 20 Sep 2024 18:35:35 +0200 Subject: [PATCH 42/51] CUDA: fix sum.cu compilation for CUDA < 11.7 (#9562) --- ggml/src/ggml-cuda/sum.cu | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml-cuda/sum.cu b/ggml/src/ggml-cuda/sum.cu index 21da63509..0583e4fe0 100644 --- a/ggml/src/ggml-cuda/sum.cu +++ b/ggml/src/ggml-cuda/sum.cu @@ -1,9 +1,13 @@ -#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA) +#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11700 +#define USE_CUB +#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11700 + +#ifdef USE_CUB // On Windows CUB uses libraries with variables called CC_PASCAL which conflict with the define in common.cuh. // For this reason CUB must be included BEFORE anything else. #include using namespace cub; -#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA) +#endif // USE_CUB #include "sumrows.cuh" #include "sum.cuh" @@ -11,7 +15,7 @@ using namespace cub; #include void sum_f32_cuda(ggml_cuda_pool & pool, const float * x, float * dst, const int64_t ne, cudaStream_t stream) { -#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA) +#ifdef USE_CUB size_t tmp_size = 0; DeviceReduce::Sum(nullptr, tmp_size, x, dst, ne, stream); ggml_cuda_pool_alloc tmp_alloc(pool, tmp_size); @@ -21,7 +25,7 @@ void sum_f32_cuda(ggml_cuda_pool & pool, const float * x, float * dst, const int // For AMD there is rocPRIM which could be used as a drop-in replacement via hipcub but this would require C++11 -> C++14. sum_rows_f32_cuda(x, dst, ne, 1, stream); GGML_UNUSED(pool); -#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA) +#endif // USE_CUB } void ggml_cuda_op_sum(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { From a6809c6a2e8163cba296d4cc0c5cd4bbc8073638 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 8 Sep 2024 11:10:43 +0300 Subject: [PATCH 43/51] examples : add null threadpool args where needed (ggml/0) ggml-ci --- ggml/src/ggml.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index c5c98dbe4..51532c5ea 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -19753,8 +19753,8 @@ void ggml_threadpool_resume(struct ggml_threadpool * threadpool) { struct ggml_cplan ggml_graph_plan( const struct ggml_cgraph * cgraph, - int n_threads, - struct ggml_threadpool * threadpool) { + int n_threads, + struct ggml_threadpool * threadpool) { if (threadpool == NULL) { GGML_PRINT_DEBUG("Threadpool is not specified. 
Will create a disposable threadpool : n_threads %d\n", n_threads); From 424c5d00a9b97dd5559635872db9b57f87c23b02 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Fri, 20 Sep 2024 19:04:44 +0300 Subject: [PATCH 44/51] ggml/examples: add backend support for numerical optimization (ggml/949) * CUDA eval works * stochastic gradient descent op * Adam except decay * CUDA CROSS_ENTROPY_LOSS_BACK * CUDA mnist-fc training works * backend CLI arg * refactor gguf load * remove sched from opt_step_adam * implement l1 regularization (weight decay) * extra call to add optimizer * initialize gradients with ggml_graph_reset * gradient accumulation * increment iter per eval instead of epoch * adjust backend interfaces * fix ggml_graph_reset without backend * fix ggml graph export/import * fixup * rename * revert ggml_opt changes * more general CUDA repeat_back * update documentation, fix CNN * validation split * add clarifying comment * optimize PyTorch training * adjust buffer size, thread count * fix 0.0f validation split * Update examples/mnist/mnist-common.cpp Co-authored-by: Georgi Gerganov * fix gradient accumulation * tensor flag for accumulators -> tensor hash set * Update include/ggml.h Co-authored-by: slaren * Update tests/test-backend-ops.cpp Co-authored-by: slaren * Update tests/test-backend-ops.cpp Co-authored-by: slaren * fix test prints * Update src/ggml-backend.c Co-authored-by: Georgi Gerganov * better CUDA support for noncontiguous out_prod * add comment --------- Co-authored-by: Georgi Gerganov Co-authored-by: slaren --- ggml/include/ggml-backend.h | 3 +- ggml/include/ggml.h | 40 +- ggml/src/ggml-backend-impl.h | 19 +- ggml/src/ggml-backend.c | 25 ++ ggml/src/ggml-cann.cpp | 1 + ggml/src/ggml-cuda.cu | 42 ++- ggml/src/ggml-cuda/binbcast.cu | 67 ++++ ggml/src/ggml-cuda/binbcast.cuh | 2 + ggml/src/ggml-cuda/cross-entropy-loss.cu | 60 +++ ggml/src/ggml-cuda/cross-entropy-loss.cuh | 2 + ggml/src/ggml-cuda/opt-step-adamw.cu | 80 ++++ ggml/src/ggml-cuda/opt-step-adamw.cuh | 5 + ggml/src/ggml-cuda/out-prod.cu | 52 +++ ggml/src/ggml-cuda/out-prod.cuh | 3 + ggml/src/ggml-cuda/unary.cu | 29 ++ ggml/src/ggml-cuda/unary.cuh | 3 + ggml/src/ggml-kompute.cpp | 1 + ggml/src/ggml-metal.m | 1 + ggml/src/ggml-rpc.cpp | 1 + ggml/src/ggml-sycl.cpp | 1 + ggml/src/ggml-vulkan.cpp | 1 + ggml/src/ggml.c | 424 +++++++++++++++++----- tests/test-backend-ops.cpp | 148 ++++++-- tests/test-grad0.cpp | 2 +- 24 files changed, 883 insertions(+), 129 deletions(-) create mode 100644 ggml/src/ggml-cuda/opt-step-adamw.cu create mode 100644 ggml/src/ggml-cuda/opt-step-adamw.cuh create mode 100644 ggml/src/ggml-cuda/out-prod.cu create mode 100644 ggml/src/ggml-cuda/out-prod.cuh diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h index e497b6d02..71c0bef8e 100644 --- a/ggml/include/ggml-backend.h +++ b/ggml/include/ggml-backend.h @@ -66,6 +66,7 @@ extern "C" { // "offset" refers to the offset of the tensor data for setting/getting data GGML_API GGML_CALL void ggml_backend_tensor_set( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); GGML_API GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); + GGML_API GGML_CALL void ggml_backend_tensor_memset( struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size); GGML_API void ggml_backend_synchronize(ggml_backend_t backend); @@ -122,7 +123,7 @@ extern "C" { // The backend registry is a registry of all the available backends, and allows 
initializing backends in a generic way GGML_API size_t ggml_backend_reg_get_count(void); - GGML_API size_t ggml_backend_reg_find_by_name(const char * name); + GGML_API size_t ggml_backend_reg_find_by_name(const char * name); // returns index of backend with name, or SIZE_MAX if not found GGML_API ggml_backend_t ggml_backend_reg_init_backend_from_str(const char * backend_str); // str is backend_name:params (params is optional) GGML_API const char * ggml_backend_reg_get_name(size_t i); GGML_API ggml_backend_t ggml_backend_reg_init_backend(size_t i, const char * params); // params is backend-specific diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index a413df357..2035001e9 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -534,6 +534,7 @@ extern "C" { GGML_OP_CROSS_ENTROPY_LOSS, GGML_OP_CROSS_ENTROPY_LOSS_BACK, + GGML_OP_OPT_STEP_ADAMW, GGML_OP_COUNT, }; @@ -571,10 +572,12 @@ extern "C" { GGML_LOG_LEVEL_DEBUG = 4, }; + // this tensor... enum ggml_tensor_flag { - GGML_TENSOR_FLAG_INPUT = 1, - GGML_TENSOR_FLAG_OUTPUT = 2, - GGML_TENSOR_FLAG_PARAM = 4, + GGML_TENSOR_FLAG_INPUT = 1, // ...is an input for the GGML compute graph + GGML_TENSOR_FLAG_OUTPUT = 2, // ...is an output for the GGML compute graph + GGML_TENSOR_FLAG_PARAM = 4, // ...contains trainable parameters + GGML_TENSOR_FLAG_LOSS = 8, // ...defines loss for numerical optimization (multiple loss tensors add up) }; // n-dimensional tensor @@ -2037,23 +2040,44 @@ extern "C" { struct ggml_tensor * b, struct ggml_tensor * c); + // AdamW optimizer step + // Paper: https://arxiv.org/pdf/1711.05101v3.pdf + // PyTorch: https://pytorch.org/docs/stable/generated/torch.optim.AdamW.html + GGML_API struct ggml_tensor * ggml_opt_step_adamw( + struct ggml_context * ctx, + struct ggml_tensor * a, + float alpha, + float beta1, + float beta2, + float eps, + float wd); // weight decay + // // automatic differentiation // - GGML_API void ggml_set_param( - struct ggml_context * ctx, - struct ggml_tensor * tensor); + GGML_API void ggml_set_param(struct ggml_context * ctx, struct ggml_tensor * tensor); + GGML_API void ggml_set_loss(struct ggml_tensor * tensor); GGML_API void ggml_build_forward_expand (struct ggml_cgraph * cgraph, struct ggml_tensor * tensor); - GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep); + GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool accumulate, bool keep); + + GGML_API void ggml_build_opt_adamw( + struct ggml_context * ctx, + struct ggml_cgraph * gf, + struct ggml_cgraph * gb, + float alpha, + float beta1, + float beta2, + float eps, + float wd); // weight decay // graph allocation in a context GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false GGML_API struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t size, bool grads); GGML_API struct ggml_cgraph * ggml_graph_dup (struct ggml_context * ctx, struct ggml_cgraph * cgraph); GGML_API void ggml_graph_cpy (struct ggml_cgraph * src, struct ggml_cgraph * dst); - GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); // zero grads + GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); // set regular grads + optimizer momenta to 0, set loss grad to 1 GGML_API void ggml_graph_clear (struct ggml_cgraph * cgraph); GGML_API int ggml_graph_size (struct ggml_cgraph * cgraph); diff --git 
a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h index 36ca37086..b0d4141cc 100644 --- a/ggml/src/ggml-backend-impl.h +++ b/ggml/src/ggml-backend-impl.h @@ -38,15 +38,16 @@ extern "C" { typedef void * ggml_backend_buffer_context_t; struct ggml_backend_buffer_i { - const char * (*GGML_CALL get_name) (ggml_backend_buffer_t buffer); - void (*GGML_CALL free_buffer)(ggml_backend_buffer_t buffer); - void * (*GGML_CALL get_base) (ggml_backend_buffer_t buffer); - void (*GGML_CALL init_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); - void (*GGML_CALL set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); - void (*GGML_CALL get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); - bool (*GGML_CALL cpy_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst); // dst is in the buffer, src may be in any buffer - void (*GGML_CALL clear) (ggml_backend_buffer_t buffer, uint8_t value); - void (*GGML_CALL reset) (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras + const char * (*GGML_CALL get_name) (ggml_backend_buffer_t buffer); + void (*GGML_CALL free_buffer) (ggml_backend_buffer_t buffer); + void * (*GGML_CALL get_base) (ggml_backend_buffer_t buffer); + void (*GGML_CALL init_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); + void (*GGML_CALL memset_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size); + void (*GGML_CALL set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); + void (*GGML_CALL get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); + bool (*GGML_CALL cpy_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst); // dst is in the buffer, src may be in any buffer + void (*GGML_CALL clear) (ggml_backend_buffer_t buffer, uint8_t value); + void (*GGML_CALL reset) (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras }; struct ggml_backend_buffer { diff --git a/ggml/src/ggml-backend.c b/ggml/src/ggml-backend.c index b5d9301a7..97ca5a1f3 100644 --- a/ggml/src/ggml-backend.c +++ b/ggml/src/ggml-backend.c @@ -246,6 +246,22 @@ GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * buf->iface.get_tensor(buf, tensor, data, offset, size); } +GGML_API GGML_CALL void ggml_backend_tensor_memset(struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { + ggml_backend_buffer_t buf = tensor->view_src ? 
tensor->view_src->buffer : tensor->buffer; + + GGML_ASSERT(buf != NULL && "tensor buffer not set"); + GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); + GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds"); + + if (!size) { + return; + } + + GGML_ASSERT(buf->iface.memset_tensor != NULL && "memset not supported by backend buffer"); + + buf->iface.memset_tensor(buf, tensor, value, offset, size); +} + void ggml_backend_synchronize(ggml_backend_t backend) { if (backend->iface.synchronize == NULL) { return; @@ -569,6 +585,12 @@ GGML_CALL static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t free(buffer->context); } +GGML_CALL static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { + memset((char *)tensor->data + offset, value, size); + + GGML_UNUSED(buffer); +} + GGML_CALL static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { memcpy((char *)tensor->data + offset, data, size); @@ -600,6 +622,7 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i = { /* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer, /* .get_base = */ ggml_backend_cpu_buffer_get_base, /* .init_tensor = */ NULL, // no initialization required + /* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor, /* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor, /* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor, /* .cpy_tensor = */ ggml_backend_cpu_buffer_cpy_tensor, @@ -613,6 +636,7 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = { /* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed /* .get_base = */ ggml_backend_cpu_buffer_get_base, /* .init_tensor = */ NULL, // no initialization required + /* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor, /* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor, /* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor, /* .cpy_tensor = */ ggml_backend_cpu_buffer_cpy_tensor, @@ -980,6 +1004,7 @@ static struct ggml_backend_buffer_i ggml_backend_multi_buffer_context_interface( /* .free_buffer = */ ggml_backend_multi_buffer_free_buffer, /* .get_base = */ NULL, /* .init_tensor = */ NULL, + /* .memset_tensor = */ NULL, /* .set_tensor = */ NULL, /* .get_tensor = */ NULL, /* .cpy_tensor = */ NULL, diff --git a/ggml/src/ggml-cann.cpp b/ggml/src/ggml-cann.cpp index aa315b83f..d3ab78006 100644 --- a/ggml/src/ggml-cann.cpp +++ b/ggml/src/ggml-cann.cpp @@ -1037,6 +1037,7 @@ static ggml_backend_buffer_i ggml_backend_cann_buffer_interface = { /* .free_buffer = */ ggml_backend_cann_buffer_free_buffer, /* .get_base = */ ggml_backend_cann_buffer_get_base, /* .init_tensor = */ ggml_backend_cann_buffer_init_tensor, + /* .memset_tensor = */ NULL, /* .set_tensor = */ ggml_backend_cann_buffer_set_tensor, /* .get_tensor = */ ggml_backend_cann_buffer_get_tensor, /* .cpy_tensor = */ ggml_backend_cann_buffer_cpy_tensor, diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu index 54f1a7c2d..b0843dc62 100644 --- a/ggml/src/ggml-cuda.cu +++ b/ggml/src/ggml-cuda.cu @@ -21,6 +21,8 @@ #include "ggml-cuda/mmq.cuh" #include "ggml-cuda/mmvq.cuh" #include "ggml-cuda/norm.cuh" +#include "ggml-cuda/opt-step-adamw.cuh" +#include "ggml-cuda/out-prod.cuh" #include "ggml-cuda/pad.cuh" #include "ggml-cuda/pool2d.cuh" #include "ggml-cuda/quantize.cuh" @@ -493,6 +495,14 @@ GGML_CALL static void 
ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t } } +GGML_CALL static void ggml_backend_cuda_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { + ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; + + ggml_cuda_set_device(ctx->device); + CUDA_CHECK(cudaMemsetAsync((char *)tensor->data + offset, value, size, cudaStreamPerThread)); + CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread)); +} + GGML_CALL static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; @@ -544,6 +554,7 @@ static ggml_backend_buffer_i ggml_backend_cuda_buffer_interface = { /* .free_buffer = */ ggml_backend_cuda_buffer_free_buffer, /* .get_base = */ ggml_backend_cuda_buffer_get_base, /* .init_tensor = */ ggml_backend_cuda_buffer_init_tensor, + /* .memset_tensor = */ ggml_backend_cuda_buffer_memset_tensor, /* .set_tensor = */ ggml_backend_cuda_buffer_set_tensor, /* .get_tensor = */ ggml_backend_cuda_buffer_get_tensor, /* .cpy_tensor = */ ggml_backend_cuda_buffer_cpy_tensor, @@ -860,6 +871,7 @@ static struct ggml_backend_buffer_i ggml_backend_cuda_split_buffer_interface = { /* .free_buffer = */ ggml_backend_cuda_split_buffer_free_buffer, /* .get_base = */ ggml_backend_cuda_split_buffer_get_base, /* .init_tensor = */ ggml_backend_cuda_split_buffer_init_tensor, + /* .memset_tensor = */ NULL, /* .set_tensor = */ ggml_backend_cuda_split_buffer_set_tensor, /* .get_tensor = */ ggml_backend_cuda_split_buffer_get_tensor, /* .cpy_tensor = */ NULL, @@ -2168,6 +2180,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg case GGML_OP_REPEAT: ggml_cuda_op_repeat(ctx, dst); break; + case GGML_OP_REPEAT_BACK: + ggml_cuda_op_repeat_back(ctx, dst); + break; case GGML_OP_GET_ROWS: ggml_cuda_op_get_rows(ctx, dst); break; @@ -2201,6 +2216,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg case GGML_UNARY_OP_NEG: ggml_cuda_op_neg(ctx, dst); break; + case GGML_UNARY_OP_STEP: + ggml_cuda_op_step(ctx, dst); + break; case GGML_UNARY_OP_GELU: ggml_cuda_op_gelu(ctx, dst); break; @@ -2267,6 +2285,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg case GGML_OP_MUL_MAT_ID: ggml_cuda_mul_mat_id(ctx, dst); break; + case GGML_OP_OUT_PROD: + ggml_cuda_out_prod(ctx, dst); + break; case GGML_OP_SCALE: ggml_cuda_op_scale(ctx, dst); break; @@ -2324,6 +2345,12 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg case GGML_OP_CROSS_ENTROPY_LOSS: ggml_cuda_cross_entropy_loss(ctx, dst); break; + case GGML_OP_CROSS_ENTROPY_LOSS_BACK: + ggml_cuda_cross_entropy_loss_back(ctx, dst); + break; + case GGML_OP_OPT_STEP_ADAMW: + ggml_cuda_opt_step_adamw(ctx, dst); + break; default: return false; } @@ -2761,6 +2788,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons case GGML_OP_UNARY: switch (ggml_get_unary_op(op)) { case GGML_UNARY_OP_NEG: + case GGML_UNARY_OP_STEP: case GGML_UNARY_OP_GELU: case GGML_UNARY_OP_SILU: case GGML_UNARY_OP_RELU: @@ -2813,6 +2841,8 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons return false; } } break; + case GGML_OP_OUT_PROD: + return op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32 && 
op->ne[2] == 1 && op->ne[3] == 1; case GGML_OP_GET_ROWS: { switch (op->src[0]->type) { @@ -2869,6 +2899,12 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons } break; case GGML_OP_DUP: case GGML_OP_REPEAT: + { + ggml_type src0_type = op->src[0]->type; + return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16; + } break; + case GGML_OP_REPEAT_BACK: + return op->type == GGML_TYPE_F32 && op->src[0]->ne[3] == 1; case GGML_OP_CONCAT: { ggml_type src0_type = op->src[0]->type; @@ -2935,9 +2971,11 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons } return ggml_cuda_info().devices[cuda_ctx->device].cc >= CC_VOLTA && op->src[1]->type == GGML_TYPE_F16 && op->src[2]->type == GGML_TYPE_F16; - case GGML_OP_CROSS_ENTROPY_LOSS: - return true; #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) + case GGML_OP_CROSS_ENTROPY_LOSS: + case GGML_OP_CROSS_ENTROPY_LOSS_BACK: + case GGML_OP_OPT_STEP_ADAMW: + return true; default: return false; } diff --git a/ggml/src/ggml-cuda/binbcast.cu b/ggml/src/ggml-cuda/binbcast.cu index e1390a041..c7b6be4e2 100644 --- a/ggml/src/ggml-cuda/binbcast.cu +++ b/ggml/src/ggml-cuda/binbcast.cu @@ -1,4 +1,5 @@ #include "binbcast.cuh" +#include static __device__ __forceinline__ float op_repeat(const float a, const float b) { return b; @@ -90,6 +91,30 @@ static __global__ void k_bin_bcast_unravel(const src0_t * src0, const src1_t * s dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]); } +template +static __global__ void k_repeat_back( + const T * __restrict__ src, T * __restrict__ dst, const int64_t ne00, const int64_t ne01, const int64_t ne02, + const int64_t ne0, const int64_t ne1, const int64_t ne2) { + + const int64_t tid0 = (int64_t) blockIdx.x*blockDim.x + threadIdx.x; + const int64_t tid1 = (int64_t) blockIdx.y*blockDim.y + threadIdx.y; + const int64_t tid2 = (int64_t) blockIdx.z*blockDim.z + threadIdx.z; + + if (tid0 >= ne0) { + return; + } + + T sum = 0; + for (int64_t i2 = tid2; i2 < ne02; i2 += ne2) { + for (int64_t i1 = tid1; i1 < ne01; i1 += ne1) { + for (int64_t i0 = tid0; i0 < ne00; i0 += ne0) { + sum += src[i2*ne01*ne00 + i1*ne00 + i0]; + } + } + } + dst[tid2*ne1*ne0 + tid1*ne0 + tid0] = sum; +} + template struct bin_bcast_cuda { template @@ -247,6 +272,16 @@ struct bin_bcast_cuda { } }; +template +static void repeat_back_cuda( + const T * src, T * dst, const int64_t ne00, const int64_t ne01, const int64_t ne02, + const int64_t ne0, const int64_t ne1, const int64_t ne2, cudaStream_t stream) { + + const dim3 block_dims(WARP_SIZE, 1, 1); + const dim3 block_nums((ne0 + WARP_SIZE - 1) / WARP_SIZE, ne1, ne2); + k_repeat_back<<>>(src, dst, ne00, ne01, ne02, ne0, ne1, ne2); +} + template static void ggml_cuda_op_bin_bcast( const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, @@ -286,3 +321,35 @@ void ggml_cuda_op_mul(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { void ggml_cuda_op_div(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { ggml_cuda_op_bin_bcast>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream()); } + +void ggml_cuda_op_repeat_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; + + GGML_ASSERT(src0->type == dst->type); + GGML_ASSERT(ggml_is_contiguous(src0)); + GGML_ASSERT(ggml_is_contiguous(dst)); + GGML_ASSERT(ggml_can_repeat(dst, src0)); + + cudaStream_t stream = ctx.stream(); + + const int64_t ne00 = src0->ne[0]; + 
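    // Illustrative reference, not part of the patch: repeat_back is the adjoint of
    // repeat/broadcast, i.e. every source element (i0, i1, i2) is accumulated into
    // dst[(i0 % ne0), (i1 % ne1), (i2 % ne2)]. A CPU sketch of the same reduction,
    // assuming contiguous row-major float data (the CUDA kernel above parallelizes
    // this over the destination elements):
    auto repeat_back_ref = [](const float * src, float * dst,
                              int64_t ne00, int64_t ne01, int64_t ne02,   // source dims
                              int64_t ne0,  int64_t ne1,  int64_t ne2) {  // destination dims
        for (int64_t i = 0; i < ne0*ne1*ne2; i++) {
            dst[i] = 0.0f;
        }
        for (int64_t i2 = 0; i2 < ne02; i2++) {
            for (int64_t i1 = 0; i1 < ne01; i1++) {
                for (int64_t i0 = 0; i0 < ne00; i0++) {
                    dst[(i2 % ne2)*ne1*ne0 + (i1 % ne1)*ne0 + (i0 % ne0)] +=
                        src[i2*ne01*ne00 + i1*ne00 + i0];
                }
            }
        }
    };
    (void) repeat_back_ref; // reference only, never called here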
const int64_t ne01 = src0->ne[1]; + const int64_t ne02 = src0->ne[2]; + GGML_ASSERT(src0->ne[3] == 1); + + const int64_t ne0 = dst->ne[0]; + const int64_t ne1 = dst->ne[1]; + const int64_t ne2 = dst->ne[2]; + GGML_ASSERT(dst->ne[3] == 1); + + switch (dst->type) { + case GGML_TYPE_F32: { + const float * src0_d = (const float *) src0->data; + float * dst_d = (float *) dst->data; + repeat_back_cuda(src0_d, dst_d, ne00, ne01, ne02, ne0, ne1, ne2, stream); + } break; + default: { + GGML_ASSERT(false); + } break; + } +} diff --git a/ggml/src/ggml-cuda/binbcast.cuh b/ggml/src/ggml-cuda/binbcast.cuh index 198c9ef6f..3ac1c9b03 100644 --- a/ggml/src/ggml-cuda/binbcast.cuh +++ b/ggml/src/ggml-cuda/binbcast.cuh @@ -5,3 +5,5 @@ void ggml_cuda_op_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst); void ggml_cuda_op_sub(ggml_backend_cuda_context & ctx, ggml_tensor * dst); void ggml_cuda_op_mul(ggml_backend_cuda_context & ctx, ggml_tensor * dst); void ggml_cuda_op_div(ggml_backend_cuda_context & ctx, ggml_tensor * dst); + +void ggml_cuda_op_repeat_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ggml/src/ggml-cuda/cross-entropy-loss.cu b/ggml/src/ggml-cuda/cross-entropy-loss.cu index 5575a90f6..ed09406a8 100644 --- a/ggml/src/ggml-cuda/cross-entropy-loss.cu +++ b/ggml/src/ggml-cuda/cross-entropy-loss.cu @@ -71,6 +71,32 @@ static __global__ void cross_entropy_loss_f32(const float * logits, const float dst[blockIdx.x] = loss; } +static __global__ void cross_entropy_loss_back_f32(const float * logits, const float * labels, const float * loss, float * dst, const int nclasses) { + extern __shared__ float tmp[]; + + float maxval = -INFINITY; + for (int i = threadIdx.x; i < nclasses; i += WARP_SIZE) { + const float val = logits[blockIdx.x*nclasses + i]; + maxval = fmaxf(maxval, val); + tmp[i] = val; + } + maxval = warp_reduce_max(maxval); + + float sum = 0.0f; + for (int i = threadIdx.x; i < nclasses; i += WARP_SIZE) { + const float val = expf(tmp[i] - maxval); + sum += val; + tmp[i] = val; + } + sum = warp_reduce_sum(sum); + const float sm_scale = 1.0f/sum; + + const float d_by_nrows = *loss/gridDim.x; + for (int i = threadIdx.x; i < nclasses; i += WARP_SIZE) { + dst[blockIdx.x*nclasses + i] = (tmp[i]*sm_scale - labels[blockIdx.x*nclasses + i])*d_by_nrows; + } +} + void ggml_cuda_cross_entropy_loss(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const ggml_tensor * src0 = dst->src[0]; const ggml_tensor * src1 = dst->src[1]; @@ -104,3 +130,37 @@ void ggml_cuda_cross_entropy_loss(ggml_backend_cuda_context & ctx, ggml_tensor * // Combine results from individual blocks: sum_f32_cuda(pool, dst_tmp.ptr, dst_d, blocks_num.x, stream); } + +void ggml_cuda_cross_entropy_loss_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; + const ggml_tensor * src1 = dst->src[1]; + const ggml_tensor * opt0 = dst->src[2]; + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT(opt0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + GGML_ASSERT(ggml_is_contiguous(src0)); + GGML_ASSERT(ggml_is_contiguous(src1)); + GGML_ASSERT(ggml_is_contiguous(opt0)); + GGML_ASSERT(ggml_is_contiguous(dst)); + GGML_ASSERT(ggml_are_same_shape(src0, src1)); + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + + const int64_t ne00 = src0->ne[0]; + const int64_t nrows = ggml_nrows(src0); + + const float * src0_d = (const float *) src0->data; + const float * src1_d = (const float *) src1->data; + const float * 
opt0_d = (const float *) opt0->data; + float * dst_d = (float *) dst->data; + + cudaStream_t stream = ctx.stream(); + + const dim3 blocks_dim(WARP_SIZE, 1, 1); + const dim3 blocks_num(nrows, 1, 1); + const int shmem = ne00*sizeof(float); + + cross_entropy_loss_back_f32<<>>(src0_d, src1_d, opt0_d, dst_d, ne00); +} diff --git a/ggml/src/ggml-cuda/cross-entropy-loss.cuh b/ggml/src/ggml-cuda/cross-entropy-loss.cuh index 9d7b8b0f0..9ec7152ff 100644 --- a/ggml/src/ggml-cuda/cross-entropy-loss.cuh +++ b/ggml/src/ggml-cuda/cross-entropy-loss.cuh @@ -3,3 +3,5 @@ #define CUDA_CROSS_ENTROPY_LOSS_BLOCK_SIZE 256 void ggml_cuda_cross_entropy_loss(ggml_backend_cuda_context & ctx, ggml_tensor * dst); + +void ggml_cuda_cross_entropy_loss_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ggml/src/ggml-cuda/opt-step-adamw.cu b/ggml/src/ggml-cuda/opt-step-adamw.cu new file mode 100644 index 000000000..d6f13a9c6 --- /dev/null +++ b/ggml/src/ggml-cuda/opt-step-adamw.cu @@ -0,0 +1,80 @@ +#include "opt-step-adamw.cuh" + +#include + +static __global__ void opt_step_adamw_f32( + float * __restrict__ x, const float * __restrict__ g, float * __restrict__ g_m, float * __restrict__ g_v, const int64_t k, + const float alpha, const float beta1, const float beta2, const float eps, const float wd, + const float beta1h, const float beta2h) { + + const int64_t i = (int64_t) blockIdx.x*blockDim.x + threadIdx.x; + + if (i >= k) { + return; + } + + const float gi = g[i]; + const float gmi = g_m[i]*beta1 + gi*(1.0f - beta1); + const float gvi = g_v[i]*beta2 + gi*gi*(1.0f - beta2); + + g_m[i] = gmi; + g_v[i] = gvi; + + const float mh = gmi*beta1h; + const float vh = sqrtf(gvi*beta2h) + eps; + + x[i] = x[i]*(1.0f - alpha*wd) - mh/vh; +} + +static void opt_step_adamw_f32_cuda( + float * x, const float * g, float * g_m, float * g_v, const int64_t k, + const float alpha, const float beta1, const float beta2, const float eps, const float wd, + const float beta1h, const float beta2h, cudaStream_t stream) { + + const dim3 block_dims(CUDA_OPT_STEP_ADAMW_BLOCK_SIZE, 1, 1); + const dim3 block_nums((k + CUDA_OPT_STEP_ADAMW_BLOCK_SIZE - 1) / CUDA_OPT_STEP_ADAMW_BLOCK_SIZE, 1, 1); + opt_step_adamw_f32<<>>(x, g, g_m, g_v, k, alpha, beta1, beta2, eps, wd, beta1h, beta2h); +} + +void ggml_cuda_opt_step_adamw(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; + const ggml_tensor * src0_grad = dst->src[1]; + const ggml_tensor * src0_grad_m = dst->src[2]; + const ggml_tensor * src0_grad_v = dst->src[3]; + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(src0_grad->type == GGML_TYPE_F32); + GGML_ASSERT(src0_grad_m->type == GGML_TYPE_F32); + GGML_ASSERT(src0_grad_v->type == GGML_TYPE_F32); + GGML_ASSERT(ggml_is_contiguous(src0)); + GGML_ASSERT(ggml_is_contiguous(src0_grad)); + GGML_ASSERT(ggml_is_contiguous(src0_grad_m)); + GGML_ASSERT(ggml_is_contiguous(src0_grad_v)); + GGML_ASSERT(ggml_are_same_shape(src0, src0_grad)); + GGML_ASSERT(ggml_are_same_shape(src0, src0_grad_m)); + GGML_ASSERT(ggml_are_same_shape(src0, src0_grad_v)); + + float * src0_d = (float *) src0->data; + const float * src0_grad_d = (const float *) src0_grad->data; + float * src0_grad_m_d = (float *) src0_grad_m->data; + float * src0_grad_v_d = (float *) src0_grad_v->data; + + cudaStream_t stream = ctx.stream(); + + const int64_t ne = ggml_nelements(src0); + + int64_t iter; memcpy(&iter, &dst->op_params[0], sizeof(int64_t)); + float alpha; memcpy(&alpha, &dst->op_params[2], sizeof(float)); + float beta1; 
memcpy(&beta1, &dst->op_params[3], sizeof(float)); + float beta2; memcpy(&beta2, &dst->op_params[4], sizeof(float)); + float eps; memcpy(&eps, &dst->op_params[5], sizeof(float)); + float wd; memcpy(&wd, &dst->op_params[6], sizeof(float)); + + const float beta1h = alpha/(1.0f - powf(beta1, iter)); + const float beta2h = 1.0f/(1.0f - powf(beta2, iter)); + + opt_step_adamw_f32_cuda(src0_d, src0_grad_d, src0_grad_m_d, src0_grad_v_d, ne, alpha, beta1, beta2, eps, wd, beta1h, beta2h, stream); + + iter++; + memcpy(&dst->op_params[0], &iter, sizeof(int64_t)); +} diff --git a/ggml/src/ggml-cuda/opt-step-adamw.cuh b/ggml/src/ggml-cuda/opt-step-adamw.cuh new file mode 100644 index 000000000..58d6f6e5d --- /dev/null +++ b/ggml/src/ggml-cuda/opt-step-adamw.cuh @@ -0,0 +1,5 @@ +#include "common.cuh" + +#define CUDA_OPT_STEP_ADAMW_BLOCK_SIZE 256 + +void ggml_cuda_opt_step_adamw(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ggml/src/ggml-cuda/out-prod.cu b/ggml/src/ggml-cuda/out-prod.cu new file mode 100644 index 000000000..657d50e15 --- /dev/null +++ b/ggml/src/ggml-cuda/out-prod.cu @@ -0,0 +1,52 @@ +#include "out-prod.cuh" +#include "vendors/cuda.h" + +#include + +void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; + const ggml_tensor * src1 = dst->src[1]; + + GGML_TENSOR_BINARY_OP_LOCALS + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(ggml_is_contiguous(src0)); + GGML_ASSERT(ggml_is_contiguous(dst)); + + GGML_ASSERT(ne01 == ne11); + GGML_ASSERT(ne0 == ne00); + GGML_ASSERT(ne1 == ne10); + + GGML_ASSERT(ne2 == src0->ne[2]); + GGML_ASSERT(ne2 == src1->ne[2]); + GGML_ASSERT(ne3 == src0->ne[3]); + GGML_ASSERT(ne3 == src1->ne[3]); + + const float * src0_d = (const float *) src0->data; + const float * src1_d = (const float *) src1->data; + float * dst_d = (float *) dst->data; + + cudaStream_t stream = ctx.stream(); + cublasHandle_t handle = ctx.cublas_handle(); + + const float alpha = 1.0f; + const float beta = 0.0f; + + GGML_ASSERT(ne2 == 1); + GGML_ASSERT(ne3 == 1); + CUBLAS_CHECK(cublasSetStream(handle, stream)); + + const bool src1_T = ggml_is_transposed(src1); + const cublasOperation_t src1_cublas_op = src1_T ? CUBLAS_OP_N : CUBLAS_OP_T; + const int64_t ldb = (src1_T ? nb10 : nb11) / sizeof(float); + GGML_ASSERT( (src1_T ? 
nb11 : nb10) == sizeof(float)); + + CUBLAS_CHECK( + cublasSgemm(handle, CUBLAS_OP_N, src1_cublas_op, + ne0, ne1, ne01, + &alpha, src0_d, ne00, + src1_d, ldb, + &beta, dst_d, ne0)); +} diff --git a/ggml/src/ggml-cuda/out-prod.cuh b/ggml/src/ggml-cuda/out-prod.cuh new file mode 100644 index 000000000..a0046f5f8 --- /dev/null +++ b/ggml/src/ggml-cuda/out-prod.cuh @@ -0,0 +1,3 @@ +#include "common.cuh" + +void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ggml/src/ggml-cuda/unary.cu b/ggml/src/ggml-cuda/unary.cu index 8ac669f94..163b5a8ff 100644 --- a/ggml/src/ggml-cuda/unary.cu +++ b/ggml/src/ggml-cuda/unary.cu @@ -10,6 +10,16 @@ static __global__ void neg_f32(const float * x, float * dst, const int k) { dst[i] = -x[i]; } +static __global__ void step_f32(const float * x, float * dst, const int k) { + const int i = blockDim.x*blockIdx.x + threadIdx.x; + + if (i >= k) { + return; + } + + dst[i] = x[i] > 0.0f; +} + static __global__ void gelu_f32(const float * x, float * dst, const int k) { const float GELU_COEF_A = 0.044715f; const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f; @@ -134,6 +144,11 @@ static void neg_f32_cuda(const float * x, float * dst, const int k, cudaStream_t neg_f32<<>>(x, dst, k); } +static void step_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) { + const int num_blocks = (k + CUDA_STEP_BLOCK_SIZE - 1) / CUDA_STEP_BLOCK_SIZE; + step_f32<<>>(x, dst, k); +} + static void gelu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) { const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE; gelu_f32<<>>(x, dst, k); @@ -213,6 +228,20 @@ void ggml_cuda_op_neg(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { neg_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream); } +void ggml_cuda_op_step(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; + const float * src0_d = (const float *)src0->data; + float * dst_d = (float *)dst->data; + cudaStream_t stream = ctx.stream(); + + GGML_ASSERT(ggml_is_contiguous(src0)); + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + step_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream); +} + void ggml_cuda_op_gelu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const ggml_tensor * src0 = dst->src[0]; const float * src0_d = (const float *)src0->data; diff --git a/ggml/src/ggml-cuda/unary.cuh b/ggml/src/ggml-cuda/unary.cuh index ed2ffc461..fe519f6a2 100644 --- a/ggml/src/ggml-cuda/unary.cuh +++ b/ggml/src/ggml-cuda/unary.cuh @@ -1,6 +1,7 @@ #include "common.cuh" #define CUDA_NEG_BLOCK_SIZE 256 +#define CUDA_STEP_BLOCK_SIZE 256 #define CUDA_GELU_BLOCK_SIZE 256 #define CUDA_SILU_BLOCK_SIZE 256 #define CUDA_TANH_BLOCK_SIZE 256 @@ -15,6 +16,8 @@ void ggml_cuda_op_neg(ggml_backend_cuda_context & ctx, ggml_tensor * dst); +void ggml_cuda_op_step(ggml_backend_cuda_context & ctx, ggml_tensor * dst); + void ggml_cuda_op_gelu(ggml_backend_cuda_context & ctx, ggml_tensor * dst); void ggml_cuda_op_silu(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ggml/src/ggml-kompute.cpp b/ggml/src/ggml-kompute.cpp index 7f0bd82d5..9cbc57a64 100644 --- a/ggml/src/ggml-kompute.cpp +++ b/ggml/src/ggml-kompute.cpp @@ -1872,6 +1872,7 @@ static ggml_backend_buffer_i ggml_backend_kompute_buffer_i = { /* .free_buffer = */ ggml_backend_kompute_buffer_free_buffer, /* .get_base = */ ggml_backend_kompute_buffer_get_base, /* .init_tensor = */ NULL, + /* 
.memset_tensor = */ NULL, /* .set_tensor = */ ggml_backend_kompute_buffer_set_tensor, /* .get_tensor = */ ggml_backend_kompute_buffer_get_tensor, /* .cpy_tensor = */ NULL, diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m index f87181d19..ef3b7f0e8 100644 --- a/ggml/src/ggml-metal.m +++ b/ggml/src/ggml-metal.m @@ -3167,6 +3167,7 @@ static struct ggml_backend_buffer_i ggml_backend_metal_buffer_i = { /* .free_buffer = */ ggml_backend_metal_buffer_free_buffer, /* .get_base = */ ggml_backend_metal_buffer_get_base, /* .init_tensor = */ NULL, + /* .memset_tensor = */ NULL, /* .set_tensor = */ ggml_backend_metal_buffer_set_tensor, /* .get_tensor = */ ggml_backend_metal_buffer_get_tensor, /* .cpy_tensor = */ ggml_backend_metal_buffer_cpy_tensor, diff --git a/ggml/src/ggml-rpc.cpp b/ggml/src/ggml-rpc.cpp index a8a2eb85a..49b3fa911 100644 --- a/ggml/src/ggml-rpc.cpp +++ b/ggml/src/ggml-rpc.cpp @@ -469,6 +469,7 @@ static ggml_backend_buffer_i ggml_backend_rpc_buffer_interface = { /* .free_buffer = */ ggml_backend_rpc_buffer_free_buffer, /* .get_base = */ ggml_backend_rpc_buffer_get_base, /* .init_tensor = */ ggml_backend_rpc_buffer_init_tensor, + /* .memset_tensor = */ NULL, /* .set_tensor = */ ggml_backend_rpc_buffer_set_tensor, /* .get_tensor = */ ggml_backend_rpc_buffer_get_tensor, /* .cpy_tensor = */ ggml_backend_rpc_buffer_cpy_tensor, diff --git a/ggml/src/ggml-sycl.cpp b/ggml/src/ggml-sycl.cpp index acef7c6d4..2cf445047 100644 --- a/ggml/src/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl.cpp @@ -4323,6 +4323,7 @@ static struct ggml_backend_buffer_i ggml_backend_sycl_buffer_interface = { /* .free_buffer = */ ggml_backend_sycl_buffer_free_buffer, /* .get_base = */ ggml_backend_sycl_buffer_get_base, /* .init_tensor = */ ggml_backend_sycl_buffer_init_tensor, + /* .memset_tensor = */ NULL, /* .set_tensor = */ ggml_backend_sycl_buffer_set_tensor, /* .get_tensor = */ ggml_backend_sycl_buffer_get_tensor, /* .cpy_tensor = */ ggml_backend_sycl_buffer_cpy_tensor, diff --git a/ggml/src/ggml-vulkan.cpp b/ggml/src/ggml-vulkan.cpp index bad960510..f9da45881 100644 --- a/ggml/src/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan.cpp @@ -6246,6 +6246,7 @@ static ggml_backend_buffer_i ggml_backend_vk_buffer_interface = { /* .free_buffer = */ ggml_backend_vk_buffer_free_buffer, /* .get_base = */ ggml_backend_vk_buffer_get_base, /* .init_tensor = */ ggml_backend_vk_buffer_init_tensor, + /* .memset_tensor = */ NULL, /* .set_tensor = */ ggml_backend_vk_buffer_set_tensor, /* .get_tensor = */ ggml_backend_vk_buffer_get_tensor, /* .cpy_tensor = */ ggml_backend_vk_buffer_cpy_tensor, diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 51532c5ea..201d5466a 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -1,6 +1,7 @@ #define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnings on Windows #define _USE_MATH_DEFINES // For M_PI on MSVC +#include "ggml-backend.h" #include "ggml-impl.h" #include "ggml-cpu-impl.h" #include "ggml-quants.h" @@ -2997,9 +2998,10 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "CROSS_ENTROPY_LOSS", "CROSS_ENTROPY_LOSS_BACK", + "OPT_STEP_ADAMW", }; -static_assert(GGML_OP_COUNT == 79, "GGML_OP_COUNT != 79"); +static_assert(GGML_OP_COUNT == 80, "GGML_OP_COUNT != 80"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -3090,9 +3092,10 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "cross_entropy_loss(x,y)", "cross_entropy_loss_back(x,y)", + "adamw(x)", }; -static_assert(GGML_OP_COUNT == 79, "GGML_OP_COUNT != 79"); +static_assert(GGML_OP_COUNT == 80, 
"GGML_OP_COUNT != 80"); static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); @@ -4094,7 +4097,11 @@ static void ggml_set_op_params_f32(struct ggml_tensor * tensor, uint32_t i, floa } struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor) { - memset(tensor->data, 0, ggml_nbytes(tensor)); + if (tensor->buffer) { + ggml_backend_tensor_memset(tensor, 0, 0, ggml_nbytes(tensor)); + } else { + memset(tensor->data, 0, ggml_nbytes(tensor)); + } return tensor; } @@ -8320,11 +8327,46 @@ struct ggml_tensor * ggml_cross_entropy_loss_back( return result; } +// opt_step_adamw + +struct ggml_tensor * ggml_opt_step_adamw( + struct ggml_context * ctx, + struct ggml_tensor * a, + float alpha, + float beta1, + float beta2, + float eps, + float wd) { + GGML_ASSERT(a->grad); + GGML_ASSERT(alpha > 0.0f); + GGML_ASSERT(beta1 >= 0.0f && beta1 <= 1.0f); + GGML_ASSERT(beta2 >= 0.0f && beta2 <= 1.0f); + GGML_ASSERT(eps >= 0.0f); + GGML_ASSERT(wd >= 0.0f && wd <= 1.0f); + + struct ggml_tensor * result = ggml_view_tensor(ctx, a); + + result->op = GGML_OP_OPT_STEP_ADAMW; + result->grad = NULL; + result->src[0] = a; + result->src[1] = a->grad; + result->src[2] = ggml_dup_tensor(ctx, a->grad); + result->src[3] = ggml_dup_tensor(ctx, a->grad); + + const int64_t iter = 1; + memcpy(&result->op_params[0], &iter, sizeof(int64_t)); + ggml_set_op_params_f32(result, 2, alpha); + ggml_set_op_params_f32(result, 3, beta1); + ggml_set_op_params_f32(result, 4, beta2); + ggml_set_op_params_f32(result, 5, eps); + ggml_set_op_params_f32(result, 6, wd); + + return result; +} + //////////////////////////////////////////////////////////////////////////////// -void ggml_set_param( - struct ggml_context * ctx, - struct ggml_tensor * tensor) { +void ggml_set_param(struct ggml_context * ctx, struct ggml_tensor * tensor) { tensor->flags |= GGML_TENSOR_FLAG_PARAM; GGML_ASSERT(tensor->grad == NULL); @@ -8332,6 +8374,13 @@ void ggml_set_param( ggml_format_name(tensor->grad, "%s (grad)", tensor->name); } +void ggml_set_loss(struct ggml_tensor * tensor) { + GGML_ASSERT(ggml_is_scalar(tensor)); + GGML_ASSERT(tensor->type == GGML_TYPE_F32); + GGML_ASSERT(tensor->grad); + tensor->flags |= GGML_TENSOR_FLAG_LOSS; +} + // ggml_compute_forward_dup static void ggml_compute_forward_dup_same_cont( @@ -17406,7 +17455,7 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32( const int64_t ir0 = dr*ith; const int64_t ir1 = MIN(ir0 + dr, nr); - float * d = (float *) opt0->data; + const float d_by_nr = ((const float *) opt0->data)[0] / (float) nr; for (int64_t i1 = ir0; i1 < ir1; i1++) { float * ds0 = (float *)((char *) dst->data + i1*dst->nb[1]); @@ -17430,7 +17479,7 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32( // grad(src0) = (softmax(src0) - src1) * grad(cross_entropy_loss(src0, src1)) / nr ggml_vec_sub_f32(nc, ds0, ds0, s1); - ggml_vec_scale_f32(nc, ds0, d[0] / (float) nr); + ggml_vec_scale_f32(nc, ds0, d_by_nr); #ifndef NDEBUG for (int i = 0; i < nc; ++i) { @@ -17459,6 +17508,94 @@ static void ggml_compute_forward_cross_entropy_loss_back( } } +static void ggml_compute_forward_opt_step_adamw_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src0_grad = dst->src[1]; + const struct ggml_tensor * src0_grad_m = dst->src[2]; + const struct ggml_tensor * src0_grad_v = dst->src[3]; + GGML_ASSERT(ggml_are_same_shape(src0, src0_grad)); + + const int ith = params->ith; + const int nth = params->nth; + + const 
int nr = ggml_nrows(src0); + + GGML_TENSOR_UNARY_OP_LOCALS + GGML_ASSERT(nb00 == sizeof(float)); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + /* const float gnorm = 1.0f; */ + int64_t iter; memcpy(&iter, &dst->op_params[0], sizeof(int64_t)); + const float alpha = ggml_get_op_params_f32(dst, 2); + const float beta1 = ggml_get_op_params_f32(dst, 3); + const float beta2 = ggml_get_op_params_f32(dst, 4); + const float eps = ggml_get_op_params_f32(dst, 5); + const float wd = ggml_get_op_params_f32(dst, 6); + + const float beta1h = alpha/(1.0f - powf(beta1, iter)); + const float beta2h = 1.0f/(1.0f - powf(beta2, iter)); + + for (int ir = ir0; ir < ir1; ++ir) { + const int64_t i03 = ir/(ne02*ne01); + const int64_t i02 = (ir - i03*ne02*ne01)/ne01; + const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01); + + const size_t offset = i03*nb03 + i02*nb02 + i01*nb01; + + float * w = (float *) ((char *) src0->data + offset); // weight + const float * g = (const float *) ((const char *) src0_grad->data + offset); // grad + float * m = (float *) ((char *) src0_grad_m->data + offset); + float * v = (float *) ((char *) src0_grad_v->data + offset); + + for (int i00 = 0; i00 < ne00; ++i00) { + m[i00] = m[i00]*beta1 + g[i00]*(1.0f - beta1); + v[i00] = v[i00]*beta2 + g[i00]*g[i00]*(1.0f - beta2); + + const float mh = m[i00]*beta1h; + const float vh = sqrtf(v[i00]*beta2h) + eps; + + // The weight decay is applied independently of the Adam momenta m and v. + // This is NOT equivalent to l2 regularization that adds w[i00]*w[i00] to the loss. + // See: https://arxiv.org/pdf/1711.05101v3.pdf + w[i00] = w[i00]*(1.0f - alpha*wd) - mh/vh; + } + } + + ggml_barrier(params->threadpool); + if (ith != 0) { + return; + } + + iter++; + memcpy(&dst->op_params[0], &iter, sizeof(int64_t)); +} + +static void ggml_compute_forward_opt_step_adamw( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_opt_step_adamw_f32(params, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} ///////////////////////////////// static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) { @@ -17804,6 +17941,11 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm ggml_compute_forward_cross_entropy_loss_back(params, tensor); } break; + case GGML_OP_OPT_STEP_ADAMW: + { + ggml_compute_forward_opt_step_adamw(params, tensor); + } + break; case GGML_OP_NONE: { // nop @@ -17958,7 +18100,7 @@ void ggml_build_backward_gradient_checkpointing( struct ggml_tensor * * checkpoints, int n_checkpoints) { ggml_graph_cpy(gf, gb_tmp); - ggml_build_backward_expand(ctx, gf, gb_tmp, true); + ggml_build_backward_expand(ctx, gf, gb_tmp, false, true); if (n_checkpoints <= 0) { ggml_graph_cpy(gb_tmp, gb); @@ -17996,42 +18138,93 @@ void ggml_build_backward_gradient_checkpointing( ggml_hash_map_free(replacements); } -// functions to change gradients considering the case that input a might be initial gradient with zero value +// utility functions to change gradients +// if a is in acc_table, modify gradients in-place and mark result as gradient accumulator +// else if a is in zero_table, replace a +// else, just add/subtract/etc. 
the gradients -static struct ggml_tensor * ggml_add_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_hash_set * zero_table) { +static struct ggml_tensor * ggml_add_or_set( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_hash_set * zero_table, + struct ggml_hash_set * acc_table) { + if (ggml_hash_contains(acc_table, a)) { + struct ggml_tensor * ret = ggml_add_impl(ctx, a, b, true); + const size_t insert_result = ggml_hash_insert(acc_table, ret); + GGML_ASSERT(insert_result != GGML_HASHSET_FULL); + GGML_ASSERT(insert_result != GGML_HASHSET_ALREADY_EXISTS); + return ret; + } if (ggml_hash_contains(zero_table, a)) { return b; - } else { - return ggml_add_impl(ctx, a, b, false); } + return ggml_add_impl(ctx, a, b, false); } -static struct ggml_tensor * ggml_acc_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, size_t nb1, size_t nb2, size_t nb3, size_t offset, struct ggml_hash_set * zero_table) { +static struct ggml_tensor * ggml_acc_or_set( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + const size_t nb1, + const size_t nb2, + const size_t nb3, + const size_t offset, + struct ggml_hash_set * zero_table, + struct ggml_hash_set * acc_table) { + if (ggml_hash_contains(acc_table, a)) { + struct ggml_tensor * ret = ggml_acc_impl(ctx, a, b, nb1, nb2, nb3, offset, true); + const size_t insert_result = ggml_hash_insert(acc_table, ret); + GGML_ASSERT(insert_result != GGML_HASHSET_FULL); + GGML_ASSERT(insert_result != GGML_HASHSET_ALREADY_EXISTS); + return ret; + } if (ggml_hash_contains(zero_table, a)) { - struct ggml_tensor * a_zero = ggml_scale(ctx, a, 0.0f); + struct ggml_tensor * a_zero = ggml_scale(ctx, a, 0.0f); // FIXME this is going to produce NaN if a contains inf/NaN return ggml_acc_impl(ctx, a_zero, b, nb1, nb2, nb3, offset, false); - } else { - return ggml_acc_impl(ctx, a, b, nb1, nb2, nb3, offset, false); } + return ggml_acc_impl(ctx, a, b, nb1, nb2, nb3, offset, false); } -static struct ggml_tensor * ggml_add1_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_hash_set * zero_table) { +static struct ggml_tensor * ggml_add1_or_set( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_hash_set * zero_table, + struct ggml_hash_set * acc_table) { + if (ggml_hash_contains(acc_table, a)) { + struct ggml_tensor * ret = ggml_add1_impl(ctx, a, b, true); + const size_t insert_result = ggml_hash_insert(acc_table, ret); + GGML_ASSERT(insert_result != GGML_HASHSET_FULL); + GGML_ASSERT(insert_result != GGML_HASHSET_ALREADY_EXISTS); + return ret; + } if (ggml_hash_contains(zero_table, a)) { return ggml_repeat(ctx, b, a); - } else { - return ggml_add1_impl(ctx, a, b, false); } + return ggml_add1_impl(ctx, a, b, false); } -static struct ggml_tensor * ggml_sub_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_hash_set * zero_table) { +static struct ggml_tensor * ggml_sub_or_set( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_hash_set * zero_table, + struct ggml_hash_set * acc_table) { + if (ggml_hash_contains(acc_table, a)) { + struct ggml_tensor * ret = ggml_sub_impl(ctx, a, b, true); + const size_t insert_result = ggml_hash_insert(acc_table, ret); + GGML_ASSERT(insert_result != GGML_HASHSET_FULL); + GGML_ASSERT(insert_result != GGML_HASHSET_ALREADY_EXISTS); + return ret; + } 
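    // note: when a is in zero_table it still holds its zero-valued initial gradient, so a - b collapses to -b below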
if (ggml_hash_contains(zero_table, a)) { return ggml_neg(ctx, b); - } else { - return ggml_sub_impl(ctx, a, b, false); } + return ggml_sub_impl(ctx, a, b, false); } -static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor * tensor, struct ggml_hash_set * zero_table) { +static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor * tensor, struct ggml_hash_set * zero_table, struct ggml_hash_set * acc_table) { struct ggml_tensor * src0 = tensor->src[0]; struct ggml_tensor * src1 = tensor->src[1]; struct ggml_tensor * src2 = tensor->src[2]; @@ -18040,38 +18233,38 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor case GGML_OP_DUP: { if (src0->grad) { - src0->grad = ggml_add_or_set(ctx, src0->grad, tensor->grad, zero_table); + src0->grad = ggml_add_or_set(ctx, src0->grad, tensor->grad, zero_table, acc_table); } } break; case GGML_OP_ADD: { if (src0->grad) { - src0->grad = ggml_add_or_set(ctx, src0->grad, tensor->grad, zero_table); + src0->grad = ggml_add_or_set(ctx, src0->grad, tensor->grad, zero_table, acc_table); } if (src1->grad) { if (ggml_are_same_shape(src0, src1)) { - src1->grad = ggml_add_or_set(ctx, src1->grad, tensor->grad, zero_table); + src1->grad = ggml_add_or_set(ctx, src1->grad, tensor->grad, zero_table, acc_table); } else { - src1->grad = ggml_add_or_set(ctx, src1->grad, ggml_repeat_back(ctx, tensor->grad, src1), zero_table); + src1->grad = ggml_add_or_set(ctx, src1->grad, ggml_repeat_back(ctx, tensor->grad, src1), zero_table, acc_table); } } } break; case GGML_OP_ADD1: { if (src0->grad) { - src0->grad = ggml_add_or_set(ctx, src0->grad, tensor->grad, zero_table); + src0->grad = ggml_add_or_set(ctx, src0->grad, tensor->grad, zero_table, acc_table); } if (src1->grad) { src1->grad = ggml_add_or_set(ctx, src1->grad, ggml_mean(ctx, tensor->grad), // TODO: should probably be sum instead of mean - zero_table); + zero_table, acc_table); } } break; case GGML_OP_ACC: { if (src0->grad) { - src0->grad = ggml_add_or_set(ctx, src0->grad, tensor->grad, zero_table); + src0->grad = ggml_add_or_set(ctx, src0->grad, tensor->grad, zero_table, acc_table); } if (src1->grad) { const size_t nb1 = ((int32_t *) tensor->op_params)[0]; @@ -18093,16 +18286,16 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor ggml_reshape(ctx, ggml_cont(ctx, tensor_grad_view), src1->grad), - zero_table); + zero_table, acc_table); } } break; case GGML_OP_SUB: { if (src0->grad) { - src0->grad = ggml_add_or_set(ctx, src0->grad, tensor->grad, zero_table); + src0->grad = ggml_add_or_set(ctx, src0->grad, tensor->grad, zero_table, acc_table); } if (src1->grad) { - src1->grad = ggml_sub_or_set(ctx, src1->grad, tensor->grad, zero_table); + src1->grad = ggml_sub_or_set(ctx, src1->grad, tensor->grad, zero_table, acc_table); } } break; case GGML_OP_MUL: @@ -18112,14 +18305,14 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor ggml_add_or_set(ctx, src0->grad, ggml_mul(ctx, src1, tensor->grad), - zero_table); + zero_table, acc_table); } if (src1->grad) { src1->grad = ggml_add_or_set(ctx, src1->grad, ggml_mul(ctx, src0, tensor->grad), - zero_table); + zero_table, acc_table); } } break; case GGML_OP_DIV: @@ -18129,7 +18322,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor ggml_add_or_set(ctx, src0->grad, ggml_div(ctx, tensor->grad, src1), - zero_table); + zero_table, acc_table); } if (src1->grad) { src1->grad = @@ -18138,7 +18331,7 @@ static void 
ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor ggml_mul(ctx, tensor->grad, ggml_div(ctx, tensor, src1)), - zero_table); + zero_table, acc_table); } } break; case GGML_OP_SQR: @@ -18150,7 +18343,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor ggml_scale(ctx, ggml_mul(ctx, src0, tensor->grad), 2.0f), - zero_table); + zero_table, acc_table); } } break; case GGML_OP_SQRT: @@ -18164,7 +18357,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor tensor->grad, tensor), 0.5f), - zero_table); + zero_table, acc_table); } } break; case GGML_OP_LOG: @@ -18176,7 +18369,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor ggml_div(ctx, tensor->grad, src0), - zero_table); + zero_table, acc_table); } } break; case GGML_OP_SIN: @@ -18188,7 +18381,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor ggml_mul(ctx, tensor->grad, ggml_cos(ctx, src0)), - zero_table); + zero_table, acc_table); } } break; case GGML_OP_COS: @@ -18200,7 +18393,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor ggml_mul(ctx, tensor->grad, ggml_sin(ctx, src0)), - zero_table); + zero_table, acc_table); } } break; case GGML_OP_SUM: @@ -18210,7 +18403,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor ggml_add1_or_set(ctx, src0->grad, tensor->grad, - zero_table); + zero_table, acc_table); } } break; case GGML_OP_SUM_ROWS: @@ -18222,7 +18415,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor ggml_repeat(ctx, tensor->grad, src0->grad), - zero_table); + zero_table, acc_table); } } break; case GGML_OP_MEAN: @@ -18237,7 +18430,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor src0->grad = ggml_add_or_set(ctx, src0->grad, ggml_repeat_back(ctx, tensor->grad, src0->grad), - zero_table); + zero_table, acc_table); } } break; case GGML_OP_REPEAT_BACK: @@ -18247,7 +18440,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor src0->grad = ggml_add_or_set(ctx, src0->grad, ggml_repeat(ctx, tensor->grad, src0->grad), - zero_table); + zero_table, acc_table); } } break; case GGML_OP_CONCAT: @@ -18272,7 +18465,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor src0->grad = ggml_add_or_set(ctx, src0->grad, ggml_rms_norm_back(ctx, src0, tensor->grad, eps), - zero_table); + zero_table, acc_table); } } break; case GGML_OP_RMS_NORM_BACK: @@ -18320,7 +18513,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor ggml_add_or_set(ctx, src0->grad, // [n,m,q1,r1] s1_tg, // [n,m,q1,r1] - zero_table); + zero_table, acc_table); } if (src1->grad) { src1->grad = @@ -18338,7 +18531,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor src0, // [n,m,q1,r1] ggml_transpose(ctx, // [p,m,qq,rr] tensor->grad)), // [m,p,qq,rr] - zero_table); + zero_table, acc_table); } } break; case GGML_OP_MUL_MAT_ID: @@ -18360,7 +18553,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor ggml_add_or_set(ctx, src0->grad, ggml_scale_impl(ctx, tensor->grad, s, false), - zero_table); + zero_table, acc_table); } } break; case GGML_OP_SET: @@ -18389,7 +18582,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor tensor->grad, ggml_neg(ctx, tensor_grad_view), nb1, nb2, nb3, offset, false), - zero_table); + zero_table, acc_table); } 
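            // src1 was written into the view region, so its gradient is that region of tensor->grad, made contiguous and reshaped back to src1's shape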
if (src1->grad) { @@ -18399,7 +18592,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor ggml_reshape(ctx, ggml_cont(ctx, tensor_grad_view), src1->grad), - zero_table); + zero_table, acc_table); } } break; case GGML_OP_CPY: @@ -18410,7 +18603,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor // tensor = src0 * 1 + src1 * 0 if (src0->grad) { // dsrc0 = dtensor * 1 - src0->grad = ggml_add_or_set(ctx, src0->grad, tensor->grad, zero_table); + src0->grad = ggml_add_or_set(ctx, src0->grad, tensor->grad, zero_table, acc_table); } if (src1->grad) { // dsrc1 = dtensor * 0 -> noop @@ -18422,7 +18615,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor if (src0->grad) { GGML_ASSERT(ggml_is_contiguous(src0->grad)); GGML_ASSERT(ggml_is_contiguous(tensor->grad)); - src0->grad = ggml_add_or_set(ctx, src0->grad, tensor->grad, zero_table); + src0->grad = ggml_add_or_set(ctx, src0->grad, tensor->grad, zero_table, acc_table); } } break; case GGML_OP_RESHAPE: @@ -18436,7 +18629,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor ? tensor->grad : ggml_cont(ctx, tensor->grad), src0->grad), - zero_table); + zero_table, acc_table); } } break; case GGML_OP_VIEW: @@ -18465,7 +18658,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor nb3 = (nb3 / n0) * ng; } - src0->grad = ggml_acc_or_set(ctx, src0->grad, tensor->grad, nb1, nb2, nb3, offset, zero_table); + src0->grad = ggml_acc_or_set(ctx, src0->grad, tensor->grad, nb1, nb2, nb3, offset, zero_table, acc_table); } } break; case GGML_OP_PERMUTE: @@ -18490,7 +18683,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor axes_backward[1], axes_backward[2], axes_backward[3]), - zero_table); + zero_table, acc_table); } } break; case GGML_OP_TRANSPOSE: @@ -18500,7 +18693,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor src0->grad = ggml_add_or_set(ctx, src0->grad, ggml_transpose(ctx, tensor->grad), - zero_table); + zero_table, acc_table); } } break; case GGML_OP_GET_ROWS: @@ -18512,7 +18705,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor // last ggml_get_rows_back argument src0->grad is only // necessary to setup correct output shape ggml_get_rows_back(ctx, tensor->grad, src1, src0->grad), - zero_table); + zero_table, acc_table); } if (src1->grad) { // noop @@ -18536,7 +18729,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor /* ggml_diag_mask_inf_impl() shouldn't be here */ /* ref: https://github.com/ggerganov/llama.cpp/pull/4203#discussion_r1412377992 */ ggml_diag_mask_zero_impl(ctx, tensor->grad, n_past, false), - zero_table); + zero_table, acc_table); } } break; case GGML_OP_DIAG_MASK_ZERO: @@ -18547,7 +18740,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor src0->grad = ggml_add_or_set(ctx, src0->grad, ggml_diag_mask_zero_impl(ctx, tensor->grad, n_past, false), - zero_table); + zero_table, acc_table); } } break; case GGML_OP_SOFT_MAX: @@ -18557,7 +18750,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor src0->grad = ggml_add_or_set(ctx, src0->grad, ggml_soft_max_back(ctx, tensor->grad, tensor), - zero_table); + zero_table, acc_table); } } break; @@ -18598,7 +18791,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor attn_factor, beta_fast, beta_slow), - zero_table); + 
zero_table, acc_table); } } break; case GGML_OP_ROPE_BACK: @@ -18634,7 +18827,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor beta_fast, beta_slow, false), - zero_table); + zero_table, acc_table); } } break; case GGML_OP_CLAMP: @@ -18659,7 +18852,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor src1->grad = ggml_add_or_set(ctx, src1->grad, ggml_im2col_back(ctx, src0, tensor->grad, src1->ne, s0, s1, p0, p1, d0, d1, is_2D), - zero_table); + zero_table, acc_table); } } break; case GGML_OP_IM2COL_BACK: @@ -18688,7 +18881,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor src0->grad = ggml_add_or_set(ctx, src0->grad, ggml_pool_2d_back(ctx, tensor->grad, src0, op, k0, k1, s0, s1, p0, p1), - zero_table); + zero_table, acc_table); } } break; case GGML_OP_POOL_2D_BACK: @@ -18753,7 +18946,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor src0->grad = ggml_add_or_set(ctx, src0->grad, grad_q, - zero_table); + zero_table, acc_table); } if (src1->grad) { struct ggml_tensor * view_k = ggml_view_1d(ctx, flash_grad, elem_k, offs_k); @@ -18761,7 +18954,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor src1->grad = ggml_add_or_set(ctx, src1->grad, grad_k, - zero_table); + zero_table, acc_table); } if (src2->grad) { struct ggml_tensor * view_v = ggml_view_1d(ctx, flash_grad, elem_v, offs_v); @@ -18769,7 +18962,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor src2->grad = ggml_add_or_set(ctx, src2->grad, grad_v, - zero_table); + zero_table, acc_table); } } break; case GGML_OP_FLASH_ATTN_BACK: @@ -18795,7 +18988,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor ggml_mul(ctx, ggml_sgn(ctx, src0), tensor->grad), - zero_table); + zero_table, acc_table); } } break; case GGML_UNARY_OP_SGN: @@ -18807,7 +19000,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor case GGML_UNARY_OP_NEG: { if (src0->grad) { - src0->grad = ggml_sub_or_set(ctx, src0->grad, tensor->grad, zero_table); + src0->grad = ggml_sub_or_set(ctx, src0->grad, tensor->grad, zero_table, acc_table); } } break; case GGML_UNARY_OP_STEP: @@ -18832,7 +19025,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor ggml_mul(ctx, ggml_step(ctx, src0), tensor->grad), - zero_table); + zero_table, acc_table); } } break; case GGML_UNARY_OP_SIGMOID: @@ -18854,7 +19047,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor src0->grad = ggml_add_or_set(ctx, src0->grad, ggml_silu_back(ctx, src0, tensor->grad), - zero_table); + zero_table, acc_table); } } break; case GGML_UNARY_OP_EXP: @@ -18863,7 +19056,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor src0->grad = ggml_add_or_set(ctx, src0->grad, ggml_mul(ctx, tensor, tensor->grad), - zero_table); + zero_table, acc_table); } } break; default: @@ -18893,13 +19086,17 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor src0, src1, tensor->grad), - zero_table); + zero_table, acc_table); } } break; case GGML_OP_CROSS_ENTROPY_LOSS_BACK: { GGML_ABORT("fatal error"); // not supported } + case GGML_OP_OPT_STEP_ADAMW: + { + GGML_ABORT("fatal error"); // not supported + } case GGML_OP_NONE: { // nop @@ -18989,7 +19186,7 @@ void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * 
ggml_build_forward_impl(cgraph, tensor, true); } -void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep) { +void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool accumulate, bool keep) { GGML_ASSERT(gf->n_nodes > 0); GGML_ASSERT(gf->grads); @@ -19005,21 +19202,35 @@ void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * } } - // remember original gradients which start with zero values + // keep tables of original gradients for replacement/accumulation logic struct ggml_hash_set zero_table = ggml_hash_set_new(gf->size); + struct ggml_hash_set acc_table = ggml_hash_set_new(gf->size); for (int i = 0; i < gf->n_nodes; i++) { - if (gf->grads[i]) { - ggml_hash_insert(&zero_table, gf->grads[i]); + struct ggml_tensor * node = gf->nodes[i]; + + if (node->grad) { + { + const size_t insert_result = ggml_hash_insert(&zero_table, node->grad); + GGML_ASSERT(insert_result != GGML_HASHSET_FULL); + GGML_ASSERT(insert_result != GGML_HASHSET_ALREADY_EXISTS); + } + + // only gradients of trainable parameters should be accumulated + if (accumulate && (node->flags & GGML_TENSOR_FLAG_PARAM)) { + const size_t insert_result = ggml_hash_insert(&acc_table, node->grad); + GGML_ASSERT(insert_result != GGML_HASHSET_FULL); + GGML_ASSERT(insert_result != GGML_HASHSET_ALREADY_EXISTS); + } } } for (int i = gf->n_nodes - 1; i >= 0; i--) { struct ggml_tensor * node = gf->nodes[i]; - // inplace operations to add gradients are not created by ggml_compute_backward + // inplace operations to add gradients are not created by ggml_compute_backward except for gradient accumulation // use allocator to automatically make inplace operations if (node->grad) { - ggml_compute_backward(ctx, node, &zero_table); + ggml_compute_backward(ctx, node, &zero_table, &acc_table); } } @@ -19033,8 +19244,30 @@ void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * } ggml_hash_set_free(&zero_table); + ggml_hash_set_free(&acc_table); } +void ggml_build_opt_adamw( + struct ggml_context * ctx, + struct ggml_cgraph * gf, + struct ggml_cgraph * gb, + float alpha, + float beta1, + float beta2, + float eps, + float wd) { + for (int i = 0; i < gf->n_nodes; i++) { + struct ggml_tensor * node = gf->nodes[i]; + + if (node->flags & GGML_TENSOR_FLAG_PARAM) { + GGML_PRINT_DEBUG("%s: found root node %p\n", __func__, (void *) node); + struct ggml_tensor * opt_step = ggml_opt_step_adamw(ctx, node, alpha, beta1, beta2, eps, wd); + ggml_build_forward_expand(gb, opt_step); + } + } +} + + static void * incr_ptr_aligned(void ** p, size_t size, size_t align) { void * ptr = *p; ptr = (void *) GGML_PAD((uintptr_t) ptr, align); @@ -19162,10 +19395,28 @@ void ggml_graph_reset(struct ggml_cgraph * cgraph) { GGML_ASSERT(cgraph->grads != NULL); for (int i = 0; i < cgraph->n_nodes; i++) { - struct ggml_tensor * grad = cgraph->grads[i]; + struct ggml_tensor * node = cgraph->nodes[i]; - if (grad) { - ggml_set_zero(grad); + // initial gradients of loss should be 1, 0 otherwise + if (node->grad) { + if (node->flags & GGML_TENSOR_FLAG_LOSS) { + GGML_ASSERT(node->grad->buffer); + GGML_ASSERT(node->type == GGML_TYPE_F32); + GGML_ASSERT(ggml_is_scalar(node)); + + const float onef = 1.0f; + ggml_backend_tensor_set(node->grad, &onef, 0, ggml_nbytes(node->grad)); + } else { + ggml_set_zero(node->grad); + } + } + + GGML_ASSERT(node); + if (node->op == GGML_OP_OPT_STEP_ADAMW) { + // set iteration to 1 and clear momenta + 
ggml_set_op_params_i32(node, 0, 1); + ggml_set_zero(node->src[2]); + ggml_set_zero(node->src[3]); } } } @@ -19458,6 +19709,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { } break; case GGML_OP_CROSS_ENTROPY_LOSS: case GGML_OP_CROSS_ENTROPY_LOSS_BACK: + case GGML_OP_OPT_STEP_ADAMW: { n_tasks = n_threads; } break; @@ -21851,7 +22103,7 @@ enum ggml_opt_result ggml_opt_resume( ggml_build_forward_expand(gf, f); struct ggml_cgraph * gb = ggml_graph_dup(ctx, gf); - ggml_build_backward_expand(ctx, gf, gb, true); + ggml_build_backward_expand(ctx, gf, gb, false, true); return ggml_opt_resume_g(ctx, opt, f, gf, gb, NULL, NULL); } diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index aa7896def..889a19944 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -799,10 +799,11 @@ struct test_case { out = ggml_sum(ctx, out); ggml_set_name(out, "sum_of_out"); } + ggml_set_loss(out); ggml_build_forward_expand(gf, out); ggml_graph_cpy(gf, gb); - ggml_build_backward_expand(ctx, gf, gb, false); + ggml_build_backward_expand(ctx, gf, gb, false, false); if (expect.size() != 1 || expect[0] != 0.0f) { GGML_ASSERT(ggml_graph_n_nodes(gb) > ggml_graph_n_nodes(gf)); for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { @@ -837,22 +838,11 @@ struct test_case { return false; } - // randomize tensors - initialize_tensors(ctx); - for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) { - if (!t->grad) { - continue; - } + initialize_tensors(ctx); // Randomizes all tensors (including gradients). + ggml_graph_reset(gb); // Sets gradients to 1 if loss, 0 otherwise. - std::vector tmp(ggml_nelements(t->grad)); - ggml_backend_tensor_set(t->grad, tmp.data(), 0, ggml_nbytes(t->grad)); - } - - // build graphs - const float onef = 1.0f; ggml_backend_graph_compute(backend, gf); - ggml_backend_tensor_set(out->grad, &onef, 0, ggml_nbytes(out->grad)); ggml_backend_graph_compute(backend, gb); bool ok = true; @@ -1681,6 +1671,50 @@ struct test_mul_mat_id : public test_case { } }; +// GGML_OP_OUT_PROD +struct test_out_prod : public test_case { + const ggml_type type_a; + const ggml_type type_b; + const int64_t m; + const int64_t n; + const int64_t k; + const std::array bs; // dims 3 and 4 + const bool trans_b; + + std::string vars() override { + return VARS_TO_STR7(type_a, type_b, m, n, k, bs, trans_b); + } + + double max_nmse_err() override { + return 5e-4; + } + + test_out_prod(ggml_type type_a = GGML_TYPE_F32, ggml_type type_b = GGML_TYPE_F32, + int64_t m = 32, int64_t n = 32, int64_t k = 32, + std::array bs = {10, 10}, + bool trans_b = false) + : type_a(type_a), type_b(type_b), m(m), n(n), k(k), bs(bs), trans_b(trans_b) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor_4d(ctx, type_a, m, k, bs[0], bs[1]); + ggml_set_name(a, "a"); + + ggml_tensor * b; + if (trans_b) { + b = ggml_new_tensor_4d(ctx, type_b, k, n, bs[0], bs[1]); + b = ggml_transpose(ctx, b); + } else { + b = ggml_new_tensor_4d(ctx, type_b, n, k, bs[0], bs[1]); + } + ggml_set_name(b, "b"); + + ggml_tensor * out = ggml_out_prod(ctx, a, b); + ggml_set_name(out, "out"); + + return out; + } +}; + // GGML_OP_SQR struct test_sqr : public test_case { const ggml_type type; @@ -2666,6 +2700,51 @@ struct test_cross_entropy_loss : public test_case { } }; +// GGML_OP_OPT_STEP_ADAMW +struct test_opt_step_adamw : public test_case { + const ggml_type type; + const std::array 
ne; + const float alpha; + const float beta1; + const float beta2; + const float eps; + const float wd; + + std::string vars() override { + return VARS_TO_STR7(type, ne, alpha, beta1, beta2, eps, wd); + } + + test_opt_step_adamw(ggml_type type = GGML_TYPE_F32, + std::array ne = {10, 5, 4, 3}, + float alpha = 1e-3f, + float beta1 = 0.9f, + float beta2 = 0.999f, + float eps = 1e-8f, + float wd = 0.0f) + : type(type), ne(ne), alpha(alpha), beta1(beta1), beta2(beta2), eps(eps), wd(wd) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]); + ggml_set_param(ctx, a); // Despite tensor a having gradients the output tensor will not. + ggml_set_name(a, "a"); + + ggml_tensor * out = ggml_opt_step_adamw(ctx, a, alpha, beta1, beta2, eps, wd); + ggml_set_name(out, "out"); + + return out; + } + + void initialize_tensors(ggml_context * ctx) override { + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { + init_tensor_uniform(t, 0.0f, 1.0f); // grad_v needs non-negative values. + } + } + + bool grad_precise() override { + return true; + } +}; + enum llm_norm_type { LLM_NORM, LLM_NORM_RMS, @@ -3159,14 +3238,15 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {3,1,2,1}, 1, 0, 1)); test_cases.emplace_back(new test_conv_transpose_1d({2,1,1,1}, {3,1,1,1}, 1, 0, 1)); - - test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, 3}, {1, 1, 1, 1})); - test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, 3}, {2, 1, 1, 1})); - test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, 3}, {1, 2, 1, 1})); - test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, 3}, {1, 1, 2, 1})); - test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, 3}, {1, 1, 1, 2})); - test_cases.emplace_back(new test_repeat(GGML_TYPE_I32, {10, 5, 4, 3}, {2, 1, 1, 1})); - test_cases.emplace_back(new test_repeat(GGML_TYPE_I16, {10, 5, 4, 3}, {1, 1, 1, 2})); + for (int ne3 : {1, 3}) { // CUDA backwards pass only supports ne3 == 1 + test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, ne3}, {1, 1, 1, 1})); + test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, ne3}, {2, 1, 1, 1})); + test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, ne3}, {1, 2, 1, 1})); + test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, ne3}, {1, 1, 2, 1})); + test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, ne3}, {1, 1, 1, 2})); + test_cases.emplace_back(new test_repeat(GGML_TYPE_I32, {10, 5, 4, ne3}, {2, 1, 1, 1})); + test_cases.emplace_back(new test_repeat(GGML_TYPE_I16, {10, 5, 4, ne3}, {1, 1, 1, 2})); + } test_cases.emplace_back(new test_dup(GGML_TYPE_F32)); test_cases.emplace_back(new test_dup(GGML_TYPE_F16)); @@ -3350,6 +3430,27 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op } } + for (ggml_type type_a : base_types) { + for (ggml_type type_b : {GGML_TYPE_F32, GGML_TYPE_F16}) { + test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 1, 16, { 1, 1})); + test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 1, 16, {10, 1})); + test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 1, 16, {10, 1})); + test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 1, 16, {10, 10})); + test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 1, 16, {10, 10})); 
+ test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 1, 16, {10, 10})); + test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 1, 16, {10, 10})); + + test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 16, 16, { 1, 1})); + test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 16, 16, { 1, 1}, true)); + test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 16, 16, {10, 1})); + test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 16, 16, {10, 1})); + test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 16, 16, {10, 10})); + test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 16, 16, {10, 10})); + test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 16, 16, {10, 10})); + test_cases.emplace_back(new test_out_prod(type_a, type_b, 256, 16, 16, {10, 10})); + } + } + test_cases.emplace_back(new test_sqr()); test_cases.emplace_back(new test_sqrt()); test_cases.emplace_back(new test_log()); @@ -3476,6 +3577,9 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op } test_cases.emplace_back(new test_cross_entropy_loss()); + for (float wd : {0.0f, 1e-2f}) { + test_cases.emplace_back(new test_opt_step_adamw(GGML_TYPE_F32, {10, 5, 4, 3}, 1.0f, 1e-3f, 0.9f, 0.999f, wd)); + } // these tests are disabled to save execution time, but they can be handy for debugging #if 0 diff --git a/tests/test-grad0.cpp b/tests/test-grad0.cpp index 1834c11d8..2ef606d2c 100644 --- a/tests/test-grad0.cpp +++ b/tests/test-grad0.cpp @@ -240,7 +240,7 @@ static bool check_gradient( struct ggml_cgraph * gb = ggml_new_graph_custom(ctx0, GGML_DEFAULT_GRAPH_SIZE, true); ggml_build_forward_expand(gf, f); ggml_graph_cpy(gf, gb); - ggml_build_backward_expand(ctx0, gf, gb, false); + ggml_build_backward_expand(ctx0, gf, gb, false, false); ggml_graph_compute_with_ctx(ctx0, gf, n_threads); From 43015353262de835215103475055b74045cb9f49 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 20 Sep 2024 19:06:59 +0300 Subject: [PATCH 45/51] sync : ggml ggml-ci --- scripts/sync-ggml.last | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last index 3d2dfb413..cf7b97d45 100644 --- a/scripts/sync-ggml.last +++ b/scripts/sync-ggml.last @@ -1 +1 @@ -10e83a412717c20d57ba19f025248e18e43addf3 +e7b23907cb2816e9951fe9b524d7127ab777297a From 27609c49b94766a136764ce7919c8a082bf71548 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 20 Sep 2024 19:13:02 +0300 Subject: [PATCH 46/51] ggml : fix trailing whitespace (#0) ggml-ci --- ggml/src/ggml-backend.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-backend.c b/ggml/src/ggml-backend.c index 97ca5a1f3..ba280e064 100644 --- a/ggml/src/ggml-backend.c +++ b/ggml/src/ggml-backend.c @@ -256,7 +256,7 @@ GGML_API GGML_CALL void ggml_backend_tensor_memset(struct ggml_tensor * tensor, if (!size) { return; } - + GGML_ASSERT(buf->iface.memset_tensor != NULL && "memset not supported by backend buffer"); buf->iface.memset_tensor(buf, tensor, value, offset, size); From d13edb17ed1ce3b961016cbdb616b1c8d161c026 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 20 Sep 2024 20:12:52 +0300 Subject: [PATCH 47/51] ggml : fix builds (#0) ggml-ci --- ggml/src/ggml-cuda/out-prod.cu | 1 - ggml/src/ggml-cuda/vendors/hip.h | 1 + ggml/src/ggml-sycl.cpp | 1 + 3 files changed, 2 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-cuda/out-prod.cu b/ggml/src/ggml-cuda/out-prod.cu index 657d50e15..619cfdcb5 
100644 --- a/ggml/src/ggml-cuda/out-prod.cu +++ b/ggml/src/ggml-cuda/out-prod.cu @@ -1,5 +1,4 @@ #include "out-prod.cuh" -#include "vendors/cuda.h" #include diff --git a/ggml/src/ggml-cuda/vendors/hip.h b/ggml/src/ggml-cuda/vendors/hip.h index d0c377255..1f3c70c2e 100644 --- a/ggml/src/ggml-cuda/vendors/hip.h +++ b/ggml/src/ggml-cuda/vendors/hip.h @@ -30,6 +30,7 @@ #define cublasSetStream hipblasSetStream #define cublasSgemm hipblasSgemm #define cublasStatus_t hipblasStatus_t +#define cublasOperation_t hipblasOperation_t #define cudaDataType_t hipblasDatatype_t //deprecated, new hipblasDatatype not in 5.6 #define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer #define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess diff --git a/ggml/src/ggml-sycl.cpp b/ggml/src/ggml-sycl.cpp index 2cf445047..16e6be4a0 100644 --- a/ggml/src/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl.cpp @@ -4735,6 +4735,7 @@ static struct ggml_backend_buffer_i ggml_backend_sycl_split_buffer_interface = { /* .free_buffer = */ ggml_backend_sycl_split_buffer_free_buffer, /* .get_base = */ ggml_backend_sycl_split_buffer_get_base, /* .init_tensor = */ ggml_backend_sycl_split_buffer_init_tensor, + /* .memset_tensor = */ NULL, /* .set_tensor = */ ggml_backend_sycl_split_buffer_set_tensor, /* .get_tensor = */ ggml_backend_sycl_split_buffer_get_tensor, /* .cpy_tensor = */ NULL, From 63351143b2ea5efe9f8b9c61f553af8a51f1deff Mon Sep 17 00:00:00 2001 From: slaren Date: Fri, 20 Sep 2024 20:55:36 +0200 Subject: [PATCH 48/51] quantize : improve type name parsing (#9570) quantize : do not ignore invalid types in arg parsing quantize : ignore case of type and ftype arguments --- examples/quantize/quantize.cpp | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index a23bfb86b..b98993210 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -63,6 +63,16 @@ static const char * const LLM_KV_QUANTIZE_IMATRIX_DATASET = "quantize.imatrix static const char * const LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES = "quantize.imatrix.entries_count"; static const char * const LLM_KV_QUANTIZE_IMATRIX_N_CHUNKS = "quantize.imatrix.chunks_count"; +static bool striequals(const char * a, const char * b) { + while (*a && *b) { + if (std::tolower(*a) != std::tolower(*b)) { + return false; + } + a++; b++; + } + return *a == *b; +} + static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std::string & ftype_str_out) { std::string ftype_str; @@ -70,7 +80,7 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp ftype_str.push_back(std::toupper(ch)); } for (auto & it : QUANT_OPTIONS) { - if (it.name == ftype_str) { + if (striequals(it.name.c_str(), ftype_str.c_str())) { ftype = it.ftype; ftype_str_out = it.name; return true; @@ -225,15 +235,15 @@ static int prepare_imatrix(const std::string & imatrix_file, } static ggml_type parse_ggml_type(const char * arg) { - ggml_type result = GGML_TYPE_COUNT; - for (int j = 0; j < GGML_TYPE_COUNT; ++j) { - auto type = ggml_type(j); + for (int i = 0; i < GGML_TYPE_COUNT; ++i) { + auto type = (ggml_type)i; const auto * name = ggml_type_name(type); - if (name && strcmp(arg, name) == 0) { - result = type; break; + if (name && striequals(name, arg)) { + return type; } } - return result; + fprintf(stderr, "%s: invalid ggml_type '%s'\n", __func__, arg); + return GGML_TYPE_COUNT; } int main(int argc, char ** argv) { @@ -254,12 +264,18 @@ int main(int 
argc, char ** argv) { } else if (strcmp(argv[arg_idx], "--output-tensor-type") == 0) { if (arg_idx < argc-1) { params.output_tensor_type = parse_ggml_type(argv[++arg_idx]); + if (params.output_tensor_type == GGML_TYPE_COUNT) { + usage(argv[0]); + } } else { usage(argv[0]); } } else if (strcmp(argv[arg_idx], "--token-embedding-type") == 0) { if (arg_idx < argc-1) { params.token_embedding_type = parse_ggml_type(argv[++arg_idx]); + if (params.token_embedding_type == GGML_TYPE_COUNT) { + usage(argv[0]); + } } else { usage(argv[0]); } From e948a7da7af7f2dbfdcd695b3ba903ab12575f78 Mon Sep 17 00:00:00 2001 From: Huang Qi Date: Sat, 21 Sep 2024 08:39:41 +0800 Subject: [PATCH 49/51] CI: Provide prebuilt windows binary for hip (#9467) --- .github/workflows/build.yml | 70 +++++++++++++++++++++++++++++++++++-- 1 file changed, 68 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 1777489ec..a54c5de99 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -967,6 +967,7 @@ jobs: name: llama-bin-win-sycl-x64.zip windows-latest-cmake-hip: + if: ${{ github.event.inputs.create_release != 'true' }} runs-on: windows-latest steps: @@ -994,8 +995,72 @@ jobs: run: | $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path) $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}" - cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIPBLAS=ON - cmake --build build --config Release + cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DGGML_RPC=ON + cmake --build build -j ${env:NUMBER_OF_PROCESSORS} + + windows-latest-cmake-hip-release: + if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} + runs-on: windows-latest + + strategy: + matrix: + gpu_target: [gfx1100, gfx1101, gfx1030] + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v4 + + - name: Install + id: depends + run: | + $ErrorActionPreference = "Stop" + write-host "Downloading AMD HIP SDK Installer" + Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe" + write-host "Installing AMD HIP SDK" + Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait + write-host "Completed AMD HIP SDK installation" + + - name: Verify ROCm + id: verify + run: | + & 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version + + - name: Build + id: cmake_build + run: | + $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path) + $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}" + cmake -G "Unix Makefiles" -B build -S . 
-DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DGPU_TARGETS=${{ matrix.gpu_target }} -DGGML_RPC=ON + cmake --build build -j ${env:NUMBER_OF_PROCESSORS} + md "build\bin\rocblas\library\" + cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\" + cp "${env:HIP_PATH}\bin\rocblas.dll" "build\bin\" + cp "${env:HIP_PATH}\bin\rocblas\library\*" "build\bin\rocblas\library\" + + - name: Determine tag name + id: tag + shell: bash + run: | + BUILD_NUMBER="$(git rev-list --count HEAD)" + SHORT_HASH="$(git rev-parse --short=7 HEAD)" + if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then + echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT + else + SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-') + echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT + fi + + - name: Pack artifacts + id: pack_artifacts + run: | + 7z a llama-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip .\build\bin\* + + - name: Upload artifacts + uses: actions/upload-artifact@v4 + with: + path: llama-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip + name: llama-bin-win-hip-x64-${{ matrix.gpu_target }}.zip ios-xcode-build: runs-on: macos-latest @@ -1060,6 +1125,7 @@ jobs: - macOS-latest-cmake - windows-latest-cmake - windows-latest-cmake-cuda + - windows-latest-cmake-hip-release - macOS-latest-cmake-arm64 - macOS-latest-cmake-x64 From 41f477879fd5ccc31211634292e8e293ec700e85 Mon Sep 17 00:00:00 2001 From: agray3 Date: Sat, 21 Sep 2024 01:41:07 +0100 Subject: [PATCH 50/51] Update CUDA graph on scale change plus clear nodes/params (#9550) * Avoid using saved CUDA graph if scale changes and reset nodes/params on update Fixes https://github.com/ggerganov/llama.cpp/issues/9451 * clear before resize --- ggml/src/ggml-cuda.cu | 9 +++++++++ ggml/src/ggml-cuda/common.cuh | 1 + 2 files changed, 10 insertions(+) diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu index b0843dc62..895ba4794 100644 --- a/ggml/src/ggml-cuda.cu +++ b/ggml/src/ggml-cuda.cu @@ -2478,6 +2478,7 @@ static void set_ggml_graph_node_properties(ggml_tensor * node, ggml_graph_node_p for (int i = 0; i < GGML_MAX_SRC; i++) { graph_node_properties->src_address[i] = node->src[i] ? 
node->src[i]->data : nullptr; } + memcpy(graph_node_properties->op_params, node->op_params, GGML_MAX_OP_PARAMS); } static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_graph_node_properties * graph_node_properties) { @@ -2509,6 +2510,12 @@ static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_gra return false; } } + + if (node->op == GGML_OP_SCALE && + memcmp(graph_node_properties->op_params, node->op_params, GGML_MAX_OP_PARAMS) != 0) { + return false; + } + return true; } @@ -2720,7 +2727,9 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t // First call with null argument gets number of nodes in graph CUDA_CHECK(cudaGraphGetNodes(cuda_ctx->cuda_graph->graph, nullptr, &cuda_ctx->cuda_graph->num_nodes)); // Subsequent call with non-null argument gets nodes + cuda_ctx->cuda_graph->nodes.clear(); cuda_ctx->cuda_graph->nodes.resize(cuda_ctx->cuda_graph->num_nodes); + cuda_ctx->cuda_graph->params.clear(); cuda_ctx->cuda_graph->params.resize(cuda_ctx->cuda_graph->num_nodes); if (cuda_ctx->cuda_graph->num_nodes > 0) { CUDA_CHECK(cudaGraphGetNodes(cuda_ctx->cuda_graph->graph, cuda_ctx->cuda_graph->nodes.data(), &cuda_ctx->cuda_graph->num_nodes)); diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index eb39b6d23..85eb200f0 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -569,6 +569,7 @@ struct ggml_graph_node_properties { int64_t ne[GGML_MAX_DIMS]; size_t nb[GGML_MAX_DIMS]; void * src_address[GGML_MAX_SRC]; + int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)]; }; struct ggml_cuda_graph { From d09770cae71b416c032ec143dda530f7413c4038 Mon Sep 17 00:00:00 2001 From: slaren Date: Sat, 21 Sep 2024 14:24:23 +0200 Subject: [PATCH 51/51] ggml-alloc : fix list of allocated tensors with GGML_ALLOCATOR_DEBUG (#9573) --- ggml/src/ggml-alloc.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c index e485326ab..70187b9b6 100644 --- a/ggml/src/ggml-alloc.c +++ b/ggml/src/ggml-alloc.c @@ -294,6 +294,12 @@ static void ggml_dyn_tallocr_reset(struct ggml_dyn_tallocr * alloc) { alloc->free_blocks[0].offset = 0; alloc->free_blocks[0].size = SIZE_MAX/2; // restrict maximum size of a measure allocator to half size_t max to avoid overflows alloc->max_size = 0; + +#ifdef GGML_ALLOCATOR_DEBUG + for (int i = 0; i < 1024; i++) { + alloc->allocated_tensors[i].tensor = NULL; + } +#endif } static struct ggml_dyn_tallocr * ggml_dyn_tallocr_new(size_t alignment) {
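
The op_params comparison added in the CUDA-graph patch above is what lets a previously captured graph be invalidated when only an operation parameter changes (for example the scale factor carried in the op_params of GGML_OP_SCALE), even though the tensor addresses and shapes are unchanged. The following is a minimal, self-contained sketch of that caching pattern, not the ggml API: the NodeProps/CachedGraph names and the 64-byte parameter block are illustrative assumptions that merely mirror GGML_MAX_OP_PARAMS.

    #include <array>
    #include <cstdio>
    #include <cstring>
    #include <vector>

    // Hypothetical stand-in for ggml's per-node graph properties; the real
    // struct also stores ne/nb and the source-tensor addresses.
    struct NodeProps {
        const void * data = nullptr;
        std::array<unsigned char, 64> op_params {};   // mirrors GGML_MAX_OP_PARAMS bytes

        bool operator==(const NodeProps & other) const {
            return data == other.data &&
                   std::memcmp(op_params.data(), other.op_params.data(), op_params.size()) == 0;
        }
    };

    struct CachedGraph {
        std::vector<NodeProps> props;

        // The saved graph may only be reused if every node still has the same
        // properties, including its op params (e.g. the scale value).
        bool matches(const std::vector<NodeProps> & current) const {
            return props == current;
        }

        void update(const std::vector<NodeProps> & current) {
            props.clear();   // clear before re-filling, echoing the clear-before-resize above
            props = current;
        }
    };

    int main() {
        CachedGraph cache;
        std::vector<NodeProps> run1(1);
        cache.update(run1);

        std::vector<NodeProps> run2 = run1;
        run2[0].op_params[0] = 1;   // an op parameter changed between runs
        std::printf("reuse saved graph: %s\n", cache.matches(run2) ? "yes" : "no");   // prints "no"
    }

The same reasoning applies to the GGML_ALLOCATOR_DEBUG fix in the last patch: any cached bookkeeping that mirrors another structure (here the stored node properties, there the allocated_tensors array) has to be cleared whenever that structure is reset, otherwise stale entries produce false matches or false leak reports.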