ggml : add support for dynamic loading of backends (#10469)

* ggml : add support for dynamic loading of backends

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-11-25 15:13:39 +01:00 · 2024-11-25 15:13:39 +01:00 · 5931c1f233
commit 5931c1f233
parent f6d12e7df8
44 changed files with 728 additions and 272 deletions
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -12,13 +12,10 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR})

 if (EMSCRIPTEN)
 else()
-    add_subdirectory(cvector-generator)
    add_subdirectory(batched-bench)
    add_subdirectory(batched)
-    add_subdirectory(convert-llama2c-to-ggml)
    add_subdirectory(embedding)
    add_subdirectory(eval-callback)
-    add_subdirectory(export-lora)
    add_subdirectory(gbnf-validator)
    add_subdirectory(gguf-hash)
    add_subdirectory(gguf-split)
@@ -27,24 +24,16 @@ else()
    add_subdirectory(imatrix)
    add_subdirectory(infill)
    add_subdirectory(llama-bench)
-    add_subdirectory(llava)
    add_subdirectory(lookahead)
    add_subdirectory(lookup)
    add_subdirectory(main)
    add_subdirectory(parallel)
    add_subdirectory(passkey)
    add_subdirectory(perplexity)
-    add_subdirectory(quantize-stats)
    add_subdirectory(quantize)
    add_subdirectory(retrieval)
-    if (GGML_RPC)
-        add_subdirectory(rpc)
-    endif()
    if (LLAMA_BUILD_SERVER)
-    add_subdirectory(server)
-    endif()
-    if (GGML_SYCL)
-        add_subdirectory(sycl)
+        add_subdirectory(server)
    endif()
    add_subdirectory(save-load-state)
    add_subdirectory(simple)
@@ -52,4 +41,18 @@ else()
    add_subdirectory(speculative)
    add_subdirectory(speculative-simple)
    add_subdirectory(tokenize)
+    if (NOT GGML_BACKEND_DL)
+        # these examples use the backends directly and cannot be built with dynamic loading
+        add_subdirectory(convert-llama2c-to-ggml)
+        add_subdirectory(cvector-generator)
+        add_subdirectory(export-lora)
+        add_subdirectory(quantize-stats)
+        add_subdirectory(llava)
+        if (GGML_RPC)
+            add_subdirectory(rpc)
+        endif()
+        if (GGML_SYCL)
+            add_subdirectory(sycl)
+        endif()
+    endif()
 endif()
--- a/examples/eval-callback/CMakeLists.txt
+++ b/examples/eval-callback/CMakeLists.txt
@@ -5,5 +5,6 @@ target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)

 set(TEST_TARGET test-eval-callback)
-add_test(NAME ${TEST_TARGET} COMMAND llama-eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0)
+add_test(NAME ${TEST_TARGET}
+        COMMAND llama-eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0)
 set_property(TEST ${TEST_TARGET} PROPERTY LABELS eval-callback curl)
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -1477,6 +1477,17 @@ int main(int argc, char ** argv) {

    cmd_params params = parse_cmd_params(argc, argv);

+    // initialize backends
+    ggml_backend_load_all();
+    auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+    if (!cpu_dev) {
+        fprintf(stderr, "%s: error: CPU backend is not loaded\n", __func__);
+        return 1;
+    }
+    auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
+    auto * ggml_threadpool_new_fn = (decltype(ggml_threadpool_new) *) ggml_backend_reg_get_proc_address(cpu_reg, "ggml_threadpool_new");
+    auto * ggml_threadpool_free_fn = (decltype(ggml_threadpool_free) *) ggml_backend_reg_get_proc_address(cpu_reg, "ggml_threadpool_free");
+
    // initialize llama.cpp
    if (!params.verbose) {
        llama_log_set(llama_null_log_callback, NULL);
@@ -1551,7 +1562,7 @@ int main(int argc, char ** argv) {
        tpp.poll       = t.poll;
        tpp.prio       = params.prio;

-        struct ggml_threadpool * threadpool = ggml_threadpool_new(&tpp);
+        struct ggml_threadpool * threadpool = ggml_threadpool_new_fn(&tpp);
        if (!threadpool) {
            fprintf(stderr, "%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
            exit(1);
@@ -1612,7 +1623,7 @@ int main(int argc, char ** argv) {

        llama_free(ctx);

-        ggml_threadpool_free(threadpool);
+        ggml_threadpool_free_fn(threadpool);
    }

    llama_free_model(lmodel);
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -165,6 +165,10 @@ int main(int argc, char ** argv) {

    LOG_INF("%s: llama threadpool init, n_threads = %d\n", __func__, (int) params.cpuparams.n_threads);

+    auto * reg = ggml_backend_dev_backend_reg(ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU));
+    auto * ggml_threadpool_new_fn = (decltype(ggml_threadpool_new) *) ggml_backend_reg_get_proc_address(reg, "ggml_threadpool_new");
+    auto * ggml_threadpool_free_fn = (decltype(ggml_threadpool_free) *) ggml_backend_reg_get_proc_address(reg, "ggml_threadpool_free");
+
    struct ggml_threadpool_params tpp_batch =
            ggml_threadpool_params_from_cpu_params(params.cpuparams_batch);
    struct ggml_threadpool_params tpp =
@@ -174,7 +178,7 @@ int main(int argc, char ** argv) {

    struct ggml_threadpool * threadpool_batch = NULL;
    if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) {
-        threadpool_batch = ggml_threadpool_new(&tpp_batch);
+        threadpool_batch = ggml_threadpool_new_fn(&tpp_batch);
        if (!threadpool_batch) {
            LOG_ERR("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads);
            return 1;
@@ -184,7 +188,7 @@ int main(int argc, char ** argv) {
        tpp.paused = true;
    }

-    struct ggml_threadpool * threadpool = ggml_threadpool_new(&tpp);
+    struct ggml_threadpool * threadpool = ggml_threadpool_new_fn(&tpp);
    if (!threadpool) {
        LOG_ERR("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
        return 1;
@@ -890,8 +894,8 @@ int main(int argc, char ** argv) {

    llama_backend_free();

-    ggml_threadpool_free(threadpool);
-    ggml_threadpool_free(threadpool_batch);
+    ggml_threadpool_free_fn(threadpool);
+    ggml_threadpool_free_fn(threadpool_batch);

    return 0;
 }
--- a/examples/simple-chat/simple-chat.cpp
+++ b/examples/simple-chat/simple-chat.cpp
@@ -62,6 +62,9 @@ int main(int argc, char ** argv) {
        }
    }, nullptr);

+    // load dynamic backends
+    ggml_backend_load_all();
+
    // initialize the model
    llama_model_params model_params = llama_model_default_params();
    model_params.n_gpu_layers = ngl;
--- a/examples/simple/simple.cpp
+++ b/examples/simple/simple.cpp
@@ -74,6 +74,10 @@ int main(int argc, char ** argv) {
        }
    }

+    // load dynamic backends
+
+    ggml_backend_load_all();
+
    // initialize the model

    llama_model_params model_params = llama_model_default_params();