ggml : add support for dynamic loading of backends (#10469)

* ggml : add support for dynamic loading of backends --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-11-25 15:13:39 +01:00 · 2024-11-25 15:13:39 +01:00 · 5931c1f233
commit 5931c1f233
parent f6d12e7df8
44 changed files with 728 additions and 272 deletions
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@ -16,7 +16,6 @@


 #include <ggml.h>
-#include <ggml-cpu.h>
 #include <ggml-alloc.h>
 #include <ggml-backend.h>

@ -26,7 +25,6 @@
 #include <cstdint>
 #include <cstring>
 #include <cinttypes>
-#include <functional>
 #include <memory>
 #include <random>
 #include <stdio.h>
@ -639,19 +637,20 @@ struct test_case {

        // determine number of runs
        int n_runs;
+        bool is_cpu = ggml_backend_dev_type(ggml_backend_get_device(backend)) == GGML_BACKEND_DEVICE_TYPE_CPU;
        if (op_flops(out) > 0) {
            // based on flops
            const uint64_t GFLOP = 1000 * 1000 * 1000;
            const uint64_t target_flops_cpu =   8ULL * GFLOP;
            const uint64_t target_flops_gpu = 100ULL * GFLOP;
-            uint64_t target_flops = ggml_backend_is_cpu(backend) ? target_flops_cpu : target_flops_gpu;
+            uint64_t target_flops = is_cpu ? target_flops_cpu : target_flops_gpu;
            n_runs = std::min<int>(ggml_graph_size(gf) - ggml_graph_n_nodes(gf), target_flops / op_flops(out)) + 1;
        } else {
            // based on memory size
            const size_t GB = 1ULL << 30;
            const size_t target_size_cpu =  8 * GB;
            const size_t target_size_gpu = 32 * GB;
-            size_t target_size = ggml_backend_is_cpu(backend) ? target_size_cpu : target_size_gpu;
+            size_t target_size = is_cpu ? target_size_cpu : target_size_gpu;
            n_runs = std::min<int>(ggml_graph_size(gf) - ggml_graph_n_nodes(gf), target_size / op_size(out)) + 1;
        }

@ -3873,7 +3872,11 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
 static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op_name) {
    if (mode == MODE_TEST) {
        auto test_cases = make_test_cases_eval();
-        ggml_backend_t backend_cpu = ggml_backend_cpu_init();
+        ggml_backend_t backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, NULL);
+        if (backend_cpu == NULL) {
+            printf("  Failed to initialize CPU backend\n");
+            return false;
+        }

        size_t n_ok = 0;
        for (auto & test : test_cases) {
@ -3953,7 +3956,9 @@ int main(int argc, char ** argv) {
        }
    }

-    // enumerate backends
+    // load and enumerate backends
+    ggml_backend_load_all();
+
    printf("Testing %zu devices\n\n", ggml_backend_dev_count());

    size_t n_ok = 0;
@ -3969,16 +3974,15 @@ int main(int argc, char ** argv) {
            continue;
        }

-        ggml_backend_t backend = ggml_backend_dev_init(dev, NULL);
-        GGML_ASSERT(backend != NULL);
-
-        if (backend_filter == NULL && ggml_backend_is_cpu(backend) && mode != MODE_GRAD) {
+        if (backend_filter == NULL && ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU && mode != MODE_GRAD) {
            printf("  Skipping CPU backend\n");
-            ggml_backend_free(backend);
            n_ok++;
            continue;
        }

+        ggml_backend_t backend = ggml_backend_dev_init(dev, NULL);
+        GGML_ASSERT(backend != NULL);
+
        ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
        auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads");
        if (ggml_backend_set_n_threads_fn) {