Merge remote-tracking branch 'upstream/master'

Commit: cf6b987be3
Author: shanshan shen
Date:   2024-11-26 07:10:23 +00:00

69 changed files with 2029 additions and 831 deletions


@@ -24,7 +24,8 @@ body:
   - type: dropdown
     id: operating-system
     attributes:
-      label: Which operating systems do you know to be affected?
+      label: Operating systems
+      description: Which operating systems do you know to be affected?
       multiple: true
       options:
         - Linux
@@ -41,14 +42,17 @@
       description: Which GGML backends do you know to be affected?
       options: [AMX, BLAS, CPU, CUDA, HIP, Kompute, Metal, Musa, RPC, SYCL, Vulkan]
       multiple: true
+    validations:
+      required: true
   - type: textarea
-    id: steps_to_reproduce
+    id: info
     attributes:
-      label: Steps to Reproduce
+      label: Problem description & steps to reproduce
       description: >
-        Please tell us how to reproduce the bug and any additional information that you think could be useful for fixing it.
+        Please give us a summary of the problem and tell us how to reproduce it.
         If you can narrow down the bug to specific compile flags, that information would be very much appreciated by us.
       placeholder: >
+        I'm trying to compile llama.cpp with CUDA support on a fresh install of Ubuntu and get error XY.
         Here are the exact commands that I used: ...
     validations:
       required: true


@@ -26,7 +26,8 @@ body:
   - type: dropdown
     id: operating-system
     attributes:
-      label: Which operating systems do you know to be affected?
+      label: Operating systems
+      description: Which operating systems do you know to be affected?
       multiple: true
       options:
         - Linux
@@ -43,6 +44,8 @@
       description: Which GGML backends do you know to be affected?
       options: [AMX, BLAS, CPU, CUDA, HIP, Kompute, Metal, Musa, RPC, SYCL, Vulkan]
       multiple: true
+    validations:
+      required: true
   - type: textarea
     id: hardware
     attributes:
@@ -55,20 +58,20 @@
   - type: textarea
     id: model
     attributes:
-      label: Model
+      label: Models
      description: >
-        Which model at which quantization were you using when encountering the bug?
+        Which model(s) at which quantization were you using when encountering the bug?
         If you downloaded a GGUF file off of Huggingface, please provide a link.
       placeholder: >
         e.g. Meta LLaMA 3.1 Instruct 8b q4_K_M
     validations:
       required: false
   - type: textarea
-    id: steps_to_reproduce
+    id: info
     attributes:
-      label: Steps to Reproduce
+      label: Problem description & steps to reproduce
       description: >
-        Please tell us how to reproduce the bug and any additional information that you think could be useful for fixing it.
+        Please give us a summary of the problem and tell us how to reproduce it.
         If you can narrow down the bug to specific hardware, compile flags, or command line arguments,
         that information would be very much appreciated by us.
       placeholder: >


@@ -14,7 +14,7 @@ body:
     id: version
     attributes:
       label: Name and Version
-      description: Which version of our software are you running? (use `--version` to get a version string)
+      description: Which version of our software is affected? (You can use `--version` to get a version string.)
       placeholder: |
         $./llama-cli --version
         version: 2999 (42b4109e)
@@ -24,7 +24,8 @@ body:
   - type: dropdown
     id: operating-system
     attributes:
-      label: Which operating systems do you know to be affected?
+      label: Operating systems
+      description: Which operating systems do you know to be affected?
       multiple: true
       options:
         - Linux
@@ -33,28 +34,30 @@
         - BSD
         - Other? (Please let us know in description)
     validations:
-      required: true
+      required: false
   - type: dropdown
     id: module
     attributes:
       label: Which llama.cpp modules do you know to be affected?
       multiple: true
       options:
+        - Documentation/Github
         - libllama (core library)
         - llama-cli
         - llama-server
         - llama-bench
         - llama-quantize
         - Python/Bash scripts
+        - Test code
         - Other (Please specify in the next section)
     validations:
-      required: true
+      required: false
   - type: textarea
-    id: steps_to_reproduce
+    id: info
     attributes:
-      label: Steps to Reproduce
+      label: Problem description & steps to reproduce
       description: >
-        Please tell us how to reproduce the bug and any additional information that you think could be useful for fixing it.
+        Please give us a summary of the problem and tell us how to reproduce it (if applicable).
     validations:
       required: true
   - type: textarea
@@ -62,7 +65,7 @@
     attributes:
       label: First Bad Commit
       description: >
-        If the bug was not present on an earlier version: when did it start appearing?
+        If the bug was not present on an earlier version and it's not trivial to track down: when did it start appearing?
         If possible, please do a git bisect and identify the exact commit that introduced the bug.
     validations:
       required: false
@@ -71,8 +74,8 @@
     attributes:
       label: Relevant log output
       description: >
-        Please copy and paste any relevant log output, including the command that you entered and any generated text.
+        If applicable, please copy and paste any relevant log output, including the command that you entered and any generated text.
         This will be automatically formatted into code, so no need for backticks.
       render: shell
     validations:
-      required: true
+      required: false


@@ -952,7 +952,7 @@ jobs:
     env:
       WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b380d914-366b-4b77-a74a-05e3c38b3514/intel-oneapi-base-toolkit-2025.0.0.882_offline.exe
-      WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel
+      WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
       ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
     steps:
       - name: Clone
@@ -962,7 +962,8 @@ jobs:
           fetch-depth: 0
       - name: Install
-        run: scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
+        run: |
+          scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
       - name: Build
         id: cmake_build
@@ -981,27 +982,34 @@ jobs:
            echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
          fi
-      - name: Pack artifacts
+      - name: Build the release package
        id: pack_artifacts
-        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        if: ${{ ( github.event_name == 'pull_request' && github.base_ref == 'master' ) }}
        run: |
          echo "cp oneAPI running time dll files in ${{ env.ONEAPI_ROOT }} to ./build/bin"
          cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.5.dll" ./build/bin
          cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_core.2.dll" ./build/bin
          cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_tbb_thread.2.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_opencl.dll" ./build/bin
          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_loader.dll" ./build/bin
          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_win_proxy_loader.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero.dll" ./build/bin
          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl8.dll" ./build/bin
          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin
          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin
          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/dnnl/latest/bin/dnnl.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/tbb/latest/bin/tbb12.dll" ./build/bin
          echo "cp oneAPI running time dll files to ./build/bin done"
          7z a llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip ./build/bin/*
-      - name: Upload artifacts
-        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+      - name: Upload the release package
+        if: ${{ ( github.event_name == 'pull_request' && github.base_ref == 'master' ) }}
        uses: actions/upload-artifact@v4
        with:
          path: llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip


@@ -10,12 +10,10 @@
 name: Publish Docker image

 on:
-  #pull_request:
-  push:
-    branches:
-      - master
-    paths: ['.github/workflows/docker.yml', '.devops/*.Dockerfile', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal']
-  workflow_dispatch: # allows manual triggering, useful for debugging
+  workflow_dispatch: # allows manual triggering
+  schedule:
+    # Rebuild daily rather than on every push because it is expensive
+    - cron: '12 4 * * *'

 concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
@@ -29,7 +27,6 @@ permissions:
 jobs:
   push_to_registry:
     name: Push Docker image to Docker Hub
-    #if: github.event.pull_request.draft == false
     runs-on: ubuntu-latest
     env:


@@ -163,8 +163,11 @@ if (GGML_TARGET_DEFINES)
     list(APPEND GGML_TRANSIENT_DEFINES ${GGML_TARGET_DEFINES})
 endif()
 get_target_property(GGML_LINK_LIBRARIES ggml LINK_LIBRARIES)
-set_target_properties(llama PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/include/llama.h)
+# all public headers
+set(LLAMA_PUBLIC_HEADERS
+    ${CMAKE_CURRENT_SOURCE_DIR}/include/llama.h
+    ${CMAKE_CURRENT_SOURCE_DIR}/include/llama-cpp.h)
+set_target_properties(llama PROPERTIES PUBLIC_HEADER "${LLAMA_PUBLIC_HEADERS}")
 install(TARGETS llama LIBRARY PUBLIC_HEADER)

 configure_package_config_file(

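For orientation, a minimal sketch of exercising the public-header install rule above; the build directory and install prefix are illustrative:

```bash
# configure, build and install the llama target; with this change both llama.h
# and llama-cpp.h should end up under <prefix>/include
cmake -B build
cmake --build build --target llama -j
cmake --install build --prefix ./llama-install
ls ./llama-install/include
```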

@@ -34,6 +34,7 @@ BUILD_TARGETS = \
 	llama-server \
 	llama-simple \
 	llama-simple-chat \
+	llama-run \
 	llama-speculative \
 	llama-tokenize \
 	llama-vdot \
@@ -251,7 +252,7 @@ endif
 #
 # keep standard at C11 and C++11
-MK_CPPFLAGS = -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon
+MK_CPPFLAGS = -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -DGGML_USE_CPU
 MK_CFLAGS = -std=c11 -fPIC
 MK_CXXFLAGS = -std=c++11 -fPIC
 MK_NVCCFLAGS = -std=c++11
@@ -290,6 +291,7 @@ endif
 # some memory allocation are available on Linux through GNU extensions in libc
 ifeq ($(UNAME_S),Linux)
 	MK_CPPFLAGS += -D_GNU_SOURCE
+	MK_LDFLAGS += -ldl
 endif

 # RLIMIT_MEMLOCK came in BSD, is not specified in POSIX.1,
@@ -1166,6 +1168,11 @@ llama-infill: examples/infill/infill.cpp \
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

+llama-run: examples/run/run.cpp \
+	$(OBJ_ALL)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
 llama-simple: examples/simple/simple.cpp \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)

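As a quick reference, a sketch of building just the new target through the GNU Make build (job count is illustrative):

```bash
# build only the new llama-run example
make llama-run -j8
```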

@@ -43,7 +43,8 @@ linkerSettings.append(.linkedFramework("Accelerate"))
 cSettings.append(
     contentsOf: [
         .define("GGML_USE_ACCELERATE"),
-        .define("GGML_USE_METAL")
+        .define("GGML_USE_METAL"),
+        .define("GGML_USE_CPU")
     ]
 )
 #endif


@@ -298,6 +298,27 @@ static void common_params_print_usage(common_params_context & ctx_arg) {
     print_options(specific_options);
 }

+static std::vector<ggml_backend_dev_t> parse_device_list(const std::string & value) {
+    std::vector<ggml_backend_dev_t> devices;
+    auto dev_names = string_split<std::string>(value, ',');
+    if (dev_names.empty()) {
+        throw std::invalid_argument("no devices specified");
+    }
+    if (dev_names.size() == 1 && dev_names[0] == "none") {
+        devices.push_back(nullptr);
+    } else {
+        for (const auto & device : dev_names) {
+            auto * dev = ggml_backend_dev_by_name(device.c_str());
+            if (!dev || ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_GPU) {
+                throw std::invalid_argument(string_format("invalid device: %s", device.c_str()));
+            }
+            devices.push_back(dev);
+        }
+        devices.push_back(nullptr);
+    }
+    return devices;
+}
+
 bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
     auto ctx_arg = common_params_parser_init(params, ex, print_usage);
     const common_params params_org = ctx_arg.params; // the example can modify the default params
@@ -324,6 +345,9 @@ bool common_params_parse(int argc, char ** argv, common_params & params, llama_e
 }

 common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
+    // load dynamic backends
+    ggml_backend_load_all();
+
     common_params_context ctx_arg(params);
     ctx_arg.print_usage = print_usage;
     ctx_arg.ex = ex;
@@ -1312,6 +1336,30 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         else { throw std::invalid_argument("invalid value"); }
         }
     ).set_env("LLAMA_ARG_NUMA"));
+    add_opt(common_arg(
+        {"-dev", "--device"}, "<dev1,dev2,..>",
+        "comma-separated list of devices to use for offloading (none = don't offload)\n"
+        "use --list-devices to see a list of available devices",
+        [](common_params & params, const std::string & value) {
+            params.devices = parse_device_list(value);
+        }
+    ).set_env("LLAMA_ARG_DEVICE"));
+    add_opt(common_arg(
+        {"--list-devices"},
+        "print list of available devices and exit",
+        [](common_params &) {
+            printf("Available devices:\n");
+            for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
+                auto * dev = ggml_backend_dev_get(i);
+                if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) {
+                    size_t free, total;
+                    ggml_backend_dev_memory(dev, &free, &total);
+                    printf("  %s: %s (%zu MiB, %zu MiB free)\n", ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), total / 1024 / 1024, free / 1024 / 1024);
+                }
+            }
+            exit(0);
+        }
+    ));
     add_opt(common_arg(
         {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
         "number of layers to store in VRAM",
@@ -1336,10 +1384,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             } else if (arg_next == "layer") {
                 params.split_mode = LLAMA_SPLIT_MODE_LAYER;
             } else if (arg_next == "row") {
-#ifdef GGML_USE_SYCL
-                fprintf(stderr, "warning: The split mode value:[row] is not supported by llama.cpp with SYCL. It's developing.\nExit!\n");
-                exit(1);
-#endif // GGML_USE_SYCL
                 params.split_mode = LLAMA_SPLIT_MODE_ROW;
             } else {
                 throw std::invalid_argument("invalid value");
@@ -2042,6 +2086,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.speculative.n_ctx = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+    add_opt(common_arg(
+        {"-devd", "--device-draft"}, "<dev1,dev2,..>",
+        "comma-separated list of devices to use for offloading the draft model (none = don't offload)\n"
+        "use --list-devices to see a list of available devices",
+        [](common_params & params, const std::string & value) {
+            params.speculative.devices = parse_device_list(value);
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N",
         "number of layers to store in VRAM for the draft model",

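A hedged usage sketch for the new device-selection flags added above; the device names (e.g. CUDA0, CUDA1) are illustrative and should be taken from the output of --list-devices, and -md is the existing draft-model flag:

```bash
# print the GPU devices exposed by the loaded backends, then exit
./llama-cli --list-devices

# offload the main model to one device ("none" disables offloading entirely)
./llama-cli -m model.gguf -ngl 99 --device CUDA0

# place the draft model for speculative decoding on a second device
./llama-speculative -m target.gguf -md draft.gguf -ngl 99 -ngld 99 --device CUDA0 --device-draft CUDA1
```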

@@ -979,9 +979,12 @@ void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_l
     }
 }

-struct llama_model_params common_model_params_to_llama(const common_params & params) {
+struct llama_model_params common_model_params_to_llama(common_params & params) {
     auto mparams = llama_model_default_params();

+    if (!params.devices.empty()) {
+        mparams.devices = params.devices.data();
+    }
+
     if (params.n_gpu_layers != -1) {
         mparams.n_gpu_layers = params.n_gpu_layers;
     }


@@ -156,6 +156,7 @@ struct common_params_sampling {
 };

 struct common_params_speculative {
+    std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
     int32_t n_ctx = 0; // draft context size
     int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding
     int32_t n_min = 5; // minimum number of draft tokens to use for speculative decoding
@@ -178,9 +179,6 @@ struct common_params {
     int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
     int32_t n_parallel = 1; // number of parallel sequences to decode
     int32_t n_sequences = 1; // number of sequences to decode
-    int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
-    int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
-    float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
     int32_t grp_attn_n = 1; // group-attention factor
     int32_t grp_attn_w = 512; // group-attention width
     int32_t n_print = -1; // print token count every n tokens (-1 = disabled)
@@ -193,6 +191,13 @@ struct common_params {
     int32_t yarn_orig_ctx = 0; // YaRN original context length
     float defrag_thold = 0.1f; // KV cache defragmentation threshold

+    // offload params
+    std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
+    int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
+    int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
+    float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
+    enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
+
     struct cpu_params cpuparams;
     struct cpu_params cpuparams_batch;
@@ -201,7 +206,6 @@ struct common_params {
     ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;

-    enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
     enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
     enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
     enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
@@ -462,7 +466,7 @@ struct common_init_result {

 struct common_init_result common_init_from_params(common_params & params);

-struct llama_model_params common_model_params_to_llama (const common_params & params);
+struct llama_model_params common_model_params_to_llama (      common_params & params);
 struct llama_context_params common_context_params_to_llama(const common_params & params);
 struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);


@@ -90,9 +90,10 @@ bool common_speculative_are_compatible(
     if (llama_add_bos_token(model_tgt) != llama_add_bos_token(model_dft) ||
         llama_add_eos_token(model_tgt) != llama_add_eos_token(model_dft) ||
         llama_token_bos(model_tgt) != llama_token_bos(model_dft) ||
-        llama_token_eos(model_tgt) != llama_token_eos(model_dft)
-    ) {
+        llama_token_eos(model_tgt) != llama_token_eos(model_dft)) {
         LOG_ERR("%s: draft model special tokens must match target model to use speculation\n", __func__);
+        LOG_ERR("%s: tgt: bos = %d (%d), eos = %d (%d)\n", __func__, llama_token_bos(model_tgt), llama_add_bos_token(model_tgt), llama_token_eos(model_tgt), llama_add_eos_token(model_tgt));
+        LOG_ERR("%s: dft: bos = %d (%d), eos = %d (%d)\n", __func__, llama_token_bos(model_dft), llama_add_bos_token(model_dft), llama_token_eos(model_dft), llama_add_eos_token(model_dft));
         return false;
     }


@@ -3040,9 +3040,9 @@ class OlmoModel(Model):
         return [(self.map_tensor_name(name), data_torch)]

-@Model.register("Olmo1124ForCausalLM")
-class Olmo1124Model(Model):
-    model_arch = gguf.MODEL_ARCH.OLMO_1124
+@Model.register("Olmo2ForCausalLM")
+class Olmo2Model(Model):
+    model_arch = gguf.MODEL_ARCH.OLMO2

 @Model.register("OlmoeForCausalLM")

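For context, a sketch of invoking the converter on an OLMo 2 checkpoint after the rename; the local path and output name are illustrative:

```bash
# convert a local OLMo 2 Hugging Face checkpoint (architecture Olmo2ForCausalLM) to GGUF
python convert_hf_to_gguf.py ./OLMo-2-1124-7B --outfile olmo2-7b-f16.gguf --outtype f16
```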

@@ -12,13 +12,10 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR})
 if (EMSCRIPTEN)
 else()
-    add_subdirectory(cvector-generator)
     add_subdirectory(batched-bench)
     add_subdirectory(batched)
-    add_subdirectory(convert-llama2c-to-ggml)
     add_subdirectory(embedding)
     add_subdirectory(eval-callback)
-    add_subdirectory(export-lora)
     add_subdirectory(gbnf-validator)
     add_subdirectory(gguf-hash)
     add_subdirectory(gguf-split)
@@ -27,29 +24,36 @@ else()
     add_subdirectory(imatrix)
     add_subdirectory(infill)
     add_subdirectory(llama-bench)
-    add_subdirectory(llava)
     add_subdirectory(lookahead)
     add_subdirectory(lookup)
     add_subdirectory(main)
     add_subdirectory(parallel)
     add_subdirectory(passkey)
     add_subdirectory(perplexity)
-    add_subdirectory(quantize-stats)
     add_subdirectory(quantize)
     add_subdirectory(retrieval)
-    if (GGML_RPC)
-        add_subdirectory(rpc)
-    endif()
     if (LLAMA_BUILD_SERVER)
         add_subdirectory(server)
     endif()
-    if (GGML_SYCL)
-        add_subdirectory(sycl)
-    endif()
     add_subdirectory(save-load-state)
+    add_subdirectory(run)
     add_subdirectory(simple)
     add_subdirectory(simple-chat)
     add_subdirectory(speculative)
     add_subdirectory(speculative-simple)
     add_subdirectory(tokenize)
+    if (NOT GGML_BACKEND_DL)
+        # these examples use the backends directly and cannot be built with dynamic loading
+        add_subdirectory(convert-llama2c-to-ggml)
+        add_subdirectory(cvector-generator)
+        add_subdirectory(export-lora)
+        add_subdirectory(quantize-stats)
+        add_subdirectory(llava)
+        if (GGML_RPC)
+            add_subdirectory(rpc)
+        endif()
+        if (GGML_SYCL)
+            add_subdirectory(sycl)
+        endif()
+    endif()
 endif()

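A configuration sketch for the flag that gates the block above; with dynamic backend loading enabled, the examples that link backends directly are skipped:

```bash
# configure with dynamically loadable backends; convert-llama2c-to-ggml,
# cvector-generator, export-lora, quantize-stats, llava, rpc and sycl are then excluded
cmake -B build -DGGML_BACKEND_DL=ON
cmake --build build -j
```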

@@ -5,5 +5,6 @@ target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)

 set(TEST_TARGET test-eval-callback)
-add_test(NAME ${TEST_TARGET} COMMAND llama-eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0)
+add_test(NAME ${TEST_TARGET}
+         COMMAND llama-eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0)
 set_property(TEST ${TEST_TARGET} PROPERTY LABELS eval-callback curl)

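A small sketch of running the relabelled test through CTest (run from the build directory; the label comes from the set_property line above):

```bash
# run only the tests labelled for the eval-callback example
ctest -L eval-callback --output-on-failure
```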

@@ -1477,6 +1477,17 @@ int main(int argc, char ** argv) {
     cmd_params params = parse_cmd_params(argc, argv);

+    // initialize backends
+    ggml_backend_load_all();
+    auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+    if (!cpu_dev) {
+        fprintf(stderr, "%s: error: CPU backend is not loaded\n", __func__);
+        return 1;
+    }
+    auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
+    auto * ggml_threadpool_new_fn = (decltype(ggml_threadpool_new) *) ggml_backend_reg_get_proc_address(cpu_reg, "ggml_threadpool_new");
+    auto * ggml_threadpool_free_fn = (decltype(ggml_threadpool_free) *) ggml_backend_reg_get_proc_address(cpu_reg, "ggml_threadpool_free");
+
     // initialize llama.cpp
     if (!params.verbose) {
         llama_log_set(llama_null_log_callback, NULL);
@@ -1551,7 +1562,7 @@ int main(int argc, char ** argv) {
         tpp.poll = t.poll;
         tpp.prio = params.prio;

-        struct ggml_threadpool * threadpool = ggml_threadpool_new(&tpp);
+        struct ggml_threadpool * threadpool = ggml_threadpool_new_fn(&tpp);
         if (!threadpool) {
             fprintf(stderr, "%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
             exit(1);
@@ -1612,7 +1623,7 @@ int main(int argc, char ** argv) {

         llama_free(ctx);

-        ggml_threadpool_free(threadpool);
+        ggml_threadpool_free_fn(threadpool);
     }

     llama_free_model(lmodel);


@@ -165,6 +165,10 @@ int main(int argc, char ** argv) {

     LOG_INF("%s: llama threadpool init, n_threads = %d\n", __func__, (int) params.cpuparams.n_threads);

+    auto * reg = ggml_backend_dev_backend_reg(ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU));
+    auto * ggml_threadpool_new_fn = (decltype(ggml_threadpool_new) *) ggml_backend_reg_get_proc_address(reg, "ggml_threadpool_new");
+    auto * ggml_threadpool_free_fn = (decltype(ggml_threadpool_free) *) ggml_backend_reg_get_proc_address(reg, "ggml_threadpool_free");
+
     struct ggml_threadpool_params tpp_batch =
             ggml_threadpool_params_from_cpu_params(params.cpuparams_batch);
     struct ggml_threadpool_params tpp =
@@ -174,7 +178,7 @@ int main(int argc, char ** argv) {

     struct ggml_threadpool * threadpool_batch = NULL;
     if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) {
-        threadpool_batch = ggml_threadpool_new(&tpp_batch);
+        threadpool_batch = ggml_threadpool_new_fn(&tpp_batch);
         if (!threadpool_batch) {
             LOG_ERR("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads);
             return 1;
@@ -184,7 +188,7 @@ int main(int argc, char ** argv) {
         tpp.paused = true;
     }

-    struct ggml_threadpool * threadpool = ggml_threadpool_new(&tpp);
+    struct ggml_threadpool * threadpool = ggml_threadpool_new_fn(&tpp);
     if (!threadpool) {
         LOG_ERR("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
         return 1;
@@ -890,8 +894,8 @@ int main(int argc, char ** argv) {

     llama_backend_free();

-    ggml_threadpool_free(threadpool);
-    ggml_threadpool_free(threadpool_batch);
+    ggml_threadpool_free_fn(threadpool);
+    ggml_threadpool_free_fn(threadpool_batch);

     return 0;
 }


@@ -0,0 +1,5 @@
set(TARGET llama-run)
add_executable(${TARGET} run.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

examples/run/README.md (new file, 7 lines)

@@ -0,0 +1,7 @@
# llama.cpp/example/run
The purpose of this example is to demonstrate a minimal usage of llama.cpp for running models.
```bash
./llama-run Meta-Llama-3.1-8B-Instruct.gguf
...
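# A further usage sketch based on run.cpp: when stdin is not a terminal the piped
# text is appended to the prompt; -m selects the model, -c the context size and
# -ngl the number of GPU layers (model file name is illustrative).
echo "Write a haiku about GGUF" | ./llama-run -m Meta-Llama-3.1-8B-Instruct.gguf -c 2048 -ngl 99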

examples/run/run.cpp (new file, 409 lines)

@@ -0,0 +1,409 @@
#if defined(_WIN32)
#include <windows.h>
#else
#include <unistd.h>
#endif
#include <climits>
#include <cstdio>
#include <cstring>
#include <iostream>
#include <sstream>
#include <string>
#include <unordered_map>
#include <vector>
#include "llama-cpp.h"
typedef std::unique_ptr<char[]> char_array_ptr;
struct Argument {
std::string flag;
std::string help_text;
};
struct Options {
std::string model_path, prompt_non_interactive;
int ngl = 99;
int n_ctx = 2048;
};
class ArgumentParser {
public:
ArgumentParser(const char * program_name) : program_name(program_name) {}
void add_argument(const std::string & flag, std::string & var, const std::string & help_text = "") {
string_args[flag] = &var;
arguments.push_back({flag, help_text});
}
void add_argument(const std::string & flag, int & var, const std::string & help_text = "") {
int_args[flag] = &var;
arguments.push_back({flag, help_text});
}
int parse(int argc, const char ** argv) {
for (int i = 1; i < argc; ++i) {
std::string arg = argv[i];
if (string_args.count(arg)) {
if (i + 1 < argc) {
*string_args[arg] = argv[++i];
} else {
fprintf(stderr, "error: missing value for %s\n", arg.c_str());
print_usage();
return 1;
}
} else if (int_args.count(arg)) {
if (i + 1 < argc) {
if (parse_int_arg(argv[++i], *int_args[arg]) != 0) {
fprintf(stderr, "error: invalid value for %s: %s\n", arg.c_str(), argv[i]);
print_usage();
return 1;
}
} else {
fprintf(stderr, "error: missing value for %s\n", arg.c_str());
print_usage();
return 1;
}
} else {
fprintf(stderr, "error: unrecognized argument %s\n", arg.c_str());
print_usage();
return 1;
}
}
if (string_args["-m"]->empty()) {
fprintf(stderr, "error: -m is required\n");
print_usage();
return 1;
}
return 0;
}
private:
const char * program_name;
std::unordered_map<std::string, std::string *> string_args;
std::unordered_map<std::string, int *> int_args;
std::vector<Argument> arguments;
int parse_int_arg(const char * arg, int & value) {
char * end;
const long val = std::strtol(arg, &end, 10);
if (*end == '\0' && val >= INT_MIN && val <= INT_MAX) {
value = static_cast<int>(val);
return 0;
}
return 1;
}
void print_usage() const {
printf("\nUsage:\n");
printf(" %s [OPTIONS]\n\n", program_name);
printf("Options:\n");
for (const auto & arg : arguments) {
printf(" %-10s %s\n", arg.flag.c_str(), arg.help_text.c_str());
}
printf("\n");
}
};
class LlamaData {
public:
llama_model_ptr model;
llama_sampler_ptr sampler;
llama_context_ptr context;
std::vector<llama_chat_message> messages;
int init(const Options & opt) {
model = initialize_model(opt.model_path, opt.ngl);
if (!model) {
return 1;
}
context = initialize_context(model, opt.n_ctx);
if (!context) {
return 1;
}
sampler = initialize_sampler();
return 0;
}
private:
// Initializes the model and returns a unique pointer to it
llama_model_ptr initialize_model(const std::string & model_path, const int ngl) {
llama_model_params model_params = llama_model_default_params();
model_params.n_gpu_layers = ngl;
llama_model_ptr model(llama_load_model_from_file(model_path.c_str(), model_params));
if (!model) {
fprintf(stderr, "%s: error: unable to load model\n", __func__);
}
return model;
}
// Initializes the context with the specified parameters
llama_context_ptr initialize_context(const llama_model_ptr & model, const int n_ctx) {
llama_context_params ctx_params = llama_context_default_params();
ctx_params.n_ctx = n_ctx;
ctx_params.n_batch = n_ctx;
llama_context_ptr context(llama_new_context_with_model(model.get(), ctx_params));
if (!context) {
fprintf(stderr, "%s: error: failed to create the llama_context\n", __func__);
}
return context;
}
// Initializes and configures the sampler
llama_sampler_ptr initialize_sampler() {
llama_sampler_ptr sampler(llama_sampler_chain_init(llama_sampler_chain_default_params()));
llama_sampler_chain_add(sampler.get(), llama_sampler_init_min_p(0.05f, 1));
llama_sampler_chain_add(sampler.get(), llama_sampler_init_temp(0.8f));
llama_sampler_chain_add(sampler.get(), llama_sampler_init_dist(LLAMA_DEFAULT_SEED));
return sampler;
}
};
// Add a message to `messages` and store its content in `owned_content`
static void add_message(const char * role, const std::string & text, LlamaData & llama_data,
std::vector<char_array_ptr> & owned_content) {
char_array_ptr content(new char[text.size() + 1]);
std::strcpy(content.get(), text.c_str());
llama_data.messages.push_back({role, content.get()});
owned_content.push_back(std::move(content));
}
// Function to apply the chat template and resize `formatted` if needed
static int apply_chat_template(const LlamaData & llama_data, std::vector<char> & formatted, const bool append) {
int result = llama_chat_apply_template(llama_data.model.get(), nullptr, llama_data.messages.data(),
llama_data.messages.size(), append, formatted.data(), formatted.size());
if (result > static_cast<int>(formatted.size())) {
formatted.resize(result);
result = llama_chat_apply_template(llama_data.model.get(), nullptr, llama_data.messages.data(),
llama_data.messages.size(), append, formatted.data(), formatted.size());
}
return result;
}
// Function to tokenize the prompt
static int tokenize_prompt(const llama_model_ptr & model, const std::string & prompt,
std::vector<llama_token> & prompt_tokens) {
const int n_prompt_tokens = -llama_tokenize(model.get(), prompt.c_str(), prompt.size(), NULL, 0, true, true);
prompt_tokens.resize(n_prompt_tokens);
if (llama_tokenize(model.get(), prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), true,
true) < 0) {
GGML_ABORT("failed to tokenize the prompt\n");
}
return n_prompt_tokens;
}
// Check if we have enough space in the context to evaluate this batch
static int check_context_size(const llama_context_ptr & ctx, const llama_batch & batch) {
const int n_ctx = llama_n_ctx(ctx.get());
const int n_ctx_used = llama_get_kv_cache_used_cells(ctx.get());
if (n_ctx_used + batch.n_tokens > n_ctx) {
printf("\033[0m\n");
fprintf(stderr, "context size exceeded\n");
return 1;
}
return 0;
}
// convert the token to a string
static int convert_token_to_string(const llama_model_ptr & model, const llama_token token_id, std::string & piece) {
char buf[256];
int n = llama_token_to_piece(model.get(), token_id, buf, sizeof(buf), 0, true);
if (n < 0) {
GGML_ABORT("failed to convert token to piece\n");
}
piece = std::string(buf, n);
return 0;
}
static void print_word_and_concatenate_to_response(const std::string & piece, std::string & response) {
printf("%s", piece.c_str());
fflush(stdout);
response += piece;
}
// helper function to evaluate a prompt and generate a response
static int generate(LlamaData & llama_data, const std::string & prompt, std::string & response) {
std::vector<llama_token> prompt_tokens;
const int n_prompt_tokens = tokenize_prompt(llama_data.model, prompt, prompt_tokens);
if (n_prompt_tokens < 0) {
return 1;
}
// prepare a batch for the prompt
llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size());
llama_token new_token_id;
while (true) {
check_context_size(llama_data.context, batch);
if (llama_decode(llama_data.context.get(), batch)) {
GGML_ABORT("failed to decode\n");
}
// sample the next token, check is it an end of generation?
new_token_id = llama_sampler_sample(llama_data.sampler.get(), llama_data.context.get(), -1);
if (llama_token_is_eog(llama_data.model.get(), new_token_id)) {
break;
}
std::string piece;
if (convert_token_to_string(llama_data.model, new_token_id, piece)) {
return 1;
}
print_word_and_concatenate_to_response(piece, response);
// prepare the next batch with the sampled token
batch = llama_batch_get_one(&new_token_id, 1);
}
return 0;
}
static int parse_arguments(const int argc, const char ** argv, Options & opt) {
ArgumentParser parser(argv[0]);
parser.add_argument("-m", opt.model_path, "model");
parser.add_argument("-p", opt.prompt_non_interactive, "prompt");
parser.add_argument("-c", opt.n_ctx, "context_size");
parser.add_argument("-ngl", opt.ngl, "n_gpu_layers");
if (parser.parse(argc, argv)) {
return 1;
}
return 0;
}
static int read_user_input(std::string & user) {
std::getline(std::cin, user);
return user.empty(); // Indicate an error or empty input
}
// Function to generate a response based on the prompt
static int generate_response(LlamaData & llama_data, const std::string & prompt, std::string & response) {
// Set response color
printf("\033[33m");
if (generate(llama_data, prompt, response)) {
fprintf(stderr, "failed to generate response\n");
return 1;
}
// End response with color reset and newline
printf("\n\033[0m");
return 0;
}
// Helper function to apply the chat template and handle errors
static int apply_chat_template_with_error_handling(const LlamaData & llama_data, std::vector<char> & formatted,
const bool is_user_input, int & output_length) {
const int new_len = apply_chat_template(llama_data, formatted, is_user_input);
if (new_len < 0) {
fprintf(stderr, "failed to apply the chat template\n");
return -1;
}
output_length = new_len;
return 0;
}
// Helper function to handle user input
static bool handle_user_input(std::string & user_input, const std::string & prompt_non_interactive) {
if (!prompt_non_interactive.empty()) {
user_input = prompt_non_interactive;
return true; // No need for interactive input
}
printf("\033[32m> \033[0m");
return !read_user_input(user_input); // Returns false if input ends the loop
}
// Function to tokenize the prompt
static int chat_loop(LlamaData & llama_data, std::string & prompt_non_interactive) {
std::vector<char_array_ptr> owned_content;
std::vector<char> fmtted(llama_n_ctx(llama_data.context.get()));
int prev_len = 0;
while (true) {
// Get user input
std::string user_input;
if (!handle_user_input(user_input, prompt_non_interactive)) {
break;
}
add_message("user", prompt_non_interactive.empty() ? user_input : prompt_non_interactive, llama_data,
owned_content);
int new_len;
if (apply_chat_template_with_error_handling(llama_data, fmtted, true, new_len) < 0) {
return 1;
}
std::string prompt(fmtted.begin() + prev_len, fmtted.begin() + new_len);
std::string response;
if (generate_response(llama_data, prompt, response)) {
return 1;
}
}
return 0;
}
static void log_callback(const enum ggml_log_level level, const char * text, void *) {
if (level == GGML_LOG_LEVEL_ERROR) {
fprintf(stderr, "%s", text);
}
}
static bool is_stdin_a_terminal() {
#if defined(_WIN32)
HANDLE hStdin = GetStdHandle(STD_INPUT_HANDLE);
DWORD mode;
return GetConsoleMode(hStdin, &mode);
#else
return isatty(STDIN_FILENO);
#endif
}
static std::string read_pipe_data() {
std::ostringstream result;
result << std::cin.rdbuf(); // Read all data from std::cin
return result.str();
}
int main(int argc, const char ** argv) {
Options opt;
if (parse_arguments(argc, argv, opt)) {
return 1;
}
if (!is_stdin_a_terminal()) {
if (!opt.prompt_non_interactive.empty()) {
opt.prompt_non_interactive += "\n\n";
}
opt.prompt_non_interactive += read_pipe_data();
}
llama_log_set(log_callback, nullptr);
LlamaData llama_data;
if (llama_data.init(opt)) {
return 1;
}
if (chat_loop(llama_data, opt.prompt_non_interactive)) {
return 1;
}
return 0;
}


@@ -412,7 +412,7 @@ node index.js

 `id_slot`: Assign the completion task to an specific slot. If is -1 the task will be assigned to a Idle slot. Default: `-1`

-`cache_prompt`: Re-use KV cache from a previous request if possible. This way the common prefix does not have to be re-processed, only the suffix that differs between the requests. Because (depending on the backend) the logits are **not** guaranteed to be bit-for-bit identical for different batch sizes (prompt processing vs. token generation) enabling this option can cause nondeterministic results. Default: `false`
+`cache_prompt`: Re-use KV cache from a previous request if possible. This way the common prefix does not have to be re-processed, only the suffix that differs between the requests. Because (depending on the backend) the logits are **not** guaranteed to be bit-for-bit identical for different batch sizes (prompt processing vs. token generation) enabling this option can cause nondeterministic results. Default: `true`

 `samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. Default: `["dry", "top_k", "typ_p", "top_p", "min_p", "xtc", "temperature"]` - these are all the available values.

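An illustrative request against the server's /completion endpoint; since cache_prompt now defaults to true, it only needs to be sent explicitly to opt out (host, port and prompt are placeholders):

```bash
curl http://localhost:8080/completion -d '{
  "prompt": "Hello, my name is",
  "n_predict": 16,
  "cache_prompt": false
}'
```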

@@ -81,7 +81,13 @@
             <path d="M14.5 3a1 1 0 0 1-1 1H13v9a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2V4h-.5a1 1 0 0 1-1-1V2a1 1 0 0 1 1-1H6a1 1 0 0 1 1-1h2a1 1 0 0 1 1 1h3.5a1 1 0 0 1 1 1zM4.118 4 4 4.059V13a1 1 0 0 0 1 1h6a1 1 0 0 0 1-1V4.059L11.882 4zM2.5 3h11V2h-11z"/>
           </svg>
         </button>
+        <button v-if="messages.length > 0" class="btn mr-1" @click="downloadConv(viewingConvId)" :disabled="isGenerating">
+          <!-- download conversation button -->
+          <svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" fill="currentColor" class="bi bi-download" viewBox="0 0 16 16">
+            <path d="M.5 9.9a.5.5 0 0 1 .5.5v2.5a1 1 0 0 0 1 1h12a1 1 0 0 0 1-1v-2.5a.5.5 0 0 1 1 0v2.5a2 2 0 0 1-2 2H2a2 2 0 0 1-2-2v-2.5a.5.5 0 0 1 .5-.5"/>
+            <path d="M7.646 11.854a.5.5 0 0 0 .708 0l3-3a.5.5 0 0 0-.708-.708L8.5 10.293V1.5a.5.5 0 0 0-1 0v8.793L5.354 8.146a.5.5 0 1 0-.708.708z"/>
+          </svg>
+        </button>
         <button class="btn" @click="showConfigDialog = true" :disabled="isGenerating">
           <!-- edit config button -->
           <svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" fill="currentColor" class="bi bi-gear" viewBox="0 0 16 16">
@@ -526,6 +532,23 @@
         this.fetchMessages();
       }
     },
+    downloadConv(convId) {
+      const conversation = StorageUtils.getOneConversation(convId);
+      if (!conversation) {
+        alert('Conversation not found.');
+        return;
+      }
+      const conversationJson = JSON.stringify(conversation, null, 2);
+      const blob = new Blob([conversationJson], { type: 'application/json' });
+      const url = URL.createObjectURL(blob);
+      const a = document.createElement('a');
+      a.href = url;
+      a.download = `conversation_${convId}.json`;
+      document.body.appendChild(a);
+      a.click();
+      document.body.removeChild(a);
+      URL.revokeObjectURL(url);
+    },
     async sendMessage() {
       if (!this.inputMsg) return;
       const currConvId = this.viewingConvId;


@@ -2,10 +2,11 @@

 #include "arg.h"
 #include "common.h"
-#include "log.h"
-#include "sampling.h"
 #include "json-schema-to-grammar.h"
 #include "llama.h"
+#include "log.h"
+#include "sampling.h"
+#include "speculative.h"

 // Change JSON_ASSERT from assert() to GGML_ASSERT:
 #define JSON_ASSERT GGML_ASSERT
@@ -110,7 +111,7 @@ struct server_static_file {

 struct slot_params {
     bool stream = true;
-    bool cache_prompt = false; // remember the prompt to avoid reprocessing all prompt
+    bool cache_prompt = true; // remember the prompt to avoid reprocessing all prompt

     int32_t n_keep = 0; // number of tokens to keep from initial prompt
     int32_t n_discard = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half
@@ -121,12 +122,21 @@ struct slot_params {
     int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit

     std::vector<std::string> antiprompt;
+
+    struct common_params_sampling sampling;
+    struct common_params_speculative speculative;
 };

 struct server_slot {
     int id;
     int id_task = -1;

+    llama_batch batch_spec;
+    llama_context * ctx_dft = nullptr;
+    common_speculative * spec = nullptr;
+
     // the index relative to completion multi-task request
     size_t index = 0;
@@ -175,7 +185,6 @@ struct server_slot {

     // sampling
     json json_schema;
-    struct common_params_sampling sparams;

     struct common_sampler * smpl = nullptr;

     llama_token sampled;
@@ -212,7 +221,7 @@ struct server_slot {
         generated_token_probs.clear();
     }

-    bool has_budget(common_params &global_params) {
+    bool has_budget(const common_params & global_params) {
         if (params.n_predict == -1 && global_params.n_predict == -1) {
             return true; // limitless
         }
@@ -232,6 +241,10 @@ struct server_slot {
         return state != SLOT_STATE_IDLE;
     }

+    bool can_speculate() const {
+        return ctx_dft && params.speculative.n_max > 0 && params.cache_prompt;
+    }
+
     void add_token(const completion_token_output & token) {
         if (!is_processing()) {
             SLT_WRN(*this, "%s", "slot is not processing\n");
@@ -591,11 +604,14 @@ struct server_response {
 };

 struct server_context {
+    common_params params_base;
+
     llama_model * model = nullptr;
     llama_context * ctx = nullptr;

     std::vector<common_lora_adapter_container> loras;

-    common_params params;
+    llama_model * model_dft = nullptr;
+
+    llama_context_params cparams_dft;

     llama_batch batch = {};
@@ -628,27 +644,41 @@ struct server_context {
             model = nullptr;
         }

+        if (model_dft) {
+            llama_free_model(model_dft);
+            model_dft = nullptr;
+        }
+
         // Clear any sampling context
         for (server_slot & slot : slots) {
-            if (slot.smpl != nullptr) {
-                common_sampler_free(slot.smpl);
-            }
+            common_sampler_free(slot.smpl);
+            slot.smpl = nullptr;
+
+            llama_free(slot.ctx_dft);
+            slot.ctx_dft = nullptr;
+
+            common_speculative_free(slot.spec);
+            slot.spec = nullptr;
+
+            llama_batch_free(slot.batch_spec);
         }

         llama_batch_free(batch);
     }

-    bool load_model(const common_params & params_) {
-        params = params_;
-
-        common_init_result llama_init = common_init_from_params(params);
+    bool load_model(const common_params & params) {
+        SRV_INF("loading model '%s'\n", params.model.c_str());
+
+        params_base = params;
+
+        common_init_result llama_init = common_init_from_params(params_base);

         model = llama_init.model;
         ctx = llama_init.context;
         loras = llama_init.lora_adapters;

         if (model == nullptr) {
-            SRV_ERR("failed to load model, '%s'\n", params.model.c_str());
+            SRV_ERR("failed to load model, '%s'\n", params_base.model.c_str());
             return false;
         }
@@ -657,6 +687,41 @@ struct server_context {
         add_bos_token = llama_add_bos_token(model);
         has_eos_token = !llama_add_eos_token(model);

+        if (!params_base.speculative.model.empty()) {
+            SRV_INF("loading draft model '%s'\n", params_base.speculative.model.c_str());
+
+            auto params_dft = params_base;
+
+            params_dft.devices = params_base.speculative.devices;
+            params_dft.model = params_base.speculative.model;
+            params_dft.n_ctx = params_base.speculative.n_ctx;
+            params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers;
+
+            common_init_result llama_init_dft = common_init_from_params(params_dft);
+
+            model_dft = llama_init_dft.model;
+
+            if (model_dft == nullptr) {
+                SRV_ERR("failed to load draft model, '%s'\n", params_base.speculative.model.c_str());
+                return false;
+            }
+
+            if (!common_speculative_are_compatible(ctx, llama_init_dft.context)) {
+                SRV_ERR("the draft model '%s' is not compatible with the target model '%s'\n", params_base.speculative.model.c_str(), params_base.model.c_str());
+
+                llama_free (llama_init_dft.context);
+                llama_free_model(llama_init_dft.model);
+
+                return false;
+            }
+
+            cparams_dft = common_context_params_to_llama(params_base);
+            cparams_dft.n_batch = llama_n_ctx(llama_init_dft.context);
+
+            // the context is not needed - we will create one for each slot
+            llama_free(llama_init_dft.context);
+        }
+
         return true;
     }
@@ -674,20 +739,36 @@ struct server_context {
     }

     void init() {
-        const int32_t n_ctx_slot = n_ctx / params.n_parallel;
+        const int32_t n_ctx_slot = n_ctx / params_base.n_parallel;

-        SRV_INF("initializing slots, n_slots = %d\n", params.n_parallel);
+        SRV_INF("initializing slots, n_slots = %d\n", params_base.n_parallel);

-        for (int i = 0; i < params.n_parallel; i++) {
+        for (int i = 0; i < params_base.n_parallel; i++) {
             server_slot slot;

             slot.id = i;
             slot.n_ctx = n_ctx_slot;
-            slot.n_predict = params.n_predict;
+            slot.n_predict = params_base.n_predict;
+
+            if (model_dft) {
+                slot.batch_spec = llama_batch_init(params_base.speculative.n_max + 1, 0, 1);
+
+                slot.ctx_dft = llama_new_context_with_model(model_dft, cparams_dft);
+                if (slot.ctx_dft == nullptr) {
+                    SRV_ERR("%s", "failed to create draft context\n");
+                    return;
+                }
+
+                slot.spec = common_speculative_init(slot.ctx_dft);
+                if (slot.spec == nullptr) {
+                    SRV_ERR("%s", "failed to create speculator\n");
+                    return;
+                }
+            }

             SLT_INF(slot, "new slot n_ctx_slot = %d\n", slot.n_ctx);

-            slot.sparams = params.sampling;
+            slot.params.sampling = params_base.sampling;

             slot.callback_on_release = [this](int) {
                 queue_tasks.pop_deferred_task();
@@ -707,7 +788,7 @@ struct server_context {
         const int32_t n_batch = llama_n_batch(ctx);

         // only a single seq_id per token is needed
-        batch = llama_batch_init(std::max(n_batch, params.n_parallel), 0, 1);
+        batch = llama_batch_init(std::max(n_batch, params_base.n_parallel), 0, 1);
         }

         metrics.init();
@@ -786,9 +867,11 @@ struct server_context {
     }

     bool launch_slot_with_task(server_slot & slot, const server_task & task) {
-        slot_params default_params;
         // Sampling parameter defaults are loaded from the global server context (but individual requests can still override them)
-        auto default_sparams = params.sampling;
+        slot_params defaults;
+        defaults.sampling = params_base.sampling;
+        defaults.speculative = params_base.speculative;

         const auto & data = task.data;

         if (data.count("__oaicompat") != 0) {
@ -800,41 +883,47 @@ struct server_context {
} }
slot.params.stream = json_value(data, "stream", false); slot.params.stream = json_value(data, "stream", false);
slot.params.cache_prompt = json_value(data, "cache_prompt", false); slot.params.cache_prompt = json_value(data, "cache_prompt", true);
slot.params.n_predict = json_value(data, "n_predict", json_value(data, "max_tokens", default_params.n_predict)); slot.params.n_predict = json_value(data, "n_predict", json_value(data, "max_tokens", defaults.n_predict));
slot.params.n_indent = json_value(data, "n_indent", default_params.n_indent); slot.params.n_indent = json_value(data, "n_indent", defaults.n_indent);
slot.sparams.top_k = json_value(data, "top_k", default_sparams.top_k); slot.params.n_keep = json_value(data, "n_keep", defaults.n_keep);
slot.sparams.top_p = json_value(data, "top_p", default_sparams.top_p); slot.params.n_discard = json_value(data, "n_discard", defaults.n_discard);
slot.sparams.min_p = json_value(data, "min_p", default_sparams.min_p); //slot.params.t_max_prompt_ms = json_value(data, "t_max_prompt_ms", defaults.t_max_prompt_ms); // TODO: implement
slot.sparams.xtc_probability = json_value(data, "xtc_probability", default_sparams.xtc_probability); slot.params.t_max_predict_ms = json_value(data, "t_max_predict_ms", defaults.t_max_predict_ms);
slot.sparams.xtc_threshold = json_value(data, "xtc_threshold", default_sparams.xtc_threshold);
slot.sparams.typ_p = json_value(data, "typical_p", default_sparams.typ_p);
slot.sparams.temp = json_value(data, "temperature", default_sparams.temp);
slot.sparams.dynatemp_range = json_value(data, "dynatemp_range", default_sparams.dynatemp_range);
slot.sparams.dynatemp_exponent = json_value(data, "dynatemp_exponent", default_sparams.dynatemp_exponent);
slot.sparams.penalty_last_n = json_value(data, "repeat_last_n", default_sparams.penalty_last_n);
slot.sparams.penalty_repeat = json_value(data, "repeat_penalty", default_sparams.penalty_repeat);
slot.sparams.penalty_freq = json_value(data, "frequency_penalty", default_sparams.penalty_freq);
slot.sparams.penalty_present = json_value(data, "presence_penalty", default_sparams.penalty_present);
slot.sparams.dry_multiplier = json_value(data, "dry_multiplier", default_sparams.dry_multiplier);
slot.sparams.dry_base = json_value(data, "dry_base", default_sparams.dry_base);
slot.sparams.dry_allowed_length = json_value(data, "dry_allowed_length", default_sparams.dry_allowed_length);
slot.sparams.dry_penalty_last_n = json_value(data, "dry_penalty_last_n", default_sparams.dry_penalty_last_n);
slot.sparams.mirostat = json_value(data, "mirostat", default_sparams.mirostat);
slot.sparams.mirostat_tau = json_value(data, "mirostat_tau", default_sparams.mirostat_tau);
slot.sparams.mirostat_eta = json_value(data, "mirostat_eta", default_sparams.mirostat_eta);
slot.sparams.penalize_nl = json_value(data, "penalize_nl", default_sparams.penalize_nl);
slot.params.n_keep = json_value(data, "n_keep", default_params.n_keep);
slot.params.n_discard = json_value(data, "n_discard", default_params.n_discard);
slot.sparams.seed = json_value(data, "seed", default_sparams.seed);
slot.sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs);
slot.sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep);
//slot.params.t_max_prompt_ms = json_value(data, "t_max_prompt_ms", default_params.t_max_prompt_ms); // TODO: implement
slot.params.t_max_predict_ms = json_value(data, "t_max_predict_ms", default_params.t_max_predict_ms);
if (slot.sparams.dry_base < 1.0f) slot.params.sampling.top_k = json_value(data, "top_k", defaults.sampling.top_k);
{ slot.params.sampling.top_p = json_value(data, "top_p", defaults.sampling.top_p);
slot.sparams.dry_base = default_sparams.dry_base; slot.params.sampling.min_p = json_value(data, "min_p", defaults.sampling.min_p);
slot.params.sampling.xtc_probability = json_value(data, "xtc_probability", defaults.sampling.xtc_probability);
slot.params.sampling.xtc_threshold = json_value(data, "xtc_threshold", defaults.sampling.xtc_threshold);
slot.params.sampling.typ_p = json_value(data, "typical_p", defaults.sampling.typ_p);
slot.params.sampling.temp = json_value(data, "temperature", defaults.sampling.temp);
slot.params.sampling.dynatemp_range = json_value(data, "dynatemp_range", defaults.sampling.dynatemp_range);
slot.params.sampling.dynatemp_exponent = json_value(data, "dynatemp_exponent", defaults.sampling.dynatemp_exponent);
slot.params.sampling.penalty_last_n = json_value(data, "repeat_last_n", defaults.sampling.penalty_last_n);
slot.params.sampling.penalty_repeat = json_value(data, "repeat_penalty", defaults.sampling.penalty_repeat);
slot.params.sampling.penalty_freq = json_value(data, "frequency_penalty", defaults.sampling.penalty_freq);
slot.params.sampling.penalty_present = json_value(data, "presence_penalty", defaults.sampling.penalty_present);
slot.params.sampling.dry_multiplier = json_value(data, "dry_multiplier", defaults.sampling.dry_multiplier);
slot.params.sampling.dry_base = json_value(data, "dry_base", defaults.sampling.dry_base);
slot.params.sampling.dry_allowed_length = json_value(data, "dry_allowed_length", defaults.sampling.dry_allowed_length);
slot.params.sampling.dry_penalty_last_n = json_value(data, "dry_penalty_last_n", defaults.sampling.dry_penalty_last_n);
slot.params.sampling.mirostat = json_value(data, "mirostat", defaults.sampling.mirostat);
slot.params.sampling.mirostat_tau = json_value(data, "mirostat_tau", defaults.sampling.mirostat_tau);
slot.params.sampling.mirostat_eta = json_value(data, "mirostat_eta", defaults.sampling.mirostat_eta);
slot.params.sampling.penalize_nl = json_value(data, "penalize_nl", defaults.sampling.penalize_nl);
slot.params.sampling.seed = json_value(data, "seed", defaults.sampling.seed);
slot.params.sampling.n_probs = json_value(data, "n_probs", defaults.sampling.n_probs);
slot.params.sampling.min_keep = json_value(data, "min_keep", defaults.sampling.min_keep);
slot.params.speculative.n_min = json_value(data, "speculative.n_min", defaults.speculative.n_min);
slot.params.speculative.n_max = json_value(data, "speculative.n_max", defaults.speculative.n_max);
slot.params.speculative.p_min = json_value(data, "speculative.p_min", defaults.speculative.p_min);
slot.params.speculative.n_min = std::min(slot.params.speculative.n_max, slot.params.speculative.n_min);
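For reference, a hypothetical request body exercising the new per-request speculative options parsed above could look like the following (values are illustrative; `json` is the server's nlohmann alias):

json data = {
    {"prompt",            "Hello, world"},
    {"n_predict",         64},
    {"speculative.n_max", 16},   // maximum number of drafted tokens per step
    {"speculative.n_min", 5},    // drafts shorter than this are ignored
    {"speculative.p_min", 0.9}   // minimum draft token probability
};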
if (slot.params.sampling.dry_base < 1.0f) {
slot.params.sampling.dry_base = defaults.sampling.dry_base;
} }
// sequence breakers for DRY // sequence breakers for DRY
@ -843,8 +932,8 @@ struct server_context {
// Ref: https://github.com/oobabooga/text-generation-webui/blob/d1af7a41ade7bd3c3a463bfa640725edb818ebaf/extensions/openai/typing.py#L39 // Ref: https://github.com/oobabooga/text-generation-webui/blob/d1af7a41ade7bd3c3a463bfa640725edb818ebaf/extensions/openai/typing.py#L39
if (data.contains("dry_sequence_breakers")) { if (data.contains("dry_sequence_breakers")) {
slot.sparams.dry_sequence_breakers = json_value(data, "dry_sequence_breakers", std::vector<std::string>()); slot.params.sampling.dry_sequence_breakers = json_value(data, "dry_sequence_breakers", std::vector<std::string>());
if (slot.sparams.dry_sequence_breakers.empty()) { if (slot.params.sampling.dry_sequence_breakers.empty()) {
send_error(task, "Error: dry_sequence_breakers must be a non-empty array of strings", ERROR_TYPE_INVALID_REQUEST); send_error(task, "Error: dry_sequence_breakers must be a non-empty array of strings", ERROR_TYPE_INVALID_REQUEST);
return false; return false;
} }
@ -859,13 +948,13 @@ struct server_context {
if (data.contains("json_schema") && !data.contains("grammar")) { if (data.contains("json_schema") && !data.contains("grammar")) {
try { try {
auto schema = json_value(data, "json_schema", json::object()); auto schema = json_value(data, "json_schema", json::object());
slot.sparams.grammar = json_schema_to_grammar(schema); slot.params.sampling.grammar = json_schema_to_grammar(schema);
} catch (const std::exception & e) { } catch (const std::exception & e) {
send_error(task, std::string("\"json_schema\": ") + e.what(), ERROR_TYPE_INVALID_REQUEST); send_error(task, std::string("\"json_schema\": ") + e.what(), ERROR_TYPE_INVALID_REQUEST);
return false; return false;
} }
} else { } else {
slot.sparams.grammar = json_value(data, "grammar", default_sparams.grammar); slot.params.sampling.grammar = json_value(data, "grammar", defaults.sampling.grammar);
} }
if (slot.n_predict > 0 && slot.params.n_predict > slot.n_predict) { if (slot.n_predict > 0 && slot.params.n_predict > slot.n_predict) {
@ -875,10 +964,10 @@ struct server_context {
} }
{ {
slot.sparams.logit_bias.clear(); slot.params.sampling.logit_bias.clear();
if (json_value(data, "ignore_eos", false) && has_eos_token) { if (json_value(data, "ignore_eos", false) && has_eos_token) {
slot.sparams.logit_bias.push_back({llama_token_eos(model), -INFINITY}); slot.params.sampling.logit_bias.push_back({llama_token_eos(model), -INFINITY});
} }
const auto & logit_bias = data.find("logit_bias"); const auto & logit_bias = data.find("logit_bias");
@ -899,12 +988,12 @@ struct server_context {
if (el[0].is_number_integer()) { if (el[0].is_number_integer()) {
llama_token tok = el[0].get<llama_token>(); llama_token tok = el[0].get<llama_token>();
if (tok >= 0 && tok < n_vocab) { if (tok >= 0 && tok < n_vocab) {
slot.sparams.logit_bias.push_back({tok, bias}); slot.params.sampling.logit_bias.push_back({tok, bias});
} }
} else if (el[0].is_string()) { } else if (el[0].is_string()) {
auto toks = common_tokenize(model, el[0].get<std::string>(), false); auto toks = common_tokenize(model, el[0].get<std::string>(), false);
for (auto tok : toks) { for (auto tok : toks) {
slot.sparams.logit_bias.push_back({tok, bias}); slot.params.sampling.logit_bias.push_back({tok, bias});
} }
} }
} }
@ -935,16 +1024,16 @@ struct server_context {
sampler_names.emplace_back(name); sampler_names.emplace_back(name);
} }
} }
slot.sparams.samplers = common_sampler_types_from_names(sampler_names, false); slot.params.sampling.samplers = common_sampler_types_from_names(sampler_names, false);
} else if (samplers->is_string()){ } else if (samplers->is_string()){
std::string sampler_string; std::string sampler_string;
for (const auto & name : *samplers) { for (const auto & name : *samplers) {
sampler_string += name; sampler_string += name;
} }
slot.sparams.samplers = common_sampler_types_from_chars(sampler_string); slot.params.sampling.samplers = common_sampler_types_from_chars(sampler_string);
} }
} else { } else {
slot.sparams.samplers = default_sparams.samplers; slot.params.sampling.samplers = defaults.sampling.samplers;
} }
} }
@ -953,7 +1042,7 @@ struct server_context {
common_sampler_free(slot.smpl); common_sampler_free(slot.smpl);
} }
slot.smpl = common_sampler_init(model, slot.sparams); slot.smpl = common_sampler_init(model, slot.params.sampling);
if (slot.smpl == nullptr) { if (slot.smpl == nullptr) {
// for now, the only error that may happen here is invalid grammar // for now, the only error that may happen here is invalid grammar
send_error(task, "Failed to parse grammar", ERROR_TYPE_INVALID_REQUEST); send_error(task, "Failed to parse grammar", ERROR_TYPE_INVALID_REQUEST);
@ -961,6 +1050,12 @@ struct server_context {
} }
} }
if (slot.ctx_dft) {
llama_batch_free(slot.batch_spec);
slot.batch_spec = llama_batch_init(slot.params.speculative.n_max + 1, 0, 1);
}
slot.state = SLOT_STATE_STARTED; slot.state = SLOT_STATE_STARTED;
SLT_INF(slot, "%s", "processing task\n"); SLT_INF(slot, "%s", "processing task\n");
@ -978,7 +1073,7 @@ struct server_context {
bool process_token(completion_token_output & result, server_slot & slot) { bool process_token(completion_token_output & result, server_slot & slot) {
// remember which tokens were sampled - used for repetition penalties during sampling // remember which tokens were sampled - used for repetition penalties during sampling
const std::string token_str = common_token_to_piece(ctx, result.tok, params.special); const std::string token_str = common_token_to_piece(ctx, result.tok, params_base.special);
slot.sampled = result.tok; slot.sampled = result.tok;
// search stop word and delete it // search stop word and delete it
@ -1043,7 +1138,7 @@ struct server_context {
} }
// check the limits // check the limits
if (slot.n_decoded > 0 && slot.has_next_token && !slot.has_budget(params)) { if (slot.n_decoded > 0 && slot.has_next_token && !slot.has_budget(params_base)) {
slot.stopped_limit = true; slot.stopped_limit = true;
slot.has_next_token = false; slot.has_next_token = false;
@ -1136,50 +1231,54 @@ struct server_context {
json get_formated_generation(const server_slot & slot) const { json get_formated_generation(const server_slot & slot) const {
std::vector<std::string> samplers; std::vector<std::string> samplers;
samplers.reserve(slot.sparams.samplers.size()); samplers.reserve(slot.params.sampling.samplers.size());
for (const auto & sampler : slot.sparams.samplers) { for (const auto & sampler : slot.params.sampling.samplers) {
samplers.emplace_back(common_sampler_type_to_str(sampler)); samplers.emplace_back(common_sampler_type_to_str(sampler));
} }
return json { return json {
{"n_ctx", slot.n_ctx}, {"n_ctx", slot.n_ctx},
{"n_predict", slot.n_predict}, // Server configured n_predict {"n_predict", slot.n_predict}, // Server configured n_predict
{"model", params.model_alias}, {"model", params_base.model_alias},
{"seed", slot.sparams.seed}, {"seed", slot.params.sampling.seed},
{"seed_cur", slot.smpl ? common_sampler_get_seed(slot.smpl) : 0}, {"seed_cur", slot.smpl ? common_sampler_get_seed(slot.smpl) : 0},
{"temperature", slot.sparams.temp}, {"temperature", slot.params.sampling.temp},
{"dynatemp_range", slot.sparams.dynatemp_range}, {"dynatemp_range", slot.params.sampling.dynatemp_range},
{"dynatemp_exponent", slot.sparams.dynatemp_exponent}, {"dynatemp_exponent", slot.params.sampling.dynatemp_exponent},
{"top_k", slot.sparams.top_k}, {"top_k", slot.params.sampling.top_k},
{"top_p", slot.sparams.top_p}, {"top_p", slot.params.sampling.top_p},
{"min_p", slot.sparams.min_p}, {"min_p", slot.params.sampling.min_p},
{"xtc_probability", slot.sparams.xtc_probability}, {"xtc_probability", slot.params.sampling.xtc_probability},
{"xtc_threshold", slot.sparams.xtc_threshold}, {"xtc_threshold", slot.params.sampling.xtc_threshold},
{"typical_p", slot.sparams.typ_p}, {"typical_p", slot.params.sampling.typ_p},
{"repeat_last_n", slot.sparams.penalty_last_n}, {"repeat_last_n", slot.params.sampling.penalty_last_n},
{"repeat_penalty", slot.sparams.penalty_repeat}, {"repeat_penalty", slot.params.sampling.penalty_repeat},
{"presence_penalty", slot.sparams.penalty_present}, {"presence_penalty", slot.params.sampling.penalty_present},
{"frequency_penalty", slot.sparams.penalty_freq}, {"frequency_penalty", slot.params.sampling.penalty_freq},
{"dry_multiplier", slot.sparams.dry_multiplier}, {"dry_multiplier", slot.params.sampling.dry_multiplier},
{"dry_base", slot.sparams.dry_base}, {"dry_base", slot.params.sampling.dry_base},
{"dry_allowed_length", slot.sparams.dry_allowed_length}, {"dry_allowed_length", slot.params.sampling.dry_allowed_length},
{"dry_penalty_last_n", slot.sparams.dry_penalty_last_n}, {"dry_penalty_last_n", slot.params.sampling.dry_penalty_last_n},
{"dry_sequence_breakers", slot.sparams.dry_sequence_breakers}, {"dry_sequence_breakers", slot.params.sampling.dry_sequence_breakers},
{"mirostat", slot.sparams.mirostat}, {"mirostat", slot.params.sampling.mirostat},
{"mirostat_tau", slot.sparams.mirostat_tau}, {"mirostat_tau", slot.params.sampling.mirostat_tau},
{"mirostat_eta", slot.sparams.mirostat_eta}, {"mirostat_eta", slot.params.sampling.mirostat_eta},
{"penalize_nl", slot.sparams.penalize_nl}, {"penalize_nl", slot.params.sampling.penalize_nl},
{"stop", slot.params.antiprompt}, {"stop", slot.params.antiprompt},
{"max_tokens", slot.params.n_predict}, // User configured n_predict {"max_tokens", slot.params.n_predict}, // User configured n_predict
{"n_keep", slot.params.n_keep}, {"n_keep", slot.params.n_keep},
{"n_discard", slot.params.n_discard}, {"n_discard", slot.params.n_discard},
{"ignore_eos", slot.sparams.ignore_eos}, {"ignore_eos", slot.params.sampling.ignore_eos},
{"stream", slot.params.stream}, {"stream", slot.params.stream},
//{"logit_bias", slot.sparams.logit_bias}, //{"logit_bias", slot.params.sampling.logit_bias},
{"n_probs", slot.sparams.n_probs}, {"n_probs", slot.params.sampling.n_probs},
{"min_keep", slot.sparams.min_keep}, {"min_keep", slot.params.sampling.min_keep},
{"grammar", slot.sparams.grammar}, {"grammar", slot.params.sampling.grammar},
{"samplers", samplers}, {"samplers", samplers},
{"speculative", slot.can_speculate()},
{"speculative.n_max", slot.params.speculative.n_max},
{"speculative.n_min", slot.params.speculative.n_min},
{"speculative.p_min", slot.params.speculative.p_min},
}; };
} }
@ -1216,7 +1315,7 @@ struct server_context {
{"index", slot.index}, {"index", slot.index},
}; };
if (slot.sparams.n_probs > 0) { if (slot.params.sampling.n_probs > 0) {
const llama_tokens to_send_toks = common_tokenize(ctx, tkn.text_to_send, false); const llama_tokens to_send_toks = common_tokenize(ctx, tkn.text_to_send, false);
const size_t probs_pos = std::min(slot.n_sent_token_probs, slot.generated_token_probs.size()); const size_t probs_pos = std::min(slot.n_sent_token_probs, slot.generated_token_probs.size());
const size_t probs_stop_pos = std::min(slot.n_sent_token_probs + to_send_toks.size(), slot.generated_token_probs.size()); const size_t probs_stop_pos = std::min(slot.n_sent_token_probs + to_send_toks.size(), slot.generated_token_probs.size());
@ -1249,7 +1348,7 @@ struct server_context {
{"content", !slot.params.stream ? slot.generated_text : ""}, {"content", !slot.params.stream ? slot.generated_text : ""},
{"id_slot", slot.id}, {"id_slot", slot.id},
{"stop", true}, {"stop", true},
{"model", params.model_alias}, {"model", params_base.model_alias},
{"tokens_predicted", slot.n_decoded}, {"tokens_predicted", slot.n_decoded},
{"tokens_evaluated", slot.n_prompt_tokens}, {"tokens_evaluated", slot.n_prompt_tokens},
{"generation_settings", get_formated_generation(slot)}, {"generation_settings", get_formated_generation(slot)},
@ -1265,7 +1364,7 @@ struct server_context {
{"index", slot.index}, {"index", slot.index},
}; };
if (slot.sparams.n_probs > 0) { if (slot.params.sampling.n_probs > 0) {
std::vector<completion_token_output> probs; std::vector<completion_token_output> probs;
if (!slot.params.stream && slot.stopped_word) { if (!slot.params.stream && slot.stopped_word) {
const llama_tokens stop_word_toks = common_tokenize(ctx, slot.stopping_word, false); const llama_tokens stop_word_toks = common_tokenize(ctx, slot.stopping_word, false);
@ -1422,10 +1521,10 @@ struct server_context {
data.at("input_prefix"), data.at("input_prefix"),
data.at("input_suffix"), data.at("input_suffix"),
data.at("input_extra"), data.at("input_extra"),
params.n_batch, params_base.n_batch,
params.n_predict, params_base.n_predict,
slots[0].n_ctx, // TODO: there should be a better way slots[0].n_ctx, // TODO: there should be a better way
params.spm_infill, params_base.spm_infill,
tokenized_prompts[i] tokenized_prompts[i]
); );
create_task(data, tokens); create_task(data, tokens);
@ -1798,7 +1897,7 @@ struct server_context {
// TODO: simplify and improve // TODO: simplify and improve
for (server_slot & slot : slots) { for (server_slot & slot : slots) {
if (slot.is_processing() && slot.n_past + 1 >= slot.n_ctx) { if (slot.is_processing() && slot.n_past + 1 >= slot.n_ctx) {
if (!params.ctx_shift) { if (!params_base.ctx_shift) {
// this check is redundant (for good) // this check is redundant (for good)
// we should never get here, because generation should have already stopped in process_token() // we should never get here, because generation should have already stopped in process_token()
slot.release(); slot.release();
@ -1864,7 +1963,7 @@ struct server_context {
int32_t batch_type = batch.n_tokens > 0 ? 0 : -1; int32_t batch_type = batch.n_tokens > 0 ? 0 : -1;
// next, batch any pending prompts without exceeding n_batch // next, batch any pending prompts without exceeding n_batch
if (params.cont_batching || batch.n_tokens == 0) { if (params_base.cont_batching || batch.n_tokens == 0) {
for (auto & slot : slots) { for (auto & slot : slots) {
// this slot still has a prompt to be processed // this slot still has a prompt to be processed
if (slot.state == SLOT_STATE_PROCESSING_PROMPT || slot.state == SLOT_STATE_STARTED) { if (slot.state == SLOT_STATE_PROCESSING_PROMPT || slot.state == SLOT_STATE_STARTED) {
@ -1917,7 +2016,7 @@ struct server_context {
continue; continue;
} }
} else { } else {
if (!params.ctx_shift) { if (!params_base.ctx_shift) {
// if context shift is disabled, we make sure prompt size is smaller than KV size // if context shift is disabled, we make sure prompt size is smaller than KV size
// TODO: there should be a separate parameter that controls prompt truncation // TODO: there should be a separate parameter that controls prompt truncation
// context shift should be applied only during the generation phase // context shift should be applied only during the generation phase
@ -1963,11 +2062,11 @@ struct server_context {
slot.n_past = common_lcp(slot.cache_tokens, prompt_tokens); slot.n_past = common_lcp(slot.cache_tokens, prompt_tokens);
// reuse chunks from the cached prompt by shifting their KV cache in the new position // reuse chunks from the cached prompt by shifting their KV cache in the new position
if (params.n_cache_reuse > 0) { if (params_base.n_cache_reuse > 0) {
size_t head_c = slot.n_past; // cache size_t head_c = slot.n_past; // cache
size_t head_p = slot.n_past; // current prompt size_t head_p = slot.n_past; // current prompt
SLT_DBG(slot, "trying to reuse chunks with size > %d, slot.n_past = %d\n", params.n_cache_reuse, slot.n_past); SLT_DBG(slot, "trying to reuse chunks with size > %d, slot.n_past = %d\n", params_base.n_cache_reuse, slot.n_past);
while (head_c < slot.cache_tokens.size() && while (head_c < slot.cache_tokens.size() &&
head_p < prompt_tokens.size()) { head_p < prompt_tokens.size()) {
@ -1980,7 +2079,7 @@ struct server_context {
n_match++; n_match++;
} }
if (n_match >= (size_t) params.n_cache_reuse) { if (n_match >= (size_t) params_base.n_cache_reuse) {
SLT_INF(slot, "reusing chunk with size %zu, shifting KV cache [%zu, %zu) -> [%zu, %zu)\n", n_match, head_c, head_c + n_match, head_p, head_p + n_match); SLT_INF(slot, "reusing chunk with size %zu, shifting KV cache [%zu, %zu) -> [%zu, %zu)\n", n_match, head_c, head_c + n_match, head_p, head_p + n_match);
//for (size_t i = head_p; i < head_p + n_match; i++) { //for (size_t i = head_p; i < head_p + n_match; i++) {
// SLT_DBG(slot, "cache token %3zu: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx, prompt_tokens[i]).c_str()); // SLT_DBG(slot, "cache token %3zu: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx, prompt_tokens[i]).c_str());
@ -2168,8 +2267,14 @@ struct server_context {
continue; // continue loop of slots continue; // continue loop of slots
} }
llama_token id;
{
completion_token_output result; completion_token_output result;
const llama_token id = common_sampler_sample(slot.smpl, ctx, slot.i_batch - i);
id = common_sampler_sample(slot.smpl, ctx, slot.i_batch - i);
slot.i_batch = -1;
common_sampler_accept(slot.smpl, id, true); common_sampler_accept(slot.smpl, id, true);
@ -2184,7 +2289,7 @@ struct server_context {
const auto * cur_p = common_sampler_get_candidates(slot.smpl); const auto * cur_p = common_sampler_get_candidates(slot.smpl);
for (size_t i = 0; i < (size_t) slot.sparams.n_probs; ++i) { for (size_t i = 0; i < (size_t) slot.params.sampling.n_probs; ++i) {
result.probs.push_back({ result.probs.push_back({
cur_p->data[i].id, cur_p->data[i].id,
i >= cur_p->size ? 0.0f : cur_p->data[i].p, i >= cur_p->size ? 0.0f : cur_p->data[i].p,
@ -2197,9 +2302,64 @@ struct server_context {
slot.print_timings(); slot.print_timings();
send_final_response(slot); send_final_response(slot);
metrics.on_prediction(slot); metrics.on_prediction(slot);
continue;
}
} }
slot.i_batch = -1; // check if the slot supports speculative decoding
if (!slot.can_speculate()) {
continue;
}
struct common_speculative_params params_spec;
params_spec.n_draft = slot.params.speculative.n_max;
params_spec.n_reuse = llama_n_ctx(slot.ctx_dft) - slot.params.speculative.n_max;
params_spec.p_min = slot.params.speculative.p_min;
llama_tokens draft = common_speculative_gen_draft(slot.spec, params_spec, slot.cache_tokens, id);
// ignore small drafts
if (slot.params.speculative.n_min > (int) draft.size()) {
continue;
}
// construct the speculation batch
common_batch_clear(slot.batch_spec);
common_batch_add (slot.batch_spec, id, slot.n_past, { slot.id }, true);
for (size_t i = 0; i < draft.size(); ++i) {
common_batch_add(slot.batch_spec, draft[i], slot.n_past + 1 + i, { slot.id }, true);
}
llama_decode(ctx, slot.batch_spec);
// the accepted tokens from the speculation
const auto ids = common_sampler_sample_and_accept_n(slot.smpl, ctx, draft);
slot.n_past += ids.size();
slot.n_decoded += ids.size();
slot.cache_tokens.push_back(id);
slot.cache_tokens.insert(slot.cache_tokens.end(), ids.begin(), ids.end() - 1);
llama_kv_cache_seq_rm(ctx, slot.id, slot.n_past, -1);
for (size_t i = 0; i < ids.size(); ++i) {
completion_token_output result;
result.tok = ids[i];
if (!process_token(result, slot)) {
// release slot because of stop condition
slot.release();
slot.print_timings();
send_final_response(slot);
metrics.on_prediction(slot);
break;
}
}
SRV_DBG("accepted %d/%d draft tokens\n", (int) ids.size() - 1, (int) draft.size());
} }
} }
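Outside the server loop, the same speculative helpers compose as in this minimal sketch (assumes an initialized target context `ctx_tgt`, draft context `ctx_dft`, sampler `smpl`, the already-decoded tokens `prompt_tgt`, the last sampled token `id_last` and its position `n_past`; numbers are illustrative):

struct common_speculative * spec = common_speculative_init(ctx_dft);

struct common_speculative_params params_spec;
params_spec.n_draft = 16;
params_spec.n_reuse = llama_n_ctx(ctx_dft) - params_spec.n_draft;
params_spec.p_min   = 0.9f;

llama_batch batch_tgt = llama_batch_init(params_spec.n_draft + 1, 0, 1);

// one speculative step: draft with the small model, then verify with the target model
llama_tokens draft = common_speculative_gen_draft(spec, params_spec, prompt_tgt, id_last);

common_batch_clear(batch_tgt);
common_batch_add (batch_tgt, id_last, n_past, { 0 }, true);
for (size_t i = 0; i < draft.size(); ++i) {
    common_batch_add(batch_tgt, draft[i], n_past + 1 + i, { 0 }, true);
}
llama_decode(ctx_tgt, batch_tgt);

// ids contains the token sampled by the target plus however many draft tokens it accepted
const auto ids = common_sampler_sample_and_accept_n(smpl, ctx_tgt, draft);

llama_batch_free(batch_tgt);
common_speculative_free(spec);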
@ -2697,7 +2857,7 @@ int main(int argc, char ** argv) {
const auto handle_props = [&ctx_server, &res_ok](const httplib::Request &, httplib::Response & res) { const auto handle_props = [&ctx_server, &res_ok](const httplib::Request &, httplib::Response & res) {
json data = { json data = {
{ "default_generation_settings", ctx_server.default_generation_settings_for_props }, { "default_generation_settings", ctx_server.default_generation_settings_for_props },
{ "total_slots", ctx_server.params.n_parallel }, { "total_slots", ctx_server.params_base.n_parallel },
{ "chat_template", llama_get_chat_template(ctx_server.model) }, { "chat_template", llama_get_chat_template(ctx_server.model) },
}; };
@ -2705,7 +2865,7 @@ int main(int argc, char ** argv) {
}; };
const auto handle_props_change = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) { const auto handle_props_change = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) {
if (!ctx_server.params.endpoint_props) { if (!ctx_server.params_base.endpoint_props) {
res_error(res, format_error_response("This server does not support changing global properties. Start it with `--props`", ERROR_TYPE_NOT_SUPPORTED)); res_error(res, format_error_response("This server does not support changing global properties. Start it with `--props`", ERROR_TYPE_NOT_SUPPORTED));
return; return;
} }
@ -2718,7 +2878,7 @@ int main(int argc, char ** argv) {
}; };
const auto handle_completions_generic = [&ctx_server, &res_error, &res_ok](server_task_inf_type inf_type, json & data, httplib::Response & res) { const auto handle_completions_generic = [&ctx_server, &res_error, &res_ok](server_task_inf_type inf_type, json & data, httplib::Response & res) {
if (ctx_server.params.embedding) { if (ctx_server.params_base.embedding) {
res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED)); res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED));
return; return;
} }
@ -2824,7 +2984,7 @@ int main(int argc, char ** argv) {
// TODO: maybe merge this function with "handle_completions_generic" // TODO: maybe merge this function with "handle_completions_generic"
const auto handle_chat_completions = [&ctx_server, &params, &res_error, &res_ok, verbose](const httplib::Request & req, httplib::Response & res) { const auto handle_chat_completions = [&ctx_server, &params, &res_error, &res_ok, verbose](const httplib::Request & req, httplib::Response & res) {
if (ctx_server.params.embedding) { if (ctx_server.params_base.embedding) {
res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED)); res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED));
return; return;
} }
@ -3001,7 +3161,7 @@ int main(int argc, char ** argv) {
}; };
const auto handle_rerank = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) { const auto handle_rerank = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) {
if (!ctx_server.params.reranking || ctx_server.params.embedding) { if (!ctx_server.params_base.reranking || ctx_server.params_base.embedding) {
res_error(res, format_error_response("This server does not support reranking. Start it with `--reranking` and without `--embedding`", ERROR_TYPE_NOT_SUPPORTED)); res_error(res, format_error_response("This server does not support reranking. Start it with `--reranking` and without `--embedding`", ERROR_TYPE_NOT_SUPPORTED));
return; return;
} }

View file

@ -62,6 +62,9 @@ int main(int argc, char ** argv) {
} }
}, nullptr); }, nullptr);
// load dynamic backends
ggml_backend_load_all();
// initialize the model // initialize the model
llama_model_params model_params = llama_model_default_params(); llama_model_params model_params = llama_model_default_params();
model_params.n_gpu_layers = ngl; model_params.n_gpu_layers = ngl;

View file

@ -74,6 +74,10 @@ int main(int argc, char ** argv) {
} }
} }
// load dynamic backends
ggml_backend_load_all();
// initialize the model // initialize the model
llama_model_params model_params = llama_model_default_params(); llama_model_params model_params = llama_model_default_params();

View file

@ -46,6 +46,7 @@ int main(int argc, char ** argv) {
ctx_tgt = llama_init_tgt.context; ctx_tgt = llama_init_tgt.context;
// load the draft model // load the draft model
params.devices = params.speculative.devices;
params.model = params.speculative.model; params.model = params.speculative.model;
params.n_ctx = params.speculative.n_ctx; params.n_ctx = params.speculative.n_ctx;
params.n_batch = params.speculative.n_ctx > 0 ? params.speculative.n_ctx : params.n_batch; params.n_batch = params.speculative.n_ctx > 0 ? params.speculative.n_ctx : params.n_batch;

View file

@ -76,6 +76,7 @@ int main(int argc, char ** argv) {
ctx_tgt = llama_init_tgt.context; ctx_tgt = llama_init_tgt.context;
// load the draft model // load the draft model
params.devices = params.speculative.devices;
params.model = params.speculative.model; params.model = params.speculative.model;
params.n_gpu_layers = params.speculative.n_gpu_layers; params.n_gpu_layers = params.speculative.n_gpu_layers;
if (params.speculative.cpuparams.n_threads > 0) { if (params.speculative.cpuparams.n_threads > 0) {

View file

@ -33,6 +33,7 @@ else()
endif() endif()
option(BUILD_SHARED_LIBS "ggml: build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT}) option(BUILD_SHARED_LIBS "ggml: build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT})
option(GGML_BACKEND_DL "ggml: build backends as dynamic libraries (requires BUILD_SHARED_LIBS)" OFF)
# #
# option list # option list

View file

@ -190,6 +190,14 @@ extern "C" {
typedef void (*ggml_backend_set_n_threads_t)(ggml_backend_t backend, int n_threads); typedef void (*ggml_backend_set_n_threads_t)(ggml_backend_t backend, int n_threads);
// Get additional buffer types provided by the device (returns a NULL-terminated array) // Get additional buffer types provided by the device (returns a NULL-terminated array)
typedef ggml_backend_buffer_type_t * (*ggml_backend_dev_get_extra_bufts_t)(ggml_backend_dev_t device); typedef ggml_backend_buffer_type_t * (*ggml_backend_dev_get_extra_bufts_t)(ggml_backend_dev_t device);
// Set the abort callback for the backend
typedef void (*ggml_backend_set_abort_callback_t)(ggml_backend_t backend, ggml_abort_callback abort_callback, void * abort_callback_data);
// Get a list of feature flags supported by the backend (returns a NULL-terminated array)
struct ggml_backend_feature {
const char * name;
const char * value;
};
typedef struct ggml_backend_feature * (*ggml_backend_get_features_t)(ggml_backend_reg_t reg);
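A sketch of what a backend-side implementation of this interface could look like (the `foo` name and feature entries are illustrative; the array is NULL-terminated as noted above):

static struct ggml_backend_feature * ggml_backend_foo_get_features(ggml_backend_reg_t reg) {
    static struct ggml_backend_feature features[] = {
        { "FMA",  "1" },
        { "AVX2", "1" },
        { NULL,   NULL }, // terminator
    };
    (void) reg;
    return features;
}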
// //
// Backend registry // Backend registry
@ -214,6 +222,13 @@ extern "C" {
// = ggml_backend_dev_init(ggml_backend_dev_by_type(GPU) OR ggml_backend_dev_by_type(CPU), NULL) // = ggml_backend_dev_init(ggml_backend_dev_by_type(GPU) OR ggml_backend_dev_by_type(CPU), NULL)
GGML_API ggml_backend_t ggml_backend_init_best(void); GGML_API ggml_backend_t ggml_backend_init_best(void);
// Load a backend from a dynamic library and register it
GGML_API ggml_backend_reg_t ggml_backend_load(const char * path);
// Unload a backend if loaded dynamically and unregister it
GGML_API void ggml_backend_unload(ggml_backend_reg_t reg);
// Load all known backends from dynamic libraries
GGML_API void ggml_backend_load_all(void);
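A minimal usage sketch of the new dynamic loading API (the library path is illustrative; `ggml_backend_init_best` and `ggml_backend_free` are the existing helpers for device selection and cleanup):

#include "ggml-backend.h"

int main(void) {
    // load every backend library found next to the executable / in the search paths
    ggml_backend_load_all();

    // optionally, load one specific backend library by path
    ggml_backend_reg_t extra = ggml_backend_load("./libggml-cuda.so");

    // pick the best available device (GPU if present, otherwise CPU)
    ggml_backend_t backend = ggml_backend_init_best();

    // ... build and compute graphs with the backend ...

    ggml_backend_free(backend);
    if (extra) {
        ggml_backend_unload(extra);
    }
    return 0;
}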
// //
// Backend scheduler // Backend scheduler
// //

View file

@ -7,29 +7,6 @@
extern "C" { extern "C" {
#endif #endif
// Scheduling priorities
enum ggml_sched_priority {
GGML_SCHED_PRIO_NORMAL,
GGML_SCHED_PRIO_MEDIUM,
GGML_SCHED_PRIO_HIGH,
GGML_SCHED_PRIO_REALTIME
};
// Threadpool params
// Use ggml_threadpool_params_default() or ggml_threadpool_params_init() to populate the defaults
struct ggml_threadpool_params {
bool cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores (all-zeros means use default affinity settings)
int n_threads; // number of threads
enum ggml_sched_priority prio; // thread priority
uint32_t poll; // polling level (0 - no polling, 100 - aggressive polling)
bool strict_cpu; // strict cpu placement
bool paused; // start in paused state
};
struct ggml_threadpool; // forward declaration, see ggml.c
typedef struct ggml_threadpool * ggml_threadpool_t;
// the compute plan that needs to be prepared for ggml_graph_compute() // the compute plan that needs to be prepared for ggml_graph_compute()
// since https://github.com/ggerganov/ggml/issues/287 // since https://github.com/ggerganov/ggml/issues/287
struct ggml_cplan { struct ggml_cplan {
@ -75,12 +52,9 @@ extern "C" {
GGML_BACKEND_API float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3); GGML_BACKEND_API float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
GGML_BACKEND_API void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value); GGML_BACKEND_API void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value);
GGML_BACKEND_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
GGML_BACKEND_API void ggml_threadpool_params_init (struct ggml_threadpool_params * p, int n_threads);
GGML_BACKEND_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
GGML_BACKEND_API struct ggml_threadpool * ggml_threadpool_new (struct ggml_threadpool_params * params); GGML_BACKEND_API struct ggml_threadpool * ggml_threadpool_new (struct ggml_threadpool_params * params);
GGML_BACKEND_API void ggml_threadpool_free (struct ggml_threadpool * threadpool); GGML_BACKEND_API void ggml_threadpool_free (struct ggml_threadpool * threadpool);
GGML_BACKEND_API int ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool); GGML_BACKEND_API int ggml_threadpool_get_n_threads (struct ggml_threadpool * threadpool);
GGML_BACKEND_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool); GGML_BACKEND_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool);
GGML_BACKEND_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool); GGML_BACKEND_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool);
@ -104,10 +78,10 @@ extern "C" {
GGML_BACKEND_API int ggml_cpu_has_sse3 (void); GGML_BACKEND_API int ggml_cpu_has_sse3 (void);
GGML_BACKEND_API int ggml_cpu_has_ssse3 (void); GGML_BACKEND_API int ggml_cpu_has_ssse3 (void);
GGML_BACKEND_API int ggml_cpu_has_avx (void); GGML_BACKEND_API int ggml_cpu_has_avx (void);
GGML_BACKEND_API int ggml_cpu_has_avx_vnni (void);
GGML_BACKEND_API int ggml_cpu_has_avx2 (void); GGML_BACKEND_API int ggml_cpu_has_avx2 (void);
GGML_BACKEND_API int ggml_cpu_has_f16c (void); GGML_BACKEND_API int ggml_cpu_has_f16c (void);
GGML_BACKEND_API int ggml_cpu_has_fma (void); GGML_BACKEND_API int ggml_cpu_has_fma (void);
GGML_BACKEND_API int ggml_cpu_has_avx_vnni (void);
GGML_BACKEND_API int ggml_cpu_has_avx512 (void); GGML_BACKEND_API int ggml_cpu_has_avx512 (void);
GGML_BACKEND_API int ggml_cpu_has_avx512_vbmi(void); GGML_BACKEND_API int ggml_cpu_has_avx512_vbmi(void);
GGML_BACKEND_API int ggml_cpu_has_avx512_vnni(void); GGML_BACKEND_API int ggml_cpu_has_avx512_vnni(void);

View file

@ -2215,6 +2215,37 @@ extern "C" {
GGML_API const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type); GGML_API const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type);
// ggml threadpool
// TODO: currently, only a few functions are in the base ggml API, while the rest are in the CPU backend
// the goal should be to create an API that other backends can use and move everything to the ggml base
// scheduling priorities
enum ggml_sched_priority {
GGML_SCHED_PRIO_NORMAL,
GGML_SCHED_PRIO_MEDIUM,
GGML_SCHED_PRIO_HIGH,
GGML_SCHED_PRIO_REALTIME
};
// threadpool params
// Use ggml_threadpool_params_default() or ggml_threadpool_params_init() to populate the defaults
struct ggml_threadpool_params {
bool cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores (all-zeros means use default affinity settings)
int n_threads; // number of threads
enum ggml_sched_priority prio; // thread priority
uint32_t poll; // polling level (0 - no polling, 100 - aggressive polling)
bool strict_cpu; // strict cpu placement
bool paused; // start in paused state
};
struct ggml_threadpool; // forward declaration, see ggml.c
typedef struct ggml_threadpool * ggml_threadpool_t;
GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
GGML_API void ggml_threadpool_params_init (struct ggml_threadpool_params * p, int n_threads);
GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
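A minimal sketch of how the relocated threadpool declarations are used together with the functions that remain in ggml-cpu.h (thread count is illustrative):

struct ggml_threadpool_params tpp = ggml_threadpool_params_default(8);
tpp.prio = GGML_SCHED_PRIO_NORMAL;

struct ggml_threadpool * tp = ggml_threadpool_new(&tpp); // declared in ggml-cpu.h
// ... attach the threadpool to the CPU backend / compute plan and run graphs ...
ggml_threadpool_free(tp);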
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif

View file

@ -202,6 +202,10 @@ endif()
# ggml # ggml
if (GGML_BACKEND_DL AND NOT BUILD_SHARED_LIBS)
message(FATAL_ERROR "GGML_BACKEND_DL requires BUILD_SHARED_LIBS")
endif()
add_library(ggml-base add_library(ggml-base
../include/ggml.h ../include/ggml.h
../include/ggml-alloc.h ../include/ggml-alloc.h
@ -226,6 +230,31 @@ add_library(ggml
target_link_libraries(ggml PUBLIC ggml-base) target_link_libraries(ggml PUBLIC ggml-base)
if (CMAKE_SYSTEM_NAME MATCHES "Linux")
target_link_libraries(ggml PRIVATE dl)
endif()
function(ggml_add_backend_library backend)
if (GGML_BACKEND_DL)
add_library(${backend} MODULE ${ARGN})
# write the shared library to the output directory
set_target_properties(${backend} PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
target_compile_definitions(${backend} PRIVATE GGML_BACKEND_DL)
else()
add_library(${backend} ${ARGN})
target_link_libraries(ggml PUBLIC ${backend})
install(TARGETS ${backend} LIBRARY)
endif()
target_link_libraries(${backend} PRIVATE ggml-base)
target_include_directories(${backend} PRIVATE ..)
if (${BUILD_SHARED_LIBS})
target_compile_definitions(${backend} PRIVATE GGML_BACKEND_BUILD)
target_compile_definitions(${backend} PUBLIC GGML_BACKEND_SHARED)
endif()
endfunction()
function(ggml_add_backend backend) function(ggml_add_backend backend)
string(TOUPPER "GGML_${backend}" backend_id) string(TOUPPER "GGML_${backend}" backend_id)
if (${backend_id}) if (${backend_id})
@ -236,16 +265,12 @@ function(ggml_add_backend backend)
# however, currently it is necessary for AMX, since it is enabled by default on llama.cpp # however, currently it is necessary for AMX, since it is enabled by default on llama.cpp
if (${backend_id}) if (${backend_id})
message(STATUS "Including ${backend} backend") message(STATUS "Including ${backend} backend")
if (${BUILD_SHARED_LIBS}) if (NOT GGML_BACKEND_DL)
target_compile_definitions(${backend_target} PRIVATE GGML_BACKEND_BUILD)
target_compile_definitions(${backend_target} PUBLIC GGML_BACKEND_SHARED)
endif()
install(TARGETS ${backend_target} LIBRARY)
target_link_libraries(ggml PUBLIC ${backend_target})
string(TOUPPER "GGML_USE_${backend}" backend_use) string(TOUPPER "GGML_USE_${backend}" backend_use)
target_compile_definitions(ggml PUBLIC ${backend_use}) target_compile_definitions(ggml PUBLIC ${backend_use})
endif() endif()
endif() endif()
endif()
endfunction() endfunction()
ggml_add_backend(CPU) ggml_add_backend(CPU)
@ -256,10 +281,10 @@ ggml_add_backend(CUDA)
ggml_add_backend(HIP) ggml_add_backend(HIP)
ggml_add_backend(Kompute) ggml_add_backend(Kompute)
ggml_add_backend(METAL) ggml_add_backend(METAL)
ggml_add_backend(MUSA)
ggml_add_backend(RPC) ggml_add_backend(RPC)
ggml_add_backend(SYCL) ggml_add_backend(SYCL)
ggml_add_backend(Vulkan) ggml_add_backend(Vulkan)
ggml_add_backend(MUSA)
foreach (target ggml-base ggml) foreach (target ggml-base ggml)
target_include_directories(${target} PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include> $<INSTALL_INTERFACE:include>) target_include_directories(${target} PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include> $<INSTALL_INTERFACE:include>)

View file

@ -9,12 +9,10 @@ if (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MA
file(GLOB GGML_SOURCES_AMX "*.cpp") file(GLOB GGML_SOURCES_AMX "*.cpp")
add_library(ggml-amx ggml_add_backend_library(ggml-amx
${GGML_HEADERS_AMX} ${GGML_HEADERS_AMX}
${GGML_SOURCES_AMX}) ${GGML_SOURCES_AMX}
)
target_link_libraries(ggml-amx PRIVATE ggml-base)
target_include_directories(ggml-amx PRIVATE . ..)
# this is duplicated from the CPU backend, since the AMX backend also depends on the architecture flags # this is duplicated from the CPU backend, since the AMX backend also depends on the architecture flags
# TODO: integrate AMX backend into the CPU backend # TODO: integrate AMX backend into the CPU backend

View file

@ -409,6 +409,7 @@ static const struct ggml_backend_reg_i ggml_backend_amx_reg_i = {
ggml_backend_reg_t ggml_backend_amx_reg(void) { ggml_backend_reg_t ggml_backend_amx_reg(void) {
static struct ggml_backend_reg ggml_backend_amx_reg = { static struct ggml_backend_reg ggml_backend_amx_reg = {
/* .api_version = */ GGML_BACKEND_API_VERSION,
/* .iface = */ ggml_backend_amx_reg_i, /* .iface = */ ggml_backend_amx_reg_i,
/* .context = */ NULL, /* .context = */ NULL,
}; };
@ -444,3 +445,5 @@ ggml_backend_reg_t ggml_backend_amx_reg(void) {
} }
#endif #endif
GGML_BACKEND_DL_IMPL(ggml_backend_amx_reg)

View file

@ -8,6 +8,8 @@
extern "C" { extern "C" {
#endif #endif
#define GGML_BACKEND_API_VERSION 1
// //
// Backend buffer type // Backend buffer type
// //
@ -63,20 +65,20 @@ extern "C" {
enum ggml_backend_buffer_usage usage; enum ggml_backend_buffer_usage usage;
}; };
ggml_backend_buffer_t ggml_backend_buffer_init( GGML_API ggml_backend_buffer_t ggml_backend_buffer_init(
ggml_backend_buffer_type_t buft, ggml_backend_buffer_type_t buft,
struct ggml_backend_buffer_i iface, struct ggml_backend_buffer_i iface,
void * context, void * context,
size_t size); size_t size);
// do not use directly, use ggml_backend_tensor_copy instead // do not use directly, use ggml_backend_tensor_copy instead
bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst); GGML_API bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst);
// multi-buffer // multi-buffer
// buffer that contains a collection of buffers // buffer that contains a collection of buffers
ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers); GGML_API ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers);
bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer); GGML_API bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer);
void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage); GGML_API void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
// //
// Backend (stream) // Backend (stream)
@ -199,17 +201,37 @@ extern "C" {
}; };
struct ggml_backend_reg { struct ggml_backend_reg {
// int api_version; // TODO: for dynamic loading int api_version; // initialize to GGML_BACKEND_API_VERSION
struct ggml_backend_reg_i iface; struct ggml_backend_reg_i iface;
void * context; void * context;
}; };
// Internal backend registry API // Internal backend registry API
void ggml_backend_register(ggml_backend_reg_t reg); GGML_API void ggml_backend_register(ggml_backend_reg_t reg);
void ggml_backend_device_register(ggml_backend_dev_t device); GGML_API void ggml_backend_device_register(ggml_backend_dev_t device);
// TODO: backends can be loaded as a dynamic library, in which case it needs to export this function
// typedef ggml_backend_register_t * (*ggml_backend_init)(void); // Add backend dynamic loading support to the backend
typedef ggml_backend_reg_t (*ggml_backend_init_t)(void);
#ifdef GGML_BACKEND_DL
#ifdef __cplusplus
# define GGML_BACKEND_DL_IMPL(reg_fn) \
extern "C" { \
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_init(void); \
} \
ggml_backend_reg_t ggml_backend_init(void) { \
return reg_fn(); \
}
#else
# define GGML_BACKEND_DL_IMPL(reg_fn) \
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_init(void); \
ggml_backend_reg_t ggml_backend_init(void) { \
return reg_fn(); \
}
#endif
#else
# define GGML_BACKEND_DL_IMPL(reg_fn)
#endif
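A sketch of how a backend translation unit is expected to use this macro, mirroring the AMX registration above and the BLAS one further down (the `foo` names are placeholders):

ggml_backend_reg_t ggml_backend_foo_reg(void) {
    static struct ggml_backend_reg reg = {
        /* .api_version = */ GGML_BACKEND_API_VERSION,
        /* .iface       = */ ggml_backend_foo_reg_i, // the backend's ggml_backend_reg_i table
        /* .context     = */ NULL,
    };
    return &reg;
}

// with GGML_BACKEND_DL defined this expands to an exported ggml_backend_init()
// that the registry resolves via dlsym()/GetProcAddress(); otherwise it expands to nothing
GGML_BACKEND_DL_IMPL(ggml_backend_foo_reg)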
#ifdef __cplusplus #ifdef __cplusplus
} }

View file

@ -1,11 +1,29 @@
#include "ggml-backend-impl.h" #include "ggml-backend-impl.h"
#include "ggml-backend.h" #include "ggml-backend.h"
#include "ggml-cpu.h"
#include "ggml-impl.h" #include "ggml-impl.h"
#include <algorithm>
#include <cstring> #include <cstring>
#include <string>
#include <vector> #include <vector>
#ifdef _WIN32
# define WIN32_LEAN_AND_MEAN
# ifndef NOMINMAX
# define NOMINMAX
# endif
# include <windows.h>
#elif defined(__APPLE__)
# include <mach-o/dyld.h>
# include <dlfcn.h>
#else
# include <dlfcn.h>
# include <unistd.h>
#endif
// Backend registry // Backend registry
#ifdef GGML_USE_CPU
#include "ggml-cpu.h"
#endif
#ifdef GGML_USE_CUDA #ifdef GGML_USE_CUDA
#include "ggml-cuda.h" #include "ggml-cuda.h"
@ -43,8 +61,13 @@
#include "ggml-kompute.h" #include "ggml-kompute.h"
#endif #endif
struct ggml_backend_reg_entry {
ggml_backend_reg_t reg;
void * handle;
};
struct ggml_backend_registry { struct ggml_backend_registry {
std::vector<ggml_backend_reg_t> backends; std::vector<ggml_backend_reg_entry> backends;
std::vector<ggml_backend_dev_t> devices; std::vector<ggml_backend_dev_t> devices;
ggml_backend_registry() { ggml_backend_registry() {
@ -75,11 +98,19 @@ struct ggml_backend_registry {
#ifdef GGML_USE_KOMPUTE #ifdef GGML_USE_KOMPUTE
register_backend(ggml_backend_kompute_reg()); register_backend(ggml_backend_kompute_reg());
#endif #endif
#ifdef GGML_USE_CPU
register_backend(ggml_backend_cpu_reg()); register_backend(ggml_backend_cpu_reg());
#endif
} }
void register_backend(ggml_backend_reg_t reg) { ~ggml_backend_registry() {
while (!backends.empty()) {
// use silent since the log system may have been destroyed at this point
unload_backend(backends.back().reg, true);
}
}
void register_backend(ggml_backend_reg_t reg, void * handle = nullptr) {
if (!reg) { if (!reg) {
return; return;
} }
@ -88,7 +119,7 @@ struct ggml_backend_registry {
GGML_LOG_DEBUG("%s: registered backend %s (%zu devices)\n", GGML_LOG_DEBUG("%s: registered backend %s (%zu devices)\n",
__func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg)); __func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg));
#endif #endif
backends.push_back(reg); backends.push_back({ reg, handle });
for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) { for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) {
register_device(ggml_backend_reg_dev_get(reg, i)); register_device(ggml_backend_reg_dev_get(reg, i));
} }
@ -100,6 +131,111 @@ struct ggml_backend_registry {
#endif #endif
devices.push_back(device); devices.push_back(device);
} }
ggml_backend_reg_t load_backend(const char * path, bool silent) {
#ifdef _WIN32
// suppress error dialogs for missing DLLs
DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
HMODULE handle = LoadLibraryA(path);
if (!handle) {
if (!silent) {
GGML_LOG_ERROR("%s: failed to load %s: %lu\n", __func__, path, GetLastError());
}
SetErrorMode(old_mode);
return nullptr;
}
ggml_backend_init_t backend_init = (ggml_backend_init_t) GetProcAddress(handle, "ggml_backend_init");
SetErrorMode(old_mode);
if (!backend_init) {
if (!silent) {
GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s: %lu\n", __func__, path, GetLastError());
}
FreeLibrary(handle);
return nullptr;
}
#else
void * handle = dlopen(path, RTLD_NOW | RTLD_LOCAL);
if (!handle) {
if (!silent) {
GGML_LOG_ERROR("%s: failed to load %s: %s\n", __func__, path, dlerror());
}
return nullptr;
}
auto * backend_init = (ggml_backend_init_t) dlsym(handle, "ggml_backend_init");
if (!backend_init) {
if (!silent) {
GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s: %s\n", __func__, path, dlerror());
}
dlclose(handle);
return nullptr;
}
#endif
ggml_backend_reg_t reg = backend_init();
if (!reg || reg->api_version != GGML_BACKEND_API_VERSION) {
if (!silent) {
if (!reg) {
GGML_LOG_ERROR("%s: failed to initialize backend from %s: ggml_backend_init returned NULL\n", __func__, path);
} else {
GGML_LOG_ERROR("%s: failed to initialize backend from %s: incompatible API version (backend: %d, current: %d)\n",
__func__, path, reg->api_version, GGML_BACKEND_API_VERSION);
}
}
#ifdef _WIN32
FreeLibrary(handle);
#else
dlclose(handle);
#endif
return nullptr;
}
GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), path);
register_backend(reg, handle);
return reg;
}
void unload_backend(ggml_backend_reg_t reg, bool silent) {
auto it = std::find_if(backends.begin(), backends.end(),
[reg](ggml_backend_reg_entry entry) { return entry.reg == reg; });
if (it == backends.end()) {
if (!silent) {
GGML_LOG_ERROR("%s: backend not found\n", __func__);
}
return;
}
if (!silent) {
GGML_LOG_DEBUG("%s: unloading %s backend\n", __func__, ggml_backend_reg_name(reg));
}
// remove devices
devices.erase(
std::remove_if(devices.begin(), devices.end(),
[reg](ggml_backend_dev_t dev) { return ggml_backend_dev_backend_reg(dev) == reg; }),
devices.end());
// unload library
if (it->handle) {
#ifdef _WIN32
FreeLibrary((HMODULE) it->handle);
#else
dlclose(it->handle);
#endif
}
// remove backend
backends.erase(it);
}
}; };
static ggml_backend_registry & get_reg() { static ggml_backend_registry & get_reg() {
@ -117,23 +253,32 @@ void ggml_backend_device_register(ggml_backend_dev_t device) {
} }
// Backend (reg) enumeration // Backend (reg) enumeration
static bool striequals(const char * a, const char * b) {
for (; *a && *b; a++, b++) {
if (std::tolower(*a) != std::tolower(*b)) {
return false;
}
}
return *a == *b;
}
size_t ggml_backend_reg_count() { size_t ggml_backend_reg_count() {
return get_reg().backends.size(); return get_reg().backends.size();
} }
ggml_backend_reg_t ggml_backend_reg_get(size_t index) { ggml_backend_reg_t ggml_backend_reg_get(size_t index) {
GGML_ASSERT(index < ggml_backend_reg_count()); GGML_ASSERT(index < ggml_backend_reg_count());
return get_reg().backends[index]; return get_reg().backends[index].reg;
} }
ggml_backend_reg_t ggml_backend_reg_by_name(const char * name) { ggml_backend_reg_t ggml_backend_reg_by_name(const char * name) {
for (size_t i = 0; i < ggml_backend_reg_count(); i++) { for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
ggml_backend_reg_t reg = ggml_backend_reg_get(i); ggml_backend_reg_t reg = ggml_backend_reg_get(i);
if (std::strcmp(ggml_backend_reg_name(reg), name) == 0) { if (striequals(ggml_backend_reg_name(reg), name)) {
return reg; return reg;
} }
} }
return NULL; return nullptr;
} }
// Device enumeration // Device enumeration
@ -149,11 +294,11 @@ ggml_backend_dev_t ggml_backend_dev_get(size_t index) {
ggml_backend_dev_t ggml_backend_dev_by_name(const char * name) { ggml_backend_dev_t ggml_backend_dev_by_name(const char * name) {
for (size_t i = 0; i < ggml_backend_dev_count(); i++) { for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
ggml_backend_dev_t dev = ggml_backend_dev_get(i); ggml_backend_dev_t dev = ggml_backend_dev_get(i);
if (strcmp(ggml_backend_dev_name(dev), name) == 0) { if (striequals(ggml_backend_dev_name(dev), name)) {
return dev; return dev;
} }
} }
return NULL; return nullptr;
} }
ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type) { ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type) {
@ -163,14 +308,14 @@ ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type) {
return dev; return dev;
} }
} }
return NULL; return nullptr;
} }
// Convenience functions // Convenience functions
ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params) { ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params) {
ggml_backend_dev_t dev = ggml_backend_dev_by_name(name); ggml_backend_dev_t dev = ggml_backend_dev_by_name(name);
if (!dev) { if (!dev) {
return NULL; return nullptr;
} }
return ggml_backend_dev_init(dev, params); return ggml_backend_dev_init(dev, params);
} }
@ -178,7 +323,7 @@ ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params)
ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params) { ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params) {
ggml_backend_dev_t dev = ggml_backend_dev_by_type(type); ggml_backend_dev_t dev = ggml_backend_dev_by_type(type);
if (!dev) { if (!dev) {
return NULL; return nullptr;
} }
return ggml_backend_dev_init(dev, params); return ggml_backend_dev_init(dev, params);
} }
@ -189,7 +334,97 @@ ggml_backend_t ggml_backend_init_best(void) {
dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
} }
if (!dev) { if (!dev) {
return NULL; return nullptr;
} }
return ggml_backend_dev_init(dev, NULL); return ggml_backend_dev_init(dev, nullptr);
}
// Dynamic loading
ggml_backend_reg_t ggml_backend_load(const char * path) {
return get_reg().load_backend(path, false);
}
void ggml_backend_unload(ggml_backend_reg_t reg) {
get_reg().unload_backend(reg, true);
}
void ggml_backend_load_all() {
std::vector<std::string> search_prefix;
// add the executable directory to the search path
// FIXME: this is convenient for development, but it should probably be disabled in production
#if defined(__APPLE__)
// get executable path
std::vector<char> path;
uint32_t size;
while (true) {
size = path.size();
if (_NSGetExecutablePath(path.data(), &size) == 0) {
break;
}
path.resize(size);
}
std::string base_path(path.data(), size);
// remove executable name
auto last_slash = base_path.find_last_of('/');
if (last_slash != std::string::npos) {
base_path = base_path.substr(0, last_slash);
}
search_prefix.push_back(base_path + "/");
#elif defined(__linux__)
std::string base_path = ".";
std::vector<char> path(1024);
while (true) {
// get executable path
ssize_t len = readlink("/proc/self/exe", path.data(), path.size());
if (len == -1) {
break;
}
if (len < (ssize_t) path.size()) {
base_path = std::string(path.data(), len);
// remove executable name
auto last_slash = base_path.find_last_of('/');
if (last_slash != std::string::npos) {
base_path = base_path.substr(0, last_slash);
}
break;
}
path.resize(path.size() * 2);
}
search_prefix.push_back(base_path + "/");
#endif
auto & reg = get_reg();
auto try_load = [&](const std::string & name) {
std::string os_name;
#ifdef _WIN32
os_name = "ggml-" + name + ".dll";
#else
os_name = "libggml-" + name + ".so";
#endif
if (reg.load_backend(os_name.c_str(), true)) {
return;
}
for (const auto & prefix : search_prefix) {
if (reg.load_backend((prefix + os_name).c_str(), true)) {
return;
}
}
};
try_load("amx");
try_load("blas");
try_load("cann");
try_load("cuda");
try_load("hip");
try_load("kompute");
try_load("metal");
try_load("rpc");
try_load("sycl");
try_load("vulkan");
try_load("musa");
try_load("cpu");
} }
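For context, a minimal usage sketch of the dynamic-loading API added here (hypothetical, not part of the diff; assumes the declarations are visible via ggml-backend.h): load every backend found next to the executable, then enumerate the registered devices.

// Hypothetical usage of the dynamic backend loader shown above.
#include "ggml-backend.h"
#include <cstdio>

int main() {
    ggml_backend_load_all();  // probes ggml-<name>.dll / libggml-<name>.so, including the executable directory
    for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
        std::printf("device %zu: %s\n", i, ggml_backend_dev_name(dev));
    }
    return 0;
}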

@ -11,13 +11,10 @@ find_package(BLAS)
if (BLAS_FOUND) if (BLAS_FOUND)
message(STATUS "BLAS found, Libraries: ${BLAS_LIBRARIES}") message(STATUS "BLAS found, Libraries: ${BLAS_LIBRARIES}")
add_library(ggml-blas ggml_add_backend_library(ggml-blas
ggml-blas.cpp ggml-blas.cpp
) )
target_link_libraries(ggml-blas PRIVATE ggml-base)
target_include_directories(ggml-blas PRIVATE . ..)
if (${GGML_BLAS_VENDOR} MATCHES "Apple") if (${GGML_BLAS_VENDOR} MATCHES "Apple")
add_compile_definitions(ACCELERATE_NEW_LAPACK) add_compile_definitions(ACCELERATE_NEW_LAPACK)
add_compile_definitions(ACCELERATE_LAPACK_ILP64) add_compile_definitions(ACCELERATE_LAPACK_ILP64)

@ -506,9 +506,12 @@ static const struct ggml_backend_reg_i ggml_backend_blas_reg_i = {
ggml_backend_reg_t ggml_backend_blas_reg(void) { ggml_backend_reg_t ggml_backend_blas_reg(void) {
static struct ggml_backend_reg ggml_backend_blas_reg = { static struct ggml_backend_reg ggml_backend_blas_reg = {
/* .api_version = */ GGML_BACKEND_API_VERSION,
/* .iface = */ ggml_backend_blas_reg_i, /* .iface = */ ggml_backend_blas_reg_i,
/* .context = */ NULL, /* .context = */ NULL,
}; };
return &ggml_backend_blas_reg; return &ggml_backend_blas_reg;
} }
GGML_BACKEND_DL_IMPL(ggml_backend_blas_reg)

@ -61,9 +61,9 @@ if (CANN_INSTALL_DIR)
file(GLOB GGML_SOURCES_CANN "*.cpp") file(GLOB GGML_SOURCES_CANN "*.cpp")
add_library(ggml-cann ${GGML_SOURCES_CANN}) ggml_add_backend_library(ggml-cann ${GGML_SOURCES_CANN})
target_link_libraries(ggml-cann PRIVATE ggml-base ${CANN_LIBRARIES}) target_link_libraries(ggml-cann PRIVATE ${CANN_LIBRARIES})
target_include_directories(ggml-cann PRIVATE . .. ${CANN_INCLUDE_DIRS}) target_include_directories(ggml-cann PRIVATE ${CANN_INCLUDE_DIRS})
target_link_directories(ggml-cann PRIVATE ${CANN_INSTALL_DIR}/lib64) target_link_directories(ggml-cann PRIVATE ${CANN_INSTALL_DIR}/lib64)
target_compile_definitions(ggml-cann PRIVATE "-D${SOC_TYPE_COMPILE_OPTION}") target_compile_definitions(ggml-cann PRIVATE "-D${SOC_TYPE_COMPILE_OPTION}")

@ -2080,7 +2080,7 @@ ggml_backend_reg_t ggml_backend_cann_reg() {
dev_ctx->name = GGML_CANN_NAME + std::to_string(i); dev_ctx->name = GGML_CANN_NAME + std::to_string(i);
ggml_cann_set_device(i); ggml_cann_set_device(i);
ggml_backend_dev_t dev = new ggml_backend_device { ggml_backend_dev_t dev = new ggml_backend_device {
/* .interface = */ ggml_backend_cann_device_interface, /* .iface = */ ggml_backend_cann_device_interface,
/* .reg = */ &reg, /* .reg = */ &reg,
/* .context = */ dev_ctx /* .context = */ dev_ctx
}; };
@ -2088,7 +2088,8 @@ ggml_backend_reg_t ggml_backend_cann_reg() {
} }
reg = ggml_backend_reg { reg = ggml_backend_reg {
/* .interface = */ ggml_backend_cann_reg_interface, /* .api_version = */ GGML_BACKEND_API_VERSION,
/* .iface = */ ggml_backend_cann_reg_interface,
/* .context = */ ctx /* .context = */ ctx
}; };
} }
@ -2142,3 +2143,5 @@ void ggml_backend_cann_get_device_memory(int32_t device, size_t* free,
ggml_cann_set_device(device); ggml_cann_set_device(device);
ACL_CHECK(aclrtGetMemInfo(ACL_HBM_MEM, free, total)); ACL_CHECK(aclrtGetMemInfo(ACL_HBM_MEM, free, total));
} }
GGML_BACKEND_DL_IMPL(ggml_backend_cann_reg)

@ -1,4 +1,4 @@
add_library(ggml-cpu ggml_add_backend_library(ggml-cpu
ggml-cpu.c ggml-cpu.c
ggml-cpu.cpp ggml-cpu.cpp
ggml-cpu-aarch64.c ggml-cpu-aarch64.c
@ -7,8 +7,7 @@ add_library(ggml-cpu
ggml-cpu-quants.h ggml-cpu-quants.h
) )
target_link_libraries(ggml-cpu PRIVATE ggml-base) target_include_directories(ggml-cpu PRIVATE .)
target_include_directories(ggml-cpu PRIVATE . ..)
if (APPLE AND GGML_ACCELERATE) if (APPLE AND GGML_ACCELERATE)
find_library(ACCELERATE_FRAMEWORK Accelerate) find_library(ACCELERATE_FRAMEWORK Accelerate)

@ -13578,29 +13578,6 @@ static void ggml_graph_compute_kickoff(struct ggml_threadpool * threadpool, int
#endif // GGML_USE_OPENMP #endif // GGML_USE_OPENMP
void ggml_threadpool_params_init(struct ggml_threadpool_params * p, int n_threads) {
p->n_threads = n_threads;
p->prio = 0; // default priority (usually means normal or inherited)
p->poll = 50; // hybrid-polling enabled
p->strict_cpu = false; // no strict placement (all threads share same cpumask)
p->paused = false; // threads are ready to go
memset(p->cpumask, 0, GGML_MAX_N_THREADS); // all-zero means use the default affinity (usually inherited)
}
struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads) {
struct ggml_threadpool_params p;
ggml_threadpool_params_init(&p, n_threads);
return p;
}
bool ggml_threadpool_params_match(const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1) {
if (p0->n_threads != p1->n_threads ) return false;
if (p0->prio != p1->prio ) return false;
if (p0->poll != p1->poll ) return false;
if (p0->strict_cpu != p1->strict_cpu ) return false;
return memcmp(p0->cpumask, p1->cpumask, GGML_MAX_N_THREADS) == 0;
}
static struct ggml_threadpool * ggml_threadpool_new_impl( static struct ggml_threadpool * ggml_threadpool_new_impl(
struct ggml_threadpool_params * tpp, struct ggml_threadpool_params * tpp,
struct ggml_cgraph * cgraph, struct ggml_cgraph * cgraph,

@ -541,16 +541,12 @@ static ggml_backend_dev_t ggml_backend_cpu_reg_get_device(ggml_backend_reg_t reg
return &ggml_backend_cpu_device; return &ggml_backend_cpu_device;
} }
struct ggml_backend_feature {
const char * name;
const char * value;
};
// Not used yet
// This is intended to replace the the ggml_cpu_has_* functions when loading the CPU backend dynamically, // This is intended to replace the the ggml_cpu_has_* functions when loading the CPU backend dynamically,
// and additionally to allow other backends to expose their own list of features that applications can query using the same API. // and additionally to allow other backends to expose their own list of features that applications can query using the same API
static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t reg) { static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t reg) {
static std::vector<ggml_backend_feature> features = []() { static std::vector<ggml_backend_feature> features = []() {
ggml_cpu_init();
std::vector<ggml_backend_feature> features; std::vector<ggml_backend_feature> features;
if (ggml_cpu_has_sse3()) { if (ggml_cpu_has_sse3()) {
features.push_back({ "SSE3", "1" }); features.push_back({ "SSE3", "1" });
@ -561,6 +557,9 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r
if (ggml_cpu_has_avx()) { if (ggml_cpu_has_avx()) {
features.push_back({ "AVX", "1" }); features.push_back({ "AVX", "1" });
} }
if (ggml_cpu_has_avx_vnni()) {
features.push_back({ "AVX_VNNI", "1" });
}
if (ggml_cpu_has_avx2()) { if (ggml_cpu_has_avx2()) {
features.push_back({ "AVX2", "1" }); features.push_back({ "AVX2", "1" });
} }
@ -570,9 +569,6 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r
if (ggml_cpu_has_fma()) { if (ggml_cpu_has_fma()) {
features.push_back({ "FMA", "1" }); features.push_back({ "FMA", "1" });
} }
if (ggml_cpu_has_avx_vnni()) {
features.push_back({ "AVX_VNNI", "1" });
}
if (ggml_cpu_has_avx512()) { if (ggml_cpu_has_avx512()) {
features.push_back({ "AVX512", "1" }); features.push_back({ "AVX512", "1" });
} }
@ -619,6 +615,10 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r
if (ggml_cpu_has_llamafile()) { if (ggml_cpu_has_llamafile()) {
features.push_back({ "LLAMAFILE", "1" }); features.push_back({ "LLAMAFILE", "1" });
} }
// TODO: rename this
#ifdef GGML_USE_CPU_AARCH64
features.push_back({ "AARCH64_REPACK", "1" });
#endif
features.push_back({ nullptr, nullptr }); features.push_back({ nullptr, nullptr });
@ -637,6 +637,29 @@ static void * ggml_backend_cpu_get_proc_address(ggml_backend_reg_t reg, const ch
if (strcmp(name, "ggml_backend_dev_get_extra_bufts") == 0) { if (strcmp(name, "ggml_backend_dev_get_extra_bufts") == 0) {
return (void *)ggml_backend_cpu_get_extra_bufts; return (void *)ggml_backend_cpu_get_extra_bufts;
} }
if (strcmp(name, "ggml_backend_get_features") == 0) {
return (void *)ggml_backend_cpu_get_features;
}
if (strcmp(name, "ggml_backend_set_abort_callback") == 0) {
return (void *)ggml_backend_cpu_set_abort_callback;
}
if (strcmp(name, "ggml_backend_cpu_numa_init") == 0) {
return (void *)ggml_numa_init;
}
if (strcmp(name, "ggml_backend_cpu_is_numa") == 0) {
return (void *)ggml_is_numa;
}
// threadpool - TODO: move to ggml-base
if (strcmp(name, "ggml_threadpool_new") == 0) {
return (void *)ggml_threadpool_new;
}
if (strcmp(name, "ggml_threadpool_free") == 0) {
return (void *)ggml_threadpool_free;
}
if (strcmp(name, "ggml_backend_cpu_set_threadpool") == 0) {
return (void *)ggml_backend_cpu_set_threadpool;
}
return NULL; return NULL;
@ -655,9 +678,12 @@ ggml_backend_reg_t ggml_backend_cpu_reg(void) {
ggml_cpu_init(); ggml_cpu_init();
static struct ggml_backend_reg ggml_backend_cpu_reg = { static struct ggml_backend_reg ggml_backend_cpu_reg = {
/* .api_version = */ GGML_BACKEND_API_VERSION,
/* .iface = */ ggml_backend_cpu_reg_i, /* .iface = */ ggml_backend_cpu_reg_i,
/* .context = */ NULL, /* .context = */ NULL,
}; };
return &ggml_backend_cpu_reg; return &ggml_backend_cpu_reg;
} }
GGML_BACKEND_DL_IMPL(ggml_backend_cpu_reg)
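A hedged sketch of how a caller might consume the "ggml_backend_get_features" proc address registered above (assumption: the ggml_backend_get_features_t typedef and the feature struct are exposed through ggml-backend.h; the pattern mirrors the llama_print_system_info change later in this diff):

// Hypothetical feature dump through the proc-address mechanism.
#include "ggml-backend.h"
#include <cstdio>

static void dump_backend_features(ggml_backend_reg_t reg) {
    auto * get_features_fn = (ggml_backend_get_features_t)
        ggml_backend_reg_get_proc_address(reg, "ggml_backend_get_features");
    if (!get_features_fn) {
        return;  // this backend does not expose a feature list
    }
    for (ggml_backend_feature * f = get_features_fn(reg); f->name; f++) {
        std::printf("%s = %s\n", f->name, f->value);
    }
}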

@ -46,14 +46,11 @@ if (CUDAToolkit_FOUND)
list(APPEND GGML_SOURCES_CUDA ${SRCS}) list(APPEND GGML_SOURCES_CUDA ${SRCS})
endif() endif()
add_library(ggml-cuda ggml_add_backend_library(ggml-cuda
${GGML_HEADERS_CUDA} ${GGML_HEADERS_CUDA}
${GGML_SOURCES_CUDA} ${GGML_SOURCES_CUDA}
) )
target_link_libraries(ggml-cuda PRIVATE ggml-base)
target_include_directories(ggml-cuda PRIVATE . ..)
add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${GGML_CUDA_PEER_MAX_BATCH_SIZE}) add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${GGML_CUDA_PEER_MAX_BATCH_SIZE})
if (GGML_CUDA_GRAPHS) if (GGML_CUDA_GRAPHS)

@ -3126,6 +3126,61 @@ static ggml_backend_dev_t ggml_backend_cuda_reg_get_device(ggml_backend_reg_t re
return ctx->devices[index]; return ctx->devices[index];
} }
static ggml_backend_feature * ggml_backend_cuda_get_features(ggml_backend_reg_t reg) {
static std::vector<ggml_backend_feature> features = []() {
std::vector<ggml_backend_feature> features;
#define _STRINGIFY(...) #__VA_ARGS__
#define STRINGIFY(...) _STRINGIFY(__VA_ARGS__)
#ifdef __CUDA_ARCH_LIST__
features.push_back({ "ARCHS", STRINGIFY(__CUDA_ARCH_LIST__) });
#endif
#ifdef GGML_CUDA_FORCE_MMQ
features.push_back({ "FORCE_MMQ", "1" });
#endif
#ifdef GGML_CUDA_FORCE_CUBLAS
features.push_back({ "FORCE_CUBLAS", "1" });
#endif
#ifdef GGML_CUDA_NO_VMM
features.push_back({ "NO_VMM", "1" });
#endif
#ifdef GGML_CUDA_NO_PEER_COPY
features.push_back({ "NO_PEER_COPY", "1" });
#endif
#ifdef GGML_CUDA_F16
features.push_back({ "F16", "1" });
#endif
#ifdef GGML_CUDA_USE_GRAPHS
features.push_back({ "USE_GRAPHS", "1" });
#endif
#ifdef GGML_CUDA_PEER_MAX_BATCH_SIZE
features.push_back({ "PEER_MAX_BATCH_SIZE", STRINGIFY(GGML_CUDA_PEER_MAX_BATCH_SIZE) });
#endif
#ifdef GGML_CUDA_FA_ALL_QUANTS
features.push_back({ "FA_ALL_QUANTS", "1" });
#endif
#undef _STRINGIFY
#undef STRINGIFY
features.push_back({ nullptr, nullptr });
return features;
}();
return features.data();
GGML_UNUSED(reg);
}
static void * ggml_backend_cuda_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) { static void * ggml_backend_cuda_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) {
GGML_UNUSED(reg); GGML_UNUSED(reg);
if (strcmp(name, "ggml_backend_split_buffer_type") == 0) { if (strcmp(name, "ggml_backend_split_buffer_type") == 0) {
@ -3137,6 +3192,9 @@ static void * ggml_backend_cuda_reg_get_proc_address(ggml_backend_reg_t reg, con
if (strcmp(name, "ggml_backend_unregister_host_buffer") == 0) { if (strcmp(name, "ggml_backend_unregister_host_buffer") == 0) {
return (void *)ggml_backend_cuda_unregister_host_buffer; return (void *)ggml_backend_cuda_unregister_host_buffer;
} }
if (strcmp(name, "ggml_backend_get_features") == 0) {
return (void *)ggml_backend_cuda_get_features;
}
return nullptr; return nullptr;
} }
@ -3169,7 +3227,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
dev_ctx->description = prop.name; dev_ctx->description = prop.name;
ggml_backend_dev_t dev = new ggml_backend_device { ggml_backend_dev_t dev = new ggml_backend_device {
/* .interface = */ ggml_backend_cuda_device_interface, /* .iface = */ ggml_backend_cuda_device_interface,
/* .reg = */ &reg, /* .reg = */ &reg,
/* .context = */ dev_ctx /* .context = */ dev_ctx
}; };
@ -3177,7 +3235,8 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
} }
reg = ggml_backend_reg { reg = ggml_backend_reg {
/* .interface = */ ggml_backend_cuda_reg_interface, /* .api_version = */ GGML_BACKEND_API_VERSION,
/* .iface = */ ggml_backend_cuda_reg_interface,
/* .context = */ ctx /* .context = */ ctx
}; };
} }
@ -3209,3 +3268,5 @@ ggml_backend_t ggml_backend_cuda_init(int device) {
return cuda_backend; return cuda_backend;
} }
GGML_BACKEND_DL_IMPL(ggml_backend_cuda_reg)

@ -64,12 +64,10 @@ else()
list(APPEND GGML_SOURCES_ROCM ${SRCS}) list(APPEND GGML_SOURCES_ROCM ${SRCS})
endif() endif()
add_library(ggml-hip ggml_add_backend_library(ggml-hip
${GGML_HEADERS_ROCM} ${GGML_HEADERS_ROCM}
${GGML_SOURCES_ROCM}) ${GGML_SOURCES_ROCM}
)
target_link_libraries(ggml-hip PRIVATE ggml-base)
target_include_directories(ggml-hip PRIVATE . ..)
# TODO: do not use CUDA definitions for HIP # TODO: do not use CUDA definitions for HIP
target_compile_definitions(ggml PUBLIC GGML_USE_CUDA) target_compile_definitions(ggml PUBLIC GGML_USE_CUDA)

@ -6,13 +6,13 @@ if (NOT glslc_executable)
message(FATAL_ERROR "glslc not found") message(FATAL_ERROR "glslc not found")
endif() endif()
add_library(ggml-kompute ggml_add_backend_library(ggml-kompute
ggml-kompute.cpp ggml-kompute.cpp
../../include/ggml-kompute.h ../../include/ggml-kompute.h
) )
target_link_libraries(ggml-kompute PRIVATE ggml-base kompute) target_link_libraries(ggml-kompute PRIVATE ggml-base kompute)
target_include_directories(ggml-kompute PRIVATE . .. ${CMAKE_CURRENT_BINARY_DIR}) target_include_directories(ggml-kompute PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
add_compile_definitions(VULKAN_HPP_DISPATCH_LOADER_DYNAMIC=1) add_compile_definitions(VULKAN_HPP_DISPATCH_LOADER_DYNAMIC=1)

@ -2176,9 +2176,12 @@ static const struct ggml_backend_reg_i ggml_backend_kompute_reg_i = {
ggml_backend_reg_t ggml_backend_kompute_reg() { ggml_backend_reg_t ggml_backend_kompute_reg() {
static ggml_backend_reg reg = { static ggml_backend_reg reg = {
/* .api_version = */ GGML_BACKEND_API_VERSION,
/* .iface = */ ggml_backend_kompute_reg_i, /* .iface = */ ggml_backend_kompute_reg_i,
/* .context = */ nullptr, /* .context = */ nullptr,
}; };
return &reg; return &reg;
} }
GGML_BACKEND_DL_IMPL(ggml_backend_kompute_reg)

@ -4,19 +4,16 @@ find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
message(STATUS "Metal framework found") message(STATUS "Metal framework found")
add_library(ggml-metal ggml_add_backend_library(ggml-metal
ggml-metal.m ggml-metal.m
) )
target_link_libraries(ggml-metal PRIVATE target_link_libraries(ggml-metal PRIVATE
ggml-base
${FOUNDATION_LIBRARY} ${FOUNDATION_LIBRARY}
${METAL_FRAMEWORK} ${METAL_FRAMEWORK}
${METALKIT_FRAMEWORK} ${METALKIT_FRAMEWORK}
) )
target_include_directories(ggml-metal PRIVATE . ..)
if (GGML_METAL_NDEBUG) if (GGML_METAL_NDEBUG)
add_compile_definitions(GGML_METAL_NDEBUG) add_compile_definitions(GGML_METAL_NDEBUG)
endif() endif()

@ -1927,7 +1927,7 @@ static void ggml_metal_encode_node(
// find the break-even point where the matrix-matrix kernel becomes more efficient compared // find the break-even point where the matrix-matrix kernel becomes more efficient compared
// to the matrix-vector kernel // to the matrix-vector kernel
int ne11_mm_min = 1; int ne11_mm_min = 4;
#if 0 #if 0
// the numbers below are measured on M2 Ultra for 7B and 13B models // the numbers below are measured on M2 Ultra for 7B and 13B models
@ -4372,17 +4372,43 @@ static ggml_backend_dev_t ggml_backend_metal_reg_device_get(ggml_backend_reg_t r
GGML_UNUSED(index); GGML_UNUSED(index);
} }
static struct ggml_backend_feature g_ggml_backend_metal_features[] = {
#if defined(GGML_METAL_EMBED_LIBRARY)
{ "EMBED_LIBRARY", "1" },
#endif
#if defined(GGML_METAL_USE_BF16)
{ "BF16", "1" },
#endif
{ nil, nil },
};
static struct ggml_backend_feature * ggml_backend_metal_get_features(ggml_backend_reg_t reg) {
return g_ggml_backend_metal_features;
GGML_UNUSED(reg);
}
static void * ggml_backend_metal_get_proc_address(ggml_backend_reg_t reg, const char * name) {
if (strcmp(name, "ggml_backend_get_features") == 0) {
return (void *)ggml_backend_metal_get_features;
}
return NULL;
GGML_UNUSED(reg);
}
static struct ggml_backend_reg_i ggml_backend_metal_reg_i = { static struct ggml_backend_reg_i ggml_backend_metal_reg_i = {
/* .get_name = */ ggml_backend_metal_reg_get_name, /* .get_name = */ ggml_backend_metal_reg_get_name,
/* .device_count = */ ggml_backend_metal_reg_device_count, /* .device_count = */ ggml_backend_metal_reg_device_count,
/* .device_get = */ ggml_backend_metal_reg_device_get, /* .device_get = */ ggml_backend_metal_reg_device_get,
/* .get_proc_address = */ NULL, /* .get_proc_address = */ ggml_backend_metal_get_proc_address,
}; };
ggml_backend_reg_t ggml_backend_metal_reg(void) { ggml_backend_reg_t ggml_backend_metal_reg(void) {
// TODO: make this thread-safe somehow? // TODO: make this thread-safe somehow?
{ {
g_ggml_backend_metal_reg = (struct ggml_backend_reg) { g_ggml_backend_metal_reg = (struct ggml_backend_reg) {
/* .api_version = */ GGML_BACKEND_API_VERSION,
/* .iface = */ ggml_backend_metal_reg_i, /* .iface = */ ggml_backend_metal_reg_i,
/* .context = */ NULL, /* .context = */ NULL,
}; };
@ -4396,3 +4422,5 @@ ggml_backend_reg_t ggml_backend_metal_reg(void) {
return &g_ggml_backend_metal_reg; return &g_ggml_backend_metal_reg;
} }
GGML_BACKEND_DL_IMPL(ggml_backend_metal_reg)

@ -5447,12 +5447,12 @@ kernel void kernel_mul_mm(
const int im = tgpig.z; const int im = tgpig.z;
// if this block is of 64x32 shape or smaller // if this block is of 64x32 shape or smaller
short n_rows = (args.ne0 - r0*BLOCK_SIZE_M < BLOCK_SIZE_M) ? (args.ne0 - r0*BLOCK_SIZE_M) : BLOCK_SIZE_M; const short n_rows = (args.ne0 - r0*BLOCK_SIZE_M < BLOCK_SIZE_M) ? (args.ne0 - r0*BLOCK_SIZE_M) : BLOCK_SIZE_M;
short n_cols = (args.ne1 - r1*BLOCK_SIZE_N < BLOCK_SIZE_N) ? (args.ne1 - r1*BLOCK_SIZE_N) : BLOCK_SIZE_N; const short n_cols = (args.ne1 - r1*BLOCK_SIZE_N < BLOCK_SIZE_N) ? (args.ne1 - r1*BLOCK_SIZE_N) : BLOCK_SIZE_N;
// a thread shouldn't load data outside of the matrix // a thread shouldn't load data outside of the matrix
short thread_row = ((short)tiitg/THREAD_PER_ROW) < n_rows ? ((short)tiitg/THREAD_PER_ROW) : n_rows - 1; const short thread_row = ((short)tiitg/THREAD_PER_ROW) < n_rows ? ((short)tiitg/THREAD_PER_ROW) : n_rows - 1;
short thread_col = ((short)tiitg/THREAD_PER_COL) < n_cols ? ((short)tiitg/THREAD_PER_COL) : n_cols - 1; const short thread_col = ((short)tiitg/THREAD_PER_COL) < n_cols ? ((short)tiitg/THREAD_PER_COL) : n_cols - 1;
simdgroup_T8x8 ma[4]; simdgroup_T8x8 ma[4];
simdgroup_float8x8 mb[2]; simdgroup_float8x8 mb[2];
@ -5467,20 +5467,23 @@ kernel void kernel_mul_mm(
const int i12 = im%args.ne12; const int i12 = im%args.ne12;
const int i13 = im/args.ne12; const int i13 = im/args.ne12;
uint64_t offset0 = (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03; const uint64_t offset0 = (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
short offset1 = il/nl; const short offset1 = il/nl;
device const block_q * x = (device const block_q *)(src0
+ args.nb01*(r0*BLOCK_SIZE_M + thread_row) + offset0) + offset1;
device const block_q * x = (device const block_q *)(src0 + (r0*BLOCK_SIZE_M + thread_row)*args.nb01 + offset0) + offset1;
device const float * y = (device const float *)(src1 device const float * y = (device const float *)(src1
+ args.nb13*i13 + args.nb13*i13
+ args.nb12*i12 + args.nb12*i12
+ args.nb11*(r1 * BLOCK_SIZE_N + thread_col) + args.nb11*(r1*BLOCK_SIZE_N + thread_col)
+ args.nb10*(BLOCK_SIZE_K / THREAD_PER_COL * (tiitg % THREAD_PER_COL))); + args.nb10*(BLOCK_SIZE_K / THREAD_PER_COL * (tiitg % THREAD_PER_COL)));
for (int loop_k = 0; loop_k < args.ne00; loop_k += BLOCK_SIZE_K) { for (int loop_k = 0; loop_k < args.ne00; loop_k += BLOCK_SIZE_K) {
// load data and store to threadgroup memory // load data and store to threadgroup memory
T4x4 temp_a; T4x4 temp_a;
dequantize_func(x, il, temp_a); dequantize_func(x, il, temp_a);
threadgroup_barrier(mem_flags::mem_threadgroup); threadgroup_barrier(mem_flags::mem_threadgroup);
#pragma unroll(16) #pragma unroll(16)
@ -5490,44 +5493,46 @@ kernel void kernel_mul_mm(
+ (tiitg/THREAD_PER_ROW)%8 + (i&7)*8) = temp_a[i/4][i%4]; + (tiitg/THREAD_PER_ROW)%8 + (i&7)*8) = temp_a[i/4][i%4];
} }
*(threadgroup float2x4 *)(sb + (tiitg % THREAD_PER_COL)*8*32 + 8*(tiitg/THREAD_PER_COL)) = *((device float2x4 *) y); *(threadgroup float2x4 *)(sb + 32*8*(tiitg%THREAD_PER_COL) + 8*(tiitg/THREAD_PER_COL)) = *((device float2x4 *) y);
il = (il + 2 < nl) ? il + 2 : il % 2; il = (il + 2 < nl) ? il + 2 : il % 2;
x = (il < 2) ? x + (2+nl-1)/nl : x; x = (il < 2) ? x + (2 + nl - 1)/nl : x;
y += BLOCK_SIZE_K; y += BLOCK_SIZE_K;
threadgroup_barrier(mem_flags::mem_threadgroup); threadgroup_barrier(mem_flags::mem_threadgroup);
// load matrices from threadgroup memory and conduct outer products // load matrices from threadgroup memory and conduct outer products
threadgroup T * lsma = (sa + THREAD_MAT_M*SG_MAT_SIZE*(sgitg%2)); threadgroup const T * lsma = (sa + THREAD_MAT_M*SG_MAT_SIZE*(sgitg%2));
threadgroup float * lsmb = (sb + THREAD_MAT_N*SG_MAT_SIZE*(sgitg/2)); threadgroup const float * lsmb = (sb + THREAD_MAT_N*SG_MAT_SIZE*(sgitg/2));
#pragma unroll(4) #pragma unroll(4)
for (short ik = 0; ik < BLOCK_SIZE_K / 8; ik++) { for (short ik = 0; ik < BLOCK_SIZE_K/8; ik++) {
#pragma unroll(4) #pragma unroll(4)
for (short i = 0; i < 4; i++) { for (short i = 0; i < 4; i++) {
simdgroup_load(ma[i], lsma + SG_MAT_SIZE * i); simdgroup_load(ma[i], lsma + SG_MAT_SIZE * i);
} }
simdgroup_barrier(mem_flags::mem_none); simdgroup_barrier(mem_flags::mem_none);
#pragma unroll(2) #pragma unroll(2)
for (short i = 0; i < 2; i++) { for (short i = 0; i < 2; i++) {
simdgroup_load(mb[i], lsmb + SG_MAT_SIZE * i); simdgroup_load(mb[i], lsmb + SG_MAT_SIZE * i);
} }
lsma += BLOCK_SIZE_M/SG_MAT_ROW * SG_MAT_SIZE;
lsmb += BLOCK_SIZE_N/SG_MAT_ROW * SG_MAT_SIZE;
#pragma unroll(8) #pragma unroll(8)
for (short i = 0; i < 8; i++){ for (short i = 0; i < 8; i++){
simdgroup_multiply_accumulate(mc[i], mb[i/4], ma[i%4], mc[i]); simdgroup_multiply_accumulate(mc[i], mb[i/4], ma[i%4], mc[i]);
} }
lsma += (BLOCK_SIZE_M/SG_MAT_ROW)*SG_MAT_SIZE;
lsmb += (BLOCK_SIZE_N/SG_MAT_ROW)*SG_MAT_SIZE;
} }
} }
if ((r0 + 1) * BLOCK_SIZE_M <= args.ne0 && (r1 + 1) * BLOCK_SIZE_N <= args.ne1) { if ((r0 + 1) * BLOCK_SIZE_M <= args.ne0 && (r1 + 1) * BLOCK_SIZE_N <= args.ne1) {
device float * C = (device float *) dst + device float * C = (device float *) dst +
(BLOCK_SIZE_M * r0 + 32 * (sgitg & 1)) + \ (BLOCK_SIZE_M * r0 + 32*(sgitg & 1)) + \
(BLOCK_SIZE_N * r1 + 16 * (sgitg >> 1)) * args.ne0 + im*args.ne1*args.ne0; (BLOCK_SIZE_N * r1 + 16*(sgitg >> 1)) * args.ne0 + im*args.ne1*args.ne0;
for (short i = 0; i < 8; i++) { for (short i = 0; i < 8; i++) {
simdgroup_store(mc[i], C + 8 * (i%4) + 8 * args.ne0 * (i/4), args.ne0); simdgroup_store(mc[i], C + 8 * (i%4) + 8 * args.ne0 * (i/4), args.ne0);
@ -5536,7 +5541,7 @@ kernel void kernel_mul_mm(
// block is smaller than 64x32, we should avoid writing data outside of the matrix // block is smaller than 64x32, we should avoid writing data outside of the matrix
threadgroup_barrier(mem_flags::mem_threadgroup); threadgroup_barrier(mem_flags::mem_threadgroup);
threadgroup float * temp_str = ((threadgroup float *) shmem) \ threadgroup float * temp_str = ((threadgroup float *) shmem) \
+ 32 * (sgitg&1) + (16 * (sgitg>>1))*BLOCK_SIZE_M; + 32*(sgitg&1) + (16*(sgitg >> 1))*BLOCK_SIZE_M;
for (short i = 0; i < 8; i++) { for (short i = 0; i < 8; i++) {
simdgroup_store(mc[i], temp_str + 8*(i%4) + 8*BLOCK_SIZE_M*(i/4), BLOCK_SIZE_M); simdgroup_store(mc[i], temp_str + 8*(i%4) + 8*BLOCK_SIZE_M*(i/4), BLOCK_SIZE_M);
} }

@ -47,12 +47,10 @@ if (MUSAToolkit_FOUND)
set_property(SOURCE ${SOURCE} PROPERTY COMPILE_FLAGS "-x musa -mtgpu --cuda-gpu-arch=mp_21 --cuda-gpu-arch=mp_22") set_property(SOURCE ${SOURCE} PROPERTY COMPILE_FLAGS "-x musa -mtgpu --cuda-gpu-arch=mp_21 --cuda-gpu-arch=mp_22")
endforeach() endforeach()
add_library(ggml-musa ggml_add_backend_library(ggml-musa
${GGML_HEADERS_MUSA} ${GGML_HEADERS_MUSA}
${GGML_SOURCES_MUSA}) ${GGML_SOURCES_MUSA}
)
target_link_libraries(ggml-musa PRIVATE ggml-base)
target_include_directories(ggml-musa PRIVATE . ..)
# TODO: do not use CUDA definitions for MUSA # TODO: do not use CUDA definitions for MUSA
target_compile_definitions(ggml PUBLIC GGML_USE_CUDA) target_compile_definitions(ggml PUBLIC GGML_USE_CUDA)

@ -1,10 +1,8 @@
message(STATUS "Using RPC backend") message(STATUS "Using RPC backend")
add_library(ggml-rpc ggml_add_backend_library(ggml-rpc
ggml-rpc.cpp) ggml-rpc.cpp
)
target_link_libraries(ggml-rpc PRIVATE ggml-base)
target_include_directories(ggml-rpc PRIVATE . ..)
if (WIN32) if (WIN32)
target_link_libraries(ggml-rpc PRIVATE ws2_32) target_link_libraries(ggml-rpc PRIVATE ws2_32)

@ -1369,6 +1369,7 @@ static const struct ggml_backend_reg_i ggml_backend_rpc_reg_i = {
ggml_backend_reg_t ggml_backend_rpc_reg(void) { ggml_backend_reg_t ggml_backend_rpc_reg(void) {
static struct ggml_backend_reg ggml_backend_rpc_reg = { static struct ggml_backend_reg ggml_backend_rpc_reg = {
/* .api_version = */ GGML_BACKEND_API_VERSION,
/* .iface = */ ggml_backend_rpc_reg_i, /* .iface = */ ggml_backend_rpc_reg_i,
/* .context = */ NULL, /* .context = */ NULL,
}; };
@ -1401,3 +1402,5 @@ ggml_backend_dev_t ggml_backend_rpc_add_device(const char * endpoint) {
return dev; return dev;
} }
GGML_BACKEND_DL_IMPL(ggml_backend_rpc_reg)

@ -16,12 +16,10 @@ endif()
message(STATUS "SYCL found") message(STATUS "SYCL found")
#todo: AOT #todo: AOT
add_library(ggml-sycl ggml_add_backend_library(ggml-sycl
ggml-sycl.cpp ggml-sycl.cpp
../../include/ggml-sycl.h) ../../include/ggml-sycl.h
)
target_link_libraries(ggml-sycl PRIVATE ggml-base)
target_include_directories(ggml-sycl PRIVATE . ..)
if (GGML_SYCL_F16) if (GGML_SYCL_F16)
if (GGML_SYCL_TARGET STREQUAL "AMD") if (GGML_SYCL_TARGET STREQUAL "AMD")

View file

@ -4637,7 +4637,7 @@ ggml_backend_reg_t ggml_backend_sycl_reg() {
dev_ctx->description = prop.get_name(); dev_ctx->description = prop.get_name();
ggml_backend_dev_t dev = new ggml_backend_device { ggml_backend_dev_t dev = new ggml_backend_device {
/* .interface = */ ggml_backend_sycl_device_interface, /* .iface = */ ggml_backend_sycl_device_interface,
/* .reg = */ &reg, /* .reg = */ &reg,
/* .context = */ dev_ctx /* .context = */ dev_ctx
}; };
@ -4645,7 +4645,8 @@ ggml_backend_reg_t ggml_backend_sycl_reg() {
} }
reg = ggml_backend_reg { reg = ggml_backend_reg {
/* .interface = */ ggml_backend_sycl_reg_interface, /* .api_version = */ GGML_BACKEND_API_VERSION,
/* .iface = */ ggml_backend_sycl_reg_interface,
/* .context = */ ctx /* .context = */ ctx
}; };
} }
@ -4678,3 +4679,4 @@ ggml_backend_t ggml_backend_sycl_init(int device) {
return sycl_backend; return sycl_backend;
} }
GGML_BACKEND_DL_IMPL(ggml_backend_sycl_reg)

@ -3,13 +3,13 @@ find_package(Vulkan COMPONENTS glslc REQUIRED)
if (Vulkan_FOUND) if (Vulkan_FOUND)
message(STATUS "Vulkan found") message(STATUS "Vulkan found")
add_library(ggml-vulkan ggml_add_backend_library(ggml-vulkan
ggml-vulkan.cpp ggml-vulkan.cpp
../../include/ggml-vulkan.h ../../include/ggml-vulkan.h
) )
target_link_libraries(ggml-vulkan PRIVATE ggml-base Vulkan::Vulkan) target_link_libraries(ggml-vulkan PRIVATE Vulkan::Vulkan)
target_include_directories(ggml-vulkan PRIVATE . .. ${CMAKE_CURRENT_BINARY_DIR}) target_include_directories(ggml-vulkan PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
# Workaround to the "can't dereference invalidated vector iterator" bug in clang-cl debug build # Workaround to the "can't dereference invalidated vector iterator" bug in clang-cl debug build
# Posssibly relevant: https://stackoverflow.com/questions/74748276/visual-studio-no-displays-the-correct-length-of-stdvector # Posssibly relevant: https://stackoverflow.com/questions/74748276/visual-studio-no-displays-the-correct-length-of-stdvector

@ -6738,6 +6738,7 @@ static const struct ggml_backend_reg_i ggml_backend_vk_reg_i = {
ggml_backend_reg_t ggml_backend_vk_reg() { ggml_backend_reg_t ggml_backend_vk_reg() {
static ggml_backend_reg reg = { static ggml_backend_reg reg = {
/* .api_version = */ GGML_BACKEND_API_VERSION,
/* .iface = */ ggml_backend_vk_reg_i, /* .iface = */ ggml_backend_vk_reg_i,
/* .context = */ nullptr, /* .context = */ nullptr,
}; };
@ -7365,3 +7366,5 @@ static void ggml_vk_check_results_1(ggml_tensor * tensor) {
VK_LOG_DEBUG("END ggml_vk_check_results_1(" << tensor->name << ")"); VK_LOG_DEBUG("END ggml_vk_check_results_1(" << tensor->name << ")");
} }
#endif #endif
GGML_BACKEND_DL_IMPL(ggml_backend_vk_reg)

@ -474,9 +474,15 @@ void write_output_files() {
int main(int argc, char** argv) { int main(int argc, char** argv) {
std::map<std::string, std::string> args; std::map<std::string, std::string> args;
for (int i = 1; i < argc; i += 2) { for (int i = 1; i < argc; ++i) {
if (i + 1 < argc) { std::string arg = argv[i];
args[argv[i]] = argv[i + 1]; if (arg.rfind("--", 0) == 0) {
if (i + 1 < argc && argv[i + 1][0] != '-') {
args[arg] = argv[i + 1];
++i;
} else {
args[arg] = "";
}
} }
} }

@ -7571,3 +7571,26 @@ void ggml_log_set(ggml_log_callback log_callback, void * user_data) {
g_logger_state.log_callback = log_callback ? log_callback : ggml_log_callback_default; g_logger_state.log_callback = log_callback ? log_callback : ggml_log_callback_default;
g_logger_state.log_callback_user_data = user_data; g_logger_state.log_callback_user_data = user_data;
} }
void ggml_threadpool_params_init(struct ggml_threadpool_params * p, int n_threads) {
p->n_threads = n_threads;
p->prio = 0; // default priority (usually means normal or inherited)
p->poll = 50; // hybrid-polling enabled
p->strict_cpu = false; // no strict placement (all threads share same cpumask)
p->paused = false; // threads are ready to go
memset(p->cpumask, 0, GGML_MAX_N_THREADS); // all-zero means use the default affinity (usually inherited)
}
struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads) {
struct ggml_threadpool_params p;
ggml_threadpool_params_init(&p, n_threads);
return p;
}
bool ggml_threadpool_params_match(const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1) {
if (p0->n_threads != p1->n_threads ) return false;
if (p0->prio != p1->prio ) return false;
if (p0->poll != p1->poll ) return false;
if (p0->strict_cpu != p1->strict_cpu ) return false;
return memcmp(p0->cpumask, p1->cpumask, GGML_MAX_N_THREADS) == 0;
}

@ -243,7 +243,7 @@ class MODEL_ARCH(IntEnum):
COMMAND_R = auto() COMMAND_R = auto()
DBRX = auto() DBRX = auto()
OLMO = auto() OLMO = auto()
OLMO_1124 = auto() OLMO2 = auto()
OLMOE = auto() OLMOE = auto()
OPENELM = auto() OPENELM = auto()
ARCTIC = auto() ARCTIC = auto()
@ -405,7 +405,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
MODEL_ARCH.COMMAND_R: "command-r", MODEL_ARCH.COMMAND_R: "command-r",
MODEL_ARCH.DBRX: "dbrx", MODEL_ARCH.DBRX: "dbrx",
MODEL_ARCH.OLMO: "olmo", MODEL_ARCH.OLMO: "olmo",
MODEL_ARCH.OLMO_1124: "olmo_1124", MODEL_ARCH.OLMO2: "olmo2",
MODEL_ARCH.OLMOE: "olmoe", MODEL_ARCH.OLMOE: "olmoe",
MODEL_ARCH.OPENELM: "openelm", MODEL_ARCH.OPENELM: "openelm",
MODEL_ARCH.ARCTIC: "arctic", MODEL_ARCH.ARCTIC: "arctic",
@ -1071,7 +1071,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP, MODEL_TENSOR.FFN_UP,
], ],
MODEL_ARCH.OLMO_1124: [ MODEL_ARCH.OLMO2: [
MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM, MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT, MODEL_TENSOR.OUTPUT,

@ -13,7 +13,7 @@ class TensorNameMap:
"transformer.wte", # gpt2 gpt-j mpt refact qwen dbrx jais exaone "transformer.wte", # gpt2 gpt-j mpt refact qwen dbrx jais exaone
"transformer.word_embeddings", # falcon "transformer.word_embeddings", # falcon
"word_embeddings", # bloom "word_embeddings", # bloom
"model.embed_tokens", # llama-hf nemotron olmoe olmo_1124 "model.embed_tokens", # llama-hf nemotron olmoe olmo2
"tok_embeddings", # llama-pth "tok_embeddings", # llama-pth
"embeddings.word_embeddings", # bert nomic-bert "embeddings.word_embeddings", # bert nomic-bert
"language_model.embedding.word_embeddings", # persimmon "language_model.embedding.word_embeddings", # persimmon
@ -54,7 +54,7 @@ class TensorNameMap:
# Output # Output
MODEL_TENSOR.OUTPUT: ( MODEL_TENSOR.OUTPUT: (
"embed_out", # gptneox "embed_out", # gptneox
"lm_head", # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais nemotron exaone olmoe olmo_1124 "lm_head", # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais nemotron exaone olmoe olmo2
"output", # llama-pth bloom internlm2 "output", # llama-pth bloom internlm2
"word_embeddings_for_head", # persimmon "word_embeddings_for_head", # persimmon
"lm_head.linear", # phi2 "lm_head.linear", # phi2
@ -66,7 +66,7 @@ class TensorNameMap:
MODEL_TENSOR.OUTPUT_NORM: ( MODEL_TENSOR.OUTPUT_NORM: (
"gpt_neox.final_layer_norm", # gptneox "gpt_neox.final_layer_norm", # gptneox
"transformer.ln_f", # gpt2 gpt-j falcon jais exaone "transformer.ln_f", # gpt2 gpt-j falcon jais exaone
"model.norm", # llama-hf baichuan internlm2 olmoe olmo_1124 "model.norm", # llama-hf baichuan internlm2 olmoe olmo2
"norm", # llama-pth "norm", # llama-pth
"transformer.norm_f", # mpt dbrx "transformer.norm_f", # mpt dbrx
"ln_f", # refact bloom qwen gpt2 "ln_f", # refact bloom qwen gpt2
@ -145,7 +145,7 @@ class TensorNameMap:
# Attention query # Attention query
MODEL_TENSOR.ATTN_Q: ( MODEL_TENSOR.ATTN_Q: (
"model.layers.{bid}.self_attn.q_proj", # llama-hf nemotron olmoe olmo_1124 "model.layers.{bid}.self_attn.q_proj", # llama-hf nemotron olmoe olmo2
"layers.{bid}.attention.wq", # llama-pth "layers.{bid}.attention.wq", # llama-pth
"encoder.layer.{bid}.attention.self.query", # bert "encoder.layer.{bid}.attention.self.query", # bert
"transformer.h.{bid}.attn.q_proj", # gpt-j "transformer.h.{bid}.attn.q_proj", # gpt-j
@ -157,7 +157,7 @@ class TensorNameMap:
# Attention key # Attention key
MODEL_TENSOR.ATTN_K: ( MODEL_TENSOR.ATTN_K: (
"model.layers.{bid}.self_attn.k_proj", # llama-hf nemotron olmoe olmo_1124 "model.layers.{bid}.self_attn.k_proj", # llama-hf nemotron olmoe olmo2
"layers.{bid}.attention.wk", # llama-pth "layers.{bid}.attention.wk", # llama-pth
"encoder.layer.{bid}.attention.self.key", # bert "encoder.layer.{bid}.attention.self.key", # bert
"transformer.h.{bid}.attn.k_proj", # gpt-j "transformer.h.{bid}.attn.k_proj", # gpt-j
@ -170,7 +170,7 @@ class TensorNameMap:
# Attention value # Attention value
MODEL_TENSOR.ATTN_V: ( MODEL_TENSOR.ATTN_V: (
"model.layers.{bid}.self_attn.v_proj", # llama-hf nemotron olmoe olmo_1124 "model.layers.{bid}.self_attn.v_proj", # llama-hf nemotron olmoe olmo2
"layers.{bid}.attention.wv", # llama-pth "layers.{bid}.attention.wv", # llama-pth
"encoder.layer.{bid}.attention.self.value", # bert "encoder.layer.{bid}.attention.self.value", # bert
"transformer.h.{bid}.attn.v_proj", # gpt-j "transformer.h.{bid}.attn.v_proj", # gpt-j
@ -188,7 +188,7 @@ class TensorNameMap:
"transformer.blocks.{bid}.attn.out_proj", # mpt "transformer.blocks.{bid}.attn.out_proj", # mpt
"transformer.h.{bid}.self_attention.dense", # falcon "transformer.h.{bid}.self_attention.dense", # falcon
"h.{bid}.self_attention.dense", # bloom "h.{bid}.self_attention.dense", # bloom
"model.layers.{bid}.self_attn.o_proj", # llama-hf nemotron olmoe olmo_1124 "model.layers.{bid}.self_attn.o_proj", # llama-hf nemotron olmoe olmo2
"layers.{bid}.attention.wo", # llama-pth "layers.{bid}.attention.wo", # llama-pth
"encoder.layer.{bid}.attention.output.dense", # bert "encoder.layer.{bid}.attention.output.dense", # bert
"transformer.h.{bid}.attn.out_proj", # gpt-j "transformer.h.{bid}.attn.out_proj", # gpt-j
@ -215,7 +215,7 @@ class TensorNameMap:
), ),
MODEL_TENSOR.ATTN_POST_NORM: ( MODEL_TENSOR.ATTN_POST_NORM: (
"model.layers.{bid}.post_attention_layernorm", # gemma2 olmo_1124 "model.layers.{bid}.post_attention_layernorm", # gemma2 olmo2
), ),
# Rotary embeddings # Rotary embeddings
@ -250,7 +250,7 @@ class TensorNameMap:
# Post feed-forward norm # Post feed-forward norm
MODEL_TENSOR.FFN_POST_NORM: ( MODEL_TENSOR.FFN_POST_NORM: (
"model.layers.{bid}.post_feedforward_layernorm", # gemma2 olmo_1124 "model.layers.{bid}.post_feedforward_layernorm", # gemma2 olmo2
), ),
MODEL_TENSOR.FFN_GATE_INP: ( MODEL_TENSOR.FFN_GATE_INP: (
@ -273,7 +273,7 @@ class TensorNameMap:
"transformer.blocks.{bid}.ffn.up_proj", # mpt "transformer.blocks.{bid}.ffn.up_proj", # mpt
"transformer.h.{bid}.mlp.dense_h_to_4h", # falcon "transformer.h.{bid}.mlp.dense_h_to_4h", # falcon
"h.{bid}.mlp.dense_h_to_4h", # bloom "h.{bid}.mlp.dense_h_to_4h", # bloom
"model.layers.{bid}.mlp.up_proj", # llama-hf refact nemotron olmo_1124 "model.layers.{bid}.mlp.up_proj", # llama-hf refact nemotron olmo2
"layers.{bid}.feed_forward.w3", # llama-pth "layers.{bid}.feed_forward.w3", # llama-pth
"encoder.layer.{bid}.intermediate.dense", # bert "encoder.layer.{bid}.intermediate.dense", # bert
"transformer.h.{bid}.mlp.fc_in", # gpt-j "transformer.h.{bid}.mlp.fc_in", # gpt-j
@ -314,7 +314,7 @@ class TensorNameMap:
# Feed-forward gate # Feed-forward gate
MODEL_TENSOR.FFN_GATE: ( MODEL_TENSOR.FFN_GATE: (
"model.layers.{bid}.mlp.gate_proj", # llama-hf refact olmo_1124 "model.layers.{bid}.mlp.gate_proj", # llama-hf refact olmo2
"layers.{bid}.feed_forward.w1", # llama-pth "layers.{bid}.feed_forward.w1", # llama-pth
"transformer.h.{bid}.mlp.w2", # qwen "transformer.h.{bid}.mlp.w2", # qwen
"transformer.h.{bid}.mlp.c_fc2", # jais "transformer.h.{bid}.mlp.c_fc2", # jais
@ -346,7 +346,7 @@ class TensorNameMap:
"transformer.blocks.{bid}.ffn.down_proj", # mpt "transformer.blocks.{bid}.ffn.down_proj", # mpt
"transformer.h.{bid}.mlp.dense_4h_to_h", # falcon "transformer.h.{bid}.mlp.dense_4h_to_h", # falcon
"h.{bid}.mlp.dense_4h_to_h", # bloom "h.{bid}.mlp.dense_4h_to_h", # bloom
"model.layers.{bid}.mlp.down_proj", # llama-hf nemotron olmo_1124 "model.layers.{bid}.mlp.down_proj", # llama-hf nemotron olmo2
"layers.{bid}.feed_forward.w2", # llama-pth "layers.{bid}.feed_forward.w2", # llama-pth
"encoder.layer.{bid}.output.dense", # bert "encoder.layer.{bid}.output.dense", # bert
"transformer.h.{bid}.mlp.fc_out", # gpt-j "transformer.h.{bid}.mlp.fc_out", # gpt-j
@ -383,7 +383,7 @@ class TensorNameMap:
MODEL_TENSOR.ATTN_Q_NORM: ( MODEL_TENSOR.ATTN_Q_NORM: (
"language_model.encoder.layers.{bid}.self_attention.q_layernorm", "language_model.encoder.layers.{bid}.self_attention.q_layernorm",
"model.layers.{bid}.self_attn.q_layernorm", # persimmon "model.layers.{bid}.self_attn.q_layernorm", # persimmon
"model.layers.{bid}.self_attn.q_norm", # cohere olmoe chameleon olmo_1124 "model.layers.{bid}.self_attn.q_norm", # cohere olmoe chameleon olmo2
"transformer.blocks.{bid}.attn.q_ln", # sea-lion "transformer.blocks.{bid}.attn.q_ln", # sea-lion
"encoder.layer.{bid}.attention.self.layer_norm_q", # jina-bert-v2 "encoder.layer.{bid}.attention.self.layer_norm_q", # jina-bert-v2
"transformer.layers.{bid}.attn.q_norm", # openelm "transformer.layers.{bid}.attn.q_norm", # openelm
@ -392,7 +392,7 @@ class TensorNameMap:
MODEL_TENSOR.ATTN_K_NORM: ( MODEL_TENSOR.ATTN_K_NORM: (
"language_model.encoder.layers.{bid}.self_attention.k_layernorm", "language_model.encoder.layers.{bid}.self_attention.k_layernorm",
"model.layers.{bid}.self_attn.k_layernorm", # persimmon "model.layers.{bid}.self_attn.k_layernorm", # persimmon
"model.layers.{bid}.self_attn.k_norm", # cohere olmoe chameleon olmo_1124 "model.layers.{bid}.self_attn.k_norm", # cohere olmoe chameleon olmo2
"transformer.blocks.{bid}.attn.k_ln", # sea-lion "transformer.blocks.{bid}.attn.k_ln", # sea-lion
"encoder.layer.{bid}.attention.self.layer_norm_k", # jina-bert-v2 "encoder.layer.{bid}.attention.self.layer_norm_k", # jina-bert-v2
"transformer.layers.{bid}.attn.k_norm", # openelm "transformer.layers.{bid}.attn.k_norm", # openelm

include/llama-cpp.h (new file, 25 lines)
@ -0,0 +1,25 @@
#pragma once
#ifndef __cplusplus
#error "This header is for C++ only"
#endif
#include <memory>
#include "llama.h"
struct llama_model_deleter {
void operator()(llama_model * model) { llama_free_model(model); }
};
struct llama_context_deleter {
void operator()(llama_context * context) { llama_free(context); }
};
struct llama_sampler_deleter {
void operator()(llama_sampler * sampler) { llama_sampler_free(sampler); }
};
typedef std::unique_ptr<llama_model, llama_model_deleter> llama_model_ptr;
typedef std::unique_ptr<llama_context, llama_context_deleter> llama_context_ptr;
typedef std::unique_ptr<llama_sampler, llama_sampler_deleter> llama_sampler_ptr;

@ -272,6 +272,9 @@ extern "C" {
}; };
struct llama_model_params { struct llama_model_params {
// NULL-terminated list of devices to use for offloading (if NULL, all available devices are used)
ggml_backend_dev_t * devices;
int32_t n_gpu_layers; // number of layers to store in VRAM int32_t n_gpu_layers; // number of layers to store in VRAM
enum llama_split_mode split_mode; // how to split the model across multiple GPUs enum llama_split_mode split_mode; // how to split the model across multiple GPUs
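The new devices field takes a NULL-terminated list; a hypothetical sketch of selecting devices explicitly (file path and device choice are illustrative only):

// Hypothetical: pass an explicit, NULL-terminated device list instead of using all available devices.
#include "llama.h"
#include "ggml-backend.h"

llama_model * load_on_cpu_only(const char * path /* e.g. "model.gguf" */) {
    ggml_backend_dev_t devices[] = {
        ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU),  // any device handle works here
        nullptr,                                                 // terminator required
    };
    llama_model_params mparams = llama_model_default_params();
    mparams.devices = devices;
    return llama_load_model_from_file(path, mparams);
}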

@ -8,5 +8,7 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR})
if (EMSCRIPTEN) if (EMSCRIPTEN)
else() else()
if (NOT GGML_BACKEND_DL)
add_subdirectory(vdot) add_subdirectory(vdot)
endif()
endif() endif()

@ -179,7 +179,7 @@ enum llm_arch {
LLM_ARCH_COMMAND_R, LLM_ARCH_COMMAND_R,
LLM_ARCH_DBRX, LLM_ARCH_DBRX,
LLM_ARCH_OLMO, LLM_ARCH_OLMO,
LLM_ARCH_OLMO_1124, LLM_ARCH_OLMO2,
LLM_ARCH_OLMOE, LLM_ARCH_OLMOE,
LLM_ARCH_OPENELM, LLM_ARCH_OPENELM,
LLM_ARCH_ARCTIC, LLM_ARCH_ARCTIC,
@ -233,7 +233,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_COMMAND_R, "command-r" }, { LLM_ARCH_COMMAND_R, "command-r" },
{ LLM_ARCH_DBRX, "dbrx" }, { LLM_ARCH_DBRX, "dbrx" },
{ LLM_ARCH_OLMO, "olmo" }, { LLM_ARCH_OLMO, "olmo" },
{ LLM_ARCH_OLMO_1124, "olmo_1124" }, { LLM_ARCH_OLMO2, "olmo2" },
{ LLM_ARCH_OLMOE, "olmoe" }, { LLM_ARCH_OLMOE, "olmoe" },
{ LLM_ARCH_OPENELM, "openelm" }, { LLM_ARCH_OPENELM, "openelm" },
{ LLM_ARCH_ARCTIC, "arctic" }, { LLM_ARCH_ARCTIC, "arctic" },
@ -1210,7 +1210,7 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
}, },
}, },
{ {
LLM_ARCH_OLMO_1124, LLM_ARCH_OLMO2,
{ {
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" }, { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" }, { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
@ -4866,7 +4866,9 @@ struct llama_model_loader {
mappings.reserve(files.size()); mappings.reserve(files.size());
mmaps_used.reserve(files.size()); mmaps_used.reserve(files.size());
for (const auto & file : files) { for (const auto & file : files) {
std::unique_ptr<llama_mmap> mapping(new llama_mmap(file.get(), prefetch ? -1 : 0, ggml_is_numa())); auto * reg = ggml_backend_dev_backend_reg(ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU));
auto * is_numa_fn = (decltype(ggml_is_numa) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_is_numa");
std::unique_ptr<llama_mmap> mapping(new llama_mmap(file.get(), prefetch ? -1 : 0, is_numa_fn()));
mmaps_used.emplace_back(mapping->size, 0); mmaps_used.emplace_back(mapping->size, 0);
if (mlock_mmaps) { if (mlock_mmaps) {
std::unique_ptr<llama_mlock> mlock_mmap(new llama_mlock()); std::unique_ptr<llama_mlock> mlock_mmap(new llama_mlock());
@ -5898,7 +5900,7 @@ static void llm_load_hparams(
default: model.type = e_model::MODEL_UNKNOWN; default: model.type = e_model::MODEL_UNKNOWN;
} }
} break; } break;
case LLM_ARCH_OLMO_1124: case LLM_ARCH_OLMO2:
{ {
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@ -8591,7 +8593,7 @@ static bool llm_load_tensors(
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
} }
} break; } break;
case LLM_ARCH_OLMO_1124: case LLM_ARCH_OLMO2:
{ {
model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@ -9190,7 +9192,7 @@ static bool llm_load_tensors(
ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft); ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
if (!dev) { if (!dev) {
// FIXME: workaround for CPU backend buft having a NULL device // FIXME: workaround for CPU backend buft having a NULL device
dev = ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0); dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
} }
ggml_backend_dev_props props; ggml_backend_dev_props props;
ggml_backend_dev_get_props(dev, &props); ggml_backend_dev_get_props(dev, &props);
@ -14481,7 +14483,7 @@ struct llm_build_context {
return gf; return gf;
} }
struct ggml_cgraph * build_olmo_1124() { struct ggml_cgraph * build_olmo2() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
// mutable variable, needed during the last layer of the computation to skip unused tokens // mutable variable, needed during the last layer of the computation to skip unused tokens
@ -16797,9 +16799,9 @@ static struct ggml_cgraph * llama_build_graph(
{ {
result = llm.build_olmo(); result = llm.build_olmo();
} break; } break;
case LLM_ARCH_OLMO_1124: case LLM_ARCH_OLMO2:
{ {
result = llm.build_olmo_1124(); result = llm.build_olmo2();
} break; } break;
case LLM_ARCH_OLMOE: case LLM_ARCH_OLMOE:
{ {
@ -17443,8 +17445,9 @@ static enum ggml_status llama_graph_compute(
int n_threads, int n_threads,
ggml_threadpool * threadpool) { ggml_threadpool * threadpool) {
if (lctx.backend_cpu != nullptr) { if (lctx.backend_cpu != nullptr) {
ggml_backend_cpu_set_threadpool(lctx.backend_cpu, threadpool); auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(lctx.backend_cpu));
ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data); auto * set_threadpool_fn = (decltype(ggml_backend_cpu_set_threadpool) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_set_threadpool");
set_threadpool_fn(lctx.backend_cpu, threadpool);
} }
// set the number of threads for all the backends // set the number of threads for all the backends
@ -19361,6 +19364,7 @@ void llama_lora_adapter_free(struct llama_lora_adapter * adapter) {
// //
struct llama_model_params llama_model_default_params() { struct llama_model_params llama_model_default_params() {
struct llama_model_params result = { struct llama_model_params result = {
/*.devices =*/ nullptr,
/*.n_gpu_layers =*/ 0, /*.n_gpu_layers =*/ 0,
/*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER, /*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
/*.main_gpu =*/ 0, /*.main_gpu =*/ 0,
@ -19478,7 +19482,11 @@ void llama_backend_init(void) {
void llama_numa_init(enum ggml_numa_strategy numa) { void llama_numa_init(enum ggml_numa_strategy numa) {
if (numa != GGML_NUMA_STRATEGY_DISABLED) { if (numa != GGML_NUMA_STRATEGY_DISABLED) {
ggml_numa_init(numa); auto * dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
GGML_ASSERT(dev && "CPU backend is not loaded");
auto * reg = ggml_backend_dev_backend_reg(dev);
auto * numa_init_fn = (decltype(ggml_numa_init) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_numa_init");
numa_init_fn(numa);
} }
} }
@ -19569,8 +19577,12 @@ struct llama_model * llama_load_model_from_file(
} }
// create list of devices to use with this model // create list of devices to use with this model
// currently, we use all available devices if (params.devices) {
// TODO: rework API to give user more control over device selection for (ggml_backend_dev_t * dev = params.devices; *dev; ++dev) {
model->devices.push_back(*dev);
}
} else {
// use all available devices
for (size_t i = 0; i < ggml_backend_dev_count(); ++i) { for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
ggml_backend_dev_t dev = ggml_backend_dev_get(i); ggml_backend_dev_t dev = ggml_backend_dev_get(i);
switch (ggml_backend_dev_type(dev)) { switch (ggml_backend_dev_type(dev)) {
@ -19584,6 +19596,7 @@ struct llama_model * llama_load_model_from_file(
break; break;
} }
} }
}
// if using single GPU mode, remove all except the main GPU // if using single GPU mode, remove all except the main GPU
if (params.split_mode == LLAMA_SPLIT_MODE_NONE) { if (params.split_mode == LLAMA_SPLIT_MODE_NONE) {
@ -19752,9 +19765,6 @@ struct llama_context * llama_new_context_with_model(
__func__, n_ctx_per_seq, hparams.n_ctx_train); __func__, n_ctx_per_seq, hparams.n_ctx_train);
} }
ctx->abort_callback = params.abort_callback;
ctx->abort_callback_data = params.abort_callback_data;
ctx->logits_all = params.logits_all; ctx->logits_all = params.logits_all;
// build worst-case graph for encoder if a model contains encoder // build worst-case graph for encoder if a model contains encoder
@ -19803,7 +19813,7 @@ struct llama_context * llama_new_context_with_model(
} }
// add CPU backend // add CPU backend
ctx->backend_cpu = ggml_backend_cpu_init(); ctx->backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
if (ctx->backend_cpu == nullptr) { if (ctx->backend_cpu == nullptr) {
LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__); LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__);
llama_free(ctx); llama_free(ctx);
@ -19823,6 +19833,8 @@ struct llama_context * llama_new_context_with_model(
} }
} }
llama_set_abort_callback(ctx, params.abort_callback, params.abort_callback_data);
if (!llama_kv_cache_init(ctx->kv_self, ctx, type_k, type_v, kv_size, cparams.offload_kqv)) { if (!llama_kv_cache_init(ctx->kv_self, ctx, type_k, type_v, kv_size, cparams.offload_kqv)) {
LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__); LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
llama_free(ctx); llama_free(ctx);
@ -19868,7 +19880,8 @@ struct llama_context * llama_new_context_with_model(
std::vector<ggml_backend_t> backend_ptrs; std::vector<ggml_backend_t> backend_ptrs;
for (auto & backend : ctx->backends) { for (auto & backend : ctx->backends) {
auto * buft = ggml_backend_get_default_buffer_type(backend.get()); auto * buft = ggml_backend_get_default_buffer_type(backend.get());
if (ggml_backend_is_cpu(backend.get()) && !model->devices.empty()) { auto backend_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get()));
if (backend_type == GGML_BACKEND_DEVICE_TYPE_CPU && !model->devices.empty()) {
// use the host buffer of the first device CPU for faster transfer of the intermediate state // use the host buffer of the first device CPU for faster transfer of the intermediate state
auto * dev = model->devices[0]; auto * dev = model->devices[0];
auto * host_buft = ggml_backend_dev_host_buffer_type(dev); auto * host_buft = ggml_backend_dev_host_buffer_type(dev);
@ -19896,7 +19909,8 @@ struct llama_context * llama_new_context_with_model(
// pipeline parallelism requires support for async compute and events in all devices // pipeline parallelism requires support for async compute and events in all devices
if (pipeline_parallel) { if (pipeline_parallel) {
for (auto & backend : ctx->backends) { for (auto & backend : ctx->backends) {
if (ggml_backend_is_cpu(backend.get())) { auto dev_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get()));
if (dev_type == GGML_BACKEND_DEVICE_TYPE_CPU) {
// ignore CPU backend // ignore CPU backend
continue; continue;
} }
@ -20070,7 +20084,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
case LLM_ARCH_QWEN: case LLM_ARCH_QWEN:
case LLM_ARCH_QWEN2: case LLM_ARCH_QWEN2:
case LLM_ARCH_QWEN2MOE: case LLM_ARCH_QWEN2MOE:
case LLM_ARCH_OLMO_1124: case LLM_ARCH_OLMO2:
case LLM_ARCH_OLMOE: case LLM_ARCH_OLMOE:
case LLM_ARCH_PHI2: case LLM_ARCH_PHI2:
case LLM_ARCH_PHI3: case LLM_ARCH_PHI3:
@ -21450,6 +21464,14 @@ int32_t llama_n_threads_batch(struct llama_context * ctx) {
void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)(void * data), void * abort_callback_data) { void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)(void * data), void * abort_callback_data) {
ctx->abort_callback = abort_callback; ctx->abort_callback = abort_callback;
ctx->abort_callback_data = abort_callback_data; ctx->abort_callback_data = abort_callback_data;
for (auto & backend : ctx->backends) {
auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend.get()));
auto * set_abort_callback_fn = (ggml_backend_set_abort_callback_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_abort_callback");
if (set_abort_callback_fn) {
set_abort_callback_fn(backend.get(), ctx->abort_callback, ctx->abort_callback_data);
}
}
} }
void llama_set_embeddings(struct llama_context * ctx, bool embeddings) { void llama_set_embeddings(struct llama_context * ctx, bool embeddings) {
@@ -22191,32 +22213,23 @@ int llama_split_prefix(char * dest, size_t maxlen, const char * split_path, int
 }
 
 const char * llama_print_system_info(void) {
-    ggml_cpu_init(); // some ARM features are detected at runtime
-
     static std::string s;
 
-    s = "";
-    s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | ";
-    s += "AVX_VNNI = " + std::to_string(ggml_cpu_has_avx_vnni()) + " | ";
-    s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | ";
-    s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
-    s += "AVX512_VBMI = " + std::to_string(ggml_cpu_has_avx512_vbmi()) + " | ";
-    s += "AVX512_VNNI = " + std::to_string(ggml_cpu_has_avx512_vnni()) + " | ";
-    s += "AVX512_BF16 = " + std::to_string(ggml_cpu_has_avx512_bf16()) + " | ";
-    s += "AMX_INT8 = " + std::to_string(ggml_cpu_has_amx_int8()) + " | ";
-    s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | ";
-    s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
-    s += "SVE = " + std::to_string(ggml_cpu_has_sve()) + " | ";
-    s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | ";
-    s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | ";
-    s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
-    s += "RISCV_VECT = " + std::to_string(ggml_cpu_has_riscv_v()) + " | ";
-    s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
-    s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
-    s += "SSSE3 = " + std::to_string(ggml_cpu_has_ssse3()) + " | ";
-    s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
-    s += "MATMUL_INT8 = " + std::to_string(ggml_cpu_has_matmul_int8()) + " | ";
-    s += "LLAMAFILE = " + std::to_string(ggml_cpu_has_llamafile()) + " | ";
+    for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
+        auto * reg = ggml_backend_reg_get(i);
+        auto * get_features_fn = (ggml_backend_get_features_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_get_features");
+        if (get_features_fn) {
+            ggml_backend_feature * features = get_features_fn(reg);
+            s += ggml_backend_reg_name(reg);
+            s += " : ";
+            for (; features->name; features++) {
+                s += features->name;
+                s += " = ";
+                s += features->value;
+                s += " | ";
+            }
+        }
+    }
 
     return s.c_str();
 }
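
A short usage sketch (not part of the diff): after this rewrite the string is assembled from each registered backend's feature list rather than a fixed set of ggml_cpu_has_* flags, so printing it is enough to see which backends and features are active. The output shown in the comment is indicative only and depends on the build.

```cpp
#include <cstdio>
#include "llama.h"

int main() {
    // Typically prints one segment per registered backend, e.g.
    // "CPU : AVX2 = 1 | FMA = 1 | ... | " -- exact content depends on the build.
    printf("%s\n", llama_print_system_info());
    return 0;
}
```
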


@@ -110,23 +110,26 @@ llama_test(test-tokenizer-1-spm NAME test-tokenizer-1-llama-spm ARGS ${CMAKE_CU
 # llama_target_and_test(test-double-float.cpp) # SLOW
 llama_target_and_test(test-log.cpp)
 llama_target_and_test(test-arg-parser.cpp)
-llama_target_and_test(test-quantize-fns.cpp)
-llama_target_and_test(test-quantize-perf.cpp)
 llama_target_and_test(test-sampling.cpp)
 llama_target_and_test(test-chat-template.cpp)
 llama_target_and_test(test-grammar-parser.cpp)
 llama_target_and_test(test-grammar-integration.cpp)
 llama_target_and_test(test-llama-grammar.cpp)
-llama_target_and_test(test-barrier.cpp)
 # llama_target_and_test(test-opt.cpp) # SLOW
 llama_target_and_test(test-backend-ops.cpp)
-llama_target_and_test(test-rope.cpp)
 llama_target_and_test(test-model-load-cancel.cpp LABEL "model")
 llama_target_and_test(test-autorelease.cpp LABEL "model")
 
+if (NOT GGML_BACKEND_DL)
+    # these tests use the backends directly and cannot be built with dynamic loading
+    llama_target_and_test(test-barrier.cpp)
+    llama_target_and_test(test-quantize-fns.cpp)
+    llama_target_and_test(test-quantize-perf.cpp)
+    llama_target_and_test(test-rope.cpp)
+endif()
+
 # TODO: disabled on loongarch64 because the ggml-ci node lacks Python 3.8
 if (NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
     llama_target_and_test(test-json-schema-to-grammar.cpp WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/..)


@@ -16,7 +16,6 @@
 #include <ggml.h>
-#include <ggml-cpu.h>
 #include <ggml-alloc.h>
 #include <ggml-backend.h>
@@ -26,7 +25,6 @@
 #include <cstdint>
 #include <cstring>
 #include <cinttypes>
-#include <functional>
 #include <memory>
 #include <random>
 #include <stdio.h>
@@ -639,19 +637,20 @@ struct test_case {
         // determine number of runs
         int n_runs;
+        bool is_cpu = ggml_backend_dev_type(ggml_backend_get_device(backend)) == GGML_BACKEND_DEVICE_TYPE_CPU;
         if (op_flops(out) > 0) {
             // based on flops
             const uint64_t GFLOP = 1000 * 1000 * 1000;
             const uint64_t target_flops_cpu = 8ULL * GFLOP;
             const uint64_t target_flops_gpu = 100ULL * GFLOP;
-            uint64_t target_flops = ggml_backend_is_cpu(backend) ? target_flops_cpu : target_flops_gpu;
+            uint64_t target_flops = is_cpu ? target_flops_cpu : target_flops_gpu;
             n_runs = std::min<int>(ggml_graph_size(gf) - ggml_graph_n_nodes(gf), target_flops / op_flops(out)) + 1;
         } else {
             // based on memory size
             const size_t GB = 1ULL << 30;
             const size_t target_size_cpu = 8 * GB;
             const size_t target_size_gpu = 32 * GB;
-            size_t target_size = ggml_backend_is_cpu(backend) ? target_size_cpu : target_size_gpu;
+            size_t target_size = is_cpu ? target_size_cpu : target_size_gpu;
             n_runs = std::min<int>(ggml_graph_size(gf) - ggml_graph_n_nodes(gf), target_size / op_size(out)) + 1;
         }
@@ -3873,7 +3872,11 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
 static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op_name) {
     if (mode == MODE_TEST) {
         auto test_cases = make_test_cases_eval();
-        ggml_backend_t backend_cpu = ggml_backend_cpu_init();
+        ggml_backend_t backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, NULL);
+        if (backend_cpu == NULL) {
+            printf("  Failed to initialize CPU backend\n");
+            return false;
+        }
 
         size_t n_ok = 0;
         for (auto & test : test_cases) {
@@ -3953,7 +3956,9 @@ int main(int argc, char ** argv) {
         }
     }
 
-    // enumerate backends
+    // load and enumerate backends
+    ggml_backend_load_all();
+
     printf("Testing %zu devices\n\n", ggml_backend_dev_count());
 
     size_t n_ok = 0;
@@ -3969,16 +3974,15 @@ int main(int argc, char ** argv) {
             continue;
         }
 
-        ggml_backend_t backend = ggml_backend_dev_init(dev, NULL);
-        GGML_ASSERT(backend != NULL);
-
-        if (backend_filter == NULL && ggml_backend_is_cpu(backend) && mode != MODE_GRAD) {
+        if (backend_filter == NULL && ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU && mode != MODE_GRAD) {
             printf("  Skipping CPU backend\n");
-            ggml_backend_free(backend);
             n_ok++;
             continue;
         }
 
+        ggml_backend_t backend = ggml_backend_dev_init(dev, NULL);
+        GGML_ASSERT(backend != NULL);
+
         ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
         auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads");
         if (ggml_backend_set_n_threads_fn) {
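
To make the new flow in this test easier to follow, here is a stand-alone sketch of the same pattern: load backends dynamically, walk the device registry, and resolve optional entry points by name. It is an illustration, not part of the change, and beyond the calls visible in the diff it assumes only the basic device accessors from ggml-backend.h (ggml_backend_dev_get, ggml_backend_dev_name).

```cpp
#include <cstdio>
#include "ggml-backend.h"

int main() {
    // Load whatever backend libraries are available at runtime (CPU, CUDA, ...).
    ggml_backend_load_all();

    for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
        printf("device %zu: %s\n", i, ggml_backend_dev_name(dev));

        ggml_backend_t backend = ggml_backend_dev_init(dev, NULL);
        if (backend == NULL) {
            continue;
        }

        // Optional, backend-specific entry points are resolved by name.
        ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
        auto set_n_threads_fn = (ggml_backend_set_n_threads_t)
            ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads");
        if (set_n_threads_fn) {
            set_n_threads_fn(backend, 4); // e.g. limit the CPU backend to 4 threads
        }

        ggml_backend_free(backend);
    }
    return 0;
}
```
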


@@ -79,9 +79,9 @@ static float dot_product(const float * a1, const float * a2, size_t test_size) {
 }
 
 // Total dot product error
-static float dot_product_error(
-    const ggml_type_traits * qfns, const ggml_type_traits_cpu * qfns_cpu, size_t test_size, const float * test_data1, const float *test_data2
-) {
+static float dot_product_error(const ggml_type_traits * qfns, const ggml_type_traits_cpu * qfns_cpu, size_t test_size, const float * test_data1, const float * test_data2) {
+    GGML_UNUSED(qfns);
+
     std::vector<uint8_t> tmp_q1(2*test_size);
     std::vector<uint8_t> tmp_q2(2*test_size);