Merge branch 'master' into gg/cmake-warnings

commit c254ee763a
Georgi Gerganov, 2024-11-26 14:16:08 +02:00, committed by GitHub
GPG key ID: B5690EEEBB952194 (no known key found for this signature in database)
20 changed files with 1012 additions and 385 deletions

.github/labeler.yml

@@ -3,19 +3,18 @@ Kompute:
   - changed-files:
     - any-glob-to-any-file:
       - ggml/include/ggml-kompute.h
-      - ggml/src/ggml-kompute.cpp
+      - ggml/src/ggml-kompute/**
       - README-kompute.md
 Apple Metal:
   - changed-files:
     - any-glob-to-any-file:
       - ggml/include/ggml-metal.h
-      - ggml/src/ggml-metal.cpp
+      - ggml/src/ggml-metal/**
       - README-metal.md
 SYCL:
   - changed-files:
     - any-glob-to-any-file:
       - ggml/include/ggml-sycl.h
-      - ggml/src/ggml-sycl.cpp
       - ggml/src/ggml-sycl/**
       - docs/backend/SYCL.md
       - examples/sycl/**
@@ -27,8 +26,8 @@ Nvidia GPU:
 Vulkan:
   - changed-files:
     - any-glob-to-any-file:
-      - ggml/ggml_vk_generate_shaders.py
-      - ggml/src/ggml-vulkan*
+      - ggml/include/ggml-vulkan.h
+      - ggml/src/ggml-vulkan/**
 documentation:
   - changed-files:
     - any-glob-to-any-file:
@@ -75,11 +74,7 @@ server:
 ggml:
   - changed-files:
     - any-glob-to-any-file:
-      - ggml/include/ggml*.h
-      - ggml/src/ggml*.c
-      - ggml/src/ggml*.cpp
-      - ggml/src/ggml*.h
-      - ggml-cuda/**
+      - ggml/**
 nix:
   - changed-files:
     - any-glob-to-any-file:


@@ -871,8 +871,65 @@ jobs:
           path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip
           name: llama-bin-win-${{ matrix.build }}.zip
 
+  ubuntu-latest-cmake-cuda:
+    runs-on: ubuntu-latest
+    container: nvidia/cuda:12.6.2-devel-ubuntu24.04
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+
+      - name: Install dependencies
+        env:
+          DEBIAN_FRONTEND: noninteractive
+        run: |
+          apt update
+          apt install -y cmake build-essential ninja-build libgomp1 git
+
+      - name: Build with CMake
+        run: |
+          cmake -S . -B build -G Ninja -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=89-real -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined -DLLAMA_FATAL_WARNINGS=ON
+          cmake --build build
+
   windows-latest-cmake-cuda:
+    runs-on: windows-latest
+
+    strategy:
+      matrix:
+        cuda: ['12.6.2']
+        build: ['cuda']
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+
+      - name: Install CUDA toolkit
+        id: cuda-toolkit
+        uses: Jimver/cuda-toolkit@v0.2.19
+        with:
+          cuda: ${{ matrix.cuda }}
+          method: 'network'
+          sub-packages: '["nvcc", "cudart", "cublas", "cublas_dev", "thrust", "visual_studio_integration"]'
+
+      - name: Install Ninja
+        id: install_ninja
+        run: |
+          choco install ninja
+
+      - name: Build
+        id: cmake_build
+        shell: cmd
+        run: |
+          call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
+          cmake -S . -B build -G "Ninja Multi-Config" -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON -DGGML_RPC=ON -DCMAKE_CUDA_ARCHITECTURES=89-real
+          cmake --build build --config Release -t ggml-cuda
+          cmake --build build --config Release
+
+  windows-2019-cmake-cuda:
     runs-on: windows-2019
+    if: ${{ github.event == 'push' && github.ref == 'refs/heads/master' }}
 
     strategy:
       matrix:
@@ -1173,7 +1230,7 @@ jobs:
       - macOS-latest-make
       - macOS-latest-cmake
       - windows-latest-cmake
-      - windows-latest-cmake-cuda
+      - windows-2019-cmake-cuda
       - windows-latest-cmake-hip-release
       - macOS-latest-cmake-arm64
       - macOS-latest-cmake-x64


@@ -10,12 +10,10 @@
 name: Publish Docker image
 
 on:
-  #pull_request:
-  push:
-    branches:
-      - master
-    paths: ['.github/workflows/docker.yml', '.devops/*.Dockerfile', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal']
-  workflow_dispatch: # allows manual triggering, useful for debugging
+  workflow_dispatch: # allows manual triggering
+  schedule:
+    # Rebuild daily rather than on every push because it is expensive
+    - cron: '12 4 * * *'
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
@@ -29,7 +27,6 @@ permissions:
 jobs:
   push_to_registry:
     name: Push Docker image to Docker Hub
-    #if: github.event.pull_request.draft == false
 
     runs-on: ubuntu-latest
     env:
@@ -117,7 +114,7 @@ jobs:
         swap-storage: true
 
       - name: Build and push Docker image (tagged + versioned)
-        if: github.event_name == 'push'
+        if: ${{ github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }}
         uses: docker/build-push-action@v6
         with:
           context: .


@@ -5,8 +5,10 @@ on:
   push:
     branches:
       - master
+    paths: ['.github/workflows/nix-ci.yml', '**/flake.nix', '**/flake.lock', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal']
   pull_request:
     types: [opened, synchronize, reopened]
+    paths: ['.github/workflows/nix-ci.yml', '**/flake.nix', '**/flake.lock', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal']
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}


@@ -1,6 +1,13 @@
 name: flake8 Lint
 
-on: [push, pull_request]
+on:
+  push:
+    branches:
+      - master
+    paths: ['.github/workflows/python-lint.yml', '**/*.py']
+  pull_request:
+    types: [opened, synchronize, reopened]
+    paths: ['.github/workflows/python-lint.yml', '**/*.py']
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}


@@ -164,8 +164,11 @@ if (GGML_TARGET_DEFINES)
     list(APPEND GGML_TRANSIENT_DEFINES ${GGML_TARGET_DEFINES})
 endif()
 get_target_property(GGML_LINK_LIBRARIES ggml LINK_LIBRARIES)
-
-set_target_properties(llama PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/include/llama.h)
+# all public headers
+set(LLAMA_PUBLIC_HEADERS
+    ${CMAKE_CURRENT_SOURCE_DIR}/include/llama.h
+    ${CMAKE_CURRENT_SOURCE_DIR}/include/llama-cpp.h)
+set_target_properties(llama PROPERTIES PUBLIC_HEADER "${LLAMA_PUBLIC_HEADERS}")
 install(TARGETS llama LIBRARY PUBLIC_HEADER)
 
 configure_package_config_file(


@@ -34,6 +34,7 @@ BUILD_TARGETS = \
 	llama-server \
 	llama-simple \
 	llama-simple-chat \
+	llama-run \
 	llama-speculative \
 	llama-tokenize \
 	llama-vdot \
@@ -1167,6 +1168,11 @@ llama-infill: examples/infill/infill.cpp \
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
+llama-run: examples/run/run.cpp \
+	$(OBJ_ALL)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
 llama-simple: examples/simple/simple.cpp \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)


@@ -90,9 +90,10 @@ bool common_speculative_are_compatible(
     if (llama_add_bos_token(model_tgt) != llama_add_bos_token(model_dft) ||
         llama_add_eos_token(model_tgt) != llama_add_eos_token(model_dft) ||
         llama_token_bos(model_tgt) != llama_token_bos(model_dft) ||
-        llama_token_eos(model_tgt) != llama_token_eos(model_dft)
-    ) {
+        llama_token_eos(model_tgt) != llama_token_eos(model_dft)) {
         LOG_ERR("%s: draft model special tokens must match target model to use speculation\n", __func__);
+        LOG_ERR("%s: tgt: bos = %d (%d), eos = %d (%d)\n", __func__, llama_token_bos(model_tgt), llama_add_bos_token(model_tgt), llama_token_eos(model_tgt), llama_add_eos_token(model_tgt));
+        LOG_ERR("%s: dft: bos = %d (%d), eos = %d (%d)\n", __func__, llama_token_bos(model_dft), llama_add_bos_token(model_dft), llama_token_eos(model_dft), llama_add_eos_token(model_dft));
         return false;
     }


@@ -40,6 +40,7 @@ else()
         add_subdirectory(server)
     endif()
     add_subdirectory(save-load-state)
+    add_subdirectory(run)
     add_subdirectory(simple)
     add_subdirectory(simple-chat)
     add_subdirectory(speculative)


@@ -0,0 +1,5 @@
set(TARGET llama-run)
add_executable(${TARGET} run.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

examples/run/README.md (new file)

@@ -0,0 +1,7 @@
# llama.cpp/example/run

The purpose of this example is to demonstrate a minimal usage of llama.cpp for running models.

```bash
./llama-run Meta-Llama-3.1-8B-Instruct.gguf
...
```

examples/run/run.cpp (new file)

@@ -0,0 +1,409 @@
#if defined(_WIN32)
#include <windows.h>
#else
#include <unistd.h>
#endif

#include <climits>
#include <cstdio>
#include <cstring>
#include <iostream>
#include <sstream>
#include <string>
#include <unordered_map>
#include <vector>

#include "llama-cpp.h"

typedef std::unique_ptr<char[]> char_array_ptr;

struct Argument {
    std::string flag;
    std::string help_text;
};

struct Options {
    std::string model_path, prompt_non_interactive;
    int ngl = 99;
    int n_ctx = 2048;
};

class ArgumentParser {
  public:
    ArgumentParser(const char * program_name) : program_name(program_name) {}

    void add_argument(const std::string & flag, std::string & var, const std::string & help_text = "") {
        string_args[flag] = &var;
        arguments.push_back({flag, help_text});
    }

    void add_argument(const std::string & flag, int & var, const std::string & help_text = "") {
        int_args[flag] = &var;
        arguments.push_back({flag, help_text});
    }

    int parse(int argc, const char ** argv) {
        for (int i = 1; i < argc; ++i) {
            std::string arg = argv[i];
            if (string_args.count(arg)) {
                if (i + 1 < argc) {
                    *string_args[arg] = argv[++i];
                } else {
                    fprintf(stderr, "error: missing value for %s\n", arg.c_str());
                    print_usage();
                    return 1;
                }
            } else if (int_args.count(arg)) {
                if (i + 1 < argc) {
                    if (parse_int_arg(argv[++i], *int_args[arg]) != 0) {
                        fprintf(stderr, "error: invalid value for %s: %s\n", arg.c_str(), argv[i]);
                        print_usage();
                        return 1;
                    }
                } else {
                    fprintf(stderr, "error: missing value for %s\n", arg.c_str());
                    print_usage();
                    return 1;
                }
            } else {
                fprintf(stderr, "error: unrecognized argument %s\n", arg.c_str());
                print_usage();
                return 1;
            }
        }

        if (string_args["-m"]->empty()) {
            fprintf(stderr, "error: -m is required\n");
            print_usage();
            return 1;
        }

        return 0;
    }

  private:
    const char * program_name;
    std::unordered_map<std::string, std::string *> string_args;
    std::unordered_map<std::string, int *> int_args;
    std::vector<Argument> arguments;

    int parse_int_arg(const char * arg, int & value) {
        char * end;
        const long val = std::strtol(arg, &end, 10);
        if (*end == '\0' && val >= INT_MIN && val <= INT_MAX) {
            value = static_cast<int>(val);
            return 0;
        }
        return 1;
    }

    void print_usage() const {
        printf("\nUsage:\n");
        printf("  %s [OPTIONS]\n\n", program_name);
        printf("Options:\n");
        for (const auto & arg : arguments) {
            printf("  %-10s %s\n", arg.flag.c_str(), arg.help_text.c_str());
        }
        printf("\n");
    }
};

class LlamaData {
  public:
    llama_model_ptr model;
    llama_sampler_ptr sampler;
    llama_context_ptr context;
    std::vector<llama_chat_message> messages;

    int init(const Options & opt) {
        model = initialize_model(opt.model_path, opt.ngl);
        if (!model) {
            return 1;
        }

        context = initialize_context(model, opt.n_ctx);
        if (!context) {
            return 1;
        }

        sampler = initialize_sampler();
        return 0;
    }

  private:
    // Initializes the model and returns a unique pointer to it
    llama_model_ptr initialize_model(const std::string & model_path, const int ngl) {
        llama_model_params model_params = llama_model_default_params();
        model_params.n_gpu_layers = ngl;

        llama_model_ptr model(llama_load_model_from_file(model_path.c_str(), model_params));
        if (!model) {
            fprintf(stderr, "%s: error: unable to load model\n", __func__);
        }

        return model;
    }

    // Initializes the context with the specified parameters
    llama_context_ptr initialize_context(const llama_model_ptr & model, const int n_ctx) {
        llama_context_params ctx_params = llama_context_default_params();
        ctx_params.n_ctx = n_ctx;
        ctx_params.n_batch = n_ctx;

        llama_context_ptr context(llama_new_context_with_model(model.get(), ctx_params));
        if (!context) {
            fprintf(stderr, "%s: error: failed to create the llama_context\n", __func__);
        }

        return context;
    }

    // Initializes and configures the sampler
    llama_sampler_ptr initialize_sampler() {
        llama_sampler_ptr sampler(llama_sampler_chain_init(llama_sampler_chain_default_params()));
        llama_sampler_chain_add(sampler.get(), llama_sampler_init_min_p(0.05f, 1));
        llama_sampler_chain_add(sampler.get(), llama_sampler_init_temp(0.8f));
        llama_sampler_chain_add(sampler.get(), llama_sampler_init_dist(LLAMA_DEFAULT_SEED));

        return sampler;
    }
};

// Add a message to `messages` and store its content in `owned_content`
static void add_message(const char * role, const std::string & text, LlamaData & llama_data,
                        std::vector<char_array_ptr> & owned_content) {
    char_array_ptr content(new char[text.size() + 1]);
    std::strcpy(content.get(), text.c_str());

    llama_data.messages.push_back({role, content.get()});
    owned_content.push_back(std::move(content));
}

// Function to apply the chat template and resize `formatted` if needed
static int apply_chat_template(const LlamaData & llama_data, std::vector<char> & formatted, const bool append) {
    int result = llama_chat_apply_template(llama_data.model.get(), nullptr, llama_data.messages.data(),
                                           llama_data.messages.size(), append, formatted.data(), formatted.size());
    if (result > static_cast<int>(formatted.size())) {
        formatted.resize(result);
        result = llama_chat_apply_template(llama_data.model.get(), nullptr, llama_data.messages.data(),
                                           llama_data.messages.size(), append, formatted.data(), formatted.size());
    }

    return result;
}

// Function to tokenize the prompt
static int tokenize_prompt(const llama_model_ptr & model, const std::string & prompt,
                           std::vector<llama_token> & prompt_tokens) {
    const int n_prompt_tokens = -llama_tokenize(model.get(), prompt.c_str(), prompt.size(), NULL, 0, true, true);
    prompt_tokens.resize(n_prompt_tokens);
    if (llama_tokenize(model.get(), prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), true,
                       true) < 0) {
        GGML_ABORT("failed to tokenize the prompt\n");
    }

    return n_prompt_tokens;
}

// Check if we have enough space in the context to evaluate this batch
static int check_context_size(const llama_context_ptr & ctx, const llama_batch & batch) {
    const int n_ctx = llama_n_ctx(ctx.get());
    const int n_ctx_used = llama_get_kv_cache_used_cells(ctx.get());
    if (n_ctx_used + batch.n_tokens > n_ctx) {
        printf("\033[0m\n");
        fprintf(stderr, "context size exceeded\n");
        return 1;
    }

    return 0;
}

// convert the token to a string
static int convert_token_to_string(const llama_model_ptr & model, const llama_token token_id, std::string & piece) {
    char buf[256];
    int n = llama_token_to_piece(model.get(), token_id, buf, sizeof(buf), 0, true);
    if (n < 0) {
        GGML_ABORT("failed to convert token to piece\n");
    }

    piece = std::string(buf, n);
    return 0;
}

static void print_word_and_concatenate_to_response(const std::string & piece, std::string & response) {
    printf("%s", piece.c_str());
    fflush(stdout);
    response += piece;
}

// helper function to evaluate a prompt and generate a response
static int generate(LlamaData & llama_data, const std::string & prompt, std::string & response) {
    std::vector<llama_token> prompt_tokens;
    const int n_prompt_tokens = tokenize_prompt(llama_data.model, prompt, prompt_tokens);
    if (n_prompt_tokens < 0) {
        return 1;
    }

    // prepare a batch for the prompt
    llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size());
    llama_token new_token_id;
    while (true) {
        check_context_size(llama_data.context, batch);
        if (llama_decode(llama_data.context.get(), batch)) {
            GGML_ABORT("failed to decode\n");
        }

        // sample the next token and check whether it is an end of generation
        new_token_id = llama_sampler_sample(llama_data.sampler.get(), llama_data.context.get(), -1);
        if (llama_token_is_eog(llama_data.model.get(), new_token_id)) {
            break;
        }

        std::string piece;
        if (convert_token_to_string(llama_data.model, new_token_id, piece)) {
            return 1;
        }

        print_word_and_concatenate_to_response(piece, response);

        // prepare the next batch with the sampled token
        batch = llama_batch_get_one(&new_token_id, 1);
    }

    return 0;
}

static int parse_arguments(const int argc, const char ** argv, Options & opt) {
    ArgumentParser parser(argv[0]);
    parser.add_argument("-m", opt.model_path, "model");
    parser.add_argument("-p", opt.prompt_non_interactive, "prompt");
    parser.add_argument("-c", opt.n_ctx, "context_size");
    parser.add_argument("-ngl", opt.ngl, "n_gpu_layers");
    if (parser.parse(argc, argv)) {
        return 1;
    }

    return 0;
}

static int read_user_input(std::string & user) {
    std::getline(std::cin, user);
    return user.empty();  // Indicate an error or empty input
}

// Function to generate a response based on the prompt
static int generate_response(LlamaData & llama_data, const std::string & prompt, std::string & response) {
    // Set response color
    printf("\033[33m");
    if (generate(llama_data, prompt, response)) {
        fprintf(stderr, "failed to generate response\n");
        return 1;
    }

    // End response with color reset and newline
    printf("\n\033[0m");
    return 0;
}

// Helper function to apply the chat template and handle errors
static int apply_chat_template_with_error_handling(const LlamaData & llama_data, std::vector<char> & formatted,
                                                   const bool is_user_input, int & output_length) {
    const int new_len = apply_chat_template(llama_data, formatted, is_user_input);
    if (new_len < 0) {
        fprintf(stderr, "failed to apply the chat template\n");
        return -1;
    }

    output_length = new_len;
    return 0;
}

// Helper function to handle user input
static bool handle_user_input(std::string & user_input, const std::string & prompt_non_interactive) {
    if (!prompt_non_interactive.empty()) {
        user_input = prompt_non_interactive;
        return true;  // No need for interactive input
    }

    printf("\033[32m> \033[0m");
    return !read_user_input(user_input);  // Returns false if input ends the loop
}

// Main chat loop: read input, apply the chat template, generate a response
static int chat_loop(LlamaData & llama_data, std::string & prompt_non_interactive) {
    std::vector<char_array_ptr> owned_content;
    std::vector<char> fmtted(llama_n_ctx(llama_data.context.get()));
    int prev_len = 0;

    while (true) {
        // Get user input
        std::string user_input;
        if (!handle_user_input(user_input, prompt_non_interactive)) {
            break;
        }

        add_message("user", prompt_non_interactive.empty() ? user_input : prompt_non_interactive, llama_data,
                    owned_content);

        int new_len;
        if (apply_chat_template_with_error_handling(llama_data, fmtted, true, new_len) < 0) {
            return 1;
        }

        std::string prompt(fmtted.begin() + prev_len, fmtted.begin() + new_len);
        std::string response;
        if (generate_response(llama_data, prompt, response)) {
            return 1;
        }
    }
    return 0;
}

static void log_callback(const enum ggml_log_level level, const char * text, void *) {
    if (level == GGML_LOG_LEVEL_ERROR) {
        fprintf(stderr, "%s", text);
    }
}

static bool is_stdin_a_terminal() {
#if defined(_WIN32)
    HANDLE hStdin = GetStdHandle(STD_INPUT_HANDLE);
    DWORD mode;
    return GetConsoleMode(hStdin, &mode);
#else
    return isatty(STDIN_FILENO);
#endif
}

static std::string read_pipe_data() {
    std::ostringstream result;
    result << std::cin.rdbuf();  // Read all data from std::cin
    return result.str();
}

int main(int argc, const char ** argv) {
    Options opt;
    if (parse_arguments(argc, argv, opt)) {
        return 1;
    }

    if (!is_stdin_a_terminal()) {
        if (!opt.prompt_non_interactive.empty()) {
            opt.prompt_non_interactive += "\n\n";
        }

        opt.prompt_non_interactive += read_pipe_data();
    }

    llama_log_set(log_callback, nullptr);
    LlamaData llama_data;
    if (llama_data.init(opt)) {
        return 1;
    }

    if (chat_loop(llama_data, opt.prompt_non_interactive)) {
        return 1;
    }

    return 0;
}
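One detail in `run.cpp` worth calling out: the `owned_content` vector exists because `llama_chat_message` stores raw `const char *` pointers rather than copies, so the strings backing each message must outlive the message vector. A minimal standalone sketch of the same ownership pattern (the `chat_message` struct below is a stand-in for illustration, not the llama.cpp type):

```cpp
#include <cstdio>
#include <cstring>
#include <memory>
#include <string>
#include <vector>

// stand-in for llama_chat_message: it holds raw pointers, not copies
struct chat_message { const char * role; const char * content; };

int main() {
    std::vector<std::unique_ptr<char[]>> owned_content;  // keeps the strings alive
    std::vector<chat_message> messages;                  // holds raw pointers only

    auto add = [&](const char * role, const std::string & text) {
        std::unique_ptr<char[]> buf(new char[text.size() + 1]);
        std::strcpy(buf.get(), text.c_str());
        messages.push_back({role, buf.get()});
        owned_content.push_back(std::move(buf));  // must outlive `messages`
    };

    add("user", "Hello");
    add("assistant", "Hi there");

    for (const auto & m : messages) {
        std::printf("%s: %s\n", m.role, m.content);
    }
    return 0;
}
```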


@@ -2267,12 +2267,7 @@ struct server_context {
                     continue; // continue loop of slots
                 }
 
-                llama_token id;
-
-                {
-                    completion_token_output result;
-
-                    id = common_sampler_sample(slot.smpl, ctx, slot.i_batch - i);
+                llama_token id = common_sampler_sample(slot.smpl, ctx, slot.i_batch - i);
 
                 slot.i_batch = -1;
@@ -2285,6 +2280,7 @@ struct server_context {
                     metrics.on_prompt_eval(slot);
                 }
 
+                completion_token_output result;
                 result.tok = id;
 
                 const auto * cur_p = common_sampler_get_candidates(slot.smpl);
@@ -2306,11 +2302,14 @@ struct server_context {
                 }
             }
 
-            // check if the slot supports speculative decoding
-            if (!slot.can_speculate()) {
+            // do speculative decoding
+            for (auto & slot : slots) {
+                if (!slot.is_processing() || !slot.can_speculate()) {
                     continue;
                 }
 
+                llama_token id = slot.sampled;
+
                 struct common_speculative_params params_spec;
                 params_spec.n_draft = slot.params.speculative.n_max;
                 params_spec.n_reuse = llama_n_ctx(slot.ctx_dft) - slot.params.speculative.n_max;


@@ -117,7 +117,8 @@ int main(int argc, char ** argv) {
     llama_token id_last = inp.back();
 
     // all tokens currently in the target context
-    auto prompt_tgt = std::vector<llama_token>(inp.begin(), inp.end() - 1);
+    llama_tokens prompt_tgt(inp.begin(), inp.end() - 1);
+    prompt_tgt.reserve(llama_n_ctx(ctx_tgt));
 
     int n_past = inp.size() - 1;
@@ -181,29 +182,26 @@ int main(int argc, char ** argv) {
         GGML_ASSERT(ids.size() > 0); // there will always be at least one accepted token
 
         n_past    += ids.size() - 1;
-        n_drafted += batch_tgt.n_tokens - 1;
+        n_drafted += draft.size(); // note: we ignore the discarded small drafts
         n_accept  += ids.size() - 1;
+        n_predict += ids.size();
 
         // process the accepted tokens and update contexts
         //
         // this is the standard token post-processing that we normally do
        // in this case, we do it for a group of accepted tokens at once
         //
-        {
-            llama_token id = 0;
-            std::string token_str;
-
-            for (size_t i = 0; i < ids.size(); ++i) {
-                id = ids[i];
-
-                ++n_predict;
-
-                if (llama_token_is_eog(model_tgt, id)) {
-                    has_eos = true;
-                    break;
-                }
-
-                token_str = common_token_to_piece(ctx_tgt, id);
+        for (size_t i = 0; i < ids.size(); ++i) {
+            prompt_tgt.push_back(id_last);
+
+            id_last = ids[i];
+
+            if (llama_token_is_eog(model_tgt, id_last)) {
+                has_eos = true;
+                break;
+            }
+
+            const std::string token_str = common_token_to_piece(ctx_tgt, id_last);
 
             if (params.use_color && i + 1 < ids.size()) {
                 LOG("\u001b[%dm%s\u001b[37m", (36 - 0 % 6), token_str.c_str());
@@ -212,11 +210,7 @@ int main(int argc, char ** argv) {
             }
         }
 
-        if ((params.n_predict >= 0 && n_predict > params.n_predict) || has_eos) {
-            break;
-        }
-
-        LOG_DBG("accepted %d/%d draft tokens, the last target token is: (%d, '%s')\n", (int) ids.size() - 1, (int) draft.size(), id, token_str.c_str());
+        LOG_DBG("accepted %d/%d draft tokens, the last target token is: (%d)\n", (int) ids.size() - 1, (int) draft.size(), id_last);
 
         {
             LOG_DBG("clear kv cache from any extra tokens, n_past = %d\n", n_past);
@@ -224,11 +218,8 @@ int main(int argc, char ** argv) {
             llama_kv_cache_seq_rm(ctx_tgt, 0, n_past, -1);
         }
 
-        prompt_tgt.push_back(id_last);
-        prompt_tgt.insert(prompt_tgt.end(), ids.begin(), ids.end() - 1);
-
-        // remember the last accepted token for the next iteration
-        id_last = id;
+        if ((params.n_predict >= 0 && n_predict > params.n_predict) || has_eos) {
+            break;
+        }
     }
 }
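A note on the bookkeeping in this hunk: `ids` holds the draft tokens the target model accepted plus the one token the target sampled itself, which is why `n_accept` adds `ids.size() - 1` while `n_predict` adds `ids.size()`. A tiny standalone sketch with made-up numbers:

```cpp
#include <cstdio>

int main() {
    int n_drafted = 0, n_accept = 0, n_predict = 0;

    // hypothetical iteration: the draft model proposed 8 tokens, the target
    // accepted 5 of them and then sampled one token of its own, so ids.size() == 6
    const int draft_size = 8;
    const int ids_size   = 6;

    n_drafted += draft_size;   // as in the hunk: discarded small drafts are ignored
    n_accept  += ids_size - 1; // only the accepted *draft* tokens
    n_predict += ids_size;     // every emitted token counts as a prediction

    std::printf("drafted=%d accepted=%d predicted=%d acceptance=%.2f\n",
                n_drafted, n_accept, n_predict, (double) n_accept / n_drafted);
    return 0;
}
```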


@@ -21,6 +21,7 @@
  */
 
 #include "aclnn_ops.h"
+#include "ggml-impl.h"
 
 #include <aclnnop/aclnn_avgpool2d.h>
 #include <aclnnop/aclnn_cast.h>
@@ -32,6 +33,8 @@
 #include <aclnnop/aclnn_group_norm.h>
 #include <aclnnop/aclnn_index_fill_tensor.h>
 #include <aclnnop/aclnn_layer_norm.h>
+#include <aclnnop/aclnn_mm.h>
+#include <aclnnop/aclnn_batch_matmul.h>
 #include <aclnnop/aclnn_matmul.h>
 #include <aclnnop/aclnn_max_pool.h>
 #include <aclnnop/aclnn_permute.h>
@@ -241,10 +244,14 @@ void ggml_cann_concat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     aclTensor* acl_src1 = ggml_cann_create_tensor(src1);
     aclTensor* acl_dst = ggml_cann_create_tensor(dst);
 
-    int64_t concat_dim = 1;
+    const int32_t dim = ggml_get_op_params_i32(dst, 0);
+
+    GGML_ASSERT(dim >= 0 && dim < 4);
+    int32_t acl_dim = 3 - dim;
+
     aclTensor* tensors[] = {acl_src0, acl_src1};
     aclTensorList* tensorList = aclCreateTensorList(tensors, 2);
-    aclnn_concat(ctx, tensorList, acl_dst, concat_dim);
+    aclnn_concat(ctx, tensorList, acl_dst, acl_dim);
 
     ACL_CHECK(aclDestroyTensorList(tensorList));
     ACL_CHECK(aclDestroyTensor(acl_dst));
@@ -1437,10 +1444,6 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     ggml_tensor* src0 = dst->src[0];  // kernel
     ggml_tensor* src1 = dst->src[1];  // input
 
-    GGML_ASSERT(src0->type == GGML_TYPE_F16);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);
-
     GGML_TENSOR_BINARY_OP_LOCALS;
 
     // aclnnIm2col only works on 2D. set s1, p1, d1 to 1 to perform 2D
@@ -1462,9 +1465,6 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     const int64_t OH = is_2D ? ne2 : 1;
     const int64_t OW = ne1;
 
-    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
-    GGML_ASSERT(nb10 == sizeof(float));
-
     // memory allocated increased to 3x when is_2D == false
     const int64_t n_bytes_factor = is_2D ? 1 : 3;
@@ -2425,7 +2425,6 @@ static void aclnn_mat_mul(ggml_backend_cann_context& ctx, aclTensor* acl_input,
                           aclTensor* acl_weight, aclTensor* acl_dst) {
     int8_t cube_math_type = 1;  // ALLOW_FP32_DOWN_PRECISION, when input is
                                 // fp32, atlas a2 will transpose it to HFLOAT32.
-
     uint64_t workspaceSize = 0;
     aclOpExecutor* executor;
     void* workspaceAddr = nullptr;
@@ -2443,6 +2442,80 @@ static void aclnn_mat_mul(ggml_backend_cann_context& ctx, aclTensor* acl_input,
         aclnnMatmul(workspaceAddr, workspaceSize, executor, ctx.stream()));
 }
 
+/**
+ * @brief Performs matrix multiplication of two 2D tensors.
+ *
+ * This function computes the matrix multiplication of the input tensor
+ * `acl_input` and the weight tensor `acl_weight`, and stores the result in the
+ * destination tensor `acl_dst`.
+ * The operation is defined as:
+ * \f[
+ *     \text{acl_dst} = \text{acl_input} @ \text{acl_weight}
+ * \f]
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param acl_input The input tensor for the matrix multiplication.
+ * @param acl_weight The weight tensor for the matrix multiplication.
+ * @param acl_dst The destination tensor where the result of the matrix
+ * multiplication will be stored.
+ */
+static void aclnn_mat_mul_2d(ggml_backend_cann_context& ctx, aclTensor* acl_input,
+                             aclTensor* acl_weight, aclTensor* acl_dst) {
+    int8_t cube_math_type = 2;
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;
+    void* workspaceAddr = nullptr;
+
+    ACL_CHECK(aclnnMmGetWorkspaceSize(acl_input, acl_weight, acl_dst,
+                                      cube_math_type, &workspaceSize,
+                                      &executor));
+
+    if (workspaceSize > 0) {
+        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+        workspaceAddr = workspace_allocator.get();
+    }
+
+    ACL_CHECK(
+        aclnnMm(workspaceAddr, workspaceSize, executor, ctx.stream()));
+}
+
+/**
+ * @brief Performs matrix multiplication of two 3D tensors.
+ *
+ * This function computes the matrix multiplication of the input tensor
+ * `acl_input` and the weight tensor `acl_weight`, and stores the result in the
+ * destination tensor `acl_dst`.
+ * The operation is defined as:
+ * \f[
+ *     \text{acl_dst} = \text{acl_input} @ \text{acl_weight}
+ * \f]
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param acl_input The input tensor for the matrix multiplication.
+ * @param acl_weight The weight tensor for the matrix multiplication.
+ * @param acl_dst The destination tensor where the result of the matrix
+ * multiplication will be stored.
+ */
+static void aclnn_mat_mul_3d(ggml_backend_cann_context& ctx, aclTensor* acl_input,
+                             aclTensor* acl_weight, aclTensor* acl_dst) {
+    int8_t cube_math_type = 2;
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;
+    void* workspaceAddr = nullptr;
+
+    ACL_CHECK(aclnnBatchMatMulGetWorkspaceSize(acl_input, acl_weight, acl_dst,
+                                               cube_math_type, &workspaceSize,
+                                               &executor));
+
+    if (workspaceSize > 0) {
+        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+        workspaceAddr = workspace_allocator.get();
+    }
+
+    ACL_CHECK(
+        aclnnBatchMatMul(workspaceAddr, workspaceSize, executor, ctx.stream()));
+}
+
 /**
  * @brief Performs matrix multiplication with floating-point precision on
  *        tensors using the CANN backend.
@@ -2464,20 +2537,43 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx,
     // broadcast, when weight ne2 or ne3 is not 1, weight need repeat.
     BCAST_MUL_MAT_SHAPE(input, weight, dst);
 
-    // transpose weight: [1,2,3,4] -> [1,2,4,3]
-    int64_t transpose_ne[] = {bcast_weight_ne[1], bcast_weight_ne[0],
-                              bcast_weight_ne[2], bcast_weight_ne[3],
-                              bcast_weight_ne[4], bcast_weight_ne[5]};
-    size_t transpose_nb[] = {bcast_weight_nb[1], bcast_weight_nb[0],
-                             bcast_weight_nb[2], bcast_weight_nb[3],
-                             bcast_weight_nb[4], bcast_weight_nb[5]};
-
-    aclTensor* acl_weight_tensor =
-        ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, bcast_dims);
+    int64_t n_dims = bcast_dims;
+    if (bcast_input_ne[3] == bcast_weight_ne[3] && bcast_input_ne[3] == 1) {
+        if (bcast_input_ne[2] == 1 && bcast_weight_ne[2] == 1) {
+            n_dims = 2;
+        } else if (bcast_input_ne[2] == 1) {
+            n_dims = 3;
+        }
+    }
+
     aclTensor* acl_input_tensor =
-        ggml_cann_create_tensor(input, BCAST_MUL_MAT_PARAM(input));
-    aclTensor* acl_dst = ggml_cann_create_tensor(dst, BCAST_MUL_MAT_PARAM(dst));
-    aclnn_mat_mul(ctx, acl_input_tensor, acl_weight_tensor, acl_dst);
+        ggml_cann_create_tensor(input, bcast_input_ne, bcast_input_nb, n_dims);
+    int64_t transpose_ne[] = {
+        bcast_weight_ne[1], bcast_weight_ne[0],
+        bcast_weight_ne[2], bcast_weight_ne[3],
+        bcast_weight_ne[4], bcast_weight_ne[5]
+    };
+    size_t transpose_nb[] = {
+        bcast_weight_nb[1], bcast_weight_nb[0],
+        bcast_weight_nb[2], bcast_weight_nb[3],
+        bcast_weight_nb[4], bcast_weight_nb[5]
+    };
+    aclTensor* acl_weight_tensor =
+        ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims);
+    aclTensor* acl_dst =
+        ggml_cann_create_tensor(dst, bcast_dst_ne, bcast_dst_nb, n_dims);
+
+    switch (n_dims) {
+        case 2:
+            aclnn_mat_mul_2d(ctx, acl_input_tensor, acl_weight_tensor, acl_dst);
+            break;
+        case 3:
+            aclnn_mat_mul_3d(ctx, acl_input_tensor, acl_weight_tensor, acl_dst);
+            break;
+        default:
+            aclnn_mat_mul(ctx, acl_input_tensor, acl_weight_tensor, acl_dst);
+            break;
+    }
 
     ACL_CHECK(aclDestroyTensor(acl_weight_tensor));
     ACL_CHECK(aclDestroyTensor(acl_input_tensor));
@@ -2503,46 +2599,40 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
     ggml_tensor* src0 = dst->src[0];  // weight
     ggml_tensor* src1 = dst->src[1];  // input
 
-    // The shape of the weight is NCHW. Matrix multiplication uses HW dims. HC
-    // is regarded as batch. weight need transpose.
-    int64_t weight_ne[] = {src0->ne[1], src0->ne[0]};
+    // The shape of the weight is NCHW.
+    // Matrix multiplication uses HW dims.
+    // HC is regarded as batch.
+    // weight need transpose.
     float weight_elem_size;
     if (type == GGML_TYPE_Q4_0) {
         weight_elem_size = float(sizeof(uint8_t)) / 2;
-    }
-    else if (type == GGML_TYPE_Q8_0) {
+    } else if (type == GGML_TYPE_Q8_0) {
         weight_elem_size = float(sizeof(uint8_t));
-    }
-    else {
+    } else {
         GGML_ABORT("Only support Q4_0 and Q8_0 MUL_MAT");
     }
-    float weight_nb[] = {weight_elem_size * src0->ne[0], weight_elem_size};
-
-    // size of one matrix is element_size * height * width.
-    size_t weight_stride = weight_elem_size * src0->ne[0] * src0->ne[1];
+    float weight_nb[] = {src0->ne[0] * weight_elem_size, weight_elem_size};
+    size_t weight_stride = src0->ne[1] * src0->ne[0] * weight_elem_size;
     size_t weight_size = weight_stride * src0->ne[2] * src0->ne[3];
 
     // scale stored at the end of weight. Also need transpose.
-    GGML_ASSERT(QK4_0 == QK8_0);
-    int64_t scale_ne[] = {src0->ne[1], src0->ne[0] / QK8_0};
     size_t scale_elem_size = sizeof(uint16_t);
-    size_t scale_nb[] = {src0->ne[0] / QK8_0 * scale_elem_size,
-                         scale_elem_size};
-    size_t scale_stride = scale_elem_size * src0->ne[0] * src0->ne[1] / QK8_0;
+    size_t scale_nb[] = {src0->ne[0] / QK8_0 * scale_elem_size, scale_elem_size};
+    size_t scale_stride = src0->ne[1] * src0->ne[0] / QK8_0 * scale_elem_size;
     char* scale_offset = (char*)src0->data + weight_size;
 
     // input
-    void* input_buffer;
     size_t input_elem_size = sizeof(uint16_t);
     int64_t input_ne[] = {src1->ne[0], src1->ne[1]};
-    size_t input_nb[] = {input_elem_size, input_elem_size * src1->ne[0]};
-    size_t input_stride = input_elem_size * src1->ne[0] * src1->ne[1];
-
+    size_t input_nb[] = {input_elem_size, input_ne[0] * input_elem_size};
+    size_t input_stride = input_ne[0] * input_ne[1] * input_elem_size;
     ggml_cann_pool_alloc input_alloctor(ctx.pool());
+    void* input_buffer = src1->data;
+
+    // case in
     if (src1->type != GGML_TYPE_F16) {
         aclTensor* acl_src1_tensor = ggml_cann_create_tensor(src1);
-        input_alloctor.alloc(ggml_nelements(src1) * input_elem_size);
-        input_buffer = input_alloctor.get();
+        input_buffer = input_alloctor.alloc(ggml_nelements(src1) * input_elem_size);
 
         int64_t* input_cast_ne = src1->ne;
         size_t input_cast_nb[GGML_MAX_DIMS];
@@ -2552,73 +2642,122 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
         }
 
         aclTensor* acl_input_tensor = ggml_cann_create_tensor(
-            input_buffer, ACL_FLOAT16, input_elem_size, input_cast_ne,
-            input_cast_nb, GGML_MAX_DIMS);
+            input_buffer,
+            ACL_FLOAT16,
+            input_elem_size, input_cast_ne, input_cast_nb, GGML_MAX_DIMS);
         aclnn_cast(ctx, acl_src1_tensor, acl_input_tensor, ACL_FLOAT16);
         ACL_CHECK(aclDestroyTensor(acl_input_tensor));
         ACL_CHECK(aclDestroyTensor(acl_src1_tensor));
-    } else {
-        input_buffer = src1->data;
     }
 
     // output
     size_t output_elem_size = sizeof(uint16_t);
-    int64_t output_ne[] = {dst->ne[0], dst->ne[1]};
-    size_t output_nb[] = {output_elem_size, output_elem_size * dst->ne[0]};
-    ggml_cann_pool_alloc output_alloctor(
-        ctx.pool(), ggml_nelements(dst) * output_elem_size);
-    void* output_buffer = output_alloctor.get();
-    size_t output_stride = output_elem_size * dst->ne[0] * dst->ne[1];
+    size_t output_nb[] = {output_elem_size, dst->ne[0] * output_elem_size};
+    ggml_cann_pool_alloc output_allocator(ctx.pool());
+    void* output_buffer = output_allocator.alloc(ggml_nelements(dst) * output_elem_size);
+    size_t output_stride = dst->ne[0] * dst->ne[1] * output_elem_size;
 
     // aclnn
+    int64_t max_elem_size = 65535;
+    int64_t split_size = (src0->ne[1] / max_elem_size) + 1;
+
+    ggml_cann_pool_alloc workspace_allocator(ctx.pool());
+    aclOpExecutor* executor = nullptr;
     uint64_t workspaceSize = 0;
-    aclOpExecutor* executor;
     void* workspaceAddr = nullptr;
 
     for (int64_t n1 = 0; n1 < src1->ne[3]; n1++) {
         for (int64_t c1 = 0; c1 < src1->ne[2]; c1++) {
             int64_t n0 = n1 / (src1->ne[3] / src0->ne[3]);
             int64_t c0 = c1 / (src1->ne[2] / src0->ne[2]);
 
-            int64_t batch1 = n1 * src1->ne[2] + c1;
-            int64_t batch0 = n0 * src0->ne[2] + c0;
+            int64_t batch1 = (n1 * src1->ne[2]) + c1;
+            int64_t batch0 = (n0 * src0->ne[2]) + c0;
 
             aclTensor* acl_input_tensor = ggml_cann_create_tensor(
                 (char*)input_buffer + batch1 * input_stride, ACL_FLOAT16,
                 input_elem_size, input_ne, input_nb, 2);
+
+            // first split
+            int64_t weight_ne_offset = 0;
+            int64_t weight_ne[2] = {max_elem_size > src0->ne[1] ? src0->ne[1] : max_elem_size, src0->ne[0]};
+            int64_t scale_ne_offset = 0;
+            int64_t scale_ne[2] = {weight_ne[0], weight_ne[1] / QK8_0};
+            int64_t output_ne_offset = 0;
+            int64_t output_ne[2] = {weight_ne[0], dst->ne[1]};
+
             aclTensor* acl_weight_tensor = ggml_cann_create_tensor(
                 (char*)src0->data + batch0 * weight_stride,
-                ggml_cann_type_mapping(type), weight_elem_size, weight_ne,
-                weight_nb, 2);
+                ggml_cann_type_mapping(type),
+                weight_elem_size, weight_ne, weight_nb, 2,
+                ACL_FORMAT_ND, weight_ne_offset);
             aclTensor* acl_scale_tensor = ggml_cann_create_tensor(
-                scale_offset + batch0 * scale_stride, ACL_FLOAT16,
-                scale_elem_size, scale_ne, scale_nb, 2);
+                scale_offset + batch0 * scale_stride,
+                ACL_FLOAT16,
+                scale_elem_size, scale_ne, scale_nb, 2,
+                ACL_FORMAT_ND, scale_ne_offset);
             aclTensor* acl_output_tensor = ggml_cann_create_tensor(
-                (char*)output_buffer + batch1 * output_stride, ACL_FLOAT16,
-                output_elem_size, output_ne, output_nb, 2);
+                (char*)output_buffer + batch1 * output_stride,
+                ACL_FLOAT16,
+                output_elem_size, output_ne, output_nb, 2,
+                ACL_FORMAT_ND, output_ne_offset);
 
             ACL_CHECK(aclnnWeightQuantBatchMatmulV2GetWorkspaceSize(
-                acl_input_tensor, acl_weight_tensor, acl_scale_tensor, nullptr,
-                nullptr, nullptr, nullptr, QK8_0, acl_output_tensor,
-                &workspaceSize, &executor));
-
-            if (workspaceSize > 0 && workspaceAddr == nullptr) {
-                ggml_cann_pool_alloc workspace_allocator(ctx.pool(),
-                                                         workspaceSize);
-                workspaceAddr = workspace_allocator.get();
+                acl_input_tensor, acl_weight_tensor, acl_scale_tensor,
+                nullptr, nullptr, nullptr, nullptr, QK8_0,
+                acl_output_tensor, &workspaceSize, &executor));
+            if (workspaceAddr == nullptr) {
+                workspaceAddr = workspace_allocator.alloc(workspaceSize);
             }
-
             ACL_CHECK(aclnnWeightQuantBatchMatmulV2(
                 workspaceAddr, workspaceSize, executor, ctx.stream()));
 
-            ACL_CHECK(aclDestroyTensor(acl_input_tensor));
             ACL_CHECK(aclDestroyTensor(acl_weight_tensor));
             ACL_CHECK(aclDestroyTensor(acl_scale_tensor));
             ACL_CHECK(aclDestroyTensor(acl_output_tensor));
+
+            // other splits
+            for (int64_t split = 1; split < split_size; split++) {
+                weight_ne_offset += weight_elem_size * weight_ne[0] * weight_ne[1];
+                weight_ne[0] = max_elem_size * (split + 1) > src0->ne[1] ? src0->ne[1] - (max_elem_size * split) : max_elem_size;
+                scale_ne_offset += scale_elem_size * scale_ne[0] * scale_ne[1];
+                scale_ne[0] = weight_ne[0];
+                output_ne_offset += output_elem_size * output_ne[0] * output_ne[1];
+                output_ne[0] = weight_ne[0];
+
+                acl_weight_tensor = ggml_cann_create_tensor(
+                    (char*)src0->data + batch0 * weight_stride,
+                    ggml_cann_type_mapping(type),
+                    weight_elem_size, weight_ne, weight_nb, 2,
+                    ACL_FORMAT_ND, weight_ne_offset);
+                acl_scale_tensor = ggml_cann_create_tensor(
+                    scale_offset + batch0 * scale_stride,
+                    ACL_FLOAT16,
+                    scale_elem_size, scale_ne, scale_nb, 2,
+                    ACL_FORMAT_ND, scale_ne_offset);
+                acl_output_tensor = ggml_cann_create_tensor(
+                    (char*)output_buffer + batch1 * output_stride,
+                    ACL_FLOAT16,
+                    output_elem_size, output_ne, output_nb, 2,
+                    ACL_FORMAT_ND, output_ne_offset);
+
+                ACL_CHECK(aclnnWeightQuantBatchMatmulV2GetWorkspaceSize(
+                    acl_input_tensor, acl_weight_tensor, acl_scale_tensor,
+                    nullptr, nullptr, nullptr, nullptr, QK8_0,
+                    acl_output_tensor, &workspaceSize, &executor));
+                ACL_CHECK(aclnnWeightQuantBatchMatmulV2(
+                    workspaceAddr, workspaceSize, executor, ctx.stream()));
+
+                ACL_CHECK(aclDestroyTensor(acl_weight_tensor));
+                ACL_CHECK(aclDestroyTensor(acl_scale_tensor));
+                ACL_CHECK(aclDestroyTensor(acl_output_tensor));
+            }
+
+            ACL_CHECK(aclDestroyTensor(acl_input_tensor));
         }
     }
 
     // cast out
+    if (dst->type != GGML_TYPE_F16) {
         int64_t* output_cast_ne = dst->ne;
         size_t output_cast_nb[GGML_MAX_DIMS];
         output_cast_nb[0] = sizeof(uint16_t);
@@ -2626,14 +2765,16 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
             output_cast_nb[i] = output_cast_nb[i - 1] * output_cast_ne[i - 1];
         }
 
-    aclTensor* acl_output_tensor =
-        ggml_cann_create_tensor(output_buffer, ACL_FLOAT16, output_elem_size,
-                                output_cast_ne, output_cast_nb, GGML_MAX_DIMS);
+        aclTensor* acl_output_tensor = ggml_cann_create_tensor(
+            output_buffer,
+            ACL_FLOAT16,
+            output_elem_size, output_cast_ne, output_cast_nb, GGML_MAX_DIMS);
         aclTensor* acl_dst_tensor = ggml_cann_create_tensor(dst);
-    aclnn_cast(ctx, acl_output_tensor, acl_dst_tensor, ACL_FLOAT);
+        aclnn_cast(ctx, acl_output_tensor, acl_dst_tensor, ggml_cann_type_mapping(dst->type));
 
         ACL_CHECK(aclDestroyTensor(acl_output_tensor));
         ACL_CHECK(aclDestroyTensor(acl_dst_tensor));
+    }
 }
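The `max_elem_size`/`split_size` logic above chunks weights whose second dimension exceeds 65535 rows, presumably because of a per-dimension limit in the aclnn weight-quant matmul; each chunk's tensor view advances by the element count of the previous chunks. A standalone sketch of just the chunking arithmetic (the 200000-row size is invented for illustration):

```cpp
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t max_elem_size = 65535;
    const int64_t n_rows = 200000;  // hypothetical src0->ne[1]

    // same formula as the hunk above
    const int64_t split_size = (n_rows / max_elem_size) + 1;

    int64_t offset = 0;
    for (int64_t split = 0; split < split_size; split++) {
        // every chunk gets max_elem_size rows except the last, which gets the remainder
        const int64_t rows = (max_elem_size * (split + 1) > n_rows)
                                 ? n_rows - max_elem_size * split
                                 : max_elem_size;
        if (rows <= 0) break;  // exact multiples leave an empty trailing split
        std::printf("split %lld: %lld rows starting at row %lld\n",
                    (long long) split, (long long) rows, (long long) offset);
        offset += rows;
    }
    return 0;
}
```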
void ggml_cann_mul_mat(ggml_backend_cann_context& ctx, ggml_tensor* dst) { void ggml_cann_mul_mat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
@@ -2859,15 +3000,27 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
     ACL_CHECK(aclDestroyTensor(acl_cos_tensor));
 }
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+aclnnStatus aclnnRotaryPositionEmbeddingGetWorkspaceSize(
+    const aclTensor* x, const aclTensor* cos, const aclTensor* sin,
+    int64_t mode, const aclTensor* yOut, uint64_t* workspaceSize,
+    aclOpExecutor** executor);
+aclnnStatus aclnnRotaryPositionEmbedding(void* workspace,
+                                         uint64_t workspaceSize,
+                                         aclOpExecutor* executor,
+                                         aclrtStream stream);
+
+#ifdef __cplusplus
+}
+#endif
+
 void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     // TODO: use ascendc
     // Only test with LLAMA model.
     ggml_tensor* src0 = dst->src[0];  // input
     ggml_tensor* src2 = dst->src[2];  // freq_factors
 
-    // TODO: with freq_factors
-    GGML_ASSERT(src2 == NULL);
-
     // param
     float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
     // const int n_past = ((int32_t *) dst->op_params)[0];
@@ -2885,13 +3038,19 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     memcpy(&beta_fast, (int32_t*)dst->op_params + 9, sizeof(float));
     memcpy(&beta_slow, (int32_t*)dst->op_params + 10, sizeof(float));
 
-    GGML_ASSERT(n_dims <= ne0);
+    // TODO: with freq_factors
+    GGML_ASSERT(src2 == NULL);
+    // TODO: attn_factor != 1
+    GGML_ASSERT(attn_factor == 1);
+    // TODO: n_dims <= ne0
+    GGML_ASSERT(n_dims == ne0);
     GGML_ASSERT(n_dims % 2 == 0);
-
     // TODO: ext_factor != 0
     GGML_ASSERT(ext_factor == 0);
     // TODO: freq_scale != 1
     GGML_ASSERT(freq_scale == 1);
+    // TODO: type == GGML_TYPE_F16
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
 
     const float theta_scale = powf(freq_base, -2.0f / n_dims);
@@ -2924,177 +3083,30 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     aclnn_cache_init(ctx, dst, acl_cos_reshape_tensor, acl_sin_reshape_tensor,
                      theta_scale, is_neox);
 
-    // roll input
-    void* input_roll_buffer;
-    aclTensor* acl_minus_one_tensor;
-    void* minus_one_scale_buffer = nullptr;
-    ggml_cann_pool_alloc roll_allocator(ctx.pool(), ggml_nbytes(src0));
-    ggml_cann_pool_alloc minus_one_scale_allocator(
-        ctx.pool(), sizeof(float_t) * src0->ne[0]);
-    if (!is_neox) {
-        // roll input: [q0,q1,q2,q3,...] -> [q1,q0,q3,q2,...]
-        input_roll_buffer = roll_allocator.get();
-        int64_t input_roll_ne[4] = {2, src0->ne[1] * (src0->ne[0] / 2),
-                                    src0->ne[2], src0->ne[3]};
-        size_t input_roll_nb[GGML_MAX_DIMS];
-        input_roll_nb[0] = ggml_type_size(src0->type);
-        for (int i = 1; i < GGML_MAX_DIMS; i++) {
-            input_roll_nb[i] = input_roll_nb[i - 1] * input_roll_ne[i - 1];
-        }
-        aclTensor* acl_input_roll_tensor = ggml_cann_create_tensor(
-            input_roll_buffer, ggml_cann_type_mapping(src0->type),
-            ggml_type_size(src0->type), input_roll_ne, input_roll_nb,
-            GGML_MAX_DIMS);
-        aclTensor* acl_input_tensor = ggml_cann_create_tensor(
-            src0->data, ggml_cann_type_mapping(src0->type),
-            ggml_type_size(src0->type), input_roll_ne, input_roll_nb,
-            GGML_MAX_DIMS);
-
-        int64_t shifts[] = {1};
-        int64_t dims[] = {3};
-        aclnn_roll(ctx, acl_input_tensor, acl_input_roll_tensor, shifts, dims);
-        ACL_CHECK(aclDestroyTensor(acl_input_roll_tensor));
-        ACL_CHECK(aclDestroyTensor(acl_input_tensor));
-
-        // init [-1, 1, -1, 1, ...]
-        minus_one_scale_buffer = minus_one_scale_allocator.get();
-
-        int64_t minus_one_ne[4] = {src0->ne[0], 1, 1, 1};
-        size_t minus_one_nb[GGML_MAX_DIMS];
-        minus_one_nb[0] = sizeof(float_t);
-        for (int i = 1; i < GGML_MAX_DIMS; i++) {
-            minus_one_nb[i] = minus_one_nb[i - 1] * minus_one_ne[i - 1];
-        }
-        acl_minus_one_tensor = aclnn_ones(
-            ctx, minus_one_scale_buffer, sizeof(float_t) * src0->ne[0],
-            minus_one_ne, GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t), 1);
-        int64_t dim = 3;
-        int64_t* index = new int64_t[src0->ne[0]];
-        for (int i = 0; i < src0->ne[0]; i++) {
-            index[i] = i / 2 * 2;
-        }
-        int64_t index_num = src0->ne[0];
-        float value = -1;
-        aclnn_index_fill_tensor(ctx, acl_minus_one_tensor, dim, index,
-                                index_num, value);
-    } else {
-        // roll input: [q0,q1,q2,...] ->
-        // [q_half,q_half+1,...,q_end,q0,q1,...q_half-1]
-        input_roll_buffer = roll_allocator.get();
-        aclTensor* acl_input_roll_tensor = ggml_cann_create_tensor(
-            input_roll_buffer, ggml_cann_type_mapping(src0->type),
-            ggml_type_size(src0->type), src0->ne, src0->nb, GGML_MAX_DIMS);
-        aclTensor* acl_input_tensor = ggml_cann_create_tensor(src0);
-
-        int64_t shifts[] = {src0->ne[0] / 2};
-        int64_t dims[] = {3};
-        aclnn_roll(ctx, acl_input_tensor, acl_input_roll_tensor, shifts, dims);
-
-        ACL_CHECK(aclDestroyTensor(acl_input_roll_tensor));
-        ACL_CHECK(aclDestroyTensor(acl_input_tensor));
-
-        // init [-1, -1, -1, 1, 1, 1, ...]
-        minus_one_scale_buffer = minus_one_scale_allocator.get();
-
-        int64_t minus_one_ne[4] = {src0->ne[0], 1, 1, 1};
-        size_t minus_one_nb[GGML_MAX_DIMS];
-        minus_one_nb[0] = sizeof(float_t);
-        for (int i = 1; i < GGML_MAX_DIMS; i++) {
-            minus_one_nb[i] = minus_one_nb[i - 1] * minus_one_ne[i - 1];
-        }
-        acl_minus_one_tensor = aclnn_ones(
-            ctx, minus_one_scale_buffer, sizeof(float_t) * src0->ne[0],
-            minus_one_ne, GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t), 1);
-        // -1 * first half
-        int64_t first_half_ne[4] = {src0->ne[0] / 2, 1, 1, 1};
-        size_t first_half_nb[GGML_MAX_DIMS];
-        first_half_nb[0] = sizeof(float_t);
-        for (int i = 1; i < GGML_MAX_DIMS; i++) {
-            first_half_nb[i] = first_half_nb[i - 1] * first_half_ne[i - 1];
-        }
-        aclTensor* acl_first_half_tensor = ggml_cann_create_tensor(
-            minus_one_scale_buffer, ACL_FLOAT, sizeof(float_t), first_half_ne,
-            first_half_nb, GGML_MAX_DIMS);
-        bool inplace = true;
-        float scale = -1;
-        aclnn_muls(ctx, acl_first_half_tensor, scale, nullptr, inplace);
-        ACL_CHECK(aclDestroyTensor(acl_first_half_tensor));
-    }
-
-    // TODO: n_dims < ne0
-    GGML_ASSERT(n_dims == src0->ne[0]);
-
-    // input * scale
-    ggml_cann_pool_alloc roll_mul_scale_allocator(ctx.pool(),
-                                                  ggml_nbytes(src0));
-    void* input_roll_mul_scale_buffer = roll_mul_scale_allocator.get();
-    size_t input_nb[GGML_MAX_DIMS];
-    input_nb[0] = ggml_type_size(src0->type);
-    for (int i = 1; i < GGML_MAX_DIMS; i++) {
-        input_nb[i] = input_nb[i - 1] * src0->ne[i - 1];
-    }
-    aclTensor* acl_input_roll_mul_scale_tensor = ggml_cann_create_tensor(
-        input_roll_mul_scale_buffer, ggml_cann_type_mapping(src0->type),
-        ggml_type_size(src0->type), src0->ne, input_nb, GGML_MAX_DIMS);
-    aclTensor* acl_input_roll_reshape_tensor = ggml_cann_create_tensor(
-        input_roll_buffer, ggml_cann_type_mapping(src0->type),
-        ggml_type_size(src0->type), src0->ne, input_nb, GGML_MAX_DIMS);
-
-    aclnn_mul(ctx, acl_input_roll_reshape_tensor, acl_minus_one_tensor,
-              acl_input_roll_mul_scale_tensor);
-
-    // output
-    aclTensor* acl_src0 = ggml_cann_create_tensor(src0);
-    aclTensor* acl_dst = ggml_cann_create_tensor(dst);
-    void* output_fp32_buffer;
-    if (src0->type == GGML_TYPE_F32) {
-        aclnn_inplace_mul(ctx, acl_src0, acl_cos_reshape_tensor);
-        aclnn_inplace_mul(ctx, acl_input_roll_mul_scale_tensor,
-                          acl_sin_reshape_tensor);
-        aclnn_add(ctx, acl_src0, acl_input_roll_mul_scale_tensor, acl_dst);
-        // TODO: ne0 != n_dims in mode2
-    } else if (src0->type == GGML_TYPE_F16) {
-        size_t input_fp32_nb[GGML_MAX_DIMS];
-        input_fp32_nb[0] = sizeof(float_t);
-        for (int i = 1; i < GGML_MAX_DIMS; i++) {
-            input_fp32_nb[i] = input_fp32_nb[i - 1] * dst->ne[i - 1];
-        }
-        ggml_cann_pool_alloc fp32_allocator1(
-            ctx.pool(), ggml_nelements(dst) * sizeof(float_t));
-        void* input_fp32_buffer1 = fp32_allocator1.get();
-        aclTensor* input_fp32_tensor1 = ggml_cann_create_tensor(
-            input_fp32_buffer1, ACL_FLOAT, sizeof(float_t), dst->ne,
-            input_fp32_nb, GGML_MAX_DIMS);
-        ggml_cann_pool_alloc fp32_allocator2(
-            ctx.pool(), ggml_nelements(dst) * sizeof(float_t));
-        void* input_fp32_buffer2 = fp32_allocator2.get();
-        aclTensor* input_fp32_tensor2 = ggml_cann_create_tensor(
-            input_fp32_buffer2, ACL_FLOAT, sizeof(float_t), dst->ne,
-            input_fp32_nb, GGML_MAX_DIMS);
-        ggml_cann_pool_alloc fp32_allocator(
-            ctx.pool(), ggml_nelements(dst) * sizeof(float_t));
-        output_fp32_buffer = fp32_allocator.get();
-        aclTensor* output_fp32_tensor = ggml_cann_create_tensor(
-            output_fp32_buffer, ACL_FLOAT, sizeof(float_t), dst->ne,
-            input_fp32_nb, GGML_MAX_DIMS);
-        aclnn_mul(ctx, acl_src0, acl_cos_reshape_tensor, input_fp32_tensor1);
-        aclnn_mul(ctx, acl_input_roll_mul_scale_tensor, acl_sin_reshape_tensor,
-                  input_fp32_tensor2);
-        aclnn_add(ctx, input_fp32_tensor1, input_fp32_tensor2,
-                  output_fp32_tensor);
-        aclnn_cast(ctx, output_fp32_tensor, acl_dst, ACL_FLOAT16);
-
-        ACL_CHECK(aclDestroyTensor(input_fp32_tensor1));
-        ACL_CHECK(aclDestroyTensor(input_fp32_tensor2));
-        ACL_CHECK(aclDestroyTensor(output_fp32_tensor));
-    }
-
-    ACL_CHECK(aclDestroyTensor(acl_sin_reshape_tensor));
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;
+
+    void* workspaceAddr = nullptr;
+
+    int acl_mode = mode;
+    if (mode == 0) {
+        acl_mode = 1;
+    }
+
+    aclTensor* acl_x = ggml_cann_create_tensor(src0);
+    aclTensor* acl_dst = ggml_cann_create_tensor(dst);
+    ACL_CHECK(aclnnRotaryPositionEmbeddingGetWorkspaceSize(
+        acl_x, acl_cos_reshape_tensor, acl_sin_reshape_tensor, acl_mode,
+        acl_dst, &workspaceSize, &executor));
+    if (workspaceSize > 0) {
+        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+        workspaceAddr = workspace_allocator.get();
+    }
+
+    ACL_CHECK(aclnnRotaryPositionEmbedding(workspaceAddr, workspaceSize,
+                                           executor, ctx.stream()));
+
+    ACL_CHECK(aclDestroyTensor(acl_x));
     ACL_CHECK(aclDestroyTensor(acl_cos_reshape_tensor));
-    ACL_CHECK(aclDestroyTensor(acl_minus_one_tensor));
-    ACL_CHECK(aclDestroyTensor(acl_input_roll_mul_scale_tensor));
-    ACL_CHECK(aclDestroyTensor(acl_input_roll_reshape_tensor));
-    ACL_CHECK(aclDestroyTensor(acl_src0));
+    ACL_CHECK(aclDestroyTensor(acl_sin_reshape_tensor));
     ACL_CHECK(aclDestroyTensor(acl_dst));
 }


@@ -211,17 +211,20 @@ struct ggml_cann_pool_alloc {
 struct ggml_backend_cann_context {
     int32_t device;                  /**< Device ID. */
     std::string name;                /**< Name of the device. */
+    std::string description;         /**< Description of the device. */
     aclrtEvent copy_event = nullptr; /**< Event for managing copy operations. */
 
-    aclrtStream streams[GGML_CANN_MAX_STREAMS] = {
-        {nullptr}}; /**< Array of streams for the device. */
+    aclrtStream streams[GGML_CANN_MAX_STREAMS] = {nullptr}; /**< Array of streams for the device. */
 
     /**
      * @brief Constructor for initializing the context with a given device.
      * @param device Device ID.
      */
     explicit ggml_backend_cann_context(int device)
-        : device(device), name("CANN" + std::to_string(device)) {}
+        : device(device), name("CANN" + std::to_string(device)) {
+        ggml_cann_set_device(device);
+        description = aclrtGetSocName();
+    }
 
     /**
      * @brief Destructor for cleaning up resources.

View file

@@ -122,6 +122,10 @@ static ggml_cann_device_info ggml_cann_init() {
         ACL_CHECK(aclrtMemGetAllocationGranularity(
             &prop, ACL_RT_MEM_ALLOC_GRANULARITY_RECOMMENDED,
             &info.devices[id].vmm_granularity));
+
+        size_t free, total;
+        ggml_backend_cann_get_device_memory(id, &free, &total);
+        info.devices[id].total_vram = free;
     }

     // TODO: add more device info later.
@@ -208,6 +212,11 @@ struct ggml_cann_pool_leg : public ggml_cann_pool {
      * @return A pointer to the allocated buffer.
      */
     void* alloc(size_t size, size_t* actual_size) override {
+        const size_t alignment = 128;
+        size = GGML_PAD(size, alignment);
+        if (size == 0) {
+            size = alignment;
+        }
 #ifdef DEBUG_CANN_MALLOC
         int nnz = 0;
         size_t max_size = 0;
@@ -246,13 +255,11 @@ struct ggml_cann_pool_leg : public ggml_cann_pool {
             return ptr;
         }
         void* ptr;
-        size_t look_ahead_size = (size_t)(1.05 * size);
-        look_ahead_size = 256 * ((look_ahead_size + 255) / 256);
         ggml_cann_set_device(device);
         ACL_CHECK(
-            aclrtMalloc(&ptr, look_ahead_size, ACL_MEM_MALLOC_HUGE_FIRST));
-        *actual_size = look_ahead_size;
-        pool_size += look_ahead_size;
+            aclrtMalloc(&ptr, size, ACL_MEM_MALLOC_HUGE_FIRST));
+        *actual_size = size;
+        pool_size += size;
 #ifdef DEBUG_CANN_MALLOC
         GGML_LOG_INFO(
             "%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, "
@@ -296,7 +303,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
     /**
      * @brief The maximum size of the virtual memory pool (32 GB).
      */
-    static const size_t CANN_POOL_VMM_MAX_SIZE = 1ull << 35;  // 32 GB
+    size_t max_size;

     /**
      * @brief The device ID associated with this buffer pool.
@@ -341,7 +348,11 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
      */
     explicit ggml_cann_pool_vmm(int device)
         : device(device),
-          granularity(ggml_cann_info().devices[device].vmm_granularity) {}
+          granularity(ggml_cann_info().devices[device].vmm_granularity) {
+        auto dev = ggml_cann_info().devices[device];
+        granularity = dev.vmm_granularity;
+        max_size = dev.total_vram;
+    }

     /**
      * @brief Destructor to free all buffers in the virtual memory pool.
@@ -370,17 +381,19 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
         // round up the allocation size to the alignment to ensure that all
         // allocations are aligned for all data types
         const size_t alignment = 128;
-        size = alignment * ((size + alignment - 1) / alignment);
+        size = GGML_PAD(size, alignment);
+        if (size == 0) {
+            size = alignment;
+        }

         size_t avail = pool_size - pool_used;

         if (size > avail) {
             // round up to the next multiple of the granularity
             size_t reserve_size = size - avail;
-            reserve_size =
-                granularity * ((reserve_size + granularity - 1) / granularity);
+            reserve_size = GGML_PAD(reserve_size, granularity);

-            GGML_ASSERT(pool_size + reserve_size <= CANN_POOL_VMM_MAX_SIZE);
+            GGML_ASSERT(pool_size + reserve_size <= max_size);

             // allocate more physical memory
             aclrtPhysicalMemProp prop = {};
@@ -396,7 +409,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
         // reserve virtual address space (if not already reserved)
         if (pool_addr == 0) {
             ACL_CHECK(aclrtReserveMemAddress(
-                &pool_addr, CANN_POOL_VMM_MAX_SIZE, 0, NULL, 1));
+                &pool_addr, max_size, 0, NULL, 1));
         }

         // map at the end of the pool
@@ -409,10 +422,11 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
         // add to the pool
         pool_size += reserve_size;

-        // GGML_LOG_INFO("cann pool[%d]: size increased to %llu MB (
-        //               reserved %llu MB)\n",
-        //               device, (unsigned long long) (pool_size/1024/1024),
-        //               (unsigned long long) (reserve_size/1024/1024));
+#ifdef DEBUG_CANN_MALLOC
+        GGML_LOG_INFO("cann pool[%d]: size increased to %llu MB (reserved %llu MB)\n",
+                      device, (unsigned long long) (pool_size/1024/1024),
+                      (unsigned long long) (reserve_size/1024/1024));
+#endif
         }

         GGML_ASSERT(pool_addr != 0);
@@ -457,7 +471,6 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
  */
 std::unique_ptr<ggml_cann_pool> ggml_backend_cann_context::new_pool_for_device(
     int device) {
-    // return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_leg(device));
     return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_vmm(device));
 }
@@ -1130,10 +1143,10 @@ ggml_backend_cann_buffer_type(int32_t device) {
     static bool ggml_backend_cann_buffer_type_initialized = false;

     if (!ggml_backend_cann_buffer_type_initialized) {
-        for (int32_t i = 0; i < GGML_CANN_MAX_DEVICES; i++) {
+        for (int32_t i = 0; i < ggml_cann_info().device_count; i++) {
             ggml_backend_cann_buffer_types[i] = {
                 /* .iface   = */ ggml_backend_cann_buffer_type_interface,
-                /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), device),
+                /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), i),
                 /* .context = */
                  new ggml_backend_cann_buffer_type_context{
                      i, "CANN" + std::to_string(i)},
@@ -1199,10 +1212,15 @@ static void * ggml_cann_host_malloc(size_t size) {
         return nullptr;
     }

+    const size_t alignment = 128;
+    size = GGML_PAD(size, alignment);
+    if (size == 0) {
+        size = alignment;
+    }
+
     void * hostPtr = nullptr;
     aclError err = aclrtMallocHost((void **) &hostPtr, size);
     if (err != ACL_SUCCESS) {
         GGML_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
                       size / 1024.0 / 1024.0, aclGetRecentErrMsg());
         return nullptr;
@@ -1669,12 +1687,14 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
             }
         case GGML_OP_MUL_MAT: {
             switch (op->src[0]->type) {
+                case GGML_TYPE_Q8_0:
+                    // Current groupsize should not be greater than k-1 in
+                    // aclnnWeightQuantBatchMatmulV2GetWorkspaceSize
+                    if (op->src[0]->ne[0] <= QK8_0) {
+                        return false;
+                    }
                 case GGML_TYPE_F16:
                 case GGML_TYPE_F32:
-                case GGML_TYPE_Q8_0:
-                    // TODO: fix me
-                    // Current groupsize should not be greater than k-1 in
-                    // aclnnWeightQuantBatchMatmulV2GetWorkspaceSize().
                 case GGML_TYPE_Q4_0:
                     return true;
                 default:
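
The reordering matters because the cases fall through: GGML_TYPE_Q8_0 now runs its size guard first and, when the guard passes, drops into the shared return true. A self-contained mirror of that control flow (the enum and helper here are hypothetical; upstream QK8_0, the Q8_0 block size, is 32):

// example (not part of the diff): guarded switch fallthrough
#include <cstdint>
#include <cstdio>

constexpr int64_t QK8_0 = 32; // Q8_0 block size, as in ggml upstream

enum class src0_type { f32, f16, q4_0, q8_0, other };

static bool mul_mat_supported(src0_type type, int64_t ne0) {
    switch (type) {
        case src0_type::q8_0:
            // group size must not exceed k-1 in
            // aclnnWeightQuantBatchMatmulV2GetWorkspaceSize
            if (ne0 <= QK8_0) {
                return false;
            }
            [[fallthrough]]; // large Q8_0 is handled like the types below
        case src0_type::f16:
        case src0_type::f32:
        case src0_type::q4_0:
            return true;
        default:
            return false;
    }
}

int main() {
    printf("%d %d\n", mul_mat_supported(src0_type::q8_0, 32),    // 0
                      mul_mat_supported(src0_type::q8_0, 4096)); // 1
    return 0;
}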
@@ -1706,9 +1726,61 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
                     return false;
             }
         }
+        case GGML_OP_CONT: {
+            // TODO: support GGML_TYPE_BF16
+            switch (op->src[0]->type) {
+                case GGML_TYPE_F32:
+                case GGML_TYPE_F16:
+                    return true;
+                default:
+                    return false;
+            }
+        }
+        case GGML_OP_ROPE: {
+            // TODO: with ops-test v == 1
+            float * freq_scale = (float*)((int32_t*)op->op_params + 6);
+            float * ext_factor = (float*)((int32_t*)op->op_params + 7);
+            float * attn_factor = (float*)((int32_t*)op->op_params + 8);
+            // TODO: with freq_factors
+            if (op->src[2] != NULL) {
+                return false;
+            }
+            // TODO: n_dims <= ne0
+            if (op->src[0]->ne[0] != op->op_params[1]) {
+                return false;
+            }
+            // TODO: ext_factor != 0
+            if (*ext_factor != 0) {
+                return false;
+            }
+            // TODO: freq_scale != 1
+            if (*freq_scale != 1) {
+                return false;
+            }
+            // TODO: attn_factor != 1
+            if (*attn_factor != 1) {
+                return false;
+            }
+            // TODO: type == GGML_TYPE_F16
+            switch (op->src[0]->type) {
+                case GGML_TYPE_F32:
+                    return true;
+                default:
+                    return false;
+            }
+        }
+        case GGML_OP_UPSCALE: {
+            // aclnnUpsampleNearest2dGetWorkspaceSize not support
+            // selfDimN[2]/outDimN[2] or selfDimC[3]/outDimC[3] not equal
+            if (op->src[0]->ne[2] * op->ne[3] != op->src[0]->ne[3] * op->ne[2]) {
+                return false;
+            }
+            return true;
+        }
+        case GGML_OP_IM2COL:
+        case GGML_OP_CONCAT:
         case GGML_OP_DUP:
         case GGML_OP_REPEAT:
-        case GGML_OP_CONCAT:
         case GGML_OP_NONE:
         case GGML_OP_RESHAPE:
         case GGML_OP_VIEW:
@@ -1722,17 +1794,13 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
         case GGML_OP_SCALE:
         case GGML_OP_SQR:
        case GGML_OP_CLAMP:
-        case GGML_OP_CONT:
         case GGML_OP_DIAG_MASK_INF:
         case GGML_OP_SOFT_MAX:
-        case GGML_OP_ROPE:
-        case GGML_OP_IM2COL:
         case GGML_OP_POOL_2D:
         case GGML_OP_SUM_ROWS:
         case GGML_OP_ARGSORT:
         case GGML_OP_ACC:
         case GGML_OP_GROUP_NORM:
-        case GGML_OP_UPSCALE:
         case GGML_OP_PAD:
         case GGML_OP_ARANGE:
         case GGML_OP_TIMESTEP_EMBEDDING:
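
The new GGML_OP_ROPE gate reads its float hyperparameters straight out of op->op_params, a raw int32_t array into which ggml type-puns float fields at fixed slots (6 through 8 here). A standalone sketch of that packing; the slot layout comes from the hunk above, everything else is illustrative:

// example (not part of the diff): float fields packed into an int32_t array
#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
    int32_t op_params[11] = {0};

    // writer side: store the ROPE hyperparameters into their slots
    const float freq_scale = 1.0f, ext_factor = 0.0f, attn_factor = 1.0f;
    memcpy(op_params + 6, &freq_scale,  sizeof(float));
    memcpy(op_params + 7, &ext_factor,  sizeof(float));
    memcpy(op_params + 8, &attn_factor, sizeof(float));

    // reader side: the backend reinterprets the same slots as floats,
    // mirroring the pointer casts in the hunk above
    float * fs = (float *)((int32_t *)op_params + 6);
    float * ef = (float *)((int32_t *)op_params + 7);
    float * af = (float *)((int32_t *)op_params + 8);

    printf("freq_scale=%g ext_factor=%g attn_factor=%g\n", *fs, *ef, *af);
    return 0; // the CANN gate requires *ef == 0, *fs == 1, *af == 1
}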

View file

@@ -96,6 +96,39 @@ if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR
             endif ()
             set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_PREV})
+        elseif (APPLE)
+            if (GGML_NATIVE)
+                set(USER_PROVIDED_MARCH FALSE)
+                foreach(flag_var IN ITEMS CMAKE_C_FLAGS CMAKE_CXX_FLAGS CMAKE_REQUIRED_FLAGS)
+                    if ("${${flag_var}}" MATCHES "-march=[a-zA-Z0-9+._-]+")
+                        set(USER_PROVIDED_MARCH TRUE)
+                        break()
+                    endif()
+                endforeach()
+
+                if (NOT USER_PROVIDED_MARCH)
+                    set(MARCH_FLAGS "-march=armv8.2a")
+
+                    check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD)
+                    if (GGML_COMPILER_SUPPORT_DOTPROD)
+                        set(MARCH_FLAGS "${MARCH_FLAGS}+dotprod")
+                        add_compile_definitions(__ARM_FEATURE_DOTPROD)
+                    endif ()
+
+                    set(TEST_I8MM_FLAGS "-march=armv8.2a+i8mm")
+
+                    set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS})
+                    set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${TEST_I8MM_FLAGS}")
+
+                    check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmmlaq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8)
+                    if (GGML_COMPILER_SUPPORT_MATMUL_INT8)
+                        set(MARCH_FLAGS "${MARCH_FLAGS}+i8mm")
+                        add_compile_definitions(__ARM_FEATURE_MATMUL_INT8)
+                    endif ()
+
+                    set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE})
+
+                    list(APPEND ARCH_FLAGS "${MARCH_FLAGS}")
+                endif ()
+            endif ()
         else()
             check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E)
             if (NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")

View file

@@ -474,9 +474,15 @@ void write_output_files() {
 int main(int argc, char** argv) {
     std::map<std::string, std::string> args;
-    for (int i = 1; i < argc; i += 2) {
-        if (i + 1 < argc) {
-            args[argv[i]] = argv[i + 1];
+    for (int i = 1; i < argc; ++i) {
+        std::string arg = argv[i];
+        if (arg.rfind("--", 0) == 0) {
+            if (i + 1 < argc && argv[i + 1][0] != '-') {
+                args[arg] = argv[i + 1];
+                ++i;
+            } else {
+                args[arg] = "";
+            }
         }
     }
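
The old loop walked argv in fixed pairs, so a bare switch such as --foo would swallow the next option as its value. The new loop treats only "--" tokens as keys and consumes a following token only when it does not itself start with '-'. A standalone copy of the new logic, printing what it parsed:

// example (not part of the diff): the new flag-parsing loop in isolation
#include <cstdio>
#include <map>
#include <string>

int main(int argc, char** argv) {
    std::map<std::string, std::string> args;
    for (int i = 1; i < argc; ++i) {
        std::string arg = argv[i];
        if (arg.rfind("--", 0) == 0) {          // only "--key" tokens are keys
            if (i + 1 < argc && argv[i + 1][0] != '-') {
                args[arg] = argv[i + 1];        // "--key value" pair
                ++i;
            } else {
                args[arg] = "";                 // bare "--flag" switch
            }
        }
    }
    for (const auto & kv : args) {
        printf("%s = '%s'\n", kv.first.c_str(), kv.second.c_str());
    }
    return 0;
}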

25
include/llama-cpp.h Normal file
View file

@@ -0,0 +1,25 @@
+#pragma once
+
+#ifndef __cplusplus
+#error "This header is for C++ only"
+#endif
+
+#include <memory>
+
+#include "llama.h"
+
+struct llama_model_deleter {
+    void operator()(llama_model * model) { llama_free_model(model); }
+};
+
+struct llama_context_deleter {
+    void operator()(llama_context * context) { llama_free(context); }
+};
+
+struct llama_sampler_deleter {
+    void operator()(llama_sampler * sampler) { llama_sampler_free(sampler); }
+};
+
+typedef std::unique_ptr<llama_model, llama_model_deleter> llama_model_ptr;
+typedef std::unique_ptr<llama_context, llama_context_deleter> llama_context_ptr;
+typedef std::unique_ptr<llama_sampler, llama_sampler_deleter> llama_sampler_ptr;
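
A hypothetical usage sketch for the new header: wrapping the raw handles in the *_ptr aliases ties their lifetime to scope, and locals are destroyed in reverse declaration order, so the context is freed before the model it points into. The model path and overall flow here are illustrative, not from the diff; the llama.h calls are the ones available as of this commit:

// example (not part of the diff): scope-bound cleanup via llama-cpp.h
#include "llama-cpp.h"

int main() {
    llama_model_params mparams = llama_model_default_params();
    llama_model_ptr model(llama_load_model_from_file("model.gguf", mparams));
    if (!model) {
        return 1; // load failed; nothing to clean up
    }

    llama_context_params cparams = llama_context_default_params();
    llama_context_ptr ctx(llama_new_context_with_model(model.get(), cparams));

    // on return, ctx is freed first, then model -- no manual llama_free calls
    return ctx ? 0 : 1;
}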