diff --git a/.github/labeler.yml b/.github/labeler.yml index 89436740d..1b47bc968 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -3,19 +3,18 @@ Kompute: - changed-files: - any-glob-to-any-file: - ggml/include/ggml-kompute.h - - ggml/src/ggml-kompute.cpp + - ggml/src/ggml-kompute/** - README-kompute.md Apple Metal: - changed-files: - any-glob-to-any-file: - ggml/include/ggml-metal.h - - ggml/src/ggml-metal.cpp + - ggml/src/ggml-metal/** - README-metal.md SYCL: - changed-files: - any-glob-to-any-file: - ggml/include/ggml-sycl.h - - ggml/src/ggml-sycl.cpp - ggml/src/ggml-sycl/** - docs/backend/SYCL.md - examples/sycl/** @@ -27,8 +26,8 @@ Nvidia GPU: Vulkan: - changed-files: - any-glob-to-any-file: - - ggml/ggml_vk_generate_shaders.py - - ggml/src/ggml-vulkan* + - ggml/include/ggml-vulkan.h + - ggml/src/ggml-vulkan/** documentation: - changed-files: - any-glob-to-any-file: @@ -75,11 +74,7 @@ server: ggml: - changed-files: - any-glob-to-any-file: - - ggml/include/ggml*.h - - ggml/src/ggml*.c - - ggml/src/ggml*.cpp - - ggml/src/ggml*.h - - ggml-cuda/** + - ggml/** nix: - changed-files: - any-glob-to-any-file: diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index abaf2c504..6281663ec 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -871,8 +871,65 @@ jobs: path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip name: llama-bin-win-${{ matrix.build }}.zip + ubuntu-latest-cmake-cuda: + runs-on: ubuntu-latest + container: nvidia/cuda:12.6.2-devel-ubuntu24.04 + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v4 + + - name: Install dependencies + env: + DEBIAN_FRONTEND: noninteractive + run: | + apt update + apt install -y cmake build-essential ninja-build libgomp1 git + + - name: Build with CMake + run: | + cmake -S . -B build -G Ninja -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=89-real -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined -DLLAMA_FATAL_WARNINGS=ON + cmake --build build + windows-latest-cmake-cuda: + runs-on: windows-latest + + strategy: + matrix: + cuda: ['12.6.2'] + build: ['cuda'] + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v4 + + - name: Install CUDA toolkit + id: cuda-toolkit + uses: Jimver/cuda-toolkit@v0.2.19 + with: + cuda: ${{ matrix.cuda }} + method: 'network' + sub-packages: '["nvcc", "cudart", "cublas", "cublas_dev", "thrust", "visual_studio_integration"]' + + - name: Install Ninja + id: install_ninja + run: | + choco install ninja + + - name: Build + id: cmake_build + shell: cmd + run: | + call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvars64.bat" + cmake -S . -B build -G "Ninja Multi-Config" -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON -DGGML_RPC=ON -DCMAKE_CUDA_ARCHITECTURES=89-real + cmake --build build --config Release -t ggml-cuda + cmake --build build --config Release + + windows-2019-cmake-cuda: runs-on: windows-2019 + if: ${{ github.event == 'push' && github.ref == 'refs/heads/master' }} strategy: matrix: @@ -1173,7 +1230,7 @@ jobs: - macOS-latest-make - macOS-latest-cmake - windows-latest-cmake - - windows-latest-cmake-cuda + - windows-2019-cmake-cuda - windows-latest-cmake-hip-release - macOS-latest-cmake-arm64 - macOS-latest-cmake-x64 diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index a953cdac9..bc2e5020d 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -10,12 +10,10 @@ name: Publish Docker image on: - #pull_request: - push: - branches: - - master - paths: ['.github/workflows/docker.yml', '.devops/*.Dockerfile', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal'] - workflow_dispatch: # allows manual triggering, useful for debugging + workflow_dispatch: # allows manual triggering + schedule: + # Rebuild daily rather than on every push because it is expensive + - cron: '12 4 * * *' concurrency: group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} @@ -29,7 +27,6 @@ permissions: jobs: push_to_registry: name: Push Docker image to Docker Hub - #if: github.event.pull_request.draft == false runs-on: ubuntu-latest env: @@ -117,7 +114,7 @@ jobs: swap-storage: true - name: Build and push Docker image (tagged + versioned) - if: github.event_name == 'push' + if: ${{ github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }} uses: docker/build-push-action@v6 with: context: . diff --git a/.github/workflows/nix-ci.yml b/.github/workflows/nix-ci.yml index 8ecbbe53b..3fe941576 100644 --- a/.github/workflows/nix-ci.yml +++ b/.github/workflows/nix-ci.yml @@ -5,8 +5,10 @@ on: push: branches: - master + paths: ['.github/workflows/nix-ci.yml', '**/flake.nix', '**/flake.lock', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal'] pull_request: types: [opened, synchronize, reopened] + paths: ['.github/workflows/nix-ci.yml', '**/flake.nix', '**/flake.lock', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal'] concurrency: group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} diff --git a/.github/workflows/python-lint.yml b/.github/workflows/python-lint.yml index a8d46f31d..ddfdf73b8 100644 --- a/.github/workflows/python-lint.yml +++ b/.github/workflows/python-lint.yml @@ -1,6 +1,13 @@ name: flake8 Lint -on: [push, pull_request] +on: + push: + branches: + - master + paths: ['.github/workflows/python-lint.yml', '**/*.py'] + pull_request: + types: [opened, synchronize, reopened] + paths: ['.github/workflows/python-lint.yml', '**/*.py'] concurrency: group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} diff --git a/CMakeLists.txt b/CMakeLists.txt index 668ca086b..0d389dccb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -164,8 +164,11 @@ if (GGML_TARGET_DEFINES) list(APPEND GGML_TRANSIENT_DEFINES ${GGML_TARGET_DEFINES}) endif() get_target_property(GGML_LINK_LIBRARIES ggml LINK_LIBRARIES) - -set_target_properties(llama PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/include/llama.h) +# all public headers +set(LLAMA_PUBLIC_HEADERS + ${CMAKE_CURRENT_SOURCE_DIR}/include/llama.h + ${CMAKE_CURRENT_SOURCE_DIR}/include/llama-cpp.h) +set_target_properties(llama PROPERTIES PUBLIC_HEADER "${LLAMA_PUBLIC_HEADERS}") install(TARGETS llama LIBRARY PUBLIC_HEADER) configure_package_config_file( diff --git a/Makefile b/Makefile index 14c05e93e..cfc74c1dc 100644 --- a/Makefile +++ b/Makefile @@ -34,6 +34,7 @@ BUILD_TARGETS = \ llama-server \ llama-simple \ llama-simple-chat \ + llama-run \ llama-speculative \ llama-tokenize \ llama-vdot \ @@ -1167,6 +1168,11 @@ llama-infill: examples/infill/infill.cpp \ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) +llama-run: examples/run/run.cpp \ + $(OBJ_ALL) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + llama-simple: examples/simple/simple.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) diff --git a/common/speculative.cpp b/common/speculative.cpp index fe315a270..e559675c4 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -90,9 +90,10 @@ bool common_speculative_are_compatible( if (llama_add_bos_token(model_tgt) != llama_add_bos_token(model_dft) || llama_add_eos_token(model_tgt) != llama_add_eos_token(model_dft) || llama_token_bos(model_tgt) != llama_token_bos(model_dft) || - llama_token_eos(model_tgt) != llama_token_eos(model_dft) - ) { + llama_token_eos(model_tgt) != llama_token_eos(model_dft)) { LOG_ERR("%s: draft model special tokens must match target model to use speculation\n", __func__); + LOG_ERR("%s: tgt: bos = %d (%d), eos = %d (%d)\n", __func__, llama_token_bos(model_tgt), llama_add_bos_token(model_tgt), llama_token_eos(model_tgt), llama_add_eos_token(model_tgt)); + LOG_ERR("%s: dft: bos = %d (%d), eos = %d (%d)\n", __func__, llama_token_bos(model_dft), llama_add_bos_token(model_dft), llama_token_eos(model_dft), llama_add_eos_token(model_dft)); return false; } diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index d33a932ed..9210e9fea 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -40,6 +40,7 @@ else() add_subdirectory(server) endif() add_subdirectory(save-load-state) + add_subdirectory(run) add_subdirectory(simple) add_subdirectory(simple-chat) add_subdirectory(speculative) diff --git a/examples/run/CMakeLists.txt b/examples/run/CMakeLists.txt new file mode 100644 index 000000000..084f1e92d --- /dev/null +++ b/examples/run/CMakeLists.txt @@ -0,0 +1,5 @@ +set(TARGET llama-run) +add_executable(${TARGET} run.cpp) +install(TARGETS ${TARGET} RUNTIME) +target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/run/README.md b/examples/run/README.md new file mode 100644 index 000000000..6e926811f --- /dev/null +++ b/examples/run/README.md @@ -0,0 +1,7 @@ +# llama.cpp/example/run + +The purpose of this example is to demonstrate a minimal usage of llama.cpp for running models. + +```bash +./llama-run Meta-Llama-3.1-8B-Instruct.gguf +... diff --git a/examples/run/run.cpp b/examples/run/run.cpp new file mode 100644 index 000000000..cac2faefc --- /dev/null +++ b/examples/run/run.cpp @@ -0,0 +1,409 @@ +#if defined(_WIN32) +#include +#else +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "llama-cpp.h" + +typedef std::unique_ptr char_array_ptr; + +struct Argument { + std::string flag; + std::string help_text; +}; + +struct Options { + std::string model_path, prompt_non_interactive; + int ngl = 99; + int n_ctx = 2048; +}; + +class ArgumentParser { + public: + ArgumentParser(const char * program_name) : program_name(program_name) {} + + void add_argument(const std::string & flag, std::string & var, const std::string & help_text = "") { + string_args[flag] = &var; + arguments.push_back({flag, help_text}); + } + + void add_argument(const std::string & flag, int & var, const std::string & help_text = "") { + int_args[flag] = &var; + arguments.push_back({flag, help_text}); + } + + int parse(int argc, const char ** argv) { + for (int i = 1; i < argc; ++i) { + std::string arg = argv[i]; + if (string_args.count(arg)) { + if (i + 1 < argc) { + *string_args[arg] = argv[++i]; + } else { + fprintf(stderr, "error: missing value for %s\n", arg.c_str()); + print_usage(); + return 1; + } + } else if (int_args.count(arg)) { + if (i + 1 < argc) { + if (parse_int_arg(argv[++i], *int_args[arg]) != 0) { + fprintf(stderr, "error: invalid value for %s: %s\n", arg.c_str(), argv[i]); + print_usage(); + return 1; + } + } else { + fprintf(stderr, "error: missing value for %s\n", arg.c_str()); + print_usage(); + return 1; + } + } else { + fprintf(stderr, "error: unrecognized argument %s\n", arg.c_str()); + print_usage(); + return 1; + } + } + + if (string_args["-m"]->empty()) { + fprintf(stderr, "error: -m is required\n"); + print_usage(); + return 1; + } + + return 0; + } + + private: + const char * program_name; + std::unordered_map string_args; + std::unordered_map int_args; + std::vector arguments; + + int parse_int_arg(const char * arg, int & value) { + char * end; + const long val = std::strtol(arg, &end, 10); + if (*end == '\0' && val >= INT_MIN && val <= INT_MAX) { + value = static_cast(val); + return 0; + } + return 1; + } + + void print_usage() const { + printf("\nUsage:\n"); + printf(" %s [OPTIONS]\n\n", program_name); + printf("Options:\n"); + for (const auto & arg : arguments) { + printf(" %-10s %s\n", arg.flag.c_str(), arg.help_text.c_str()); + } + + printf("\n"); + } +}; + +class LlamaData { + public: + llama_model_ptr model; + llama_sampler_ptr sampler; + llama_context_ptr context; + std::vector messages; + + int init(const Options & opt) { + model = initialize_model(opt.model_path, opt.ngl); + if (!model) { + return 1; + } + + context = initialize_context(model, opt.n_ctx); + if (!context) { + return 1; + } + + sampler = initialize_sampler(); + return 0; + } + + private: + // Initializes the model and returns a unique pointer to it + llama_model_ptr initialize_model(const std::string & model_path, const int ngl) { + llama_model_params model_params = llama_model_default_params(); + model_params.n_gpu_layers = ngl; + + llama_model_ptr model(llama_load_model_from_file(model_path.c_str(), model_params)); + if (!model) { + fprintf(stderr, "%s: error: unable to load model\n", __func__); + } + + return model; + } + + // Initializes the context with the specified parameters + llama_context_ptr initialize_context(const llama_model_ptr & model, const int n_ctx) { + llama_context_params ctx_params = llama_context_default_params(); + ctx_params.n_ctx = n_ctx; + ctx_params.n_batch = n_ctx; + + llama_context_ptr context(llama_new_context_with_model(model.get(), ctx_params)); + if (!context) { + fprintf(stderr, "%s: error: failed to create the llama_context\n", __func__); + } + + return context; + } + + // Initializes and configures the sampler + llama_sampler_ptr initialize_sampler() { + llama_sampler_ptr sampler(llama_sampler_chain_init(llama_sampler_chain_default_params())); + llama_sampler_chain_add(sampler.get(), llama_sampler_init_min_p(0.05f, 1)); + llama_sampler_chain_add(sampler.get(), llama_sampler_init_temp(0.8f)); + llama_sampler_chain_add(sampler.get(), llama_sampler_init_dist(LLAMA_DEFAULT_SEED)); + + return sampler; + } +}; + +// Add a message to `messages` and store its content in `owned_content` +static void add_message(const char * role, const std::string & text, LlamaData & llama_data, + std::vector & owned_content) { + char_array_ptr content(new char[text.size() + 1]); + std::strcpy(content.get(), text.c_str()); + llama_data.messages.push_back({role, content.get()}); + owned_content.push_back(std::move(content)); +} + +// Function to apply the chat template and resize `formatted` if needed +static int apply_chat_template(const LlamaData & llama_data, std::vector & formatted, const bool append) { + int result = llama_chat_apply_template(llama_data.model.get(), nullptr, llama_data.messages.data(), + llama_data.messages.size(), append, formatted.data(), formatted.size()); + if (result > static_cast(formatted.size())) { + formatted.resize(result); + result = llama_chat_apply_template(llama_data.model.get(), nullptr, llama_data.messages.data(), + llama_data.messages.size(), append, formatted.data(), formatted.size()); + } + + return result; +} + +// Function to tokenize the prompt +static int tokenize_prompt(const llama_model_ptr & model, const std::string & prompt, + std::vector & prompt_tokens) { + const int n_prompt_tokens = -llama_tokenize(model.get(), prompt.c_str(), prompt.size(), NULL, 0, true, true); + prompt_tokens.resize(n_prompt_tokens); + if (llama_tokenize(model.get(), prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), true, + true) < 0) { + GGML_ABORT("failed to tokenize the prompt\n"); + } + + return n_prompt_tokens; +} + +// Check if we have enough space in the context to evaluate this batch +static int check_context_size(const llama_context_ptr & ctx, const llama_batch & batch) { + const int n_ctx = llama_n_ctx(ctx.get()); + const int n_ctx_used = llama_get_kv_cache_used_cells(ctx.get()); + if (n_ctx_used + batch.n_tokens > n_ctx) { + printf("\033[0m\n"); + fprintf(stderr, "context size exceeded\n"); + return 1; + } + + return 0; +} + +// convert the token to a string +static int convert_token_to_string(const llama_model_ptr & model, const llama_token token_id, std::string & piece) { + char buf[256]; + int n = llama_token_to_piece(model.get(), token_id, buf, sizeof(buf), 0, true); + if (n < 0) { + GGML_ABORT("failed to convert token to piece\n"); + } + + piece = std::string(buf, n); + return 0; +} + +static void print_word_and_concatenate_to_response(const std::string & piece, std::string & response) { + printf("%s", piece.c_str()); + fflush(stdout); + response += piece; +} + +// helper function to evaluate a prompt and generate a response +static int generate(LlamaData & llama_data, const std::string & prompt, std::string & response) { + std::vector prompt_tokens; + const int n_prompt_tokens = tokenize_prompt(llama_data.model, prompt, prompt_tokens); + if (n_prompt_tokens < 0) { + return 1; + } + + // prepare a batch for the prompt + llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size()); + llama_token new_token_id; + while (true) { + check_context_size(llama_data.context, batch); + if (llama_decode(llama_data.context.get(), batch)) { + GGML_ABORT("failed to decode\n"); + } + + // sample the next token, check is it an end of generation? + new_token_id = llama_sampler_sample(llama_data.sampler.get(), llama_data.context.get(), -1); + if (llama_token_is_eog(llama_data.model.get(), new_token_id)) { + break; + } + + std::string piece; + if (convert_token_to_string(llama_data.model, new_token_id, piece)) { + return 1; + } + + print_word_and_concatenate_to_response(piece, response); + + // prepare the next batch with the sampled token + batch = llama_batch_get_one(&new_token_id, 1); + } + + return 0; +} + +static int parse_arguments(const int argc, const char ** argv, Options & opt) { + ArgumentParser parser(argv[0]); + parser.add_argument("-m", opt.model_path, "model"); + parser.add_argument("-p", opt.prompt_non_interactive, "prompt"); + parser.add_argument("-c", opt.n_ctx, "context_size"); + parser.add_argument("-ngl", opt.ngl, "n_gpu_layers"); + if (parser.parse(argc, argv)) { + return 1; + } + + return 0; +} + +static int read_user_input(std::string & user) { + std::getline(std::cin, user); + return user.empty(); // Indicate an error or empty input +} + +// Function to generate a response based on the prompt +static int generate_response(LlamaData & llama_data, const std::string & prompt, std::string & response) { + // Set response color + printf("\033[33m"); + if (generate(llama_data, prompt, response)) { + fprintf(stderr, "failed to generate response\n"); + return 1; + } + + // End response with color reset and newline + printf("\n\033[0m"); + return 0; +} + +// Helper function to apply the chat template and handle errors +static int apply_chat_template_with_error_handling(const LlamaData & llama_data, std::vector & formatted, + const bool is_user_input, int & output_length) { + const int new_len = apply_chat_template(llama_data, formatted, is_user_input); + if (new_len < 0) { + fprintf(stderr, "failed to apply the chat template\n"); + return -1; + } + + output_length = new_len; + return 0; +} + +// Helper function to handle user input +static bool handle_user_input(std::string & user_input, const std::string & prompt_non_interactive) { + if (!prompt_non_interactive.empty()) { + user_input = prompt_non_interactive; + return true; // No need for interactive input + } + + printf("\033[32m> \033[0m"); + return !read_user_input(user_input); // Returns false if input ends the loop +} + +// Function to tokenize the prompt +static int chat_loop(LlamaData & llama_data, std::string & prompt_non_interactive) { + std::vector owned_content; + std::vector fmtted(llama_n_ctx(llama_data.context.get())); + int prev_len = 0; + + while (true) { + // Get user input + std::string user_input; + if (!handle_user_input(user_input, prompt_non_interactive)) { + break; + } + + add_message("user", prompt_non_interactive.empty() ? user_input : prompt_non_interactive, llama_data, + owned_content); + + int new_len; + if (apply_chat_template_with_error_handling(llama_data, fmtted, true, new_len) < 0) { + return 1; + } + + std::string prompt(fmtted.begin() + prev_len, fmtted.begin() + new_len); + std::string response; + if (generate_response(llama_data, prompt, response)) { + return 1; + } + } + return 0; +} + +static void log_callback(const enum ggml_log_level level, const char * text, void *) { + if (level == GGML_LOG_LEVEL_ERROR) { + fprintf(stderr, "%s", text); + } +} + +static bool is_stdin_a_terminal() { +#if defined(_WIN32) + HANDLE hStdin = GetStdHandle(STD_INPUT_HANDLE); + DWORD mode; + return GetConsoleMode(hStdin, &mode); +#else + return isatty(STDIN_FILENO); +#endif +} + +static std::string read_pipe_data() { + std::ostringstream result; + result << std::cin.rdbuf(); // Read all data from std::cin + return result.str(); +} + +int main(int argc, const char ** argv) { + Options opt; + if (parse_arguments(argc, argv, opt)) { + return 1; + } + + if (!is_stdin_a_terminal()) { + if (!opt.prompt_non_interactive.empty()) { + opt.prompt_non_interactive += "\n\n"; + } + + opt.prompt_non_interactive += read_pipe_data(); + } + + llama_log_set(log_callback, nullptr); + LlamaData llama_data; + if (llama_data.init(opt)) { + return 1; + } + + if (chat_loop(llama_data, opt.prompt_non_interactive)) { + return 1; + } + + return 0; +} diff --git a/examples/server/server.cpp b/examples/server/server.cpp index c0ea4faf7..9c86407c2 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2267,49 +2267,48 @@ struct server_context { continue; // continue loop of slots } - llama_token id; + llama_token id = common_sampler_sample(slot.smpl, ctx, slot.i_batch - i); - { - completion_token_output result; + slot.i_batch = -1; - id = common_sampler_sample(slot.smpl, ctx, slot.i_batch - i); + common_sampler_accept(slot.smpl, id, true); - slot.i_batch = -1; - - common_sampler_accept(slot.smpl, id, true); - - slot.n_decoded += 1; - if (slot.n_decoded == 1) { - slot.t_start_generation = ggml_time_us(); - slot.t_prompt_processing = (slot.t_start_generation - slot.t_start_process_prompt) / 1e3; - metrics.on_prompt_eval(slot); - } - - result.tok = id; - - const auto * cur_p = common_sampler_get_candidates(slot.smpl); - - for (size_t i = 0; i < (size_t) slot.params.sampling.n_probs; ++i) { - result.probs.push_back({ - cur_p->data[i].id, - i >= cur_p->size ? 0.0f : cur_p->data[i].p, - }); - } - - if (!process_token(result, slot)) { - // release slot because of stop condition - slot.release(); - slot.print_timings(); - send_final_response(slot); - metrics.on_prediction(slot); - continue; - } + slot.n_decoded += 1; + if (slot.n_decoded == 1) { + slot.t_start_generation = ggml_time_us(); + slot.t_prompt_processing = (slot.t_start_generation - slot.t_start_process_prompt) / 1e3; + metrics.on_prompt_eval(slot); } - // check if the slot supports speculative decoding - if (!slot.can_speculate()) { + completion_token_output result; + result.tok = id; + + const auto * cur_p = common_sampler_get_candidates(slot.smpl); + + for (size_t i = 0; i < (size_t) slot.params.sampling.n_probs; ++i) { + result.probs.push_back({ + cur_p->data[i].id, + i >= cur_p->size ? 0.0f : cur_p->data[i].p, + }); + } + + if (!process_token(result, slot)) { + // release slot because of stop condition + slot.release(); + slot.print_timings(); + send_final_response(slot); + metrics.on_prediction(slot); continue; } + } + + // do speculative decoding + for (auto & slot : slots) { + if (!slot.is_processing() || !slot.can_speculate()) { + continue; + } + + llama_token id = slot.sampled; struct common_speculative_params params_spec; params_spec.n_draft = slot.params.speculative.n_max; diff --git a/examples/speculative-simple/speculative-simple.cpp b/examples/speculative-simple/speculative-simple.cpp index 90cb50749..8ca84f7af 100644 --- a/examples/speculative-simple/speculative-simple.cpp +++ b/examples/speculative-simple/speculative-simple.cpp @@ -117,7 +117,8 @@ int main(int argc, char ** argv) { llama_token id_last = inp.back(); // all tokens currently in the target context - auto prompt_tgt = std::vector(inp.begin(), inp.end() - 1); + llama_tokens prompt_tgt(inp.begin(), inp.end() - 1); + prompt_tgt.reserve(llama_n_ctx(ctx_tgt)); int n_past = inp.size() - 1; @@ -181,54 +182,44 @@ int main(int argc, char ** argv) { GGML_ASSERT(ids.size() > 0); // there will always be at least one accepted token n_past += ids.size() - 1; - n_drafted += batch_tgt.n_tokens - 1; + n_drafted += draft.size(); // note: we ignore the discarded small drafts n_accept += ids.size() - 1; + n_predict += ids.size(); // process the accepted tokens and update contexts // // this is the standard token post-processing that we normally do // in this case, we do it for a group of accepted tokens at once // - { - llama_token id = 0; - std::string token_str; + for (size_t i = 0; i < ids.size(); ++i) { + prompt_tgt.push_back(id_last); - for (size_t i = 0; i < ids.size(); ++i) { - id = ids[i]; + id_last = ids[i]; - ++n_predict; - - if (llama_token_is_eog(model_tgt, id)) { - has_eos = true; - break; - } - - token_str = common_token_to_piece(ctx_tgt, id); - - if (params.use_color && i + 1 < ids.size()) { - LOG("\u001b[%dm%s\u001b[37m", (36 - 0 % 6), token_str.c_str()); - } else { - LOG("%s", token_str.c_str()); - } - } - - if ((params.n_predict >= 0 && n_predict > params.n_predict) || has_eos) { + if (llama_token_is_eog(model_tgt, id_last)) { + has_eos = true; break; } - LOG_DBG("accepted %d/%d draft tokens, the last target token is: (%d, '%s')\n", (int) ids.size() - 1, (int) draft.size(), id, token_str.c_str()); + const std::string token_str = common_token_to_piece(ctx_tgt, id_last); - { - LOG_DBG("clear kv cache from any extra tokens, n_past = %d\n", n_past); - - llama_kv_cache_seq_rm(ctx_tgt, 0, n_past, -1); + if (params.use_color && i + 1 < ids.size()) { + LOG("\u001b[%dm%s\u001b[37m", (36 - 0 % 6), token_str.c_str()); + } else { + LOG("%s", token_str.c_str()); } + } - prompt_tgt.push_back(id_last); - prompt_tgt.insert(prompt_tgt.end(), ids.begin(), ids.end() - 1); + LOG_DBG("accepted %d/%d draft tokens, the last target token is: (%d)\n", (int) ids.size() - 1, (int) draft.size(), id_last); - // remember the last accepted token for the next iteration - id_last = id; + { + LOG_DBG("clear kv cache from any extra tokens, n_past = %d\n", n_past); + + llama_kv_cache_seq_rm(ctx_tgt, 0, n_past, -1); + } + + if ((params.n_predict >= 0 && n_predict > params.n_predict) || has_eos) { + break; } } diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp index 1f4ee986c..d7472ee3a 100644 --- a/ggml/src/ggml-cann/aclnn_ops.cpp +++ b/ggml/src/ggml-cann/aclnn_ops.cpp @@ -21,6 +21,7 @@ */ #include "aclnn_ops.h" +#include "ggml-impl.h" #include #include @@ -32,6 +33,8 @@ #include #include #include +#include +#include #include #include #include @@ -241,10 +244,14 @@ void ggml_cann_concat(ggml_backend_cann_context& ctx, ggml_tensor* dst) { aclTensor* acl_src1 = ggml_cann_create_tensor(src1); aclTensor* acl_dst = ggml_cann_create_tensor(dst); - int64_t concat_dim = 1; + const int32_t dim = ggml_get_op_params_i32(dst, 0); + + GGML_ASSERT(dim >= 0 && dim < 4); + int32_t acl_dim = 3 - dim; + aclTensor* tensors[] = {acl_src0, acl_src1}; aclTensorList* tensorList = aclCreateTensorList(tensors, 2); - aclnn_concat(ctx, tensorList, acl_dst, concat_dim); + aclnn_concat(ctx, tensorList, acl_dst, acl_dim); ACL_CHECK(aclDestroyTensorList(tensorList)); ACL_CHECK(aclDestroyTensor(acl_dst)); @@ -1437,10 +1444,6 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ggml_tensor* src0 = dst->src[0]; // kernel ggml_tensor* src1 = dst->src[1]; // input - GGML_ASSERT(src0->type == GGML_TYPE_F16); - GGML_ASSERT(src1->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32); - GGML_TENSOR_BINARY_OP_LOCALS; // aclnnIm2col only works on 2D. set s1, p1, d1 to 1 to perform 2D @@ -1462,9 +1465,6 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) { const int64_t OH = is_2D ? ne2 : 1; const int64_t OW = ne1; - GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); - GGML_ASSERT(nb10 == sizeof(float)); - // memory allocated increased to 3x when is_2D == false const int64_t n_bytes_factor = is_2D ? 1 : 3; @@ -2425,7 +2425,6 @@ static void aclnn_mat_mul(ggml_backend_cann_context& ctx, aclTensor* acl_input, aclTensor* acl_weight, aclTensor* acl_dst) { int8_t cube_math_type = 1; // ALLOW_FP32_DOWN_PRECISION, when input is // fp32, atlas a2 will transpose it to HFLOAT32. - uint64_t workspaceSize = 0; aclOpExecutor* executor; void* workspaceAddr = nullptr; @@ -2443,6 +2442,80 @@ static void aclnn_mat_mul(ggml_backend_cann_context& ctx, aclTensor* acl_input, aclnnMatmul(workspaceAddr, workspaceSize, executor, ctx.stream())); } +/** + * @brief Performs matrix multiplication of two 2D tensors. + * + * This function computes the matrix multiplication of the input tensor + * `acl_input` and the weight tensor `acl_weight`, and stores the result in the + * destination tensor `acl_dst`. + * The operation is defined as: + * \f[ + * \text {acl_dst}=\text {acl_input@acl_weight} + * \f] + * + * @param ctx The context for the CANN backend operations. + * @param acl_input The input tensor for the matrix multiplication. + * @param acl_weight The weight tensor for the matrix multiplication. + * @param acl_dst The destination tensor where the result of the matrix + * multiplication will be stored. + */ +static void aclnn_mat_mul_2d(ggml_backend_cann_context& ctx, aclTensor* acl_input, + aclTensor* acl_weight, aclTensor* acl_dst) { + int8_t cube_math_type = 2; + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK(aclnnMmGetWorkspaceSize(acl_input, acl_weight, acl_dst, + cube_math_type, &workspaceSize, + &executor)); + + if (workspaceSize > 0) { + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); + } + + ACL_CHECK( + aclnnMm(workspaceAddr, workspaceSize, executor, ctx.stream())); +} + +/** + * @brief Performs matrix multiplication of two 3D tensors. + * + * This function computes the matrix multiplication of the input tensor + * `acl_input` and the weight tensor `acl_weight`, and stores the result in the + * destination tensor `acl_dst`. + * The operation is defined as: + * \f[ + * \text {acl_dst}=\text {acl_input@acl_weight} + * \f] + * + * @param ctx The context for the CANN backend operations. + * @param acl_input The input tensor for the matrix multiplication. + * @param acl_weight The weight tensor for the matrix multiplication. + * @param acl_dst The destination tensor where the result of the matrix + * multiplication will be stored. + */ +static void aclnn_mat_mul_3d(ggml_backend_cann_context& ctx, aclTensor* acl_input, + aclTensor* acl_weight, aclTensor* acl_dst) { + int8_t cube_math_type = 2; + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK(aclnnBatchMatMulGetWorkspaceSize(acl_input, acl_weight, acl_dst, + cube_math_type, &workspaceSize, + &executor)); + + if (workspaceSize > 0) { + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); + } + + ACL_CHECK( + aclnnBatchMatMul(workspaceAddr, workspaceSize, executor, ctx.stream())); +} + /** * @brief Performs matrix multiplication with floating-point precision on * tensors using the CANN backend. @@ -2464,20 +2537,43 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx, // broadcast, when weight ne2 or ne3 is not 1, weight need repeat. BCAST_MUL_MAT_SHAPE(input, weight, dst); - // transpose weight: [1,2,3,4] -> [1,2,4,3] - int64_t transpose_ne[] = {bcast_weight_ne[1], bcast_weight_ne[0], - bcast_weight_ne[2], bcast_weight_ne[3], - bcast_weight_ne[4], bcast_weight_ne[5]}; - size_t transpose_nb[] = {bcast_weight_nb[1], bcast_weight_nb[0], - bcast_weight_nb[2], bcast_weight_nb[3], - bcast_weight_nb[4], bcast_weight_nb[5]}; + int64_t n_dims = bcast_dims; + if (bcast_input_ne[3] == bcast_weight_ne[3] && bcast_input_ne[3] == 1) { + if (bcast_input_ne[2] == 1 && bcast_weight_ne[2] == 1) { + n_dims = 2; + } else if (bcast_input_ne[2] == 1) { + n_dims = 3; + } + } - aclTensor* acl_weight_tensor = - ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, bcast_dims); aclTensor* acl_input_tensor = - ggml_cann_create_tensor(input, BCAST_MUL_MAT_PARAM(input)); - aclTensor* acl_dst = ggml_cann_create_tensor(dst, BCAST_MUL_MAT_PARAM(dst)); - aclnn_mat_mul(ctx, acl_input_tensor, acl_weight_tensor, acl_dst); + ggml_cann_create_tensor(input, bcast_input_ne, bcast_input_nb, n_dims); + int64_t transpose_ne[] = { + bcast_weight_ne[1], bcast_weight_ne[0], + bcast_weight_ne[2], bcast_weight_ne[3], + bcast_weight_ne[4], bcast_weight_ne[5] + }; + size_t transpose_nb[] = { + bcast_weight_nb[1], bcast_weight_nb[0], + bcast_weight_nb[2], bcast_weight_nb[3], + bcast_weight_nb[4], bcast_weight_nb[5] + }; + aclTensor* acl_weight_tensor = + ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims); + aclTensor* acl_dst = + ggml_cann_create_tensor(dst, bcast_dst_ne, bcast_dst_nb, n_dims); + + switch (n_dims) { + case 2: + aclnn_mat_mul_2d(ctx, acl_input_tensor, acl_weight_tensor, acl_dst); + break; + case 3: + aclnn_mat_mul_3d(ctx, acl_input_tensor, acl_weight_tensor, acl_dst); + break; + default: + aclnn_mat_mul(ctx, acl_input_tensor, acl_weight_tensor, acl_dst); + break; + } ACL_CHECK(aclDestroyTensor(acl_weight_tensor)); ACL_CHECK(aclDestroyTensor(acl_input_tensor)); @@ -2503,46 +2599,40 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx, ggml_tensor* src0 = dst->src[0]; // weight ggml_tensor* src1 = dst->src[1]; // input - // The shape of the weight is NCHW. Matrix multiplication uses HW dims. HC - // is regarded as batch. weight need transpose. - int64_t weight_ne[] = {src0->ne[1], src0->ne[0]}; + // The shape of the weight is NCHW. + // Matrix multiplication uses HW dims. + // HC is regarded as batch. + // weight need transpose. float weight_elem_size; if (type == GGML_TYPE_Q4_0) { weight_elem_size = float(sizeof(uint8_t)) / 2; - } - else if (type == GGML_TYPE_Q8_0) { + } else if (type == GGML_TYPE_Q8_0) { weight_elem_size = float(sizeof(uint8_t)); - } - else { + } else { GGML_ABORT("Only support Q4_0 and Q8_0 MUL_MAT"); } - float weight_nb[] = {weight_elem_size * src0->ne[0], weight_elem_size}; - - // size of one matrix is element_size * height * width. - size_t weight_stride = weight_elem_size * src0->ne[0] * src0->ne[1]; + float weight_nb[] = {src0->ne[0] * weight_elem_size, weight_elem_size}; + size_t weight_stride = src0->ne[1] * src0->ne[0] * weight_elem_size; size_t weight_size = weight_stride * src0->ne[2] * src0->ne[3]; // scale stored at the end of weight. Also need transpose. - GGML_ASSERT(QK4_0 == QK8_0); - int64_t scale_ne[] = {src0->ne[1], src0->ne[0] / QK8_0}; size_t scale_elem_size = sizeof(uint16_t); - size_t scale_nb[] = {src0->ne[0] / QK8_0 * scale_elem_size, - scale_elem_size}; - size_t scale_stride = scale_elem_size * src0->ne[0] * src0->ne[1] / QK8_0; + size_t scale_nb[] = {src0->ne[0] / QK8_0 * scale_elem_size, scale_elem_size}; + size_t scale_stride = src0->ne[1] * src0->ne[0] / QK8_0 * scale_elem_size; char* scale_offset = (char*)src0->data + weight_size; // input - void* input_buffer; size_t input_elem_size = sizeof(uint16_t); int64_t input_ne[] = {src1->ne[0], src1->ne[1]}; - size_t input_nb[] = {input_elem_size, input_elem_size * src1->ne[0]}; - size_t input_stride = input_elem_size * src1->ne[0] * src1->ne[1]; - + size_t input_nb[] = {input_elem_size, input_ne[0] * input_elem_size}; + size_t input_stride = input_ne[0] * input_ne[1] * input_elem_size; ggml_cann_pool_alloc input_alloctor(ctx.pool()); + void* input_buffer = src1->data; + + // case in if (src1->type != GGML_TYPE_F16) { aclTensor* acl_src1_tensor = ggml_cann_create_tensor(src1); - input_alloctor.alloc(ggml_nelements(src1) * input_elem_size); - input_buffer = input_alloctor.get(); + input_buffer = input_alloctor.alloc(ggml_nelements(src1) * input_elem_size); int64_t* input_cast_ne = src1->ne; size_t input_cast_nb[GGML_MAX_DIMS]; @@ -2552,88 +2642,139 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx, } aclTensor* acl_input_tensor = ggml_cann_create_tensor( - input_buffer, ACL_FLOAT16, input_elem_size, input_cast_ne, - input_cast_nb, GGML_MAX_DIMS); + input_buffer, + ACL_FLOAT16, + input_elem_size, input_cast_ne, input_cast_nb, GGML_MAX_DIMS); aclnn_cast(ctx, acl_src1_tensor, acl_input_tensor, ACL_FLOAT16); + ACL_CHECK(aclDestroyTensor(acl_input_tensor)); ACL_CHECK(aclDestroyTensor(acl_src1_tensor)); - } else { - input_buffer = src1->data; } // output size_t output_elem_size = sizeof(uint16_t); - int64_t output_ne[] = {dst->ne[0], dst->ne[1]}; - size_t output_nb[] = {output_elem_size, output_elem_size * dst->ne[0]}; - ggml_cann_pool_alloc output_alloctor( - ctx.pool(), ggml_nelements(dst) * output_elem_size); - void* output_buffer = output_alloctor.get(); - size_t output_stride = output_elem_size * dst->ne[0] * dst->ne[1]; + size_t output_nb[] = {output_elem_size, dst->ne[0] * output_elem_size}; + ggml_cann_pool_alloc output_allocator(ctx.pool()); + void* output_buffer = output_allocator.alloc(ggml_nelements(dst) * output_elem_size); + size_t output_stride = dst->ne[0] * dst->ne[1] * output_elem_size; // aclnn + int64_t max_elem_size = 65535; + int64_t split_size = (src0->ne[1] / max_elem_size) + 1; + ggml_cann_pool_alloc workspace_allocator(ctx.pool()); + aclOpExecutor* executor = nullptr; uint64_t workspaceSize = 0; - aclOpExecutor* executor; void* workspaceAddr = nullptr; - for (int64_t n1 = 0; n1 < src1->ne[3]; n1++) { for (int64_t c1 = 0; c1 < src1->ne[2]; c1++) { int64_t n0 = n1 / (src1->ne[3] / src0->ne[3]); int64_t c0 = c1 / (src1->ne[2] / src0->ne[2]); - int64_t batch1 = n1 * src1->ne[2] + c1; - int64_t batch0 = n0 * src0->ne[2] + c0; + int64_t batch1 = (n1 * src1->ne[2]) + c1; + int64_t batch0 = (n0 * src0->ne[2]) + c0; aclTensor* acl_input_tensor = ggml_cann_create_tensor( (char*)input_buffer + batch1 * input_stride, ACL_FLOAT16, input_elem_size, input_ne, input_nb, 2); + + // first split + int64_t weight_ne_offset = 0; + int64_t weight_ne[2] = {max_elem_size > src0->ne[1] ? src0->ne[1] : max_elem_size, src0->ne[0]}; + int64_t scale_ne_offset = 0; + int64_t scale_ne[2] = {weight_ne[0], weight_ne[1] / QK8_0}; + int64_t output_ne_offset = 0; + int64_t output_ne[2] = {weight_ne[0], dst->ne[1]}; + aclTensor* acl_weight_tensor = ggml_cann_create_tensor( (char*)src0->data + batch0 * weight_stride, - ggml_cann_type_mapping(type), weight_elem_size, weight_ne, - weight_nb, 2); + ggml_cann_type_mapping(type), + weight_elem_size, weight_ne, weight_nb, 2, + ACL_FORMAT_ND, weight_ne_offset); aclTensor* acl_scale_tensor = ggml_cann_create_tensor( - scale_offset + batch0 * scale_stride, ACL_FLOAT16, - scale_elem_size, scale_ne, scale_nb, 2); + scale_offset + batch0 * scale_stride, + ACL_FLOAT16, + scale_elem_size, scale_ne, scale_nb, 2, + ACL_FORMAT_ND, scale_ne_offset); aclTensor* acl_output_tensor = ggml_cann_create_tensor( - (char*)output_buffer + batch1 * output_stride, ACL_FLOAT16, - output_elem_size, output_ne, output_nb, 2); + (char*)output_buffer + batch1 * output_stride, + ACL_FLOAT16, + output_elem_size, output_ne, output_nb, 2, + ACL_FORMAT_ND, output_ne_offset); ACL_CHECK(aclnnWeightQuantBatchMatmulV2GetWorkspaceSize( - acl_input_tensor, acl_weight_tensor, acl_scale_tensor, nullptr, - nullptr, nullptr, nullptr, QK8_0, acl_output_tensor, - &workspaceSize, &executor)); - - if (workspaceSize > 0 && workspaceAddr == nullptr) { - ggml_cann_pool_alloc workspace_allocator(ctx.pool(), - workspaceSize); - workspaceAddr = workspace_allocator.get(); + acl_input_tensor, acl_weight_tensor, acl_scale_tensor, + nullptr, nullptr, nullptr, nullptr, QK8_0, + acl_output_tensor, &workspaceSize, &executor)); + if (workspaceAddr == nullptr) { + workspaceAddr = workspace_allocator.alloc(workspaceSize); } - ACL_CHECK(aclnnWeightQuantBatchMatmulV2( workspaceAddr, workspaceSize, executor, ctx.stream())); - ACL_CHECK(aclDestroyTensor(acl_input_tensor)); ACL_CHECK(aclDestroyTensor(acl_weight_tensor)); ACL_CHECK(aclDestroyTensor(acl_scale_tensor)); ACL_CHECK(aclDestroyTensor(acl_output_tensor)); + + // other splits + for (int64_t split = 1; split < split_size; split++) { + weight_ne_offset += weight_elem_size * weight_ne[0] * weight_ne[1]; + weight_ne[0] = max_elem_size * (split + 1) > src0->ne[1] ? src0->ne[1] - (max_elem_size * split) : max_elem_size; + scale_ne_offset += scale_elem_size * scale_ne[0] * scale_ne[1]; + scale_ne[0] = weight_ne[0]; + output_ne_offset += output_elem_size * output_ne[0] * output_ne[1]; + output_ne[0] = weight_ne[0]; + + acl_weight_tensor = ggml_cann_create_tensor( + (char*)src0->data + batch0 * weight_stride, + ggml_cann_type_mapping(type), + weight_elem_size, weight_ne, weight_nb, 2, + ACL_FORMAT_ND, weight_ne_offset); + acl_scale_tensor = ggml_cann_create_tensor( + scale_offset + batch0 * scale_stride, + ACL_FLOAT16, + scale_elem_size, scale_ne, scale_nb, 2, + ACL_FORMAT_ND, scale_ne_offset); + acl_output_tensor = ggml_cann_create_tensor( + (char*)output_buffer + batch1 * output_stride, + ACL_FLOAT16, + output_elem_size, output_ne, output_nb, 2, + ACL_FORMAT_ND, output_ne_offset); + + ACL_CHECK(aclnnWeightQuantBatchMatmulV2GetWorkspaceSize( + acl_input_tensor, acl_weight_tensor, acl_scale_tensor, + nullptr, nullptr, nullptr, nullptr, QK8_0, + acl_output_tensor, &workspaceSize, &executor)); + ACL_CHECK(aclnnWeightQuantBatchMatmulV2( + workspaceAddr, workspaceSize, executor, ctx.stream())); + + ACL_CHECK(aclDestroyTensor(acl_weight_tensor)); + ACL_CHECK(aclDestroyTensor(acl_scale_tensor)); + ACL_CHECK(aclDestroyTensor(acl_output_tensor)); + } + + ACL_CHECK(aclDestroyTensor(acl_input_tensor)); } } // cast out - int64_t* output_cast_ne = dst->ne; - size_t output_cast_nb[GGML_MAX_DIMS]; - output_cast_nb[0] = sizeof(uint16_t); - for (int i = 1; i < GGML_MAX_DIMS; i++) { - output_cast_nb[i] = output_cast_nb[i - 1] * output_cast_ne[i - 1]; + if (dst->type != GGML_TYPE_F16) { + int64_t* output_cast_ne = dst->ne; + size_t output_cast_nb[GGML_MAX_DIMS]; + output_cast_nb[0] = sizeof(uint16_t); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + output_cast_nb[i] = output_cast_nb[i - 1] * output_cast_ne[i - 1]; + } + + aclTensor* acl_output_tensor = ggml_cann_create_tensor( + output_buffer, + ACL_FLOAT16, + output_elem_size, output_cast_ne, output_cast_nb, GGML_MAX_DIMS); + aclTensor* acl_dst_tensor = ggml_cann_create_tensor(dst); + aclnn_cast(ctx, acl_output_tensor, acl_dst_tensor, ggml_cann_type_mapping(dst->type)); + + ACL_CHECK(aclDestroyTensor(acl_output_tensor)); + ACL_CHECK(aclDestroyTensor(acl_dst_tensor)); } - - aclTensor* acl_output_tensor = - ggml_cann_create_tensor(output_buffer, ACL_FLOAT16, output_elem_size, - output_cast_ne, output_cast_nb, GGML_MAX_DIMS); - aclTensor* acl_dst_tensor = ggml_cann_create_tensor(dst); - aclnn_cast(ctx, acl_output_tensor, acl_dst_tensor, ACL_FLOAT); - - ACL_CHECK(aclDestroyTensor(acl_output_tensor)); - ACL_CHECK(aclDestroyTensor(acl_dst_tensor)); } void ggml_cann_mul_mat(ggml_backend_cann_context& ctx, ggml_tensor* dst) { @@ -2859,15 +3000,27 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst, ACL_CHECK(aclDestroyTensor(acl_cos_tensor)); } +#ifdef __cplusplus +extern "C" { +#endif +aclnnStatus aclnnRotaryPositionEmbeddingGetWorkspaceSize( + const aclTensor* x, const aclTensor* cos, const aclTensor* sin, + int64_t mode, const aclTensor* yOut, uint64_t* workspaceSize, + aclOpExecutor** executor); +aclnnStatus aclnnRotaryPositionEmbedding(void* workspace, + uint64_t workspaceSize, + aclOpExecutor* executor, + aclrtStream stream); +#ifdef __cplusplus +} +#endif + void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) { // TODO: use ascendc // Only test with LLAMA model. ggml_tensor* src0 = dst->src[0]; // input ggml_tensor* src2 = dst->src[2]; // freq_factors - // TODO: with freq_factors - GGML_ASSERT(src2 == NULL); - // param float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; // const int n_past = ((int32_t *) dst->op_params)[0]; @@ -2885,13 +3038,19 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) { memcpy(&beta_fast, (int32_t*)dst->op_params + 9, sizeof(float)); memcpy(&beta_slow, (int32_t*)dst->op_params + 10, sizeof(float)); - GGML_ASSERT(n_dims <= ne0); + // TODO: with freq_factors + GGML_ASSERT(src2 == NULL); + // TODO: attn_factor != 1 + GGML_ASSERT(attn_factor == 1); + // TODO: n_dims <= ne0 + GGML_ASSERT(n_dims == ne0); GGML_ASSERT(n_dims % 2 == 0); - // TODO: ext_factor != 0 GGML_ASSERT(ext_factor == 0); // TODO: freq_scale != 1 GGML_ASSERT(freq_scale == 1); + // TODO: type == GGML_TYPE_F16 + GGML_ASSERT(src0->type == GGML_TYPE_F32); const float theta_scale = powf(freq_base, -2.0f / n_dims); @@ -2924,177 +3083,30 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) { aclnn_cache_init(ctx, dst, acl_cos_reshape_tensor, acl_sin_reshape_tensor, theta_scale, is_neox); - // roll input - void* input_roll_buffer; - aclTensor* acl_minus_one_tensor; - void* minus_one_scale_buffer = nullptr; - ggml_cann_pool_alloc roll_allocator(ctx.pool(), ggml_nbytes(src0)); - ggml_cann_pool_alloc minus_one_scale_allocator( - ctx.pool(), sizeof(float_t) * src0->ne[0]); - if (!is_neox) { - // roll input: [q0,q1,q2,q3,...] -> [q1,q0,q3,q2,...] - input_roll_buffer = roll_allocator.get(); - int64_t input_roll_ne[4] = {2, src0->ne[1] * (src0->ne[0] / 2), - src0->ne[2], src0->ne[3]}; - size_t input_roll_nb[GGML_MAX_DIMS]; - input_roll_nb[0] = ggml_type_size(src0->type); - for (int i = 1; i < GGML_MAX_DIMS; i++) { - input_roll_nb[i] = input_roll_nb[i - 1] * input_roll_ne[i - 1]; - } - aclTensor* acl_input_roll_tensor = ggml_cann_create_tensor( - input_roll_buffer, ggml_cann_type_mapping(src0->type), - ggml_type_size(src0->type), input_roll_ne, input_roll_nb, - GGML_MAX_DIMS); - aclTensor* acl_input_tensor = ggml_cann_create_tensor( - src0->data, ggml_cann_type_mapping(src0->type), - ggml_type_size(src0->type), input_roll_ne, input_roll_nb, - GGML_MAX_DIMS); + uint64_t workspaceSize = 0; + aclOpExecutor* executor; - int64_t shifts[] = {1}; - int64_t dims[] = {3}; - aclnn_roll(ctx, acl_input_tensor, acl_input_roll_tensor, shifts, dims); - ACL_CHECK(aclDestroyTensor(acl_input_roll_tensor)); - ACL_CHECK(aclDestroyTensor(acl_input_tensor)); + void* workspaceAddr = nullptr; - // init [-1, 1, -1, 1, ...] - minus_one_scale_buffer = minus_one_scale_allocator.get(); - - int64_t minus_one_ne[4] = {src0->ne[0], 1, 1, 1}; - size_t minus_one_nb[GGML_MAX_DIMS]; - minus_one_nb[0] = sizeof(float_t); - for (int i = 1; i < GGML_MAX_DIMS; i++) { - minus_one_nb[i] = minus_one_nb[i - 1] * minus_one_ne[i - 1]; - } - acl_minus_one_tensor = aclnn_ones( - ctx, minus_one_scale_buffer, sizeof(float_t) * src0->ne[0], - minus_one_ne, GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t), 1); - int64_t dim = 3; - int64_t* index = new int64_t[src0->ne[0]]; - for (int i = 0; i < src0->ne[0]; i++) { - index[i] = i / 2 * 2; - } - int64_t index_num = src0->ne[0]; - float value = -1; - aclnn_index_fill_tensor(ctx, acl_minus_one_tensor, dim, index, - index_num, value); - } else { - // roll input: [q0,q1,q2,...] -> - // [q_half,q_half+1,...,q_end,q0,q1,...q_half-1] - input_roll_buffer = roll_allocator.get(); - aclTensor* acl_input_roll_tensor = ggml_cann_create_tensor( - input_roll_buffer, ggml_cann_type_mapping(src0->type), - ggml_type_size(src0->type), src0->ne, src0->nb, GGML_MAX_DIMS); - aclTensor* acl_input_tensor = ggml_cann_create_tensor(src0); - - int64_t shifts[] = {src0->ne[0] / 2}; - int64_t dims[] = {3}; - aclnn_roll(ctx, acl_input_tensor, acl_input_roll_tensor, shifts, dims); - - ACL_CHECK(aclDestroyTensor(acl_input_roll_tensor)); - ACL_CHECK(aclDestroyTensor(acl_input_tensor)); - - // init [-1, -1, -1, 1, 1,1,...] - minus_one_scale_buffer = minus_one_scale_allocator.get(); - - int64_t minus_one_ne[4] = {src0->ne[0], 1, 1, 1}; - size_t minus_one_nb[GGML_MAX_DIMS]; - minus_one_nb[0] = sizeof(float_t); - for (int i = 1; i < GGML_MAX_DIMS; i++) { - minus_one_nb[i] = minus_one_nb[i - 1] * minus_one_ne[i - 1]; - } - acl_minus_one_tensor = aclnn_ones( - ctx, minus_one_scale_buffer, sizeof(float_t) * src0->ne[0], - minus_one_ne, GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t), 1); - // -1 * first half - int64_t first_half_ne[4] = {src0->ne[0] / 2, 1, 1, 1}; - size_t first_half_nb[GGML_MAX_DIMS]; - first_half_nb[0] = sizeof(float_t); - for (int i = 1; i < GGML_MAX_DIMS; i++) { - first_half_nb[i] = first_half_nb[i - 1] * first_half_ne[i - 1]; - } - aclTensor* acl_first_half_tensor = ggml_cann_create_tensor( - minus_one_scale_buffer, ACL_FLOAT, sizeof(float_t), first_half_ne, - first_half_nb, GGML_MAX_DIMS); - bool inplace = true; - float scale = -1; - aclnn_muls(ctx, acl_first_half_tensor, scale, nullptr, inplace); - ACL_CHECK(aclDestroyTensor(acl_first_half_tensor)); + int acl_mode = mode; + if (mode == 0) { + acl_mode = 1; } - // TODO: n_dims < ne0 - GGML_ASSERT(n_dims == src0->ne[0]); - - // input * scale - ggml_cann_pool_alloc roll_mul_scale_allocator(ctx.pool(), - ggml_nbytes(src0)); - void* input_roll_mul_scale_buffer = roll_mul_scale_allocator.get(); - size_t input_nb[GGML_MAX_DIMS]; - input_nb[0] = ggml_type_size(src0->type); - for (int i = 1; i < GGML_MAX_DIMS; i++) { - input_nb[i] = input_nb[i - 1] * src0->ne[i - 1]; - } - aclTensor* acl_input_roll_mul_scale_tensor = ggml_cann_create_tensor( - input_roll_mul_scale_buffer, ggml_cann_type_mapping(src0->type), - ggml_type_size(src0->type), src0->ne, input_nb, GGML_MAX_DIMS); - aclTensor* acl_input_roll_reshape_tensor = ggml_cann_create_tensor( - input_roll_buffer, ggml_cann_type_mapping(src0->type), - ggml_type_size(src0->type), src0->ne, input_nb, GGML_MAX_DIMS); - - aclnn_mul(ctx, acl_input_roll_reshape_tensor, acl_minus_one_tensor, - acl_input_roll_mul_scale_tensor); - - // output - aclTensor* acl_src0 = ggml_cann_create_tensor(src0); + aclTensor* acl_x = ggml_cann_create_tensor(src0); aclTensor* acl_dst = ggml_cann_create_tensor(dst); - void* output_fp32_buffer; - if (src0->type == GGML_TYPE_F32) { - aclnn_inplace_mul(ctx, acl_src0, acl_cos_reshape_tensor); - aclnn_inplace_mul(ctx, acl_input_roll_mul_scale_tensor, - acl_sin_reshape_tensor); - aclnn_add(ctx, acl_src0, acl_input_roll_mul_scale_tensor, acl_dst); - // TODO: ne0 != n_dims in mode2 - } else if (src0->type == GGML_TYPE_F16) { - size_t input_fp32_nb[GGML_MAX_DIMS]; - input_fp32_nb[0] = sizeof(float_t); - for (int i = 1; i < GGML_MAX_DIMS; i++) { - input_fp32_nb[i] = input_fp32_nb[i - 1] * dst->ne[i - 1]; - } - ggml_cann_pool_alloc fp32_allocator1( - ctx.pool(), ggml_nelements(dst) * sizeof(float_t)); - void* input_fp32_buffer1 = fp32_allocator1.get(); - aclTensor* input_fp32_tensor1 = ggml_cann_create_tensor( - input_fp32_buffer1, ACL_FLOAT, sizeof(float_t), dst->ne, - input_fp32_nb, GGML_MAX_DIMS); - ggml_cann_pool_alloc fp32_allocator2( - ctx.pool(), ggml_nelements(dst) * sizeof(float_t)); - void* input_fp32_buffer2 = fp32_allocator2.get(); - aclTensor* input_fp32_tensor2 = ggml_cann_create_tensor( - input_fp32_buffer2, ACL_FLOAT, sizeof(float_t), dst->ne, - input_fp32_nb, GGML_MAX_DIMS); - - ggml_cann_pool_alloc fp32_allocator( - ctx.pool(), ggml_nelements(dst) * sizeof(float_t)); - output_fp32_buffer = fp32_allocator.get(); - aclTensor* output_fp32_tensor = ggml_cann_create_tensor( - output_fp32_buffer, ACL_FLOAT, sizeof(float_t), dst->ne, - input_fp32_nb, GGML_MAX_DIMS); - aclnn_mul(ctx, acl_src0, acl_cos_reshape_tensor, input_fp32_tensor1); - aclnn_mul(ctx, acl_input_roll_mul_scale_tensor, acl_sin_reshape_tensor, - input_fp32_tensor2); - aclnn_add(ctx, input_fp32_tensor1, input_fp32_tensor2, - output_fp32_tensor); - aclnn_cast(ctx, output_fp32_tensor, acl_dst, ACL_FLOAT16); - - ACL_CHECK(aclDestroyTensor(input_fp32_tensor1)); - ACL_CHECK(aclDestroyTensor(input_fp32_tensor2)); - ACL_CHECK(aclDestroyTensor(output_fp32_tensor)); + ACL_CHECK(aclnnRotaryPositionEmbeddingGetWorkspaceSize( + acl_x, acl_cos_reshape_tensor, acl_sin_reshape_tensor, acl_mode, acl_dst, &workspaceSize, &executor)); + if (workspaceSize > 0) { + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); } - ACL_CHECK(aclDestroyTensor(acl_sin_reshape_tensor)); + ACL_CHECK(aclnnRotaryPositionEmbedding(workspaceAddr, workspaceSize, + executor, ctx.stream())); + + ACL_CHECK(aclDestroyTensor(acl_x)); ACL_CHECK(aclDestroyTensor(acl_cos_reshape_tensor)); - ACL_CHECK(aclDestroyTensor(acl_minus_one_tensor)); - ACL_CHECK(aclDestroyTensor(acl_input_roll_mul_scale_tensor)); - ACL_CHECK(aclDestroyTensor(acl_input_roll_reshape_tensor)); - ACL_CHECK(aclDestroyTensor(acl_src0)); + ACL_CHECK(aclDestroyTensor(acl_sin_reshape_tensor)); ACL_CHECK(aclDestroyTensor(acl_dst)); } diff --git a/ggml/src/ggml-cann/common.h b/ggml/src/ggml-cann/common.h index edfa49614..5164cb74e 100644 --- a/ggml/src/ggml-cann/common.h +++ b/ggml/src/ggml-cann/common.h @@ -211,17 +211,20 @@ struct ggml_cann_pool_alloc { struct ggml_backend_cann_context { int32_t device; /**< Device ID. */ std::string name; /**< Name of the device. */ + std::string description; /**< Description of the device. */ aclrtEvent copy_event = nullptr; /**< Event for managing copy operations. */ - aclrtStream streams[GGML_CANN_MAX_STREAMS] = { - {nullptr}}; /**< Array of streams for the device. */ + aclrtStream streams[GGML_CANN_MAX_STREAMS] = {nullptr}; /**< Array of streams for the device. */ /** * @brief Constructor for initializing the context with a given device. * @param device Device ID. */ explicit ggml_backend_cann_context(int device) - : device(device), name("CANN" + std::to_string(device)) {} + : device(device), name("CANN" + std::to_string(device)) { + ggml_cann_set_device(device); + description = aclrtGetSocName(); + } /** * @brief Destructor for cleaning up resources. diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp index d96f65936..c7a3419c7 100644 --- a/ggml/src/ggml-cann/ggml-cann.cpp +++ b/ggml/src/ggml-cann/ggml-cann.cpp @@ -122,6 +122,10 @@ static ggml_cann_device_info ggml_cann_init() { ACL_CHECK(aclrtMemGetAllocationGranularity( &prop, ACL_RT_MEM_ALLOC_GRANULARITY_RECOMMENDED, &info.devices[id].vmm_granularity)); + + size_t free, total; + ggml_backend_cann_get_device_memory(id, &free, &total); + info.devices[id].total_vram = free; } // TODO: add more device info later. @@ -208,6 +212,11 @@ struct ggml_cann_pool_leg : public ggml_cann_pool { * @return A pointer to the allocated buffer. */ void* alloc(size_t size, size_t* actual_size) override { + const size_t alignment = 128; + size = GGML_PAD(size, alignment); + if (size == 0) { + size = alignment; + } #ifdef DEBUG_CANN_MALLOC int nnz = 0; size_t max_size = 0; @@ -246,13 +255,11 @@ struct ggml_cann_pool_leg : public ggml_cann_pool { return ptr; } void* ptr; - size_t look_ahead_size = (size_t)(1.05 * size); - look_ahead_size = 256 * ((look_ahead_size + 255) / 256); ggml_cann_set_device(device); ACL_CHECK( - aclrtMalloc(&ptr, look_ahead_size, ACL_MEM_MALLOC_HUGE_FIRST)); - *actual_size = look_ahead_size; - pool_size += look_ahead_size; + aclrtMalloc(&ptr, size, ACL_MEM_MALLOC_HUGE_FIRST)); + *actual_size = size; + pool_size += size; #ifdef DEBUG_CANN_MALLOC GGML_LOG_INFO( "%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, " @@ -296,7 +303,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool { /** * @brief The maximum size of the virtual memory pool (32 GB). */ - static const size_t CANN_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB + size_t max_size; /** * @brief The device ID associated with this buffer pool. @@ -341,7 +348,11 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool { */ explicit ggml_cann_pool_vmm(int device) : device(device), - granularity(ggml_cann_info().devices[device].vmm_granularity) {} + granularity(ggml_cann_info().devices[device].vmm_granularity) { + auto dev = ggml_cann_info().devices[device]; + granularity = dev.vmm_granularity; + max_size = dev.total_vram; + } /** * @brief Destructor to free all buffers in the virtual memory pool. @@ -370,17 +381,19 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool { // round up the allocation size to the alignment to ensure that all // allocations are aligned for all data types const size_t alignment = 128; - size = alignment * ((size + alignment - 1) / alignment); + size = GGML_PAD(size, alignment); + if (size == 0) { + size = alignment; + } size_t avail = pool_size - pool_used; if (size > avail) { // round up to the next multiple of the granularity size_t reserve_size = size - avail; - reserve_size = - granularity * ((reserve_size + granularity - 1) / granularity); + reserve_size = GGML_PAD(reserve_size, granularity); - GGML_ASSERT(pool_size + reserve_size <= CANN_POOL_VMM_MAX_SIZE); + GGML_ASSERT(pool_size + reserve_size <= max_size); // allocate more physical memory aclrtPhysicalMemProp prop = {}; @@ -396,7 +409,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool { // reserve virtual address space (if not already reserved) if (pool_addr == 0) { ACL_CHECK(aclrtReserveMemAddress( - &pool_addr, CANN_POOL_VMM_MAX_SIZE, 0, NULL, 1)); + &pool_addr, max_size, 0, NULL, 1)); } // map at the end of the pool @@ -409,10 +422,11 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool { // add to the pool pool_size += reserve_size; - // GGML_LOG_INFO("cann pool[%d]: size increased to %llu MB ( - // reserved %llu MB)\n", - // device, (unsigned long long) (pool_size/1024/1024), - // (unsigned long long) (reserve_size/1024/1024)); +#ifdef DEBUG_CANN_MALLOC + GGML_LOG_INFO("cann pool[%d]: size increased to %llu MB (reserved %llu MB)\n", + device, (unsigned long long) (pool_size/1024/1024), + (unsigned long long) (reserve_size/1024/1024)); +#endif } GGML_ASSERT(pool_addr != 0); @@ -457,7 +471,6 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool { */ std::unique_ptr ggml_backend_cann_context::new_pool_for_device( int device) { - // return std::unique_ptr(new ggml_cann_pool_leg(device)); return std::unique_ptr(new ggml_cann_pool_vmm(device)); } @@ -1130,10 +1143,10 @@ ggml_backend_cann_buffer_type(int32_t device) { static bool ggml_backend_cann_buffer_type_initialized = false; if (!ggml_backend_cann_buffer_type_initialized) { - for (int32_t i = 0; i < GGML_CANN_MAX_DEVICES; i++) { + for (int32_t i = 0; i < ggml_cann_info().device_count; i++) { ggml_backend_cann_buffer_types[i] = { /* .iface = */ ggml_backend_cann_buffer_type_interface, - /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), device), + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), i), /* .context = */ new ggml_backend_cann_buffer_type_context{ i, "CANN" + std::to_string(i)}, @@ -1199,10 +1212,15 @@ static void * ggml_cann_host_malloc(size_t size) { return nullptr; } + const size_t alignment = 128; + size = GGML_PAD(size, alignment); + if (size == 0) { + size = alignment; + } + void * hostPtr = nullptr; aclError err = aclrtMallocHost((void **) &hostPtr, size); if (err != ACL_SUCCESS) { - GGML_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__, size / 1024.0 / 1024.0, aclGetRecentErrMsg()); return nullptr; @@ -1669,12 +1687,14 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, } case GGML_OP_MUL_MAT: { switch (op->src[0]->type) { + case GGML_TYPE_Q8_0: + // Current groupsize should not be greater than k-1 in + // aclnnWeightQuantBatchMatmulV2GetWorkspaceSize + if (op->src[0]->ne[0] <= QK8_0) { + return false; + } case GGML_TYPE_F16: case GGML_TYPE_F32: - case GGML_TYPE_Q8_0: - // TODO: fix me - // Current groupsize should not be greater than k-1 in - // aclnnWeightQuantBatchMatmulV2GetWorkspaceSize(). case GGML_TYPE_Q4_0: return true; default: @@ -1706,9 +1726,61 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, return false; } } + case GGML_OP_CONT: { + // TODO: support GGML_TYPE_BF16 + switch (op->src[0]->type) { + case GGML_TYPE_F32: + case GGML_TYPE_F16: + return true; + default: + return false; + } + } + case GGML_OP_ROPE: { + // TODO: with ops-test v == 1 + float * freq_scale = (float*)((int32_t*)op->op_params + 6); + float * ext_factor = (float*)((int32_t*)op->op_params + 7); + float * attn_factor = (float*)((int32_t*)op->op_params + 8); + // TODO: with freq_factors + if (op->src[2] != NULL) { + return false; + } + // TODO: n_dims <= ne0 + if (op->src[0]->ne[0] != op->op_params[1]) { + return false; + } + // TODO: ext_factor != 0 + if (*ext_factor != 0) { + return false; + } + // TODO: freq_scale != 1 + if (*freq_scale != 1) { + return false; + } + // TODO: attn_factor != 1 + if (*attn_factor != 1) { + return false; + } + //TODO: type == GGML_TYPE_F16 + switch (op->src[0]->type) { + case GGML_TYPE_F32: + return true; + default: + return false; + } + } + case GGML_OP_UPSCALE: { + // aclnnUpsampleNearest2dGetWorkspaceSize not support + // selfDimN[2]/outDimN[2] or selfDimC[3]/outDimC[3] not equal + if (op->src[0]->ne[2] * op->ne[3] != op->src[0]->ne[3] * op->ne[2]) { + return false; + } + return true; + } + case GGML_OP_IM2COL: + case GGML_OP_CONCAT: case GGML_OP_DUP: case GGML_OP_REPEAT: - case GGML_OP_CONCAT: case GGML_OP_NONE: case GGML_OP_RESHAPE: case GGML_OP_VIEW: @@ -1722,17 +1794,13 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, case GGML_OP_SCALE: case GGML_OP_SQR: case GGML_OP_CLAMP: - case GGML_OP_CONT: case GGML_OP_DIAG_MASK_INF: case GGML_OP_SOFT_MAX: - case GGML_OP_ROPE: - case GGML_OP_IM2COL: case GGML_OP_POOL_2D: case GGML_OP_SUM_ROWS: case GGML_OP_ARGSORT: case GGML_OP_ACC: case GGML_OP_GROUP_NORM: - case GGML_OP_UPSCALE: case GGML_OP_PAD: case GGML_OP_ARANGE: case GGML_OP_TIMESTEP_EMBEDDING: diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt index c2905d1fb..ddc05ecef 100644 --- a/ggml/src/ggml-cpu/CMakeLists.txt +++ b/ggml/src/ggml-cpu/CMakeLists.txt @@ -96,6 +96,39 @@ if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR endif () set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_PREV}) + elseif (APPLE) + if (GGML_NATIVE) + set(USER_PROVIDED_MARCH FALSE) + foreach(flag_var IN ITEMS CMAKE_C_FLAGS CMAKE_CXX_FLAGS CMAKE_REQUIRED_FLAGS) + if ("${${flag_var}}" MATCHES "-march=[a-zA-Z0-9+._-]+") + set(USER_PROVIDED_MARCH TRUE) + break() + endif() + endforeach() + + if (NOT USER_PROVIDED_MARCH) + set(MARCH_FLAGS "-march=armv8.2a") + + check_cxx_source_compiles("#include \nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD) + if (GGML_COMPILER_SUPPORT_DOTPROD) + set(MARCH_FLAGS "${MARCH_FLAGS}+dotprod") + add_compile_definitions(__ARM_FEATURE_DOTPROD) + endif () + + set(TEST_I8MM_FLAGS "-march=armv8.2a+i8mm") + + set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS}) + set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${TEST_I8MM_FLAGS}") + check_cxx_source_compiles("#include \nint main() { int8x16_t _a, _b; int32x4_t _s = vmmlaq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8) + if (GGML_COMPILER_SUPPORT_MATMUL_INT8) + set(MARCH_FLAGS "${MARCH_FLAGS}+i8mm") + add_compile_definitions(__ARM_FEATURE_MATMUL_INT8) + endif () + set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE}) + + list(APPEND ARCH_FLAGS "${MARCH_FLAGS}") + endif () + endif () else() check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E) if (NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "") diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp index 6bbe8e96e..5c317b68b 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp @@ -474,9 +474,15 @@ void write_output_files() { int main(int argc, char** argv) { std::map args; - for (int i = 1; i < argc; i += 2) { - if (i + 1 < argc) { - args[argv[i]] = argv[i + 1]; + for (int i = 1; i < argc; ++i) { + std::string arg = argv[i]; + if (arg.rfind("--", 0) == 0) { + if (i + 1 < argc && argv[i + 1][0] != '-') { + args[arg] = argv[i + 1]; + ++i; + } else { + args[arg] = ""; + } } } diff --git a/include/llama-cpp.h b/include/llama-cpp.h new file mode 100644 index 000000000..daa04d4d8 --- /dev/null +++ b/include/llama-cpp.h @@ -0,0 +1,25 @@ +#pragma once + +#ifndef __cplusplus +#error "This header is for C++ only" +#endif + +#include + +#include "llama.h" + +struct llama_model_deleter { + void operator()(llama_model * model) { llama_free_model(model); } +}; + +struct llama_context_deleter { + void operator()(llama_context * context) { llama_free(context); } +}; + +struct llama_sampler_deleter { + void operator()(llama_sampler * sampler) { llama_sampler_free(sampler); } +}; + +typedef std::unique_ptr llama_model_ptr; +typedef std::unique_ptr llama_context_ptr; +typedef std::unique_ptr llama_sampler_ptr;