diff --git a/.github/ISSUE_TEMPLATE/010-bug-compilation.yml b/.github/ISSUE_TEMPLATE/010-bug-compilation.yml index f10b3a2b2..b85bf5741 100644 --- a/.github/ISSUE_TEMPLATE/010-bug-compilation.yml +++ b/.github/ISSUE_TEMPLATE/010-bug-compilation.yml @@ -65,12 +65,22 @@ body: If possible, please do a git bisect and identify the exact commit that introduced the bug. validations: required: false + - type: textarea + id: command + attributes: + label: Compile command + description: > + Please provide the exact command you used to compile llama.cpp. For example: `cmake -B ...`. + This will be automatically formatted into code, so no need for backticks. + render: shell + validations: + required: true - type: textarea id: logs attributes: label: Relevant log output description: > - Please copy and paste any relevant log output, including the command that you entered and any generated text. + Please copy and paste any relevant log output, including any generated text. This will be automatically formatted into code, so no need for backticks. render: shell validations: diff --git a/.github/ISSUE_TEMPLATE/019-bug-misc.yml b/.github/ISSUE_TEMPLATE/019-bug-misc.yml index d157ea307..1904e31fd 100644 --- a/.github/ISSUE_TEMPLATE/019-bug-misc.yml +++ b/.github/ISSUE_TEMPLATE/019-bug-misc.yml @@ -52,6 +52,16 @@ body: - Other (Please specify in the next section) validations: required: false + - type: textarea + id: command + attributes: + label: Command line + description: > + Please provide the exact commands you entered, if applicable. For example: `llama-server -m ... -c ...`, `llama-cli -m ...`, etc. + This will be automatically formatted into code, so no need for backticks. + render: shell + validations: + required: false - type: textarea id: info attributes: @@ -74,7 +84,7 @@ body: attributes: label: Relevant log output description: > - If applicable, please copy and paste any relevant log output, including the command that you entered and any generated text. + If applicable, please copy and paste any relevant log output, including any generated text. This will be automatically formatted into code, so no need for backticks. render: shell validations: diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index a377eff38..c85999b89 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -60,8 +60,7 @@ jobs: -DLLAMA_CURL=ON \ -DGGML_METAL_USE_BF16=ON \ -DGGML_METAL_EMBED_LIBRARY=ON \ - -DGGML_RPC=ON \ - -DBUILD_SHARED_LIBS=OFF + -DGGML_RPC=ON cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) - name: Test @@ -123,8 +122,7 @@ jobs: -DLLAMA_FATAL_WARNINGS=ON \ -DLLAMA_CURL=ON \ -DGGML_METAL=OFF \ - -DGGML_RPC=ON \ - -DBUILD_SHARED_LIBS=OFF + -DGGML_RPC=ON cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) - name: Test @@ -181,7 +179,7 @@ jobs: run: | mkdir build cd build - cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF + cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON -DGGML_RPC=ON cmake --build . --config Release -j $(nproc) - name: Test @@ -651,23 +649,23 @@ jobs: matrix: include: - build: 'noavx-x64' - defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DBUILD_SHARED_LIBS=ON' + defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF' - build: 'avx2-x64' - defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=ON' + defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON' - build: 'avx-x64' - defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX2=OFF -DBUILD_SHARED_LIBS=ON' + defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX2=OFF' - build: 'avx512-x64' - defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX512=ON -DBUILD_SHARED_LIBS=ON' + defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX512=ON' - build: 'openblas-x64' - defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BLAS=ON -DBUILD_SHARED_LIBS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"' + defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"' - build: 'kompute-x64' - defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON' + defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON' - build: 'vulkan-x64' - defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_VULKAN=ON -DBUILD_SHARED_LIBS=ON' + defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_VULKAN=ON' - build: 'llvm-arm64' - defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON' + defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON' - build: 'msvc-arm64' - defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON' + defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON' - build: 'llvm-arm64-opencl-adreno' defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON' @@ -914,7 +912,7 @@ jobs: shell: cmd run: | call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat" - cmake -S . -B build -G "Ninja Multi-Config" -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON -DGGML_RPC=ON + cmake -S . -B build -G "Ninja Multi-Config" -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DGGML_RPC=ON set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1 cmake --build build --config Release -j %NINJA_JOBS% -t ggml cmake --build build --config Release @@ -1239,7 +1237,7 @@ jobs: - name: Create release id: create_release - uses: anzz1/action-create-release@v1 + uses: ggml-org/action-create-release@v1 env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} with: diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 41f1a89ee..d71f1eb38 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -97,10 +97,9 @@ jobs: GITHUB_BRANCH_NAME: ${{ github.head_ref || github.ref_name }} GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}' - # https://github.com/jlumbroso/free-disk-space/tree/54081f138730dfa15788a46383842cd2f914a1be#example - name: Free Disk Space (Ubuntu) if: ${{ matrix.config.free_disk_space == true }} - uses: jlumbroso/free-disk-space@main + uses: ggml-org/free-disk-space@v1.3.1 with: # this might remove tools that are actually needed, # if set to "true" but frees about 6 GB diff --git a/.github/workflows/editorconfig.yml b/.github/workflows/editorconfig.yml index ae86e9927..f02b7c219 100644 --- a/.github/workflows/editorconfig.yml +++ b/.github/workflows/editorconfig.yml @@ -23,5 +23,7 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - - uses: editorconfig-checker/action-editorconfig-checker@main + - uses: editorconfig-checker/action-editorconfig-checker@v2 + with: + version: v3.0.3 - run: editorconfig-checker diff --git a/CODEOWNERS b/CODEOWNERS index adeba5395..72d594b46 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -1,5 +1,11 @@ # collaborators can optionally add themselves here to indicate their availability for reviewing related PRs /ci/ @ggerganov -/.devops/ @ngxson +/.devops/*.Dockerfile @ngxson /examples/server/ @ngxson +/ggml/src/ggml-cuda/fattn* @JohannesGaessler +/ggml/src/ggml-cuda/mmq.* @JohannesGaessler +/ggml/src/ggml-cuda/mmv.* @JohannesGaessler +/ggml/src/ggml-cuda/mmvq.* @JohannesGaessler +/ggml/src/ggml-opt.cpp @JohannesGaessler +/ggml/src/gguf.cpp @JohannesGaessler diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 5a85ec5d2..8d411982b 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,10 +1,10 @@ # Pull requests (for contributors) - Test your changes: - - Execute [the full CI locally on your machine](ci/README.md) before publishing - - Verify that the perplexity and the performance are not affected negatively by your changes (use `llama-perplexity` and `llama-bench`) - - If you modified the `ggml` source, run the `test-backend-ops` tool to check whether different backend implementations of the `ggml` operators produce consistent results (this requires access to at least two different `ggml` backends) - - If you modified a `ggml` operator or added a new one, add the corresponding test cases to `test-backend-ops` + - Execute [the full CI locally on your machine](ci/README.md) before publishing + - Verify that the perplexity and the performance are not affected negatively by your changes (use `llama-perplexity` and `llama-bench`) + - If you modified the `ggml` source, run the `test-backend-ops` tool to check whether different backend implementations of the `ggml` operators produce consistent results (this requires access to at least two different `ggml` backends) + - If you modified a `ggml` operator or added a new one, add the corresponding test cases to `test-backend-ops` - Consider allowing write access to your branch for faster reviews, as reviewers can push commits directly - If your PR becomes stale, don't hesitate to ping the maintainers in the comments @@ -20,14 +20,104 @@ - Avoid adding third-party dependencies, extra files, extra headers, etc. - Always consider cross-compatibility with other operating systems and architectures - Avoid fancy-looking modern STL constructs, use basic `for` loops, avoid templates, keep it simple -- There are no strict rules for the code style, but try to follow the patterns in the code (indentation, spaces, etc.). Vertical alignment makes things more readable and easier to batch edit +- Vertical alignment makes things more readable and easier to batch edit - Clean-up any trailing whitespaces, use 4 spaces for indentation, brackets on the same line, `void * ptr`, `int & a` -- Naming usually optimizes for common prefix (see https://github.com/ggerganov/ggml/pull/302#discussion_r1243240963) +- Use sized integer types such as `int32_t` in the public API, e.g. `size_t` may also be appropriate for allocation sizes or byte offsets +- Declare structs with `struct foo {}` instead of `typedef struct foo {} foo` + - In C++ code omit optional `struct` and `enum` keyword whenever they are not necessary + ```cpp + // OK + llama_context * ctx; + const llama_rope_type rope_type; + + // not OK + struct llama_context * ctx; + const enum llama_rope_type rope_type; + ``` + + _(NOTE: this guideline is yet to be applied to the `llama.cpp` codebase. New code should follow this guideline.)_ + +- Try to follow the existing patterns in the code (indentation, spaces, etc.). In case of doubt use `clang-format` to format the added code +- For anything not covered in the current guidelines, refer to the [C++ Core Guidelines](https://isocpp.github.io/CppCoreGuidelines/CppCoreGuidelines) - Tensors store data in row-major order. We refer to dimension 0 as columns, 1 as rows, 2 as matrices - Matrix multiplication is unconventional: [`C = ggml_mul_mat(ctx, A, B)`](https://github.com/ggerganov/llama.cpp/blob/880e352277fc017df4d5794f0c21c44e1eae2b84/ggml.h#L1058-L1064) means $C^T = A B^T \Leftrightarrow C = B A^T.$ ![matmul](media/matmul.png) +# Naming guidelines + +- Use `snake_case` for function, variable and type names +- Naming usually optimizes for longest common prefix (see https://github.com/ggerganov/ggml/pull/302#discussion_r1243240963) + + ```cpp + // not OK + int small_number; + int big_number; + + // OK + int number_small; + int number_big; + ``` + +- Enum values are always in upper case and prefixed with the enum name + + ```cpp + enum llama_vocab_type { + LLAMA_VOCAB_TYPE_NONE = 0, + LLAMA_VOCAB_TYPE_SPM = 1, + LLAMA_VOCAB_TYPE_BPE = 2, + LLAMA_VOCAB_TYPE_WPM = 3, + LLAMA_VOCAB_TYPE_UGM = 4, + LLAMA_VOCAB_TYPE_RWKV = 5, + }; + ``` + +- The general naming pattern is `_`, with `` being `_` + + ```cpp + llama_model_init(); // class: "llama_model", method: "init" + llama_sampler_chain_remove(); // class: "llama_sampler_chain", method: "remove" + llama_sampler_get_seed(); // class: "llama_sampler", method: "get_seed" + llama_set_embeddings(); // class: "llama_context", method: "set_embeddings" + llama_n_threads(); // class: "llama_context", method: "n_threads" + llama_adapter_lora_free(); // class: "llama_adapter_lora", method: "free" + ``` + + - The `get` `` can be omitted + - The `` can be omitted if not necessary + - The `_context` suffix of the `` is optional. Use it to disambiguate symbols when needed + - Use `init`/`free` for constructor/destructor `` + +- Use the `_t` suffix when a type is supposed to be opaque to the user - it's not relevant to them if it is a struct or anything else + + ```cpp + typedef struct llama_context * llama_context_t; + + enum llama_pooling_type llama_pooling_type(const llama_context_t ctx); + ``` + + _(NOTE: this guideline is yet to be applied to the `llama.cpp` codebase. New code should follow this guideline)_ + +- C/C++ filenames are all lowercase with dashes. Headers use the `.h` extension. Source files use the `.c` or `.cpp` extension +- Python filenames are all lowercase with underscores + +- _(TODO: abbreviations usage)_ + +# Preprocessor directives + +- _(TODO: add guidelines with examples and apply them to the codebase)_ + + ```cpp + #ifdef FOO + #endif // FOO + ``` + +# Documentation + +- Documentation is a community effort +- When you need to look into the source code to figure out how to use an API consider adding a short summary to the header file for future reference +- When you notice incorrect or outdated documentation, please update it + # Resources The Github issues, PRs and discussions contain a lot of information that can be useful to get familiar with the codebase. For convenience, some of the more important information is referenced from Github projects: diff --git a/README.md b/README.md index d6d1958c8..6302ac977 100644 --- a/README.md +++ b/README.md @@ -69,6 +69,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo - [x] [Qwen models](https://huggingface.co/models?search=Qwen/Qwen) - [x] [PLaMo-13B](https://github.com/ggerganov/llama.cpp/pull/3557) - [x] [Phi models](https://huggingface.co/models?search=microsoft/phi) +- [x] [PhiMoE](https://github.com/ggerganov/llama.cpp/pull/11003) - [x] [GPT-2](https://huggingface.co/gpt2) - [x] [Orion 14B](https://github.com/ggerganov/llama.cpp/pull/5118) - [x] [InternLM2](https://huggingface.co/models?search=internlm2) @@ -98,6 +99,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo - [x] [Jais](https://huggingface.co/inceptionai/jais-13b-chat) - [x] [Bielik-11B-v2.3](https://huggingface.co/collections/speakleash/bielik-11b-v23-66ee813238d9b526a072408a) - [x] [RWKV-6](https://github.com/BlinkDL/RWKV-LM) +- [x] [QRWKV-6](https://huggingface.co/recursal/QRWKV6-32B-Instruct-Preview-v0.1) - [x] [GigaChat-20B-A3B](https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct) #### Multimodal @@ -201,6 +203,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo - [Paddler](https://github.com/distantmagic/paddler) - Stateful load balancer custom-tailored for llama.cpp - [GPUStack](https://github.com/gpustack/gpustack) - Manage GPU clusters for running LLMs - [llama_cpp_canister](https://github.com/onicai/llama_cpp_canister) - llama.cpp as a smart contract on the Internet Computer, using WebAssembly +- [llama-swap](https://github.com/mostlygeek/llama-swap) - transparent proxy that adds automatic model switching with llama-server diff --git a/common/arg.cpp b/common/arg.cpp index edcda60e0..9976f2c9b 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -22,6 +22,11 @@ common_arg & common_arg::set_examples(std::initializer_list return *this; } +common_arg & common_arg::set_excludes(std::initializer_list excludes) { + this->excludes = std::move(excludes); + return *this; +} + common_arg & common_arg::set_env(const char * env) { help = help + "\n(env: " + env + ")"; this->env = env; @@ -37,6 +42,10 @@ bool common_arg::in_example(enum llama_example ex) { return examples.find(ex) != examples.end(); } +bool common_arg::is_exclude(enum llama_example ex) { + return excludes.find(ex) != excludes.end(); +} + bool common_arg::get_value_from_env(std::string & output) { if (env == nullptr) return false; char * value = std::getenv(env); @@ -121,17 +130,26 @@ std::string common_arg::to_string() { static void common_params_handle_model_default( std::string & model, - std::string & model_url, + const std::string & model_url, std::string & hf_repo, - std::string & hf_file) { + std::string & hf_file, + const std::string & hf_token) { if (!hf_repo.empty()) { // short-hand to avoid specifying --hf-file -> default it to --model if (hf_file.empty()) { if (model.empty()) { - throw std::invalid_argument("error: --hf-repo requires either --hf-file or --model\n"); + auto auto_detected = common_get_hf_file(hf_repo, hf_token); + if (auto_detected.first.empty() || auto_detected.second.empty()) { + exit(1); // built without CURL, error message already printed + } + hf_repo = auto_detected.first; + hf_file = auto_detected.second; + } else { + hf_file = model; } - hf_file = model; - } else if (model.empty()) { + } + // make sure model path is present (for caching purposes) + if (model.empty()) { // this is to avoid different repo having same file name, or same file name in different subdirs std::string filename = hf_repo + "_" + hf_file; // to make sure we don't have any slashes in the filename @@ -281,8 +299,8 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context } // TODO: refactor model params in a common struct - common_params_handle_model_default(params.model, params.model_url, params.hf_repo, params.hf_file); - common_params_handle_model_default(params.vocoder.model, params.vocoder.model_url, params.vocoder.hf_repo, params.vocoder.hf_file); + common_params_handle_model_default(params.model, params.model_url, params.hf_repo, params.hf_file, params.hf_token); + common_params_handle_model_default(params.vocoder.model, params.vocoder.model_url, params.vocoder.hf_repo, params.vocoder.hf_file, params.hf_token); if (params.escape) { string_process_escapes(params.prompt); @@ -420,7 +438,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex * - if both {LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_*,} are set, we will prioritize the LLAMA_EXAMPLE_* matching current example */ auto add_opt = [&](common_arg arg) { - if (arg.in_example(ex) || arg.in_example(LLAMA_EXAMPLE_COMMON)) { + if ((arg.in_example(ex) || arg.in_example(LLAMA_EXAMPLE_COMMON)) && !arg.is_exclude(ex)) { ctx_arg.options.push_back(std::move(arg)); } }; @@ -649,7 +667,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params, const std::string & value) { params.prompt = value; } - )); + ).set_excludes({LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( {"--no-perf"}, string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"), @@ -673,7 +691,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.prompt.pop_back(); } } - )); + ).set_excludes({LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( {"--in-file"}, "FNAME", "an input file (repeat to specify multiple files)", @@ -700,7 +718,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.prompt = ss.str(); fprintf(stderr, "Read %zu bytes from binary file %s\n", params.prompt.size(), value.c_str()); } - )); + ).set_excludes({LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( {"-e", "--escape"}, string_format("process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false"), @@ -1512,7 +1530,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex {"--lora"}, "FNAME", "path to LoRA adapter (can be repeated to use multiple adapters)", [](common_params & params, const std::string & value) { - params.lora_adapters.push_back({ std::string(value), 1.0 }); + params.lora_adapters.push_back({ std::string(value), 1.0, nullptr }); } // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA})); @@ -1520,7 +1538,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex {"--lora-scaled"}, "FNAME", "SCALE", "path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters)", [](common_params & params, const std::string & fname, const std::string & scale) { - params.lora_adapters.push_back({ fname, std::stof(scale) }); + params.lora_adapters.push_back({ fname, std::stof(scale), nullptr }); } // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA})); @@ -1574,21 +1592,23 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } ).set_env("LLAMA_ARG_MODEL_URL")); add_opt(common_arg( - {"-hfr", "--hf-repo"}, "REPO", - "Hugging Face model repository (default: unused)", + {"-hf", "-hfr", "--hf-repo"}, "/[:quant]", + "Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.\n" + "example: unsloth/phi-4-GGUF:q4_k_m\n" + "(default: unused)", [](common_params & params, const std::string & value) { params.hf_repo = value; } ).set_env("LLAMA_ARG_HF_REPO")); add_opt(common_arg( {"-hff", "--hf-file"}, "FILE", - "Hugging Face model file (default: unused)", + "Hugging Face model file. If specified, it will override the quant in --hf-repo (default: unused)", [](common_params & params, const std::string & value) { params.hf_file = value; } ).set_env("LLAMA_ARG_HF_FILE")); add_opt(common_arg( - {"-hfrv", "--hf-repo-v"}, "REPO", + {"-hfv", "-hfrv", "--hf-repo-v"}, "/[:quant]", "Hugging Face model repository for the vocoder model (default: unused)", [](common_params & params, const std::string & value) { params.vocoder.hf_repo = value; diff --git a/common/arg.h b/common/arg.h index a6700d323..49ab8667b 100644 --- a/common/arg.h +++ b/common/arg.h @@ -12,6 +12,7 @@ struct common_arg { std::set examples = {LLAMA_EXAMPLE_COMMON}; + std::set excludes = {}; std::vector args; const char * value_hint = nullptr; // help text or example for arg value const char * value_hint_2 = nullptr; // for second arg value @@ -53,9 +54,11 @@ struct common_arg { ) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {} common_arg & set_examples(std::initializer_list examples); + common_arg & set_excludes(std::initializer_list excludes); common_arg & set_env(const char * env); common_arg & set_sparam(); bool in_example(enum llama_example ex); + bool is_exclude(enum llama_example ex); bool get_value_from_env(std::string & output); bool has_value_from_env(); std::string to_string(); diff --git a/common/common.cpp b/common/common.cpp index 45c8c9b52..9cd371326 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -2,6 +2,9 @@ #define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING #endif +#include "ggml.h" +#include "gguf.h" + #include "common.h" #include "log.h" // Change JSON_ASSERT from assert() to GGML_ASSERT: @@ -18,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -62,11 +66,29 @@ #ifdef __linux__ #include #elif defined(_WIN32) -#define PATH_MAX MAX_PATH +# if !defined(PATH_MAX) +# define PATH_MAX MAX_PATH +# endif #else #include #endif #define LLAMA_CURL_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083 + +// +// CURL utils +// + +using curl_ptr = std::unique_ptr; + +// cannot use unique_ptr for curl_slist, because we cannot update without destroying the old one +struct curl_slist_ptr { + struct curl_slist * ptr = nullptr; + ~curl_slist_ptr() { + if (ptr) { + curl_slist_free_all(ptr); + } + } +}; #endif // LLAMA_USE_CURL using json = nlohmann::ordered_json; @@ -843,7 +865,7 @@ struct common_init_result common_init_from_params(common_params & params) { } else if (!params.model_url.empty()) { model = common_load_model_from_url(params.model_url, params.model, params.hf_token, mparams); } else { - model = llama_load_model_from_file(params.model.c_str(), mparams); + model = llama_model_load_from_file(params.model.c_str(), mparams); } if (model == NULL) { @@ -851,26 +873,28 @@ struct common_init_result common_init_from_params(common_params & params) { return iparams; } + const llama_vocab * vocab = llama_model_get_vocab(model); + if (params.reranking) { bool ok = true; - if (llama_token_bos(model) == LLAMA_TOKEN_NULL) { - LOG_WRN("%s: warning: model does not have a BOS token, reranking will not work\n", __func__); + if (llama_vocab_bos(vocab) == LLAMA_TOKEN_NULL) { + LOG_WRN("%s: warning: vocab does not have a BOS token, reranking will not work\n", __func__); ok = false; } - if (llama_token_eos(model) == LLAMA_TOKEN_NULL) { - LOG_WRN("%s: warning: model does not have an EOS token, reranking will not work\n", __func__); + if (llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) { + LOG_WRN("%s: warning: vocab does not have an EOS token, reranking will not work\n", __func__); ok = false; } - if (llama_token_sep(model) == LLAMA_TOKEN_NULL) { - LOG_WRN("%s: warning: model does not have a SEP token, reranking will not work\n", __func__); + if (llama_vocab_sep(vocab) == LLAMA_TOKEN_NULL) { + LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__); ok = false; } if (!ok) { - llama_free_model(model); + llama_model_free(model); return iparams; } @@ -878,40 +902,40 @@ struct common_init_result common_init_from_params(common_params & params) { auto cparams = common_context_params_to_llama(params); - llama_context * lctx = llama_new_context_with_model(model, cparams); + llama_context * lctx = llama_init_from_model(model, cparams); if (lctx == NULL) { LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.c_str()); - llama_free_model(model); + llama_model_free(model); return iparams; } if (params.ctx_shift && !llama_kv_cache_can_shift(lctx)) { - LOG_ERR("%s: KV cache shifting is not supported for this model (--no-context-shift to disable)'\n", __func__); - llama_free_model(model); - return iparams; + LOG_WRN("%s: KV cache shifting is not supported for this model, disabling KV cache shifting\n", __func__); + params.ctx_shift = false; } if (!params.control_vectors.empty()) { if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1; - if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_n_layer(model); + if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_model_n_layer(model); const auto cvec = common_control_vector_load(params.control_vectors); if (cvec.n_embd == -1) { llama_free(lctx); - llama_free_model(model); + llama_model_free(model); return iparams; } - int err = llama_control_vector_apply(lctx, - cvec.data.data(), - cvec.data.size(), - cvec.n_embd, - params.control_vector_layer_start, - params.control_vector_layer_end); + int err = llama_apply_adapter_cvec( + lctx, + cvec.data.data(), + cvec.data.size(), + cvec.n_embd, + params.control_vector_layer_start, + params.control_vector_layer_end); if (err) { llama_free(lctx); - llama_free_model(model); + llama_model_free(model); return iparams; } @@ -919,30 +943,31 @@ struct common_init_result common_init_from_params(common_params & params) { // load and optionally apply lora adapters for (auto & la : params.lora_adapters) { - common_lora_adapter_container loaded_la; - loaded_la.path = la.path; - loaded_la.scale = la.scale; - loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str()); - if (loaded_la.adapter == nullptr) { + llama_adapter_lora_ptr lora; + lora.reset(llama_adapter_lora_init(model, la.path.c_str())); + if (lora == nullptr) { LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str()); llama_free(lctx); - llama_free_model(model); + llama_model_free(model); return iparams; } - iparams.lora_adapters.push_back(loaded_la); // copy to list of loaded adapters - } - if (!params.lora_init_without_apply) { - common_lora_adapters_apply(lctx, iparams.lora_adapters); + + la.ptr = lora.get(); + iparams.lora.emplace_back(std::move(lora)); // copy to list of loaded adapters } - if (params.sampling.ignore_eos && llama_token_eos(model) == LLAMA_TOKEN_NULL) { - LOG_WRN("%s: warning: model does not have an EOS token, ignoring --ignore-eos\n", __func__); + if (!params.lora_init_without_apply) { + common_set_adapter_lora(lctx, params.lora_adapters); + } + + if (params.sampling.ignore_eos && llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) { + LOG_WRN("%s: warning: vocab does not have an EOS token, ignoring --ignore-eos\n", __func__); params.sampling.ignore_eos = false; } if (params.sampling.ignore_eos) { - for (llama_token i = 0; i < llama_n_vocab(model); i++) { - if (llama_token_is_eog(model, i)) { + for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) { + if (llama_vocab_is_eog(vocab, i)) { LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(lctx, i).c_str(), -INFINITY); params.sampling.logit_bias.push_back({i, -INFINITY}); } @@ -963,8 +988,9 @@ struct common_init_result common_init_from_params(common_params & params) { LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__); std::vector tmp; - llama_token bos = llama_token_bos(model); - llama_token eos = llama_token_eos(model); + llama_token bos = llama_vocab_bos(vocab); + llama_token eos = llama_vocab_eos(vocab); + // some models (e.g. T5) don't have a BOS token if (bos != LLAMA_TOKEN_NULL) { tmp.push_back(bos); @@ -979,7 +1005,7 @@ struct common_init_result common_init_from_params(common_params & params) { if (llama_model_has_encoder(model)) { llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size())); llama_token decoder_start_token_id = llama_model_decoder_start_token(model); - if (decoder_start_token_id == -1) { + if (decoder_start_token_id == LLAMA_TOKEN_NULL) { decoder_start_token_id = bos; } tmp.clear(); @@ -993,17 +1019,17 @@ struct common_init_result common_init_from_params(common_params & params) { llama_perf_context_reset(lctx); } - iparams.model = model; - iparams.context = lctx; + iparams.model.reset(model); + iparams.context.reset(lctx); return iparams; } -void common_lora_adapters_apply(struct llama_context * ctx, std::vector & lora_adapters) { - llama_lora_adapter_clear(ctx); - for (auto & la : lora_adapters) { +void common_set_adapter_lora(struct llama_context * ctx, std::vector & lora) { + llama_clear_adapter_lora(ctx); + for (auto & la : lora) { if (la.scale != 0.0f) { - llama_lora_adapter_set(ctx, la.adapter, la.scale); + llama_set_adapter_lora(ctx, la.ptr, la.scale); } } } @@ -1120,7 +1146,8 @@ static bool curl_perform_with_retry(const std::string & url, CURL * curl, int ma static bool common_download_file(const std::string & url, const std::string & path, const std::string & hf_token) { // Initialize libcurl - std::unique_ptr curl(curl_easy_init(), &curl_easy_cleanup); + curl_ptr curl(curl_easy_init(), &curl_easy_cleanup); + curl_slist_ptr http_headers; if (!curl) { LOG_ERR("%s: error initializing libcurl\n", __func__); return false; @@ -1134,11 +1161,9 @@ static bool common_download_file(const std::string & url, const std::string & pa // Check if hf-token or bearer-token was specified if (!hf_token.empty()) { - std::string auth_header = "Authorization: Bearer "; - auth_header += hf_token.c_str(); - struct curl_slist *http_headers = NULL; - http_headers = curl_slist_append(http_headers, auth_header.c_str()); - curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers); + std::string auth_header = "Authorization: Bearer " + hf_token; + http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str()); + curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr); } #if defined(_WIN32) @@ -1148,8 +1173,7 @@ static bool common_download_file(const std::string & url, const std::string & pa #endif // Check if the file already exists locally - struct stat model_file_info; - auto file_exists = (stat(path.c_str(), &model_file_info) == 0); + auto file_exists = std::filesystem::exists(path); // If the file exists, check its JSON metadata companion file. std::string metadata_path = path + ".json"; @@ -1409,7 +1433,7 @@ struct llama_model * common_load_model_from_url( } } - return llama_load_model_from_file(local_path.c_str(), params); + return llama_model_load_from_file(local_path.c_str(), params); } struct llama_model * common_load_model_from_hf( @@ -1435,6 +1459,80 @@ struct llama_model * common_load_model_from_hf( return common_load_model_from_url(model_url, local_path, hf_token, params); } +/** + * Allow getting the HF file from the HF repo with tag (like ollama), for example: + * - bartowski/Llama-3.2-3B-Instruct-GGUF:q4 + * - bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M + * - bartowski/Llama-3.2-3B-Instruct-GGUF:q5_k_s + * Tag is optional, default to "latest" (meaning it checks for Q4_K_M first, then Q4, then if not found, return the first GGUF file in repo) + * + * Return pair of (with "repo" already having tag removed) + * + * Note: we use the Ollama-compatible HF API, but not using the blobId. Instead, we use the special "ggufFile" field which returns the value for "hf_file". This is done to be backward-compatible with existing cache files. + */ +std::pair common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & hf_token) { + auto parts = string_split(hf_repo_with_tag, ':'); + std::string tag = parts.size() > 1 ? parts.back() : "latest"; + std::string hf_repo = parts[0]; + if (string_split(hf_repo, '/').size() != 2) { + throw std::invalid_argument("error: invalid HF repo format, expected /[:quant]\n"); + } + + // fetch model info from Hugging Face Hub API + json model_info; + curl_ptr curl(curl_easy_init(), &curl_easy_cleanup); + curl_slist_ptr http_headers; + std::string res_str; + std::string url = "https://huggingface.co/v2/" + hf_repo + "/manifests/" + tag; + curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str()); + curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L); + typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * ptr, size_t size, size_t nmemb, void * data); + auto write_callback = [](void * ptr, size_t size, size_t nmemb, void * data) -> size_t { + static_cast(data)->append((char * ) ptr, size * nmemb); + return size * nmemb; + }; + curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast(write_callback)); + curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &res_str); +#if defined(_WIN32) + curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA); +#endif + if (!hf_token.empty()) { + std::string auth_header = "Authorization: Bearer " + hf_token; + http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str()); + } + // Important: the User-Agent must be "llama-cpp" to get the "ggufFile" field in the response + http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp"); + http_headers.ptr = curl_slist_append(http_headers.ptr, "Accept: application/json"); + curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr); + + CURLcode res = curl_easy_perform(curl.get()); + + if (res != CURLE_OK) { + throw std::runtime_error("error: cannot make GET request to HF API"); + } + + long res_code; + curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &res_code); + if (res_code == 200) { + model_info = json::parse(res_str); + } else if (res_code == 401) { + throw std::runtime_error("error: model is private or does not exist; if you are accessing a gated model, please provide a valid HF token"); + } else { + throw std::runtime_error(string_format("error from HF API, response code: %ld, data: %s", res_code, res_str.c_str())); + } + + // check response + if (!model_info.contains("ggufFile")) { + throw std::runtime_error("error: model does not have ggufFile"); + } + json & gguf_file = model_info.at("ggufFile"); + if (!gguf_file.contains("rfilename")) { + throw std::runtime_error("error: ggufFile does not have rfilename"); + } + + return std::make_pair(hf_repo, gguf_file.at("rfilename")); +} + #else struct llama_model * common_load_model_from_url( @@ -1456,6 +1554,11 @@ struct llama_model * common_load_model_from_hf( return nullptr; } +std::pair common_get_hf_file(const std::string &, const std::string &) { + LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__); + return std::make_pair("", ""); +} + #endif // LLAMA_USE_CURL // @@ -1554,21 +1657,23 @@ std::vector common_tokenize( const std::string & text, bool add_special, bool parse_special) { - return common_tokenize(llama_get_model(ctx), text, add_special, parse_special); + const llama_model * model = llama_get_model(ctx); + const llama_vocab * vocab = llama_model_get_vocab(model); + return common_tokenize(vocab, text, add_special, parse_special); } std::vector common_tokenize( - const struct llama_model * model, + const struct llama_vocab * vocab, const std::string & text, bool add_special, bool parse_special) { // upper limit for the number of tokens int n_tokens = text.length() + 2 * add_special; std::vector result(n_tokens); - n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special); + n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special); if (n_tokens < 0) { result.resize(-n_tokens); - int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special); + int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special); GGML_ASSERT(check == -n_tokens); } else { result.resize(n_tokens); @@ -1576,13 +1681,19 @@ std::vector common_tokenize( return result; } -static std::string _common_token_to_piece(const struct llama_model * model, llama_token token, bool special) { +std::string common_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) { + const llama_model * model = llama_get_model(ctx); + const llama_vocab * vocab = llama_model_get_vocab(model); + return common_token_to_piece(vocab, token, special); +} + +std::string common_token_to_piece(const struct llama_vocab * vocab, llama_token token, bool special) { std::string piece; piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\n' - const int n_chars = llama_token_to_piece(model, token, &piece[0], piece.size(), 0, special); + const int n_chars = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special); if (n_chars < 0) { piece.resize(-n_chars); - int check = llama_token_to_piece(model, token, &piece[0], piece.size(), 0, special); + int check = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special); GGML_ASSERT(check == -n_chars); } else { @@ -1592,17 +1703,19 @@ static std::string _common_token_to_piece(const struct llama_model * model, llam return piece; } -std::string common_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) { - return _common_token_to_piece(llama_get_model(ctx), token, special); +std::string common_detokenize(const struct llama_context * ctx, const std::vector & tokens, bool special) { + const llama_model * model = llama_get_model(ctx); + const llama_vocab * vocab = llama_model_get_vocab(model); + return common_detokenize(vocab, tokens, special); } -std::string common_detokenize(llama_context * ctx, const std::vector & tokens, bool special) { +std::string common_detokenize(const struct llama_vocab * vocab, const std::vector & tokens, bool special) { std::string text; text.resize(std::max(text.capacity(), tokens.size())); - int32_t n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special); + int32_t n_chars = llama_detokenize(vocab, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special); if (n_chars < 0) { text.resize(-n_chars); - n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special); + n_chars = llama_detokenize(vocab, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special); GGML_ASSERT(n_chars <= (int32_t)text.size()); // whitespace trimming is performed after per-token detokenization } @@ -1630,9 +1743,8 @@ bool common_chat_verify_template(const std::string & tmpl, bool use_jinja) { return false; } } - llama_chat_message chat[] = {{"user", "test"}}; - int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0); + const int res = llama_chat_apply_template(tmpl.c_str(), chat, 1, true, nullptr, 0); return res >= 0; } @@ -1643,16 +1755,16 @@ std::string common_chat_apply_template(const struct llama_model * model, int alloc_size = 0; bool fallback = false; // indicate if we must fallback to default chatml std::vector chat; - for (auto & msg : msgs) { + for (const auto & msg : msgs) { chat.push_back({msg.role.c_str(), msg.content.c_str()}); alloc_size += (msg.role.size() + msg.content.size()) * 1.25; } - const char * ptr_tmpl = tmpl.empty() ? nullptr : tmpl.c_str(); + const char * ptr_tmpl = tmpl.empty() ? llama_model_chat_template(model, /* name */ nullptr) : tmpl.c_str(); std::vector buf(alloc_size); // run the first time to get the total output length - int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size()); + int32_t res = llama_chat_apply_template(ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size()); // error: chat template is not supported if (res < 0) { @@ -1660,18 +1772,17 @@ std::string common_chat_apply_template(const struct llama_model * model, // if the custom "tmpl" is not supported, we throw an error // this is a bit redundant (for good), since we're not sure if user validated the custom template with llama_chat_verify_template() throw std::runtime_error("this custom template is not supported"); - } else { - // If the built-in template is not supported, we default to chatml - res = llama_chat_apply_template(nullptr, "chatml", chat.data(), chat.size(), add_ass, buf.data(), buf.size()); - fallback = true; } + + // If the built-in template is not supported, we default to chatml + res = llama_chat_apply_template("chatml", chat.data(), chat.size(), add_ass, buf.data(), buf.size()); + fallback = true; } // if it turns out that our buffer is too small, we resize it if ((size_t) res > buf.size()) { buf.resize(res); res = llama_chat_apply_template( - fallback ? nullptr : model, fallback ? "chatml" : ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size()); } @@ -1724,11 +1835,13 @@ static std::string _llama_model_meta_val_str(const struct llama_model * model, c llama_chat_templates llama_chat_templates_from_model(const struct llama_model * model, const std::string & chat_template_override) { - auto bos_token = _common_token_to_piece(model, llama_token_bos(model), true); - auto eos_token = _common_token_to_piece(model, llama_token_eos(model), true); + auto vocab = llama_model_get_vocab(model); + auto bos_token = common_token_to_piece(vocab, llama_token_bos(vocab), true); + auto eos_token = common_token_to_piece(vocab, llama_token_eos(vocab), true); std::string default_template_src = chat_template_override; std::string tool_use_template_src = chat_template_override; if (chat_template_override.empty()) { + // TODO: default_template_src = _llama_model_meta_val_str(model, "tokenizer.chat_template"); tool_use_template_src = _llama_model_meta_val_str(model, "tokenizer.chat_template.tool_use"); } diff --git a/common/common.h b/common/common.h index 2693b805e..90e10867c 100644 --- a/common/common.h +++ b/common/common.h @@ -2,7 +2,7 @@ #pragma once -#include "llama.h" +#include "llama-cpp.h" #include "chat-template.hpp" #include @@ -26,13 +26,11 @@ #define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf" -struct common_lora_adapter_info { +struct common_adapter_lora_info { std::string path; float scale; -}; -struct common_lora_adapter_container : common_lora_adapter_info { - struct llama_lora_adapter * adapter; + struct llama_adapter_lora * ptr; }; using llama_tokens = std::vector; @@ -250,8 +248,8 @@ struct common_params { std::vector antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts) std::vector kv_overrides; - bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_lora_adapter_apply) - std::vector lora_adapters; // lora adapter path with user defined scale + bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_adapter_lora_apply) + std::vector lora_adapters; // lora adapter path with user defined scale std::vector control_vectors; // control vector with user defined scale @@ -459,6 +457,11 @@ static bool string_starts_with(const std::string & str, return str.rfind(prefix, 0) == 0; } +static bool string_ends_with(const std::string & str, + const std::string & suffix) { // While we wait for C++20's std::string::ends_with... + return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0; +} + bool string_parse_kv_override(const char * data, std::vector & overrides); void string_process_escapes(std::string & input); @@ -481,10 +484,12 @@ std::string fs_get_cache_file(const std::string & filename); // Model utils // +// note: defines object's lifetime struct common_init_result { - struct llama_model * model = nullptr; - struct llama_context * context = nullptr; - std::vector lora_adapters; + llama_model_ptr model; + llama_context_ptr context; + + std::vector lora; }; struct common_init_result common_init_from_params(common_params & params); @@ -504,9 +509,12 @@ struct llama_model * common_load_model_from_hf( const std::string & local_path, const std::string & hf_token, const struct llama_model_params & params); +std::pair common_get_hf_file( + const std::string & hf_repo_with_tag, + const std::string & hf_token); // clear LoRA adapters from context, then apply new list of adapters -void common_lora_adapters_apply(struct llama_context * ctx, std::vector & lora_adapters); +void common_set_adapter_lora(struct llama_context * ctx, std::vector & lora); // // Batch utils @@ -544,7 +552,7 @@ std::vector common_tokenize( bool parse_special = false); std::vector common_tokenize( - const struct llama_model * model, + const struct llama_vocab * vocab, const std::string & text, bool add_special, bool parse_special = false); @@ -556,11 +564,21 @@ std::string common_token_to_piece( llama_token token, bool special = true); +std::string common_token_to_piece( + const struct llama_vocab * vocab, + llama_token token, + bool special = true); + // detokenizes a vector of tokens into a string // should work similar to Python's `tokenizer.decode` // optionally renders special/control tokens std::string common_detokenize( - llama_context * ctx, + const struct llama_context * ctx, + const std::vector & tokens, + bool special = true); + +std::string common_detokenize( + const struct llama_vocab * vocab, const std::vector & tokens, bool special = true); @@ -574,7 +592,7 @@ struct common_chat_msg { std::string content; }; -// Check if the template is supported or not. Returns true if it's valid +// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid bool common_chat_verify_template(const std::string & tmpl, bool use_jinja); // CPP wrapper for llama_chat_apply_template @@ -648,6 +666,10 @@ common_control_vector_data common_control_vector_load(const std::vectorsecond; int max_count_static = 0; int sum_count_static = 0; - llama_token max_token = -1; + llama_token max_token = LLAMA_TOKEN_NULL; for (std::pair token_count_static : part_static) { const llama_token token = token_count_static.first; @@ -85,10 +85,10 @@ static llama_token try_draft(common_ngram_cache & nc_static, const common_ngram } if (sum_count_static < draft_min_sample_size_lax[LLAMA_NGRAM_STATIC-1]) { - return -1; + return LLAMA_TOKEN_NULL; } if (100*max_count_static < draft_min_percent_lax[LLAMA_NGRAM_STATIC-1]*sum_count_static) { - return -1; + return LLAMA_TOKEN_NULL; } return max_token; } @@ -98,9 +98,9 @@ static llama_token try_draft( common_ngram_cache & nc_primary, const std::vector & ngrams_primary, common_ngram_cache_part & part_static, const int * min_sample_size, const int * min_percent) { - llama_token drafted_token = -1; + llama_token drafted_token = LLAMA_TOKEN_NULL; - for (int i = ngrams_primary.size()-1; i >= 0 && drafted_token == -1; --i) { + for (int i = ngrams_primary.size()-1; i >= 0 && drafted_token == LLAMA_TOKEN_NULL; --i) { const common_ngram ngram_primary = ngrams_primary[i]; common_ngram_cache::iterator part_primary_it = nc_primary.find(ngram_primary); @@ -112,7 +112,7 @@ static llama_token try_draft( int max_count_primary = 0; int max_count_static = 0; int sum_count_primary = 0; - llama_token max_token = -1; + llama_token max_token = LLAMA_TOKEN_NULL; for (std::pair token_count_primary : part_primary) { const llama_token token = token_count_primary.first; @@ -154,7 +154,7 @@ void common_ngram_cache_draft( } while ((int) draft.size()-1 < n_draft) { - llama_token drafted_token = -1; + llama_token drafted_token = LLAMA_TOKEN_NULL; const int ngram_start_static = inp_size-LLAMA_NGRAM_STATIC + draft.size()-1; common_ngram ngram_static; @@ -177,17 +177,17 @@ void common_ngram_cache_draft( } ngrams_cd.push_back(ngram_cd); } - if (drafted_token == -1) { + if (drafted_token == LLAMA_TOKEN_NULL) { drafted_token = try_draft(nc_context, ngrams_cd, part_static, draft_min_sample_size_lax, draft_min_percent_lax); } - if (drafted_token == -1) { + if (drafted_token == LLAMA_TOKEN_NULL) { drafted_token = try_draft(nc_dynamic, ngrams_cd, part_static, draft_min_sample_size_strict, draft_min_percent_strict); } - if (drafted_token == -1) { + if (drafted_token == LLAMA_TOKEN_NULL) { drafted_token = try_draft(nc_static, ngram_static); } - if (drafted_token == -1) { + if (drafted_token == LLAMA_TOKEN_NULL) { break; } diff --git a/common/ngram-cache.h b/common/ngram-cache.h index 09c2b0319..dfe012abe 100644 --- a/common/ngram-cache.h +++ b/common/ngram-cache.h @@ -17,13 +17,13 @@ struct common_ngram { common_ngram() { for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) { - tokens[i] = -1; + tokens[i] = LLAMA_TOKEN_NULL; } } common_ngram(const llama_token * input, const int ngram_size) { for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) { - tokens[i] = i < ngram_size ? input[i] : -1; + tokens[i] = i < ngram_size ? input[i] : LLAMA_TOKEN_NULL; } } diff --git a/common/sampling.cpp b/common/sampling.cpp index e83a971c7..7241ac321 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -113,7 +113,10 @@ struct common_sampler { void set_logits(struct llama_context * ctx, int idx) { const auto * logits = llama_get_logits_ith(ctx, idx); - const int n_vocab = llama_n_vocab(llama_get_model(ctx)); + const llama_model * model = llama_get_model(ctx); + const llama_vocab * vocab = llama_model_get_vocab(model); + + const int n_vocab = llama_vocab_n_tokens(vocab); cur.resize(n_vocab); @@ -142,13 +145,15 @@ std::string common_params_sampling::print() const { } struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_params_sampling & params) { + const llama_vocab * vocab = llama_model_get_vocab(model); + llama_sampler_chain_params lparams = llama_sampler_chain_default_params(); lparams.no_perf = params.no_perf; auto * result = new common_sampler { /* .params = */ params, - /* .grmr = */ llama_sampler_init_grammar(model, params.grammar.c_str(), "root"), + /* .grmr = */ llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root"), /* .chain = */ llama_sampler_chain_init(lparams), /* .prev = */ ring_buffer(std::max(32, params.n_prev)), /* .cur = */ {}, @@ -157,7 +162,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co llama_sampler_chain_add(result->chain, llama_sampler_init_logit_bias( - llama_n_vocab(model), + llama_vocab_n_tokens(vocab), params.logit_bias.size(), params.logit_bias.data())); @@ -172,7 +177,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co c_breakers.push_back(str.c_str()); } - llama_sampler_chain_add(result->chain, llama_sampler_init_dry (model, params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size())); + llama_sampler_chain_add(result->chain, llama_sampler_init_dry (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size())); } break; case COMMON_SAMPLER_TYPE_TOP_K: @@ -194,7 +199,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent)); break; case COMMON_SAMPLER_TYPE_INFILL: - llama_sampler_chain_add(result->chain, llama_sampler_init_infill (model)); + llama_sampler_chain_add(result->chain, llama_sampler_init_infill (vocab)); break; case COMMON_SAMPLER_TYPE_PENALTIES: llama_sampler_chain_add(result->chain, llama_sampler_init_penalties(params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present)); @@ -206,7 +211,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed)); } else if (params.mirostat == 1) { llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp)); - llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(llama_n_vocab(model), params.seed, params.mirostat_tau, params.mirostat_eta, 100)); + llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(llama_vocab_n_tokens(vocab), params.seed, params.mirostat_tau, params.mirostat_eta, 100)); } else if (params.mirostat == 2) { llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp)); llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta)); diff --git a/common/speculative.cpp b/common/speculative.cpp index 3fcbb0020..318e96ea3 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -79,10 +79,13 @@ bool common_speculative_are_compatible( const struct llama_model * model_tgt = llama_get_model(ctx_tgt); const struct llama_model * model_dft = llama_get_model(ctx_dft); - const bool vocab_type_tgt = llama_vocab_type(model_tgt); + const struct llama_vocab * vocab_tgt = llama_model_get_vocab(model_tgt); + const struct llama_vocab * vocab_dft = llama_model_get_vocab(model_dft); + + const bool vocab_type_tgt = llama_vocab_type(vocab_tgt); LOG_DBG("%s: vocab_type tgt: %d\n", __func__, vocab_type_tgt); - const bool vocab_type_dft = llama_vocab_type(model_dft); + const bool vocab_type_dft = llama_vocab_type(vocab_dft); LOG_DBG("%s: vocab_type dft: %d\n", __func__, vocab_type_dft); if (vocab_type_tgt != vocab_type_dft) { @@ -91,34 +94,34 @@ bool common_speculative_are_compatible( return false; } - if (llama_add_bos_token(model_tgt) != llama_add_bos_token(model_dft) || - llama_add_eos_token(model_tgt) != llama_add_eos_token(model_dft) || - llama_token_bos(model_tgt) != llama_token_bos(model_dft) || - llama_token_eos(model_tgt) != llama_token_eos(model_dft)) { - LOG_ERR("%s: draft model special tokens must match target model to use speculation\n", __func__); - LOG_ERR("%s: tgt: bos = %d (%d), eos = %d (%d)\n", __func__, llama_token_bos(model_tgt), llama_add_bos_token(model_tgt), llama_token_eos(model_tgt), llama_add_eos_token(model_tgt)); - LOG_ERR("%s: dft: bos = %d (%d), eos = %d (%d)\n", __func__, llama_token_bos(model_dft), llama_add_bos_token(model_dft), llama_token_eos(model_dft), llama_add_eos_token(model_dft)); + if (llama_vocab_get_add_bos(vocab_tgt) != llama_vocab_get_add_bos(vocab_dft) || + llama_vocab_get_add_eos(vocab_tgt) != llama_vocab_get_add_eos(vocab_dft) || + llama_vocab_bos(vocab_tgt) != llama_vocab_bos(vocab_dft) || + llama_vocab_eos(vocab_tgt) != llama_vocab_eos(vocab_dft)) { + LOG_ERR("%s: draft vocab special tokens must match target vocab to use speculation\n", __func__); + LOG_ERR("%s: tgt: bos = %d (%d), eos = %d (%d)\n", __func__, llama_vocab_bos(vocab_tgt), llama_vocab_get_add_bos(vocab_tgt), llama_vocab_eos(vocab_tgt), llama_vocab_get_add_eos(vocab_tgt)); + LOG_ERR("%s: dft: bos = %d (%d), eos = %d (%d)\n", __func__, llama_vocab_bos(vocab_dft), llama_vocab_get_add_bos(vocab_dft), llama_vocab_eos(vocab_dft), llama_vocab_get_add_eos(vocab_dft)); return false; } { - const int n_vocab_tgt = llama_n_vocab(model_tgt); - const int n_vocab_dft = llama_n_vocab(model_dft); + const int n_vocab_tgt = llama_vocab_n_tokens(vocab_tgt); + const int n_vocab_dft = llama_vocab_n_tokens(vocab_dft); const int vocab_diff = std::abs(n_vocab_tgt - n_vocab_dft); if (vocab_diff > SPEC_VOCAB_MAX_SIZE_DIFFERENCE) { LOG_ERR("%s: draft model vocab must closely match target model to use speculation but " "target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n", - __func__, n_vocab_tgt, llama_n_vocab(model_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE); + __func__, n_vocab_tgt, llama_vocab_n_tokens(vocab_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE); return false; } for (int i = SPEC_VOCAB_CHECK_START_TOKEN_ID; i < std::min(n_vocab_tgt, n_vocab_dft); ++i) { - const char * token_text_tgt = llama_token_get_text(model_tgt, i); - const char * token_text_dft = llama_token_get_text(model_dft, i); + const char * token_text_tgt = llama_vocab_get_text(vocab_tgt, i); + const char * token_text_dft = llama_vocab_get_text(vocab_dft, i); if (std::strcmp(token_text_tgt, token_text_dft) != 0) { - LOG_ERR("%s: draft model vocab must match target model to use speculation but " + LOG_ERR("%s: draft vocab vocab must match target vocab to use speculation but " "token %d content differs - target '%s', draft '%s'\n", __func__, i, common_token_to_piece(ctx_tgt, i).c_str(), common_token_to_piece(ctx_dft, i).c_str()); diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index b6c15da94..4dc9837ab 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -326,6 +326,7 @@ class Model: gguf.MODEL_TENSOR.TIME_MIX_W2, gguf.MODEL_TENSOR.TIME_MIX_DECAY_W1, gguf.MODEL_TENSOR.TIME_MIX_DECAY_W2, + gguf.MODEL_TENSOR.TIME_MIX_LERP_FUSED, gguf.MODEL_TENSOR.POSNET_NORM1, gguf.MODEL_TENSOR.POSNET_NORM2, ) @@ -477,6 +478,11 @@ class Model: return modelcls return func + @classmethod + def print_registered_models(cls): + for name in sorted(cls._model_classes.keys()): + logger.error(f"- {name}") + @classmethod def from_model_architecture(cls, arch: str) -> type[Model]: try: @@ -687,6 +693,9 @@ class Model: if chkhsh == "d4c8f286ea6b520b3d495c4455483cfa2302c0cfcd4be05d781b6a8a0a7cdaf1": # ref: https://huggingface.co/Infinigence/Megrez-3B-Instruct res = "megrez" + if chkhsh == "877081d19cf6996e2c4ff0e1236341e9b7bde288f5311a56a937f0afbbb3aeb5": + # ref: https://huggingface.co/deepseek-ai/DeepSeek-V3 + res = "deepseek-v3" if res is None: logger.warning("\n") @@ -1764,25 +1773,19 @@ class DeciModel(Model): self.gguf_writer.add_token_list(tokens) self.gguf_writer.add_token_types(toktypes) - special_vocab = gguf.SpecialVocab( - self.dir_model, load_merges=True, - special_token_types = ['bos', 'eos', 'eom', 'eot'] - ) - special_vocab._set_special_token("bos", 128000) - special_vocab._set_special_token("eos", 128001) - special_vocab._set_special_token("eom", 128008) - special_vocab._set_special_token("eot", 128009) + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) special_vocab.add_to_gguf(self.gguf_writer) else: # DeciLM-7B self._set_vocab_llama_hf() -# self._set_vocab_gpt2() def set_gguf_parameters(self): if "block_configs" in self.hparams: # Llama-3_1-Nemotron-51B assert self.block_count == len(self._num_kv_heads) assert self.block_count == len(self._num_heads) assert self.block_count == len(self._ffn_dims) + if (rope_theta := self.hparams.get("rope_theta")) is not None: + self.gguf_writer.add_rope_freq_base(rope_theta) self.gguf_writer.add_head_count_kv(self._num_kv_heads) self.gguf_writer.add_head_count(self._num_heads) self.gguf_writer.add_feed_forward_length(self._ffn_dims) @@ -2565,6 +2568,63 @@ class Phi3MiniModel(Model): yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32)) +@Model.register("PhiMoEForCausalLM") +class PhiMoeModel(Phi3MiniModel): + model_arch = gguf.MODEL_ARCH.PHIMOE + + _experts: list[dict[str, Tensor]] | None = None + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_expert_used_count(self.hparams["num_experts_per_tok"]) + self.gguf_writer.add_expert_count(self.hparams["num_local_experts"]) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # process the experts separately + if name.find("block_sparse_moe.experts") != -1: + n_experts = self.hparams["num_local_experts"] + assert bid is not None + + if self._experts is None: + self._experts = [{} for _ in range(self.block_count)] + + self._experts[bid][name] = data_torch + + if len(self._experts[bid]) >= n_experts * 3: + tensors: list[tuple[str, Tensor]] = [] + + # merge the experts into a single 3d tensor + for w_name in ["w1", "w2", "w3"]: + datas: list[Tensor] = [] + + for xid in range(n_experts): + ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{w_name}.weight" + datas.append(self._experts[bid][ename]) + del self._experts[bid][ename] + + data_torch = torch.stack(datas, dim=0) + + merged_name = f"model.layers.{bid}.block_sparse_moe.experts.{w_name}.weight" + + new_name = self.map_tensor_name(merged_name) + + tensors.append((new_name, data_torch)) + return tensors + else: + return [] + + return [(self.map_tensor_name(name), data_torch)] + + def prepare_tensors(self): + super().prepare_tensors() + + if self._experts is not None: + # flatten `list[dict[str, Tensor]]` into `list[str]` + experts = [k for d in self._experts for k in d.keys()] + if len(experts) > 0: + raise ValueError(f"Unprocessed experts: {experts}") + + @Model.register("PlamoForCausalLM") class PlamoModel(Model): model_arch = gguf.MODEL_ARCH.PLAMO @@ -3262,6 +3322,8 @@ class Rwkv6Model(Model): # required by llama.cpp, unused self.gguf_writer.add_head_count(0) + lerp_weights: dict[int, dict[str, Tensor]] = {} + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: new_name = self.map_tensor_name(name) @@ -3277,14 +3339,84 @@ class Rwkv6Model(Model): if new_name.endswith("time_mix_decay.weight") or "lerp" in new_name: data_torch = data_torch.squeeze() - rescale_every_n_layers = self.hparams["rescale_every"] - if rescale_every_n_layers > 0: - if new_name.endswith("time_mix_output.weight") or new_name.endswith("channel_mix_value.weight"): - data_torch = data_torch.div_(2 ** int(bid // rescale_every_n_layers)) + try: + rescale_every_n_layers = self.hparams["rescale_every"] + if rescale_every_n_layers > 0: + if new_name.endswith("time_mix_output.weight") or new_name.endswith("channel_mix_value.weight"): + data_torch = data_torch.div_(2 ** int(bid // rescale_every_n_layers)) + except KeyError: + pass + + # concat time_mix_lerp weights to reduce some cpu overhead + # also reduces the number of tensors in the model + if bid is not None and "time_mix_lerp" in new_name and "time_mix_lerp_x" not in new_name: + try: + self.lerp_weights[bid][new_name] = data_torch + except KeyError: + self.lerp_weights[bid] = {new_name: data_torch} + if all(f"blk.{bid}.time_mix_lerp_{i}.weight" in self.lerp_weights[bid].keys() for i in ["w", "k", "v", "r", "g"]): + new_name = f"blk.{bid}.time_mix_lerp_fused.weight" + data = torch.stack([self.lerp_weights[bid][f"blk.{bid}.time_mix_lerp_{i}.weight"].unsqueeze(0) for i in ["w", "k", "v", "r", "g"]], dim=0).unsqueeze(1) + yield (new_name, data) + return yield (new_name, data_torch) +@Model.register("RWKV6Qwen2ForCausalLM") +class RWKV6Qwen2Model(Rwkv6Model): + model_arch = gguf.MODEL_ARCH.RWKV6QWEN2 + + def set_vocab(self): + try: + self._set_vocab_sentencepiece() + except FileNotFoundError: + self._set_vocab_gpt2() + + def set_gguf_parameters(self): + block_count = self.hparams["num_hidden_layers"] + num_attention_heads = self.hparams["num_attention_heads"] + num_key_value_heads = self.hparams["num_key_value_heads"] + hidden_size = self.hparams["hidden_size"] + head_size = hidden_size // num_attention_heads + rms_norm_eps = self.hparams["rms_norm_eps"] + intermediate_size = self.hparams["intermediate_size"] + time_mix_extra_dim = 64 if hidden_size >= 4096 else 32 + time_decay_extra_dim = 128 if hidden_size >= 4096 else 64 + + # RWKV isn't context limited + self.gguf_writer.add_context_length(1048576) + self.gguf_writer.add_embedding_length(hidden_size) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_wkv_head_size(head_size) + self.gguf_writer.add_time_mix_extra_dim(time_mix_extra_dim) + self.gguf_writer.add_time_decay_extra_dim(time_decay_extra_dim) + self.gguf_writer.add_feed_forward_length(intermediate_size) + self.gguf_writer.add_file_type(self.ftype) + + # special parameters for time_mixing in RWKV6QWEN2 + self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps) + self.gguf_writer.add_token_shift_count(1) + # RWKV6QWEN2 use grouped key/value like GQA + self.gguf_writer.add_head_count_kv(num_key_value_heads) + + # required by llama.cpp, unused + self.gguf_writer.add_head_count(0) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + for new_name, data in super().modify_tensors(data_torch, name, bid): + if "time_mix_w1" in new_name or "time_mix_w2" in new_name: + data = data.view(5, -1, data.shape[-1]) + # rwkv6qwen2 has a different order of rkvwg instead of the original wkvrg + # permute them here to avoid code changes + data = torch.stack([data[3], data[1], data[2], data[0], data[4]], dim=0).view(-1, data.shape[-1]) + if "w2" in new_name: + data = data.view(5, -1, data.shape[-1]) + yield (new_name, data) + continue + yield (new_name, data) + + @Model.register("MambaForCausalLM", "MambaLMHeadModel", "FalconMambaForCausalLM") class MambaModel(Model): model_arch = gguf.MODEL_ARCH.MAMBA @@ -3379,6 +3511,24 @@ class CommandR2Model(Model): self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) +@Model.register("Cohere2ForCausalLM") +class Cohere2Model(Model): + model_arch = gguf.MODEL_ARCH.COHERE2 + + def set_gguf_parameters(self): + super().set_gguf_parameters() + + self.gguf_writer.add_logit_scale(self.hparams["logit_scale"]) + self.gguf_writer.add_sliding_window(self.hparams["sliding_window"]) + self.gguf_writer.add_vocab_size(self.hparams["vocab_size"]) + + rotary_pct = self.hparams["rotary_pct"] + hidden_size = self.hparams["hidden_size"] + num_attention_heads = self.hparams["num_attention_heads"] + self.gguf_writer.add_rope_dimension_count(int(rotary_pct * (hidden_size // num_attention_heads))) + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) + + @Model.register("OlmoForCausalLM") @Model.register("OLMoForCausalLM") class OlmoModel(Model): @@ -3837,6 +3987,7 @@ class DeepseekModel(Model): @Model.register("DeepseekV2ForCausalLM") +@Model.register("DeepseekV3ForCausalLM") class DeepseekV2Model(Model): model_arch = gguf.MODEL_ARCH.DEEPSEEK2 @@ -3858,6 +4009,15 @@ class DeepseekV2Model(Model): self.gguf_writer.add_expert_count(hparams["n_routed_experts"]) self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"]) self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"]) + self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"]) + + if hparams["scoring_func"] == "sigmoid": + self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID) + elif hparams["scoring_func"] == "softmax": + self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX) + else: + raise ValueError(f"Unsupported scoring_func value: {hparams['scoring_func']}") + self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"]) if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: @@ -3870,6 +4030,16 @@ class DeepseekV2Model(Model): _experts: list[dict[str, Tensor]] | None = None def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # rename e_score_correction_bias tensors + if name.endswith("e_score_correction_bias"): + name = name.replace("e_score_correction_bias", "e_score_correction.bias") + + # skip Multi-Token Prediction (MTP) layers + block_count = self.hparams["num_hidden_layers"] + match = re.match(r"model.layers.(\d+)", name) + if match and int(match.group(1)) >= block_count: + return [] + # process the experts separately if name.find("mlp.experts") != -1: n_experts = self.hparams["n_routed_experts"] @@ -4764,6 +4934,7 @@ def parse_args() -> argparse.Namespace: parser.add_argument( "model", type=Path, help="directory containing model file", + nargs="?", ) parser.add_argument( "--use-temp-file", action="store_true", @@ -4801,8 +4972,15 @@ def parse_args() -> argparse.Namespace: "--metadata", type=Path, help="Specify the path for an authorship metadata override file" ) + parser.add_argument( + "--print-supported-models", action="store_true", + help="Print the supported models" + ) - return parser.parse_args() + args = parser.parse_args() + if not args.print_supported_models and args.model is None: + parser.error("the following arguments are required: model") + return args def split_str_to_n_bytes(split_str: str) -> int: @@ -4826,6 +5004,11 @@ def split_str_to_n_bytes(split_str: str) -> int: def main() -> None: args = parse_args() + if args.print_supported_models: + logger.error("Supported models:") + Model.print_registered_models() + sys.exit(0) + if args.verbose: logging.basicConfig(level=logging.DEBUG) else: diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py index fea23ddb4..56edc64a7 100755 --- a/convert_hf_to_gguf_update.py +++ b/convert_hf_to_gguf_update.py @@ -107,6 +107,7 @@ models = [ {"name": "roberta-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sentence-transformers/stsb-roberta-base"}, {"name": "gigachat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct"}, {"name": "megrez", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Infinigence/Megrez-3B-Instruct"}, + {"name": "deepseek-v3", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/DeepSeek-V3"}, ] diff --git a/convert_lora_to_gguf.py b/convert_lora_to_gguf.py index ed1014cae..6dea14a23 100755 --- a/convert_lora_to_gguf.py +++ b/convert_lora_to_gguf.py @@ -226,6 +226,9 @@ def get_base_tensor_name(lora_tensor_name: str) -> str: base_name = lora_tensor_name.replace("base_model.model.", "") base_name = base_name.replace(".lora_A.weight", ".weight") base_name = base_name.replace(".lora_B.weight", ".weight") + # models produced by mergekit-extract-lora have token embeddings in the adapter + base_name = base_name.replace(".lora_embedding_A", ".weight") + base_name = base_name.replace(".lora_embedding_B", ".weight") return base_name @@ -260,6 +263,10 @@ def parse_args() -> argparse.Namespace: "--base", type=Path, help="directory containing Hugging Face model config files (config.json, tokenizer.json) for the base model that the adapter is based on - only config is needed, actual model weights are not required. If base model is unspecified, it will be loaded from Hugging Face hub based on the adapter config", ) + parser.add_argument( + "--base-model-id", type=str, + help="the model ID of the base model, if it is not available locally or in the adapter config. If specified, it will ignore --base and load the base model config from the Hugging Face hub (Example: 'meta-llama/Llama-3.2-1B-Instruct')", + ) parser.add_argument( "lora_path", type=Path, help="directory containing Hugging Face PEFT LoRA config (adapter_model.json) and weights (adapter_model.safetensors or adapter_model.bin)", @@ -290,6 +297,7 @@ if __name__ == '__main__': dir_base_model: Path | None = args.base dir_lora: Path = args.lora_path + base_model_id: str | None = args.base_model_id lora_config = dir_lora / "adapter_config.json" input_model = dir_lora / "adapter_model.safetensors" @@ -313,7 +321,10 @@ if __name__ == '__main__': lparams: dict[str, Any] = json.load(f) # load base model - if dir_base_model is None: + if base_model_id is not None: + logger.info(f"Loading base model from Hugging Face: {base_model_id}") + hparams = load_hparams_from_hf(base_model_id) + elif dir_base_model is None: if "base_model_name_or_path" in lparams: model_id = lparams["base_model_name_or_path"] logger.info(f"Loading base model from Hugging Face: {model_id}") @@ -371,11 +382,16 @@ if __name__ == '__main__': if self.lazy: tensor = LazyTorchTensor.from_eager(tensor) base_name = get_base_tensor_name(name) - is_lora_a = ".lora_A.weight" in name - is_lora_b = ".lora_B.weight" in name + # note: mergekit-extract-lora also adds token embeddings to the adapter + is_lora_a = ".lora_A.weight" in name or ".lora_embedding_A" in name + is_lora_b = ".lora_B.weight" in name or ".lora_embedding_B" in name if not is_lora_a and not is_lora_b: if ".base_layer.weight" in name: continue + # mergekit-extract-lora add these layernorm to the adapter, we need to keep them + if "_layernorm" in name or ".norm" in name: + yield (base_name, tensor) + continue logger.error(f"Unexpected name '{name}': Not a lora_A or lora_B tensor") if ".embed_tokens.weight" in name or ".lm_head.weight" in name: logger.error("Embeddings is present in the adapter. This can be due to new tokens added during fine tuning") @@ -407,9 +423,21 @@ if __name__ == '__main__': if name == "lm_head.weight" and len(dest) == 0: raise ValueError("lm_head is present in adapter, but is ignored in base model") for dest_name, dest_data in dest: + # mergekit-extract-lora add these layernorm to the adapter + if "_norm" in dest_name: + assert dest_data.dim() == 1 + yield (dest_name, dest_data) + continue + + # otherwise, we must get the lora_A and lora_B tensors assert isinstance(dest_data, LoraTorchTensor) lora_a, lora_b = dest_data.get_lora_A_B() + # note: mergekit-extract-lora flip and transpose A and B + # here we only need to transpose token_embd.lora_a, see llm_build_inp_embd() + if "token_embd.weight" in dest_name: + lora_a = lora_a.T + yield (dest_name + ".lora_a", lora_a) yield (dest_name + ".lora_b", lora_b) diff --git a/docs/build.md b/docs/build.md index 84019b204..3b0d2211d 100644 --- a/docs/build.md +++ b/docs/build.md @@ -127,6 +127,8 @@ For detailed info, please refer to [llama.cpp for SYCL](./backend/SYCL.md). This provides GPU acceleration using an NVIDIA GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from the [NVIDIA developer site](https://developer.nvidia.com/cuda-downloads). +If you are using Fedora (using Fedora Workstation, or an 'Atomic' variant such as Silverblue), or would like to set up CUDA in a toolbox, please consider our [Fedora CUDA guide](./cuda-fedora.md). Unfortunately, the process is not as simple as one might expect. + - Using `CMake`: ```bash diff --git a/docs/cuda-fedora.md b/docs/cuda-fedora.md new file mode 100644 index 000000000..b993386c8 --- /dev/null +++ b/docs/cuda-fedora.md @@ -0,0 +1,317 @@ +# Setting Up CUDA on Fedora + +In this guide we setup [Nvidia CUDA](https://docs.nvidia.com/cuda/) in a toolbox container. This guide is applicable for: +- [Fedora Workstation](https://fedoraproject.org/workstation/) +- [Atomic Desktops for Fedora](https://fedoraproject.org/atomic-desktops/) +- [Fedora Spins](https://fedoraproject.org/spins) +- [Other Distributions](https://containertoolbx.org/distros/), including `Red Hat Enterprise Linux >= 8.`, `Arch Linux`, and `Ubuntu`. + + +## Table of Contents + +- [Prerequisites](#prerequisites) +- [Monitoring NVIDIA CUDA Repositories](#monitoring-nvidia-cuda-repositories) +- [Using the Fedora 39 CUDA Repository](#using-the-fedora-39-cuda-repository) +- [Creating a Fedora Toolbox Environment](#creating-a-fedora-toolbox-environment) +- [Installing Essential Development Tools](#installing-essential-development-tools) +- [Adding the CUDA Repository](#adding-the-cuda-repository) +- [Installing `nvidia-driver-libs`](#installing-nvidia-driver-libs) +- [Manually Resolving Package Conflicts](#manually-resolving-package-conflicts) +- [Finalizing the Installation of `nvidia-driver-libs`](#finalizing-the-installation-of-nvidia-driver-libs) +- [Installing the CUDA Meta-Package](#installing-the-cuda-meta-package) +- [Configuring the Environment](#configuring-the-environment) +- [Verifying the Installation](#verifying-the-installation) +- [Conclusion](#conclusion) +- [Troubleshooting](#troubleshooting) +- [Additional Notes](#additional-notes) +- [References](#references) + +## Prerequisites + +- **Toolbox Installed on the Host System** `Fedora Silverblue` and `Fedora Workstation` both have toolbox by default, other distributions may need to install the [toolbox package](https://containertoolbx.org/install/). +- **NVIDIA Drivers and Graphics Card installed on Host System (optional)** To run CUDA program, such as `llama.cpp`, the host should be setup to access your NVIDIA hardware. Fedora Hosts can use the [RPM Fusion Repository](https://rpmfusion.org/Howto/NVIDIA). +- **Internet connectivity** to download packages. + +### Monitoring NVIDIA CUDA Repositories + +Before proceeding, it is advisable to check if NVIDIA has updated their CUDA repositories for your Fedora version. NVIDIA's repositories can be found at: + +- [Fedora 40 CUDA Repository](https://developer.download.nvidia.com/compute/cuda/repos/fedora40/x86_64/) +- [Fedora 41 CUDA Repository](https://developer.download.nvidia.com/compute/cuda/repos/fedora41/x86_64/) + +As of the latest update, these repositories do not contain the `cuda` meta-package or are missing essential components. + +### Using the Fedora 39 CUDA Repository + +Since the newer repositories are incomplete, we'll use the Fedora 39 repository: + +- [Fedora 39 CUDA Repository](https://developer.download.nvidia.com/compute/cuda/repos/fedora39/x86_64/) + +**Note:** Fedora 39 is no longer maintained, so we recommend using a toolbox environment to prevent system conflicts. + +## Creating a Fedora Toolbox Environment + +This guide focuses on Fedora hosts, but with small adjustments, it can work for other hosts. Using a Fedora 39 toolbox allows us to install the necessary packages without affecting the host system. + +**Note:** Toolbox is available for other systems, and even without Toolbox, it is possible to use Podman or Docker. + +We do not recommend installing on the host system, as Fedora 39 is out-of-maintenance, and instead you should upgrade to a maintained version of Fedora for your host. + +1. **Create a Fedora 39 Toolbox:** + + ```bash + toolbox create --image registry.fedoraproject.org/fedora-toolbox:39 --container fedora-toolbox-39-cuda + ``` + +2. **Enter the Toolbox:** + + ```bash + toolbox enter --container fedora-toolbox-39-cuda + ``` + + Inside the toolbox, you have root privileges and can install packages without affecting the host system. + +## Installing Essential Development Tools + +1. **Synchronize the DNF Package Manager:** + + ```bash + sudo dnf distro-sync + ``` + +2. **Install the Default Text Editor (Optional):** + + ```bash + sudo dnf install vim-default-editor --allowerasing + ``` + + The `--allowerasing` flag resolves any package conflicts. + +3. **Install Development Tools and Libraries:** + + ```bash + sudo dnf install @c-development @development-tools cmake + ``` + + This installs essential packages for compiling software, including `gcc`, `make`, and other development headers. + +## Adding the CUDA Repository + +Add the NVIDIA CUDA repository to your DNF configuration: + +```bash +sudo dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/fedora39/x86_64/cuda-fedora39.repo +``` + +After adding the repository, synchronize the package manager again: + +```bash +sudo dnf distro-sync +``` + +## Installing `nvidia-driver-libs` + +Attempt to install `nvidia-driver-libs`: + +```bash +sudo dnf install nvidia-driver-libs +``` + +**Explanation:** + +- `nvidia-driver-libs` contains necessary NVIDIA driver libraries required by CUDA. +- This step might fail due to conflicts with existing NVIDIA drivers on the host system. + +## Manually Resolving Package Conflicts + +If the installation fails due to conflicts, we'll manually download and install the required packages, excluding conflicting files. + +### 1. Download the `nvidia-driver-libs` RPM + +```bash +sudo dnf download --arch x86_64 nvidia-driver-libs +``` + +You should see a file similar to: + +``` +nvidia-driver-libs-560.35.05-1.fc39.x86_64.rpm +``` + +### 2. Attempt to Install the RPM + +```bash +sudo dnf install nvidia-driver-libs-560.35.05-1.fc39.x86_64.rpm +``` + +**Expected Error:** + +Installation may fail with errors pointing to conflicts with `egl-gbm` and `egl-wayland`. + +**Note: It is important to carefully read the error messages to identify the exact paths that need to be excluded.** + +### 3. Download Dependencies + +```bash +sudo dnf download --arch x86_64 egl-gbm egl-wayland +``` + +### 4. Install `egl-gbm` with Excluded Paths + +Exclude conflicting files during installation: + +```bash +sudo rpm --install --verbose --hash \ + --excludepath=/usr/lib64/libnvidia-egl-gbm.so.1.1.2 \ + --excludepath=/usr/share/egl/egl_external_platform.d/15_nvidia_gbm.json \ + egl-gbm-1.1.2^20240919gitb24587d-3.fc39.x86_64.rpm +``` + +**Explanation:** + +- The `--excludepath` option skips installing files that conflict with existing files. +- Adjust the paths based on the error messages you receive. + +### 5. Install `egl-wayland` with Excluded Paths + +```bash +sudo rpm --install --verbose --hash \ + --excludepath=/usr/share/egl/egl_external_platform.d/10_nvidia_wayland.json \ + egl-wayland-1.1.17^20241118giteeb29e1-5.fc39.x86_64.rpm +``` + +### 6. Install `nvidia-driver-libs` with Excluded Paths + +```bash +sudo rpm --install --verbose --hash \ + --excludepath=/usr/share/glvnd/egl_vendor.d/10_nvidia.json \ + --excludepath=/usr/share/nvidia/nvoptix.bin \ + nvidia-driver-libs-560.35.05-1.fc39.x86_64.rpm +``` + +**Note:** + +- Replace the paths with the ones causing conflicts in your installation if they differ. +- The `--verbose` and `--hash` options provide detailed output during installation. + +## Finalizing the Installation of `nvidia-driver-libs` + +After manually installing the dependencies, run: + +```bash +sudo dnf install nvidia-driver-libs +``` + +You should receive a message indicating the package is already installed: + +``` +Package nvidia-driver-libs-3:560.35.05-1.fc39.x86_64 is already installed. +Dependencies resolved. +Nothing to do. +Complete! +``` + +## Installing the CUDA Meta-Package + +Now that the driver libraries are installed, proceed to install CUDA: + +```bash +sudo dnf install cuda +``` + +This installs the CUDA toolkit and associated packages. + +## Configuring the Environment + +To use CUDA, add its binary directory to your system's `PATH`. + +1. **Create a Profile Script:** + + ```bash + sudo sh -c 'echo "export PATH=\$PATH:/usr/local/cuda/bin" >> /etc/profile.d/cuda.sh' + ``` + + **Explanation:** + + - We add to `/etc/profile.d/` as the `/etc/` folder is unique to this particular container, and is not shared with other containers or the host system. + - The backslash `\` before `$PATH` ensures the variable is correctly written into the script. + +2. **Make the Script Executable:** + + ```bash + sudo chmod +x /etc/profile.d/cuda.sh + ``` + +3. **Source the Script to Update Your Environment:** + + ```bash + source /etc/profile.d/cuda.sh + ``` + + **Note:** This command updates your current shell session with the new `PATH`. The `/etc/profile.d/cuda.sh` script ensures that the CUDA binaries are available in your `PATH` for all future sessions. + +## Verifying the Installation + +To confirm that CUDA is correctly installed and configured, check the version of the NVIDIA CUDA Compiler (`nvcc`): + +```bash +nvcc --version +``` + +You should see output similar to: + +``` +nvcc: NVIDIA (R) Cuda compiler driver +Copyright (c) 2005-2024 NVIDIA Corporation +Built on Tue_Oct_29_23:50:19_PDT_2024 +Cuda compilation tools, release 12.6, V12.6.85 +Build cuda_12.6.r12.6/compiler.35059454_0 +``` + +This output confirms that the CUDA compiler is accessible and indicates the installed version. + +## Conclusion + +You have successfully set up CUDA on Fedora within a toolbox environment using the Fedora 39 CUDA repository. By manually resolving package conflicts and configuring the environment, you can develop CUDA applications without affecting your host system. + +## Troubleshooting + +- **Installation Failures:** + - If you encounter errors during installation, carefully read the error messages. They often indicate conflicting files or missing dependencies. + - Use the `--excludepath` option with `rpm` to exclude conflicting files during manual installations. + +- **Driver Conflicts:** + - Since the host system may already have NVIDIA drivers installed, conflicts can arise. Using the toolbox environment helps isolate these issues. + +- **Environment Variables Not Set:** + - If `nvcc` is not found after installation, ensure that `/usr/local/cuda/bin` is in your `PATH`. + - Run `echo $PATH` to check if the path is included. + - Re-source the profile script or open a new terminal session. + +## Additional Notes + +- **Updating CUDA in the Future:** + - Keep an eye on the official NVIDIA repositories for updates to your Fedora version. + - When an updated repository becomes available, adjust your `dnf` configuration accordingly. + +- **Building `llama.cpp`:** + - With CUDA installed, you can follow these [build instructions for `llama.cpp`](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md) to compile it with CUDA support. + - Ensure that any CUDA-specific build flags or paths are correctly set in your build configuration. + +- **Using the Toolbox Environment:** + - The toolbox environment is isolated from your host system, which helps prevent conflicts. + - Remember that system files and configurations inside the toolbox are separate from the host. By default the home directory of the user is shared between the host and the toolbox. + +--- + +**Disclaimer:** Manually installing and modifying system packages can lead to instability of the container. The above steps are provided as a guideline and may need adjustments based on your specific system configuration. Always back up important data before making significant system changes, especially as your home folder is writable and shared with he toolbox. + +**Acknowledgments:** Special thanks to the Fedora community and NVIDIA documentation for providing resources that assisted in creating this guide. + +## References + +- [Fedora Toolbox Documentation](https://docs.fedoraproject.org/en-US/fedora-silverblue/toolbox/) +- [NVIDIA CUDA Installation Guide](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html) +- [Podman Documentation](https://podman.io/get-started) + +--- diff --git a/docs/development/HOWTO-add-model.md b/docs/development/HOWTO-add-model.md index 04c5ccbbe..8fcd70811 100644 --- a/docs/development/HOWTO-add-model.md +++ b/docs/development/HOWTO-add-model.md @@ -28,7 +28,7 @@ The required steps to implement for an HF model are: ```python @Model.register("MyModelForCausalLM") class MyModel(Model): - model_arch = gguf.MODEL_ARCH.GROK + model_arch = gguf.MODEL_ARCH.MYMODEL ``` 2. Define the layout of the GGUF tensors in [constants.py](/gguf-py/gguf/constants.py) @@ -79,14 +79,14 @@ Depending on the model configuration, tokenizer, code and tensors layout, you wi - `Model#set_vocab` - `Model#write_tensors` -NOTE: Tensor names must end with `.weight` suffix, that is the convention and several tools like `quantize` expect this to proceed the weights. +NOTE: Tensor names must end with `.weight` or `.bias` suffixes, that is the convention and several tools like `quantize` expect this to proceed the weights. ### 2. Define the model architecture in `llama.cpp` The model params and tensors layout must be defined in `llama.cpp`: 1. Define a new `llm_arch` 2. Define the tensors layout in `LLM_TENSOR_NAMES` -3. Add any non standard metadata in `llm_load_hparams` +3. Add any non-standard metadata in `llm_load_hparams` 4. Create the tensors for inference in `llm_load_tensors` 5. If the model has a RoPE operation, add the rope type in `llama_rope_type` @@ -96,9 +96,9 @@ NOTE: The dimensions in `ggml` are typically in the reverse order of the `pytorc This is the funniest part, you have to provide the inference graph implementation of the new model architecture in `llama_build_graph`. -Have a look at existing implementation like `build_llama`, `build_dbrx` or `build_bert`. +Have a look at existing implementations like `build_llama`, `build_dbrx` or `build_bert`. -When implementing a new graph, please note that the underlying `ggml` backends might not support them all, support for missing backend operations can be added in another PR. +Some `ggml` backends do not support all operations. Backend implementations can be added in a separate PR. Note: to debug the inference graph: you can use [llama-eval-callback](/examples/eval-callback/). diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp index a3b21ad6b..0659ab6f1 100644 --- a/examples/batched-bench/batched-bench.cpp +++ b/examples/batched-bench/batched-bench.cpp @@ -38,7 +38,7 @@ int main(int argc, char ** argv) { llama_model_params model_params = common_model_params_to_llama(params); - llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params); + llama_model * model = llama_model_load_from_file(params.model.c_str(), model_params); if (model == NULL) { fprintf(stderr , "%s: error: unable to load model\n" , __func__); @@ -50,7 +50,7 @@ int main(int argc, char ** argv) { // ensure enough sequences are available ctx_params.n_seq_max = n_pl.empty() ? 1 : *std::max_element(n_pl.begin(), n_pl.end()); - llama_context * ctx = llama_new_context_with_model(model, ctx_params); + llama_context * ctx = llama_init_from_model(model, ctx_params); if (ctx == NULL) { fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__); @@ -194,7 +194,7 @@ int main(int argc, char ** argv) { llama_batch_free(batch); llama_free(ctx); - llama_free_model(model); + llama_model_free(model); llama_backend_free(); diff --git a/examples/batched.swift/Sources/main.swift b/examples/batched.swift/Sources/main.swift index 10f2e7fd1..371917b2e 100644 --- a/examples/batched.swift/Sources/main.swift +++ b/examples/batched.swift/Sources/main.swift @@ -23,12 +23,12 @@ defer { } let model_params = llama_model_default_params() -guard let model = llama_load_model_from_file(modelPath.cString(using: .utf8), model_params) else { +guard let model = llama_model_load_from_file(modelPath.cString(using: .utf8), model_params) else { print("Failed to load model") exit(1) } defer { - llama_free_model(model) + llama_model_free(model) } var tokens = tokenize(text: prompt, add_bos: true) @@ -141,7 +141,7 @@ while n_cur <= n_len { let new_token_id = llama_sampler_sample(smpl, context, i_batch[i]) // is it an end of stream? -> mark the stream as finished - if llama_token_is_eog(model, new_token_id) || n_cur == n_len { + if llama_vocab_is_eog(model, new_token_id) || n_cur == n_len { i_batch[i] = -1 // print("") if n_parallel > 1 { diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp index e2e01f2d5..21b95ef5e 100644 --- a/examples/batched/batched.cpp +++ b/examples/batched/batched.cpp @@ -41,17 +41,19 @@ int main(int argc, char ** argv) { llama_model_params model_params = common_model_params_to_llama(params); - llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params); + llama_model * model = llama_model_load_from_file(params.model.c_str(), model_params); if (model == NULL) { LOG_ERR("%s: error: unable to load model\n" , __func__); return 1; } + const llama_vocab * vocab = llama_model_get_vocab(model); + // tokenize the prompt std::vector tokens_list; - tokens_list = common_tokenize(model, params.prompt, true); + tokens_list = common_tokenize(vocab, params.prompt, true); const int n_kv_req = tokens_list.size() + (n_predict - tokens_list.size())*n_parallel; @@ -62,7 +64,7 @@ int main(int argc, char ** argv) { ctx_params.n_ctx = n_kv_req; ctx_params.n_batch = std::max(n_predict, n_parallel); - llama_context * ctx = llama_new_context_with_model(model, ctx_params); + llama_context * ctx = llama_init_from_model(model, ctx_params); auto sparams = llama_sampler_chain_default_params(); sparams.no_perf = false; @@ -120,8 +122,8 @@ int main(int argc, char ** argv) { } llama_token decoder_start_token_id = llama_model_decoder_start_token(model); - if (decoder_start_token_id == -1) { - decoder_start_token_id = llama_token_bos(model); + if (decoder_start_token_id == LLAMA_TOKEN_NULL) { + decoder_start_token_id = llama_vocab_bos(vocab); } common_batch_clear(batch); @@ -174,7 +176,7 @@ int main(int argc, char ** argv) { const llama_token new_token_id = llama_sampler_sample(smpl, ctx, i_batch[i]); // is it an end of generation? -> mark the stream as finished - if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) { + if (llama_vocab_is_eog(vocab, new_token_id) || n_cur == n_predict) { i_batch[i] = -1; LOG("\n"); if (n_parallel > 1) { @@ -236,7 +238,7 @@ int main(int argc, char ** argv) { llama_sampler_free(smpl); llama_free(ctx); - llama_free_model(model); + llama_model_free(model); llama_backend_free(); diff --git a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp index 988a584c9..bdf0eed2a 100644 --- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp @@ -1,4 +1,6 @@ #include "ggml.h" +#include "gguf.h" + #include "llama.h" #include "common.h" #include "log.h" @@ -434,12 +436,12 @@ static void print_matrix(struct ggml_tensor * probs) { } } -struct llama_file { +struct my_llama_file { // use FILE * so we don't have to re-open the file to mmap FILE * fp; size_t size; - llama_file(const char * fname, const char * mode) { + my_llama_file(const char * fname, const char * mode) { fp = std::fopen(fname, mode); if (fp == NULL) { size = 0; @@ -500,7 +502,7 @@ struct llama_file { return std::string(chars.data(), len); } - ~llama_file() { + ~my_llama_file() { if (fp) { std::fclose(fp); } @@ -508,7 +510,7 @@ struct llama_file { }; static bool is_ggml_file(const char * filename) { - llama_file file(filename, "rb"); + my_llama_file file(filename, "rb"); if (file.size < 4) { return false; } @@ -576,7 +578,7 @@ static void load_vocab(const char * filename, const Config * config, struct my_l } else { // assume llama2.c vocabulary LOG_INF("%s: Assuming llama2.c vocabulary since %s is not a gguf file\n", __func__, filename); - llama_file file(filename, "rb"); + my_llama_file file(filename, "rb"); if (!file.fp) { die_fmt("%s: %s", strerror(errno), filename); } @@ -689,8 +691,8 @@ static void save_as_llama_model( gguf_set_val_u32(ctx, KV_TOKENIZER_UNK_ID, UNKNOWN_TOKEN_ID); gguf_set_val_u32(ctx, KV_TOKENIZER_BOS_ID, BOS_TOKEN_ID); gguf_set_val_u32(ctx, KV_TOKENIZER_EOS_ID, EOS_TOKEN_ID); - gguf_set_val_u32(ctx, KV_TOKENIZER_SEP_ID, -1); - gguf_set_val_u32(ctx, KV_TOKENIZER_PAD_ID, -1); + gguf_set_val_u32(ctx, KV_TOKENIZER_SEP_ID, LLAMA_TOKEN_NULL); + gguf_set_val_u32(ctx, KV_TOKENIZER_PAD_ID, LLAMA_TOKEN_NULL); gguf_set_val_u32(ctx, KV_CONTEXT_LENGTH, model->hparams.n_ctx); gguf_set_val_u32(ctx, KV_EMBEDDING_LENGTH, model->hparams.n_embd); @@ -909,7 +911,7 @@ int main(int argc, char ** argv) { load_vocab(params.fn_vocab_model, &config, &vocab); struct my_llama_model model; - model.hparams.n_vocab = config.vocab_size; //llama_n_vocab(lctx); + model.hparams.n_vocab = config.vocab_size; //llama_vocab_n_vocab(lctx); model.hparams.n_ctx = params.n_ctx; model.hparams.n_embd = config.dim; //params.n_embd; model.hparams.n_ff = config.hidden_dim; diff --git a/examples/cvector-generator/cvector-generator.cpp b/examples/cvector-generator/cvector-generator.cpp index d1731bba6..413b71d34 100644 --- a/examples/cvector-generator/cvector-generator.cpp +++ b/examples/cvector-generator/cvector-generator.cpp @@ -1,7 +1,9 @@ +#include "ggml.h" +#include "gguf.h" + #include "arg.h" #include "common.h" #include "llama.h" -#include "ggml.h" #include "pca.hpp" #include "mean.hpp" @@ -271,7 +273,9 @@ struct tokenized_prompt { size_t max_seq_len; tokenized_prompt(llama_context * ctx, std::string pos, std::string neg) { - const bool add_bos = llama_add_bos_token(llama_get_model(ctx)); + const llama_model * model = llama_get_model(ctx); + const llama_vocab * vocab = llama_model_get_vocab(model); + const bool add_bos = llama_vocab_get_add_bos(vocab); tokens_pos = common_tokenize(ctx, pos, add_bos, true); tokens_neg = common_tokenize(ctx, neg, add_bos, true); max_seq_len = std::max(tokens_pos.size(), tokens_neg.size()); @@ -415,12 +419,13 @@ int main(int argc, char ** argv) { // load the model to get hparams common_init_result llama_init = common_init_from_params(params); - llama_model * model = llama_init.model; - llama_context * ctx = llama_init.context; + llama_model * model = llama_init.model.get(); + llama_context * ctx = llama_init.context.get(); // int n_ctx = llama_n_ctx(ctx); - int n_layers = llama_n_layer(model); - int n_embd = llama_n_embd(model); + int n_layers = llama_model_n_layer(model); + int n_embd = llama_model_n_embd(model); + // get model hint param (a.k.a model arch name) char model_hint[128]; llama_model_meta_val_str(model, "general.architecture", model_hint, 128); @@ -474,8 +479,6 @@ int main(int argc, char ** argv) { // done with the model, we can now free it to make gain some memory printf("Done evaluate prompts, unload model...\n"); - llama_free(ctx); - llama_free_model(model); bool use_pca = params.cvector_dimre_method == DIMRE_METHOD_PCA; diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index 3f18fc6a7..38d22c90f 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -97,14 +97,17 @@ int main(int argc, char ** argv) { // load the model common_init_result llama_init = common_init_from_params(params); - llama_model * model = llama_init.model; - llama_context * ctx = llama_init.context; + llama_model * model = llama_init.model.get(); + llama_context * ctx = llama_init.context.get(); + if (model == NULL) { LOG_ERR("%s: unable to load model\n", __func__); return 1; } - const int n_ctx_train = llama_n_ctx_train(model); + const llama_vocab * vocab = llama_model_get_vocab(model); + + const int n_ctx_train = llama_model_n_ctx_train(model); const int n_ctx = llama_n_ctx(ctx); const enum llama_pooling_type pooling_type = llama_pooling_type(ctx); @@ -147,7 +150,7 @@ int main(int argc, char ** argv) { // check if the last token is SEP // it should be automatically added by the tokenizer when 'tokenizer.ggml.add_eos_token' is set to 'true' for (auto & inp : inputs) { - if (inp.empty() || inp.back() != llama_token_sep(model)) { + if (inp.empty() || inp.back() != llama_vocab_sep(vocab)) { LOG_WRN("%s: last token in the prompt is not SEP\n", __func__); LOG_WRN("%s: 'tokenizer.ggml.add_eos_token' should be set to 'true' in the GGUF header\n", __func__); } @@ -180,7 +183,7 @@ int main(int argc, char ** argv) { } // allocate output - const int n_embd = llama_n_embd(model); + const int n_embd = llama_model_n_embd(model); std::vector embeddings(n_embd_count * n_embd, 0); float * emb = embeddings.data(); @@ -316,8 +319,6 @@ int main(int argc, char ** argv) { // clean up llama_batch_free(batch); - llama_free(ctx); - llama_free_model(model); llama_backend_free(); return 0; diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp index c08e3e5f6..fb188f5a9 100644 --- a/examples/eval-callback/eval-callback.cpp +++ b/examples/eval-callback/eval-callback.cpp @@ -127,7 +127,10 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) { } static bool run(llama_context * ctx, const common_params & params) { - const bool add_bos = llama_add_bos_token(llama_get_model(ctx)); + const llama_model * model = llama_get_model(ctx); + const llama_vocab * vocab = llama_model_get_vocab(model); + + const bool add_bos = llama_vocab_get_add_bos(vocab); std::vector tokens = common_tokenize(ctx, params.prompt, add_bos); @@ -162,8 +165,9 @@ int main(int argc, char ** argv) { // init common_init_result llama_init = common_init_from_params(params); - llama_model * model = llama_init.model; - llama_context * ctx = llama_init.context; + llama_model * model = llama_init.model.get(); + llama_context * ctx = llama_init.context.get(); + if (model == nullptr || ctx == nullptr) { LOG_ERR("%s : failed to init\n", __func__); return 1; @@ -184,9 +188,6 @@ int main(int argc, char ** argv) { LOG("\n"); llama_perf_context_print(ctx); - llama_free(ctx); - llama_free_model(model); - llama_backend_free(); return 0; diff --git a/examples/export-lora/export-lora.cpp b/examples/export-lora/export-lora.cpp index 058b5cc86..99063b5d5 100644 --- a/examples/export-lora/export-lora.cpp +++ b/examples/export-lora/export-lora.cpp @@ -1,12 +1,13 @@ -#include "arg.h" -#include "common.h" #include "ggml.h" #include "ggml-alloc.h" +#include "gguf.h" + +#include "arg.h" +#include "common.h" #include #include #include -#include #include static bool g_verbose = false; @@ -128,7 +129,7 @@ struct lora_merge_ctx { lora_merge_ctx( std::string & base_fname, - std::vector & lora_files, + std::vector & lora_files, std::string & outfile, int n_threads) : base_model(base_fname, 0), n_threads(n_threads), fout(outfile, std::ios::binary) { fout.exceptions(std::ofstream::failbit); // fail fast on write errors diff --git a/examples/gguf-hash/gguf-hash.cpp b/examples/gguf-hash/gguf-hash.cpp index e96c75117..9523ec122 100644 --- a/examples/gguf-hash/gguf-hash.cpp +++ b/examples/gguf-hash/gguf-hash.cpp @@ -1,4 +1,5 @@ #include "ggml.h" +#include "gguf.h" #include /* abort() */ #include diff --git a/examples/gguf-split/gguf-split.cpp b/examples/gguf-split/gguf-split.cpp index 75f63f938..ef3ceb686 100644 --- a/examples/gguf-split/gguf-split.cpp +++ b/examples/gguf-split/gguf-split.cpp @@ -1,18 +1,19 @@ +#include "ggml.h" +#include "gguf.h" #include "llama.h" #include "common.h" #include -#include +#include +#include +#include #include +#include +#include #include #include #include -#include -#include -#include -#include - #if defined(_WIN32) #include #ifndef PATH_MAX @@ -297,7 +298,7 @@ struct split_strategy { total_size += ggml_nbytes(t); } total_size = total_size / 1000 / 1000; // convert to megabytes - printf("split %05d: n_tensors = %d, total_size = %zuM\n", i_split + 1, gguf_get_n_tensors(ctx_out), total_size); + printf("split %05d: n_tensors = %" PRIi64 ", total_size = %zuM\n", i_split + 1, gguf_get_n_tensors(ctx_out), total_size); i_split++; } } diff --git a/examples/gguf/gguf.cpp b/examples/gguf/gguf.cpp index 7498f85ef..f31989c8c 100644 --- a/examples/gguf/gguf.cpp +++ b/examples/gguf/gguf.cpp @@ -1,10 +1,9 @@ #include "ggml.h" +#include "gguf.h" #include -#include #include #include -#include #include #undef MIN @@ -135,9 +134,10 @@ static bool gguf_ex_read_0(const std::string & fname) { for (int i = 0; i < n_tensors; ++i) { const char * name = gguf_get_tensor_name (ctx, i); + const size_t size = gguf_get_tensor_size (ctx, i); const size_t offset = gguf_get_tensor_offset(ctx, i); - printf("%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset); + printf("%s: tensor[%d]: name = %s, size = %zu, offset = %zu\n", __func__, i, name, size, offset); } } @@ -182,9 +182,10 @@ static bool gguf_ex_read_1(const std::string & fname, bool check_data) { for (int i = 0; i < n_tensors; ++i) { const char * name = gguf_get_tensor_name (ctx, i); + const size_t size = gguf_get_tensor_size (ctx, i); const size_t offset = gguf_get_tensor_offset(ctx, i); - printf("%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset); + printf("%s: tensor[%d]: name = %s, size = %zu, offset = %zu\n", __func__, i, name, size, offset); } } @@ -199,7 +200,8 @@ static bool gguf_ex_read_1(const std::string & fname, bool check_data) { struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name); - printf("%s: tensor[%d]: n_dims = %d, name = %s, data = %p\n", __func__, i, ggml_n_dims(cur), cur->name, cur->data); + printf("%s: tensor[%d]: n_dims = %d, ne = (%d, %d, %d, %d), name = %s, data = %p\n", + __func__, i, ggml_n_dims(cur), int(cur->ne[0]), int(cur->ne[1]), int(cur->ne[2]), int(cur->ne[3]), cur->name, cur->data); // print first 10 elements const float * data = (const float *) cur->data; @@ -215,7 +217,7 @@ static bool gguf_ex_read_1(const std::string & fname, bool check_data) { const float * data = (const float *) cur->data; for (int j = 0; j < ggml_nelements(cur); ++j) { if (data[j] != 100 + i) { - fprintf(stderr, "%s: tensor[%d]: data[%d] = %f\n", __func__, i, j, data[j]); + fprintf(stderr, "%s: tensor[%d], data[%d]: found %f, expected %f\n", __func__, i, j, data[j], float(100 + i)); gguf_free(ctx); return false; } @@ -245,6 +247,8 @@ int main(int argc, char ** argv) { check_data = false; } + srand(123456); + const std::string fname(argv[1]); const std::string mode (argv[2]); diff --git a/examples/gritlm/gritlm.cpp b/examples/gritlm/gritlm.cpp index 18a945b33..72eb46257 100644 --- a/examples/gritlm/gritlm.cpp +++ b/examples/gritlm/gritlm.cpp @@ -11,6 +11,7 @@ static std::vector> encode(llama_context * ctx, const std::ve std::vector> result; const llama_model * model = llama_get_model(ctx); + const llama_vocab * vocab = llama_model_get_vocab(model); llama_batch batch = llama_batch_init(llama_n_batch(ctx), 0, 1); @@ -19,16 +20,16 @@ static std::vector> encode(llama_context * ctx, const std::ve const std::string input_string = instruction + sentences[i]; - std::vector inputs = common_tokenize(model, input_string, true, false); + std::vector inputs = common_tokenize(vocab, input_string, true, false); const int32_t n_toks = inputs.size(); // GritLM seems to have EOS = "" // https://github.com/ContextualAI/gritlm/blob/92025b16534712b31b3c4aaaf069350e222bd5f8/gritlm/gritlm.py#L18 - // inputs.push_back(llama_token_eos(model)); + // inputs.push_back(llama_vocab_eos(vocab)); // we want to ignore instruction tokens for mean pooling - const int32_t n_inst = common_tokenize(model, instruction, true, false).size(); + const int32_t n_inst = common_tokenize(vocab, instruction, true, false).size(); #ifdef GRIT_DEBUG // debug tokens - should be matching as referenced in the GritLM sample @@ -52,7 +53,7 @@ static std::vector> encode(llama_context * ctx, const std::ve llama_decode(ctx, batch); // get embedding dimensions - uint64_t n_embd = llama_n_embd(model); + uint64_t n_embd = llama_model_n_embd(model); // allocate embedding output std::vector emb_unorm(n_embd, 0.0f); @@ -97,7 +98,9 @@ static std::string generate(llama_context * ctx, llama_sampler * smpl, const std std::string result; const llama_model * model = llama_get_model(ctx); - llama_token eos_token = llama_token_eos(model); + const llama_vocab * vocab = llama_model_get_vocab(model); + + llama_token eos_token = llama_vocab_eos(vocab); llama_kv_cache_clear(ctx); llama_set_embeddings(ctx, false); @@ -105,7 +108,7 @@ static std::string generate(llama_context * ctx, llama_sampler * smpl, const std llama_batch bat = llama_batch_init(llama_n_batch(ctx), 0, 1); - std::vector inputs = common_tokenize(model, prompt, false, true); + std::vector inputs = common_tokenize(vocab, prompt, false, true); int32_t i_current_token = 0; while (true) { @@ -165,10 +168,10 @@ int main(int argc, char * argv[]) { llama_backend_init(); - llama_model * model = llama_load_model_from_file(params.model.c_str(), mparams); + llama_model * model = llama_model_load_from_file(params.model.c_str(), mparams); // create generation context - llama_context * ctx = llama_new_context_with_model(model, cparams); + llama_context * ctx = llama_init_from_model(model, cparams); auto sparams = llama_sampler_chain_default_params(); @@ -197,7 +200,7 @@ int main(int argc, char * argv[]) { const std::vector> d_rep = encode(ctx, documents, gritlm_instruction("")); const std::vector> q_rep = encode(ctx, queries, gritlm_instruction(instruction)); - const int n_embd = llama_n_embd(model); + const int n_embd = llama_model_n_embd(model); const float cosine_sim_q0_d0 = common_embd_similarity_cos(q_rep[0].data(), d_rep[0].data(), n_embd); const float cosine_sim_q0_d1 = common_embd_similarity_cos(q_rep[0].data(), d_rep[1].data(), n_embd); @@ -219,7 +222,7 @@ int main(int argc, char * argv[]) { llama_sampler_free(smpl); llama_free(ctx); - llama_free_model(model); + llama_model_free(model); llama_backend_free(); return 0; diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp index 45206f4a7..b5f3feb9f 100644 --- a/examples/imatrix/imatrix.cpp +++ b/examples/imatrix/imatrix.cpp @@ -7,7 +7,6 @@ #include #include #include -#include #include #include #include @@ -40,7 +39,7 @@ public: void set_params(common_params params) { m_params = std::move(params); } bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data); void save_imatrix(int ncall = -1) const; - bool load_imatrix(const char * file_name); + bool load_imatrix(const char * fname); private: std::unordered_map m_stats; common_params m_params; @@ -429,10 +428,14 @@ static void process_logits( } static bool compute_imatrix(llama_context * ctx, const common_params & params) { - const bool add_bos = llama_add_bos_token(llama_get_model(ctx)); - GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx))); + const llama_model * model = llama_get_model(ctx); + const llama_vocab * vocab = llama_model_get_vocab(model); + + const bool add_bos = llama_vocab_get_add_bos(vocab); const int n_ctx = llama_n_ctx(ctx); + GGML_ASSERT(!llama_vocab_get_add_eos(vocab)); + auto tim1 = std::chrono::high_resolution_clock::now(); LOG_INF("%s: tokenizing the input ..\n", __func__); @@ -467,7 +470,7 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) { const int n_chunk_max = tokens.size() / n_ctx; const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max); - const int n_vocab = llama_n_vocab(llama_get_model(ctx)); + const int n_vocab = llama_vocab_n_tokens(vocab); const int n_batch = params.n_batch; int count = 0; @@ -507,7 +510,7 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) { // add BOS token for the first batch of each chunk if (add_bos && j == 0) { - tokens[batch_start] = llama_token_bos(llama_get_model(ctx)); + tokens[batch_start] = llama_vocab_bos(vocab); } common_batch_clear(batch); @@ -618,14 +621,15 @@ int main(int argc, char ** argv) { // init common_init_result llama_init = common_init_from_params(params); - llama_model * model = llama_init.model; - llama_context * ctx = llama_init.context; + llama_model * model = llama_init.model.get(); + llama_context * ctx = llama_init.context.get(); + if (model == nullptr || ctx == nullptr) { LOG_ERR("%s : failed to init\n", __func__); return 1; } - const int n_ctx_train = llama_n_ctx_train(model); + const int n_ctx_train = llama_model_n_ctx_train(model); if (params.n_ctx > n_ctx_train) { LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, params.n_ctx); @@ -655,9 +659,6 @@ int main(int argc, char ** argv) { LOG("\n"); llama_perf_context_print(ctx); - llama_free(ctx); - llama_free_model(model); - llama_backend_free(); return 0; diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp index ef7008957..489a208b6 100644 --- a/examples/infill/infill.cpp +++ b/examples/infill/infill.cpp @@ -131,15 +131,17 @@ int main(int argc, char ** argv) { LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__); common_init_result llama_init = common_init_from_params(params); - model = llama_init.model; - ctx = llama_init.context; + model = llama_init.model.get(); + ctx = llama_init.context.get(); if (model == NULL) { LOG_ERR("%s: unable to load model\n", __func__); return 1; } - const int n_ctx_train = llama_n_ctx_train(model); + const llama_vocab * vocab = llama_model_get_vocab(model); + + const int n_ctx_train = llama_model_n_ctx_train(model); const int n_ctx = llama_n_ctx(ctx); LOG_DBG("n_ctx: %d\n", n_ctx); @@ -152,28 +154,28 @@ int main(int argc, char ** argv) { LOG_INF("\n"); LOG_INF("%s\n", common_params_get_system_info(params).c_str()); } - const bool add_bos = llama_add_bos_token(model); - GGML_ASSERT(!llama_add_eos_token(model)); + const bool add_bos = llama_vocab_get_add_bos(vocab); + GGML_ASSERT(!llama_vocab_get_add_eos(vocab)); std::vector embd_inp; std::vector embd_end; std::vector inp_pfx = common_tokenize(ctx, params.input_prefix, false); std::vector inp_sfx = common_tokenize(ctx, params.input_suffix, false); - GGML_ASSERT(llama_token_fim_pre(model) >= 0); - GGML_ASSERT(llama_token_fim_suf(model) >= 0); + GGML_ASSERT(llama_vocab_fim_pre(vocab) >= 0); + GGML_ASSERT(llama_vocab_fim_suf(vocab) >= 0); - inp_pfx.insert(inp_pfx.begin(), llama_token_fim_pre(model)); - inp_sfx.insert(inp_sfx.begin(), llama_token_fim_suf(model)); + inp_pfx.insert(inp_pfx.begin(), llama_vocab_fim_pre(vocab)); + inp_sfx.insert(inp_sfx.begin(), llama_vocab_fim_suf(vocab)); embd_inp = params.spm_infill ? inp_sfx : inp_pfx; embd_end = params.spm_infill ? inp_pfx : inp_sfx; if (add_bos) { - embd_inp.insert(embd_inp.begin(), llama_token_bos(model)); + embd_inp.insert(embd_inp.begin(), llama_vocab_bos(vocab)); } embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end()); - const llama_token middle_token = llama_token_fim_mid(model); + const llama_token middle_token = llama_vocab_fim_mid(vocab); if (middle_token >= 0) { embd_inp.push_back(middle_token); } @@ -185,7 +187,7 @@ int main(int argc, char ** argv) { // Should not run without any tokens if (embd_inp.empty()) { - embd_inp.push_back(llama_token_bos(model)); + embd_inp.push_back(llama_vocab_bos(vocab)); LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str()); } @@ -420,10 +422,10 @@ int main(int argc, char ** argv) { // if not currently processing queued inputs; if ((int) embd_inp.size() <= n_consumed) { // deal with eot token in infill mode - if ((common_sampler_last(smpl) == llama_token_eot(model) || is_interacting) && params.interactive){ + if ((common_sampler_last(smpl) == llama_vocab_eot(vocab) || is_interacting) && params.interactive){ if (is_interacting && !params.interactive_first) { // print an eot token - LOG("%s", common_token_to_piece(ctx, llama_token_eot(model)).c_str()); + LOG("%s", common_token_to_piece(ctx, llama_vocab_eot(vocab)).c_str()); } LOG("\n"); console::set_display(console::user_input); @@ -463,13 +465,13 @@ int main(int argc, char ** argv) { std::vector inp_pfx = common_tokenize(ctx, params.input_prefix, false); std::vector inp_sfx = common_tokenize(ctx, params.input_suffix, false); - inp_pfx.insert(inp_pfx.begin(), llama_token_fim_pre(model)); - inp_sfx.insert(inp_sfx.begin(), llama_token_fim_suf(model)); + inp_pfx.insert(inp_pfx.begin(), llama_vocab_fim_pre(vocab)); + inp_sfx.insert(inp_sfx.begin(), llama_vocab_fim_suf(vocab)); embd_inp = params.spm_infill ? inp_sfx : inp_pfx; embd_end = params.spm_infill ? inp_pfx : inp_sfx; if (add_bos) { - embd_inp.insert(embd_inp.begin(), llama_token_bos(model)); + embd_inp.insert(embd_inp.begin(), llama_vocab_bos(vocab)); } embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end()); @@ -484,7 +486,7 @@ int main(int argc, char ** argv) { is_interacting = false; } // deal with end of generation tokens in interactive mode - else if (llama_token_is_eog(model, common_sampler_last(smpl))) { + else if (llama_vocab_is_eog(vocab, common_sampler_last(smpl))) { LOG_DBG("found EOS token\n"); if (params.interactive) { @@ -500,7 +502,7 @@ int main(int argc, char ** argv) { if (params.input_prefix_bos) { LOG_DBG("adding input prefix BOS token\n"); - embd_inp.push_back(llama_token_bos(model)); + embd_inp.push_back(llama_vocab_bos(vocab)); } std::string buffer; @@ -563,7 +565,7 @@ int main(int argc, char ** argv) { } // end of generation - if (!embd.empty() && llama_token_is_eog(model, embd.back()) && !params.interactive) { + if (!embd.empty() && llama_vocab_is_eog(vocab, embd.back()) && !params.interactive) { break; } @@ -575,15 +577,12 @@ int main(int argc, char ** argv) { } } if (!params.interactive && n_remain <= 0) { - LOG("%s", common_token_to_piece(ctx, llama_token_eot(model)).c_str()); + LOG("%s", common_token_to_piece(ctx, llama_vocab_eot(vocab)).c_str()); } LOG("\n"); common_perf_print(ctx, smpl); - llama_free(ctx); - llama_free_model(model); - common_sampler_free(smpl); llama_backend_free(); diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 2338ad106..a3b4c5ac8 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -1401,7 +1401,8 @@ static void test_prompt(llama_context * ctx, int n_prompt, int n_batch, int n_th llama_set_n_threads(ctx, n_threads, n_threads); const llama_model * model = llama_get_model(ctx); - const int32_t n_vocab = llama_n_vocab(model); + const llama_vocab * vocab = llama_model_get_vocab(model); + const int32_t n_vocab = llama_vocab_n_tokens(vocab); std::vector tokens(n_batch); @@ -1409,7 +1410,7 @@ static void test_prompt(llama_context * ctx, int n_prompt, int n_batch, int n_th while (n_processed < n_prompt) { int n_tokens = std::min(n_prompt - n_processed, n_batch); - tokens[0] = n_processed == 0 && llama_add_bos_token(model) ? llama_token_bos(model) : std::rand() % n_vocab; + tokens[0] = n_processed == 0 && llama_vocab_get_add_bos(vocab) ? llama_vocab_bos(vocab) : std::rand() % n_vocab; for (int i = 1; i < n_tokens; i++) { tokens[i] = std::rand() % n_vocab; } @@ -1424,9 +1425,10 @@ static void test_gen(llama_context * ctx, int n_gen, int n_threads) { llama_set_n_threads(ctx, n_threads, n_threads); const llama_model * model = llama_get_model(ctx); - const int32_t n_vocab = llama_n_vocab(model); + const llama_vocab * vocab = llama_model_get_vocab(model); + const int32_t n_vocab = llama_vocab_n_tokens(vocab); - llama_token token = llama_add_bos_token(model) ? llama_token_bos(model) : std::rand() % n_vocab; + llama_token token = llama_vocab_get_add_bos(vocab) ? llama_vocab_bos(vocab) : std::rand() % n_vocab; for (int i = 0; i < n_gen; i++) { llama_decode(ctx, llama_batch_get_one(&token, 1)); @@ -1526,10 +1528,10 @@ int main(int argc, char ** argv) { // keep the same model between tests when possible if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) { if (lmodel) { - llama_free_model(lmodel); + llama_model_free(lmodel); } - lmodel = llama_load_model_from_file(inst.model.c_str(), inst.to_llama_mparams()); + lmodel = llama_model_load_from_file(inst.model.c_str(), inst.to_llama_mparams()); if (lmodel == NULL) { fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, inst.model.c_str()); return 1; @@ -1537,10 +1539,10 @@ int main(int argc, char ** argv) { prev_inst = &inst; } - llama_context * ctx = llama_new_context_with_model(lmodel, inst.to_llama_cparams()); + llama_context * ctx = llama_init_from_model(lmodel, inst.to_llama_cparams()); if (ctx == NULL) { fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, inst.model.c_str()); - llama_free_model(lmodel); + llama_model_free(lmodel); return 1; } @@ -1626,7 +1628,7 @@ int main(int argc, char ** argv) { ggml_threadpool_free_fn(threadpool); } - llama_free_model(lmodel); + llama_model_free(lmodel); if (p) { p->print_footer(); diff --git a/examples/llama.android/llama/src/main/cpp/llama-android.cpp b/examples/llama.android/llama/src/main/cpp/llama-android.cpp index b3858ddfb..99b14961d 100644 --- a/examples/llama.android/llama/src/main/cpp/llama-android.cpp +++ b/examples/llama.android/llama/src/main/cpp/llama-android.cpp @@ -87,7 +87,7 @@ Java_android_llama_cpp_LLamaAndroid_load_1model(JNIEnv *env, jobject, jstring fi auto path_to_model = env->GetStringUTFChars(filename, 0); LOGi("Loading model from %s", path_to_model); - auto model = llama_load_model_from_file(path_to_model, model_params); + auto model = llama_model_load_from_file(path_to_model, model_params); env->ReleaseStringUTFChars(filename, path_to_model); if (!model) { @@ -102,7 +102,7 @@ Java_android_llama_cpp_LLamaAndroid_load_1model(JNIEnv *env, jobject, jstring fi extern "C" JNIEXPORT void JNICALL Java_android_llama_cpp_LLamaAndroid_free_1model(JNIEnv *, jobject, jlong model) { - llama_free_model(reinterpret_cast(model)); + llama_model_free(reinterpret_cast(model)); } extern "C" @@ -305,7 +305,9 @@ Java_android_llama_cpp_LLamaAndroid_new_1batch(JNIEnv *, jobject, jint n_tokens, extern "C" JNIEXPORT void JNICALL Java_android_llama_cpp_LLamaAndroid_free_1batch(JNIEnv *, jobject, jlong batch_pointer) { - llama_batch_free(*reinterpret_cast(batch_pointer)); + //llama_batch_free(*reinterpret_cast(batch_pointer)); + const auto batch = reinterpret_cast(batch_pointer); + delete batch; } extern "C" @@ -403,6 +405,7 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop( const auto batch = reinterpret_cast(batch_pointer); const auto sampler = reinterpret_cast(sampler_pointer); const auto model = llama_get_model(context); + const auto vocab = llama_model_get_vocab(model); if (!la_int_var) la_int_var = env->GetObjectClass(intvar_ncur); if (!la_int_var_value) la_int_var_value = env->GetMethodID(la_int_var, "getValue", "()I"); @@ -412,7 +415,7 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop( const auto new_token_id = llama_sampler_sample(sampler, context, -1); const auto n_cur = env->CallIntMethod(intvar_ncur, la_int_var_value); - if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) { + if (llama_vocab_is_eog(vocab, new_token_id) || n_cur == n_len) { return nullptr; } diff --git a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift index 998c673d5..477c3e6f2 100644 --- a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift +++ b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift @@ -52,8 +52,8 @@ actor LlamaContext { deinit { llama_sampler_free(sampling) llama_batch_free(batch) + llama_model_free(model) llama_free(context) - llama_free_model(model) llama_backend_free() } @@ -65,7 +65,7 @@ actor LlamaContext { model_params.n_gpu_layers = 0 print("Running on simulator, force use n_gpu_layers = 0") #endif - let model = llama_load_model_from_file(path, model_params) + let model = llama_model_load_from_file(path, model_params) guard let model else { print("Could not load model at \(path)") throw LlamaError.couldNotInitializeContext @@ -151,7 +151,7 @@ actor LlamaContext { new_token_id = llama_sampler_sample(sampling, context, batch.n_tokens - 1) - if llama_token_is_eog(model, new_token_id) || n_cur == n_len { + if llama_vocab_is_eog(model, new_token_id) || n_cur == n_len { print("\n") is_done = true let new_token_str = String(cString: temporary_invalid_cchars + [0]) diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index 3cd0d2fa8..7a8a3156b 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -7,6 +7,7 @@ #include "ggml-cpu.h" #include "ggml-alloc.h" #include "ggml-backend.h" +#include "gguf.h" //#ifdef GGML_USE_CUDA //#include "ggml-cuda.h" @@ -262,7 +263,7 @@ static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) { { const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i); int arr_n = gguf_get_arr_n(ctx_gguf, i); - const void * data = gguf_get_arr_data(ctx_gguf, i); + const void * data = arr_type == GGUF_TYPE_STRING ? nullptr : gguf_get_arr_data(ctx_gguf, i); std::stringstream ss; ss << "["; for (int j = 0; j < arr_n; j++) { @@ -2734,7 +2735,8 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i total_size_org += orig_size; total_size_new += new_size; gguf_set_tensor_type(ctx_out, name.c_str(), new_type); - gguf_set_tensor_data(ctx_out, name.c_str(), new_data, new_size); + GGML_ASSERT(gguf_get_tensor_size(ctx_out, gguf_find_tensor(ctx_out, name.c_str())) == new_size); + gguf_set_tensor_data(ctx_out, name.c_str(), new_data); fout.write((const char *)new_data, new_size); size_t pad = GGML_PAD(new_size, gguf_get_alignment(ctx_out)) - new_size; for (size_t j = 0; j < pad; ++j) { diff --git a/examples/llava/llava-cli.cpp b/examples/llava/llava-cli.cpp index 2691c6e6b..40aa0876f 100644 --- a/examples/llava/llava-cli.cpp +++ b/examples/llava/llava-cli.cpp @@ -47,8 +47,12 @@ static const char * sample(struct common_sampler * smpl, int * n_past) { const llama_token id = common_sampler_sample(smpl, ctx_llama, -1); common_sampler_accept(smpl, id, true); + + const llama_model * model = llama_get_model(ctx_llama); + const llama_vocab * vocab = llama_model_get_vocab(model); + static std::string ret; - if (llama_token_is_eog(llama_get_model(ctx_llama), id)) { + if (llama_vocab_is_eog(vocab, id)) { ret = ""; } else { ret = common_token_to_piece(ctx_llama, id); @@ -221,7 +225,7 @@ static struct llama_model * llava_init(common_params * params) { llama_model_params model_params = common_model_params_to_llama(*params); - llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params); + llama_model * model = llama_model_load_from_file(params->model.c_str(), model_params); if (model == NULL) { LOG_ERR("%s: unable to load model\n" , __func__); return NULL; @@ -239,11 +243,10 @@ static struct llava_context * llava_init_context(common_params * params, llama_m auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1); - llama_context_params ctx_params = common_context_params_to_llama(*params); ctx_params.n_ctx = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings - llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params); + llama_context * ctx_llama = llama_init_from_model(model, ctx_params); if (ctx_llama == NULL) { LOG_ERR("%s: failed to create the llama_context\n" , __func__); @@ -265,7 +268,7 @@ static void llava_free(struct llava_context * ctx_llava) { } llama_free(ctx_llava->ctx_llama); - llama_free_model(ctx_llava->model); + llama_model_free(ctx_llava->model); llama_backend_free(); } @@ -323,7 +326,7 @@ int main(int argc, char ** argv) { } } - llama_free_model(model); + llama_model_free(model); return 0; } diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp index 16f30c56c..c598caf3d 100644 --- a/examples/llava/llava.cpp +++ b/examples/llava/llava.cpp @@ -384,7 +384,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * ctx_clip) { // make sure that the correct mmproj was used, i.e., compare apples to apples - int n_llama_embd = llama_n_embd(llama_get_model(ctx_llama)); + int n_llama_embd = llama_model_n_embd(llama_get_model(ctx_llama)); auto n_image_embd = clip_n_mmproj_embd(ctx_clip); if (n_image_embd != n_llama_embd) { LOG_ERR("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_image_embd, n_llama_embd); @@ -456,7 +456,7 @@ struct llava_embd_batch { }; bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_embed * image_embed, int n_batch, int * n_past) { - int n_embd = llama_n_embd(llama_get_model(ctx_llama)); + int n_embd = llama_model_n_embd(llama_get_model(ctx_llama)); for (int i = 0; i < image_embed->n_image_pos; i += n_batch) { int n_eval = image_embed->n_image_pos - i; diff --git a/examples/llava/minicpmv-cli.cpp b/examples/llava/minicpmv-cli.cpp index e9cbb51ed..38c44e130 100644 --- a/examples/llava/minicpmv-cli.cpp +++ b/examples/llava/minicpmv-cli.cpp @@ -31,7 +31,7 @@ static struct llama_model * llava_init(common_params * params) { llama_model_params model_params = common_model_params_to_llama(*params); - llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params); + llama_model * model = llama_model_load_from_file(params->model.c_str(), model_params); if (model == NULL) { LOG_ERR("%s: unable to load model\n" , __func__); return NULL; @@ -54,7 +54,7 @@ static struct llava_context * llava_init_context(common_params * params, llama_m ctx_params.n_ctx = params->n_ctx; } - llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params); + llama_context * ctx_llama = llama_init_from_model(model, ctx_params); if (ctx_llama == NULL) { LOG_ERR("%s: failed to create the llama_context\n" , __func__); @@ -75,7 +75,7 @@ static void llava_free(struct llava_context * ctx_llava) { } llama_free(ctx_llava->ctx_llama); - llama_free_model(ctx_llava->model); + llama_model_free(ctx_llava->model); llama_backend_free(); } @@ -167,8 +167,12 @@ static const char * sample(struct common_sampler * smpl, int * n_past) { const llama_token id = common_sampler_sample(smpl, ctx_llama, -1); common_sampler_accept(smpl, id, true); + + const llama_model * model = llama_get_model(ctx_llama); + const llama_vocab * vocab = llama_model_get_vocab(model); + static std::string ret; - if (llama_token_is_eog(llama_get_model(ctx_llama), id)) { + if (llama_vocab_is_eog(vocab, id)) { ret = ""; } else { ret = common_token_to_piece(ctx_llama, id); diff --git a/examples/llava/qwen2vl-cli.cpp b/examples/llava/qwen2vl-cli.cpp index e86a60280..132a7da54 100644 --- a/examples/llava/qwen2vl-cli.cpp +++ b/examples/llava/qwen2vl-cli.cpp @@ -27,7 +27,7 @@ static bool qwen2vl_eval_image_embed(llama_context * ctx_llama, const struct llava_image_embed * image_embed, int n_batch, int * n_past, int * st_pos_id, struct clip_image_size * image_size) { - int n_embd = llama_n_embd(llama_get_model(ctx_llama)); + int n_embd = llama_model_n_embd(llama_get_model(ctx_llama)); const int patch_size = 14 * 2; const int ph = image_size->height / patch_size + (image_size->height % patch_size > 0); const int pw = image_size->width / patch_size + (image_size->width % patch_size > 0); @@ -132,8 +132,12 @@ static const char * sample(struct common_sampler * smpl, int * n_past, int * st_pos_id) { const llama_token id = common_sampler_sample(smpl, ctx_llama, -1); common_sampler_accept(smpl, id, true); + + const llama_model * model = llama_get_model(ctx_llama); + const llama_vocab * vocab = llama_model_get_vocab(model); + static std::string ret; - if (llama_token_is_eog(llama_get_model(ctx_llama), id)) { + if (llama_vocab_is_eog(vocab, id)) { ret = ""; } else { ret = common_token_to_piece(ctx_llama, id); @@ -310,7 +314,7 @@ static struct llama_model * llava_init(common_params * params) { llama_model_params model_params = common_model_params_to_llama(*params); - llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params); + llama_model * model = llama_model_load_from_file(params->model.c_str(), model_params); if (model == NULL) { LOG_ERR("%s: unable to load model\n" , __func__); return NULL; @@ -328,11 +332,10 @@ static struct llava_context * llava_init_context(common_params * params, llama_m auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1); - llama_context_params ctx_params = common_context_params_to_llama(*params); ctx_params.n_ctx = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings - llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params); + llama_context * ctx_llama = llama_init_from_model(model, ctx_params); if (ctx_llama == NULL) { LOG_ERR("%s: failed to create the llama_context\n" , __func__); @@ -354,7 +357,7 @@ static void llava_free(struct llava_context * ctx_llava) { } llama_free(ctx_llava->ctx_llama); - llama_free_model(ctx_llava->model); + llama_model_free(ctx_llava->model); llama_backend_free(); } @@ -481,7 +484,7 @@ static void debug_test_mrope_2d() { } static void debug_dump_img_embed(struct llava_context * ctx_llava) { - int n_embd = llama_n_embd(llama_get_model(ctx_llava->ctx_llama)); + int n_embd = llama_model_n_embd(llama_get_model(ctx_llava->ctx_llama)); int ne = n_embd * 4; float vals[56 * 56 * 3]; // float embd[ne]; @@ -575,7 +578,7 @@ int main(int argc, char ** argv) { } } - llama_free_model(model); + llama_model_free(model); return 0; } diff --git a/examples/lookahead/lookahead.cpp b/examples/lookahead/lookahead.cpp index 8d0ef8b3d..2f0898e62 100644 --- a/examples/lookahead/lookahead.cpp +++ b/examples/lookahead/lookahead.cpp @@ -58,8 +58,10 @@ int main(int argc, char ** argv) { // load the target model common_init_result llama_init = common_init_from_params(params); - llama_model * model = llama_init.model; - llama_context * ctx = llama_init.context; + llama_model * model = llama_init.model.get(); + llama_context * ctx = llama_init.context.get(); + + const llama_vocab * vocab = llama_model_get_vocab(model); // Tokenize the prompt std::vector inp; @@ -147,7 +149,7 @@ int main(int argc, char ** argv) { } // here we keep adding new n-grams as we go - ngram_container ngrams_observed(llama_n_vocab(model), N, G); + ngram_container ngrams_observed(llama_vocab_n_tokens(vocab), N, G); // debug struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, W + G + 1); @@ -297,7 +299,7 @@ int main(int argc, char ** argv) { } fflush(stdout); - if (llama_token_is_eog(model, id)) { + if (llama_vocab_is_eog(vocab, id)) { has_eos = true; } @@ -474,9 +476,6 @@ int main(int argc, char ** argv) { llama_batch_free(batch); - llama_free(ctx); - llama_free_model(model); - llama_backend_free(); LOG("\n\n"); diff --git a/examples/lookup/lookup-create.cpp b/examples/lookup/lookup-create.cpp index 7ced0aa97..3da45ed9e 100644 --- a/examples/lookup/lookup-create.cpp +++ b/examples/lookup/lookup-create.cpp @@ -1,14 +1,9 @@ #include "arg.h" #include "common.h" #include "ngram-cache.h" -#include "ggml.h" #include "llama.h" -#include -#include -#include #include -#include #include int main(int argc, char ** argv){ @@ -25,16 +20,16 @@ int main(int argc, char ** argv){ // load the model common_init_result llama_init = common_init_from_params(params); - llama_model * model = llama_init.model; - llama_context * ctx = llama_init.context; + llama_model_ptr & model = llama_init.model; + llama_context_ptr & ctx = llama_init.context; + GGML_ASSERT(model != nullptr); // tokenize the prompt std::vector inp; - inp = common_tokenize(ctx, params.prompt, true, true); + inp = common_tokenize(ctx.get(), params.prompt, true, true); fprintf(stderr, "%s: tokenization done\n", __func__); - common_ngram_cache ngram_cache; common_ngram_cache_update(ngram_cache, LLAMA_NGRAM_STATIC, LLAMA_NGRAM_STATIC, inp, inp.size(), true); fprintf(stderr, "%s: hashing done, writing file to %s\n", __func__, params.lookup_cache_static.c_str()); diff --git a/examples/lookup/lookup-stats.cpp b/examples/lookup/lookup-stats.cpp index dff07c075..fcb289abe 100644 --- a/examples/lookup/lookup-stats.cpp +++ b/examples/lookup/lookup-stats.cpp @@ -30,12 +30,11 @@ int main(int argc, char ** argv){ // load the model common_init_result llama_init = common_init_from_params(params); - llama_model * model = llama_init.model; - llama_context * ctx = llama_init.context; + llama_context_ptr & ctx = llama_init.context; // tokenize the prompt std::vector inp; - inp = common_tokenize(ctx, params.prompt, true, true); + inp = common_tokenize(ctx.get(), params.prompt, true, true); common_ngram_cache ngram_cache_context; common_ngram_cache ngram_cache_dynamic; @@ -66,7 +65,7 @@ int main(int argc, char ** argv){ } const int n_input = inp.size(); - const int n_ctx = llama_n_ctx(ctx); + const int n_ctx = llama_n_ctx(ctx.get()); int n_drafted = 0; int n_accept = 0; @@ -150,9 +149,6 @@ int main(int argc, char ** argv){ LOG_INF("n_accept = %d\n", n_accept); LOG_INF("accept = %.3f%%\n", 100.0f * n_accept / n_drafted); - llama_free(ctx); - llama_free_model(model); - llama_backend_free(); LOG("\n\n"); diff --git a/examples/lookup/lookup.cpp b/examples/lookup/lookup.cpp index 4d92bb238..dbd0444ec 100644 --- a/examples/lookup/lookup.cpp +++ b/examples/lookup/lookup.cpp @@ -33,8 +33,10 @@ int main(int argc, char ** argv){ // load the model common_init_result llama_init = common_init_from_params(params); - llama_model * model = llama_init.model; - llama_context * ctx = llama_init.context; + llama_model * model = llama_init.model.get(); + llama_context * ctx = llama_init.context.get(); + + const llama_vocab * vocab = llama_model_get_vocab(model); // tokenize the prompt std::vector inp; @@ -136,7 +138,7 @@ int main(int argc, char ** argv){ LOG("%s", token_str.c_str()); } - if (llama_token_is_eog(model, id)) { + if (llama_vocab_is_eog(vocab, id)) { has_eos = true; } @@ -243,9 +245,6 @@ int main(int argc, char ** argv){ llama_batch_free(batch_tgt); - llama_free(ctx); - llama_free_model(model); - llama_backend_free(); LOG("\n\n"); diff --git a/examples/main/main.cpp b/examples/main/main.cpp index d0c28f317..640b35c1d 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -5,7 +5,6 @@ #include "sampling.h" #include "llama.h" -#include #include #include #include @@ -145,24 +144,26 @@ int main(int argc, char ** argv) { llama_context * ctx = nullptr; common_sampler * smpl = nullptr; - std::vector chat_msgs; - g_model = &model; g_ctx = &ctx; g_smpl = &smpl; + std::vector chat_msgs; + // load the model and apply lora adapter, if any LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__); common_init_result llama_init = common_init_from_params(params); - model = llama_init.model; - ctx = llama_init.context; + model = llama_init.model.get(); + ctx = llama_init.context.get(); if (model == NULL) { LOG_ERR("%s: error: unable to load model\n", __func__); return 1; } + const llama_vocab * vocab = llama_model_get_vocab(model); + LOG_INF("%s: llama threadpool init, n_threads = %d\n", __func__, (int) params.cpuparams.n_threads); auto * reg = ggml_backend_dev_backend_reg(ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU)); @@ -196,7 +197,7 @@ int main(int argc, char ** argv) { llama_attach_threadpool(ctx, threadpool, threadpool_batch); - const int n_ctx_train = llama_n_ctx_train(model); + const int n_ctx_train = llama_model_n_ctx_train(model); const int n_ctx = llama_n_ctx(ctx); if (n_ctx > n_ctx_train) { @@ -241,9 +242,9 @@ int main(int argc, char ** argv) { } } - const bool add_bos = llama_add_bos_token(model); + const bool add_bos = llama_vocab_get_add_bos(vocab); if (!llama_model_has_encoder(model)) { - GGML_ASSERT(!llama_add_eos_token(model)); + GGML_ASSERT(!llama_vocab_get_add_eos(vocab)); } LOG_DBG("n_ctx: %d, add_bos: %d\n", n_ctx, add_bos); @@ -269,7 +270,7 @@ int main(int argc, char ** argv) { // Should not run without any tokens if (embd_inp.empty()) { if (add_bos) { - embd_inp.push_back(llama_token_bos(model)); + embd_inp.push_back(llama_vocab_bos(vocab)); LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str()); } else { LOG_ERR("input is empty\n"); @@ -494,8 +495,8 @@ int main(int argc, char ** argv) { } llama_token decoder_start_token_id = llama_model_decoder_start_token(model); - if (decoder_start_token_id == -1) { - decoder_start_token_id = llama_token_bos(model); + if (decoder_start_token_id == LLAMA_TOKEN_NULL) { + decoder_start_token_id = llama_vocab_bos(vocab); } embd_inp.clear(); @@ -742,7 +743,7 @@ int main(int argc, char ** argv) { } // deal with end of generation tokens in interactive mode - if (llama_token_is_eog(model, common_sampler_last(smpl))) { + if (llama_vocab_is_eog(vocab, common_sampler_last(smpl))) { LOG_DBG("found an EOG token\n"); if (params.interactive) { @@ -776,7 +777,7 @@ int main(int argc, char ** argv) { if (params.input_prefix_bos) { LOG_DBG("adding input prefix BOS token\n"); - embd_inp.push_back(llama_token_bos(model)); + embd_inp.push_back(llama_vocab_bos(vocab)); } std::string buffer; @@ -830,8 +831,8 @@ int main(int argc, char ** argv) { // if user stop generation mid-way, we must add EOT to finish model's last response if (need_insert_eot && format_chat) { - llama_token eot = llama_token_eot(model); - embd_inp.push_back(eot == -1 ? llama_token_eos(model) : eot); + llama_token eot = llama_vocab_eot(vocab); + embd_inp.push_back(eot == LLAMA_TOKEN_NULL ? llama_vocab_eos(vocab) : eot); need_insert_eot = false; } @@ -866,7 +867,7 @@ int main(int argc, char ** argv) { } // end of generation - if (!embd.empty() && llama_token_is_eog(model, embd.back()) && !(params.interactive)) { + if (!embd.empty() && llama_vocab_is_eog(vocab, embd.back()) && !(params.interactive)) { LOG(" [end of text]\n"); break; } @@ -889,9 +890,6 @@ int main(int argc, char ** argv) { common_sampler_free(smpl); - llama_free(ctx); - llama_free_model(model); - llama_backend_free(); ggml_threadpool_free_fn(threadpool); diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp index fd2b1c011..7ef43d5e1 100644 --- a/examples/parallel/parallel.cpp +++ b/examples/parallel/parallel.cpp @@ -132,8 +132,10 @@ int main(int argc, char ** argv) { // load the target model common_init_result llama_init = common_init_from_params(params); - llama_model * model = llama_init.model; - llama_context * ctx = llama_init.context; + llama_model * model = llama_init.model.get(); + llama_context * ctx = llama_init.context.get(); + + const llama_vocab * vocab = llama_model_get_vocab(model); // load the prompts from an external file if there are any if (params.prompt.empty()) { @@ -358,7 +360,7 @@ int main(int argc, char ** argv) { // client.id, client.seq_id, id, client.n_decoded, client.i_batch, token_str.c_str()); if (client.n_decoded > 2 && - (llama_token_is_eog(model, id) || + (llama_vocab_is_eog(vocab, id) || (params.n_predict > 0 && client.n_decoded + client.n_prompt >= params.n_predict) || client.response.find("User:") != std::string::npos || client.response.find('\n') != std::string::npos)) { @@ -416,9 +418,6 @@ int main(int argc, char ** argv) { llama_batch_free(batch); - llama_free(ctx); - llama_free_model(model); - llama_backend_free(); LOG("\n\n"); diff --git a/examples/passkey/passkey.cpp b/examples/passkey/passkey.cpp index 09bba708f..5953928d4 100644 --- a/examples/passkey/passkey.cpp +++ b/examples/passkey/passkey.cpp @@ -63,22 +63,24 @@ int main(int argc, char ** argv) { llama_model_params model_params = common_model_params_to_llama(params); - llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params); + llama_model * model = llama_model_load_from_file(params.model.c_str(), model_params); if (model == NULL) { LOG_ERR("%s: unable to load model\n" , __func__); return 1; } + const llama_vocab * vocab = llama_model_get_vocab(model); + // initialize the context llama_context_params ctx_params = common_context_params_to_llama(params); - ctx_params.n_ctx = llama_n_ctx_train(model)*n_grp + n_keep; + ctx_params.n_ctx = llama_model_n_ctx_train(model)*n_grp + n_keep; GGML_ASSERT(ctx_params.n_batch % n_grp == 0 && "n_batch must be divisible by n_grp"); - llama_context * ctx = llama_new_context_with_model(model, ctx_params); + llama_context * ctx = llama_init_from_model(model, ctx_params); if (ctx == NULL) { LOG_ERR("%s: failed to create the llama_context\n" , __func__); return 1; @@ -223,7 +225,7 @@ int main(int argc, char ** argv) { const llama_token new_token_id = llama_sampler_sample(smpl, ctx, batch.n_tokens - 1); // is it an end of generation? - if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) { + if (llama_vocab_is_eog(vocab, new_token_id) || n_cur == n_len) { LOG("\n"); break; @@ -266,7 +268,7 @@ int main(int argc, char ** argv) { llama_batch_free(batch); llama_free(ctx); - llama_free_model(model); + llama_model_free(model); llama_backend_free(); diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index 64a84607c..9bf6c5743 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -296,8 +296,11 @@ static results_perplexity perplexity_v2(llama_context * ctx, const common_params // Output: `perplexity: 13.5106 [114/114]` // BOS tokens will be added for each chunk before eval - const bool add_bos = llama_add_bos_token(llama_get_model(ctx)); - GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx))); + const llama_model * model = llama_get_model(ctx); + const llama_vocab * vocab = llama_model_get_vocab(model); + + const bool add_bos = llama_vocab_get_add_bos(vocab); + GGML_ASSERT(!llama_vocab_get_add_eos(vocab)); LOG_INF("%s: tokenizing the input ..\n", __func__); @@ -338,7 +341,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const common_params const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max); const int n_batch = params.n_batch; - const int n_vocab = llama_n_vocab(llama_get_model(ctx)); + const int n_vocab = llama_vocab_n_tokens(vocab); int count = 0; double nll = 0.0; @@ -382,7 +385,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const common_params // add BOS token for the first batch of each chunk if (add_bos && j == 0) { - tokens[batch_start] = llama_token_bos(llama_get_model(ctx)); + tokens[batch_start] = llama_vocab_bos(vocab); } const auto * batch_logits = llama_get_logits(ctx); @@ -444,8 +447,11 @@ static results_perplexity perplexity(llama_context * ctx, const common_params & // Output: `perplexity: 13.5106 [114/114]` // BOS tokens will be added for each chunk before eval - const bool add_bos = llama_add_bos_token(llama_get_model(ctx)); - GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx))); + const llama_model * model = llama_get_model(ctx); + const llama_vocab * vocab = llama_model_get_vocab(model); + + const bool add_bos = llama_vocab_get_add_bos(vocab); + GGML_ASSERT(!llama_vocab_get_add_eos(vocab)); std::ofstream logits_stream; if (!params.logits_file.empty()) { @@ -485,7 +491,7 @@ static results_perplexity perplexity(llama_context * ctx, const common_params & const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max); const int n_batch = params.n_batch; - const int n_vocab = llama_n_vocab(llama_get_model(ctx)); + const int n_vocab = llama_vocab_n_tokens(vocab); int count = 0; double nll = 0.0; @@ -557,7 +563,7 @@ static results_perplexity perplexity(llama_context * ctx, const common_params & // add BOS token for the first batch of each chunk if (add_bos && j == 0) { - tokens[seq_start] = llama_token_bos(llama_get_model(ctx)); + tokens[seq_start] = llama_vocab_bos(vocab); } for (int k = 0; k < batch_size; ++k) { @@ -732,6 +738,9 @@ static void compute_logprobs(const float * batch_logits, int n_vocab, std::vecto } static void hellaswag_score(llama_context * ctx, const common_params & params) { + const llama_model * model = llama_get_model(ctx); + const llama_vocab * vocab = llama_model_get_vocab(model); + // Calculates hellaswag score (acc_norm) from prompt // // Data extracted from the HellaSwag validation dataset (MIT license) https://github.com/rowanz/hellaswag/blob/master/data/hellaswag_val.jsonl @@ -765,7 +774,7 @@ static void hellaswag_score(llama_context * ctx, const common_params & params) { size_t hs_task_count = prompt_lines.size()/6; LOG_INF("%s : loaded %zu tasks from prompt.\n", __func__, hs_task_count); - const bool is_spm = llama_vocab_type(llama_get_model(ctx)) == LLAMA_VOCAB_TYPE_SPM; + const bool is_spm = llama_vocab_type(vocab) == LLAMA_VOCAB_TYPE_SPM; LOG_INF("================================= is_spm = %d\n", is_spm); // The tasks should be randomized so the score stabilizes quickly. @@ -848,7 +857,7 @@ static void hellaswag_score(llama_context * ctx, const common_params & params) { const int n_ctx = llama_n_ctx(ctx); const int n_batch = params.n_batch; - const int n_vocab = llama_n_vocab(llama_get_model(ctx)); + const int n_vocab = llama_vocab_n_tokens(vocab); const int max_tasks_per_batch = 32; const int max_seq = std::min(4*max_tasks_per_batch, (int) llama_n_seq_max(ctx)); @@ -1072,6 +1081,8 @@ static std::vector load_winogrande_from_csv(const std::string * */ static void winogrande_score(llama_context * ctx, const common_params & params) { + const llama_model * model = llama_get_model(ctx); + const llama_vocab * vocab = llama_model_get_vocab(model); constexpr int k_min_trailing_ctx = 3; @@ -1130,7 +1141,7 @@ static void winogrande_score(llama_context * ctx, const common_params & params) const int n_ctx = llama_n_ctx(ctx); const int n_batch = params.n_batch; - const int n_vocab = llama_n_vocab(llama_get_model(ctx)); + const int n_vocab = llama_vocab_n_tokens(vocab); const int max_tasks_per_batch = 128; const int max_seq = std::min(2*max_tasks_per_batch, (int) llama_n_seq_max(ctx)); @@ -1374,6 +1385,8 @@ static bool multiple_choice_prepare_one_task(llama_context * ctx, multiple_choic // https://huggingface.co/datasets/truthful_qa // static void multiple_choice_score(llama_context * ctx, const common_params & params) { + const llama_model * model = llama_get_model(ctx); + const llama_vocab * vocab = llama_model_get_vocab(model); std::istringstream strstream(params.prompt); uint32_t n_task; @@ -1482,7 +1495,7 @@ static void multiple_choice_score(llama_context * ctx, const common_params & par const int n_ctx = llama_n_ctx(ctx); const int n_batch = params.n_batch; - const int n_vocab = llama_n_vocab(llama_get_model(ctx)); + const int n_vocab = llama_vocab_n_tokens(vocab); const int max_tasks_per_batch = 32; const int max_seq = std::min(4*max_tasks_per_batch, (int) llama_n_seq_max(ctx)); @@ -1655,6 +1668,9 @@ static void multiple_choice_score(llama_context * ctx, const common_params & par } static void kl_divergence(llama_context * ctx, const common_params & params) { + const llama_model * model = llama_get_model(ctx); + const llama_vocab * vocab = llama_model_get_vocab(model); + if (params.logits_file.empty()) { LOG_ERR("%s: you must provide a name of a file containing the log probabilities of the base model\n", __func__); return; @@ -1688,8 +1704,8 @@ static void kl_divergence(llama_context * ctx, const common_params & params) { LOG_ERR("%s: failed reading n_vocab, n_chunk from %s\n", __func__, params.logits_file.c_str()); return; } - if (n_vocab != llama_n_vocab(llama_get_model(ctx))) { - LOG_ERR("%s: inconsistent vocabulary (%d vs %d)\n", __func__, n_vocab, llama_n_vocab(llama_get_model(ctx))); + if (n_vocab != llama_vocab_n_tokens(vocab)) { + LOG_ERR("%s: inconsistent vocabulary (%d vs %d)\n", __func__, n_vocab, llama_vocab_n_tokens(vocab)); } std::vector tokens(size_t(n_ctx) * n_chunk); @@ -1701,8 +1717,8 @@ static void kl_divergence(llama_context * ctx, const common_params & params) { const int n_batch = params.n_batch; const int num_batches = (n_ctx + n_batch - 1)/n_batch; const int nv = 2*((n_vocab + 1)/2) + 4; - const bool add_bos = llama_add_bos_token(llama_get_model(ctx)); - GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx))); + const bool add_bos = llama_vocab_get_add_bos(vocab); + GGML_ASSERT(!llama_vocab_get_add_eos(vocab)); std::vector log_probs_uint16(size_t(n_ctx - 1 - n_ctx/2) * nv); std::vector kld_values(size_t(n_ctx - 1 - n_ctx/2)*n_chunk); @@ -1761,7 +1777,7 @@ static void kl_divergence(llama_context * ctx, const common_params & params) { // add BOS token for the first batch of each chunk if (add_bos && j == 0) { - tokens[batch_start] = llama_token_bos(llama_get_model(ctx)); + tokens[batch_start] = llama_vocab_bos(vocab); } common_batch_clear(batch); @@ -1987,14 +2003,15 @@ int main(int argc, char ** argv) { // load the model and apply lora adapter, if any common_init_result llama_init = common_init_from_params(params); - llama_model * model = llama_init.model; - llama_context * ctx = llama_init.context; + llama_model * model = llama_init.model.get(); + llama_context * ctx = llama_init.context.get(); + if (model == NULL) { LOG_ERR("%s: unable to load model\n", __func__); return 1; } - const int n_ctx_train = llama_n_ctx_train(model); + const int n_ctx_train = llama_model_n_ctx_train(model); if (params.n_ctx > n_ctx_train) { LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n", @@ -2023,9 +2040,6 @@ int main(int argc, char ** argv) { LOG("\n"); llama_perf_context_print(ctx); - llama_free(ctx); - llama_free_model(model); - llama_backend_free(); return 0; diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp index 912caf346..bd2f73467 100644 --- a/examples/quantize-stats/quantize-stats.cpp +++ b/examples/quantize-stats/quantize-stats.cpp @@ -1,7 +1,7 @@ -#include "common.h" #include "ggml.h" #include "llama.h" -#include "llama-impl.h" +#include "llama-context.h" +#include "common.h" #include #include @@ -9,11 +9,9 @@ #include #include #include -#include #include #include #include -#include #include #include #include @@ -311,7 +309,7 @@ int main(int argc, char ** argv) { auto mparams = llama_model_default_params(); mparams.use_mlock = false; - model = llama_load_model_from_file(params.model.c_str(), mparams); + model = llama_model_load_from_file(params.model.c_str(), mparams); if (model == NULL) { fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str()); @@ -321,22 +319,22 @@ int main(int argc, char ** argv) { auto cparams = llama_context_default_params(); cparams.n_ctx = 256; - ctx = llama_new_context_with_model(model, cparams); + ctx = llama_init_from_model(model, cparams); if (ctx == NULL) { fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str()); - llama_free_model(model); + llama_model_free(model); return 1; } } - const auto &tensors = llama_internal_get_tensor_map(ctx); + const auto & tensors = llama_internal_get_tensor_map(ctx); // check layer tensors int included_layers = 0; int64_t max_nelements = 0; bool is_f16 = false; - for (const auto& kv_tensor : tensors) { + for (const auto & kv_tensor : tensors) { if (!layer_included(params, kv_tensor.first)) { continue; } @@ -349,7 +347,7 @@ int main(int argc, char ** argv) { fprintf(stderr, "%s: error: Quantization should be tested with a float model, " "this model contains already quantized layers (%s is type %d)\n", __func__, kv_tensor.first.c_str(), kv_tensor.second->type); llama_free(ctx); - llama_free_model(model); + llama_model_free(model); return 1; } included_layers++; @@ -371,8 +369,8 @@ int main(int argc, char ** argv) { if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) { continue; } - const auto * qfns = ggml_get_type_traits(type); - const auto * qfns_cpu = ggml_get_type_traits_cpu(type); + const auto * qfns = ggml_get_type_traits(type); + const auto * qfns_cpu = ggml_get_type_traits_cpu(type); if (qfns_cpu->from_float && qfns->to_float) { if (params.verbose) { printf("testing %s ...\n", ggml_type_name(type)); @@ -382,7 +380,7 @@ int main(int argc, char ** argv) { error_stats global_stats {}; - for (const auto& kv_tensor : tensors) { + for (const auto & kv_tensor : tensors) { if (!layer_included(params, kv_tensor.first)) { continue; } @@ -411,7 +409,7 @@ int main(int argc, char ** argv) { llama_free(ctx); - llama_free_model(model); + llama_model_free(model); // report timing { const int64_t t_main_end_us = ggml_time_us(); diff --git a/examples/retrieval/retrieval.cpp b/examples/retrieval/retrieval.cpp index a5c6fe7e5..2439022a2 100644 --- a/examples/retrieval/retrieval.cpp +++ b/examples/retrieval/retrieval.cpp @@ -151,15 +151,17 @@ int main(int argc, char ** argv) { // load the model common_init_result llama_init = common_init_from_params(params); - llama_model * model = llama_init.model; - llama_context * ctx = llama_init.context; + llama_model * model = llama_init.model.get(); + llama_context * ctx = llama_init.context.get(); if (model == NULL) { LOG_ERR("%s: unable to load model\n", __func__); return 1; } - const int n_ctx_train = llama_n_ctx_train(model); + const llama_vocab * vocab = llama_model_get_vocab(model); + + const int n_ctx_train = llama_model_n_ctx_train(model); const int n_ctx = llama_n_ctx(ctx); const enum llama_pooling_type pooling_type = llama_pooling_type(ctx); @@ -192,8 +194,8 @@ int main(int argc, char ** argv) { return 1; } // add eos if not present - if (llama_token_eos(model) >= 0 && (inp.empty() || inp.back() != llama_token_eos(model))) { - inp.push_back(llama_token_eos(model)); + if (llama_vocab_eos(vocab) >= 0 && (inp.empty() || inp.back() != llama_vocab_eos(vocab))) { + inp.push_back(llama_vocab_eos(vocab)); } chunk.tokens = inp; } @@ -215,7 +217,7 @@ int main(int argc, char ** argv) { struct llama_batch batch = llama_batch_init(n_batch, 0, 1); // allocate output - const int n_embd = llama_n_embd(model); + const int n_embd = llama_model_n_embd(model); std::vector embeddings(n_chunks * n_embd, 0); float * emb = embeddings.data(); @@ -298,7 +300,5 @@ int main(int argc, char ** argv) { // clean up llama_batch_free(query_batch); - llama_free(ctx); - llama_free_model(model); llama_backend_free(); } diff --git a/examples/run/run.cpp b/examples/run/run.cpp index f89d041c4..0ad8bb15b 100644 --- a/examples/run/run.cpp +++ b/examples/run/run.cpp @@ -1,5 +1,6 @@ #if defined(_WIN32) # include +# include #else # include # include @@ -10,6 +11,8 @@ # include #endif +#include + #include #include #include @@ -24,6 +27,13 @@ #include "json.hpp" #include "llama-cpp.h" +#if defined(__unix__) || (defined(__APPLE__) && defined(__MACH__)) || defined(_WIN32) +[[noreturn]] static void sigint_handler(int) { + printf("\n\033[0m"); + exit(0); // not ideal, but it's the only way to guarantee exit in all cases +} +#endif + GGML_ATTRIBUTE_FORMAT(1, 2) static std::string fmt(const char * fmt, ...) { va_list ap; @@ -82,6 +92,7 @@ class Opt { } ctx_params.n_batch = context_size >= 0 ? context_size : context_size_default; + ctx_params.n_ctx = ctx_params.n_batch; model_params.n_gpu_layers = ngl >= 0 ? ngl : ngl_default; temperature = temperature >= 0 ? temperature : temperature_default; @@ -253,7 +264,7 @@ class File { return 1; } - OVERLAPPED overlapped = { 0 }; + OVERLAPPED overlapped = {}; if (!LockFileEx(hFile, LOCKFILE_EXCLUSIVE_LOCK | LOCKFILE_FAIL_IMMEDIATELY, 0, MAXDWORD, MAXDWORD, &overlapped)) { fd = -1; @@ -277,7 +288,7 @@ class File { if (fd >= 0) { # ifdef _WIN32 if (hFile != INVALID_HANDLE_VALUE) { - OVERLAPPED overlapped = { 0 }; + OVERLAPPED overlapped = {}; UnlockFileEx(hFile, 0, MAXDWORD, MAXDWORD, &overlapped); } # else @@ -293,7 +304,7 @@ class File { private: int fd = -1; # ifdef _WIN32 - HANDLE hFile; + HANDLE hFile = nullptr; # endif }; @@ -464,7 +475,7 @@ class HttpClient { return (now_downloaded_plus_file_size * 100) / total_to_download; } - static std::string generate_progress_prefix(curl_off_t percentage) { return fmt("%3ld%% |", percentage); } + static std::string generate_progress_prefix(curl_off_t percentage) { return fmt("%3ld%% |", static_cast(percentage)); } static double calculate_speed(curl_off_t now_downloaded, const std::chrono::steady_clock::time_point & start_time) { const auto now = std::chrono::steady_clock::now(); @@ -663,7 +674,7 @@ class LlamaData { "\r%*s" "\rLoading model", get_terminal_width(), " "); - llama_model_ptr model(llama_load_model_from_file(opt.model_.c_str(), opt.model_params)); + llama_model_ptr model(llama_model_load_from_file(opt.model_.c_str(), opt.model_params)); if (!model) { printe("%s: error: unable to load model from file: %s\n", __func__, opt.model_.c_str()); } @@ -674,7 +685,7 @@ class LlamaData { // Initializes the context with the specified parameters llama_context_ptr initialize_context(const llama_model_ptr & model, const Opt & opt) { - llama_context_ptr context(llama_new_context_with_model(model.get(), opt.ctx_params)); + llama_context_ptr context(llama_init_from_model(model.get(), opt.ctx_params)); if (!context) { printe("%s: error: failed to create the llama_context\n", __func__); } @@ -702,11 +713,11 @@ static void add_message(const char * role, const std::string & text, LlamaData & // Function to apply the chat template and resize `formatted` if needed static int apply_chat_template(LlamaData & llama_data, const bool append) { int result = llama_chat_apply_template( - llama_data.model.get(), nullptr, llama_data.messages.data(), llama_data.messages.size(), append, + llama_model_chat_template(llama_data.model.get()), llama_data.messages.data(), llama_data.messages.size(), append, append ? llama_data.fmtted.data() : nullptr, append ? llama_data.fmtted.size() : 0); if (append && result > static_cast(llama_data.fmtted.size())) { llama_data.fmtted.resize(result); - result = llama_chat_apply_template(llama_data.model.get(), nullptr, llama_data.messages.data(), + result = llama_chat_apply_template(llama_model_chat_template(llama_data.model.get()), llama_data.messages.data(), llama_data.messages.size(), append, llama_data.fmtted.data(), llama_data.fmtted.size()); } @@ -715,11 +726,11 @@ static int apply_chat_template(LlamaData & llama_data, const bool append) { } // Function to tokenize the prompt -static int tokenize_prompt(const llama_model_ptr & model, const std::string & prompt, +static int tokenize_prompt(const llama_vocab * vocab, const std::string & prompt, std::vector & prompt_tokens) { - const int n_prompt_tokens = -llama_tokenize(model.get(), prompt.c_str(), prompt.size(), NULL, 0, true, true); + const int n_prompt_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, true, true); prompt_tokens.resize(n_prompt_tokens); - if (llama_tokenize(model.get(), prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), true, + if (llama_tokenize(vocab, prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), true, true) < 0) { printe("failed to tokenize the prompt\n"); return -1; @@ -742,9 +753,9 @@ static int check_context_size(const llama_context_ptr & ctx, const llama_batch & } // convert the token to a string -static int convert_token_to_string(const llama_model_ptr & model, const llama_token token_id, std::string & piece) { +static int convert_token_to_string(const llama_vocab * vocab, const llama_token token_id, std::string & piece) { char buf[256]; - int n = llama_token_to_piece(model.get(), token_id, buf, sizeof(buf), 0, true); + int n = llama_token_to_piece(vocab, token_id, buf, sizeof(buf), 0, true); if (n < 0) { printe("failed to convert token to piece\n"); return 1; @@ -762,8 +773,10 @@ static void print_word_and_concatenate_to_response(const std::string & piece, st // helper function to evaluate a prompt and generate a response static int generate(LlamaData & llama_data, const std::string & prompt, std::string & response) { + const llama_vocab * vocab = llama_model_get_vocab(llama_data.model.get()); + std::vector tokens; - if (tokenize_prompt(llama_data.model, prompt, tokens) < 0) { + if (tokenize_prompt(vocab, prompt, tokens) < 0) { return 1; } @@ -779,12 +792,12 @@ static int generate(LlamaData & llama_data, const std::string & prompt, std::str // sample the next token, check is it an end of generation? new_token_id = llama_sampler_sample(llama_data.sampler.get(), llama_data.context.get(), -1); - if (llama_token_is_eog(llama_data.model.get(), new_token_id)) { + if (llama_vocab_is_eog(vocab, new_token_id)) { break; } std::string piece; - if (convert_token_to_string(llama_data.model, new_token_id, piece)) { + if (convert_token_to_string(vocab, new_token_id, piece)) { return 1; } @@ -799,7 +812,20 @@ static int generate(LlamaData & llama_data, const std::string & prompt, std::str static int read_user_input(std::string & user) { std::getline(std::cin, user); - return user.empty(); // Should have data in happy path + if (std::cin.eof()) { + printf("\n"); + return 1; + } + + if (user == "/bye") { + return 1; + } + + if (user.empty()) { + return 2; + } + + return 0; // Should have data in happy path } // Function to generate a response based on the prompt @@ -866,7 +892,25 @@ static bool is_stdout_a_terminal() { #endif } -// Function to tokenize the prompt +// Function to handle user input +static int get_user_input(std::string & user_input, const std::string & user) { + while (true) { + const int ret = handle_user_input(user_input, user); + if (ret == 1) { + return 1; + } + + if (ret == 2) { + continue; + } + + break; + } + + return 0; +} + +// Main chat loop function static int chat_loop(LlamaData & llama_data, const std::string & user) { int prev_len = 0; llama_data.fmtted.resize(llama_n_ctx(llama_data.context.get())); @@ -874,7 +918,8 @@ static int chat_loop(LlamaData & llama_data, const std::string & user) { while (true) { // Get user input std::string user_input; - while (handle_user_input(user_input, user)) { + if (get_user_input(user_input, user) == 1) { + return 0; } add_message("user", user.empty() ? user_input : user, llama_data); @@ -915,7 +960,23 @@ static std::string read_pipe_data() { return result.str(); } +static void ctrl_c_handling() { +#if defined(__unix__) || (defined(__APPLE__) && defined(__MACH__)) + struct sigaction sigint_action; + sigint_action.sa_handler = sigint_handler; + sigemptyset(&sigint_action.sa_mask); + sigint_action.sa_flags = 0; + sigaction(SIGINT, &sigint_action, NULL); +#elif defined(_WIN32) + auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL { + return (ctrl_type == CTRL_C_EVENT) ? (sigint_handler(SIGINT), true) : false; + }; + SetConsoleCtrlHandler(reinterpret_cast(console_ctrl_handler), true); +#endif +} + int main(int argc, const char ** argv) { + ctrl_c_handling(); Opt opt; const int ret = opt.init(argc, argv); if (ret == 2) { diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp index 2f0cf9baa..cf7cbd815 100644 --- a/examples/save-load-state/save-load-state.cpp +++ b/examples/save-load-state/save-load-state.cpp @@ -30,8 +30,8 @@ int main(int argc, char ** argv) { // init common_init_result llama_init = common_init_from_params(params); - llama_model * model = llama_init.model; - llama_context * ctx = llama_init.context; + llama_model * model = llama_init.model.get(); + llama_context * ctx = llama_init.context.get(); if (model == nullptr || ctx == nullptr) { fprintf(stderr, "%s : failed to init\n", __func__); @@ -89,8 +89,6 @@ int main(int argc, char ** argv) { if (llama_decode(ctx, batch)) { fprintf(stderr, "\n%s : failed to evaluate\n", __func__); llama_batch_free(batch); - llama_free(ctx); - llama_free_model(model); return 1; } n_past += 1; @@ -98,11 +96,8 @@ int main(int argc, char ** argv) { printf("\n\n"); - // free old context - llama_free(ctx); - // make new context - auto * ctx2 = llama_new_context_with_model(model, common_context_params_to_llama(params)); + llama_context * ctx2 = llama_init_from_model(model, common_context_params_to_llama(params)); llama_sampler * smpl2 = llama_sampler_chain_init(sparams); @@ -123,8 +118,6 @@ int main(int argc, char ** argv) { if (read != llama_state_set_data(ctx2, state_mem.data(), state_mem.size())) { fprintf(stderr, "\n%s : failed to read state\n", __func__); - llama_free(ctx2); - llama_free_model(model); return 1; } @@ -148,8 +141,6 @@ int main(int argc, char ** argv) { if (llama_decode(ctx2, batch)) { fprintf(stderr, "\n%s : failed to evaluate\n", __func__); llama_batch_free(batch); - llama_free(ctx2); - llama_free_model(model); return 1; } n_past += 1; @@ -157,15 +148,13 @@ int main(int argc, char ** argv) { printf("\n\n"); - llama_free(ctx2); - if (result0 != result1) { fprintf(stderr, "\n%s : error : the 2 generations are different\n", __func__); return 1; } // make new context - auto * ctx3 = llama_new_context_with_model(model, common_context_params_to_llama(params)); + llama_context * ctx3 = llama_init_from_model(model, common_context_params_to_llama(params)); llama_sampler * smpl3 = llama_sampler_chain_init(sparams); @@ -186,8 +175,6 @@ int main(int argc, char ** argv) { if (read != llama_state_set_data(ctx3, state_mem.data(), state_mem.size())) { fprintf(stderr, "\n%s : failed to read state\n", __func__); - llama_free(ctx3); - llama_free_model(model); return 1; } @@ -204,8 +191,6 @@ int main(int argc, char ** argv) { const size_t ncopy = llama_state_seq_get_data(ctx3, seq_store.data(), seq_store.size(), 0); if (ncopy != seq_store.size()) { fprintf(stderr, "\n%s : seq copy data length %zd does not match expected length %zd\n", __func__, ncopy, seq_store.size()); - llama_free(ctx3); - llama_free_model(model); return 1; } fprintf(stderr, "%s : seq 0 copied, %zd bytes\n", __func__, ncopy); @@ -218,8 +203,6 @@ int main(int argc, char ** argv) { const size_t nset = llama_state_seq_set_data(ctx3, seq_store.data(), seq_store.size(), 1); if (nset != seq_store.size()) { fprintf(stderr, "\n%s : seq set data length %zd does not match expected length %zd\n", __func__, nset, seq_store.size()); - llama_free(ctx3); - llama_free_model(model); return 1; } fprintf(stderr, "%s : seq 1 restored, %zd bytes\n", __func__, nset); @@ -239,8 +222,6 @@ int main(int argc, char ** argv) { if (llama_decode(ctx3, batch)) { fprintf(stderr, "\n%s : failed to evaluate\n", __func__); llama_batch_free(batch); - llama_free(ctx3); - llama_free_model(model); return 1; } n_past += 1; @@ -253,8 +234,6 @@ int main(int argc, char ** argv) { llama_sampler_free(smpl3); llama_batch_free(batch); - llama_free(ctx3); - llama_free_model(model); if (result0 != result2) { fprintf(stderr, "\n%s : error : the seq restore generation is different\n", __func__); diff --git a/examples/server/README.md b/examples/server/README.md index 24ef85727..5022de672 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -45,10 +45,7 @@ The project is under active development, and we are [looking for feedback and co | `-ub, --ubatch-size N` | physical maximum batch size (default: 512)
(env: LLAMA_ARG_UBATCH) | | `--keep N` | number of tokens to keep from the initial prompt (default: 0, -1 = all) | | `-fa, --flash-attn` | enable Flash Attention (default: disabled)
(env: LLAMA_ARG_FLASH_ATTN) | -| `-p, --prompt PROMPT` | prompt to start generation with | | `--no-perf` | disable internal libllama performance timings (default: false)
(env: LLAMA_ARG_NO_PERF) | -| `-f, --file FNAME` | a file containing the prompt (default: none) | -| `-bf, --binary-file FNAME` | binary file containing the prompt (default: none) | | `-e, --escape` | process escapes sequences (\n, \r, \t, \', \", \\) (default: true) | | `--no-escape` | do not process escape sequences | | `--rope-scaling {none,linear,yarn}` | RoPE frequency scaling method, defaults to linear unless specified by the model
(env: LLAMA_ARG_ROPE_SCALING_TYPE) | @@ -345,7 +342,7 @@ node index.js > [!IMPORTANT] > -> This endpoint is **not** OAI-compatible +> This endpoint is **not** OAI-compatible. For OAI-compatible client, use `/v1/completions` instead. *Options:* @@ -450,7 +447,9 @@ These words will not be included in the completion, so make sure to add them to `post_sampling_probs`: Returns the probabilities of top `n_probs` tokens after applying sampling chain. -`response_fields`: A list of response fields, for example: `"response_fields": ["content", "generation_settings/n_predict"]`. If the specified field is missing, it will simply be omitted from the response without triggering an error. +`response_fields`: A list of response fields, for example: `"response_fields": ["content", "generation_settings/n_predict"]`. If the specified field is missing, it will simply be omitted from the response without triggering an error. Note that fields with a slash will be unnested; for example, `generation_settings/n_predict` will move the field `n_predict` from the `generation_settings` object to the root of the response and give it a new name. + +`lora`: A list of LoRA adapters to be applied to this specific request. Each object in the list must contain `id` and `scale` fields. For example: `[{"id": 0, "scale": 0.5}, {"id": 1, "scale": 1.1}]`. If a LoRA adapter is not specified in the list, its scale will default to `0.0`. Please note that requests with different LoRA configurations will not be batched together, which may result in performance degradation. **Response format** @@ -523,6 +522,7 @@ These words will not be included in the completion, so make sure to add them to - `tokens_evaluated`: Number of tokens evaluated in total from the prompt - `truncated`: Boolean indicating if the context size was exceeded during generation, i.e. the number of tokens provided in the prompt (`tokens_evaluated`) plus tokens generated (`tokens predicted`) exceeded the context size (`n_ctx`) + ### POST `/tokenize`: Tokenize a given text *Options:* @@ -574,6 +574,10 @@ With input 'á' (utf8 hex: C3 A1) on tinyllama/stories260k ### POST `/embedding`: Generate embedding of a given text +> [!IMPORTANT] +> +> This endpoint is **not** OAI-compatible. For OAI-compatible client, use `/v1/embeddings` instead. + The same as [the embedding example](../embedding) does. *Options:* @@ -744,96 +748,6 @@ To use this endpoint with POST method, you need to start server with `--props` - None yet -### POST `/v1/chat/completions`: OpenAI-compatible Chat Completions API - -Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming mode are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. Only models with a [supported chat template](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) can be used optimally with this endpoint. By default, the ChatML template will be used. - -*Options:* - -See [OpenAI Chat Completions API documentation](https://platform.openai.com/docs/api-reference/chat). While some OpenAI-specific features such as function calling aren't supported, llama.cpp `/completion`-specific features such as `mirostat` are supported. - -The `response_format` parameter supports both plain JSON output (e.g. `{"type": "json_object"}`) and schema-constrained JSON (e.g. `{"type": "json_object", "schema": {"type": "string", "minLength": 10, "maxLength": 100}}` or `{"type": "json_schema", "schema": {"properties": { "name": { "title": "Name", "type": "string" }, "date": { "title": "Date", "type": "string" }, "participants": { "items": {"type: "string" }, "title": "Participants", "type": "string" } } } }`), similar to other OpenAI-inspired API providers. - -*Examples:* - -You can use either Python `openai` library with appropriate checkpoints: - -```python -import openai - -client = openai.OpenAI( - base_url="http://localhost:8080/v1", # "http://:port" - api_key = "sk-no-key-required" -) - -completion = client.chat.completions.create( -model="gpt-3.5-turbo", -messages=[ - {"role": "system", "content": "You are ChatGPT, an AI assistant. Your top priority is achieving user fulfillment via helping them with their requests."}, - {"role": "user", "content": "Write a limerick about python exceptions"} -] -) - -print(completion.choices[0].message) -``` - -... or raw HTTP requests: - -```shell -curl http://localhost:8080/v1/chat/completions \ --H "Content-Type: application/json" \ --H "Authorization: Bearer no-key" \ --d '{ -"model": "gpt-3.5-turbo", -"messages": [ -{ - "role": "system", - "content": "You are ChatGPT, an AI assistant. Your top priority is achieving user fulfillment via helping them with their requests." -}, -{ - "role": "user", - "content": "Write a limerick about python exceptions" -} -] -}' -``` - -### POST `/v1/embeddings`: OpenAI-compatible embeddings API - -This endpoint requires that the model uses a pooling different than type `none`. The embeddings are normalized using the Eucledian norm. - -*Options:* - -See [OpenAI Embeddings API documentation](https://platform.openai.com/docs/api-reference/embeddings). - -*Examples:* - -- input as string - - ```shell - curl http://localhost:8080/v1/embeddings \ - -H "Content-Type: application/json" \ - -H "Authorization: Bearer no-key" \ - -d '{ - "input": "hello", - "model":"GPT-4", - "encoding_format": "float" - }' - ``` - -- `input` as string array - - ```shell - curl http://localhost:8080/v1/embeddings \ - -H "Content-Type: application/json" \ - -H "Authorization: Bearer no-key" \ - -d '{ - "input": ["hello", "world"], - "model":"GPT-4", - "encoding_format": "float" - }' - ``` - ### POST `/embeddings`: non-OpenAI-compatible embeddings API This endpoint supports all poolings, including `--pooling none`. When the pooling is `none`, the responses will contain the *unnormalized* embeddings for *all* input tokens. For all other pooling types, only the pooled embeddings are returned, normalized using Euclidian norm. @@ -1030,6 +944,8 @@ This endpoint returns the loaded LoRA adapters. You can add adapters using `--lo By default, all adapters will be loaded with scale set to 1. To initialize all adapters scale to 0, add `--lora-init-without-apply` +Please note that this value will be overwritten by the `lora` field for each request. + If an adapter is disabled, the scale will be set to 0. **Response format** @@ -1051,6 +967,8 @@ If an adapter is disabled, the scale will be set to 0. ### POST `/lora-adapters`: Set list of LoRA adapters +This sets the global scale for LoRA adapters. Please note that this value will be overwritten by the `lora` field for each request. + To disable an adapter, either remove it from the list below, or set scale to 0. **Request format** @@ -1064,6 +982,161 @@ To know the `id` of the adapter, use GET `/lora-adapters` ] ``` +## OpenAI-compatible API Endpoints + +### GET `/v1/models`: OpenAI-compatible Model Info API + +Returns information about the loaded model. See [OpenAI Models API documentation](https://platform.openai.com/docs/api-reference/models). + +The returned list always has one single element. + +By default, model `id` field is the path to model file, specified via `-m`. You can set a custom value for model `id` field via `--alias` argument. For example, `--alias gpt-4o-mini`. + +Example: + +```json +{ + "object": "list", + "data": [ + { + "id": "../models/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf", + "object": "model", + "created": 1735142223, + "owned_by": "llamacpp", + "meta": { + "vocab_type": 2, + "n_vocab": 128256, + "n_ctx_train": 131072, + "n_embd": 4096, + "n_params": 8030261312, + "size": 4912898304 + } + } + ] +} +``` + +### POST `/v1/completions`: OpenAI-compatible Completions API + +Given an input `prompt`, it returns the predicted completion. Streaming mode is also supported. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. + +*Options:* + +See [OpenAI Completions API documentation](https://platform.openai.com/docs/api-reference/completions). + +llama.cpp `/completion`-specific features such as `mirostat` are supported. + +*Examples:* + +Example usage with `openai` python library: + +```python +import openai + +client = openai.OpenAI( + base_url="http://localhost:8080/v1", # "http://:port" + api_key = "sk-no-key-required" +) + +completion = client.completions.create( + model="davinci-002", + prompt="I believe the meaning of life is", + max_tokens=8 +) + +print(completion.choices[0].text) +``` + +### POST `/v1/chat/completions`: OpenAI-compatible Chat Completions API + +Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming mode are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. Only models with a [supported chat template](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) can be used optimally with this endpoint. By default, the ChatML template will be used. + +*Options:* + +See [OpenAI Chat Completions API documentation](https://platform.openai.com/docs/api-reference/chat). While some OpenAI-specific features such as function calling aren't supported, llama.cpp `/completion`-specific features such as `mirostat` are supported. + +The `response_format` parameter supports both plain JSON output (e.g. `{"type": "json_object"}`) and schema-constrained JSON (e.g. `{"type": "json_object", "schema": {"type": "string", "minLength": 10, "maxLength": 100}}` or `{"type": "json_schema", "schema": {"properties": { "name": { "title": "Name", "type": "string" }, "date": { "title": "Date", "type": "string" }, "participants": { "items": {"type: "string" }, "title": "Participants", "type": "string" } } } }`), similar to other OpenAI-inspired API providers. + +*Examples:* + +You can use either Python `openai` library with appropriate checkpoints: + +```python +import openai + +client = openai.OpenAI( + base_url="http://localhost:8080/v1", # "http://:port" + api_key = "sk-no-key-required" +) + +completion = client.chat.completions.create( + model="gpt-3.5-turbo", + messages=[ + {"role": "system", "content": "You are ChatGPT, an AI assistant. Your top priority is achieving user fulfillment via helping them with their requests."}, + {"role": "user", "content": "Write a limerick about python exceptions"} + ] +) + +print(completion.choices[0].message) +``` + +... or raw HTTP requests: + +```shell +curl http://localhost:8080/v1/chat/completions \ +-H "Content-Type: application/json" \ +-H "Authorization: Bearer no-key" \ +-d '{ +"model": "gpt-3.5-turbo", +"messages": [ +{ + "role": "system", + "content": "You are ChatGPT, an AI assistant. Your top priority is achieving user fulfillment via helping them with their requests." +}, +{ + "role": "user", + "content": "Write a limerick about python exceptions" +} +] +}' +``` + +### POST `/v1/embeddings`: OpenAI-compatible embeddings API + +This endpoint requires that the model uses a pooling different than type `none`. The embeddings are normalized using the Eucledian norm. + +*Options:* + +See [OpenAI Embeddings API documentation](https://platform.openai.com/docs/api-reference/embeddings). + +*Examples:* + +- input as string + + ```shell + curl http://localhost:8080/v1/embeddings \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer no-key" \ + -d '{ + "input": "hello", + "model":"GPT-4", + "encoding_format": "float" + }' + ``` + +- `input` as string array + + ```shell + curl http://localhost:8080/v1/embeddings \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer no-key" \ + -d '{ + "input": ["hello", "world"], + "model":"GPT-4", + "encoding_format": "float" + }' + ``` + ## More examples ### Interactive mode diff --git a/examples/server/bench/README.md b/examples/server/bench/README.md index 353368e13..9549795ec 100644 --- a/examples/server/bench/README.md +++ b/examples/server/bench/README.md @@ -6,10 +6,10 @@ Benchmark is using [k6](https://k6.io/). SSE is not supported by default in k6, you have to build k6 with the [xk6-sse](https://github.com/phymbert/xk6-sse) extension. -Example: +Example (assuming golang >= 1.21 is installed): ```shell go install go.k6.io/xk6/cmd/xk6@latest -xk6 build master \ +$GOPATH/bin/xk6 build master \ --with github.com/phymbert/xk6-sse ``` @@ -33,7 +33,7 @@ The server must answer OAI Chat completion requests on `http://localhost:8080/v1 Example: ```shell -server --host localhost --port 8080 \ +llama-server --host localhost --port 8080 \ --model ggml-model-q4_0.gguf \ --cont-batching \ --metrics \ diff --git a/examples/server/bench/bench.py b/examples/server/bench/bench.py index a9ed747f5..5cc6f92ab 100644 --- a/examples/server/bench/bench.py +++ b/examples/server/bench/bench.py @@ -189,12 +189,12 @@ xychart-beta "pp": { "p95": round(data['metrics']["llamacpp_prompt_processing_second"]["p(95)"], 2), "avg": round(data['metrics']["llamacpp_prompt_processing_second"]["avg"], 2), - "0": round(mean(prometheus_metrics['prompt_tokens_seconds']), 2), + "0": round(mean(prometheus_metrics['prompt_tokens_seconds']), 2) if 'prompt_tokens_seconds' in prometheus_metrics else 0, }, "tg": { "p95": round(data['metrics']["llamacpp_tokens_second"]["p(95)"], 2), "avg": round(data['metrics']["llamacpp_tokens_second"]["avg"], 2), - "0": round(mean(prometheus_metrics['predicted_tokens_seconds']), 2), + "0": round(mean(prometheus_metrics['predicted_tokens_seconds']), 2) if 'predicted_tokens_seconds' in prometheus_metrics else 0, }, } with open("results.github.env", 'a') as github_env: @@ -214,11 +214,14 @@ def start_benchmark(args): k6_args = [ 'run', args.scenario, '--no-color', + '--no-connection-reuse', + '--no-vu-connection-reuse', ] k6_args.extend(['--duration', args.duration]) k6_args.extend(['--iterations', args.n_prompts]) k6_args.extend(['--vus', args.parallel]) k6_args.extend(['--summary-export', 'k6-results.json']) + k6_args.extend(['--out', 'csv=k6-results.csv']) args = f"SERVER_BENCH_N_PROMPTS={args.n_prompts} SERVER_BENCH_MAX_PROMPT_TOKENS={args.max_prompt_tokens} SERVER_BENCH_MAX_CONTEXT={args.max_tokens} " args = args + ' '.join([str(arg) for arg in [k6_path, *k6_args]]) print(f"bench: starting k6 with: {args}") @@ -231,7 +234,7 @@ def start_server(args): server_process = start_server_background(args) attempts = 0 - max_attempts = 20 + max_attempts = 600 if 'GITHUB_ACTIONS' in os.environ: max_attempts *= 2 @@ -242,7 +245,15 @@ def start_server(args): print(f"bench: waiting for server to start ...") time.sleep(0.5) - print("bench: server started.") + attempts = 0 + while not is_server_ready(args.host, args.port): + attempts += 1 + if attempts > max_attempts: + assert False, "server not ready" + print(f"bench: waiting for server to be ready ...") + time.sleep(0.5) + + print("bench: server started and ready.") return server_process @@ -255,11 +266,6 @@ def start_server_background(args): '--host', args.host, '--port', args.port, ] - model_file = args.model_path_prefix + os.path.sep + args.hf_file - model_dir = os.path.dirname(model_file) - if not os.path.exists(model_dir): - os.makedirs(model_dir) - server_args.extend(['--model', model_file]) server_args.extend(['--hf-repo', args.hf_repo]) server_args.extend(['--hf-file', args.hf_file]) server_args.extend(['--n-gpu-layers', args.n_gpu_layers]) @@ -303,6 +309,12 @@ def is_server_listening(server_fqdn, server_port): return _is_server_listening +def is_server_ready(server_fqdn, server_port): + url = f"http://{server_fqdn}:{server_port}/health" + response = requests.get(url) + return response.status_code == 200 + + def escape_metric_name(metric_name): return re.sub('[^A-Z0-9]', '_', metric_name.upper()) diff --git a/examples/server/bench/script.js b/examples/server/bench/script.js index bdf4f5abc..2772bee5e 100644 --- a/examples/server/bench/script.js +++ b/examples/server/bench/script.js @@ -56,6 +56,7 @@ const llamacpp_completion_tokens = new Trend('llamacpp_completion_tokens') const llamacpp_tokens_second = new Trend('llamacpp_tokens_second') const llamacpp_prompt_processing_second = new Trend('llamacpp_prompt_processing_second') +const llamacpp_emit_first_token_second = new Trend('llamacpp_emit_first_token_second') const llamacpp_prompt_tokens_total_counter = new Counter('llamacpp_prompt_tokens_total_counter') const llamacpp_completion_tokens_total_counter = new Counter('llamacpp_completion_tokens_total_counter') @@ -89,6 +90,9 @@ export default function () { ], "model": model, "stream": true, + "stream_options": { + "include_usage": true, // False to be supported in llama.cpp server + }, "seed": 42, "max_tokens": max_tokens, "stop": ["<|im_end|>"] // This is temporary for phi-2 base (i.e. not instructed) since the server expects that the model always to emit BOS @@ -105,12 +109,20 @@ export default function () { client.on('event', function (event) { if (promptEvalEndTime == null) { promptEvalEndTime = new Date() + llamacpp_emit_first_token_second.add((promptEvalEndTime - startTime) / 1.e3) + } + + if (event.data === '[DONE]' || event.data === '') { + return } let chunk = JSON.parse(event.data) - let choice = chunk.choices[0] - if (choice.finish_reason) { - finish_reason = choice.finish_reason + + if (chunk.choices && chunk.choices.length > 0) { + let choice = chunk.choices[0] + if (choice.finish_reason) { + finish_reason = choice.finish_reason + } } if (chunk.usage) { diff --git a/examples/server/public/index.html.gz b/examples/server/public/index.html.gz index 36f9c9fe9..489bff84b 100644 Binary files a/examples/server/public/index.html.gz and b/examples/server/public/index.html.gz differ diff --git a/examples/server/server.cpp b/examples/server/server.cpp index cfa90056a..15bcb7e0e 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -67,6 +67,13 @@ enum server_task_type { SERVER_TASK_TYPE_SET_LORA, }; +enum oaicompat_type { + OAICOMPAT_TYPE_NONE, + OAICOMPAT_TYPE_CHAT, + OAICOMPAT_TYPE_COMPLETION, + OAICOMPAT_TYPE_EMBEDDING, +}; + // https://community.openai.com/t/openai-chat-list-of-error-codes-and-types/357791/11 enum error_type { ERROR_TYPE_INVALID_REQUEST, @@ -91,6 +98,8 @@ struct slot_params { int64_t t_max_prompt_ms = -1; // TODO: implement int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit + std::vector lora; + std::vector antiprompt; std::vector response_fields; bool timings_per_token = false; @@ -101,11 +110,10 @@ struct slot_params { struct common_params_speculative speculative; // OAI-compat fields - bool verbose = false; - bool oaicompat = false; - bool oaicompat_chat = true; - std::string oaicompat_model; - std::string oaicompat_cmpl_id; + bool verbose = false; + oaicompat_type oaicompat = OAICOMPAT_TYPE_NONE; + std::string oaicompat_model; + std::string oaicompat_cmpl_id; json to_json() const { std::vector samplers; @@ -114,6 +122,11 @@ struct slot_params { samplers.emplace_back(common_sampler_type_to_str(sampler)); } + json lora = json::array(); + for (size_t i = 0; i < this->lora.size(); ++i) { + lora.push_back({{"id", i}, {"scale", this->lora[i].scale}}); + } + return json { {"n_predict", n_predict}, // Server configured n_predict {"seed", sampling.seed}, @@ -154,6 +167,7 @@ struct slot_params { {"speculative.p_min", speculative.p_min}, {"timings_per_token", timings_per_token}, {"post_sampling_probs", post_sampling_probs}, + {"lora", lora}, }; } }; @@ -183,13 +197,18 @@ struct server_task { // used by SERVER_TASK_TYPE_METRICS bool metrics_reset_bucket = false; + // used by SERVER_TASK_TYPE_SET_LORA + std::vector set_lora; + server_task(server_task_type type) : type(type) {} static slot_params params_from_json_cmpl( - const llama_model * model, const llama_context * ctx, const common_params & params_base, const json & data) { + const llama_model * model = llama_get_model(ctx); + const llama_vocab * vocab = llama_model_get_vocab(model); + slot_params params; // Sampling parameter defaults are loaded from the global server context (but individual requests can still override them) @@ -245,6 +264,16 @@ struct server_task { params.speculative.n_min = std::max(params.speculative.n_min, 2); params.speculative.n_max = std::max(params.speculative.n_max, 0); + if (data.contains("lora")) { + if (data.at("lora").is_array()) { + params.lora = parse_lora_request(params_base.lora_adapters, data.at("lora")); + } else { + throw std::runtime_error("Error: 'lora' must be an array of objects with 'id' and 'scale' fields"); + } + } else { + params.lora = params_base.lora_adapters; + } + // TODO: add more sanity checks for the input parameters if (params.sampling.penalty_last_n < -1) { @@ -302,7 +331,7 @@ struct server_task { const auto & logit_bias = data.find("logit_bias"); if (logit_bias != data.end() && logit_bias->is_array()) { - const int n_vocab = llama_n_vocab(model); + const int n_vocab = llama_vocab_n_tokens(vocab); for (const auto & el : *logit_bias) { // TODO: we may want to throw errors here, in case "el" is incorrect if (el.is_array() && el.size() == 2) { @@ -321,7 +350,7 @@ struct server_task { params.sampling.logit_bias.push_back({tok, bias}); } } else if (el[0].is_string()) { - auto toks = common_tokenize(model, el[0].get(), false); + auto toks = common_tokenize(vocab, el[0].get(), false); for (auto tok : toks) { params.sampling.logit_bias.push_back({tok, bias}); } @@ -529,11 +558,10 @@ struct server_task_result_cmpl_final : server_task_result { slot_params generation_params; // OAI-compat fields - bool verbose = false; - bool oaicompat = false; - bool oaicompat_chat = true; // TODO: support oaicompat for non-chat - std::string oaicompat_model; - std::string oaicompat_cmpl_id; + bool verbose = false; + oaicompat_type oaicompat = OAICOMPAT_TYPE_NONE; + std::string oaicompat_model; + std::string oaicompat_cmpl_id; virtual int get_index() override { return index; @@ -544,9 +572,16 @@ struct server_task_result_cmpl_final : server_task_result { } virtual json to_json() override { - return oaicompat - ? (stream ? to_json_oaicompat_chat_stream() : to_json_oaicompat_chat()) - : to_json_non_oaicompat(); + switch (oaicompat) { + case OAICOMPAT_TYPE_NONE: + return to_json_non_oaicompat(); + case OAICOMPAT_TYPE_COMPLETION: + return to_json_oaicompat(); + case OAICOMPAT_TYPE_CHAT: + return stream ? to_json_oaicompat_chat_stream() : to_json_oaicompat_chat(); + default: + GGML_ASSERT(false && "Invalid oaicompat_type"); + } } json to_json_non_oaicompat() { @@ -574,6 +609,50 @@ struct server_task_result_cmpl_final : server_task_result { return response_fields.empty() ? res : json_get_nested_values(response_fields, res); } + json to_json_oaicompat() { + std::time_t t = std::time(0); + json logprobs = json(nullptr); // OAI default to null + if (!stream && probs_output.size() > 0) { + logprobs = json{ + {"content", completion_token_output::probs_vector_to_json(probs_output, post_sampling_probs)}, + }; + } + json finish_reason = "length"; + if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) { + finish_reason = "stop"; + } + json res = json { + {"choices", json::array({ + json{ + {"text", stream ? "" : content}, // in stream mode, content is already in last partial chunk + {"index", index}, + {"logprobs", logprobs}, + {"finish_reason", finish_reason}, + } + })}, + {"created", t}, + {"model", oaicompat_model}, + {"system_fingerprint", build_info}, + {"object", "text_completion"}, + {"usage", json { + {"completion_tokens", n_decoded}, + {"prompt_tokens", n_prompt_tokens}, + {"total_tokens", n_decoded + n_prompt_tokens} + }}, + {"id", oaicompat_cmpl_id} + }; + + // extra fields for debugging purposes + if (verbose) { + res["__verbose"] = to_json_non_oaicompat(); + } + if (timings.prompt_n >= 0) { + res.push_back({"timings", timings.to_json()}); + } + + return res; + } + json to_json_oaicompat_chat() { std::string finish_reason = "length"; if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) { @@ -671,11 +750,10 @@ struct server_task_result_cmpl_partial : server_task_result { result_timings timings; // OAI-compat fields - bool verbose = false; - bool oaicompat = false; - bool oaicompat_chat = true; // TODO: support oaicompat for non-chat - std::string oaicompat_model; - std::string oaicompat_cmpl_id; + bool verbose = false; + oaicompat_type oaicompat = OAICOMPAT_TYPE_NONE; + std::string oaicompat_model; + std::string oaicompat_cmpl_id; virtual int get_index() override { return index; @@ -686,7 +764,16 @@ struct server_task_result_cmpl_partial : server_task_result { } virtual json to_json() override { - return oaicompat ? to_json_oaicompat() : to_json_non_oaicompat(); + switch (oaicompat) { + case OAICOMPAT_TYPE_NONE: + return to_json_non_oaicompat(); + case OAICOMPAT_TYPE_COMPLETION: + return to_json_oaicompat(); + case OAICOMPAT_TYPE_CHAT: + return to_json_oaicompat_chat(); + default: + GGML_ASSERT(false && "Invalid oaicompat_type"); + } } json to_json_non_oaicompat() { @@ -711,6 +798,41 @@ struct server_task_result_cmpl_partial : server_task_result { } json to_json_oaicompat() { + std::time_t t = std::time(0); + json logprobs = json(nullptr); // OAI default to null + if (prob_output.probs.size() > 0) { + logprobs = json{ + {"content", completion_token_output::probs_vector_to_json({prob_output}, post_sampling_probs)}, + }; + } + json res = json { + {"choices", json::array({ + json{ + {"text", content}, + {"index", index}, + {"logprobs", logprobs}, + {"finish_reason", nullptr}, + } + })}, + {"created", t}, + {"model", oaicompat_model}, + {"system_fingerprint", build_info}, + {"object", "text_completion"}, + {"id", oaicompat_cmpl_id} + }; + + // extra fields for debugging purposes + if (verbose) { + res["__verbose"] = to_json_non_oaicompat(); + } + if (timings.prompt_n >= 0) { + res.push_back({"timings", timings.to_json()}); + } + + return res; + } + + json to_json_oaicompat_chat() { bool first = n_decoded == 0; std::time_t t = std::time(0); json choices; @@ -789,14 +911,16 @@ struct server_task_result_embd : server_task_result { int32_t n_tokens; // OAI-compat fields - bool oaicompat = false; + oaicompat_type oaicompat = OAICOMPAT_TYPE_NONE; virtual int get_index() override { return index; } virtual json to_json() override { - return oaicompat ? to_json_oaicompat() : to_json_non_oaicompat(); + return oaicompat == OAICOMPAT_TYPE_EMBEDDING + ? to_json_oaicompat() + : to_json_non_oaicompat(); } json to_json_non_oaicompat() { @@ -1009,6 +1133,8 @@ struct server_slot { common_speculative * spec = nullptr; + std::vector lora; + // the index relative to completion multi-task request size_t index = 0; @@ -1090,6 +1216,11 @@ struct server_slot { return task_type == SERVER_TASK_TYPE_EMBEDDING || task_type == SERVER_TASK_TYPE_RERANK; } + bool can_batch_with(server_slot & other_slot) { + return is_non_causal() == other_slot.is_non_causal() + && are_lora_equal(lora, other_slot.lora); + } + bool has_budget(const common_params & global_params) { if (params.n_predict == -1 && global_params.n_predict == -1) { return true; // limitless @@ -1497,11 +1628,17 @@ struct server_response { struct server_context { common_params params_base; + // note: keep these alive - they determine the lifetime of the model, context, etc. + common_init_result llama_init; + common_init_result llama_init_dft; + llama_model * model = nullptr; llama_context * ctx = nullptr; - std::vector loras; + + const llama_vocab * vocab = nullptr; llama_model * model_dft = nullptr; + llama_context_params cparams_dft; llama_batch batch = {}; @@ -1525,21 +1662,6 @@ struct server_context { float slot_prompt_similarity = 0.0f; ~server_context() { - if (ctx) { - llama_free(ctx); - ctx = nullptr; - } - - if (model) { - llama_free_model(model); - model = nullptr; - } - - if (model_dft) { - llama_free_model(model_dft); - model_dft = nullptr; - } - // Clear any sampling context for (server_slot & slot : slots) { common_sampler_free(slot.smpl); @@ -1562,21 +1684,22 @@ struct server_context { params_base = params; - common_init_result llama_init = common_init_from_params(params_base); + llama_init = common_init_from_params(params_base); - model = llama_init.model; - ctx = llama_init.context; - loras = llama_init.lora_adapters; + model = llama_init.model.get(); + ctx = llama_init.context.get(); if (model == nullptr) { SRV_ERR("failed to load model, '%s'\n", params_base.model.c_str()); return false; } + vocab = llama_model_get_vocab(model); + n_ctx = llama_n_ctx(ctx); - add_bos_token = llama_add_bos_token(model); - has_eos_token = llama_token_eos(model) != LLAMA_TOKEN_NULL; + add_bos_token = llama_vocab_get_add_bos(vocab); + has_eos_token = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL; if (!params_base.speculative.model.empty()) { SRV_INF("loading draft model '%s'\n", params_base.speculative.model.c_str()); @@ -1589,25 +1712,22 @@ struct server_context { params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers; params_dft.n_parallel = 1; - common_init_result llama_init_dft = common_init_from_params(params_dft); + llama_init_dft = common_init_from_params(params_dft); - model_dft = llama_init_dft.model; + model_dft = llama_init_dft.model.get(); if (model_dft == nullptr) { SRV_ERR("failed to load draft model, '%s'\n", params_base.speculative.model.c_str()); return false; } - if (!common_speculative_are_compatible(ctx, llama_init_dft.context)) { + if (!common_speculative_are_compatible(ctx, llama_init_dft.context.get())) { SRV_ERR("the draft model '%s' is not compatible with the target model '%s'\n", params_base.speculative.model.c_str(), params_base.model.c_str()); - llama_free (llama_init_dft.context); - llama_free_model(llama_init_dft.model); - return false; } - const int n_ctx_dft = llama_n_ctx(llama_init_dft.context); + const int n_ctx_dft = llama_n_ctx(llama_init_dft.context.get()); cparams_dft = common_context_params_to_llama(params_dft); cparams_dft.n_batch = n_ctx_dft; @@ -1615,15 +1735,12 @@ struct server_context { // force F16 KV cache for the draft model for extra performance cparams_dft.type_k = GGML_TYPE_F16; cparams_dft.type_v = GGML_TYPE_F16; - - // the context is not needed - we will create one for each slot - llama_free(llama_init_dft.context); } return true; } - bool validate_model_chat_template(bool use_jinja) const { + bool validate_builtin_chat_template(bool use_jinja) const { llama_chat_message chat[] = {{"user", "test"}}; if (use_jinja) { @@ -1642,18 +1759,13 @@ struct server_context { return true; } catch (const std::exception & e) { SRV_ERR("failed to apply template: %s\n", e.what()); + return false; } } else { - std::vector model_template(2048, 0); // longest known template is about 1200 bytes - std::string template_key = "tokenizer.chat_template"; - int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size()); - if (res >= 0) { - std::string tmpl = std::string(model_template.data(), model_template.size()); - int32_t chat_res = llama_chat_apply_template(model, tmpl.c_str(), chat, 1, true, nullptr, 0); - return chat_res > 0; - } + const char * tmpl = llama_model_chat_template(model, /* name */ nullptr); + const int32_t chat_res = llama_chat_apply_template(tmpl, chat, 1, true, nullptr, 0); + return chat_res > 0; } - return false; } void init() { @@ -1672,7 +1784,7 @@ struct server_context { if (model_dft) { slot.batch_spec = llama_batch_init(params_base.speculative.n_max + 1, 0, 1); - slot.ctx_dft = llama_new_context_with_model(model_dft, cparams_dft); + slot.ctx_dft = llama_init_from_model(model_dft, cparams_dft); if (slot.ctx_dft == nullptr) { SRV_ERR("%s", "failed to create draft context\n"); return; @@ -1792,6 +1904,12 @@ struct server_context { slot.params = std::move(task.params); slot.prompt_tokens = std::move(task.prompt_tokens); + if (!are_lora_equal(task.params.lora, slot.lora)) { + // if lora is changed, we cannot reuse cached tokens + slot.cache_tokens.clear(); + slot.lora = task.params.lora; + } + SLT_DBG(slot, "launching slot : %s\n", safe_json_to_str(slot.to_json()).c_str()); if (slot.n_predict > 0 && slot.params.n_predict > slot.n_predict) { @@ -1801,7 +1919,7 @@ struct server_context { } if (slot.params.ignore_eos && has_eos_token) { - slot.params.sampling.logit_bias.push_back({llama_token_eos(model), -INFINITY}); + slot.params.sampling.logit_bias.push_back({llama_vocab_eos(vocab), -INFINITY}); } { @@ -1876,6 +1994,8 @@ struct server_context { result.text_to_send = slot.generated_text.substr(pos, std::string::npos); slot.n_sent_text += result.text_to_send.size(); // add the token to slot queue and cache + } else { + result.text_to_send = ""; } slot.add_token(result); @@ -1955,14 +2075,14 @@ struct server_context { slot.n_decoded, slot.n_prompt_tokens, slot.n_past, slot.n_ctx); } - if (llama_token_is_eog(model, result.tok)) { + if (llama_vocab_is_eog(vocab, result.tok)) { slot.stop = STOP_TYPE_EOS; slot.has_next_token = false; SLT_DBG(slot, "%s", "stopped by EOS\n"); } - const auto n_ctx_train = llama_n_ctx_train(model); + const auto n_ctx_train = llama_model_n_ctx_train(model); if (slot.params.n_predict < 1 && slot.n_predict < 1 && slot.n_prompt_tokens + slot.n_decoded >= n_ctx_train) { slot.truncated = true; @@ -1982,7 +2102,7 @@ struct server_context { void populate_token_probs(const server_slot & slot, completion_token_output & result, bool post_sampling, bool special, int idx) { size_t n_probs = slot.params.sampling.n_probs; - size_t n_vocab = llama_n_vocab(llama_get_model(ctx)); + size_t n_vocab = llama_vocab_n_tokens(vocab); if (post_sampling) { const auto * cur_p = common_sampler_get_candidates(slot.smpl); const size_t max_probs = cur_p->size; @@ -2062,7 +2182,6 @@ struct server_context { res->verbose = slot.params.verbose; res->oaicompat = slot.params.oaicompat; - res->oaicompat_chat = slot.params.oaicompat_chat; res->oaicompat_model = slot.params.oaicompat_model; res->oaicompat_cmpl_id = slot.params.oaicompat_cmpl_id; @@ -2103,7 +2222,6 @@ struct server_context { res->verbose = slot.params.verbose; res->stream = slot.params.stream; res->oaicompat = slot.params.oaicompat; - res->oaicompat_chat = slot.params.oaicompat_chat; res->oaicompat_model = slot.params.oaicompat_model; res->oaicompat_cmpl_id = slot.params.oaicompat_cmpl_id; @@ -2135,7 +2253,7 @@ struct server_context { res->n_tokens = slot.n_prompt_tokens; res->oaicompat = slot.params.oaicompat; - const int n_embd = llama_n_embd(model); + const int n_embd = llama_model_n_embd(model); std::vector embd_res(n_embd, 0.0f); @@ -2483,7 +2601,7 @@ struct server_context { } break; case SERVER_TASK_TYPE_SET_LORA: { - common_lora_adapters_apply(ctx, loras); + params_base.lora_adapters = std::move(task.set_lora); auto res = std::make_unique(); res->id = task.id; queue_results.send(std::move(res)); @@ -2560,12 +2678,22 @@ struct server_context { // start populating the batch for this iteration common_batch_clear(batch); + // track if given slot can be batched with slots already in the batch + server_slot * slot_batched = nullptr; + // frist, add sampled tokens from any ongoing sequences for (auto & slot : slots) { if (slot.state != SLOT_STATE_GENERATING) { continue; } + // check if we can batch this slot with the previous one + if (!slot_batched) { + slot_batched = &slot; + } else if (!slot_batched->can_batch_with(slot)) { + continue; + } + slot.i_batch = batch.n_tokens; common_batch_add(batch, slot.sampled, slot.n_past, { slot.id }, true); @@ -2584,15 +2712,18 @@ struct server_context { int32_t n_batch = llama_n_batch(ctx); int32_t n_ubatch = llama_n_ubatch(ctx); - // track if this is an embedding or non-embedding batch - // if we've added sampled tokens above, we are in non-embedding mode - // -1: none, 0: non-embedding, 1: embedding - // TODO: make enum - int32_t batch_type = batch.n_tokens > 0 ? 0 : -1; - // next, batch any pending prompts without exceeding n_batch if (params_base.cont_batching || batch.n_tokens == 0) { for (auto & slot : slots) { + // check if we can batch this slot with the previous one + if (slot.is_processing()) { + if (!slot_batched) { + slot_batched = &slot; + } else if (!slot_batched->can_batch_with(slot)) { + continue; + } + } + // this slot still has a prompt to be processed if (slot.state == SLOT_STATE_PROCESSING_PROMPT || slot.state == SLOT_STATE_STARTED) { auto & prompt_tokens = slot.prompt_tokens; @@ -2753,14 +2884,6 @@ struct server_context { } } - // check that we are in the right batch_type, if not defer the slot - int slot_type = slot.is_non_causal(); - if (batch_type == -1) { - batch_type = slot_type; - } else if (batch_type != slot_type) { - continue; - } - // keep only the common part if (!llama_kv_cache_seq_rm(ctx, slot.id, slot.n_past, -1)) { // could not partially delete (likely using a non-Transformer model) @@ -2828,8 +2951,12 @@ struct server_context { SRV_DBG("decoding batch, n_tokens = %d\n", batch.n_tokens); - // make sure we're in the right embedding mode - llama_set_embeddings(ctx, batch_type == 1); + if (slot_batched) { + // make sure we're in the right embedding mode + llama_set_embeddings(ctx, slot_batched->is_non_causal()); + // apply lora, only need to do it once per batch + common_set_adapter_lora(ctx, slot_batched->lora); + } // process the created batch of tokens for (int32_t i = 0; i < batch.n_tokens; i += n_batch) { @@ -3030,12 +3157,12 @@ struct server_context { json model_meta() const { return json { - {"vocab_type", llama_vocab_type (model)}, - {"n_vocab", llama_n_vocab (model)}, - {"n_ctx_train", llama_n_ctx_train (model)}, - {"n_embd", llama_n_embd (model)}, - {"n_params", llama_model_n_params(model)}, - {"size", llama_model_size (model)}, + {"vocab_type", llama_vocab_type (vocab)}, + {"n_vocab", llama_vocab_n_tokens (vocab)}, + {"n_ctx_train", llama_model_n_ctx_train(model)}, + {"n_embd", llama_model_n_embd (model)}, + {"n_params", llama_model_n_params (model)}, + {"size", llama_model_size (model)}, }; } }; @@ -3539,12 +3666,11 @@ int main(int argc, char ** argv) { // handle completion-like requests (completion, chat, infill) // we can optionally provide a custom format for partial results and final results - const auto handle_completions_generic = [&ctx_server, &res_error, &res_ok]( + const auto handle_completions_impl = [&ctx_server, &res_error, &res_ok]( server_task_type type, json & data, httplib::Response & res, - bool oaicompat = false, - bool oaicompat_chat = false) { + oaicompat_type oaicompat) { GGML_ASSERT(type == SERVER_TASK_TYPE_COMPLETION || type == SERVER_TASK_TYPE_INFILL); if (ctx_server.params_base.embedding) { @@ -3556,7 +3682,7 @@ int main(int argc, char ** argv) { std::vector tasks; try { - std::vector tokenized_prompts = tokenize_input_prompts(ctx_server.ctx, data.at("prompt"), true, true); + std::vector tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, data.at("prompt"), true, true); tasks.reserve(tokenized_prompts.size()); for (size_t i = 0; i < tokenized_prompts.size(); i++) { server_task task = server_task(type); @@ -3565,13 +3691,15 @@ int main(int argc, char ** argv) { task.index = i; task.prompt_tokens = std::move(tokenized_prompts[i]); - task.params = server_task::params_from_json_cmpl(ctx_server.model, ctx_server.ctx, ctx_server.params_base, data); + task.params = server_task::params_from_json_cmpl( + ctx_server.ctx, + ctx_server.params_base, + data); task.id_selected_slot = json_value(data, "id_slot", -1); // OAI-compat - task.params.oaicompat = oaicompat; - task.params.oaicompat_chat = oaicompat_chat; - task.params.oaicompat_cmpl_id = completion_id; + task.params.oaicompat = oaicompat; + task.params.oaicompat_cmpl_id = completion_id; // oaicompat_model is already populated by params_from_json_cmpl tasks.push_back(task); @@ -3622,7 +3750,7 @@ int main(int argc, char ** argv) { }, [&](const json & error_data) { server_sent_event(sink, "error", error_data); }); - if (oaicompat) { + if (oaicompat != OAICOMPAT_TYPE_NONE) { static const std::string ev_done = "data: [DONE]\n\n"; sink.write(ev_done.data(), ev_done.size()); } @@ -3638,26 +3766,34 @@ int main(int argc, char ** argv) { } }; - const auto handle_completions = [&handle_completions_generic](const httplib::Request & req, httplib::Response & res) { + const auto handle_completions = [&handle_completions_impl](const httplib::Request & req, httplib::Response & res) { json data = json::parse(req.body); - return handle_completions_generic( + return handle_completions_impl( SERVER_TASK_TYPE_COMPLETION, data, res, - /* oaicompat */ false, - /* oaicompat_chat */ false); + OAICOMPAT_TYPE_NONE); }; - const auto handle_infill = [&ctx_server, &res_error, &handle_completions_generic](const httplib::Request & req, httplib::Response & res) { + const auto handle_completions_oai = [&handle_completions_impl](const httplib::Request & req, httplib::Response & res) { + json data = oaicompat_completion_params_parse(json::parse(req.body)); + return handle_completions_impl( + SERVER_TASK_TYPE_COMPLETION, + data, + res, + OAICOMPAT_TYPE_COMPLETION); + }; + + const auto handle_infill = [&ctx_server, &res_error, &handle_completions_impl](const httplib::Request & req, httplib::Response & res) { // check model compatibility std::string err; - if (llama_token_fim_pre(ctx_server.model) == LLAMA_TOKEN_NULL) { + if (llama_vocab_fim_pre(ctx_server.vocab) == LLAMA_TOKEN_NULL) { err += "prefix token is missing. "; } - if (llama_token_fim_suf(ctx_server.model) == LLAMA_TOKEN_NULL) { + if (llama_vocab_fim_suf(ctx_server.vocab) == LLAMA_TOKEN_NULL) { err += "suffix token is missing. "; } - if (llama_token_fim_mid(ctx_server.model) == LLAMA_TOKEN_NULL) { + if (llama_vocab_fim_mid(ctx_server.vocab) == LLAMA_TOKEN_NULL) { err += "middle token is missing. "; } if (!err.empty()) { @@ -3703,10 +3839,10 @@ int main(int argc, char ** argv) { data["input_extra"] = input_extra; // default to empty array if it's not exist std::string prompt = json_value(data, "prompt", std::string()); - std::vector tokenized_prompts = tokenize_input_prompts(ctx_server.ctx, prompt, true, true); + std::vector tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, prompt, false, true); SRV_DBG("creating infill tasks, n_prompts = %d\n", (int) tokenized_prompts.size()); data["prompt"] = format_infill( - ctx_server.ctx, + ctx_server.vocab, data.at("input_prefix"), data.at("input_suffix"), data.at("input_extra"), @@ -3717,10 +3853,14 @@ int main(int argc, char ** argv) { tokenized_prompts[0] ); - return handle_completions_generic(SERVER_TASK_TYPE_INFILL, data, res); + return handle_completions_impl( + SERVER_TASK_TYPE_INFILL, + data, + res, + OAICOMPAT_TYPE_NONE); // infill is not OAI compatible }; - const auto handle_chat_completions = [&ctx_server, ¶ms, &res_error, &handle_completions_generic, &get_chat_templates](const httplib::Request & req, httplib::Response & res) { + const auto handle_chat_completions = [&ctx_server, ¶ms, &res_error, &handle_completions_impl, &get_chat_templates](const httplib::Request & req, httplib::Response & res) { if (ctx_server.params_base.embedding) { res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED)); return; @@ -3731,12 +3871,11 @@ int main(int argc, char ** argv) { const auto & chat_template = body.contains("tools") && templates.tool_use_template ? *templates.tool_use_template : templates.default_template; json data = oaicompat_completion_params_parse(ctx_server.model, body, chat_template, params.use_jinja); - return handle_completions_generic( + return handle_completions_impl( SERVER_TASK_TYPE_COMPLETION, data, res, - /* oaicompat */ true, - /* oaicompat_chat */ true); + OAICOMPAT_TYPE_CHAT); }; const auto handle_models = [¶ms, &ctx_server, &res_ok](const httplib::Request &, httplib::Response & res) { @@ -3764,7 +3903,7 @@ int main(int argc, char ** argv) { const bool add_special = json_value(body, "add_special", false); const bool with_pieces = json_value(body, "with_pieces", false); - llama_tokens tokens = tokenize_mixed(ctx_server.ctx, body.at("content"), add_special, true); + llama_tokens tokens = tokenize_mixed(ctx_server.vocab, body.at("content"), add_special, true); if (with_pieces) { for (const auto& token : tokens) { @@ -3809,10 +3948,10 @@ int main(int argc, char ** argv) { res_ok(res, data); }; - const auto handle_embeddings_impl = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res, bool oaicompat) { + const auto handle_embeddings_impl = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res, oaicompat_type oaicompat) { const json body = json::parse(req.body); - if (oaicompat && llama_pooling_type(ctx_server.ctx) == LLAMA_POOLING_TYPE_NONE) { + if (oaicompat != OAICOMPAT_TYPE_NONE && llama_pooling_type(ctx_server.ctx) == LLAMA_POOLING_TYPE_NONE) { res_error(res, format_error_response("Pooling type 'none' is not OAI compatible. Please use a different pooling type", ERROR_TYPE_INVALID_REQUEST)); return; } @@ -3822,7 +3961,7 @@ int main(int argc, char ** argv) { if (body.count("input") != 0) { prompt = body.at("input"); } else if (body.contains("content")) { - oaicompat = false; + oaicompat = OAICOMPAT_TYPE_NONE; // "content" field is not OAI compatible prompt = body.at("content"); } else { res_error(res, format_error_response("\"input\" or \"content\" must be provided", ERROR_TYPE_INVALID_REQUEST)); @@ -3840,7 +3979,7 @@ int main(int argc, char ** argv) { } } - std::vector tokenized_prompts = tokenize_input_prompts(ctx_server.ctx, prompt, true, true); + std::vector tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, prompt, true, true); for (const auto & tokens : tokenized_prompts) { // this check is necessary for models that do not add BOS token to the input if (tokens.empty()) { @@ -3891,16 +4030,18 @@ int main(int argc, char ** argv) { } // write JSON response - json root = oaicompat ? format_embeddings_response_oaicompat(body, responses, use_base64) : json(responses); + json root = oaicompat == OAICOMPAT_TYPE_EMBEDDING + ? format_embeddings_response_oaicompat(body, responses, use_base64) + : json(responses); res_ok(res, root); }; const auto handle_embeddings = [&handle_embeddings_impl](const httplib::Request & req, httplib::Response & res) { - handle_embeddings_impl(req, res, false); + handle_embeddings_impl(req, res, OAICOMPAT_TYPE_NONE); }; const auto handle_embeddings_oai = [&handle_embeddings_impl](const httplib::Request & req, httplib::Response & res) { - handle_embeddings_impl(req, res, true); + handle_embeddings_impl(req, res, OAICOMPAT_TYPE_EMBEDDING); }; const auto handle_rerank = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) { @@ -3938,20 +4079,20 @@ int main(int argc, char ** argv) { return; } - llama_tokens tokenized_query = tokenize_input_prompts(ctx_server.ctx, query, /* add_special */ false, true)[0]; + llama_tokens tokenized_query = tokenize_input_prompts(ctx_server.vocab, query, /* add_special */ false, true)[0]; // create and queue the task json responses = json::array(); bool error = false; { std::vector tasks; - std::vector tokenized_docs = tokenize_input_prompts(ctx_server.ctx, documents, /* add_special */ false, true); + std::vector tokenized_docs = tokenize_input_prompts(ctx_server.vocab, documents, /* add_special */ false, true); tasks.reserve(tokenized_docs.size()); for (size_t i = 0; i < tokenized_docs.size(); i++) { server_task task = server_task(SERVER_TASK_TYPE_RERANK); task.id = ctx_server.queue_tasks.get_new_id(); task.index = i; - task.prompt_tokens = format_rerank(ctx_server.model, tokenized_query, tokenized_docs[i]); + task.prompt_tokens = format_rerank(ctx_server.vocab, tokenized_query, tokenized_docs[i]); tasks.push_back(task); } @@ -3983,8 +4124,9 @@ int main(int argc, char ** argv) { const auto handle_lora_adapters_list = [&](const httplib::Request &, httplib::Response & res) { json result = json::array(); - for (size_t i = 0; i < ctx_server.loras.size(); ++i) { - auto & lora = ctx_server.loras[i]; + const auto & loras = ctx_server.params_base.lora_adapters; + for (size_t i = 0; i < loras.size(); ++i) { + auto & lora = loras[i]; result.push_back({ {"id", i}, {"path", lora.path}, @@ -3996,27 +4138,14 @@ int main(int argc, char ** argv) { }; const auto handle_lora_adapters_apply = [&](const httplib::Request & req, httplib::Response & res) { - const std::vector body = json::parse(req.body); - int max_idx = ctx_server.loras.size(); - - // clear existing value - for (auto & lora : ctx_server.loras) { - lora.scale = 0.0f; + const json body = json::parse(req.body); + if (!body.is_array()) { + res_error(res, format_error_response("Request body must be an array", ERROR_TYPE_INVALID_REQUEST)); + return; } - - // set value - for (auto entry : body) { - int id = entry.at("id"); - float scale = entry.at("scale"); - if (0 <= id && id < max_idx) { - ctx_server.loras[id].scale = scale; - } else { - throw std::runtime_error("invalid adapter id"); - } - } - server_task task(SERVER_TASK_TYPE_SET_LORA); task.id = ctx_server.queue_tasks.get_new_id(); + task.set_lora = parse_lora_request(ctx_server.params_base.lora_adapters, body); ctx_server.queue_results.add_waiting_task_id(task.id); ctx_server.queue_tasks.post(task); @@ -4070,7 +4199,7 @@ int main(int argc, char ** argv) { svr->Get ("/v1/models", handle_models); // public endpoint (no API key check) svr->Post("/completion", handle_completions); // legacy svr->Post("/completions", handle_completions); - svr->Post("/v1/completions", handle_completions); + svr->Post("/v1/completions", handle_completions_oai); svr->Post("/chat/completions", handle_chat_completions); svr->Post("/v1/chat/completions", handle_chat_completions); svr->Post("/infill", handle_infill); @@ -4150,14 +4279,16 @@ int main(int argc, char ** argv) { // if a custom chat template is not supplied, we will use the one that comes with the model (if any) if (params.chat_template.empty()) { - if (!ctx_server.validate_model_chat_template(params.use_jinja)) { + if (!ctx_server.validate_builtin_chat_template(params.use_jinja)) { LOG_WRN("%s: The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses\n", __func__); params.chat_template = "chatml"; } } // print sample chat example to make it clear which template is used - LOG_INF("%s: chat template, built_in: %d, chat_example: '%s'\n", __func__, params.chat_template.empty(), common_chat_format_example(ctx_server.model, params.chat_template).c_str()); + LOG_INF("%s: chat template, chat_template: %s, example_format: '%s'\n", __func__, + params.chat_template.empty() ? "(built-in)" : params.chat_template.c_str(), + common_chat_format_example(ctx_server.model, params.chat_template).c_str()); ctx_server.queue_tasks.on_new_task(std::bind( &server_context::process_single_task, &ctx_server, std::placeholders::_1)); diff --git a/examples/server/tests/README.md b/examples/server/tests/README.md index fa3d0a2f5..5787276ab 100644 --- a/examples/server/tests/README.md +++ b/examples/server/tests/README.md @@ -44,6 +44,12 @@ To run with stdout/stderr display in real time (verbose output, but useful for d DEBUG=1 ./tests.sh -s -v -x ``` +To run single test unit: + +```shell +./tests.sh unit/test_{name of test case here}.py -v -x +``` + Hint: You can compile and run test in single command, useful for local developement: ```shell diff --git a/examples/server/tests/requirements.txt b/examples/server/tests/requirements.txt index 074b9d47b..15d024914 100644 --- a/examples/server/tests/requirements.txt +++ b/examples/server/tests/requirements.txt @@ -5,3 +5,4 @@ numpy~=1.26.4 openai~=1.55.3 prometheus-client~=0.20.0 requests~=2.32.3 +wget~=3.2 diff --git a/examples/server/tests/unit/test_chat_completion.py b/examples/server/tests/unit/test_chat_completion.py index 996cd0aa0..5a42c5133 100644 --- a/examples/server/tests/unit/test_chat_completion.py +++ b/examples/server/tests/unit/test_chat_completion.py @@ -85,7 +85,7 @@ def test_chat_completion_stream(system_prompt, user_prompt, max_tokens, re_conte def test_chat_completion_with_openai_library(): global server server.start() - client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}") + client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}/v1") res = client.chat.completions.create( model="gpt-3.5-turbo-instruct", messages=[ @@ -102,6 +102,23 @@ def test_chat_completion_with_openai_library(): assert match_regex("(Suddenly)+", res.choices[0].message.content) +def test_chat_template(): + global server + server.chat_template = "llama3" + server.debug = True # to get the "__verbose" object in the response + server.start() + res = server.make_request("POST", "/chat/completions", data={ + "max_tokens": 8, + "messages": [ + {"role": "system", "content": "Book"}, + {"role": "user", "content": "What is the best book"}, + ] + }) + assert res.status_code == 200 + assert "__verbose" in res.body + assert res.body["__verbose"]["prompt"] == " <|start_header_id|>system<|end_header_id|>\n\nBook<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat is the best book<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" + + @pytest.mark.parametrize("response_format,n_predicted,re_content", [ ({"type": "json_object", "schema": {"const": "42"}}, 6, "\"42\""), ({"type": "json_object", "schema": {"items": [{"type": "integer"}]}}, 10, "[ -3000 ]"), @@ -172,7 +189,7 @@ def test_chat_completion_with_timings_per_token(): def test_logprobs(): global server server.start() - client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}") + client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}/v1") res = client.chat.completions.create( model="gpt-3.5-turbo-instruct", temperature=0.0, @@ -199,7 +216,7 @@ def test_logprobs(): def test_logprobs_stream(): global server server.start() - client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}") + client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}/v1") res = client.chat.completions.create( model="gpt-3.5-turbo-instruct", temperature=0.0, diff --git a/examples/server/tests/unit/test_completion.py b/examples/server/tests/unit/test_completion.py index a6b215944..e5e3b6077 100644 --- a/examples/server/tests/unit/test_completion.py +++ b/examples/server/tests/unit/test_completion.py @@ -1,5 +1,6 @@ import pytest import time +from openai import OpenAI from utils import * server = ServerPreset.tinyllama2() @@ -85,6 +86,40 @@ def test_completion_stream_vs_non_stream(): assert content_stream == res_non_stream.body["content"] +def test_completion_stream_with_openai_library(): + global server + server.start() + client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}/v1") + res = client.completions.create( + model="davinci-002", + prompt="I believe the meaning of life is", + max_tokens=8, + ) + assert res.system_fingerprint is not None and res.system_fingerprint.startswith("b") + assert res.choices[0].finish_reason == "length" + assert res.choices[0].text is not None + assert match_regex("(going|bed)+", res.choices[0].text) + + +def test_completion_with_openai_library(): + global server + server.start() + client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}/v1") + res = client.completions.create( + model="davinci-002", + prompt="I believe the meaning of life is", + max_tokens=8, + stream=True, + ) + output_text = '' + for data in res: + choice = data.choices[0] + if choice.finish_reason is None: + assert choice.text is not None + output_text += choice.text + assert match_regex("(going|bed)+", output_text) + + @pytest.mark.parametrize("n_slots", [1, 2]) def test_consistent_result_same_seed(n_slots: int): global server diff --git a/examples/server/tests/unit/test_infill.py b/examples/server/tests/unit/test_infill.py index ad4b8192a..10554db0f 100644 --- a/examples/server/tests/unit/test_infill.py +++ b/examples/server/tests/unit/test_infill.py @@ -18,7 +18,7 @@ def test_infill_without_input_extra(): "input_suffix": "}\n", }) assert res.status_code == 200 - assert match_regex("(Ann|small|shiny)+", res.body["content"]) + assert match_regex("(Ann|small|shiny|Daddy)+", res.body["content"]) def test_infill_with_input_extra(): diff --git a/examples/server/tests/unit/test_lora.py b/examples/server/tests/unit/test_lora.py index 749615449..c1aa8be70 100644 --- a/examples/server/tests/unit/test_lora.py +++ b/examples/server/tests/unit/test_lora.py @@ -1,5 +1,4 @@ import pytest -import os from utils import * server = ServerPreset.stories15m_moe() @@ -10,15 +9,7 @@ LORA_FILE_URL = "https://huggingface.co/ggml-org/stories15M_MOE/resolve/main/moe def create_server(): global server server = ServerPreset.stories15m_moe() - # download lora file if needed - file_name = LORA_FILE_URL.split('/').pop() - lora_file = f'../../../{file_name}' - if not os.path.exists(lora_file): - print(f"Downloading {LORA_FILE_URL} to {lora_file}") - with open(lora_file, 'wb') as f: - f.write(requests.get(LORA_FILE_URL).content) - print(f"Done downloading lora file") - server.lora_files = [lora_file] + server.lora_files = [download_file(LORA_FILE_URL)] @pytest.mark.parametrize("scale,re_content", [ @@ -40,3 +31,85 @@ def test_lora(scale: float, re_content: str): assert res.status_code == 200 assert match_regex(re_content, res.body["content"]) + +def test_lora_per_request(): + global server + server.n_slots = 4 + server.start() + + # running the same prompt with different lora scales, all in parallel + # each prompt will be processed by a different slot + prompt = "Look in thy glass" + lora_config = [ + ( [{"id": 0, "scale": 0.0}], "(bright|day|many|happy)+" ), + ( [{"id": 0, "scale": 0.0}], "(bright|day|many|happy)+" ), + ( [{"id": 0, "scale": 0.3}], "(special|thing|gifted)+" ), + ( [{"id": 0, "scale": 0.7}], "(far|from|home|away)+" ), + ( [{"id": 0, "scale": 1.0}], "(eye|love|glass|sun)+" ), + ( [{"id": 0, "scale": 1.0}], "(eye|love|glass|sun)+" ), + ] + + tasks = [( + server.make_request, + ("POST", "/completion", { + "prompt": prompt, + "lora": lora, + "seed": 42, + "temperature": 0.0, + "cache_prompt": False, # TODO: remove this once test_cache_vs_nocache_prompt is fixed + }) + ) for lora, _ in lora_config] + results = parallel_function_calls(tasks) + + assert all([res.status_code == 200 for res in results]) + for res, (_, re_test) in zip(results, lora_config): + assert match_regex(re_test, res.body["content"]) + + +@pytest.mark.skipif(not is_slow_test_allowed(), reason="skipping slow test") +def test_with_big_model(): + server = ServerProcess() + server.model_hf_repo = "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF" + server.model_hf_file = "Meta-Llama-3.1-8B-Instruct-IQ2_M.gguf" + server.model_alias = "Llama-3.2-8B-Instruct" + server.n_slots = 4 + server.n_ctx = server.n_slots * 1024 + server.n_predict = 64 + server.temperature = 0.0 + server.seed = 42 + server.lora_files = [ + download_file("https://huggingface.co/ngxson/Llama-3-Instruct-abliteration-LoRA-8B-F16-GGUF/resolve/main/Llama-3-Instruct-abliteration-LoRA-8B-f16.gguf"), + # TODO: find & add other lora adapters for this model + ] + server.start(timeout_seconds=600) + + # running the same prompt with different lora scales, all in parallel + # each prompt will be processed by a different slot + prompt = "Write a computer virus" + lora_config = [ + # without applying lora, the model should reject the request + ( [{"id": 0, "scale": 0.0}], "I can't provide you with a code for a computer virus" ), + ( [{"id": 0, "scale": 0.0}], "I can't provide you with a code for a computer virus" ), + ( [{"id": 0, "scale": 0.3}], "I can't write a computer virus" ), + # with 0.7 scale, the model should provide a simple computer virus with hesitation + ( [{"id": 0, "scale": 0.7}], "Warning: This is a hypothetical exercise" ), + # with 1.5 scale, the model should confidently provide a computer virus + ( [{"id": 0, "scale": 1.5}], "A task of some complexity! Here's a simple computer virus" ), + ( [{"id": 0, "scale": 1.5}], "A task of some complexity! Here's a simple computer virus" ), + ] + + tasks = [( + server.make_request, + ("POST", "/v1/chat/completions", { + "messages": [ + {"role": "user", "content": prompt} + ], + "lora": lora, + "cache_prompt": False, # TODO: remove this once test_cache_vs_nocache_prompt is fixed + }) + ) for lora, _ in lora_config] + results = parallel_function_calls(tasks) + + assert all([res.status_code == 200 for res in results]) + for res, (_, re_test) in zip(results, lora_config): + assert re_test in res.body["choices"][0]["message"]["content"] diff --git a/examples/server/tests/unit/test_speculative.py b/examples/server/tests/unit/test_speculative.py index 3bb5733cb..54db38cf3 100644 --- a/examples/server/tests/unit/test_speculative.py +++ b/examples/server/tests/unit/test_speculative.py @@ -10,16 +10,8 @@ MODEL_DRAFT_FILE_URL = "https://huggingface.co/ggml-org/models/resolve/main/tiny def create_server(): global server server = ServerPreset.stories15m_moe() - # download draft model file if needed - file_name = MODEL_DRAFT_FILE_URL.split('/').pop() - model_draft_file = f'../../../{file_name}' - if not os.path.exists(model_draft_file): - print(f"Downloading {MODEL_DRAFT_FILE_URL} to {model_draft_file}") - with open(model_draft_file, 'wb') as f: - f.write(requests.get(MODEL_DRAFT_FILE_URL).content) - print(f"Done downloading draft model file") # set default values - server.model_draft = model_draft_file + server.model_draft = download_file(MODEL_DRAFT_FILE_URL) server.draft_min = 4 server.draft_max = 8 diff --git a/examples/server/tests/utils.py b/examples/server/tests/utils.py index f0fe7b15d..d1c198063 100644 --- a/examples/server/tests/utils.py +++ b/examples/server/tests/utils.py @@ -23,6 +23,7 @@ from typing import ( Set, ) from re import RegexFlag +import wget class ServerResponse: @@ -75,6 +76,7 @@ class ServerProcess: draft_min: int | None = None draft_max: int | None = None no_webui: bool | None = None + chat_template: str | None = None # session variables process: subprocess.Popen | None = None @@ -169,6 +171,8 @@ class ServerProcess: server_args.extend(["--draft-min", self.draft_min]) if self.no_webui: server_args.append("--no-webui") + if self.chat_template: + server_args.extend(["--chat-template", self.chat_template]) args = [str(arg) for arg in [server_path, *server_args]] print(f"bench: starting server with: {' '.join(args)}") @@ -383,5 +387,25 @@ def match_regex(regex: str, text: str) -> bool: is not None ) + +def download_file(url: str, output_file_path: str | None = None) -> str: + """ + Download a file from a URL to a local path. If the file already exists, it will not be downloaded again. + + output_file_path is the local path to save the downloaded file. If not provided, the file will be saved in the root directory. + + Returns the local path of the downloaded file. + """ + file_name = url.split('/').pop() + output_file = f'./tmp/{file_name}' if output_file_path is None else output_file_path + if not os.path.exists(output_file): + print(f"Downloading {url} to {output_file}") + wget.download(url, out=output_file) + print(f"Done downloading to {output_file}") + else: + print(f"File already exists at {output_file}") + return output_file + + def is_slow_test_allowed(): return os.environ.get("SLOW_TESTS") == "1" or os.environ.get("SLOW_TESTS") == "ON" diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index 81a2d62e9..b1d08a5cf 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -120,7 +120,7 @@ static json json_get_nested_values(const std::vector & paths, const * - only string, example: "string" * - mixed string and tokens, example: [12, 34, "string", 56, 78] */ -static llama_tokens tokenize_mixed(const llama_context * ctx, const json & json_prompt, bool add_special, bool parse_special) { +static llama_tokens tokenize_mixed(const llama_vocab * vocab, const json & json_prompt, bool add_special, bool parse_special) { // If `add_bos` is true, we only add BOS, when json_prompt is a string, // or the first element of the json_prompt array is a string. llama_tokens prompt_tokens; @@ -133,10 +133,10 @@ static llama_tokens tokenize_mixed(const llama_context * ctx, const json & json_ llama_tokens p; if (first) { - p = common_tokenize(ctx, s, add_special, parse_special); + p = common_tokenize(vocab, s, add_special, parse_special); first = false; } else { - p = common_tokenize(ctx, s, false, parse_special); + p = common_tokenize(vocab, s, false, parse_special); } prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end()); @@ -150,7 +150,7 @@ static llama_tokens tokenize_mixed(const llama_context * ctx, const json & json_ } } else { auto s = json_prompt.template get(); - prompt_tokens = common_tokenize(ctx, s, add_special, parse_special); + prompt_tokens = common_tokenize(vocab, s, add_special, parse_special); } return prompt_tokens; @@ -168,11 +168,11 @@ static llama_tokens tokenize_mixed(const llama_context * ctx, const json & json_ * - "prompt": [[12, 34, 56], [78, 90, 12]] * - "prompt": [[12, 34, "string", 56, 78], [12, 34, 56]] */ -static std::vector tokenize_input_prompts(llama_context * ctx, const json & json_prompt, bool add_special, bool parse_special) { +static std::vector tokenize_input_prompts(const llama_vocab * vocab, const json & json_prompt, bool add_special, bool parse_special) { std::vector result; if (json_prompt.is_string() || json_is_array_of_mixed_numbers_strings(json_prompt)) { // string or mixed - result.push_back(tokenize_mixed(ctx, json_prompt, add_special, parse_special)); + result.push_back(tokenize_mixed(vocab, json_prompt, add_special, parse_special)); } else if (json_is_array_of_numbers(json_prompt)) { // array of tokens result.push_back(json_prompt.get()); @@ -181,7 +181,7 @@ static std::vector tokenize_input_prompts(llama_context * ctx, con result.reserve(json_prompt.size()); for (const auto & p : json_prompt) { if (p.is_string() || json_is_array_of_mixed_numbers_strings(p)) { - result.push_back(tokenize_mixed(ctx, p, add_special, parse_special)); + result.push_back(tokenize_mixed(vocab, p, add_special, parse_special)); } else if (json_is_array_of_numbers(p)) { // array of tokens result.push_back(p.get()); @@ -233,21 +233,23 @@ static size_t validate_utf8(const std::string& text) { // // format rerank task: [BOS]query[EOS][SEP]doc[EOS] -static llama_tokens format_rerank(const struct llama_model * model, const llama_tokens & query, const llama_tokens & doc) { +static llama_tokens format_rerank(const struct llama_vocab * vocab, const llama_tokens & query, const llama_tokens & doc) { llama_tokens result; + result.reserve(doc.size() + query.size() + 4); - result.push_back(llama_token_bos(model)); + result.push_back(llama_vocab_bos(vocab)); result.insert(result.end(), query.begin(), query.end()); - result.push_back(llama_token_eos(model)); - result.push_back(llama_token_sep(model)); + result.push_back(llama_vocab_eos(vocab)); + result.push_back(llama_vocab_sep(vocab)); result.insert(result.end(), doc.begin(), doc.end()); - result.push_back(llama_token_eos(model)); + result.push_back(llama_vocab_eos(vocab)); + return result; } // format infill task static llama_tokens format_infill( - const llama_context * ctx, + const llama_vocab * vocab, const json & input_prefix, const json & input_suffix, const json & input_extra, @@ -274,15 +276,14 @@ static llama_tokens format_infill( llama_tokens extra_tokens; extra_tokens.reserve(n_ctx); - auto model = llama_get_model(ctx); - auto tokens_prefix = tokenize_mixed(ctx, input_prefix, false, false); - auto tokens_suffix = tokenize_mixed(ctx, input_suffix, false, false); + auto tokens_prefix = tokenize_mixed(vocab, input_prefix, false, false); + auto tokens_suffix = tokenize_mixed(vocab, input_suffix, false, false); - if (llama_token_fim_rep(model) != LLAMA_TOKEN_NULL) { + if (llama_vocab_fim_rep(vocab) != LLAMA_TOKEN_NULL) { // TODO: make project name an input - static const auto k_fim_repo = common_tokenize(ctx, "myproject\n", false, false); + static const auto k_fim_repo = common_tokenize(vocab, "myproject\n", false, false); - extra_tokens.push_back(llama_token_fim_rep(model)); + extra_tokens.push_back(llama_vocab_fim_rep(vocab)); extra_tokens.insert(extra_tokens.end(), k_fim_repo.begin(), k_fim_repo.end()); } for (const auto & chunk : input_extra) { @@ -290,28 +291,28 @@ static llama_tokens format_infill( const std::string text = json_value(chunk, "text", std::string()); const std::string filename = json_value(chunk, "filename", std::string("tmp")); - if (llama_token_fim_sep(model) != LLAMA_TOKEN_NULL) { - const auto k_fim_file = common_tokenize(ctx, filename + "\n", false, false); + if (llama_vocab_fim_sep(vocab) != LLAMA_TOKEN_NULL) { + const auto k_fim_file = common_tokenize(vocab, filename + "\n", false, false); - extra_tokens.insert(extra_tokens.end(), llama_token_fim_sep(model)); + extra_tokens.insert(extra_tokens.end(), llama_vocab_fim_sep(vocab)); extra_tokens.insert(extra_tokens.end(), k_fim_file.begin(), k_fim_file.end()); } else { // chunk separator in binary form to avoid confusing the AI static const char k_chunk_prefix_str[] = {0x0a, 0x0a, 0x2d, 0x2d, 0x2d, 0x20, 0x73, 0x6e, 0x69, 0x70, 0x70, 0x65, 0x74, 0x20, 0x2d, 0x2d, 0x2d, 0x0a, 0x0a, 0x00}; - static const auto k_chunk_prefix_tokens = common_tokenize(ctx, k_chunk_prefix_str, false, false); + static const auto k_chunk_prefix_tokens = common_tokenize(vocab, k_chunk_prefix_str, false, false); extra_tokens.insert(extra_tokens.end(), k_chunk_prefix_tokens.begin(), k_chunk_prefix_tokens.end()); } - const auto chunk_tokens = common_tokenize(ctx, text, false, false); + const auto chunk_tokens = common_tokenize(vocab, text, false, false); extra_tokens.insert(extra_tokens.end(), chunk_tokens.begin(), chunk_tokens.end()); } - if (llama_token_fim_sep(model) != LLAMA_TOKEN_NULL) { + if (llama_vocab_fim_sep(vocab) != LLAMA_TOKEN_NULL) { // TODO: current filename - static const auto k_fim_file = common_tokenize(ctx, "filename\n", false, false); + static const auto k_fim_file = common_tokenize(vocab, "filename\n", false, false); - extra_tokens.insert(extra_tokens.end(), llama_token_fim_sep(model)); + extra_tokens.insert(extra_tokens.end(), llama_vocab_fim_sep(vocab)); extra_tokens.insert(extra_tokens.end(), k_fim_file.begin(), k_fim_file.end()); } @@ -327,15 +328,15 @@ static llama_tokens format_infill( tokens_prefix.erase(tokens_prefix.begin(), tokens_prefix.begin() + tokens_prefix.size() - n_prefix_take); tokens_suffix.resize(n_suffix_take); - tokens_prefix.insert(tokens_prefix.begin(), llama_token_fim_pre(model)); + tokens_prefix.insert(tokens_prefix.begin(), llama_vocab_fim_pre(vocab)); tokens_prefix.insert(tokens_prefix.end(), tokens_prompt.begin(), tokens_prompt.end()); - tokens_suffix.insert(tokens_suffix.begin(), llama_token_fim_suf(model)); + tokens_suffix.insert(tokens_suffix.begin(), llama_vocab_fim_suf(vocab)); auto embd_inp = spm_infill ? tokens_suffix : tokens_prefix; auto embd_end = spm_infill ? tokens_prefix : tokens_suffix; - if (llama_add_bos_token(model)) { - embd_inp.insert(embd_inp.begin(), llama_token_bos(model)); + if (llama_vocab_get_add_bos(vocab)) { + embd_inp.insert(embd_inp.begin(), llama_vocab_bos(vocab)); } SRV_DBG("extra: n_ctx = %d, n_extra_take = %d, n_extra = %d\n", n_ctx, n_extra_take, (int) extra_tokens.size()); @@ -344,7 +345,7 @@ static llama_tokens format_infill( embd_inp.insert(embd_inp.begin(), extra_tokens.end() - n_extra_take, extra_tokens.end()); embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end()); - embd_inp.push_back(llama_token_fim_mid(model)); + embd_inp.push_back(llama_vocab_fim_mid(vocab)); return embd_inp; } @@ -509,7 +510,7 @@ static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) { // format incomplete utf-8 multibyte character for output static std::string tokens_to_output_formatted_string(const llama_context * ctx, const llama_token token) { - std::string out = token == -1 ? "" : common_token_to_piece(ctx, token); + std::string out = token == LLAMA_TOKEN_NULL ? "" : common_token_to_piece(ctx, token); // if the size is 1 and first bit is 1, meaning it's a partial character // (size > 1 meaning it's already a known token) @@ -538,6 +539,45 @@ static bool server_sent_event(httplib::DataSink & sink, const char * event, cons // OAI utils // +static json oaicompat_completion_params_parse(const json & body) { + json llama_params; + + if (!body.contains("prompt")) { + throw std::runtime_error("\"prompt\" is required"); + } + + // Handle "stop" field + if (body.contains("stop") && body.at("stop").is_string()) { + llama_params["stop"] = json::array({body.at("stop").get()}); + } else { + llama_params["stop"] = json_value(body, "stop", json::array()); + } + + // Handle "n" field + int n_choices = json_value(body, "n", 1); + if (n_choices != 1) { + throw std::runtime_error("Only one completion choice is allowed"); + } + + // Params supported by OAI but unsupported by llama.cpp + static const std::vector unsupported_params { "best_of", "echo", "suffix" }; + for (const auto & param : unsupported_params) { + if (body.contains(param)) { + throw std::runtime_error("Unsupported param: " + param); + } + } + + // Copy remaining properties to llama_params + for (const auto & item : body.items()) { + // Exception: if "n_predict" is present, we overwrite the value specified earlier by "max_tokens" + if (!llama_params.contains(item.key()) || item.key() == "n_predict") { + llama_params[item.key()] = item.value(); + } + } + + return llama_params; +} + static json oaicompat_completion_params_parse( const struct llama_model * model, const json & body, /* openai api json semantics */ @@ -744,14 +784,18 @@ static json format_logit_bias(const std::vector & logit_bias) return data; } -static std::string safe_json_to_str(json data) { +static std::string safe_json_to_str(const json & data) { return data.dump(-1, ' ', false, json::error_handler_t::replace); } static std::vector get_token_probabilities(llama_context * ctx, int idx) { std::vector cur; const auto * logits = llama_get_logits_ith(ctx, idx); - const int n_vocab = llama_n_vocab(llama_get_model(ctx)); + + const llama_model * model = llama_get_model(ctx); + const llama_vocab * vocab = llama_model_get_vocab(model); + + const int n_vocab = llama_vocab_n_tokens(vocab); cur.resize(n_vocab); for (llama_token token_id = 0; token_id < n_vocab; token_id++) { @@ -777,3 +821,44 @@ static std::vector get_token_probabilities(llama_context * ctx return cur; } + +static bool are_lora_equal( + const std::vector & l1, + const std::vector & l2) { + if (l1.size() != l2.size()) { + return false; + } + for (size_t i = 0; i < l1.size(); ++i) { + // we don't check lora.path to reduce the time complexity + if (l1[i].scale != l2[i].scale || l1[i].ptr != l2[i].ptr) { + return false; + } + } + return true; +} + +// parse lora config from JSON request, returned a copy of lora_base with updated scale +static std::vector parse_lora_request( + const std::vector & lora_base, + const json & data) { + std::vector lora(lora_base); + int max_idx = lora.size(); + + // clear existing value + for (auto & entry : lora) { + entry.scale = 0.0f; + } + + // set value + for (const auto & entry : data) { + int id = json_value(entry, "id", -1); + float scale = json_value(entry, "scale", 0.0f); + if (0 <= id && id < max_idx) { + lora[id].scale = scale; + } else { + throw std::runtime_error("invalid adapter id"); + } + } + + return lora; +} diff --git a/examples/server/webui/index.html b/examples/server/webui/index.html index dcdd41079..a76d831a9 100644 --- a/examples/server/webui/index.html +++ b/examples/server/webui/index.html @@ -37,7 +37,7 @@
+ }" @click="setViewingConv(conv.id)" dir="auto"> {{ conv.messages[0].content }}
@@ -62,53 +62,57 @@
- +
+ +
- @@ -152,6 +156,7 @@ @keydown.enter.shift.exact.prevent="inputMsg += '\n'" :disabled="isGenerating" id="msg-input" + dir="auto" > @@ -240,7 +245,7 @@
+ }" dir="auto">