diff --git a/.devops/full-musa.Dockerfile b/.devops/full-musa.Dockerfile new file mode 100644 index 000000000..34ba856d3 --- /dev/null +++ b/.devops/full-musa.Dockerfile @@ -0,0 +1,26 @@ +ARG UBUNTU_VERSION=22.04 +# This needs to generally match the container host's environment. +ARG MUSA_VERSION=rc3.1.0 +# Target the MUSA build image +ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION} + +FROM ${BASE_MUSA_DEV_CONTAINER} AS build + +RUN apt-get update && \ + apt-get install -y build-essential cmake python3 python3-pip git libcurl4-openssl-dev libgomp1 + +COPY requirements.txt requirements.txt +COPY requirements requirements + +RUN pip install --upgrade pip setuptools wheel \ + && pip install -r requirements.txt + +WORKDIR /app + +COPY . . + +RUN cmake -B build -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \ + cmake --build build --config Release -j$(nproc) && \ + cp build/bin/* . + +ENTRYPOINT ["/app/.devops/tools.sh"] diff --git a/.devops/llama-cli-musa.Dockerfile b/.devops/llama-cli-musa.Dockerfile new file mode 100644 index 000000000..b5696794f --- /dev/null +++ b/.devops/llama-cli-musa.Dockerfile @@ -0,0 +1,30 @@ +ARG UBUNTU_VERSION=22.04 +# This needs to generally match the container host's environment. +ARG MUSA_VERSION=rc3.1.0 +# Target the MUSA build image +ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION} +# Target the MUSA runtime image +ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION} + +FROM ${BASE_MUSA_DEV_CONTAINER} AS build + +RUN apt-get update && \ + apt-get install -y build-essential git cmake + +WORKDIR /app + +COPY . . + +RUN cmake -B build -DGGML_MUSA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \ + cmake --build build --config Release --target llama-cli -j$(nproc) + +FROM ${BASE_MUSA_RUN_CONTAINER} AS runtime + +RUN apt-get update && \ + apt-get install -y libgomp1 + +COPY --from=build /app/build/ggml/src/libggml.so /libggml.so +COPY --from=build /app/build/src/libllama.so /libllama.so +COPY --from=build /app/build/bin/llama-cli /llama-cli + +ENTRYPOINT [ "/llama-cli" ] diff --git a/.devops/llama-server-musa.Dockerfile b/.devops/llama-server-musa.Dockerfile new file mode 100644 index 000000000..193a6d77c --- /dev/null +++ b/.devops/llama-server-musa.Dockerfile @@ -0,0 +1,35 @@ +ARG UBUNTU_VERSION=22.04 +# This needs to generally match the container host's environment. +ARG MUSA_VERSION=rc3.1.0 +# Target the MUSA build image +ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION} +# Target the MUSA runtime image +ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION} + +FROM ${BASE_MUSA_DEV_CONTAINER} AS build + +RUN apt-get update && \ + apt-get install -y build-essential git cmake libcurl4-openssl-dev + +WORKDIR /app + +COPY . . + +RUN cmake -B build -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \ + cmake --build build --config Release --target llama-server -j$(nproc) + +FROM ${BASE_MUSA_RUN_CONTAINER} AS runtime + +RUN apt-get update && \ + apt-get install -y libcurl4-openssl-dev libgomp1 curl + +COPY --from=build /app/build/ggml/src/libggml.so /libggml.so +COPY --from=build /app/build/src/libllama.so /libllama.so +COPY --from=build /app/build/bin/llama-server /llama-server + +# Must be set to 0.0.0.0 so it can listen to requests from host machine +ENV LLAMA_ARG_HOST=0.0.0.0 + +HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ] + +ENTRYPOINT [ "/llama-server" ] diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index a4ac9b217..a953cdac9 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -43,6 +43,9 @@ jobs: - { tag: "light-cuda", dockerfile: ".devops/llama-cli-cuda.Dockerfile", platforms: "linux/amd64" } - { tag: "server-cuda", dockerfile: ".devops/llama-server-cuda.Dockerfile", platforms: "linux/amd64" } - { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" } + - { tag: "light-musa", dockerfile: ".devops/llama-cli-musa.Dockerfile", platforms: "linux/amd64" } + - { tag: "server-musa", dockerfile: ".devops/llama-server-musa.Dockerfile", platforms: "linux/amd64" } + - { tag: "full-musa", dockerfile: ".devops/full-musa.Dockerfile", platforms: "linux/amd64" } # Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete #- { tag: "light-rocm", dockerfile: ".devops/llama-cli-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" } #- { tag: "server-rocm", dockerfile: ".devops/llama-server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" } diff --git a/CMakeLists.txt b/CMakeLists.txt index 415743c2a..64a335378 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -63,7 +63,7 @@ option(LLAMA_SANITIZE_ADDRESS "llama: enable address sanitizer" OFF) option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer" OFF) # utils -option(LLAMA_BUILD_COMMON "llama: build common utils library" ON) +option(LLAMA_BUILD_COMMON "llama: build common utils library" ${LLAMA_STANDALONE}) # extra artifacts option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE}) @@ -201,12 +201,12 @@ if (LLAMA_BUILD_COMMON) add_subdirectory(common) endif() -if (LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION) +if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION) include(CTest) add_subdirectory(tests) endif() -if (LLAMA_BUILD_EXAMPLES) +if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_EXAMPLES) add_subdirectory(examples) add_subdirectory(pocs) endif() diff --git a/README.md b/README.md index 41e5e5448..08fe8cc92 100644 --- a/README.md +++ b/README.md @@ -31,7 +31,7 @@ variety of hardware - locally and in the cloud. - Apple silicon is a first-class citizen - optimized via ARM NEON, Accelerate and Metal frameworks - AVX, AVX2 and AVX512 support for x86 architectures - 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory use -- Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP) +- Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP and Moore Threads MTT GPUs via MUSA) - Vulkan and SYCL backend support - CPU+GPU hybrid inference to partially accelerate models larger than the total VRAM capacity @@ -130,6 +130,7 @@ Typically finetunes of the base models below are supported as well. - Flutter/Dart: [netdur/llama_cpp_dart](https://github.com/netdur/llama_cpp_dart) - PHP (API bindings and features built on top of llama.cpp): [distantmagic/resonance](https://github.com/distantmagic/resonance) [(more info)](https://github.com/ggerganov/llama.cpp/pull/6326) - Guile Scheme: [guile_llama_cpp](https://savannah.nongnu.org/projects/guile-llama-cpp) +- Swift [srgtuszy/llama-cpp-swift](https://github.com/srgtuszy/llama-cpp-swift) **UI:** @@ -413,7 +414,7 @@ Please refer to [Build llama.cpp locally](./docs/build.md) | [BLAS](./docs/build.md#blas-build) | All | | [BLIS](./docs/backend/BLIS.md) | All | | [SYCL](./docs/backend/SYCL.md) | Intel and Nvidia GPU | -| [MUSA](./docs/build.md#musa) | Moore Threads GPU | +| [MUSA](./docs/build.md#musa) | Moore Threads MTT GPU | | [CUDA](./docs/build.md#cuda) | Nvidia GPU | | [hipBLAS](./docs/build.md#hipblas) | AMD GPU | | [Vulkan](./docs/build.md#vulkan) | GPU | diff --git a/common/arg.cpp b/common/arg.cpp index 2a85ad845..d6a8e1f6f 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -17,27 +17,27 @@ using json = nlohmann::ordered_json; -llama_arg & llama_arg::set_examples(std::initializer_list examples) { +common_arg & common_arg::set_examples(std::initializer_list examples) { this->examples = std::move(examples); return *this; } -llama_arg & llama_arg::set_env(const char * env) { +common_arg & common_arg::set_env(const char * env) { help = help + "\n(env: " + env + ")"; this->env = env; return *this; } -llama_arg & llama_arg::set_sparam() { +common_arg & common_arg::set_sparam() { is_sparam = true; return *this; } -bool llama_arg::in_example(enum llama_example ex) { +bool common_arg::in_example(enum llama_example ex) { return examples.find(ex) != examples.end(); } -bool llama_arg::get_value_from_env(std::string & output) { +bool common_arg::get_value_from_env(std::string & output) { if (env == nullptr) return false; char * value = std::getenv(env); if (value) { @@ -47,7 +47,7 @@ bool llama_arg::get_value_from_env(std::string & output) { return false; } -bool llama_arg::has_value_from_env() { +bool common_arg::has_value_from_env() { return env != nullptr && std::getenv(env); } @@ -78,7 +78,7 @@ static std::vector break_str_into_lines(std::string input, size_t m return result; } -std::string llama_arg::to_string() { +std::string common_arg::to_string() { // params for printing to console const static int n_leading_spaces = 40; const static int n_char_per_line_help = 70; // TODO: detect this based on current console @@ -119,33 +119,7 @@ std::string llama_arg::to_string() { // utils // -#ifdef __GNUC__ -#ifdef __MINGW32__ -#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__))) -#else -#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__))) -#endif -#else -#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) -#endif - -LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2) -static std::string format(const char * fmt, ...) { - va_list ap; - va_list ap2; - va_start(ap, fmt); - va_copy(ap2, ap); - int size = vsnprintf(NULL, 0, fmt, ap); - GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT - std::vector buf(size + 1); - int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2); - GGML_ASSERT(size2 == size); - va_end(ap2); - va_end(ap); - return std::string(buf.data(), size); -} - -static void gpt_params_handle_model_default(gpt_params & params) { +static void common_params_handle_model_default(common_params & params) { if (!params.hf_repo.empty()) { // short-hand to avoid specifying --hf-file -> default it to --model if (params.hf_file.empty()) { @@ -171,12 +145,12 @@ static void gpt_params_handle_model_default(gpt_params & params) { // CLI argument parsing functions // -static bool gpt_params_parse_ex(int argc, char ** argv, gpt_params_context & ctx_arg) { +static bool common_params_parse_ex(int argc, char ** argv, common_params_context & ctx_arg) { std::string arg; const std::string arg_prefix = "--"; - gpt_params & params = ctx_arg.params; + common_params & params = ctx_arg.params; - std::unordered_map arg_to_options; + std::unordered_map arg_to_options; for (auto & opt : ctx_arg.options) { for (const auto & arg : opt.args) { arg_to_options[arg] = &opt; @@ -199,7 +173,7 @@ static bool gpt_params_parse_ex(int argc, char ** argv, gpt_params_context & ctx continue; } } catch (std::exception & e) { - throw std::invalid_argument(format( + throw std::invalid_argument(string_format( "error while handling environment variable \"%s\": %s\n\n", opt.env, e.what())); } } @@ -220,7 +194,7 @@ static bool gpt_params_parse_ex(int argc, char ** argv, gpt_params_context & ctx std::replace(arg.begin(), arg.end(), '_', '-'); } if (arg_to_options.find(arg) == arg_to_options.end()) { - throw std::invalid_argument(format("error: invalid argument: %s", arg.c_str())); + throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str())); } auto opt = *arg_to_options[arg]; if (opt.has_value_from_env()) { @@ -252,7 +226,7 @@ static bool gpt_params_parse_ex(int argc, char ** argv, gpt_params_context & ctx continue; } } catch (std::exception & e) { - throw std::invalid_argument(format( + throw std::invalid_argument(string_format( "error while handling argument \"%s\": %s\n\n" "usage:\n%s\n\nto show complete usage, run with -h", arg.c_str(), e.what(), arg_to_options[arg]->to_string().c_str())); @@ -268,7 +242,7 @@ static bool gpt_params_parse_ex(int argc, char ** argv, gpt_params_context & ctx throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n"); } - gpt_params_handle_model_default(params); + common_params_handle_model_default(params); if (params.escape) { string_process_escapes(params.prompt); @@ -291,16 +265,16 @@ static bool gpt_params_parse_ex(int argc, char ** argv, gpt_params_context & ctx return true; } -static void gpt_params_print_usage(gpt_params_context & ctx_arg) { - auto print_options = [](std::vector & options) { - for (llama_arg * opt : options) { +static void common_params_print_usage(common_params_context & ctx_arg) { + auto print_options = [](std::vector & options) { + for (common_arg * opt : options) { printf("%s", opt->to_string().c_str()); } }; - std::vector common_options; - std::vector sparam_options; - std::vector specific_options; + std::vector common_options; + std::vector sparam_options; + std::vector specific_options; for (auto & opt : ctx_arg.options) { // in case multiple LLAMA_EXAMPLE_* are set, we prioritize the LLAMA_EXAMPLE_* matching current example if (opt.is_sparam) { @@ -320,17 +294,17 @@ static void gpt_params_print_usage(gpt_params_context & ctx_arg) { print_options(specific_options); } -bool gpt_params_parse(int argc, char ** argv, gpt_params & params, llama_example ex, void(*print_usage)(int, char **)) { - auto ctx_arg = gpt_params_parser_init(params, ex, print_usage); - const gpt_params params_org = ctx_arg.params; // the example can modify the default params +bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **)) { + auto ctx_arg = common_params_parser_init(params, ex, print_usage); + const common_params params_org = ctx_arg.params; // the example can modify the default params try { - if (!gpt_params_parse_ex(argc, argv, ctx_arg)) { + if (!common_params_parse_ex(argc, argv, ctx_arg)) { ctx_arg.params = params_org; return false; } if (ctx_arg.params.usage) { - gpt_params_print_usage(ctx_arg); + common_params_print_usage(ctx_arg); if (ctx_arg.print_usage) { ctx_arg.print_usage(argc, argv); } @@ -345,16 +319,16 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params, llama_example return true; } -gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, void(*print_usage)(int, char **)) { - gpt_params_context ctx_arg(params); +common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) { + common_params_context ctx_arg(params); ctx_arg.print_usage = print_usage; ctx_arg.ex = ex; std::string sampler_type_chars; std::string sampler_type_names; for (const auto & sampler : params.sparams.samplers) { - sampler_type_chars += gpt_sampler_type_to_chr(sampler); - sampler_type_names += gpt_sampler_type_to_str(sampler) + ";"; + sampler_type_chars += common_sampler_type_to_chr(sampler); + sampler_type_names += common_sampler_type_to_str(sampler) + ";"; } sampler_type_names.pop_back(); @@ -366,374 +340,374 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, * - if LLAMA_EXAMPLE_* is set (other than COMMON), we only show the option in the corresponding example * - if both {LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_*,} are set, we will prioritize the LLAMA_EXAMPLE_* matching current example */ - auto add_opt = [&](llama_arg arg) { + auto add_opt = [&](common_arg arg) { if (arg.in_example(ex) || arg.in_example(LLAMA_EXAMPLE_COMMON)) { ctx_arg.options.push_back(std::move(arg)); } }; - add_opt(llama_arg( + add_opt(common_arg( {"-h", "--help", "--usage"}, "print usage and exit", - [](gpt_params & params) { + [](common_params & params) { params.usage = true; } )); - add_opt(llama_arg( + add_opt(common_arg( {"--version"}, "show version and build info", - [](gpt_params &) { + [](common_params &) { fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT); fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET); exit(0); } )); - add_opt(llama_arg( + add_opt(common_arg( {"--verbose-prompt"}, - format("print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false"), - [](gpt_params & params) { + string_format("print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false"), + [](common_params & params) { params.verbose_prompt = true; } )); - add_opt(llama_arg( + add_opt(common_arg( {"--no-display-prompt"}, - format("don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false"), - [](gpt_params & params) { + string_format("don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false"), + [](common_params & params) { params.display_prompt = false; } ).set_examples({LLAMA_EXAMPLE_MAIN})); - add_opt(llama_arg( + add_opt(common_arg( {"-co", "--color"}, - format("colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false"), - [](gpt_params & params) { + string_format("colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false"), + [](common_params & params) { params.use_color = true; } ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP})); - add_opt(llama_arg( + add_opt(common_arg( {"-t", "--threads"}, "N", - format("number of threads to use during generation (default: %d)", params.cpuparams.n_threads), - [](gpt_params & params, int value) { + string_format("number of threads to use during generation (default: %d)", params.cpuparams.n_threads), + [](common_params & params, int value) { params.cpuparams.n_threads = value; if (params.cpuparams.n_threads <= 0) { params.cpuparams.n_threads = std::thread::hardware_concurrency(); } } ).set_env("LLAMA_ARG_THREADS")); - add_opt(llama_arg( + add_opt(common_arg( {"-tb", "--threads-batch"}, "N", "number of threads to use during batch and prompt processing (default: same as --threads)", - [](gpt_params & params, int value) { + [](common_params & params, int value) { params.cpuparams_batch.n_threads = value; if (params.cpuparams_batch.n_threads <= 0) { params.cpuparams_batch.n_threads = std::thread::hardware_concurrency(); } } )); - add_opt(llama_arg( + add_opt(common_arg( {"-td", "--threads-draft"}, "N", "number of threads to use during generation (default: same as --threads)", - [](gpt_params & params, int value) { + [](common_params & params, int value) { params.draft_cpuparams.n_threads = value; if (params.draft_cpuparams.n_threads <= 0) { params.draft_cpuparams.n_threads = std::thread::hardware_concurrency(); } } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(llama_arg( + add_opt(common_arg( {"-tbd", "--threads-batch-draft"}, "N", "number of threads to use during batch and prompt processing (default: same as --threads-draft)", - [](gpt_params & params, int value) { + [](common_params & params, int value) { params.draft_cpuparams_batch.n_threads = value; if (params.draft_cpuparams_batch.n_threads <= 0) { params.draft_cpuparams_batch.n_threads = std::thread::hardware_concurrency(); } } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(llama_arg( + add_opt(common_arg( {"-C", "--cpu-mask"}, "M", "CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: \"\")", - [](gpt_params & params, const std::string & mask) { + [](common_params & params, const std::string & mask) { params.cpuparams.mask_valid = true; if (!parse_cpu_mask(mask, params.cpuparams.cpumask)) { throw std::invalid_argument("invalid cpumask"); } } )); - add_opt(llama_arg( + add_opt(common_arg( {"-Cr", "--cpu-range"}, "lo-hi", "range of CPUs for affinity. Complements --cpu-mask", - [](gpt_params & params, const std::string & range) { + [](common_params & params, const std::string & range) { params.cpuparams.mask_valid = true; if (!parse_cpu_range(range, params.cpuparams.cpumask)) { throw std::invalid_argument("invalid range"); } } )); - add_opt(llama_arg( + add_opt(common_arg( {"--cpu-strict"}, "<0|1>", - format("use strict CPU placement (default: %u)\n", (unsigned) params.cpuparams.strict_cpu), - [](gpt_params & params, const std::string & value) { + string_format("use strict CPU placement (default: %u)\n", (unsigned) params.cpuparams.strict_cpu), + [](common_params & params, const std::string & value) { params.cpuparams.strict_cpu = std::stoul(value); } )); - add_opt(llama_arg( + add_opt(common_arg( {"--prio"}, "N", - format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority), - [](gpt_params & params, int prio) { + string_format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority), + [](common_params & params, int prio) { if (prio < 0 || prio > 3) { throw std::invalid_argument("invalid value"); } params.cpuparams.priority = (enum ggml_sched_priority) prio; } )); - add_opt(llama_arg( + add_opt(common_arg( {"--poll"}, "<0...100>", - format("use polling level to wait for work (0 - no polling, default: %u)\n", (unsigned) params.cpuparams.poll), - [](gpt_params & params, const std::string & value) { + string_format("use polling level to wait for work (0 - no polling, default: %u)\n", (unsigned) params.cpuparams.poll), + [](common_params & params, const std::string & value) { params.cpuparams.poll = std::stoul(value); } )); - add_opt(llama_arg( + add_opt(common_arg( {"-Cb", "--cpu-mask-batch"}, "M", "CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask)", - [](gpt_params & params, const std::string & mask) { + [](common_params & params, const std::string & mask) { params.cpuparams_batch.mask_valid = true; if (!parse_cpu_mask(mask, params.cpuparams_batch.cpumask)) { throw std::invalid_argument("invalid cpumask"); } } )); - add_opt(llama_arg( + add_opt(common_arg( {"-Crb", "--cpu-range-batch"}, "lo-hi", "ranges of CPUs for affinity. Complements --cpu-mask-batch", - [](gpt_params & params, const std::string & range) { + [](common_params & params, const std::string & range) { params.cpuparams_batch.mask_valid = true; if (!parse_cpu_range(range, params.cpuparams_batch.cpumask)) { throw std::invalid_argument("invalid range"); } } )); - add_opt(llama_arg( + add_opt(common_arg( {"--cpu-strict-batch"}, "<0|1>", "use strict CPU placement (default: same as --cpu-strict)", - [](gpt_params & params, int value) { + [](common_params & params, int value) { params.cpuparams_batch.strict_cpu = value; } )); - add_opt(llama_arg( + add_opt(common_arg( {"--prio-batch"}, "N", - format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams_batch.priority), - [](gpt_params & params, int prio) { + string_format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams_batch.priority), + [](common_params & params, int prio) { if (prio < 0 || prio > 3) { throw std::invalid_argument("invalid value"); } params.cpuparams_batch.priority = (enum ggml_sched_priority) prio; } )); - add_opt(llama_arg( + add_opt(common_arg( {"--poll-batch"}, "<0|1>", "use polling to wait for work (default: same as --poll)", - [](gpt_params & params, int value) { + [](common_params & params, int value) { params.cpuparams_batch.poll = value; } )); - add_opt(llama_arg( + add_opt(common_arg( {"-Cd", "--cpu-mask-draft"}, "M", "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)", - [](gpt_params & params, const std::string & mask) { + [](common_params & params, const std::string & mask) { params.draft_cpuparams.mask_valid = true; if (!parse_cpu_mask(mask, params.draft_cpuparams.cpumask)) { throw std::invalid_argument("invalid cpumask"); } } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(llama_arg( + add_opt(common_arg( {"-Crd", "--cpu-range-draft"}, "lo-hi", "Ranges of CPUs for affinity. Complements --cpu-mask-draft", - [](gpt_params & params, const std::string & range) { + [](common_params & params, const std::string & range) { params.draft_cpuparams.mask_valid = true; if (!parse_cpu_range(range, params.draft_cpuparams.cpumask)) { throw std::invalid_argument("invalid range"); } } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(llama_arg( + add_opt(common_arg( {"--cpu-strict-draft"}, "<0|1>", "Use strict CPU placement for draft model (default: same as --cpu-strict)", - [](gpt_params & params, int value) { + [](common_params & params, int value) { params.draft_cpuparams.strict_cpu = value; } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(llama_arg( + add_opt(common_arg( {"--prio-draft"}, "N", - format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams.priority), - [](gpt_params & params, int prio) { + string_format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams.priority), + [](common_params & params, int prio) { if (prio < 0 || prio > 3) { throw std::invalid_argument("invalid value"); } params.draft_cpuparams.priority = (enum ggml_sched_priority) prio; } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(llama_arg( + add_opt(common_arg( {"--poll-draft"}, "<0|1>", "Use polling to wait for draft model work (default: same as --poll])", - [](gpt_params & params, int value) { + [](common_params & params, int value) { params.draft_cpuparams.poll = value; } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(llama_arg( + add_opt(common_arg( {"-Cbd", "--cpu-mask-batch-draft"}, "M", "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)", - [](gpt_params & params, const std::string & mask) { + [](common_params & params, const std::string & mask) { params.draft_cpuparams_batch.mask_valid = true; if (!parse_cpu_mask(mask, params.draft_cpuparams_batch.cpumask)) { throw std::invalid_argument("invalid cpumask"); } } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(llama_arg( + add_opt(common_arg( {"-Crbd", "--cpu-range-batch-draft"}, "lo-hi", "Ranges of CPUs for affinity. Complements --cpu-mask-draft-batch)", - [](gpt_params & params, const std::string & range) { + [](common_params & params, const std::string & range) { params.draft_cpuparams_batch.mask_valid = true; if (!parse_cpu_range(range, params.draft_cpuparams_batch.cpumask)) { throw std::invalid_argument("invalid cpumask"); } } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(llama_arg( + add_opt(common_arg( {"--cpu-strict-batch-draft"}, "<0|1>", "Use strict CPU placement for draft model (default: --cpu-strict-draft)", - [](gpt_params & params, int value) { + [](common_params & params, int value) { params.draft_cpuparams_batch.strict_cpu = value; } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(llama_arg( + add_opt(common_arg( {"--prio-batch-draft"}, "N", - format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams_batch.priority), - [](gpt_params & params, int prio) { + string_format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams_batch.priority), + [](common_params & params, int prio) { if (prio < 0 || prio > 3) { throw std::invalid_argument("invalid value"); } params.draft_cpuparams_batch.priority = (enum ggml_sched_priority) prio; } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(llama_arg( + add_opt(common_arg( {"--poll-batch-draft"}, "<0|1>", "Use polling to wait for draft model work (default: --poll-draft)", - [](gpt_params & params, int value) { + [](common_params & params, int value) { params.draft_cpuparams_batch.poll = value; } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(llama_arg( + add_opt(common_arg( {"--draft"}, "N", - format("number of tokens to draft for speculative decoding (default: %d)", params.n_draft), - [](gpt_params & params, int value) { + string_format("number of tokens to draft for speculative decoding (default: %d)", params.n_draft), + [](common_params & params, int value) { params.n_draft = value; } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP})); - add_opt(llama_arg( + add_opt(common_arg( {"-ps", "--p-split"}, "N", - format("speculative decoding split probability (default: %.1f)", (double)params.p_split), - [](gpt_params & params, const std::string & value) { + string_format("speculative decoding split probability (default: %.1f)", (double)params.p_split), + [](common_params & params, const std::string & value) { params.p_split = std::stof(value); } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(llama_arg( + add_opt(common_arg( {"-lcs", "--lookup-cache-static"}, "FNAME", "path to static lookup cache to use for lookup decoding (not updated by generation)", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.lookup_cache_static = value; } ).set_examples({LLAMA_EXAMPLE_LOOKUP})); - add_opt(llama_arg( + add_opt(common_arg( {"-lcd", "--lookup-cache-dynamic"}, "FNAME", "path to dynamic lookup cache to use for lookup decoding (updated by generation)", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.lookup_cache_dynamic = value; } ).set_examples({LLAMA_EXAMPLE_LOOKUP})); - add_opt(llama_arg( + add_opt(common_arg( {"-c", "--ctx-size"}, "N", - format("size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx), - [](gpt_params & params, int value) { + string_format("size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx), + [](common_params & params, int value) { params.n_ctx = value; } ).set_env("LLAMA_ARG_CTX_SIZE")); - add_opt(llama_arg( + add_opt(common_arg( {"-n", "--predict", "--n-predict"}, "N", - format("number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict), - [](gpt_params & params, int value) { + string_format("number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict), + [](common_params & params, int value) { params.n_predict = value; } ).set_env("LLAMA_ARG_N_PREDICT")); - add_opt(llama_arg( + add_opt(common_arg( {"-b", "--batch-size"}, "N", - format("logical maximum batch size (default: %d)", params.n_batch), - [](gpt_params & params, int value) { + string_format("logical maximum batch size (default: %d)", params.n_batch), + [](common_params & params, int value) { params.n_batch = value; } ).set_env("LLAMA_ARG_BATCH")); - add_opt(llama_arg( + add_opt(common_arg( {"-ub", "--ubatch-size"}, "N", - format("physical maximum batch size (default: %d)", params.n_ubatch), - [](gpt_params & params, int value) { + string_format("physical maximum batch size (default: %d)", params.n_ubatch), + [](common_params & params, int value) { params.n_ubatch = value; } ).set_env("LLAMA_ARG_UBATCH")); - add_opt(llama_arg( + add_opt(common_arg( {"--keep"}, "N", - format("number of tokens to keep from the initial prompt (default: %d, -1 = all)", params.n_keep), - [](gpt_params & params, int value) { + string_format("number of tokens to keep from the initial prompt (default: %d, -1 = all)", params.n_keep), + [](common_params & params, int value) { params.n_keep = value; } )); - add_opt(llama_arg( + add_opt(common_arg( {"--no-context-shift"}, - format("disables context shift on inifinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"), - [](gpt_params & params) { + string_format("disables context shift on inifinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"), + [](common_params & params) { params.ctx_shift = false; } ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT")); - add_opt(llama_arg( + add_opt(common_arg( {"--chunks"}, "N", - format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks), - [](gpt_params & params, int value) { + string_format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks), + [](common_params & params, int value) { params.n_chunks = value; } ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_RETRIEVAL})); - add_opt(llama_arg( + add_opt(common_arg( {"-fa", "--flash-attn"}, - format("enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled"), - [](gpt_params & params) { + string_format("enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled"), + [](common_params & params) { params.flash_attn = true; } ).set_env("LLAMA_ARG_FLASH_ATTN")); - add_opt(llama_arg( + add_opt(common_arg( {"-p", "--prompt"}, "PROMPT", ex == LLAMA_EXAMPLE_MAIN ? "prompt to start generation with\nif -cnv is set, this will be used as system prompt" : "prompt to start generation with", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.prompt = value; } )); - add_opt(llama_arg( + add_opt(common_arg( {"--no-perf"}, - format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"), - [](gpt_params & params) { + string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"), + [](common_params & params) { params.no_perf = true; params.sparams.no_perf = true; } ).set_env("LLAMA_ARG_NO_PERF")); - add_opt(llama_arg( + add_opt(common_arg( {"-f", "--file"}, "FNAME", "a file containing the prompt (default: none)", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { std::ifstream file(value); if (!file) { - throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); + throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str())); } // store the external file name in params params.prompt_file = value; @@ -743,24 +717,24 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, } } )); - add_opt(llama_arg( + add_opt(common_arg( {"--in-file"}, "FNAME", "an input file (repeat to specify multiple files)", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { std::ifstream file(value); if (!file) { - throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); + throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str())); } params.in_files.push_back(value); } ).set_examples({LLAMA_EXAMPLE_IMATRIX})); - add_opt(llama_arg( + add_opt(common_arg( {"-bf", "--binary-file"}, "FNAME", "binary file containing the prompt (default: none)", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { std::ifstream file(value, std::ios::binary); if (!file) { - throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); + throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str())); } // store the external file name in params params.prompt_file = value; @@ -770,287 +744,301 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, fprintf(stderr, "Read %zu bytes from binary file %s\n", params.prompt.size(), value.c_str()); } )); - add_opt(llama_arg( + add_opt(common_arg( {"-e", "--escape"}, - format("process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false"), - [](gpt_params & params) { + string_format("process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false"), + [](common_params & params) { params.escape = true; } )); - add_opt(llama_arg( + add_opt(common_arg( {"--no-escape"}, "do not process escape sequences", - [](gpt_params & params) { + [](common_params & params) { params.escape = false; } )); - add_opt(llama_arg( + add_opt(common_arg( {"-ptc", "--print-token-count"}, "N", - format("print token count every N tokens (default: %d)", params.n_print), - [](gpt_params & params, int value) { + string_format("print token count every N tokens (default: %d)", params.n_print), + [](common_params & params, int value) { params.n_print = value; } ).set_examples({LLAMA_EXAMPLE_MAIN})); - add_opt(llama_arg( + add_opt(common_arg( {"--prompt-cache"}, "FNAME", "file to cache prompt state for faster startup (default: none)", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.path_prompt_cache = value; } ).set_examples({LLAMA_EXAMPLE_MAIN})); - add_opt(llama_arg( + add_opt(common_arg( {"--prompt-cache-all"}, "if specified, saves user input and generations to cache as well\n", - [](gpt_params & params) { + [](common_params & params) { params.prompt_cache_all = true; } ).set_examples({LLAMA_EXAMPLE_MAIN})); - add_opt(llama_arg( + add_opt(common_arg( {"--prompt-cache-ro"}, "if specified, uses the prompt cache but does not update it", - [](gpt_params & params) { + [](common_params & params) { params.prompt_cache_ro = true; } ).set_examples({LLAMA_EXAMPLE_MAIN})); - add_opt(llama_arg( + add_opt(common_arg( {"-r", "--reverse-prompt"}, "PROMPT", "halt generation at PROMPT, return control in interactive mode\n", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.antiprompt.emplace_back(value); } ).set_examples({LLAMA_EXAMPLE_MAIN})); - add_opt(llama_arg( + add_opt(common_arg( {"-sp", "--special"}, - format("special tokens output enabled (default: %s)", params.special ? "true" : "false"), - [](gpt_params & params) { + string_format("special tokens output enabled (default: %s)", params.special ? "true" : "false"), + [](common_params & params) { params.special = true; } ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER})); - add_opt(llama_arg( + add_opt(common_arg( {"-cnv", "--conversation"}, - format( + string_format( "run in conversation mode:\n" "- does not print special tokens and suffix/prefix\n" "- interactive mode is also enabled\n" "(default: %s)", params.conversation ? "true" : "false" ), - [](gpt_params & params) { + [](common_params & params) { params.conversation = true; } ).set_examples({LLAMA_EXAMPLE_MAIN})); - add_opt(llama_arg( + add_opt(common_arg( {"-i", "--interactive"}, - format("run in interactive mode (default: %s)", params.interactive ? "true" : "false"), - [](gpt_params & params) { + string_format("run in interactive mode (default: %s)", params.interactive ? "true" : "false"), + [](common_params & params) { params.interactive = true; } ).set_examples({LLAMA_EXAMPLE_MAIN})); - add_opt(llama_arg( + add_opt(common_arg( {"-if", "--interactive-first"}, - format("run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? "true" : "false"), - [](gpt_params & params) { + string_format("run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? "true" : "false"), + [](common_params & params) { params.interactive_first = true; } ).set_examples({LLAMA_EXAMPLE_MAIN})); - add_opt(llama_arg( + add_opt(common_arg( {"-mli", "--multiline-input"}, "allows you to write or paste multiple lines without ending each in '\\'", - [](gpt_params & params) { + [](common_params & params) { params.multiline_input = true; } ).set_examples({LLAMA_EXAMPLE_MAIN})); - add_opt(llama_arg( + add_opt(common_arg( {"--in-prefix-bos"}, "prefix BOS to user inputs, preceding the `--in-prefix` string", - [](gpt_params & params) { + [](common_params & params) { params.input_prefix_bos = true; params.enable_chat_template = false; } ).set_examples({LLAMA_EXAMPLE_MAIN})); - add_opt(llama_arg( + add_opt(common_arg( {"--in-prefix"}, "STRING", "string to prefix user inputs with (default: empty)", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.input_prefix = value; params.enable_chat_template = false; } ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL})); - add_opt(llama_arg( + add_opt(common_arg( {"--in-suffix"}, "STRING", "string to suffix after user inputs with (default: empty)", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.input_suffix = value; params.enable_chat_template = false; } ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL})); - add_opt(llama_arg( + add_opt(common_arg( {"--no-warmup"}, "skip warming up the model with an empty run", - [](gpt_params & params) { + [](common_params & params) { params.warmup = false; } ).set_examples({LLAMA_EXAMPLE_MAIN})); - add_opt(llama_arg( + add_opt(common_arg( {"--spm-infill"}, - format( + string_format( "use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)", params.spm_infill ? "enabled" : "disabled" ), - [](gpt_params & params) { + [](common_params & params) { params.spm_infill = true; } ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_INFILL})); - add_opt(llama_arg( + add_opt(common_arg( {"--samplers"}, "SAMPLERS", - format("samplers that will be used for generation in the order, separated by \';\'\n(default: %s)", sampler_type_names.c_str()), - [](gpt_params & params, const std::string & value) { + string_format("samplers that will be used for generation in the order, separated by \';\'\n(default: %s)", sampler_type_names.c_str()), + [](common_params & params, const std::string & value) { const auto sampler_names = string_split(value, ';'); - params.sparams.samplers = gpt_sampler_types_from_names(sampler_names, true); + params.sparams.samplers = common_sampler_types_from_names(sampler_names, true); } ).set_sparam()); - add_opt(llama_arg( + add_opt(common_arg( {"-s", "--seed"}, "SEED", - format("RNG seed (default: %d, use random seed for %d)", params.sparams.seed, LLAMA_DEFAULT_SEED), - [](gpt_params & params, const std::string & value) { + string_format("RNG seed (default: %d, use random seed for %d)", params.sparams.seed, LLAMA_DEFAULT_SEED), + [](common_params & params, const std::string & value) { params.sparams.seed = std::stoul(value); } ).set_sparam()); - add_opt(llama_arg( + add_opt(common_arg( {"--sampling-seq"}, "SEQUENCE", - format("simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str()), - [](gpt_params & params, const std::string & value) { - params.sparams.samplers = gpt_sampler_types_from_chars(value); + string_format("simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str()), + [](common_params & params, const std::string & value) { + params.sparams.samplers = common_sampler_types_from_chars(value); } ).set_sparam()); - add_opt(llama_arg( + add_opt(common_arg( {"--ignore-eos"}, "ignore end of stream token and continue generating (implies --logit-bias EOS-inf)", - [](gpt_params & params) { + [](common_params & params) { params.sparams.ignore_eos = true; } ).set_sparam()); - add_opt(llama_arg( + add_opt(common_arg( {"--penalize-nl"}, - format("penalize newline tokens (default: %s)", params.sparams.penalize_nl ? "true" : "false"), - [](gpt_params & params) { + string_format("penalize newline tokens (default: %s)", params.sparams.penalize_nl ? "true" : "false"), + [](common_params & params) { params.sparams.penalize_nl = true; } ).set_sparam()); - add_opt(llama_arg( + add_opt(common_arg( {"--temp"}, "N", - format("temperature (default: %.1f)", (double)params.sparams.temp), - [](gpt_params & params, const std::string & value) { + string_format("temperature (default: %.1f)", (double)params.sparams.temp), + [](common_params & params, const std::string & value) { params.sparams.temp = std::stof(value); params.sparams.temp = std::max(params.sparams.temp, 0.0f); } ).set_sparam()); - add_opt(llama_arg( + add_opt(common_arg( {"--top-k"}, "N", - format("top-k sampling (default: %d, 0 = disabled)", params.sparams.top_k), - [](gpt_params & params, int value) { + string_format("top-k sampling (default: %d, 0 = disabled)", params.sparams.top_k), + [](common_params & params, int value) { params.sparams.top_k = value; } ).set_sparam()); - add_opt(llama_arg( + add_opt(common_arg( {"--top-p"}, "N", - format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)params.sparams.top_p), - [](gpt_params & params, const std::string & value) { + string_format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)params.sparams.top_p), + [](common_params & params, const std::string & value) { params.sparams.top_p = std::stof(value); } ).set_sparam()); - add_opt(llama_arg( + add_opt(common_arg( {"--min-p"}, "N", - format("min-p sampling (default: %.1f, 0.0 = disabled)", (double)params.sparams.min_p), - [](gpt_params & params, const std::string & value) { + string_format("min-p sampling (default: %.1f, 0.0 = disabled)", (double)params.sparams.min_p), + [](common_params & params, const std::string & value) { params.sparams.min_p = std::stof(value); } ).set_sparam()); - add_opt(llama_arg( + add_opt(common_arg( {"--tfs"}, "N", - format("tail free sampling, parameter z (default: %.1f, 1.0 = disabled)", (double)params.sparams.tfs_z), - [](gpt_params & params, const std::string & value) { + string_format("tail free sampling, parameter z (default: %.1f, 1.0 = disabled)", (double)params.sparams.tfs_z), + [](common_params & params, const std::string & value) { params.sparams.tfs_z = std::stof(value); } ).set_sparam()); - add_opt(llama_arg( + add_opt(common_arg( + {"--xtc-probability"}, "N", + string_format("xtc probability (default: %.1f, 0.0 = disabled)", (double)params.sparams.xtc_probability), + [](common_params & params, const std::string & value) { + params.sparams.xtc_probability = std::stof(value); + } + ).set_sparam()); + add_opt(common_arg( + {"--xtc-threshold"}, "N", + string_format("xtc threshold (default: %.1f, 1.0 = disabled)", (double)params.sparams.xtc_threshold), + [](common_params & params, const std::string & value) { + params.sparams.xtc_threshold = std::stof(value); + } + ).set_sparam()); + add_opt(common_arg( {"--typical"}, "N", - format("locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)params.sparams.typ_p), - [](gpt_params & params, const std::string & value) { + string_format("locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)params.sparams.typ_p), + [](common_params & params, const std::string & value) { params.sparams.typ_p = std::stof(value); } ).set_sparam()); - add_opt(llama_arg( + add_opt(common_arg( {"--repeat-last-n"}, "N", - format("last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", params.sparams.penalty_last_n), - [](gpt_params & params, int value) { + string_format("last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", params.sparams.penalty_last_n), + [](common_params & params, int value) { params.sparams.penalty_last_n = value; params.sparams.n_prev = std::max(params.sparams.n_prev, params.sparams.penalty_last_n); } ).set_sparam()); - add_opt(llama_arg( + add_opt(common_arg( {"--repeat-penalty"}, "N", - format("penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)params.sparams.penalty_repeat), - [](gpt_params & params, const std::string & value) { + string_format("penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)params.sparams.penalty_repeat), + [](common_params & params, const std::string & value) { params.sparams.penalty_repeat = std::stof(value); } ).set_sparam()); - add_opt(llama_arg( + add_opt(common_arg( {"--presence-penalty"}, "N", - format("repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)params.sparams.penalty_present), - [](gpt_params & params, const std::string & value) { + string_format("repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)params.sparams.penalty_present), + [](common_params & params, const std::string & value) { params.sparams.penalty_present = std::stof(value); } ).set_sparam()); - add_opt(llama_arg( + add_opt(common_arg( {"--frequency-penalty"}, "N", - format("repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)params.sparams.penalty_freq), - [](gpt_params & params, const std::string & value) { + string_format("repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)params.sparams.penalty_freq), + [](common_params & params, const std::string & value) { params.sparams.penalty_freq = std::stof(value); } ).set_sparam()); - add_opt(llama_arg( + add_opt(common_arg( {"--dynatemp-range"}, "N", - format("dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)params.sparams.dynatemp_range), - [](gpt_params & params, const std::string & value) { + string_format("dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)params.sparams.dynatemp_range), + [](common_params & params, const std::string & value) { params.sparams.dynatemp_range = std::stof(value); } ).set_sparam()); - add_opt(llama_arg( + add_opt(common_arg( {"--dynatemp-exp"}, "N", - format("dynamic temperature exponent (default: %.1f)", (double)params.sparams.dynatemp_exponent), - [](gpt_params & params, const std::string & value) { + string_format("dynamic temperature exponent (default: %.1f)", (double)params.sparams.dynatemp_exponent), + [](common_params & params, const std::string & value) { params.sparams.dynatemp_exponent = std::stof(value); } ).set_sparam()); - add_opt(llama_arg( + add_opt(common_arg( {"--mirostat"}, "N", - format("use Mirostat sampling.\nTop K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n" + string_format("use Mirostat sampling.\nTop K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n" "(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", params.sparams.mirostat), - [](gpt_params & params, int value) { + [](common_params & params, int value) { params.sparams.mirostat = value; } ).set_sparam()); - add_opt(llama_arg( + add_opt(common_arg( {"--mirostat-lr"}, "N", - format("Mirostat learning rate, parameter eta (default: %.1f)", (double)params.sparams.mirostat_eta), - [](gpt_params & params, const std::string & value) { + string_format("Mirostat learning rate, parameter eta (default: %.1f)", (double)params.sparams.mirostat_eta), + [](common_params & params, const std::string & value) { params.sparams.mirostat_eta = std::stof(value); } ).set_sparam()); - add_opt(llama_arg( + add_opt(common_arg( {"--mirostat-ent"}, "N", - format("Mirostat target entropy, parameter tau (default: %.1f)", (double)params.sparams.mirostat_tau), - [](gpt_params & params, const std::string & value) { + string_format("Mirostat target entropy, parameter tau (default: %.1f)", (double)params.sparams.mirostat_tau), + [](common_params & params, const std::string & value) { params.sparams.mirostat_tau = std::stof(value); } ).set_sparam()); - add_opt(llama_arg( + add_opt(common_arg( {"-l", "--logit-bias"}, "TOKEN_ID(+/-)BIAS", "modifies the likelihood of token appearing in the completion,\n" "i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n" "or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { std::stringstream ss(value); llama_token key; char sign; @@ -1067,20 +1055,20 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, } } ).set_sparam()); - add_opt(llama_arg( + add_opt(common_arg( {"--grammar"}, "GRAMMAR", - format("BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", params.sparams.grammar.c_str()), - [](gpt_params & params, const std::string & value) { + string_format("BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", params.sparams.grammar.c_str()), + [](common_params & params, const std::string & value) { params.sparams.grammar = value; } ).set_sparam()); - add_opt(llama_arg( + add_opt(common_arg( {"--grammar-file"}, "FNAME", "file to read grammar from", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { std::ifstream file(value); if (!file) { - throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); + throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str())); } std::copy( std::istreambuf_iterator(file), @@ -1089,17 +1077,17 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, ); } ).set_sparam()); - add_opt(llama_arg( + add_opt(common_arg( {"-j", "--json-schema"}, "SCHEMA", "JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object\nFor schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.sparams.grammar = json_schema_to_grammar(json::parse(value)); } ).set_sparam()); - add_opt(llama_arg( + add_opt(common_arg( {"--pooling"}, "{none,mean,cls,last,rank}", "pooling type for embeddings, use model default if unspecified", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { /**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; } else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; } else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; } @@ -1108,275 +1096,275 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, else { throw std::invalid_argument("invalid value"); } } ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_POOLING")); - add_opt(llama_arg( + add_opt(common_arg( {"--attention"}, "{causal,non,causal}", "attention type for embeddings, use model default if unspecified", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { /**/ if (value == "causal") { params.attention_type = LLAMA_ATTENTION_TYPE_CAUSAL; } else if (value == "non-causal") { params.attention_type = LLAMA_ATTENTION_TYPE_NON_CAUSAL; } else { throw std::invalid_argument("invalid value"); } } ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); - add_opt(llama_arg( + add_opt(common_arg( {"--rope-scaling"}, "{none,linear,yarn}", "RoPE frequency scaling method, defaults to linear unless specified by the model", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; } else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; } else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; } else { throw std::invalid_argument("invalid value"); } } ).set_env("LLAMA_ARG_ROPE_SCALING_TYPE")); - add_opt(llama_arg( + add_opt(common_arg( {"--rope-scale"}, "N", "RoPE context scaling factor, expands context by a factor of N", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.rope_freq_scale = 1.0f / std::stof(value); } ).set_env("LLAMA_ARG_ROPE_SCALE")); - add_opt(llama_arg( + add_opt(common_arg( {"--rope-freq-base"}, "N", "RoPE base frequency, used by NTK-aware scaling (default: loaded from model)", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.rope_freq_base = std::stof(value); } ).set_env("LLAMA_ARG_ROPE_FREQ_BASE")); - add_opt(llama_arg( + add_opt(common_arg( {"--rope-freq-scale"}, "N", "RoPE frequency scaling factor, expands context by a factor of 1/N", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.rope_freq_scale = std::stof(value); } ).set_env("LLAMA_ARG_ROPE_FREQ_SCALE")); - add_opt(llama_arg( + add_opt(common_arg( {"--yarn-orig-ctx"}, "N", - format("YaRN: original context size of model (default: %d = model training context size)", params.yarn_orig_ctx), - [](gpt_params & params, int value) { + string_format("YaRN: original context size of model (default: %d = model training context size)", params.yarn_orig_ctx), + [](common_params & params, int value) { params.yarn_orig_ctx = value; } ).set_env("LLAMA_ARG_YARN_ORIG_CTX")); - add_opt(llama_arg( + add_opt(common_arg( {"--yarn-ext-factor"}, "N", - format("YaRN: extrapolation mix factor (default: %.1f, 0.0 = full interpolation)", (double)params.yarn_ext_factor), - [](gpt_params & params, const std::string & value) { + string_format("YaRN: extrapolation mix factor (default: %.1f, 0.0 = full interpolation)", (double)params.yarn_ext_factor), + [](common_params & params, const std::string & value) { params.yarn_ext_factor = std::stof(value); } ).set_env("LLAMA_ARG_YARN_EXT_FACTOR")); - add_opt(llama_arg( + add_opt(common_arg( {"--yarn-attn-factor"}, "N", - format("YaRN: scale sqrt(t) or attention magnitude (default: %.1f)", (double)params.yarn_attn_factor), - [](gpt_params & params, const std::string & value) { + string_format("YaRN: scale sqrt(t) or attention magnitude (default: %.1f)", (double)params.yarn_attn_factor), + [](common_params & params, const std::string & value) { params.yarn_attn_factor = std::stof(value); } ).set_env("LLAMA_ARG_YARN_ATTN_FACTOR")); - add_opt(llama_arg( + add_opt(common_arg( {"--yarn-beta-slow"}, "N", - format("YaRN: high correction dim or alpha (default: %.1f)", (double)params.yarn_beta_slow), - [](gpt_params & params, const std::string & value) { + string_format("YaRN: high correction dim or alpha (default: %.1f)", (double)params.yarn_beta_slow), + [](common_params & params, const std::string & value) { params.yarn_beta_slow = std::stof(value); } ).set_env("LLAMA_ARG_YARN_BETA_SLOW")); - add_opt(llama_arg( + add_opt(common_arg( {"--yarn-beta-fast"}, "N", - format("YaRN: low correction dim or beta (default: %.1f)", (double)params.yarn_beta_fast), - [](gpt_params & params, const std::string & value) { + string_format("YaRN: low correction dim or beta (default: %.1f)", (double)params.yarn_beta_fast), + [](common_params & params, const std::string & value) { params.yarn_beta_fast = std::stof(value); } ).set_env("LLAMA_ARG_YARN_BETA_FAST")); - add_opt(llama_arg( + add_opt(common_arg( {"-gan", "--grp-attn-n"}, "N", - format("group-attention factor (default: %d)", params.grp_attn_n), - [](gpt_params & params, int value) { + string_format("group-attention factor (default: %d)", params.grp_attn_n), + [](common_params & params, int value) { params.grp_attn_n = value; } - ).set_env("LLAMA_ARG_GRP_ATTN_N")); - add_opt(llama_arg( + ).set_env("LLAMA_ARG_GRP_ATTN_N").set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_PASSKEY})); + add_opt(common_arg( {"-gaw", "--grp-attn-w"}, "N", - format("group-attention width (default: %.1f)", (double)params.grp_attn_w), - [](gpt_params & params, int value) { + string_format("group-attention width (default: %d)", params.grp_attn_w), + [](common_params & params, int value) { params.grp_attn_w = value; } - ).set_env("LLAMA_ARG_GRP_ATTN_W")); - add_opt(llama_arg( + ).set_env("LLAMA_ARG_GRP_ATTN_W").set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(common_arg( {"-dkvc", "--dump-kv-cache"}, "verbose print of the KV cache", - [](gpt_params & params) { + [](common_params & params) { params.dump_kv_cache = true; } )); - add_opt(llama_arg( + add_opt(common_arg( {"-nkvo", "--no-kv-offload"}, "disable KV offload", - [](gpt_params & params) { + [](common_params & params) { params.no_kv_offload = true; } ).set_env("LLAMA_ARG_NO_KV_OFFLOAD")); - add_opt(llama_arg( + add_opt(common_arg( {"-ctk", "--cache-type-k"}, "TYPE", - format("KV cache data type for K (default: %s)", params.cache_type_k.c_str()), - [](gpt_params & params, const std::string & value) { + string_format("KV cache data type for K (default: %s)", params.cache_type_k.c_str()), + [](common_params & params, const std::string & value) { // TODO: get the type right here params.cache_type_k = value; } ).set_env("LLAMA_ARG_CACHE_TYPE_K")); - add_opt(llama_arg( + add_opt(common_arg( {"-ctv", "--cache-type-v"}, "TYPE", - format("KV cache data type for V (default: %s)", params.cache_type_v.c_str()), - [](gpt_params & params, const std::string & value) { + string_format("KV cache data type for V (default: %s)", params.cache_type_v.c_str()), + [](common_params & params, const std::string & value) { // TODO: get the type right here params.cache_type_v = value; } ).set_env("LLAMA_ARG_CACHE_TYPE_V")); - add_opt(llama_arg( + add_opt(common_arg( {"--perplexity", "--all-logits"}, - format("return logits for all tokens in the batch (default: %s)", params.logits_all ? "true" : "false"), - [](gpt_params & params) { + string_format("return logits for all tokens in the batch (default: %s)", params.logits_all ? "true" : "false"), + [](common_params & params) { params.logits_all = true; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); - add_opt(llama_arg( + add_opt(common_arg( {"--hellaswag"}, "compute HellaSwag score over random tasks from datafile supplied with -f", - [](gpt_params & params) { + [](common_params & params) { params.hellaswag = true; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); - add_opt(llama_arg( + add_opt(common_arg( {"--hellaswag-tasks"}, "N", - format("number of tasks to use when computing the HellaSwag score (default: %zu)", params.hellaswag_tasks), - [](gpt_params & params, int value) { + string_format("number of tasks to use when computing the HellaSwag score (default: %zu)", params.hellaswag_tasks), + [](common_params & params, int value) { params.hellaswag_tasks = value; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); - add_opt(llama_arg( + add_opt(common_arg( {"--winogrande"}, "compute Winogrande score over random tasks from datafile supplied with -f", - [](gpt_params & params) { + [](common_params & params) { params.winogrande = true; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); - add_opt(llama_arg( + add_opt(common_arg( {"--winogrande-tasks"}, "N", - format("number of tasks to use when computing the Winogrande score (default: %zu)", params.winogrande_tasks), - [](gpt_params & params, int value) { + string_format("number of tasks to use when computing the Winogrande score (default: %zu)", params.winogrande_tasks), + [](common_params & params, int value) { params.winogrande_tasks = value; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); - add_opt(llama_arg( + add_opt(common_arg( {"--multiple-choice"}, "compute multiple choice score over random tasks from datafile supplied with -f", - [](gpt_params & params) { + [](common_params & params) { params.multiple_choice = true; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); - add_opt(llama_arg( + add_opt(common_arg( {"--multiple-choice-tasks"}, "N", - format("number of tasks to use when computing the multiple choice score (default: %zu)", params.multiple_choice_tasks), - [](gpt_params & params, int value) { + string_format("number of tasks to use when computing the multiple choice score (default: %zu)", params.multiple_choice_tasks), + [](common_params & params, int value) { params.multiple_choice_tasks = value; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); - add_opt(llama_arg( + add_opt(common_arg( {"--kl-divergence"}, "computes KL-divergence to logits provided via --kl-divergence-base", - [](gpt_params & params) { + [](common_params & params) { params.kl_divergence = true; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); - add_opt(llama_arg( + add_opt(common_arg( {"--save-all-logits", "--kl-divergence-base"}, "FNAME", "set logits file", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.logits_file = value; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); - add_opt(llama_arg( + add_opt(common_arg( {"--ppl-stride"}, "N", - format("stride for perplexity calculation (default: %d)", params.ppl_stride), - [](gpt_params & params, int value) { + string_format("stride for perplexity calculation (default: %d)", params.ppl_stride), + [](common_params & params, int value) { params.ppl_stride = value; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); - add_opt(llama_arg( + add_opt(common_arg( {"--ppl-output-type"}, "<0|1>", - format("output type for perplexity calculation (default: %d)", params.ppl_output_type), - [](gpt_params & params, int value) { + string_format("output type for perplexity calculation (default: %d)", params.ppl_output_type), + [](common_params & params, int value) { params.ppl_output_type = value; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); - add_opt(llama_arg( + add_opt(common_arg( {"-dt", "--defrag-thold"}, "N", - format("KV cache defragmentation threshold (default: %.1f, < 0 - disabled)", (double)params.defrag_thold), - [](gpt_params & params, const std::string & value) { + string_format("KV cache defragmentation threshold (default: %.1f, < 0 - disabled)", (double)params.defrag_thold), + [](common_params & params, const std::string & value) { params.defrag_thold = std::stof(value); } ).set_env("LLAMA_ARG_DEFRAG_THOLD")); - add_opt(llama_arg( + add_opt(common_arg( {"-np", "--parallel"}, "N", - format("number of parallel sequences to decode (default: %d)", params.n_parallel), - [](gpt_params & params, int value) { + string_format("number of parallel sequences to decode (default: %d)", params.n_parallel), + [](common_params & params, int value) { params.n_parallel = value; } ).set_env("LLAMA_ARG_N_PARALLEL")); - add_opt(llama_arg( + add_opt(common_arg( {"-ns", "--sequences"}, "N", - format("number of sequences to decode (default: %d)", params.n_sequences), - [](gpt_params & params, int value) { + string_format("number of sequences to decode (default: %d)", params.n_sequences), + [](common_params & params, int value) { params.n_sequences = value; } ).set_examples({LLAMA_EXAMPLE_PARALLEL})); - add_opt(llama_arg( + add_opt(common_arg( {"-cb", "--cont-batching"}, - format("enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled"), - [](gpt_params & params) { + string_format("enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled"), + [](common_params & params) { params.cont_batching = true; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CONT_BATCHING")); - add_opt(llama_arg( + add_opt(common_arg( {"-nocb", "--no-cont-batching"}, "disable continuous batching", - [](gpt_params & params) { + [](common_params & params) { params.cont_batching = false; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONT_BATCHING")); - add_opt(llama_arg( + add_opt(common_arg( {"--mmproj"}, "FILE", "path to a multimodal projector file for LLaVA. see examples/llava/README.md", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.mmproj = value; } ).set_examples({LLAMA_EXAMPLE_LLAVA})); - add_opt(llama_arg( + add_opt(common_arg( {"--image"}, "FILE", "path to an image file. use with multimodal models. Specify multiple times for batching", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.image.emplace_back(value); } ).set_examples({LLAMA_EXAMPLE_LLAVA})); -#ifdef GGML_USE_RPC - add_opt(llama_arg( - {"--rpc"}, "SERVERS", - "comma separated list of RPC servers", - [](gpt_params & params, const std::string & value) { - params.rpc_servers = value; - } - ).set_env("LLAMA_ARG_RPC")); -#endif - add_opt(llama_arg( + if (llama_supports_rpc()) { + add_opt(common_arg( + {"--rpc"}, "SERVERS", + "comma separated list of RPC servers", + [](common_params & params, const std::string & value) { + params.rpc_servers = value; + } + ).set_env("LLAMA_ARG_RPC")); + } + add_opt(common_arg( {"--mlock"}, "force system to keep model in RAM rather than swapping or compressing", - [](gpt_params & params) { + [](common_params & params) { params.use_mlock = true; } ).set_env("LLAMA_ARG_MLOCK")); - add_opt(llama_arg( + add_opt(common_arg( {"--no-mmap"}, "do not memory-map model (slower load but may reduce pageouts if not using mlock)", - [](gpt_params & params) { + [](common_params & params) { params.use_mmap = false; } ).set_env("LLAMA_ARG_NO_MMAP")); - add_opt(llama_arg( + add_opt(common_arg( {"--numa"}, "TYPE", "attempt optimizations that help on some NUMA systems\n" "- distribute: spread execution evenly over all nodes\n" @@ -1384,17 +1372,17 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, "- numactl: use the CPU map provided by numactl\n" "if run without this previously, it is recommended to drop the system page cache before using this\n" "see https://github.com/ggerganov/llama.cpp/issues/1437", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { /**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; } else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; } else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; } else { throw std::invalid_argument("invalid value"); } } ).set_env("LLAMA_ARG_NUMA")); - add_opt(llama_arg( + add_opt(common_arg( {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N", "number of layers to store in VRAM", - [](gpt_params & params, int value) { + [](common_params & params, int value) { params.n_gpu_layers = value; if (!llama_supports_gpu_offload()) { fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers option will be ignored\n"); @@ -1402,10 +1390,10 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, } } ).set_env("LLAMA_ARG_N_GPU_LAYERS")); - add_opt(llama_arg( + add_opt(common_arg( {"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N", "number of layers to store in VRAM for the draft model", - [](gpt_params & params, int value) { + [](common_params & params, int value) { params.n_gpu_layers_draft = value; if (!llama_supports_gpu_offload()) { fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n"); @@ -1413,13 +1401,13 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, } } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(llama_arg( + add_opt(common_arg( {"-sm", "--split-mode"}, "{none,layer,row}", "how to split the model across multiple GPUs, one of:\n" "- none: use one GPU only\n" "- layer (default): split layers and KV across GPUs\n" "- row: split rows across GPUs", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { std::string arg_next = value; if (arg_next == "none") { params.split_mode = LLAMA_SPLIT_MODE_NONE; @@ -1439,10 +1427,10 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, } } ).set_env("LLAMA_ARG_SPLIT_MODE")); - add_opt(llama_arg( + add_opt(common_arg( {"-ts", "--tensor-split"}, "N0,N1,N2,...", "fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { std::string arg_next = value; // split string by , and / @@ -1451,7 +1439,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, std::vector split_arg{ it, {} }; if (split_arg.size() >= llama_max_devices()) { throw std::invalid_argument( - format("got %d input configs, but system only has %d devices", (int)split_arg.size(), (int)llama_max_devices()) + string_format("got %d input configs, but system only has %d devices", (int)split_arg.size(), (int)llama_max_devices()) ); } for (size_t i = 0; i < llama_max_devices(); ++i) { @@ -1466,315 +1454,315 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, } } ).set_env("LLAMA_ARG_TENSOR_SPLIT")); - add_opt(llama_arg( + add_opt(common_arg( {"-mg", "--main-gpu"}, "INDEX", - format("the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu), - [](gpt_params & params, int value) { + string_format("the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu), + [](common_params & params, int value) { params.main_gpu = value; if (!llama_supports_gpu_offload()) { fprintf(stderr, "warning: llama.cpp was compiled without support for GPU offload. Setting the main GPU has no effect.\n"); } } ).set_env("LLAMA_ARG_MAIN_GPU")); - add_opt(llama_arg( + add_opt(common_arg( {"--check-tensors"}, - format("check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false"), - [](gpt_params & params) { + string_format("check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false"), + [](common_params & params) { params.check_tensors = true; } )); - add_opt(llama_arg( + add_opt(common_arg( {"--override-kv"}, "KEY=TYPE:VALUE", "advanced option to override model metadata by key. may be specified multiple times.\n" "types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { if (!string_parse_kv_override(value.c_str(), params.kv_overrides)) { - throw std::runtime_error(format("error: Invalid type for KV override: %s\n", value.c_str())); + throw std::runtime_error(string_format("error: Invalid type for KV override: %s\n", value.c_str())); } } )); - add_opt(llama_arg( + add_opt(common_arg( {"--lora"}, "FNAME", "path to LoRA adapter (can be repeated to use multiple adapters)", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.lora_adapters.push_back({ std::string(value), 1.0 }); } // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA})); - add_opt(llama_arg( + add_opt(common_arg( {"--lora-scaled"}, "FNAME", "SCALE", "path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters)", - [](gpt_params & params, const std::string & fname, const std::string & scale) { + [](common_params & params, const std::string & fname, const std::string & scale) { params.lora_adapters.push_back({ fname, std::stof(scale) }); } // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA})); - add_opt(llama_arg( + add_opt(common_arg( {"--control-vector"}, "FNAME", "add a control vector\nnote: this argument can be repeated to add multiple control vectors", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.control_vectors.push_back({ 1.0f, value, }); } )); - add_opt(llama_arg( + add_opt(common_arg( {"--control-vector-scaled"}, "FNAME", "SCALE", "add a control vector with user defined scaling SCALE\n" "note: this argument can be repeated to add multiple scaled control vectors", - [](gpt_params & params, const std::string & fname, const std::string & scale) { + [](common_params & params, const std::string & fname, const std::string & scale) { params.control_vectors.push_back({ std::stof(scale), fname }); } )); - add_opt(llama_arg( + add_opt(common_arg( {"--control-vector-layer-range"}, "START", "END", "layer range to apply the control vector(s) to, start and end inclusive", - [](gpt_params & params, const std::string & start, const std::string & end) { + [](common_params & params, const std::string & start, const std::string & end) { params.control_vector_layer_start = std::stoi(start); params.control_vector_layer_end = std::stoi(end); } )); - add_opt(llama_arg( + add_opt(common_arg( {"-a", "--alias"}, "STRING", "set alias for model name (to be used by REST API)", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.model_alias = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ALIAS")); - add_opt(llama_arg( + add_opt(common_arg( {"-m", "--model"}, "FNAME", ex == LLAMA_EXAMPLE_EXPORT_LORA ? std::string("model path from which to load base model") - : format( + : string_format( "model path (default: `models/$filename` with filename from `--hf-file` " "or `--model-url` if set, otherwise %s)", DEFAULT_MODEL_PATH ), - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.model = value; } ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}).set_env("LLAMA_ARG_MODEL")); - add_opt(llama_arg( + add_opt(common_arg( {"-md", "--model-draft"}, "FNAME", "draft model for speculative decoding (default: unused)", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.model_draft = value; } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(llama_arg( + add_opt(common_arg( {"-mu", "--model-url"}, "MODEL_URL", "model download url (default: unused)", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.model_url = value; } ).set_env("LLAMA_ARG_MODEL_URL")); - add_opt(llama_arg( + add_opt(common_arg( {"-hfr", "--hf-repo"}, "REPO", "Hugging Face model repository (default: unused)", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.hf_repo = value; } ).set_env("LLAMA_ARG_HF_REPO")); - add_opt(llama_arg( + add_opt(common_arg( {"-hff", "--hf-file"}, "FILE", "Hugging Face model file (default: unused)", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.hf_file = value; } ).set_env("LLAMA_ARG_HF_FILE")); - add_opt(llama_arg( + add_opt(common_arg( {"-hft", "--hf-token"}, "TOKEN", "Hugging Face access token (default: value from HF_TOKEN environment variable)", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.hf_token = value; } ).set_env("HF_TOKEN")); - add_opt(llama_arg( + add_opt(common_arg( {"--context-file"}, "FNAME", "file to load context from (repeat to specify multiple files)", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { std::ifstream file(value, std::ios::binary); if (!file) { - throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); + throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str())); } params.context_files.push_back(value); } ).set_examples({LLAMA_EXAMPLE_RETRIEVAL})); - add_opt(llama_arg( + add_opt(common_arg( {"--chunk-size"}, "N", - format("minimum length of embedded text chunks (default: %d)", params.chunk_size), - [](gpt_params & params, int value) { + string_format("minimum length of embedded text chunks (default: %d)", params.chunk_size), + [](common_params & params, int value) { params.chunk_size = value; } ).set_examples({LLAMA_EXAMPLE_RETRIEVAL})); - add_opt(llama_arg( + add_opt(common_arg( {"--chunk-separator"}, "STRING", - format("separator between chunks (default: '%s')", params.chunk_separator.c_str()), - [](gpt_params & params, const std::string & value) { + string_format("separator between chunks (default: '%s')", params.chunk_separator.c_str()), + [](common_params & params, const std::string & value) { params.chunk_separator = value; } ).set_examples({LLAMA_EXAMPLE_RETRIEVAL})); - add_opt(llama_arg( + add_opt(common_arg( {"--junk"}, "N", - format("number of times to repeat the junk text (default: %d)", params.n_junk), - [](gpt_params & params, int value) { + string_format("number of times to repeat the junk text (default: %d)", params.n_junk), + [](common_params & params, int value) { params.n_junk = value; } ).set_examples({LLAMA_EXAMPLE_PASSKEY})); - add_opt(llama_arg( + add_opt(common_arg( {"--pos"}, "N", - format("position of the passkey in the junk text (default: %d)", params.i_pos), - [](gpt_params & params, int value) { + string_format("position of the passkey in the junk text (default: %d)", params.i_pos), + [](common_params & params, int value) { params.i_pos = value; } ).set_examples({LLAMA_EXAMPLE_PASSKEY})); - add_opt(llama_arg( + add_opt(common_arg( {"-o", "--output", "--output-file"}, "FNAME", - format("output file (default: '%s')", + string_format("output file (default: '%s')", ex == LLAMA_EXAMPLE_EXPORT_LORA ? params.lora_outfile.c_str() : ex == LLAMA_EXAMPLE_CVECTOR_GENERATOR ? params.cvector_outfile.c_str() : params.out_file.c_str()), - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.out_file = value; params.cvector_outfile = value; params.lora_outfile = value; } ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA})); - add_opt(llama_arg( + add_opt(common_arg( {"-ofreq", "--output-frequency"}, "N", - format("output the imatrix every N iterations (default: %d)", params.n_out_freq), - [](gpt_params & params, int value) { + string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq), + [](common_params & params, int value) { params.n_out_freq = value; } ).set_examples({LLAMA_EXAMPLE_IMATRIX})); - add_opt(llama_arg( + add_opt(common_arg( {"--save-frequency"}, "N", - format("save an imatrix copy every N iterations (default: %d)", params.n_save_freq), - [](gpt_params & params, int value) { + string_format("save an imatrix copy every N iterations (default: %d)", params.n_save_freq), + [](common_params & params, int value) { params.n_save_freq = value; } ).set_examples({LLAMA_EXAMPLE_IMATRIX})); - add_opt(llama_arg( + add_opt(common_arg( {"--process-output"}, - format("collect data for the output tensor (default: %s)", params.process_output ? "true" : "false"), - [](gpt_params & params) { + string_format("collect data for the output tensor (default: %s)", params.process_output ? "true" : "false"), + [](common_params & params) { params.process_output = true; } ).set_examples({LLAMA_EXAMPLE_IMATRIX})); - add_opt(llama_arg( + add_opt(common_arg( {"--no-ppl"}, - format("do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false"), - [](gpt_params & params) { + string_format("do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false"), + [](common_params & params) { params.compute_ppl = false; } ).set_examples({LLAMA_EXAMPLE_IMATRIX})); - add_opt(llama_arg( + add_opt(common_arg( {"--chunk", "--from-chunk"}, "N", - format("start processing the input from chunk N (default: %d)", params.i_chunk), - [](gpt_params & params, int value) { + string_format("start processing the input from chunk N (default: %d)", params.i_chunk), + [](common_params & params, int value) { params.i_chunk = value; } ).set_examples({LLAMA_EXAMPLE_IMATRIX})); - add_opt(llama_arg( + add_opt(common_arg( {"-pps"}, - format("is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? "true" : "false"), - [](gpt_params & params) { + string_format("is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? "true" : "false"), + [](common_params & params) { params.is_pp_shared = true; } ).set_examples({LLAMA_EXAMPLE_BENCH})); - add_opt(llama_arg( + add_opt(common_arg( {"-npp"}, "n0,n1,...", "number of prompt tokens", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { auto p = string_split(value, ','); params.n_pp.insert(params.n_pp.end(), p.begin(), p.end()); } ).set_examples({LLAMA_EXAMPLE_BENCH})); - add_opt(llama_arg( + add_opt(common_arg( {"-ntg"}, "n0,n1,...", "number of text generation tokens", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { auto p = string_split(value, ','); params.n_tg.insert(params.n_tg.end(), p.begin(), p.end()); } ).set_examples({LLAMA_EXAMPLE_BENCH})); - add_opt(llama_arg( + add_opt(common_arg( {"-npl"}, "n0,n1,...", "number of parallel prompts", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { auto p = string_split(value, ','); params.n_pl.insert(params.n_pl.end(), p.begin(), p.end()); } ).set_examples({LLAMA_EXAMPLE_BENCH})); - add_opt(llama_arg( + add_opt(common_arg( {"--embd-normalize"}, "N", - format("normalisation for embendings (default: %d) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)", params.embd_normalize), - [](gpt_params & params, int value) { + string_format("normalisation for embendings (default: %d) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)", params.embd_normalize), + [](common_params & params, int value) { params.embd_normalize = value; } ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); - add_opt(llama_arg( + add_opt(common_arg( {"--embd-output-format"}, "FORMAT", "empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.embd_out = value; } ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); - add_opt(llama_arg( + add_opt(common_arg( {"--embd-separator"}, "STRING", "separator of embendings (default \\n) for example \"<#sep#>\"", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.embd_sep = value; } ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); - add_opt(llama_arg( + add_opt(common_arg( {"--host"}, "HOST", - format("ip address to listen (default: %s)", params.hostname.c_str()), - [](gpt_params & params, const std::string & value) { + string_format("ip address to listen (default: %s)", params.hostname.c_str()), + [](common_params & params, const std::string & value) { params.hostname = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_HOST")); - add_opt(llama_arg( + add_opt(common_arg( {"--port"}, "PORT", - format("port to listen (default: %d)", params.port), - [](gpt_params & params, int value) { + string_format("port to listen (default: %d)", params.port), + [](common_params & params, int value) { params.port = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_PORT")); - add_opt(llama_arg( + add_opt(common_arg( {"--path"}, "PATH", - format("path to serve static files from (default: %s)", params.public_path.c_str()), - [](gpt_params & params, const std::string & value) { + string_format("path to serve static files from (default: %s)", params.public_path.c_str()), + [](common_params & params, const std::string & value) { params.public_path = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_STATIC_PATH")); - add_opt(llama_arg( + add_opt(common_arg( {"--embedding", "--embeddings"}, - format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"), - [](gpt_params & params) { + string_format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"), + [](common_params & params) { params.embedding = true; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_EMBEDDINGS")); - add_opt(llama_arg( + add_opt(common_arg( {"--reranking", "--rerank"}, - format("enable reranking endpoint on server (default: %s)", params.reranking ? "enabled" : "disabled"), - [](gpt_params & params) { + string_format("enable reranking endpoint on server (default: %s)", params.reranking ? "enabled" : "disabled"), + [](common_params & params) { params.reranking = true; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_RERANKING")); - add_opt(llama_arg( + add_opt(common_arg( {"--api-key"}, "KEY", "API key to use for authentication (default: none)", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.api_keys.push_back(value); } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_API_KEY")); - add_opt(llama_arg( + add_opt(common_arg( {"--api-key-file"}, "FNAME", "path to file containing API keys (default: none)", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { std::ifstream key_file(value); if (!key_file) { - throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); + throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str())); } std::string key; while (std::getline(key_file, key)) { @@ -1785,70 +1773,74 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, key_file.close(); } ).set_examples({LLAMA_EXAMPLE_SERVER})); - add_opt(llama_arg( + add_opt(common_arg( {"--ssl-key-file"}, "FNAME", "path to file a PEM-encoded SSL private key", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.ssl_file_key = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_KEY_FILE")); - add_opt(llama_arg( + add_opt(common_arg( {"--ssl-cert-file"}, "FNAME", "path to file a PEM-encoded SSL certificate", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.ssl_file_cert = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_CERT_FILE")); - add_opt(llama_arg( + add_opt(common_arg( {"-to", "--timeout"}, "N", - format("server read/write timeout in seconds (default: %d)", params.timeout_read), - [](gpt_params & params, int value) { + string_format("server read/write timeout in seconds (default: %d)", params.timeout_read), + [](common_params & params, int value) { params.timeout_read = value; params.timeout_write = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_TIMEOUT")); - add_opt(llama_arg( + add_opt(common_arg( {"--threads-http"}, "N", - format("number of threads used to process HTTP requests (default: %d)", params.n_threads_http), - [](gpt_params & params, int value) { + string_format("number of threads used to process HTTP requests (default: %d)", params.n_threads_http), + [](common_params & params, int value) { params.n_threads_http = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_THREADS_HTTP")); - add_opt(llama_arg( - {"-spf", "--system-prompt-file"}, "FNAME", - "set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications", - [](gpt_params & params, const std::string & value) { - std::ifstream file(value); - if (!file) { - throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); - } - std::string system_prompt; - std::copy( - std::istreambuf_iterator(file), - std::istreambuf_iterator(), - std::back_inserter(system_prompt) - ); - params.system_prompt = system_prompt; + add_opt(common_arg( + {"--cache-reuse"}, "N", + string_format("min chunk size to attempt reusing from the cache via KV shifting (default: %d)", params.n_cache_reuse), + [](common_params & params, int value) { + params.n_cache_reuse = value; } - ).set_examples({LLAMA_EXAMPLE_SERVER})); - add_opt(llama_arg( + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CACHE_REUSE")); + add_opt(common_arg( {"--metrics"}, - format("enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled"), - [](gpt_params & params) { + string_format("enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled"), + [](common_params & params) { params.endpoint_metrics = true; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_METRICS")); - add_opt(llama_arg( + add_opt(common_arg( + {"--slots"}, + string_format("enable slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"), + [](common_params & params) { + params.endpoint_slots = true; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_SLOTS")); + add_opt(common_arg( + {"--props"}, + string_format("enable changing global properties via POST /props (default: %s)", params.endpoint_props ? "enabled" : "disabled"), + [](common_params & params) { + params.endpoint_props = true; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_PROPS")); + add_opt(common_arg( {"--no-slots"}, - format("disables slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"), - [](gpt_params & params) { + "disables slots monitoring endpoint", + [](common_params & params) { params.endpoint_slots = false; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_ENDPOINT_SLOTS")); - add_opt(llama_arg( + add_opt(common_arg( {"--slot-save-path"}, "PATH", "path to save slot kv cache (default: disabled)", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.slot_save_path = value; // if doesn't end with DIRECTORY_SEPARATOR, add it if (!params.slot_save_path.empty() && params.slot_save_path[params.slot_save_path.size() - 1] != DIRECTORY_SEPARATOR) { @@ -1856,14 +1848,14 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, } } ).set_examples({LLAMA_EXAMPLE_SERVER})); - add_opt(llama_arg( + add_opt(common_arg( {"--chat-template"}, "JINJA_TEMPLATE", "set custom jinja chat template (default: template taken from model's metadata)\n" "if suffix/prefix are specified, template will be disabled\n" "only commonly used templates are accepted:\nhttps://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template", - [](gpt_params & params, const std::string & value) { - if (!llama_chat_verify_template(value)) { - throw std::runtime_error(format( + [](common_params & params, const std::string & value) { + if (!common_chat_verify_template(value)) { + throw std::runtime_error(string_format( "error: the supplied chat template is not supported: %s\n" "note: llama.cpp does not use jinja parser, we only support commonly used templates\n", value.c_str() @@ -1872,31 +1864,31 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, params.chat_template = value; } ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE")); - add_opt(llama_arg( + add_opt(common_arg( {"-sps", "--slot-prompt-similarity"}, "SIMILARITY", - format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity), - [](gpt_params & params, const std::string & value) { + string_format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity), + [](common_params & params, const std::string & value) { params.slot_prompt_similarity = std::stof(value); } ).set_examples({LLAMA_EXAMPLE_SERVER})); - add_opt(llama_arg( + add_opt(common_arg( {"--lora-init-without-apply"}, - format("load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: %s)", params.lora_init_without_apply ? "enabled" : "disabled"), - [](gpt_params & params) { + string_format("load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: %s)", params.lora_init_without_apply ? "enabled" : "disabled"), + [](common_params & params) { params.lora_init_without_apply = true; } ).set_examples({LLAMA_EXAMPLE_SERVER})); - add_opt(llama_arg( + add_opt(common_arg( {"--simple-io"}, "use basic IO for better compatibility in subprocesses and limited consoles", - [](gpt_params & params) { + [](common_params & params) { params.simple_io = true; } ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL})); - add_opt(llama_arg( + add_opt(common_arg( {"-ld", "--logdir"}, "LOGDIR", "path under which to save YAML logs (no logging if unset)", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { params.logdir = value; if (params.logdir.back() != DIRECTORY_SEPARATOR) { @@ -1904,101 +1896,101 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, } } )); - add_opt(llama_arg( + add_opt(common_arg( {"--positive-file"}, "FNAME", - format("positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str()), - [](gpt_params & params, const std::string & value) { + string_format("positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str()), + [](common_params & params, const std::string & value) { params.cvector_positive_file = value; } ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); - add_opt(llama_arg( + add_opt(common_arg( {"--negative-file"}, "FNAME", - format("negative prompts file, one prompt per line (default: '%s')", params.cvector_negative_file.c_str()), - [](gpt_params & params, const std::string & value) { + string_format("negative prompts file, one prompt per line (default: '%s')", params.cvector_negative_file.c_str()), + [](common_params & params, const std::string & value) { params.cvector_negative_file = value; } ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); - add_opt(llama_arg( + add_opt(common_arg( {"--pca-batch"}, "N", - format("batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch), - [](gpt_params & params, int value) { + string_format("batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch), + [](common_params & params, int value) { params.n_pca_batch = value; } ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); - add_opt(llama_arg( + add_opt(common_arg( {"--pca-iter"}, "N", - format("number of iterations used for PCA (default: %d)", params.n_pca_iterations), - [](gpt_params & params, int value) { + string_format("number of iterations used for PCA (default: %d)", params.n_pca_iterations), + [](common_params & params, int value) { params.n_pca_iterations = value; } ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); - add_opt(llama_arg( + add_opt(common_arg( {"--method"}, "{pca, mean}", "dimensionality reduction method to be used (default: pca)", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { /**/ if (value == "pca") { params.cvector_dimre_method = DIMRE_METHOD_PCA; } else if (value == "mean") { params.cvector_dimre_method = DIMRE_METHOD_MEAN; } else { throw std::invalid_argument("invalid value"); } } ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); - add_opt(llama_arg( + add_opt(common_arg( {"--output-format"}, "{md,jsonl}", "output format for batched-bench results (default: md)", - [](gpt_params & params, const std::string & value) { + [](common_params & params, const std::string & value) { /**/ if (value == "jsonl") { params.batched_bench_output_jsonl = true; } else if (value == "md") { params.batched_bench_output_jsonl = false; } else { std::invalid_argument("invalid value"); } } ).set_examples({LLAMA_EXAMPLE_BENCH})); - add_opt(llama_arg( + add_opt(common_arg( {"--log-disable"}, "Log disable", - [](gpt_params &) { - gpt_log_pause(gpt_log_main()); + [](common_params &) { + common_log_pause(common_log_main()); } )); - add_opt(llama_arg( + add_opt(common_arg( {"--log-file"}, "FNAME", "Log to file", - [](gpt_params &, const std::string & value) { - gpt_log_set_file(gpt_log_main(), value.c_str()); + [](common_params &, const std::string & value) { + common_log_set_file(common_log_main(), value.c_str()); } )); - add_opt(llama_arg( + add_opt(common_arg( {"--log-colors"}, "Enable colored logging", - [](gpt_params &) { - gpt_log_set_colors(gpt_log_main(), true); + [](common_params &) { + common_log_set_colors(common_log_main(), true); } ).set_env("LLAMA_LOG_COLORS")); - add_opt(llama_arg( + add_opt(common_arg( {"-v", "--verbose", "--log-verbose"}, "Set verbosity level to infinity (i.e. log all messages, useful for debugging)", - [](gpt_params & params) { + [](common_params & params) { params.verbosity = INT_MAX; - gpt_log_set_verbosity_thold(INT_MAX); + common_log_set_verbosity_thold(INT_MAX); } )); - add_opt(llama_arg( + add_opt(common_arg( {"-lv", "--verbosity", "--log-verbosity"}, "N", "Set the verbosity threshold. Messages with a higher verbosity will be ignored.", - [](gpt_params & params, int value) { + [](common_params & params, int value) { params.verbosity = value; - gpt_log_set_verbosity_thold(value); + common_log_set_verbosity_thold(value); } ).set_env("LLAMA_LOG_VERBOSITY")); - add_opt(llama_arg( + add_opt(common_arg( {"--log-prefix"}, "Enable prefx in log messages", - [](gpt_params &) { - gpt_log_set_prefix(gpt_log_main(), true); + [](common_params &) { + common_log_set_prefix(common_log_main(), true); } ).set_env("LLAMA_LOG_PREFIX")); - add_opt(llama_arg( + add_opt(common_arg( {"--log-timestamps"}, "Enable timestamps in log messages", - [](gpt_params &) { - gpt_log_set_timestamps(gpt_log_main(), true); + [](common_params &) { + common_log_set_timestamps(common_log_main(), true); } ).set_env("LLAMA_LOG_TIMESTAMPS")); diff --git a/common/arg.h b/common/arg.h index 413de2c88..a6700d323 100644 --- a/common/arg.h +++ b/common/arg.h @@ -10,7 +10,7 @@ // CLI argument parsing // -struct llama_arg { +struct common_arg { std::set examples = {LLAMA_EXAMPLE_COMMON}; std::vector args; const char * value_hint = nullptr; // help text or example for arg value @@ -18,60 +18,60 @@ struct llama_arg { const char * env = nullptr; std::string help; bool is_sparam = false; // is current arg a sampling param? - void (*handler_void) (gpt_params & params) = nullptr; - void (*handler_string) (gpt_params & params, const std::string &) = nullptr; - void (*handler_str_str)(gpt_params & params, const std::string &, const std::string &) = nullptr; - void (*handler_int) (gpt_params & params, int) = nullptr; + void (*handler_void) (common_params & params) = nullptr; + void (*handler_string) (common_params & params, const std::string &) = nullptr; + void (*handler_str_str)(common_params & params, const std::string &, const std::string &) = nullptr; + void (*handler_int) (common_params & params, int) = nullptr; - llama_arg( + common_arg( const std::initializer_list & args, const char * value_hint, const std::string & help, - void (*handler)(gpt_params & params, const std::string &) + void (*handler)(common_params & params, const std::string &) ) : args(args), value_hint(value_hint), help(help), handler_string(handler) {} - llama_arg( + common_arg( const std::initializer_list & args, const char * value_hint, const std::string & help, - void (*handler)(gpt_params & params, int) + void (*handler)(common_params & params, int) ) : args(args), value_hint(value_hint), help(help), handler_int(handler) {} - llama_arg( + common_arg( const std::initializer_list & args, const std::string & help, - void (*handler)(gpt_params & params) + void (*handler)(common_params & params) ) : args(args), help(help), handler_void(handler) {} // support 2 values for arg - llama_arg( + common_arg( const std::initializer_list & args, const char * value_hint, const char * value_hint_2, const std::string & help, - void (*handler)(gpt_params & params, const std::string &, const std::string &) + void (*handler)(common_params & params, const std::string &, const std::string &) ) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {} - llama_arg & set_examples(std::initializer_list examples); - llama_arg & set_env(const char * env); - llama_arg & set_sparam(); + common_arg & set_examples(std::initializer_list examples); + common_arg & set_env(const char * env); + common_arg & set_sparam(); bool in_example(enum llama_example ex); bool get_value_from_env(std::string & output); bool has_value_from_env(); std::string to_string(); }; -struct gpt_params_context { +struct common_params_context { enum llama_example ex = LLAMA_EXAMPLE_COMMON; - gpt_params & params; - std::vector options; + common_params & params; + std::vector options; void(*print_usage)(int, char **) = nullptr; - gpt_params_context(gpt_params & params) : params(params) {} + common_params_context(common_params & params) : params(params) {} }; // parse input arguments from CLI // if one argument has invalid value, it will automatically display usage of the specific argument (and not the full usage message) -bool gpt_params_parse(int argc, char ** argv, gpt_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr); +bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr); // function to be used by test-arg-parser -gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr); +common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr); diff --git a/common/common.cpp b/common/common.cpp index 29df16c95..c08f01b42 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -12,6 +12,7 @@ #include #include +#include #include #include #include @@ -23,10 +24,10 @@ #include #include #include +#include #include #include #include -#include #if defined(__APPLE__) && defined(__MACH__) #include @@ -362,10 +363,10 @@ bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREAD return true; } -void gpt_init() { +void common_init() { llama_log_set([](ggml_log_level level, const char * text, void * /*user_data*/) { - if (LOG_DEFAULT_LLAMA <= gpt_log_verbosity_thold) { - gpt_log_add(gpt_log_main(), level, "%s", text); + if (LOG_DEFAULT_LLAMA <= common_log_verbosity_thold) { + common_log_add(common_log_main(), level, "%s", text); } }, NULL); @@ -378,7 +379,7 @@ void gpt_init() { LOG_INF("build: %d (%s) with %s for %s%s\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT, LLAMA_COMPILER, LLAMA_BUILD_TARGET, build_type); } -std::string gpt_params_get_system_info(const gpt_params & params) { +std::string common_params_get_system_info(const common_params & params) { std::ostringstream os; os << "system_info: n_threads = " << params.cpuparams.n_threads; @@ -400,6 +401,21 @@ std::string gpt_params_get_system_info(const gpt_params & params) { // String utils // +std::string string_format(const char * fmt, ...) { + va_list ap; + va_list ap2; + va_start(ap, fmt); + va_copy(ap2, ap); + int size = vsnprintf(NULL, 0, fmt, ap); + GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT + std::vector buf(size + 1); + int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2); + GGML_ASSERT(size2 == size); + va_end(ap2); + va_end(ap); + return std::string(buf.data(), size); +} + std::vector string_split(std::string input, char separator) { std::vector parts; size_t separator_pos = input.find(separator); @@ -493,7 +509,7 @@ std::string string_from(const struct llama_context * ctx, const std::vector & lora_adapters) { +void common_lora_adapters_apply(struct llama_context * ctx, std::vector & lora_adapters) { llama_lora_adapter_clear(ctx); for (auto & la : lora_adapters) { if (la.scale != 0.0f) { @@ -970,7 +986,7 @@ void llama_lora_adapters_apply(struct llama_context * ctx, std::vector curl(curl_easy_init(), &curl_easy_cleanup); @@ -1182,15 +1198,15 @@ static bool llama_download_file(const std::string & url, const std::string & pat } // Send a HEAD request to retrieve the etag and last-modified headers - struct llama_load_model_from_url_headers { + struct common_load_model_from_url_headers { std::string etag; std::string last_modified; }; - llama_load_model_from_url_headers headers; + common_load_model_from_url_headers headers; { typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *); auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t { - llama_load_model_from_url_headers *headers = (llama_load_model_from_url_headers *) userdata; + common_load_model_from_url_headers *headers = (common_load_model_from_url_headers *) userdata; static std::regex header_regex("([^:]+): (.*)\r\n"); static std::regex etag_regex("ETag", std::regex_constants::icase); @@ -1326,7 +1342,7 @@ static bool llama_download_file(const std::string & url, const std::string & pat return true; } -struct llama_model * llama_load_model_from_url( +struct llama_model * common_load_model_from_url( const char * model_url, const char * path_model, const char * hf_token, @@ -1337,7 +1353,7 @@ struct llama_model * llama_load_model_from_url( return NULL; } - if (!llama_download_file(model_url, path_model, hf_token)) { + if (!common_download_file(model_url, path_model, hf_token)) { return NULL; } @@ -1390,7 +1406,7 @@ struct llama_model * llama_load_model_from_url( char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0}; llama_split_path(split_url, sizeof(split_url), split_url_prefix, download_idx, n_split); - return llama_download_file(split_url, split_path, hf_token); + return common_download_file(split_url, split_path, hf_token); }, idx)); } @@ -1405,7 +1421,7 @@ struct llama_model * llama_load_model_from_url( return llama_load_model_from_file(path_model, params); } -struct llama_model * llama_load_model_from_hf( +struct llama_model * common_load_model_from_hf( const char * repo, const char * model, const char * path_model, @@ -1425,12 +1441,12 @@ struct llama_model * llama_load_model_from_hf( model_url += "/resolve/main/"; model_url += model; - return llama_load_model_from_url(model_url.c_str(), path_model, hf_token, params); + return common_load_model_from_url(model_url.c_str(), path_model, hf_token, params); } #else -struct llama_model * llama_load_model_from_url( +struct llama_model * common_load_model_from_url( const char * /*model_url*/, const char * /*path_model*/, const char * /*hf_token*/, @@ -1439,7 +1455,7 @@ struct llama_model * llama_load_model_from_url( return nullptr; } -struct llama_model * llama_load_model_from_hf( +struct llama_model * common_load_model_from_hf( const char * /*repo*/, const char * /*model*/, const char * /*path_model*/, @@ -1455,11 +1471,11 @@ struct llama_model * llama_load_model_from_hf( // Batch utils // -void llama_batch_clear(struct llama_batch & batch) { +void common_batch_clear(struct llama_batch & batch) { batch.n_tokens = 0; } -void llama_batch_add( +void common_batch_add( struct llama_batch & batch, llama_token id, llama_pos pos, @@ -1482,15 +1498,15 @@ void llama_batch_add( // Vocab utils // -std::vector llama_tokenize( +std::vector common_tokenize( const struct llama_context * ctx, const std::string & text, bool add_special, bool parse_special) { - return llama_tokenize(llama_get_model(ctx), text, add_special, parse_special); + return common_tokenize(llama_get_model(ctx), text, add_special, parse_special); } -std::vector llama_tokenize( +std::vector common_tokenize( const struct llama_model * model, const std::string & text, bool add_special, @@ -1509,7 +1525,7 @@ std::vector llama_tokenize( return result; } -std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) { +std::string common_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) { std::string piece; piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\n' const int n_chars = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special); @@ -1525,7 +1541,7 @@ std::string llama_token_to_piece(const struct llama_context * ctx, llama_token t return piece; } -std::string llama_detokenize(llama_context * ctx, const std::vector & tokens, bool special) { +std::string common_detokenize(llama_context * ctx, const std::vector & tokens, bool special) { std::string text; text.resize(std::max(text.capacity(), tokens.size())); int32_t n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special); @@ -1545,15 +1561,15 @@ std::string llama_detokenize(llama_context * ctx, const std::vector // Chat template utils // -bool llama_chat_verify_template(const std::string & tmpl) { +bool common_chat_verify_template(const std::string & tmpl) { llama_chat_message chat[] = {{"user", "test"}}; int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0); return res >= 0; } -std::string llama_chat_apply_template(const struct llama_model * model, +std::string common_chat_apply_template(const struct llama_model * model, const std::string & tmpl, - const std::vector & msgs, + const std::vector & msgs, bool add_ass) { int alloc_size = 0; bool fallback = false; // indicate if we must fallback to default chatml @@ -1595,42 +1611,42 @@ std::string llama_chat_apply_template(const struct llama_model * model, return formatted_chat; } -std::string llama_chat_format_single(const struct llama_model * model, +std::string common_chat_format_single(const struct llama_model * model, const std::string & tmpl, - const std::vector & past_msg, - const llama_chat_msg & new_msg, + const std::vector & past_msg, + const common_chat_msg & new_msg, bool add_ass) { std::ostringstream ss; - auto fmt_past_msg = past_msg.empty() ? "" : llama_chat_apply_template(model, tmpl, past_msg, false); - std::vector chat_new(past_msg); + auto fmt_past_msg = past_msg.empty() ? "" : common_chat_apply_template(model, tmpl, past_msg, false); + std::vector chat_new(past_msg); // if the past_msg ends with a newline, we must preserve it in the formatted version if (add_ass && !fmt_past_msg.empty() && fmt_past_msg.back() == '\n') { ss << "\n"; }; // format chat with new_msg chat_new.push_back(new_msg); - auto fmt_new_msg = llama_chat_apply_template(model, tmpl, chat_new, add_ass); + auto fmt_new_msg = common_chat_apply_template(model, tmpl, chat_new, add_ass); // get the diff part ss << fmt_new_msg.substr(fmt_past_msg.size(), fmt_new_msg.size() - fmt_past_msg.size()); return ss.str(); } -std::string llama_chat_format_example(const struct llama_model * model, +std::string common_chat_format_example(const struct llama_model * model, const std::string & tmpl) { - std::vector msgs = { + std::vector msgs = { {"system", "You are a helpful assistant"}, {"user", "Hello"}, {"assistant", "Hi there"}, {"user", "How are you?"}, }; - return llama_chat_apply_template(model, tmpl, msgs, true); + return common_chat_apply_template(model, tmpl, msgs, true); } // // KV cache utils // -void llama_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size) { +void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size) { static const char slot_chars[] = ".123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+"; printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d", @@ -1653,7 +1669,7 @@ void llama_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size) { printf("\n=== Done dumping\n"); } -void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size) { +void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size) { static const char slot_chars[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d\n", @@ -1705,7 +1721,7 @@ void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_siz // Embedding utils // -void llama_embd_normalize(const float * inp, float * out, int n, int embd_norm) { +void common_embd_normalize(const float * inp, float * out, int n, int embd_norm) { double sum = 0.0; switch (embd_norm) { @@ -1739,7 +1755,7 @@ void llama_embd_normalize(const float * inp, float * out, int n, int embd_norm) } } -float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n){ +float common_embd_similarity_cos(const float * embd1, const float * embd2, int n){ double sum = 0.0; double sum1 = 0.0; double sum2 = 0.0; @@ -1765,8 +1781,8 @@ float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n) // Control vector utils // -static llama_control_vector_data llama_control_vector_load_one(const llama_control_vector_load_info & load_info) { - llama_control_vector_data result = { -1, {} }; +static common_control_vector_data common_control_vector_load_one(const common_control_vector_load_info & load_info) { + common_control_vector_data result = { -1, {} }; ggml_context * ctx = nullptr; struct gguf_init_params meta_gguf_params = { @@ -1850,11 +1866,11 @@ static llama_control_vector_data llama_control_vector_load_one(const llama_contr return result; } -llama_control_vector_data llama_control_vector_load(const std::vector & load_infos) { - llama_control_vector_data result = { -1, {} }; +common_control_vector_data common_control_vector_load(const std::vector & load_infos) { + common_control_vector_data result = { -1, {} }; for (const auto & info : load_infos) { - auto cur = llama_control_vector_load_one(info); + auto cur = common_control_vector_load_one(info); if (cur.n_embd == -1) { result.n_embd = -1; @@ -1946,7 +1962,7 @@ void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const cha } } -void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const llama_context * lctx, +void yaml_dump_non_result_info(FILE * stream, const common_params & params, const llama_context * lctx, const std::string & timestamp, const std::vector & prompt_tokens, const char * model_desc) { const auto & sparams = params.sparams; @@ -2088,6 +2104,8 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k); fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p); fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p); + fprintf(stream, "xtc_probability: %f # default: 0.0\n", sparams.xtc_probability); + fprintf(stream, "xtc_threshold: %f # default: 0.1\n", sparams.xtc_threshold); fprintf(stream, "typ_p: %f # default: 1.0\n", sparams.typ_p); fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false"); fprintf(stream, "display_prompt: %s # default: true\n", params.display_prompt ? "true" : "false"); diff --git a/common/common.h b/common/common.h index 8b84cf9ad..5ca8fd391 100644 --- a/common/common.h +++ b/common/common.h @@ -24,12 +24,12 @@ #define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf" -struct llama_lora_adapter_info { +struct common_lora_adapter_info { std::string path; float scale; }; -struct llama_lora_adapter_container : llama_lora_adapter_info { +struct common_lora_adapter_container : common_lora_adapter_info { struct llama_lora_adapter * adapter; }; @@ -39,7 +39,7 @@ extern char const * LLAMA_COMMIT; extern char const * LLAMA_COMPILER; extern char const * LLAMA_BUILD_TARGET; -struct llama_control_vector_load_info; +struct common_control_vector_load_info; // // CPU utils @@ -82,14 +82,16 @@ enum llama_example { LLAMA_EXAMPLE_COUNT, }; -enum gpt_sampler_type { - GPT_SAMPLER_TYPE_NONE = 0, - GPT_SAMPLER_TYPE_TOP_K = 1, - GPT_SAMPLER_TYPE_TOP_P = 2, - GPT_SAMPLER_TYPE_MIN_P = 3, - GPT_SAMPLER_TYPE_TFS_Z = 4, - GPT_SAMPLER_TYPE_TYPICAL_P = 5, - GPT_SAMPLER_TYPE_TEMPERATURE = 6, +enum common_sampler_type { + COMMON_SAMPLER_TYPE_NONE = 0, + COMMON_SAMPLER_TYPE_TOP_K = 1, + COMMON_SAMPLER_TYPE_TOP_P = 2, + COMMON_SAMPLER_TYPE_MIN_P = 3, + COMMON_SAMPLER_TYPE_TFS_Z = 4, + COMMON_SAMPLER_TYPE_TYPICAL_P = 5, + COMMON_SAMPLER_TYPE_TEMPERATURE = 6, + COMMON_SAMPLER_TYPE_XTC = 7, + COMMON_SAMPLER_TYPE_INFILL = 8, }; // dimensionality reduction methods, used by cvector-generator @@ -99,7 +101,7 @@ enum dimre_method { }; // sampler parameters -struct gpt_sampler_params { +struct common_sampler_params { uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler int32_t n_prev = 64; // number of previous tokens to remember @@ -108,6 +110,8 @@ struct gpt_sampler_params { int32_t top_k = 40; // <= 0 to use vocab size float top_p = 0.95f; // 1.0 = disabled float min_p = 0.05f; // 0.0 = disabled + float xtc_probability = 0.00f; // 0.0 = disabled + float xtc_threshold = 0.10f; // > 0.5 disables XTC float tfs_z = 1.00f; // 1.0 = disabled float typ_p = 1.00f; // typical_p, 1.0 = disabled float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities @@ -124,13 +128,15 @@ struct gpt_sampler_params { bool ignore_eos = false; bool no_perf = false; // disable performance metrics - std::vector samplers = { - GPT_SAMPLER_TYPE_TOP_K, - GPT_SAMPLER_TYPE_TFS_Z, - GPT_SAMPLER_TYPE_TYPICAL_P, - GPT_SAMPLER_TYPE_TOP_P, - GPT_SAMPLER_TYPE_MIN_P, - GPT_SAMPLER_TYPE_TEMPERATURE + + std::vector samplers = { + COMMON_SAMPLER_TYPE_TOP_K, + COMMON_SAMPLER_TYPE_TFS_Z, + COMMON_SAMPLER_TYPE_TYPICAL_P, + COMMON_SAMPLER_TYPE_TOP_P, + COMMON_SAMPLER_TYPE_MIN_P, + COMMON_SAMPLER_TYPE_XTC, + COMMON_SAMPLER_TYPE_TEMPERATURE, }; std::string grammar; // optional BNF-like grammar to constrain sampling @@ -141,7 +147,7 @@ struct gpt_sampler_params { std::string print() const; }; -struct gpt_params { +struct common_params { int32_t n_predict = -1; // new tokens to predict int32_t n_ctx = 0; // context size int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS) @@ -183,7 +189,7 @@ struct gpt_params { enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings - struct gpt_sampler_params sparams; + struct common_sampler_params sparams; std::string model = ""; // model path // NOLINT std::string model_draft = ""; // draft model for speculative decoding // NOLINT @@ -208,9 +214,9 @@ struct gpt_params { std::vector kv_overrides; bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_lora_adapter_apply) - std::vector lora_adapters; // lora adapter path with user defined scale + std::vector lora_adapters; // lora adapter path with user defined scale - std::vector control_vectors; // control vector with user defined scale + std::vector control_vectors; // control vector with user defined scale int32_t verbosity = 0; int32_t control_vector_layer_start = -1; // layer range for control vector @@ -277,12 +283,12 @@ struct gpt_params { int32_t port = 8080; // server listens on this network port int32_t timeout_read = 600; // http read timeout in seconds int32_t timeout_write = timeout_read; // http write timeout in seconds - int n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool) + int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool) + int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting std::string hostname = "127.0.0.1"; std::string public_path = ""; // NOLINT std::string chat_template = ""; // NOLINT - std::string system_prompt = ""; // NOLINT bool enable_chat_template = true; std::vector api_keys; @@ -290,7 +296,10 @@ struct gpt_params { std::string ssl_file_key = ""; // NOLINT std::string ssl_file_cert = ""; // NOLINT - bool endpoint_slots = true; + // "advanced" endpoints are disabled by default for better security + bool webui = true; + bool endpoint_slots = false; + bool endpoint_props = false; // only control POST requests, not GET bool endpoint_metrics = false; bool log_json = false; @@ -345,19 +354,32 @@ struct gpt_params { // call once at the start of a program if it uses libcommon // initializes the logging system and prints info about the build -void gpt_init(); +void common_init(); -std::string gpt_params_get_system_info(const gpt_params & params); +std::string common_params_get_system_info(const common_params & params); -bool parse_cpu_range(const std::string& range, bool(&boolmask)[GGML_MAX_N_THREADS]); -bool parse_cpu_mask(const std::string& mask, bool(&boolmask)[GGML_MAX_N_THREADS]); -void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model = nullptr); +bool parse_cpu_range(const std::string & range, bool(&boolmask)[GGML_MAX_N_THREADS]); +bool parse_cpu_mask(const std::string & mask, bool(&boolmask)[GGML_MAX_N_THREADS]); +void postprocess_cpu_params(cpu_params & cpuparams, const cpu_params * role_model = nullptr); bool set_process_priority(enum ggml_sched_priority prio); // // String utils // +#ifdef __GNUC__ +#ifdef __MINGW32__ +#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__))) +#else +#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__))) +#endif +#else +#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) +#endif + +LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2) +std::string string_format(const char * fmt, ...); + std::vector string_split(std::string input, char separator); std::string string_strip(const std::string & str); @@ -401,29 +423,29 @@ std::string fs_get_cache_file(const std::string & filename); // Model utils // -struct llama_init_result { +struct common_init_result { struct llama_model * model = nullptr; struct llama_context * context = nullptr; - std::vector lora_adapters; + std::vector lora_adapters; }; -struct llama_init_result llama_init_from_gpt_params(gpt_params & params); +struct common_init_result common_init_from_params(common_params & params); -struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params); -struct llama_context_params llama_context_params_from_gpt_params (const gpt_params & params); +struct llama_model_params common_model_params_to_llama (const common_params & params); +struct llama_context_params common_context_params_to_llama(const common_params & params); struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params); -struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params); -struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params); +struct llama_model * common_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params); +struct llama_model * common_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params); // clear LoRA adapters from context, then apply new list of adapters -void llama_lora_adapters_apply(struct llama_context * ctx, std::vector & lora_adapters); +void common_lora_adapters_apply(struct llama_context * ctx, std::vector & lora_adapters); // Batch utils -void llama_batch_clear(struct llama_batch & batch); +void common_batch_clear(struct llama_batch & batch); -void llama_batch_add( +void common_batch_add( struct llama_batch & batch, llama_token id, llama_pos pos, @@ -436,13 +458,13 @@ void llama_batch_add( // tokenizes a string into a vector of tokens // should work similar to Python's `tokenizer.encode` -std::vector llama_tokenize( +std::vector common_tokenize( const struct llama_context * ctx, const std::string & text, bool add_special, bool parse_special = false); -std::vector llama_tokenize( +std::vector common_tokenize( const struct llama_model * model, const std::string & text, bool add_special, @@ -450,7 +472,7 @@ std::vector llama_tokenize( // tokenizes a token into a piece, optionally renders special/control tokens // should work similar to Python's `tokenizer.id_to_piece` -std::string llama_token_to_piece( +std::string common_token_to_piece( const struct llama_context * ctx, llama_token token, bool special = true); @@ -458,7 +480,7 @@ std::string llama_token_to_piece( // detokenizes a vector of tokens into a string // should work similar to Python's `tokenizer.decode` // optionally renders special/control tokens -std::string llama_detokenize( +std::string common_detokenize( llama_context * ctx, const std::vector & tokens, bool special = true); @@ -468,31 +490,31 @@ std::string llama_detokenize( // // same with llama_chat_message, but uses std::string -struct llama_chat_msg { +struct common_chat_msg { std::string role; std::string content; }; // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid -bool llama_chat_verify_template(const std::string & tmpl); +bool common_chat_verify_template(const std::string & tmpl); // CPP wrapper for llama_chat_apply_template // If the built-in template is not supported, we default to chatml // If the custom "tmpl" is not supported, we throw an error -std::string llama_chat_apply_template(const struct llama_model * model, +std::string common_chat_apply_template(const struct llama_model * model, const std::string & tmpl, - const std::vector & chat, + const std::vector & chat, bool add_ass); // Format single message, while taking into account the position of that message in chat history -std::string llama_chat_format_single(const struct llama_model * model, +std::string common_chat_format_single(const struct llama_model * model, const std::string & tmpl, - const std::vector & past_msg, - const llama_chat_msg & new_msg, + const std::vector & past_msg, + const common_chat_msg & new_msg, bool add_ass); // Returns an example of formatted chat -std::string llama_chat_format_example(const struct llama_model * model, +std::string common_chat_format_example(const struct llama_model * model, const std::string & tmpl); // @@ -500,31 +522,31 @@ std::string llama_chat_format_example(const struct llama_model * model, // // Dump the KV cache view with the number of sequences per cell. -void llama_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80); +void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80); // Dump the KV cache view showing individual sequences in each cell (long output). -void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40); +void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40); // // Embedding utils // -void llama_embd_normalize(const float * inp, float * out, int n, int embd_norm = 2); +void common_embd_normalize(const float * inp, float * out, int n, int embd_norm = 2); -float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n); +float common_embd_similarity_cos(const float * embd1, const float * embd2, int n); // // Control vector utils // -struct llama_control_vector_data { +struct common_control_vector_data { int n_embd; // stores data for layers [1, n_layer] where n_layer = data.size() / n_embd std::vector data; }; -struct llama_control_vector_load_info { +struct common_control_vector_load_info { float strength; std::string fname; @@ -532,7 +554,7 @@ struct llama_control_vector_load_info { // Load control vectors, scale each by strength, and add them together. // On error, returns {-1, empty} -llama_control_vector_data llama_control_vector_load(const std::vector & load_infos); +common_control_vector_data common_control_vector_load(const std::vector & load_infos); // // Split utils @@ -551,5 +573,5 @@ void yaml_dump_vector_int (FILE * stream, const char * prop_name, const std void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data); void yaml_dump_non_result_info( - FILE * stream, const gpt_params & params, const llama_context * lctx, + FILE * stream, const common_params & params, const llama_context * lctx, const std::string & timestamp, const std::vector & prompt_tokens, const char * model_desc); diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp index 881eb49e3..dadc18c8b 100644 --- a/common/json-schema-to-grammar.cpp +++ b/common/json-schema-to-grammar.cpp @@ -611,7 +611,7 @@ private: } return join_seq(); }; - return _add_rule(name, "\"\\\"\" " + to_rule(transform()) + " \"\\\"\" space"); + return _add_rule(name, "\"\\\"\" (" + to_rule(transform()) + ") \"\\\"\" space"); } /* diff --git a/common/log.cpp b/common/log.cpp index 5a844ed59..04c7c0ed1 100644 --- a/common/log.cpp +++ b/common/log.cpp @@ -8,10 +8,10 @@ #include #include -int gpt_log_verbosity_thold = LOG_DEFAULT_LLAMA; +int common_log_verbosity_thold = LOG_DEFAULT_LLAMA; -void gpt_log_set_verbosity_thold(int verbosity) { - gpt_log_verbosity_thold = verbosity; +void common_log_set_verbosity_thold(int verbosity) { + common_log_verbosity_thold = verbosity; } #define LOG_COL_DEFAULT "\033[0m" @@ -29,16 +29,16 @@ static int64_t t_us() { } // colors -enum gpt_log_col : int { - GPT_LOG_COL_DEFAULT = 0, - GPT_LOG_COL_BOLD, - GPT_LOG_COL_RED, - GPT_LOG_COL_GREEN, - GPT_LOG_COL_YELLOW, - GPT_LOG_COL_BLUE, - GPT_LOG_COL_MAGENTA, - GPT_LOG_COL_CYAN, - GPT_LOG_COL_WHITE, +enum common_log_col : int { + COMMON_LOG_COL_DEFAULT = 0, + COMMON_LOG_COL_BOLD, + COMMON_LOG_COL_RED, + COMMON_LOG_COL_GREEN, + COMMON_LOG_COL_YELLOW, + COMMON_LOG_COL_BLUE, + COMMON_LOG_COL_MAGENTA, + COMMON_LOG_COL_CYAN, + COMMON_LOG_COL_WHITE, }; // disable colors by default @@ -54,7 +54,7 @@ static std::vector g_col = { "", }; -struct gpt_log_entry { +struct common_log_entry { enum ggml_log_level level; bool prefix; @@ -71,7 +71,7 @@ struct gpt_log_entry { if (!fcur) { // stderr displays DBG messages only when their verbosity level is not higher than the threshold // these messages will still be logged to a file - if (level == GGML_LOG_LEVEL_DEBUG && gpt_log_verbosity_thold < LOG_DEFAULT_DEBUG) { + if (level == GGML_LOG_LEVEL_DEBUG && common_log_verbosity_thold < LOG_DEFAULT_DEBUG) { return; } @@ -86,19 +86,19 @@ struct gpt_log_entry { if (timestamp) { // [M.s.ms.us] fprintf(fcur, "%s%d.%02d.%03d.%03d%s ", - g_col[GPT_LOG_COL_BLUE], + g_col[COMMON_LOG_COL_BLUE], (int) (timestamp / 1000000 / 60), (int) (timestamp / 1000000 % 60), (int) (timestamp / 1000 % 1000), (int) (timestamp % 1000), - g_col[GPT_LOG_COL_DEFAULT]); + g_col[COMMON_LOG_COL_DEFAULT]); } switch (level) { - case GGML_LOG_LEVEL_INFO: fprintf(fcur, "%sI %s", g_col[GPT_LOG_COL_GREEN], g_col[GPT_LOG_COL_DEFAULT]); break; - case GGML_LOG_LEVEL_WARN: fprintf(fcur, "%sW %s", g_col[GPT_LOG_COL_MAGENTA], "" ); break; - case GGML_LOG_LEVEL_ERROR: fprintf(fcur, "%sE %s", g_col[GPT_LOG_COL_RED], "" ); break; - case GGML_LOG_LEVEL_DEBUG: fprintf(fcur, "%sD %s", g_col[GPT_LOG_COL_YELLOW], "" ); break; + case GGML_LOG_LEVEL_INFO: fprintf(fcur, "%sI %s", g_col[COMMON_LOG_COL_GREEN], g_col[COMMON_LOG_COL_DEFAULT]); break; + case GGML_LOG_LEVEL_WARN: fprintf(fcur, "%sW %s", g_col[COMMON_LOG_COL_MAGENTA], "" ); break; + case GGML_LOG_LEVEL_ERROR: fprintf(fcur, "%sE %s", g_col[COMMON_LOG_COL_RED], "" ); break; + case GGML_LOG_LEVEL_DEBUG: fprintf(fcur, "%sD %s", g_col[COMMON_LOG_COL_YELLOW], "" ); break; default: break; } @@ -107,18 +107,18 @@ struct gpt_log_entry { fprintf(fcur, "%s", msg.data()); if (level == GGML_LOG_LEVEL_WARN || level == GGML_LOG_LEVEL_ERROR || level == GGML_LOG_LEVEL_DEBUG) { - fprintf(fcur, "%s", g_col[GPT_LOG_COL_DEFAULT]); + fprintf(fcur, "%s", g_col[COMMON_LOG_COL_DEFAULT]); } fflush(fcur); } }; -struct gpt_log { +struct common_log { // default capacity - will be expanded if needed - gpt_log() : gpt_log(256) {} + common_log() : common_log(256) {} - gpt_log(size_t capacity) { + common_log(size_t capacity) { file = nullptr; prefix = false; timestamps = false; @@ -137,7 +137,7 @@ struct gpt_log { resume(); } - ~gpt_log() { + ~common_log() { pause(); if (file) { fclose(file); @@ -158,12 +158,12 @@ private: int64_t t_start; // ring buffer of entries - std::vector entries; + std::vector entries; size_t head; size_t tail; // worker thread copies into this - gpt_log_entry cur; + common_log_entry cur; public: void add(enum ggml_log_level level, const char * fmt, va_list args) { @@ -219,7 +219,7 @@ public: tail = (tail + 1) % entries.size(); if (tail == head) { // expand the buffer - std::vector new_entries(2*entries.size()); + std::vector new_entries(2*entries.size()); size_t new_tail = 0; @@ -320,15 +320,15 @@ public: pause(); if (colors) { - g_col[GPT_LOG_COL_DEFAULT] = LOG_COL_DEFAULT; - g_col[GPT_LOG_COL_BOLD] = LOG_COL_BOLD; - g_col[GPT_LOG_COL_RED] = LOG_COL_RED; - g_col[GPT_LOG_COL_GREEN] = LOG_COL_GREEN; - g_col[GPT_LOG_COL_YELLOW] = LOG_COL_YELLOW; - g_col[GPT_LOG_COL_BLUE] = LOG_COL_BLUE; - g_col[GPT_LOG_COL_MAGENTA] = LOG_COL_MAGENTA; - g_col[GPT_LOG_COL_CYAN] = LOG_COL_CYAN; - g_col[GPT_LOG_COL_WHITE] = LOG_COL_WHITE; + g_col[COMMON_LOG_COL_DEFAULT] = LOG_COL_DEFAULT; + g_col[COMMON_LOG_COL_BOLD] = LOG_COL_BOLD; + g_col[COMMON_LOG_COL_RED] = LOG_COL_RED; + g_col[COMMON_LOG_COL_GREEN] = LOG_COL_GREEN; + g_col[COMMON_LOG_COL_YELLOW] = LOG_COL_YELLOW; + g_col[COMMON_LOG_COL_BLUE] = LOG_COL_BLUE; + g_col[COMMON_LOG_COL_MAGENTA] = LOG_COL_MAGENTA; + g_col[COMMON_LOG_COL_CYAN] = LOG_COL_CYAN; + g_col[COMMON_LOG_COL_WHITE] = LOG_COL_WHITE; } else { for (size_t i = 0; i < g_col.size(); i++) { g_col[i] = ""; @@ -355,47 +355,47 @@ public: // public API // -struct gpt_log * gpt_log_init() { - return new gpt_log; +struct common_log * common_log_init() { + return new common_log; } -struct gpt_log * gpt_log_main() { - static struct gpt_log log; +struct common_log * common_log_main() { + static struct common_log log; return &log; } -void gpt_log_pause(struct gpt_log * log) { +void common_log_pause(struct common_log * log) { log->pause(); } -void gpt_log_resume(struct gpt_log * log) { +void common_log_resume(struct common_log * log) { log->resume(); } -void gpt_log_free(struct gpt_log * log) { +void common_log_free(struct common_log * log) { delete log; } -void gpt_log_add(struct gpt_log * log, enum ggml_log_level level, const char * fmt, ...) { +void common_log_add(struct common_log * log, enum ggml_log_level level, const char * fmt, ...) { va_list args; va_start(args, fmt); log->add(level, fmt, args); va_end(args); } -void gpt_log_set_file(struct gpt_log * log, const char * file) { +void common_log_set_file(struct common_log * log, const char * file) { log->set_file(file); } -void gpt_log_set_colors(struct gpt_log * log, bool colors) { +void common_log_set_colors(struct common_log * log, bool colors) { log->set_colors(colors); } -void gpt_log_set_prefix(struct gpt_log * log, bool prefix) { +void common_log_set_prefix(struct common_log * log, bool prefix) { log->set_prefix(prefix); } -void gpt_log_set_timestamps(struct gpt_log * log, bool timestamps) { +void common_log_set_timestamps(struct common_log * log, bool timestamps) { log->set_timestamps(timestamps); } diff --git a/common/log.h b/common/log.h index 84f9b3ed7..66605cc69 100644 --- a/common/log.h +++ b/common/log.h @@ -14,23 +14,23 @@ #define LOG_DEFAULT_LLAMA 0 // needed by the LOG_TMPL macro to avoid computing log arguments if the verbosity lower -// set via gpt_log_set_verbosity() -extern int gpt_log_verbosity_thold; +// set via common_log_set_verbosity() +extern int common_log_verbosity_thold; -void gpt_log_set_verbosity_thold(int verbosity); // not thread-safe +void common_log_set_verbosity_thold(int verbosity); // not thread-safe -// the gpt_log uses an internal worker thread to print/write log messages +// the common_log uses an internal worker thread to print/write log messages // when the worker thread is paused, incoming log messages are discarded -struct gpt_log; +struct common_log; -struct gpt_log * gpt_log_init(); -struct gpt_log * gpt_log_main(); // singleton, automatically destroys itself on exit -void gpt_log_pause (struct gpt_log * log); // pause the worker thread, not thread-safe -void gpt_log_resume(struct gpt_log * log); // resume the worker thread, not thread-safe -void gpt_log_free (struct gpt_log * log); +struct common_log * common_log_init(); +struct common_log * common_log_main(); // singleton, automatically destroys itself on exit +void common_log_pause (struct common_log * log); // pause the worker thread, not thread-safe +void common_log_resume(struct common_log * log); // resume the worker thread, not thread-safe +void common_log_free (struct common_log * log); LOG_ATTRIBUTE_FORMAT(3, 4) -void gpt_log_add(struct gpt_log * log, enum ggml_log_level level, const char * fmt, ...); +void common_log_add(struct common_log * log, enum ggml_log_level level, const char * fmt, ...); // defaults: file = NULL, colors = false, prefix = false, timestamps = false // @@ -54,10 +54,10 @@ void gpt_log_add(struct gpt_log * log, enum ggml_log_level level, const char * f // D - debug (stderr, V = LOG_DEFAULT_DEBUG) // -void gpt_log_set_file (struct gpt_log * log, const char * file); // not thread-safe -void gpt_log_set_colors (struct gpt_log * log, bool colors); // not thread-safe -void gpt_log_set_prefix (struct gpt_log * log, bool prefix); // whether to output prefix to each log -void gpt_log_set_timestamps(struct gpt_log * log, bool timestamps); // whether to output timestamps in the prefix +void common_log_set_file (struct common_log * log, const char * file); // not thread-safe +void common_log_set_colors (struct common_log * log, bool colors); // not thread-safe +void common_log_set_prefix (struct common_log * log, bool prefix); // whether to output prefix to each log +void common_log_set_timestamps(struct common_log * log, bool timestamps); // whether to output timestamps in the prefix // helper macros for logging // use these to avoid computing log arguments if the verbosity of the log is higher than the threshold @@ -66,13 +66,13 @@ void gpt_log_set_timestamps(struct gpt_log * log, bool timestamps); // w // // LOG_DBG("this is a debug message: %d\n", expensive_function()); // -// this will avoid calling expensive_function() if LOG_DEFAULT_DEBUG > gpt_log_verbosity_thold +// this will avoid calling expensive_function() if LOG_DEFAULT_DEBUG > common_log_verbosity_thold // #define LOG_TMPL(level, verbosity, ...) \ do { \ - if ((verbosity) <= gpt_log_verbosity_thold) { \ - gpt_log_add(gpt_log_main(), (level), __VA_ARGS__); \ + if ((verbosity) <= common_log_verbosity_thold) { \ + common_log_add(common_log_main(), (level), __VA_ARGS__); \ } \ } while (0) diff --git a/common/ngram-cache.cpp b/common/ngram-cache.cpp index 7953c723e..a9dfb6714 100644 --- a/common/ngram-cache.cpp +++ b/common/ngram-cache.cpp @@ -8,7 +8,7 @@ #include #include -void llama_ngram_cache_update(llama_ngram_cache & ngram_cache, int ngram_min, int ngram_max, +void common_ngram_cache_update(common_ngram_cache & ngram_cache, int ngram_min, int ngram_max, std::vector & inp, int nnew, bool print_progress) { const int64_t t_start_ms = ggml_time_ms(); const int64_t inp_size = inp.size(); @@ -20,16 +20,16 @@ void llama_ngram_cache_update(llama_ngram_cache & ngram_cache, int ngram_min, in const int64_t i_start = std::max(inp_size - nnew, ngram_size); for (int64_t i = i_start; i < inp_size; ++i) { const int64_t ngram_start = i - ngram_size; - llama_ngram ngram(&inp[ngram_start], ngram_size); + common_ngram ngram(&inp[ngram_start], ngram_size); const llama_token token = inp[i]; - llama_ngram_cache::iterator part_it = ngram_cache.find(ngram); + common_ngram_cache::iterator part_it = ngram_cache.find(ngram); if (part_it == ngram_cache.end()) { - llama_ngram_cache_part part; + common_ngram_cache_part part; part.emplace(token, 1); ngram_cache.emplace(ngram, part); } else { - llama_ngram_cache_part::iterator token_count_it = part_it->second.find(token); + common_ngram_cache_part::iterator token_count_it = part_it->second.find(token); if (token_count_it == part_it->second.end()) { part_it->second.emplace(token, 1); } else { @@ -62,12 +62,12 @@ constexpr int draft_min_sample_size_strict[LLAMA_NGRAM_MAX] = { 4, 3, 2, 2}; constexpr int draft_min_percent_strict[LLAMA_NGRAM_MAX] = {75, 66, 66, 66}; // Helper function that tries to draft a token from only the static ngram cache: -static llama_token try_draft(llama_ngram_cache & nc_static, const llama_ngram ngram_static) { - llama_ngram_cache::iterator part_static_it = nc_static.find(ngram_static); +static llama_token try_draft(common_ngram_cache & nc_static, const common_ngram ngram_static) { + common_ngram_cache::iterator part_static_it = nc_static.find(ngram_static); if (part_static_it == nc_static.end()) { return -1; } - const llama_ngram_cache_part part_static = part_static_it->second; + const common_ngram_cache_part part_static = part_static_it->second; int max_count_static = 0; int sum_count_static = 0; @@ -95,19 +95,19 @@ static llama_token try_draft(llama_ngram_cache & nc_static, const llama_ngram ng // Try to draft a token from primary cache (context/dynamic), validate with static cache: static llama_token try_draft( - llama_ngram_cache & nc_primary, const std::vector & ngrams_primary, llama_ngram_cache_part & part_static, + common_ngram_cache & nc_primary, const std::vector & ngrams_primary, common_ngram_cache_part & part_static, const int * min_sample_size, const int * min_percent) { llama_token drafted_token = -1; for (int i = ngrams_primary.size()-1; i >= 0 && drafted_token == -1; --i) { - const llama_ngram ngram_primary = ngrams_primary[i]; + const common_ngram ngram_primary = ngrams_primary[i]; - llama_ngram_cache::iterator part_primary_it = nc_primary.find(ngram_primary); + common_ngram_cache::iterator part_primary_it = nc_primary.find(ngram_primary); if (part_primary_it == nc_primary.end()) { continue; } - const llama_ngram_cache_part part_primary = part_primary_it->second; + const common_ngram_cache_part part_primary = part_primary_it->second; int max_count_primary = 0; int max_count_static = 0; @@ -117,7 +117,7 @@ static llama_token try_draft( for (std::pair token_count_primary : part_primary) { const llama_token token = token_count_primary.first; - llama_ngram_cache_part::iterator token_count_static_it = part_static.find(token); + common_ngram_cache_part::iterator token_count_static_it = part_static.find(token); const int32_t count_primary = token_count_primary.second; const int32_t count_static = token_count_static_it != part_static.end() ? 100*token_count_static_it->second : 1; @@ -142,9 +142,9 @@ static llama_token try_draft( return drafted_token; } -void llama_ngram_cache_draft( +void common_ngram_cache_draft( std::vector & inp, std::vector & draft, int n_draft, int ngram_min, int ngram_max, - llama_ngram_cache & nc_context, llama_ngram_cache & nc_dynamic, llama_ngram_cache & nc_static + common_ngram_cache & nc_context, common_ngram_cache & nc_dynamic, common_ngram_cache & nc_static ) { GGML_ASSERT(draft.size() == 1); const int inp_size = inp.size(); @@ -157,21 +157,21 @@ void llama_ngram_cache_draft( llama_token drafted_token = -1; const int ngram_start_static = inp_size-LLAMA_NGRAM_STATIC + draft.size()-1; - llama_ngram ngram_static; + common_ngram ngram_static; for (int j = ngram_start_static; j < ngram_start_static + LLAMA_NGRAM_STATIC; ++j) { ngram_static.tokens[j-ngram_start_static] = get_token(inp, draft, j); } - llama_ngram_cache::iterator part_static_it = nc_static.find(ngram_static); - llama_ngram_cache_part part_static; + common_ngram_cache::iterator part_static_it = nc_static.find(ngram_static); + common_ngram_cache_part part_static; if (part_static_it != nc_static.end()) { part_static = part_static_it->second; } // cd = context + dynamic - std::vector ngrams_cd; + std::vector ngrams_cd; for (int ngram_size_cd = ngram_min; ngram_size_cd <= ngram_max; ++ngram_size_cd) { const int ngram_start_cd = inp_size-ngram_size_cd + draft.size()-1; - llama_ngram ngram_cd; + common_ngram ngram_cd; for (int j = ngram_start_cd; j < ngram_start_cd + ngram_size_cd; ++j) { ngram_cd.tokens[j-ngram_start_cd] = get_token(inp, draft, j); } @@ -196,16 +196,16 @@ void llama_ngram_cache_draft( } } -void llama_ngram_cache_save(llama_ngram_cache & ngram_cache, std::string & filename) { +void common_ngram_cache_save(common_ngram_cache & ngram_cache, std::string & filename) { std::ofstream file_out(filename, std::ios::binary); - for (std::pair item : ngram_cache) { - const llama_ngram ngram = item.first; - llama_ngram_cache_part token_counts = item.second; + for (std::pair item : ngram_cache) { + const common_ngram ngram = item.first; + common_ngram_cache_part token_counts = item.second; GGML_ASSERT(!token_counts.empty()); const int32_t ntokens = token_counts.size(); GGML_ASSERT(ntokens > 0); - file_out.write(reinterpret_cast(&ngram), sizeof(llama_ngram)); + file_out.write(reinterpret_cast(&ngram), sizeof(common_ngram)); file_out.write(reinterpret_cast(&ntokens), sizeof(int32_t)); for (std::pair item2 : token_counts) { const llama_token token = item2.first; @@ -219,14 +219,14 @@ void llama_ngram_cache_save(llama_ngram_cache & ngram_cache, std::string & filen } -llama_ngram_cache llama_ngram_cache_load(std::string & filename) { +common_ngram_cache common_ngram_cache_load(std::string & filename) { std::ifstream hashmap_file(filename, std::ios::binary); if (!hashmap_file) { throw std::ifstream::failure("Unable to open file " + filename); } - llama_ngram_cache ngram_cache; + common_ngram_cache ngram_cache; - llama_ngram ngram; + common_ngram ngram; int32_t ntokens; llama_token token; int32_t count; @@ -235,11 +235,11 @@ llama_ngram_cache llama_ngram_cache_load(std::string & filename) { char * ntokensc = reinterpret_cast(&ntokens); char * tokenc = reinterpret_cast(&token); char * countc = reinterpret_cast(&count); - while(hashmap_file.read(ngramc, sizeof(llama_ngram))) { + while(hashmap_file.read(ngramc, sizeof(common_ngram))) { GGML_ASSERT(!hashmap_file.eof()); GGML_ASSERT(hashmap_file.read(ntokensc, sizeof(int32_t))); GGML_ASSERT(ntokens > 0); - llama_ngram_cache_part token_counts; + common_ngram_cache_part token_counts; for (int i = 0; i < ntokens; ++i) { GGML_ASSERT(!hashmap_file.eof()); @@ -257,12 +257,12 @@ llama_ngram_cache llama_ngram_cache_load(std::string & filename) { return ngram_cache; } -void llama_ngram_cache_merge(llama_ngram_cache & ngram_cache_target, llama_ngram_cache & ngram_cache_add) { - for (std::pair ngram_part : ngram_cache_add) { - const llama_ngram ngram = ngram_part.first; - llama_ngram_cache_part part = ngram_part.second; +void common_ngram_cache_merge(common_ngram_cache & ngram_cache_target, common_ngram_cache & ngram_cache_add) { + for (std::pair ngram_part : ngram_cache_add) { + const common_ngram ngram = ngram_part.first; + common_ngram_cache_part part = ngram_part.second; - llama_ngram_cache::iterator part_merged_it = ngram_cache_target.find(ngram); + common_ngram_cache::iterator part_merged_it = ngram_cache_target.find(ngram); if (part_merged_it == ngram_cache_target.end()) { ngram_cache_target.emplace(ngram, part); continue; @@ -273,7 +273,7 @@ void llama_ngram_cache_merge(llama_ngram_cache & ngram_cache_target, llama_ngram const int32_t count = token_count.second; GGML_ASSERT(count > 0); - llama_ngram_cache_part::iterator token_count_merged_it = part_merged_it->second.find(token); + common_ngram_cache_part::iterator token_count_merged_it = part_merged_it->second.find(token); if (token_count_merged_it == part_merged_it->second.end()) { part_merged_it->second.emplace(token, count); continue; diff --git a/common/ngram-cache.h b/common/ngram-cache.h index ab4c9b376..09c2b0319 100644 --- a/common/ngram-cache.h +++ b/common/ngram-cache.h @@ -12,22 +12,22 @@ // Data structures to map n-grams to empirical token probabilities: -struct llama_ngram { +struct common_ngram { llama_token tokens[LLAMA_NGRAM_MAX]; - llama_ngram() { + common_ngram() { for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) { tokens[i] = -1; } } - llama_ngram(const llama_token * input, const int ngram_size) { + common_ngram(const llama_token * input, const int ngram_size) { for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) { tokens[i] = i < ngram_size ? input[i] : -1; } } - bool operator==(const llama_ngram & other) const { + bool operator==(const common_ngram & other) const { for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) { if (tokens[i] != other.tokens[i]) { return false; @@ -37,28 +37,28 @@ struct llama_ngram { } }; -struct llama_token_hash_function { +struct common_token_hash_function { size_t operator()(const llama_token token) const { // see https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/ return token * 11400714819323198485llu; } }; -struct llama_ngram_hash_function { - size_t operator()(const llama_ngram & ngram) const { - size_t hash = llama_token_hash_function{}(ngram.tokens[0]); +struct common_ngram_hash_function { + size_t operator()(const common_ngram & ngram) const { + size_t hash = common_token_hash_function{}(ngram.tokens[0]); for (int i = 1; i < LLAMA_NGRAM_MAX; ++i) { - hash ^= llama_token_hash_function{}(ngram.tokens[i]); + hash ^= common_token_hash_function{}(ngram.tokens[i]); } return hash; } }; // token -> number of times token has been seen -typedef std::unordered_map llama_ngram_cache_part; +typedef std::unordered_map common_ngram_cache_part; // n-gram -> empirical distribution of following tokens -typedef std::unordered_map llama_ngram_cache; +typedef std::unordered_map common_ngram_cache; // Update an ngram cache with tokens. @@ -70,8 +70,8 @@ typedef std::unordered_map & inp_data, int nnew, bool print_progress); +void common_ngram_cache_update( + common_ngram_cache & ngram_cache, int ngram_min, int ngram_max, std::vector & inp_data, int nnew, bool print_progress); // Try to draft tokens from ngram caches. // inp: the tokens generated so far. @@ -81,21 +81,21 @@ void llama_ngram_cache_update( // nc_context: ngram cache based on current context. // nc_dynamic: ngram cache based on previous user generations. // nc_static: ngram cache generated from a large text corpus, used for validation. -void llama_ngram_cache_draft( +void common_ngram_cache_draft( std::vector & inp, std::vector & draft, int n_draft, int ngram_min, int ngram_max, - llama_ngram_cache & nc_context, llama_ngram_cache & nc_dynamic, llama_ngram_cache & nc_static); + common_ngram_cache & nc_context, common_ngram_cache & nc_dynamic, common_ngram_cache & nc_static); // Save an ngram cache to a file. // ngram_cache: the ngram cache to save. // filename: the path under which to save the ngram cache. -void llama_ngram_cache_save(llama_ngram_cache & ngram_cache, std::string & filename); +void common_ngram_cache_save(common_ngram_cache & ngram_cache, std::string & filename); -// Load an ngram cache saved with llama_ngram_cache_save. +// Load an ngram cache saved with common_ngram_cache_save. // filename: the path from which to load the ngram cache. // returns: an ngram cache containing the information saved to filename. -llama_ngram_cache llama_ngram_cache_load(std::string & filename); +common_ngram_cache common_ngram_cache_load(std::string & filename); // Merge two ngram caches. // ngram_cache_target: the ngram cache to which to add the information from ngram_cache_add. // ngram_cache_add: the ngram cache to add to ngram_cache_target. -void llama_ngram_cache_merge(llama_ngram_cache & ngram_cache_target, llama_ngram_cache & ngram_cache_add); +void common_ngram_cache_merge(common_ngram_cache & ngram_cache_target, common_ngram_cache & ngram_cache_add); diff --git a/common/sampling.cpp b/common/sampling.cpp index 3dc7f1120..56cd0df6b 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -98,8 +98,8 @@ struct ring_buffer { std::vector data; }; -struct gpt_sampler { - gpt_sampler_params params; +struct common_sampler { + common_sampler_params params; struct llama_sampler * grmr; struct llama_sampler * chain; @@ -125,26 +125,26 @@ struct gpt_sampler { } }; -std::string gpt_sampler_params::print() const { +std::string common_sampler_params::print() const { char result[1024]; snprintf(result, sizeof(result), "\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n" - "\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, typical_p = %.3f, temp = %.3f\n" + "\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, xtc_probability = %.3f, xtc_threshold = %.3f, typical_p = %.3f, temp = %.3f\n" "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f", penalty_last_n, penalty_repeat, penalty_freq, penalty_present, - top_k, tfs_z, top_p, min_p, typ_p, temp, + top_k, tfs_z, top_p, min_p, xtc_probability, xtc_threshold, typ_p, temp, mirostat, mirostat_eta, mirostat_tau); return std::string(result); } -struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const struct gpt_sampler_params & params) { +struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_sampler_params & params) { llama_sampler_chain_params lparams = llama_sampler_chain_default_params(); lparams.no_perf = params.no_perf; - auto * result = new gpt_sampler { + auto * result = new common_sampler { /* .params = */ params, /* .grmr = */ llama_sampler_init_grammar(model, params.grammar.c_str(), "root"), /* .chain = */ llama_sampler_chain_init(lparams), @@ -175,24 +175,30 @@ struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const st if (params.mirostat == 0) { for (const auto & cnstr : params.samplers) { switch (cnstr) { - case GPT_SAMPLER_TYPE_TOP_K: + case COMMON_SAMPLER_TYPE_TOP_K: llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k)); break; - case GPT_SAMPLER_TYPE_TOP_P: + case COMMON_SAMPLER_TYPE_TOP_P: llama_sampler_chain_add(result->chain, llama_sampler_init_top_p (params.top_p, params.min_keep)); break; - case GPT_SAMPLER_TYPE_MIN_P: + case COMMON_SAMPLER_TYPE_MIN_P: llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep)); break; - case GPT_SAMPLER_TYPE_TFS_Z: + case COMMON_SAMPLER_TYPE_XTC: + llama_sampler_chain_add(result->chain, llama_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed)); + break; + case COMMON_SAMPLER_TYPE_TFS_Z: llama_sampler_chain_add(result->chain, llama_sampler_init_tail_free(params.tfs_z, params.min_keep)); break; - case GPT_SAMPLER_TYPE_TYPICAL_P: + case COMMON_SAMPLER_TYPE_TYPICAL_P: llama_sampler_chain_add(result->chain, llama_sampler_init_typical (params.typ_p, params.min_keep)); break; - case GPT_SAMPLER_TYPE_TEMPERATURE: + case COMMON_SAMPLER_TYPE_TEMPERATURE: llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent)); break; + case COMMON_SAMPLER_TYPE_INFILL: + llama_sampler_chain_add(result->chain, llama_sampler_init_infill (model)); + break; default: GGML_ASSERT(false && "unknown sampler type"); } @@ -224,7 +230,7 @@ struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const st return result; } -void gpt_sampler_free(struct gpt_sampler * gsmpl) { +void common_sampler_free(struct common_sampler * gsmpl) { if (gsmpl) { llama_sampler_free(gsmpl->grmr); @@ -234,7 +240,7 @@ void gpt_sampler_free(struct gpt_sampler * gsmpl) { } } -void gpt_sampler_accept(struct gpt_sampler * gsmpl, llama_token token, bool accept_grammar) { +void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) { if (accept_grammar) { llama_sampler_accept(gsmpl->grmr, token); } @@ -244,14 +250,14 @@ void gpt_sampler_accept(struct gpt_sampler * gsmpl, llama_token token, bool acce gsmpl->prev.push_back(token); } -void gpt_sampler_reset(struct gpt_sampler * gsmpl) { +void common_sampler_reset(struct common_sampler * gsmpl) { llama_sampler_reset(gsmpl->grmr); llama_sampler_reset(gsmpl->chain); } -struct gpt_sampler * gpt_sampler_clone(gpt_sampler * gsmpl) { - return new gpt_sampler { +struct common_sampler * common_sampler_clone(common_sampler * gsmpl) { + return new common_sampler { /* .params = */ gsmpl->params, /* .grmr = */ llama_sampler_clone(gsmpl->grmr), /* .chain = */ llama_sampler_clone(gsmpl->chain), @@ -261,7 +267,7 @@ struct gpt_sampler * gpt_sampler_clone(gpt_sampler * gsmpl) { }; } -void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler * gsmpl) { +void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl) { // TODO: measure grammar performance if (gsmpl) { @@ -272,7 +278,7 @@ void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler * } } -llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) { +llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) { gsmpl->set_logits(ctx, idx); auto & grmr = gsmpl->grmr; @@ -318,21 +324,21 @@ llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context return cur_p.data[cur_p.selected].id; } -uint32_t gpt_sampler_get_seed(const struct gpt_sampler * gsmpl) { +uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) { return llama_sampler_get_seed(gsmpl->chain); } // helpers -llama_token_data_array * gpt_sampler_get_candidates(struct gpt_sampler * gsmpl) { +llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl) { return &gsmpl->cur_p; } -llama_token gpt_sampler_last(const struct gpt_sampler * gsmpl) { +llama_token common_sampler_last(const struct common_sampler * gsmpl) { return gsmpl->prev.rat(0); } -std::string gpt_sampler_print(const struct gpt_sampler * gsmpl) { +std::string common_sampler_print(const struct common_sampler * gsmpl) { std::string result = "logits "; for (int i = 0; i < llama_sampler_chain_n(gsmpl->chain); i++) { @@ -343,7 +349,7 @@ std::string gpt_sampler_print(const struct gpt_sampler * gsmpl) { return result; } -std::string gpt_sampler_prev_str(gpt_sampler * gsmpl, llama_context * ctx_main, int n) { +std::string common_sampler_prev_str(common_sampler * gsmpl, llama_context * ctx_main, int n) { n = std::min(n, (int) gsmpl->prev.size()); if (n <= 0) { @@ -358,63 +364,69 @@ std::string gpt_sampler_prev_str(gpt_sampler * gsmpl, llama_context * ctx_main, GGML_ASSERT(id != LLAMA_TOKEN_NULL && "null token in the sampling history - should not happen"); - result += llama_token_to_piece(ctx_main, id); + result += common_token_to_piece(ctx_main, id); } return result; } -char gpt_sampler_type_to_chr(enum gpt_sampler_type cnstr) { +char common_sampler_type_to_chr(enum common_sampler_type cnstr) { switch (cnstr) { - case GPT_SAMPLER_TYPE_TOP_K: return 'k'; - case GPT_SAMPLER_TYPE_TFS_Z: return 'f'; - case GPT_SAMPLER_TYPE_TYPICAL_P: return 'y'; - case GPT_SAMPLER_TYPE_TOP_P: return 'p'; - case GPT_SAMPLER_TYPE_MIN_P: return 'm'; - case GPT_SAMPLER_TYPE_TEMPERATURE: return 't'; + case COMMON_SAMPLER_TYPE_TOP_K: return 'k'; + case COMMON_SAMPLER_TYPE_TFS_Z: return 'f'; + case COMMON_SAMPLER_TYPE_TYPICAL_P: return 'y'; + case COMMON_SAMPLER_TYPE_TOP_P: return 'p'; + case COMMON_SAMPLER_TYPE_MIN_P: return 'm'; + case COMMON_SAMPLER_TYPE_TEMPERATURE: return 't'; + case COMMON_SAMPLER_TYPE_XTC: return 'x'; + case COMMON_SAMPLER_TYPE_INFILL: return 'i'; default : return '?'; } } -std::string gpt_sampler_type_to_str(enum gpt_sampler_type cnstr) { +std::string common_sampler_type_to_str(enum common_sampler_type cnstr) { switch (cnstr) { - case GPT_SAMPLER_TYPE_TOP_K: return "top_k"; - case GPT_SAMPLER_TYPE_TFS_Z: return "tfs_z"; - case GPT_SAMPLER_TYPE_TYPICAL_P: return "typ_p"; - case GPT_SAMPLER_TYPE_TOP_P: return "top_p"; - case GPT_SAMPLER_TYPE_MIN_P: return "min_p"; - case GPT_SAMPLER_TYPE_TEMPERATURE: return "temperature"; + case COMMON_SAMPLER_TYPE_TOP_K: return "top_k"; + case COMMON_SAMPLER_TYPE_TFS_Z: return "tfs_z"; + case COMMON_SAMPLER_TYPE_TYPICAL_P: return "typ_p"; + case COMMON_SAMPLER_TYPE_TOP_P: return "top_p"; + case COMMON_SAMPLER_TYPE_MIN_P: return "min_p"; + case COMMON_SAMPLER_TYPE_TEMPERATURE: return "temperature"; + case COMMON_SAMPLER_TYPE_XTC: return "xtc"; + case COMMON_SAMPLER_TYPE_INFILL: return "infill"; default : return ""; } } -std::vector gpt_sampler_types_from_names(const std::vector & names, bool allow_alt_names) { - std::unordered_map sampler_canonical_name_map { - { "top_k", GPT_SAMPLER_TYPE_TOP_K }, - { "top_p", GPT_SAMPLER_TYPE_TOP_P }, - { "typ_p", GPT_SAMPLER_TYPE_TYPICAL_P }, - { "min_p", GPT_SAMPLER_TYPE_MIN_P }, - { "tfs_z", GPT_SAMPLER_TYPE_TFS_Z }, - { "temperature", GPT_SAMPLER_TYPE_TEMPERATURE }, +std::vector common_sampler_types_from_names(const std::vector & names, bool allow_alt_names) { + std::unordered_map sampler_canonical_name_map { + { "top_k", COMMON_SAMPLER_TYPE_TOP_K }, + { "top_p", COMMON_SAMPLER_TYPE_TOP_P }, + { "typ_p", COMMON_SAMPLER_TYPE_TYPICAL_P }, + { "min_p", COMMON_SAMPLER_TYPE_MIN_P }, + { "tfs_z", COMMON_SAMPLER_TYPE_TFS_Z }, + { "temperature", COMMON_SAMPLER_TYPE_TEMPERATURE }, + { "xtc", COMMON_SAMPLER_TYPE_XTC }, + { "infill", COMMON_SAMPLER_TYPE_INFILL }, }; // since samplers names are written multiple ways // make it ready for both system names and input names - std::unordered_map sampler_alt_name_map { - { "top-k", GPT_SAMPLER_TYPE_TOP_K }, - { "top-p", GPT_SAMPLER_TYPE_TOP_P }, - { "nucleus", GPT_SAMPLER_TYPE_TOP_P }, - { "typical-p", GPT_SAMPLER_TYPE_TYPICAL_P }, - { "typical", GPT_SAMPLER_TYPE_TYPICAL_P }, - { "typ-p", GPT_SAMPLER_TYPE_TYPICAL_P }, - { "typ", GPT_SAMPLER_TYPE_TYPICAL_P }, - { "min-p", GPT_SAMPLER_TYPE_MIN_P }, - { "tfs-z", GPT_SAMPLER_TYPE_TFS_Z }, - { "tfs", GPT_SAMPLER_TYPE_TFS_Z }, - { "temp", GPT_SAMPLER_TYPE_TEMPERATURE }, + std::unordered_map sampler_alt_name_map { + { "top-k", COMMON_SAMPLER_TYPE_TOP_K }, + { "top-p", COMMON_SAMPLER_TYPE_TOP_P }, + { "nucleus", COMMON_SAMPLER_TYPE_TOP_P }, + { "typical-p", COMMON_SAMPLER_TYPE_TYPICAL_P }, + { "typical", COMMON_SAMPLER_TYPE_TYPICAL_P }, + { "typ-p", COMMON_SAMPLER_TYPE_TYPICAL_P }, + { "typ", COMMON_SAMPLER_TYPE_TYPICAL_P }, + { "min-p", COMMON_SAMPLER_TYPE_MIN_P }, + { "tfs-z", COMMON_SAMPLER_TYPE_TFS_Z }, + { "tfs", COMMON_SAMPLER_TYPE_TFS_Z }, + { "temp", COMMON_SAMPLER_TYPE_TEMPERATURE }, }; - std::vector samplers; + std::vector samplers; samplers.reserve(names.size()); for (const auto & name : names) { @@ -434,17 +446,19 @@ std::vector gpt_sampler_types_from_names(const std::vector gpt_sampler_types_from_chars(const std::string & chars) { - std::unordered_map sampler_name_map = { - { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TOP_K), GPT_SAMPLER_TYPE_TOP_K }, - { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TFS_Z), GPT_SAMPLER_TYPE_TFS_Z }, - { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TYPICAL_P), GPT_SAMPLER_TYPE_TYPICAL_P }, - { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TOP_P), GPT_SAMPLER_TYPE_TOP_P }, - { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_MIN_P), GPT_SAMPLER_TYPE_MIN_P }, - { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TEMPERATURE), GPT_SAMPLER_TYPE_TEMPERATURE } +std::vector common_sampler_types_from_chars(const std::string & chars) { + std::unordered_map sampler_name_map = { + { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_K), COMMON_SAMPLER_TYPE_TOP_K }, + { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TFS_Z), COMMON_SAMPLER_TYPE_TFS_Z }, + { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TYPICAL_P), COMMON_SAMPLER_TYPE_TYPICAL_P }, + { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_P), COMMON_SAMPLER_TYPE_TOP_P }, + { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_MIN_P), COMMON_SAMPLER_TYPE_MIN_P }, + { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TEMPERATURE), COMMON_SAMPLER_TYPE_TEMPERATURE }, + { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_XTC), COMMON_SAMPLER_TYPE_XTC }, + { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_INFILL), COMMON_SAMPLER_TYPE_INFILL }, }; - std::vector samplers; + std::vector samplers; samplers.reserve(chars.size()); for (const auto & c : chars) { diff --git a/common/sampling.h b/common/sampling.h index d0e1a9203..d37f25ad3 100644 --- a/common/sampling.h +++ b/common/sampling.h @@ -7,7 +7,7 @@ #include #include -// gpt_sampler extends llama_sampler with additional functionality: +// common_sampler extends llama_sampler with additional functionality: // // - grammar support // - custom sampler logic based on the parameters @@ -23,30 +23,30 @@ // token in order to verify if it fits the grammar. And only if the token doesn't fit the grammar, the // grammar constraints are applied to the full vocabulary and the token is resampled. // -// The gpt_sampler also maintains a container with the last accepted tokens. In the future, this can +// The common_sampler also maintains a container with the last accepted tokens. In the future, this can // be moved into the core llama library. // -// For convenience, the gpt_sampler also maintains a container with the current candidate tokens. +// For convenience, the common_sampler also maintains a container with the current candidate tokens. // This can be used to access the probabilities of the rest of the non-sampled tokens. // // TODO: measure grammar performance // -struct gpt_sampler; +struct common_sampler; // llama_sampler API overloads -struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const struct gpt_sampler_params & params); +struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_sampler_params & params); -void gpt_sampler_free(struct gpt_sampler * gsmpl); +void common_sampler_free(struct common_sampler * gsmpl); // if accept_grammar is true, the token is accepted both by the sampling chain and the grammar -void gpt_sampler_accept(struct gpt_sampler * gsmpl, llama_token token, bool accept_grammar); -void gpt_sampler_reset (struct gpt_sampler * gsmpl); -struct gpt_sampler * gpt_sampler_clone (struct gpt_sampler * gsmpl); +void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar); +void common_sampler_reset (struct common_sampler * gsmpl); +struct common_sampler * common_sampler_clone (struct common_sampler * gsmpl); // arguments can be nullptr to skip printing -void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler * gsmpl); +void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl); // extended sampling implementation: // @@ -58,26 +58,26 @@ void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler * // if grammar_first is true, the grammar is applied before the samplers (slower) // useful in cases where all the resulting candidates (not just the sampled one) must fit the grammar // -llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false); +llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false); -uint32_t gpt_sampler_get_seed(const struct gpt_sampler * gsmpl); +uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl); // helpers // access the internal list of current candidate tokens -llama_token_data_array * gpt_sampler_get_candidates(struct gpt_sampler * gsmpl); +llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl); // get the last accepted token -llama_token gpt_sampler_last(const struct gpt_sampler * gsmpl); +llama_token common_sampler_last(const struct common_sampler * gsmpl); // print the sampler chain into a string -std::string gpt_sampler_print(const struct gpt_sampler * gsmpl); +std::string common_sampler_print(const struct common_sampler * gsmpl); // get a string representation of the last accepted tokens -std::string gpt_sampler_prev_str(gpt_sampler * gsmpl, llama_context * ctx, int n); +std::string common_sampler_prev_str(common_sampler * gsmpl, llama_context * ctx, int n); -char gpt_sampler_type_to_chr(enum gpt_sampler_type cnstr); -std::string gpt_sampler_type_to_str(enum gpt_sampler_type cnstr); +char common_sampler_type_to_chr(enum common_sampler_type cnstr); +std::string common_sampler_type_to_str(enum common_sampler_type cnstr); -std::vector gpt_sampler_types_from_names(const std::vector & names, bool allow_alt_names); -std::vector gpt_sampler_types_from_chars(const std::string & chars); +std::vector common_sampler_types_from_names(const std::vector & names, bool allow_alt_names); +std::vector common_sampler_types_from_chars(const std::string & chars); diff --git a/docs/build.md b/docs/build.md index faa0ecfa4..4e362ebc7 100644 --- a/docs/build.md +++ b/docs/build.md @@ -198,6 +198,8 @@ The following compilation options are also available to tweak performance: ### MUSA +This provides GPU acceleration using the MUSA cores of your Moore Threads MTT GPU. Make sure to have the MUSA SDK installed. You can download it from here: [MUSA SDK](https://developer.mthreads.com/sdk/download/musa). + - Using `make`: ```bash make GGML_MUSA=1 @@ -209,6 +211,12 @@ The following compilation options are also available to tweak performance: cmake --build build --config Release ``` +The environment variable [`MUSA_VISIBLE_DEVICES`](https://docs.mthreads.com/musa-sdk/musa-sdk-doc-online/programming_guide/Z%E9%99%84%E5%BD%95/) can be used to specify which GPU(s) will be used. + +The environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1` can be used to enable unified memory in Linux. This allows swapping to system RAM instead of crashing when the GPU VRAM is exhausted. + +Most of the compilation options available for CUDA should also be available for MUSA, though they haven't been thoroughly tested yet. + ### hipBLAS This provides BLAS acceleration on HIP-supported AMD GPUs. diff --git a/docs/docker.md b/docs/docker.md index e8a084173..8d90e6ded 100644 --- a/docs/docker.md +++ b/docs/docker.md @@ -19,8 +19,11 @@ Additionally, there the following images, similar to the above: - `ghcr.io/ggerganov/llama.cpp:full-rocm`: Same as `full` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`) - `ghcr.io/ggerganov/llama.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`) - `ghcr.io/ggerganov/llama.cpp:server-rocm`: Same as `server` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`) +- `ghcr.io/ggerganov/llama.cpp:full-musa`: Same as `full` but compiled with MUSA support. (platforms: `linux/amd64`) +- `ghcr.io/ggerganov/llama.cpp:light-musa`: Same as `light` but compiled with MUSA support. (platforms: `linux/amd64`) +- `ghcr.io/ggerganov/llama.cpp:server-musa`: Same as `server` but compiled with MUSA support. (platforms: `linux/amd64`) -The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](../.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](../.github/workflows/docker.yml). If you need different settings (for example, a different CUDA or ROCm library, you'll need to build the images locally for now). +The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](../.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](../.github/workflows/docker.yml). If you need different settings (for example, a different CUDA, ROCm or MUSA library, you'll need to build the images locally for now). ## Usage @@ -84,3 +87,37 @@ docker run --gpus all -v /path/to/models:/models local/llama.cpp:full-cuda --run docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1 docker run --gpus all -v /path/to/models:/models local/llama.cpp:server-cuda -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 --n-gpu-layers 1 ``` + +## Docker With MUSA + +Assuming one has the [mt-container-toolkit](https://developer.mthreads.com/musa/native) properly installed on Linux, `muBLAS` should be accessible inside the container. + +## Building Docker locally + +```bash +docker build -t local/llama.cpp:full-musa -f .devops/full-musa.Dockerfile . +docker build -t local/llama.cpp:light-musa -f .devops/llama-cli-musa.Dockerfile . +docker build -t local/llama.cpp:server-musa -f .devops/llama-server-musa.Dockerfile . +``` + +You may want to pass in some different `ARGS`, depending on the MUSA environment supported by your container host, as well as the GPU architecture. + +The defaults are: + +- `MUSA_VERSION` set to `rc3.1.0` + +The resulting images, are essentially the same as the non-MUSA images: + +1. `local/llama.cpp:full-musa`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. +2. `local/llama.cpp:light-musa`: This image only includes the main executable file. +3. `local/llama.cpp:server-musa`: This image only includes the server executable file. + +## Usage + +After building locally, Usage is similar to the non-MUSA examples, but you'll need to set `mthreads` as default Docker runtime. This can be done by executing `(cd /usr/bin/musa && sudo ./docker setup $PWD)` and verifying the changes by executing `docker info | grep mthreads` on the host machine. You will also want to use the `--n-gpu-layers` flag. + +```bash +docker run -v /path/to/models:/models local/llama.cpp:full-musa --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1 +docker run -v /path/to/models:/models local/llama.cpp:light-musa -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1 +docker run -v /path/to/models:/models local/llama.cpp:server-musa -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 --n-gpu-layers 1 +``` diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp index 4a15941f1..81c3220ad 100644 --- a/examples/batched-bench/batched-bench.cpp +++ b/examples/batched-bench/batched-bench.cpp @@ -15,13 +15,13 @@ static void print_usage(int, char ** argv) { } int main(int argc, char ** argv) { - gpt_params params; + common_params params; - if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_BENCH, print_usage)) { + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_BENCH, print_usage)) { return 1; } - gpt_init(); + common_init(); int is_pp_shared = params.is_pp_shared; @@ -36,7 +36,7 @@ int main(int argc, char ** argv) { // initialize the model - llama_model_params model_params = llama_model_params_from_gpt_params(params); + llama_model_params model_params = common_model_params_to_llama(params); llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params); @@ -45,7 +45,7 @@ int main(int argc, char ** argv) { return 1; } - llama_context_params ctx_params = llama_context_params_from_gpt_params(params); + llama_context_params ctx_params = common_context_params_to_llama(params); // ensure enough sequences are available ctx_params.n_seq_max = n_pl.empty() ? 1 : *std::max_element(n_pl.begin(), n_pl.end()); @@ -92,7 +92,7 @@ int main(int argc, char ** argv) { // warm up { for (int i = 0; i < 16; ++i) { - llama_batch_add(batch, 0, i, { 0 }, false); + common_batch_add(batch, 0, i, { 0 }, false); } if (!decode_helper(ctx, batch, ctx_params.n_batch)) { @@ -122,11 +122,11 @@ int main(int argc, char ** argv) { continue; } - llama_batch_clear(batch); + common_batch_clear(batch); for (int i = 0; i < pp; ++i) { for (int j = 0; j < (is_pp_shared ? 1 : pl); ++j) { - llama_batch_add(batch, 0, i, { j }, false); + common_batch_add(batch, 0, i, { j }, false); } } batch.logits[batch.n_tokens - 1] = true; @@ -151,10 +151,10 @@ int main(int argc, char ** argv) { const auto t_tg_start = ggml_time_us(); for (int i = 0; i < tg; ++i) { - llama_batch_clear(batch); + common_batch_clear(batch); for (int j = 0; j < pl; ++j) { - llama_batch_add(batch, 0, pp + i, { j }, true); + common_batch_add(batch, 0, pp + i, { j }, true); } if (!decode_helper(ctx, batch, ctx_params.n_batch)) { diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp index 7887a43d6..3b554033e 100644 --- a/examples/batched/batched.cpp +++ b/examples/batched/batched.cpp @@ -15,16 +15,16 @@ static void print_usage(int, char ** argv) { } int main(int argc, char ** argv) { - gpt_params params; + common_params params; params.prompt = "Hello my name is"; params.n_predict = 32; - if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) { + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) { return 1; } - gpt_init(); + common_init(); // number of parallel batches int n_parallel = params.n_parallel; @@ -39,7 +39,7 @@ int main(int argc, char ** argv) { // initialize the model - llama_model_params model_params = llama_model_params_from_gpt_params(params); + llama_model_params model_params = common_model_params_to_llama(params); llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params); @@ -51,13 +51,13 @@ int main(int argc, char ** argv) { // tokenize the prompt std::vector tokens_list; - tokens_list = ::llama_tokenize(model, params.prompt, true); + tokens_list = common_tokenize(model, params.prompt, true); const int n_kv_req = tokens_list.size() + (n_predict - tokens_list.size())*n_parallel; // initialize the context - llama_context_params ctx_params = llama_context_params_from_gpt_params(params); + llama_context_params ctx_params = common_context_params_to_llama(params); ctx_params.n_ctx = n_kv_req; ctx_params.n_batch = std::max(n_predict, n_parallel); @@ -94,7 +94,7 @@ int main(int argc, char ** argv) { LOG("\n"); for (auto id : tokens_list) { - LOG("%s", llama_token_to_piece(ctx, id).c_str()); + LOG("%s", common_token_to_piece(ctx, id).c_str()); } // create a llama_batch @@ -108,7 +108,7 @@ int main(int argc, char ** argv) { // evaluate the initial prompt for (size_t i = 0; i < tokens_list.size(); ++i) { - llama_batch_add(batch, tokens_list[i], i, seq_ids, false); + common_batch_add(batch, tokens_list[i], i, seq_ids, false); } GGML_ASSERT(batch.n_tokens == (int) tokens_list.size()); @@ -123,8 +123,8 @@ int main(int argc, char ** argv) { decoder_start_token_id = llama_token_bos(model); } - llama_batch_clear(batch); - llama_batch_add(batch, decoder_start_token_id, 0, seq_ids, false); + common_batch_clear(batch); + common_batch_add(batch, decoder_start_token_id, 0, seq_ids, false); } // llama_decode will output logits only for the last token of the prompt @@ -161,7 +161,7 @@ int main(int argc, char ** argv) { while (n_cur <= n_predict) { // prepare the next batch - llama_batch_clear(batch); + common_batch_clear(batch); // sample the next token for each parallel sequence / stream for (int32_t i = 0; i < n_parallel; ++i) { @@ -185,15 +185,15 @@ int main(int argc, char ** argv) { // if there is only one stream, we print immediately to stdout if (n_parallel == 1) { - LOG("%s", llama_token_to_piece(ctx, new_token_id).c_str()); + LOG("%s", common_token_to_piece(ctx, new_token_id).c_str()); } - streams[i] += llama_token_to_piece(ctx, new_token_id); + streams[i] += common_token_to_piece(ctx, new_token_id); i_batch[i] = batch.n_tokens; // push this new token for next evaluation - llama_batch_add(batch, new_token_id, n_cur, { i }, true); + common_batch_add(batch, new_token_id, n_cur, { i }, true); n_decode += 1; } diff --git a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp index c140daed3..988a584c9 100644 --- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp @@ -872,7 +872,7 @@ static std::string basename(const std::string &path) { } int main(int argc, char ** argv) { - gpt_init(); + common_init(); struct train_params params = get_default_train_params(); if (!params_parse(argc, argv, ¶ms)) { diff --git a/examples/cvector-generator/cvector-generator.cpp b/examples/cvector-generator/cvector-generator.cpp index 41bf4eb2a..69e141ecb 100644 --- a/examples/cvector-generator/cvector-generator.cpp +++ b/examples/cvector-generator/cvector-generator.cpp @@ -31,7 +31,7 @@ template static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) { std::string ret; for (; begin != end; ++begin) { - ret += llama_token_to_piece(ctx, *begin); + ret += common_token_to_piece(ctx, *begin); } return ret; @@ -272,8 +272,8 @@ struct tokenized_prompt { tokenized_prompt(llama_context * ctx, std::string pos, std::string neg) { const bool add_bos = llama_add_bos_token(llama_get_model(ctx)); - tokens_pos = ::llama_tokenize(ctx, pos, add_bos, true); - tokens_neg = ::llama_tokenize(ctx, neg, add_bos, true); + tokens_pos = common_tokenize(ctx, pos, add_bos, true); + tokens_neg = common_tokenize(ctx, neg, add_bos, true); max_seq_len = std::max(tokens_pos.size(), tokens_neg.size()); padding_seq(ctx, tokens_pos, max_seq_len); padding_seq(ctx, tokens_neg, max_seq_len); @@ -281,7 +281,7 @@ struct tokenized_prompt { void padding_seq(llama_context * ctx, std::vector & tokens, size_t len) { // TODO: customize padding token - std::vector pad_tokens = ::llama_tokenize(ctx, " ", false); + std::vector pad_tokens = common_tokenize(ctx, " ", false); llama_token pad_tok = pad_tokens.back(); while (tokens.size() < len) { tokens.push_back(pad_tok); @@ -370,7 +370,7 @@ static void export_gguf(const std::vector & v_ctrl, const * Load prompt files and completion file. * Then format each pair of prompt + completion to make an entry. */ -static int prepare_entries(gpt_params & params, train_context & ctx_train) { +static int prepare_entries(common_params & params, train_context & ctx_train) { // load prompts std::vector positive_prompts = ctrlvec_load_prompt_file(params.cvector_positive_file, true); std::vector negative_prompts = ctrlvec_load_prompt_file(params.cvector_negative_file, true); @@ -388,9 +388,9 @@ static int prepare_entries(gpt_params & params, train_context & ctx_train) { } int main(int argc, char ** argv) { - gpt_params params; + common_params params; - if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_CVECTOR_GENERATOR, print_usage)) { + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_CVECTOR_GENERATOR, print_usage)) { return 1; } @@ -413,7 +413,7 @@ int main(int argc, char ** argv) { llama_numa_init(params.numa); // load the model to get hparams - llama_init_result llama_init = llama_init_from_gpt_params(params); + common_init_result llama_init = common_init_from_params(params); llama_model * model = llama_init.model; llama_context * ctx = llama_init.context; diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index 734926822..3f18fc6a7 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -28,7 +28,7 @@ static std::vector split_lines(const std::string & s, const std::st static void batch_add_seq(llama_batch & batch, const std::vector & tokens, llama_seq_id seq_id) { size_t n_tokens = tokens.size(); for (size_t i = 0; i < n_tokens; i++) { - llama_batch_add(batch, tokens[i], i, { seq_id }, true); + common_batch_add(batch, tokens[i], i, { seq_id }, true); } } @@ -74,18 +74,18 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu } float * out = output + embd_pos * n_embd; - llama_embd_normalize(embd, out, n_embd, embd_norm); + common_embd_normalize(embd, out, n_embd, embd_norm); } } int main(int argc, char ** argv) { - gpt_params params; + common_params params; - if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_EMBEDDING)) { + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_EMBEDDING)) { return 1; } - gpt_init(); + common_init(); params.embedding = true; // For non-causal models, batch size must be equal to ubatch size @@ -95,7 +95,7 @@ int main(int argc, char ** argv) { llama_numa_init(params.numa); // load the model - llama_init_result llama_init = llama_init_from_gpt_params(params); + common_init_result llama_init = common_init_from_params(params); llama_model * model = llama_init.model; llama_context * ctx = llama_init.context; @@ -122,7 +122,7 @@ int main(int argc, char ** argv) { // print system information { LOG_INF("\n"); - LOG_INF("%s\n", gpt_params_get_system_info(params).c_str()); + LOG_INF("%s\n", common_params_get_system_info(params).c_str()); } // split the prompt into lines @@ -135,7 +135,7 @@ int main(int argc, char ** argv) { // tokenize the prompts and trim std::vector> inputs; for (const auto & prompt : prompts) { - auto inp = ::llama_tokenize(ctx, prompt, true, true); + auto inp = common_tokenize(ctx, prompt, true, true); if (inp.size() > n_batch) { LOG_ERR("%s: number of tokens in input line (%lld) exceeds batch size (%lld), increase batch size and re-run\n", __func__, (long long int) inp.size(), (long long int) n_batch); @@ -159,7 +159,7 @@ int main(int argc, char ** argv) { LOG_INF("%s: prompt %d: '%s'\n", __func__, i, prompts[i].c_str()); LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, inputs[i].size()); for (int j = 0; j < (int) inputs[i].size(); j++) { - LOG("%6d -> '%s'\n", inputs[i][j], llama_token_to_piece(ctx, inputs[i][j]).c_str()); + LOG("%6d -> '%s'\n", inputs[i][j], common_token_to_piece(ctx, inputs[i][j]).c_str()); } LOG("\n\n"); } @@ -199,7 +199,7 @@ int main(int argc, char ** argv) { batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize); e += pooling_type == LLAMA_POOLING_TYPE_NONE ? batch.n_tokens : s; s = 0; - llama_batch_clear(batch); + common_batch_clear(batch); } // add to batch @@ -263,7 +263,7 @@ int main(int argc, char ** argv) { LOG("\n"); for (int i = 0; i < n_prompts; i++) { for (int j = 0; j < n_prompts; j++) { - float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd); + float sim = common_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd); LOG("%6.2f ", sim); } LOG("%1.10s", prompts[i].c_str()); @@ -296,7 +296,7 @@ int main(int argc, char ** argv) { for (int i = 0;;) { // at least two iteration (n_embd_count > 1) LOG(" ["); for (int j = 0;;) { // at least two iteration (n_embd_count > 1) - float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd); + float sim = common_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd); LOG("%6.2f", sim); j++; if (j < n_embd_count) LOG(", "); else break; diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp index 6d629fe4e..fb52db4e1 100644 --- a/examples/eval-callback/eval-callback.cpp +++ b/examples/eval-callback/eval-callback.cpp @@ -126,10 +126,10 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) { return true; } -static bool run(llama_context * ctx, const gpt_params & params) { +static bool run(llama_context * ctx, const common_params & params) { const bool add_bos = llama_add_bos_token(llama_get_model(ctx)); - std::vector tokens = ::llama_tokenize(ctx, params.prompt, add_bos); + std::vector tokens = common_tokenize(ctx, params.prompt, add_bos); if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) { LOG_ERR("%s : failed to eval\n", __func__); @@ -142,13 +142,13 @@ static bool run(llama_context * ctx, const gpt_params & params) { int main(int argc, char ** argv) { callback_data cb_data; - gpt_params params; + common_params params; - if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) { + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) { return 1; } - gpt_init(); + common_init(); llama_backend_init(); llama_numa_init(params.numa); @@ -160,7 +160,7 @@ int main(int argc, char ** argv) { params.warmup = false; // init - llama_init_result llama_init = llama_init_from_gpt_params(params); + common_init_result llama_init = common_init_from_params(params); llama_model * model = llama_init.model; llama_context * ctx = llama_init.context; @@ -172,7 +172,7 @@ int main(int argc, char ** argv) { // print system information { LOG_INF("\n"); - LOG_INF("%s\n", gpt_params_get_system_info(params).c_str()); + LOG_INF("%s\n", common_params_get_system_info(params).c_str()); LOG_INF("\n"); } diff --git a/examples/export-lora/export-lora.cpp b/examples/export-lora/export-lora.cpp index 0051a5eb6..67662313d 100644 --- a/examples/export-lora/export-lora.cpp +++ b/examples/export-lora/export-lora.cpp @@ -128,7 +128,7 @@ struct lora_merge_ctx { lora_merge_ctx( std::string & base_fname, - std::vector & lora_files, + std::vector & lora_files, std::string & outfile, int n_threads) : base_model(base_fname, 0), n_threads(n_threads), fout(outfile, std::ios::binary) { fout.exceptions(std::ofstream::failbit); // fail fast on write errors @@ -314,9 +314,9 @@ struct lora_merge_ctx { // optionally dequantize it printf("%s : + dequantize base tensor from %s to F32\n", __func__, ggml_type_name(base->type)); auto nels = ggml_nelements(inp_base); - ggml_type_traits_t qtype = ggml_internal_get_type_traits(base->type); + const auto * qtype = ggml_get_type_traits(base->type); std::vector dequant_buf(nels * sizeof(float)); - qtype.to_float(read_buf.data(), (float *)dequant_buf.data(), nels); + qtype->to_float(read_buf.data(), (float *)dequant_buf.data(), nels); ggml_backend_tensor_set(inp_base, dequant_buf.data(), 0, dequant_buf.size()); } else { ggml_backend_tensor_set(inp_base, read_buf.data(), 0, ggml_nbytes(inp_base)); @@ -400,9 +400,9 @@ static void print_usage(int, char ** argv) { } int main(int argc, char ** argv) { - gpt_params params; + common_params params; - if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_EXPORT_LORA, print_usage)) { + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_EXPORT_LORA, print_usage)) { return 1; } diff --git a/examples/gen-docs/gen-docs.cpp b/examples/gen-docs/gen-docs.cpp index 4b19a9dc2..77c59a836 100644 --- a/examples/gen-docs/gen-docs.cpp +++ b/examples/gen-docs/gen-docs.cpp @@ -11,7 +11,7 @@ static void write_table_header(std::ofstream & file) { file << "| -------- | ----------- |\n"; } -static void write_table_entry(std::ofstream & file, const llama_arg & opt) { +static void write_table_entry(std::ofstream & file, const common_arg & opt) { file << "| `"; // args for (const auto & arg : opt.args) { @@ -40,7 +40,7 @@ static void write_table_entry(std::ofstream & file, const llama_arg & opt) { file << "` | " << md_help << " |\n"; } -static void write_table(std::ofstream & file, std::vector & opts) { +static void write_table(std::ofstream & file, std::vector & opts) { write_table_header(file); for (const auto & opt : opts) { write_table_entry(file, *opt); @@ -50,12 +50,12 @@ static void write_table(std::ofstream & file, std::vector & opts) { static void export_md(std::string fname, llama_example ex) { std::ofstream file(fname, std::ofstream::out | std::ofstream::trunc); - gpt_params params; - auto ctx_arg = gpt_params_parser_init(params, ex); + common_params params; + auto ctx_arg = common_params_parser_init(params, ex); - std::vector common_options; - std::vector sparam_options; - std::vector specific_options; + std::vector common_options; + std::vector sparam_options; + std::vector specific_options; for (auto & opt : ctx_arg.options) { // in case multiple LLAMA_EXAMPLE_* are set, we prioritize the LLAMA_EXAMPLE_* matching current example if (opt.is_sparam) { diff --git a/examples/gritlm/gritlm.cpp b/examples/gritlm/gritlm.cpp index 20b99a4fd..6e42fa073 100644 --- a/examples/gritlm/gritlm.cpp +++ b/examples/gritlm/gritlm.cpp @@ -15,11 +15,11 @@ static std::vector> encode(llama_context * ctx, const std::ve llama_batch batch = llama_batch_init(llama_n_batch(ctx), 0, 1); for (uint64_t i = 0; i < sentences.size(); i++) { - llama_batch_clear(batch); + common_batch_clear(batch); const std::string input_string = instruction + sentences[i]; - std::vector inputs = llama_tokenize(model, input_string, true, false); + std::vector inputs = common_tokenize(model, input_string, true, false); const int32_t n_toks = inputs.size(); @@ -28,7 +28,7 @@ static std::vector> encode(llama_context * ctx, const std::ve // inputs.push_back(llama_token_eos(model)); // we want to ignore instruction tokens for mean pooling - const int32_t n_inst = llama_tokenize(model, instruction, true, false).size(); + const int32_t n_inst = common_tokenize(model, instruction, true, false).size(); #ifdef GRIT_DEBUG // debug tokens - should be matching as referenced in the GritLM sample @@ -40,7 +40,7 @@ static std::vector> encode(llama_context * ctx, const std::ve // add input to batch (this increments n_tokens) for (int32_t j = 0; j < n_toks; j++) { - llama_batch_add(batch, inputs[j], j, { 0 }, j >= n_inst); + common_batch_add(batch, inputs[j], j, { 0 }, j >= n_inst); } // clear previous kv_cache values (irrelevant for embeddings) @@ -75,7 +75,7 @@ static std::vector> encode(llama_context * ctx, const std::ve } std::vector emb_norm(emb_unorm.size()); - llama_embd_normalize(emb_unorm.data(), emb_norm.data(), n_embd); + common_embd_normalize(emb_unorm.data(), emb_norm.data(), n_embd); result.push_back(emb_norm); #ifdef GRIT_DEBUG @@ -105,16 +105,16 @@ static std::string generate(llama_context * ctx, llama_sampler * smpl, const std llama_batch bat = llama_batch_init(llama_n_batch(ctx), 0, 1); - std::vector inputs = llama_tokenize(model, prompt, false, true); + std::vector inputs = common_tokenize(model, prompt, false, true); int32_t i_current_token = 0; while (true) { - llama_batch_clear(bat); + common_batch_clear(bat); { const int32_t n_inputs = inputs.size(); for (int32_t i = 0; i < n_inputs; i++) { - llama_batch_add(bat, inputs[i], i_current_token++, { 0 }, i == n_inputs - 1); + common_batch_add(bat, inputs[i], i_current_token++, { 0 }, i == n_inputs - 1); } } inputs.clear(); @@ -127,7 +127,7 @@ static std::string generate(llama_context * ctx, llama_sampler * smpl, const std break; } - std::string piece = llama_token_to_piece(ctx, token); + std::string piece = common_token_to_piece(ctx, token); if (stream) { std::printf("%s", piece.c_str()); std::fflush(stdout); @@ -152,16 +152,16 @@ static std::string gritlm_instruction(const std::string & instruction) { } int main(int argc, char * argv[]) { - gpt_params params; + common_params params; - if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) { + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) { return 1; } - gpt_init(); + common_init(); - llama_model_params mparams = llama_model_params_from_gpt_params(params); - llama_context_params cparams = llama_context_params_from_gpt_params(params); + llama_model_params mparams = common_model_params_to_llama(params); + llama_context_params cparams = common_context_params_to_llama(params); llama_backend_init(); @@ -199,10 +199,10 @@ int main(int argc, char * argv[]) { const int n_embd = llama_n_embd(model); - const float cosine_sim_q0_d0 = llama_embd_similarity_cos(q_rep[0].data(), d_rep[0].data(), n_embd); - const float cosine_sim_q0_d1 = llama_embd_similarity_cos(q_rep[0].data(), d_rep[1].data(), n_embd); - const float cosine_sim_q1_d0 = llama_embd_similarity_cos(q_rep[1].data(), d_rep[0].data(), n_embd); - const float cosine_sim_q1_d1 = llama_embd_similarity_cos(q_rep[1].data(), d_rep[1].data(), n_embd); + const float cosine_sim_q0_d0 = common_embd_similarity_cos(q_rep[0].data(), d_rep[0].data(), n_embd); + const float cosine_sim_q0_d1 = common_embd_similarity_cos(q_rep[0].data(), d_rep[1].data(), n_embd); + const float cosine_sim_q1_d0 = common_embd_similarity_cos(q_rep[1].data(), d_rep[0].data(), n_embd); + const float cosine_sim_q1_d1 = common_embd_similarity_cos(q_rep[1].data(), d_rep[1].data(), n_embd); std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[0].c_str(), documents[0].c_str(), cosine_sim_q0_d0); std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[0].c_str(), documents[1].c_str(), cosine_sim_q0_d1); diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp index c8e273529..d1ff3e8bc 100644 --- a/examples/imatrix/imatrix.cpp +++ b/examples/imatrix/imatrix.cpp @@ -37,13 +37,13 @@ struct Stats { class IMatrixCollector { public: IMatrixCollector() = default; - void set_params(gpt_params params) { m_params = std::move(params); } + void set_params(common_params params) { m_params = std::move(params); } bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data); void save_imatrix(int ncall = -1) const; bool load_imatrix(const char * file_name); private: std::unordered_map m_stats; - gpt_params m_params; + common_params m_params; std::mutex m_mutex; int m_last_call = 0; std::vector m_src1_data; @@ -428,7 +428,7 @@ static void process_logits( } } -static bool compute_imatrix(llama_context * ctx, const gpt_params & params) { +static bool compute_imatrix(llama_context * ctx, const common_params & params) { const bool add_bos = llama_add_bos_token(llama_get_model(ctx)); GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx))); const int n_ctx = llama_n_ctx(ctx); @@ -436,7 +436,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) { auto tim1 = std::chrono::high_resolution_clock::now(); LOG_INF("%s: tokenizing the input ..\n", __func__); - std::vector tokens = ::llama_tokenize(ctx, params.prompt, true); + std::vector tokens = common_tokenize(ctx, params.prompt, true); auto tim2 = std::chrono::high_resolution_clock::now(); LOG_INF("%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast(tim2-tim1).count()); @@ -568,17 +568,17 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) { } int main(int argc, char ** argv) { - gpt_params params; + common_params params; params.n_ctx = 512; params.logits_all = true; params.escape = false; - if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_IMATRIX, print_usage)) { + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_IMATRIX, print_usage)) { return 1; } - gpt_init(); + common_init(); params.n_batch = std::min(params.n_batch, params.n_ctx); @@ -607,7 +607,7 @@ int main(int argc, char ** argv) { params.warmup = false; // init - llama_init_result llama_init = llama_init_from_gpt_params(params); + common_init_result llama_init = common_init_from_params(params); llama_model * model = llama_init.model; llama_context * ctx = llama_init.context; @@ -625,7 +625,7 @@ int main(int argc, char ** argv) { // print system information { LOG_INF("\n"); - LOG_INF("%s\n", gpt_params_get_system_info(params).c_str()); + LOG_INF("%s\n", common_params_get_system_info(params).c_str()); } if (!compute_imatrix(ctx, params)) { diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp index d52425ae6..f82c614f5 100644 --- a/examples/infill/infill.cpp +++ b/examples/infill/infill.cpp @@ -35,8 +35,8 @@ static llama_context ** g_ctx; static llama_model ** g_model; -static gpt_sampler ** g_smpl; -static gpt_params * g_params; +static common_sampler ** g_smpl; +static common_params * g_params; static std::vector * g_input_tokens; static std::ostringstream * g_output_ss; static std::vector * g_output_tokens; @@ -44,7 +44,7 @@ static std::vector * g_output_tokens; static bool is_interacting = false; static void write_logfile( - const llama_context * ctx, const gpt_params & params, const llama_model * model, + const llama_context * ctx, const common_params & params, const llama_model * model, const std::vector & input_tokens, const std::string & output, const std::vector & output_tokens ) { @@ -95,12 +95,12 @@ static void sigint_handler(int signo) { } else { console::cleanup(); LOG("\n"); - gpt_perf_print(*g_ctx, *g_smpl); + common_perf_print(*g_ctx, *g_smpl); write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens); // make sure all logs are flushed LOG("Interrupted by user\n"); - gpt_log_pause(gpt_log_main()); + common_log_pause(common_log_main()); _exit(130); } @@ -109,14 +109,14 @@ static void sigint_handler(int signo) { #endif int main(int argc, char ** argv) { - gpt_params params; + common_params params; g_params = ¶ms; - if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_INFILL)) { + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_INFILL)) { return 1; } - gpt_init(); + common_init(); auto & sparams = params.sparams; @@ -166,7 +166,7 @@ int main(int argc, char ** argv) { llama_model * model = nullptr; llama_context * ctx = nullptr; - gpt_sampler * smpl = nullptr; + common_sampler * smpl = nullptr; g_model = &model; g_ctx = &ctx; @@ -174,7 +174,7 @@ int main(int argc, char ** argv) { // load the model and apply lora adapter, if any LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__); - llama_init_result llama_init = llama_init_from_gpt_params(params); + common_init_result llama_init = common_init_from_params(params); model = llama_init.model; ctx = llama_init.context; @@ -195,21 +195,21 @@ int main(int argc, char ** argv) { // print system information { LOG_INF("\n"); - LOG_INF("%s\n", gpt_params_get_system_info(params).c_str()); + LOG_INF("%s\n", common_params_get_system_info(params).c_str()); } const bool add_bos = llama_add_bos_token(model); GGML_ASSERT(!llama_add_eos_token(model)); std::vector embd_inp; std::vector embd_end; - std::vector inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false); - std::vector inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false); + std::vector inp_pfx = common_tokenize(ctx, params.input_prefix, false); + std::vector inp_sfx = common_tokenize(ctx, params.input_suffix, false); - GGML_ASSERT(llama_token_prefix(model) >= 0); - GGML_ASSERT(llama_token_suffix(model) >= 0); + GGML_ASSERT(llama_token_fim_pre(model) >= 0); + GGML_ASSERT(llama_token_fim_suf(model) >= 0); - inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model)); - inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model)); + inp_pfx.insert(inp_pfx.begin(), llama_token_fim_pre(model)); + inp_sfx.insert(inp_sfx.begin(), llama_token_fim_suf(model)); embd_inp = params.spm_infill ? inp_sfx : inp_pfx; embd_end = params.spm_infill ? inp_pfx : inp_sfx; @@ -218,7 +218,7 @@ int main(int argc, char ** argv) { } embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end()); - const llama_token middle_token = llama_token_middle(model); + const llama_token middle_token = llama_token_fim_mid(model); if (middle_token >= 0) { embd_inp.push_back(middle_token); } @@ -257,13 +257,13 @@ int main(int argc, char ** argv) { LOG_INF("%s: prompt: '%s'\n", __func__, params.prompt.c_str()); LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); for (int i = 0; i < (int) embd_inp.size(); i++) { - LOG_INF("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str()); + LOG_INF("%6d -> '%s'\n", embd_inp[i], common_token_to_piece(ctx, embd_inp[i]).c_str()); } if (params.n_keep > 0) { LOG_INF("%s: static prompt based on n_keep: '", __func__); for (int i = 0; i < params.n_keep; i++) { - LOG_CNT("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str()); + LOG_CNT("%s", common_token_to_piece(ctx, embd_inp[i]).c_str()); } LOG_CNT("'\n"); } @@ -298,11 +298,11 @@ int main(int argc, char ** argv) { LOG_INF("Input suffix: '%s'\n", params.input_suffix.c_str()); } } - smpl = gpt_sampler_init(model, sparams); + smpl = common_sampler_init(model, sparams); - LOG_INF("sampler seed: %u\n", gpt_sampler_get_seed(smpl)); + LOG_INF("sampler seed: %u\n", common_sampler_get_seed(smpl)); LOG_INF("sampler params: \n%s\n", sparams.print().c_str()); - LOG_INF("sampler chain: %s\n", gpt_sampler_print(smpl).c_str()); + LOG_INF("sampler chain: %s\n", common_sampler_print(smpl).c_str()); LOG_INF("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep); @@ -411,9 +411,9 @@ int main(int argc, char ** argv) { embd.clear(); if ((int) embd_inp.size() <= n_consumed && !is_interacting) { - const llama_token id = gpt_sampler_sample(smpl, ctx, -1); + const llama_token id = common_sampler_sample(smpl, ctx, -1); - gpt_sampler_accept(smpl, id, true); + common_sampler_accept(smpl, id, true); // LOG_DBG("last: %s\n", string_from(ctx, smpl->prev.to_vector()).c_str()); @@ -434,7 +434,7 @@ int main(int argc, char ** argv) { // push the prompt in the sampling context in order to apply repetition penalties later // for the prompt, we don't apply grammar rules - gpt_sampler_accept(smpl, embd_inp[n_consumed], false); + common_sampler_accept(smpl, embd_inp[n_consumed], false); ++n_consumed; if ((int) embd.size() >= params.n_batch) { @@ -446,7 +446,7 @@ int main(int argc, char ** argv) { // display text if (input_echo) { for (auto id : embd) { - const std::string token_str = llama_token_to_piece(ctx, id); + const std::string token_str = common_token_to_piece(ctx, id); LOG("%s", token_str.c_str()); if (embd.size() > 1) { @@ -465,10 +465,10 @@ int main(int argc, char ** argv) { // if not currently processing queued inputs; if ((int) embd_inp.size() <= n_consumed) { // deal with eot token in infill mode - if ((gpt_sampler_last(smpl) == llama_token_eot(model) || is_interacting) && params.interactive){ + if ((common_sampler_last(smpl) == llama_token_eot(model) || is_interacting) && params.interactive){ if (is_interacting && !params.interactive_first) { // print an eot token - LOG("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str()); + LOG("%s", common_token_to_piece(ctx, llama_token_eot(model)).c_str()); } LOG("\n"); console::set_display(console::user_input); @@ -505,11 +505,11 @@ int main(int argc, char ** argv) { } // tokenize new prefix and suffix - std::vector inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false); - std::vector inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false); + std::vector inp_pfx = common_tokenize(ctx, params.input_prefix, false); + std::vector inp_sfx = common_tokenize(ctx, params.input_suffix, false); - inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model)); - inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model)); + inp_pfx.insert(inp_pfx.begin(), llama_token_fim_pre(model)); + inp_sfx.insert(inp_sfx.begin(), llama_token_fim_suf(model)); embd_inp = params.spm_infill ? inp_sfx : inp_pfx; embd_end = params.spm_infill ? inp_pfx : inp_sfx; @@ -529,7 +529,7 @@ int main(int argc, char ** argv) { is_interacting = false; } // deal with end of generation tokens in interactive mode - else if (llama_token_is_eog(model, gpt_sampler_last(smpl))) { + else if (llama_token_is_eog(model, common_sampler_last(smpl))) { LOG_DBG("found EOS token\n"); if (params.interactive) { @@ -579,7 +579,7 @@ int main(int argc, char ** argv) { const size_t original_size = embd_inp.size(); - const auto line_inp = ::llama_tokenize(ctx, buffer, false); + const auto line_inp = common_tokenize(ctx, buffer, false); LOG_DBG("input tokens: %s\n", string_from(ctx, line_inp).c_str()); embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end()); @@ -587,7 +587,7 @@ int main(int argc, char ** argv) { for (size_t i = original_size; i < embd_inp.size(); ++i) { const llama_token token = embd_inp[i]; output_tokens.push_back(token); - output_ss << llama_token_to_piece(ctx, token); + output_ss << common_token_to_piece(ctx, token); } n_remain -= line_inp.size(); @@ -601,7 +601,7 @@ int main(int argc, char ** argv) { if (n_past > 0) { if (is_interacting) { - gpt_sampler_reset(smpl); + common_sampler_reset(smpl); } is_interacting = false; } @@ -620,17 +620,17 @@ int main(int argc, char ** argv) { } } if (!params.interactive && n_remain <= 0) { - LOG("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str()); + LOG("%s", common_token_to_piece(ctx, llama_token_eot(model)).c_str()); } LOG("\n"); - gpt_perf_print(ctx, smpl); + common_perf_print(ctx, smpl); write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens); llama_free(ctx); llama_free_model(model); - gpt_sampler_free(smpl); + common_sampler_free(smpl); llama_backend_free(); return 0; diff --git a/examples/json_schema_to_grammar.py b/examples/json_schema_to_grammar.py index a8779bf3b..fc9f0097f 100755 --- a/examples/json_schema_to_grammar.py +++ b/examples/json_schema_to_grammar.py @@ -540,7 +540,7 @@ class SchemaConverter: return self._add_rule( name, to_rule(transform()) if self._raw_pattern \ - else "\"\\\"\" " + to_rule(transform()) + " \"\\\"\" space") + else "\"\\\"\" (" + to_rule(transform()) + ") \"\\\"\" space") def _resolve_ref(self, ref): diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index fb1d387b2..c22bdedcf 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -304,9 +304,9 @@ static void print_usage(int /* argc */, char ** argv) { printf(" --cpu-strict <0|1> (default: %s)\n", join(cmd_params_defaults.cpu_strict, ",").c_str()); printf(" --poll <0...100> (default: %s)\n", join(cmd_params_defaults.poll, ",").c_str()); printf(" -ngl, --n-gpu-layers (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str()); -#ifdef GGML_USE_RPC - printf(" -rpc, --rpc (default: %s)\n", join(cmd_params_defaults.rpc_servers, ",").c_str()); -#endif + if (llama_supports_rpc()) { + printf(" -rpc, --rpc (default: %s)\n", join(cmd_params_defaults.rpc_servers, ",").c_str()); + } printf(" -sm, --split-mode (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str()); printf(" -mg, --main-gpu (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str()); printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str()); @@ -497,14 +497,12 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { } auto p = string_split(argv[i], split_delim); params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end()); -#ifdef GGML_USE_RPC - } else if (arg == "-rpc" || arg == "--rpc") { + } else if (llama_supports_rpc() && (arg == "-rpc" || arg == "--rpc")) { if (++i >= argc) { invalid_param = true; break; } params.rpc_servers.push_back(argv[i]); -#endif } else if (arg == "-sm" || arg == "--split-mode") { if (++i >= argc) { invalid_param = true; diff --git a/examples/llama.android/llama/build.gradle.kts b/examples/llama.android/llama/build.gradle.kts index 0a3806172..2d1dfba20 100644 --- a/examples/llama.android/llama/build.gradle.kts +++ b/examples/llama.android/llama/build.gradle.kts @@ -18,6 +18,7 @@ android { } externalNativeBuild { cmake { + arguments += "-DLLAMA_BUILD_COMMON=ON" arguments += "-DCMAKE_BUILD_TYPE=Release" cppFlags += listOf() arguments += listOf() diff --git a/examples/llama.android/llama/src/main/cpp/llama-android.cpp b/examples/llama.android/llama/src/main/cpp/llama-android.cpp index f611809c6..f5ffd063f 100644 --- a/examples/llama.android/llama/src/main/cpp/llama-android.cpp +++ b/examples/llama.android/llama/src/main/cpp/llama-android.cpp @@ -186,11 +186,11 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model( for (nri = 0; nri < nr; nri++) { LOGi("Benchmark prompt processing (pp)"); - llama_batch_clear(*batch); + common_batch_clear(*batch); const int n_tokens = pp; for (i = 0; i < n_tokens; i++) { - llama_batch_add(*batch, 0, i, { 0 }, false); + common_batch_add(*batch, 0, i, { 0 }, false); } batch->logits[batch->n_tokens - 1] = true; @@ -210,9 +210,9 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model( const auto t_tg_start = ggml_time_us(); for (i = 0; i < tg; i++) { - llama_batch_clear(*batch); + common_batch_clear(*batch); for (j = 0; j < pl; j++) { - llama_batch_add(*batch, 0, i, { j }, true); + common_batch_add(*batch, 0, i, { j }, true); } LOGi("llama_decode() text generation: %d", i); @@ -357,7 +357,7 @@ Java_android_llama_cpp_LLamaAndroid_completion_1init( const auto context = reinterpret_cast(context_pointer); const auto batch = reinterpret_cast(batch_pointer); - const auto tokens_list = llama_tokenize(context, text, 1); + const auto tokens_list = common_tokenize(context, text, 1); auto n_ctx = llama_n_ctx(context); auto n_kv_req = tokens_list.size() + (n_len - tokens_list.size()); @@ -369,14 +369,14 @@ Java_android_llama_cpp_LLamaAndroid_completion_1init( } for (auto id : tokens_list) { - LOGi("%s", llama_token_to_piece(context, id).c_str()); + LOGi("%s", common_token_to_piece(context, id).c_str()); } - llama_batch_clear(*batch); + common_batch_clear(*batch); // evaluate the initial prompt for (auto i = 0; i < tokens_list.size(); i++) { - llama_batch_add(*batch, tokens_list[i], i, { 0 }, false); + common_batch_add(*batch, tokens_list[i], i, { 0 }, false); } // llama_decode will output logits only for the last token of the prompt @@ -419,7 +419,7 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop( return nullptr; } - auto new_token_chars = llama_token_to_piece(context, new_token_id); + auto new_token_chars = common_token_to_piece(context, new_token_id); cached_token_chars += new_token_chars; jstring new_token = nullptr; @@ -431,8 +431,8 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop( new_token = env->NewStringUTF(""); } - llama_batch_clear(*batch); - llama_batch_add(*batch, new_token_id, n_cur, { 0 }, true); + common_batch_clear(*batch); + common_batch_add(*batch, new_token_id, n_cur, { 0 }, true); env->CallVoidMethod(intvar_ncur, la_int_var_inc); diff --git a/examples/llama.vim b/examples/llama.vim deleted file mode 100644 index 1b5ad6ba0..000000000 --- a/examples/llama.vim +++ /dev/null @@ -1,135 +0,0 @@ -" Requires an already running llama.cpp server -" To install either copy or symlink to ~/.vim/autoload/llama.vim -" Then start with either :call llama#doLlamaGen(), -" or add a keybind to your vimrc such as -" nnoremap Z :call llama#doLlamaGen() -" Similarly, you could add an insert mode keybind with -" inoremap call llama#doLlamaGen() -" -" g:llama_api_url, g:llama_api_key and g:llama_overrides can be configured in your .vimrc -" let g:llama_api_url = "192.168.1.10:8080" -" llama_overrides can also be set through buffer/window scopes. For instance -" autocmd filetype python let b:llama_overrides = {"temp": 0.2} -" Could be added to your .vimrc to automatically set a lower temperature when -" editing a python script -" Additionally, an override dict can be stored at the top of a file -" !*{"stop": ["User:"]} -" Could be added to the start of your chatlog.txt to set the stopping token -" These parameter dicts are merged together from lowest to highest priority: -" server default -> g:llama_overrides -> w:llama_overrides -> -" b:llama_overrides -> in file (!*) overrides -" -" Sublists (like logit_bias and stop) are overridden, not merged -" Example override: -" !*{"logit_bias": [[13, -5], [2, false]], "temperature": 1, "top_k": 5, "top_p": 0.5, "n_predict": 256, "repeat_last_n": 256, "repeat_penalty": 1.17647} -if !exists("g:llama_api_url") - let g:llama_api_url= "127.0.0.1:8080" -endif -if !exists("g:llama_overrides") - let g:llama_overrides = {} -endif -const s:querydata = {"n_predict": 256, "stop": [ "\n" ], "stream": v:true } -const s:curlcommand = ['curl','--data-raw', "{\"prompt\":\"### System:\"}", '--silent', '--no-buffer', '--request', 'POST', '--url', g:llama_api_url .. '/completion', '--header', "Content-Type: application/json"] -let s:linedict = {} - -func s:callbackHandler(bufn, channel, msg) - if len(a:msg) < 3 - return - elseif a:msg[0] == "d" - let l:msg = a:msg[6:-1] - else - let l:msg = a:msg - endif - let l:decoded_msg = json_decode(l:msg) - let l:newtext = split(l:decoded_msg['content'], "\n", 1) - if len(l:newtext) > 0 - call setbufline(a:bufn, s:linedict[a:bufn], getbufline(a:bufn, s:linedict[a:bufn])[0] .. newtext[0]) - else - echo "nothing genned" - endif - if len(newtext) > 1 - let l:failed = appendbufline(a:bufn, s:linedict[a:bufn], newtext[1:-1]) - let s:linedict[a:bufn] = s:linedict[a:bufn] + len(newtext)-1 - endif - if has_key(l:decoded_msg, "stop") && l:decoded_msg.stop - echo "Finished generation" - endif -endfunction - -func llama#doLlamaGen() - if exists("b:job") - if job_status(b:job) == "run" - call job_stop(b:job) - return - endif - endif - - let l:cbuffer = bufnr("%") - let s:linedict[l:cbuffer] = line('$') - let l:buflines = getbufline(l:cbuffer, 1, 1000) - let l:querydata = copy(s:querydata) - call extend(l:querydata, g:llama_overrides) - if exists("w:llama_overrides") - call extend(l:querydata, w:llama_overrides) - endif - if exists("b:llama_overrides") - call extend(l:querydata, b:llama_overrides) - endif - if l:buflines[0][0:1] == '!*' - let l:userdata = json_decode(l:buflines[0][2:-1]) - call extend(l:querydata, l:userdata) - let l:buflines = l:buflines[1:-1] - endif - let l:querydata.prompt = join(l:buflines, "\n") - let l:curlcommand = copy(s:curlcommand) - if exists("g:llama_api_key") - call extend(l:curlcommand, ['--header', 'Authorization: Bearer ' .. g:llama_api_key]) - endif - let l:curlcommand[2] = json_encode(l:querydata) - let b:job = job_start(l:curlcommand, {"callback": function("s:callbackHandler", [l:cbuffer])}) -endfunction - -" Echos the tokkenization of the provided string , or cursor to end of word -" Onus is placed on the user to include the preceding space -func llama#tokenizeWord(...) - if (a:0 > 0) - let l:input = a:1 - else - exe "normal \"*ye" - let l:input = @* - endif - let l:querydata = {"content": l:input} - let l:curlcommand = copy(s:curlcommand) - let l:curlcommand[2] = json_encode(l:querydata) - let l:curlcommand[8] = g:llama_api_url .. "/tokenize" - let s:token_job = job_start(l:curlcommand, {"callback": function("s:tokenizeWordCallback", [l:input])}) -endfunction - -func s:tokenizeWordCallback(plaintext, channel, msg) - echo '"' .. a:plaintext ..'" - ' .. string(json_decode(a:msg).tokens) -endfunction - - -" Echos the token count of the entire buffer (or provided string) -" Example usage :echo llama#tokenCount() -func llama#tokenCount(...) - if (a:0 > 0) - let l:buflines = a:1 - else - let l:buflines = getline(1,1000) - if l:buflines[0][0:1] == '!*' - let l:buflines = l:buflines[1:-1] - endif - let l:buflines = join(l:buflines, "\n") - endif - let l:querydata = {"content": l:buflines} - let l:curlcommand = copy(s:curlcommand) - let l:curlcommand[2] = json_encode(l:querydata) - let l:curlcommand[8] = g:llama_api_url .. "/tokenize" - let s:token_job = job_start(l:curlcommand, {"callback": "s:tokenCountCallback"}) -endfunction - -func s:tokenCountCallback(channel, msg) - let resp = json_decode(a:msg) - echo len(resp.tokens) -endfunction diff --git a/examples/llava/llava-cli.cpp b/examples/llava/llava-cli.cpp index 8f437863f..5f9abe2b6 100644 --- a/examples/llava/llava-cli.cpp +++ b/examples/llava/llava-cli.cpp @@ -37,21 +37,21 @@ static bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) { static bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){ std::string str2 = str; - std::vector embd_inp = ::llama_tokenize(ctx_llama, str2, add_bos, true); + std::vector embd_inp = common_tokenize(ctx_llama, str2, add_bos, true); eval_tokens(ctx_llama, embd_inp, n_batch, n_past); return true; } -static const char * sample(struct gpt_sampler * smpl, +static const char * sample(struct common_sampler * smpl, struct llama_context * ctx_llama, int * n_past) { - const llama_token id = gpt_sampler_sample(smpl, ctx_llama, -1); - gpt_sampler_accept(smpl, id, true); + const llama_token id = common_sampler_sample(smpl, ctx_llama, -1); + common_sampler_accept(smpl, id, true); static std::string ret; if (llama_token_is_eog(llama_get_model(ctx_llama), id)) { ret = ""; } else { - ret = llama_token_to_piece(ctx_llama, id); + ret = common_token_to_piece(ctx_llama, id); } eval_id(ctx_llama, id, n_past); return ret.c_str(); @@ -120,7 +120,7 @@ static void print_usage(int, char ** argv) { LOG("\n note: a lower temperature value like 0.1 is recommended for better quality.\n"); } -static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_params * params, const std::string & fname) { +static struct llava_image_embed * load_image(llava_context * ctx_llava, common_params * params, const std::string & fname) { // load and preprocess the image llava_image_embed * embed = NULL; @@ -146,7 +146,7 @@ static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_para return embed; } -static void process_prompt(struct llava_context * ctx_llava, struct llava_image_embed * image_embed, gpt_params * params, const std::string & prompt) { +static void process_prompt(struct llava_context * ctx_llava, struct llava_image_embed * image_embed, common_params * params, const std::string & prompt) { int n_past = 0; const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict; @@ -159,16 +159,16 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_ user_prompt = prompt.substr(image_pos + std::string("").length()); LOG_INF("system_prompt: %s\n", system_prompt.c_str()); if (params->verbose_prompt) { - auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, system_prompt, true, true); + auto tmp = common_tokenize(ctx_llava->ctx_llama, system_prompt, true, true); for (int i = 0; i < (int) tmp.size(); i++) { - LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str()); + LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str()); } } LOG_INF("user_prompt: %s\n", user_prompt.c_str()); if (params->verbose_prompt) { - auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true); + auto tmp = common_tokenize(ctx_llava->ctx_llama, user_prompt, true, true); for (int i = 0; i < (int) tmp.size(); i++) { - LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str()); + LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str()); } } } else { @@ -176,9 +176,9 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_ system_prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:"; user_prompt = prompt + "\nASSISTANT:"; if (params->verbose_prompt) { - auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true); + auto tmp = common_tokenize(ctx_llava->ctx_llama, user_prompt, true, true); for (int i = 0; i < (int) tmp.size(); i++) { - LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str()); + LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str()); } } } @@ -191,7 +191,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_ LOG("\n"); - struct gpt_sampler * smpl = gpt_sampler_init(ctx_llava->model, params->sparams); + struct common_sampler * smpl = common_sampler_init(ctx_llava->model, params->sparams); if (!smpl) { LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__); exit(1); @@ -211,15 +211,15 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_ fflush(stdout); } - gpt_sampler_free(smpl); + common_sampler_free(smpl); LOG("\n"); } -static struct llama_model * llava_init(gpt_params * params) { +static struct llama_model * llava_init(common_params * params) { llama_backend_init(); llama_numa_init(params->numa); - llama_model_params model_params = llama_model_params_from_gpt_params(*params); + llama_model_params model_params = common_model_params_to_llama(*params); llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params); if (model == NULL) { @@ -229,7 +229,7 @@ static struct llama_model * llava_init(gpt_params * params) { return model; } -static struct llava_context * llava_init_context(gpt_params * params, llama_model * model) { +static struct llava_context * llava_init_context(common_params * params, llama_model * model) { const char * clip_path = params->mmproj.c_str(); auto prompt = params->prompt; @@ -240,7 +240,7 @@ static struct llava_context * llava_init_context(gpt_params * params, llama_mode auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1); - llama_context_params ctx_params = llama_context_params_from_gpt_params(*params); + llama_context_params ctx_params = common_context_params_to_llama(*params); ctx_params.n_ctx = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params); @@ -272,13 +272,13 @@ static void llava_free(struct llava_context * ctx_llava) { int main(int argc, char ** argv) { ggml_time_init(); - gpt_params params; + common_params params; - if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, print_usage)) { + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, print_usage)) { return 1; } - gpt_init(); + common_init(); if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) { print_usage(argc, argv); diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp index 8558c6bdc..2c96973c8 100644 --- a/examples/llava/llava.cpp +++ b/examples/llava/llava.cpp @@ -432,7 +432,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c bool image_embed_result = llava_image_embed_make_with_clip_img(ctx_clip, n_threads, img, &image_embed, &n_image_pos); if (!image_embed_result) { clip_image_u8_free(img); - LOG_ERR("%s: coulnd't embed the image\n", __func__); + LOG_ERR("%s: couldn't embed the image\n", __func__); return NULL; } diff --git a/examples/llava/minicpmv-cli.cpp b/examples/llava/minicpmv-cli.cpp index c5156c35b..6b666de1b 100644 --- a/examples/llava/minicpmv-cli.cpp +++ b/examples/llava/minicpmv-cli.cpp @@ -25,11 +25,11 @@ static void show_additional_info(int /*argc*/, char ** argv) { LOG("\nnote: a lower temperature value like 0.1 is recommended for better quality.\n"); } -static struct llama_model * llava_init(gpt_params * params) { +static struct llama_model * llava_init(common_params * params) { llama_backend_init(); llama_numa_init(params->numa); - llama_model_params model_params = llama_model_params_from_gpt_params(*params); + llama_model_params model_params = common_model_params_to_llama(*params); llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params); if (model == NULL) { @@ -39,13 +39,13 @@ static struct llama_model * llava_init(gpt_params * params) { return model; } -static struct llava_context * llava_init_context(gpt_params * params, llama_model * model) { +static struct llava_context * llava_init_context(common_params * params, llama_model * model) { auto prompt = params->prompt; if (prompt.empty()) { prompt = "describe the image in detail."; } - llama_context_params ctx_params = llama_context_params_from_gpt_params(*params); + llama_context_params ctx_params = common_context_params_to_llama(*params); if (params->n_ctx < 2048) { // warn user here, "Image processing requires at least 2048 context, setting context to 2048" LOG_WRN("%s: Image processing requires at least 2048 context, setting context to 2048\n" , __func__); @@ -79,7 +79,7 @@ static void llava_free(struct llava_context * ctx_llava) { llama_backend_free(); } -static struct clip_ctx * clip_init_context(gpt_params * params) { +static struct clip_ctx * clip_init_context(common_params * params) { const char * clip_path = params->mmproj.c_str(); auto prompt = params->prompt; @@ -114,7 +114,7 @@ static bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) { static bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){ std::string str2 = str; - std::vector embd_inp = ::llama_tokenize(ctx_llama, str2, add_bos, true); + std::vector embd_inp = common_tokenize(ctx_llama, str2, add_bos, true); return eval_tokens(ctx_llama, embd_inp, n_batch, n_past); } @@ -129,7 +129,7 @@ static void process_eval_image_embed(struct llava_context * ctx_llava, const str llava_image_embed_free(slice_embed); } -static void process_image(struct llava_context * ctx_llava, struct llava_image_embed * embeds, gpt_params * params, int &n_past) { +static void process_image(struct llava_context * ctx_llava, struct llava_image_embed * embeds, common_params * params, int &n_past) { std::string system_prompt; int idx = 0; int num_image_embeds = embeds->n_image_pos / clip_n_patches(ctx_llava->ctx_clip); @@ -162,22 +162,22 @@ static void process_image(struct llava_context * ctx_llava, struct llava_image_e LOG_INF("%s: image token past: %d\n", __func__, n_past); } -static const char * sample(struct gpt_sampler * smpl, +static const char * sample(struct common_sampler * smpl, struct llama_context * ctx_llama, int * n_past) { - const llama_token id = gpt_sampler_sample(smpl, ctx_llama, -1); - gpt_sampler_accept(smpl, id, true); + const llama_token id = common_sampler_sample(smpl, ctx_llama, -1); + common_sampler_accept(smpl, id, true); static std::string ret; if (llama_token_is_eog(llama_get_model(ctx_llama), id)) { ret = ""; } else { - ret = llama_token_to_piece(ctx_llama, id); + ret = common_token_to_piece(ctx_llama, id); } eval_id(ctx_llama, id, n_past); return ret.c_str(); } -static struct llava_context * minicpmv_init(gpt_params * params, const std::string & fname, int &n_past){ +static struct llava_context * minicpmv_init(common_params * params, const std::string & fname, int &n_past){ auto * ctx_clip = clip_init_context(params); auto * embeds = llava_image_embed_make_with_filename(ctx_clip, params->cpuparams.n_threads, fname.c_str()); if (!embeds) { @@ -213,7 +213,7 @@ static struct llava_context * minicpmv_init(gpt_params * params, const std::stri return ctx_llava; } -static struct gpt_sampler * llama_init(struct llava_context * ctx_llava, gpt_params * params, const std::string & prompt, int & n_past, bool is_first = false){ +static struct common_sampler * llama_init(struct llava_context * ctx_llava, common_params * params, const std::string & prompt, int & n_past, bool is_first = false){ std::string user_prompt = prompt; int has_minicpmv_projector = clip_is_minicpmv(ctx_llava->ctx_clip); if (!is_first) { @@ -237,11 +237,11 @@ static struct gpt_sampler * llama_init(struct llava_context * ctx_llava, gpt_par LOG_INF("\n"); - struct gpt_sampler * smpl = gpt_sampler_init(ctx_llava->model, params->sparams); + struct common_sampler * smpl = common_sampler_init(ctx_llava->model, params->sparams); return smpl; } -static const char * llama_loop(struct llava_context * ctx_llava,struct gpt_sampler * smpl, int &n_past){ +static const char * llama_loop(struct llava_context * ctx_llava,struct common_sampler * smpl, int &n_past){ const char * tmp = sample(smpl, ctx_llava->ctx_llama, &n_past); return tmp; @@ -250,13 +250,13 @@ static const char * llama_loop(struct llava_context * ctx_llava,struct gpt_sampl int main(int argc, char ** argv) { ggml_time_init(); - gpt_params params; + common_params params; - if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, show_additional_info)) { + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, show_additional_info)) { return 1; } - gpt_init(); + common_init(); if (params.mmproj.empty() || (params.image.empty())) { show_additional_info(argc, argv); @@ -290,7 +290,7 @@ int main(int argc, char ** argv) { fflush(stdout); } - gpt_sampler_free(smpl); + common_sampler_free(smpl); }else { while (true) { LOG(""); @@ -309,7 +309,7 @@ int main(int argc, char ** argv) { if (strstr(response.c_str(), "")) break; // minicpm-v fflush(stdout); } - gpt_sampler_free(smpl); + common_sampler_free(smpl); } } printf("\n"); diff --git a/examples/lookahead/lookahead.cpp b/examples/lookahead/lookahead.cpp index 49870b4a4..f9e4aba81 100644 --- a/examples/lookahead/lookahead.cpp +++ b/examples/lookahead/lookahead.cpp @@ -37,13 +37,13 @@ struct ngram_container { }; int main(int argc, char ** argv) { - gpt_params params; + common_params params; - if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) { + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) { return 1; } - gpt_init(); + common_init(); const int W = 15; // lookahead window const int N = 5; // n-gram size @@ -56,7 +56,7 @@ int main(int argc, char ** argv) { llama_numa_init(params.numa); // load the target model - llama_init_result llama_init = llama_init_from_gpt_params(params); + common_init_result llama_init = common_init_from_params(params); llama_model * model = llama_init.model; llama_context * ctx = llama_init.context; @@ -65,7 +65,7 @@ int main(int argc, char ** argv) { std::vector inp; std::vector all; - inp = ::llama_tokenize(ctx, params.prompt, true, true); + inp = common_tokenize(ctx, params.prompt, true, true); all = inp; const int max_context_size = llama_n_ctx(ctx); @@ -79,7 +79,7 @@ int main(int argc, char ** argv) { LOG("\n\n"); for (auto id : inp) { - LOG("%s", llama_token_to_piece(ctx, id).c_str()); + LOG("%s", common_token_to_piece(ctx, id).c_str()); } fflush(stderr); @@ -115,7 +115,7 @@ int main(int argc, char ** argv) { llama_batch batch = llama_batch_init(params.n_ctx, 0, W + G + 1); // target model sampling context - struct gpt_sampler * smpl = gpt_sampler_init(model, params.sparams); + struct common_sampler * smpl = common_sampler_init(model, params.sparams); // verification n-grams std::vector ngrams_cur(G); @@ -156,12 +156,12 @@ int main(int argc, char ** argv) { // sample first token { - id = gpt_sampler_sample(smpl, ctx, 0); + id = common_sampler_sample(smpl, ctx, 0); - gpt_sampler_accept(smpl, id, true); + common_sampler_accept(smpl, id, true); { - const std::string token_str = llama_token_to_piece(ctx, id); + const std::string token_str = common_token_to_piece(ctx, id); LOG("%s", token_str.c_str()); fflush(stdout); @@ -172,7 +172,7 @@ int main(int argc, char ** argv) { // debug if (dump_kv_cache) { llama_kv_cache_view_update(ctx, &kvc_view); - llama_kv_cache_dump_view_seqs(kvc_view, 40); + common_kv_cache_dump_view_seqs(kvc_view, 40); } // build the mask from https://lmsys.org/blog/2023-11-21-lookahead-decoding/ @@ -201,10 +201,10 @@ int main(int argc, char ** argv) { // V V V V V V // id { - llama_batch_clear(batch); + common_batch_clear(batch); // current token - first token of the first level - llama_batch_add(batch, id, n_past, seq_id_all, true); + common_batch_add(batch, id, n_past, seq_id_all, true); // verification n-grams - queue this before the lookahead tokens for less KV cache fragmentation { @@ -229,7 +229,7 @@ int main(int argc, char ** argv) { ngrams_cur[g].tokens [j + 1] = t; ngrams_cur[g].i_batch[j + 1] = batch.n_tokens; - llama_batch_add(batch, t, n_past + j + 1, { W + 1 + g }, true); + common_batch_add(batch, t, n_past + j + 1, { W + 1 + g }, true); } } } @@ -241,13 +241,13 @@ int main(int argc, char ** argv) { seq_id_look[j] = i + j + 1; } - llama_batch_add(batch, tokens_j[0][i], n_past + i, seq_id_look, false); + common_batch_add(batch, tokens_j[0][i], n_past + i, seq_id_look, false); } // fill the rest of the levels for (int j = 1; j < N - 1; j++) { for (int i = 0; i < W; i++) { - llama_batch_add(batch, tokens_j[j][i], n_past + j + i, { i + 1 }, j == N - 2); + common_batch_add(batch, tokens_j[j][i], n_past + j + i, { i + 1 }, j == N - 2); } } } @@ -281,13 +281,13 @@ int main(int argc, char ** argv) { } // sample the next token - id = gpt_sampler_sample(smpl, ctx, i_batch); + id = common_sampler_sample(smpl, ctx, i_batch); - gpt_sampler_accept(smpl, id, true); + common_sampler_accept(smpl, id, true); // print { - const std::string token_str = llama_token_to_piece(ctx, id); + const std::string token_str = common_token_to_piece(ctx, id); if (v == 0) { LOG("%s", token_str.c_str()); @@ -327,7 +327,7 @@ int main(int argc, char ** argv) { // print known n-grams starting with token id (debug) if (0 && v == 0) { if (ngrams_observed.cnt[id] > 0) { - LOG("\n - %d n-grams starting with '%s'\n", ngrams_observed.cnt[id], llama_token_to_piece(ctx, id).c_str()); + LOG("\n - %d n-grams starting with '%s'\n", ngrams_observed.cnt[id], common_token_to_piece(ctx, id).c_str()); } for (int i = 0; i < ngrams_observed.cnt[id]; i++) { @@ -336,7 +336,7 @@ int main(int argc, char ** argv) { const int idx = id*(N - 1)*G + i*(N - 1); for (int j = 0; j < N - 1; j++) { - const std::string token_str = llama_token_to_piece(ctx, ngrams_observed.tokens[idx + j]); + const std::string token_str = common_token_to_piece(ctx, ngrams_observed.tokens[idx + j]); LOG("%s", token_str.c_str()); } @@ -358,7 +358,7 @@ int main(int argc, char ** argv) { if (v == 0) { // sample from the last level for (int i = 0; i < W; i++) { - tokens_j[N - 2][i] = gpt_sampler_sample(smpl, ctx, ngrams_cur.size()*(N-1) + W*(N - 2) + i); + tokens_j[N - 2][i] = common_sampler_sample(smpl, ctx, ngrams_cur.size()*(N-1) + W*(N - 2) + i); } } else { for (int i = 0; i < W; i++) { @@ -466,9 +466,9 @@ int main(int argc, char ** argv) { LOG_INF("n_accept = %d\n", n_accept); LOG_INF("\n"); - gpt_perf_print(ctx, smpl); + common_perf_print(ctx, smpl); - gpt_sampler_free(smpl); + common_sampler_free(smpl); llama_kv_cache_view_free(&kvc_view); diff --git a/examples/lookup/lookup-create.cpp b/examples/lookup/lookup-create.cpp index 33287c02c..7ced0aa97 100644 --- a/examples/lookup/lookup-create.cpp +++ b/examples/lookup/lookup-create.cpp @@ -12,9 +12,9 @@ #include int main(int argc, char ** argv){ - gpt_params params; + common_params params; - if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) { + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) { return 1; } @@ -23,7 +23,7 @@ int main(int argc, char ** argv){ llama_numa_init(params.numa); // load the model - llama_init_result llama_init = llama_init_from_gpt_params(params); + common_init_result llama_init = common_init_from_params(params); llama_model * model = llama_init.model; llama_context * ctx = llama_init.context; @@ -31,15 +31,15 @@ int main(int argc, char ** argv){ // tokenize the prompt std::vector inp; - inp = ::llama_tokenize(ctx, params.prompt, true, true); + inp = common_tokenize(ctx, params.prompt, true, true); fprintf(stderr, "%s: tokenization done\n", __func__); - llama_ngram_cache ngram_cache; - llama_ngram_cache_update(ngram_cache, LLAMA_NGRAM_STATIC, LLAMA_NGRAM_STATIC, inp, inp.size(), true); + common_ngram_cache ngram_cache; + common_ngram_cache_update(ngram_cache, LLAMA_NGRAM_STATIC, LLAMA_NGRAM_STATIC, inp, inp.size(), true); fprintf(stderr, "%s: hashing done, writing file to %s\n", __func__, params.lookup_cache_static.c_str()); - llama_ngram_cache_save(ngram_cache, params.lookup_cache_static); + common_ngram_cache_save(ngram_cache, params.lookup_cache_static); return 0; } diff --git a/examples/lookup/lookup-merge.cpp b/examples/lookup/lookup-merge.cpp index 81e2b0436..6871c0f5f 100644 --- a/examples/lookup/lookup-merge.cpp +++ b/examples/lookup/lookup-merge.cpp @@ -33,15 +33,15 @@ int main(int argc, char ** argv){ } fprintf(stderr, "lookup-merge: loading file %s\n", args[0].c_str()); - llama_ngram_cache ngram_cache_merged = llama_ngram_cache_load(args[0]); + common_ngram_cache ngram_cache_merged = common_ngram_cache_load(args[0]); for (size_t i = 1; i < args.size()-1; ++i) { fprintf(stderr, "lookup-merge: loading file %s\n", args[i].c_str()); - llama_ngram_cache ngram_cache = llama_ngram_cache_load(args[i]); + common_ngram_cache ngram_cache = common_ngram_cache_load(args[i]); - llama_ngram_cache_merge(ngram_cache_merged, ngram_cache); + common_ngram_cache_merge(ngram_cache_merged, ngram_cache); } fprintf(stderr, "lookup-merge: saving file %s\n", args.back().c_str()); - llama_ngram_cache_save(ngram_cache_merged, args.back()); + common_ngram_cache_save(ngram_cache_merged, args.back()); } diff --git a/examples/lookup/lookup-stats.cpp b/examples/lookup/lookup-stats.cpp index 6d1e1ceb9..7faebe7ba 100644 --- a/examples/lookup/lookup-stats.cpp +++ b/examples/lookup/lookup-stats.cpp @@ -13,13 +13,13 @@ #include int main(int argc, char ** argv){ - gpt_params params; + common_params params; - if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) { + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) { return 1; } - gpt_init(); + common_init(); const int n_draft = params.n_draft; @@ -28,18 +28,18 @@ int main(int argc, char ** argv){ llama_numa_init(params.numa); // load the model - llama_init_result llama_init = llama_init_from_gpt_params(params); + common_init_result llama_init = common_init_from_params(params); llama_model * model = llama_init.model; llama_context * ctx = llama_init.context; // tokenize the prompt std::vector inp; - inp = ::llama_tokenize(ctx, params.prompt, true, true); + inp = common_tokenize(ctx, params.prompt, true, true); - llama_ngram_cache ngram_cache_context; - llama_ngram_cache ngram_cache_dynamic; - llama_ngram_cache ngram_cache_static; + common_ngram_cache ngram_cache_context; + common_ngram_cache ngram_cache_dynamic; + common_ngram_cache ngram_cache_static; int64_t t_draft_flat_us = 0; int64_t t_draft_us = 0; @@ -48,7 +48,7 @@ int main(int argc, char ** argv){ if (!params.lookup_cache_static.empty()) { try { - ngram_cache_static = llama_ngram_cache_load(params.lookup_cache_static); + ngram_cache_static = common_ngram_cache_load(params.lookup_cache_static); } catch (std::ifstream::failure const &) { LOG_ERR("failed to open static lookup cache: %s", params.lookup_cache_static.c_str()); exit(1); @@ -57,7 +57,7 @@ int main(int argc, char ** argv){ if (!params.lookup_cache_dynamic.empty()) { try { - ngram_cache_dynamic = llama_ngram_cache_load(params.lookup_cache_dynamic); + ngram_cache_dynamic = common_ngram_cache_load(params.lookup_cache_dynamic); } catch (std::ifstream::failure const &) {} // if the file does not exist it will simply be created at the end of the program } @@ -86,7 +86,7 @@ int main(int argc, char ** argv){ { const int64_t t_start_draft_us = ggml_time_us(); - llama_ngram_cache_draft(pseudo_output, draft, n_draft, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, ngram_cache_context, ngram_cache_dynamic, ngram_cache_static); + common_ngram_cache_draft(pseudo_output, draft, n_draft, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, ngram_cache_context, ngram_cache_dynamic, ngram_cache_static); t_draft_us += ggml_time_us() - t_start_draft_us; } @@ -105,7 +105,7 @@ int main(int argc, char ** argv){ { const int64_t t_start_draft_us = ggml_time_us(); - llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, pseudo_output, 1, false); + common_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, pseudo_output, 1, false); t_draft_us += ggml_time_us() - t_start_draft_us; } } @@ -115,7 +115,7 @@ int main(int argc, char ** argv){ pseudo_output.push_back(inp_slice[pseudo_output.size()]); { const int64_t t_start_draft_us = ggml_time_us(); - llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, pseudo_output, 1, false); + common_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, pseudo_output, 1, false); t_draft_us += ggml_time_us() - t_start_draft_us; } } @@ -133,7 +133,7 @@ int main(int argc, char ** argv){ } // After each chunk, update the dynamic ngram cache with the context ngram cache: - llama_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context); + common_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context); ngram_cache_context.clear(); } diff --git a/examples/lookup/lookup.cpp b/examples/lookup/lookup.cpp index 2ccd0e6c1..82fc7d466 100644 --- a/examples/lookup/lookup.cpp +++ b/examples/lookup/lookup.cpp @@ -13,13 +13,13 @@ #include int main(int argc, char ** argv){ - gpt_params params; + common_params params; - if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) { + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) { return 1; } - gpt_init(); + common_init(); // max. number of additional tokens to draft if match is found const int n_draft = params.n_draft; @@ -31,29 +31,29 @@ int main(int argc, char ** argv){ llama_numa_init(params.numa); // load the model - llama_init_result llama_init = llama_init_from_gpt_params(params); + common_init_result llama_init = common_init_from_params(params); llama_model * model = llama_init.model; llama_context * ctx = llama_init.context; // tokenize the prompt std::vector inp; - inp = ::llama_tokenize(ctx, params.prompt, true, true); + inp = common_tokenize(ctx, params.prompt, true, true); - llama_ngram_cache ngram_cache_context; - llama_ngram_cache ngram_cache_dynamic; - llama_ngram_cache ngram_cache_static; + common_ngram_cache ngram_cache_context; + common_ngram_cache ngram_cache_dynamic; + common_ngram_cache ngram_cache_static; int64_t t_draft_flat_us = 0; int64_t t_draft_us = 0; { // Fill up context ngram cache with tokens from user input: const int64_t t_start_draft_us = ggml_time_us(); - llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, inp.size(), false); + common_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, inp.size(), false); if (!params.lookup_cache_static.empty()) { try { - ngram_cache_static = llama_ngram_cache_load(params.lookup_cache_static); + ngram_cache_static = common_ngram_cache_load(params.lookup_cache_static); } catch (std::ifstream::failure const &) { LOG_ERR("failed to open static lookup cache: %s", params.lookup_cache_static.c_str()); exit(1); @@ -62,7 +62,7 @@ int main(int argc, char ** argv){ if (!params.lookup_cache_dynamic.empty()) { try { - ngram_cache_dynamic = llama_ngram_cache_load(params.lookup_cache_dynamic); + ngram_cache_dynamic = common_ngram_cache_load(params.lookup_cache_dynamic); } catch (std::ifstream::failure const &) {} // if the file does not exist it will simply be created at the end of the program } @@ -80,7 +80,7 @@ int main(int argc, char ** argv){ LOG("\n\n"); for (auto id : inp) { - LOG("%s", llama_token_to_piece(ctx, id).c_str()); + LOG("%s", common_token_to_piece(ctx, id).c_str()); } fflush(stderr); @@ -102,7 +102,7 @@ int main(int argc, char ** argv){ bool has_eos = false; - struct gpt_sampler * smpl = gpt_sampler_init(model, params.sparams); + struct common_sampler * smpl = common_sampler_init(model, params.sparams); std::vector draft; @@ -117,7 +117,7 @@ int main(int argc, char ** argv){ // debug if (dump_kv_cache) { llama_kv_cache_view_update(ctx, &kvc_view); - llama_kv_cache_dump_view_seqs(kvc_view, 40); + common_kv_cache_dump_view_seqs(kvc_view, 40); } // print current draft sequence @@ -126,11 +126,11 @@ int main(int argc, char ** argv){ int i_dft = 0; while (true) { // sample from the target model - llama_token id = gpt_sampler_sample(smpl, ctx, i_dft); + llama_token id = common_sampler_sample(smpl, ctx, i_dft); - gpt_sampler_accept(smpl, id, true); + common_sampler_accept(smpl, id, true); - const std::string token_str = llama_token_to_piece(ctx, id); + const std::string token_str = common_token_to_piece(ctx, id); if (!params.use_color) { LOG("%s", token_str.c_str()); @@ -152,7 +152,7 @@ int main(int argc, char ** argv){ { // Update context ngram cache with the newly accepted token: const int64_t t_start_draft_us = ggml_time_us(); - llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, 1, false); + common_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, 1, false); t_draft_us += ggml_time_us() - t_start_draft_us; } @@ -178,7 +178,7 @@ int main(int argc, char ** argv){ { // Update context ngram cache with the newly accepted token: const int64_t t_start_draft_us = ggml_time_us(); - llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, 1, false); + common_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, 1, false); t_draft_us += ggml_time_us() - t_start_draft_us; } break; @@ -192,18 +192,18 @@ int main(int argc, char ** argv){ // clean the cache of draft tokens that weren't accepted llama_kv_cache_seq_rm(ctx, 0, n_past, -1); - llama_batch_clear(batch_tgt); - llama_batch_add(batch_tgt, draft[0], n_past, { 0 }, true); + common_batch_clear(batch_tgt); + common_batch_add(batch_tgt, draft[0], n_past, { 0 }, true); // Draft already contains a single token sampled from the model: GGML_ASSERT(draft.size() == 1); GGML_ASSERT(draft[0] == inp.back()); const int64_t t_start_draft_us = ggml_time_us(); - llama_ngram_cache_draft(inp, draft, n_draft, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, ngram_cache_context, ngram_cache_dynamic, ngram_cache_static); + common_ngram_cache_draft(inp, draft, n_draft, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, ngram_cache_context, ngram_cache_dynamic, ngram_cache_static); for (size_t i = 1; i < draft.size(); ++i) { - llama_batch_add(batch_tgt, draft[i], n_past + i, { 0 }, true); + common_batch_add(batch_tgt, draft[i], n_past + i, { 0 }, true); } t_draft_us += ggml_time_us() - t_start_draft_us; @@ -218,8 +218,8 @@ int main(int argc, char ** argv){ auto t_dec_end = ggml_time_us(); // Update dynamic ngram cache with context ngram cache and save it to disk: - llama_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context); - llama_ngram_cache_save(ngram_cache_dynamic, params.lookup_cache_dynamic); + common_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context); + common_ngram_cache_save(ngram_cache_dynamic, params.lookup_cache_dynamic); LOG("\n\n"); @@ -237,9 +237,9 @@ int main(int argc, char ** argv){ LOG_INF("accept = %.3f%%\n", 100.0f * n_accept / n_drafted); LOG_INF("\ntarget:\n\n"); - gpt_perf_print(ctx, smpl); + common_perf_print(ctx, smpl); - gpt_sampler_free(smpl); + common_sampler_free(smpl); llama_batch_free(batch_tgt); diff --git a/examples/main/README.md b/examples/main/README.md index f0c3031ab..620934dad 100644 --- a/examples/main/README.md +++ b/examples/main/README.md @@ -241,6 +241,19 @@ The `--mirostat-ent` option sets the Mirostat target entropy (tau), which repres Example usage: `--mirostat 2 --mirostat-lr 0.05 --mirostat-ent 3.0` +### XTC Sampling + +- `--xtc-probability N`: Sets the chance for token removal (checked once on sampler start) (default: 0.0). +- `--xtc-threshold N`: Sets a minimum probability threshold for tokens to be removed (default: 0.1). + +Exclude Top Choices (XTC) is a unique sampler that is designed to remove top tokens from consideration and avoid more obvious and repetitive outputs. With a chance of `xtc-probability` it searches for tokens with probabilities of `xtc-threshold` and above, then removes all such tokens except the least probable one. + +By removing top tokens XTC can improve the variety of answers, break writing clichés and inhibit repition, since clichés and repeated phrases are usually more likely to appear. By keeping the last token above the threshold, XTC ensures that the answer is still coherent. XTC is meant to be used for creative tasks, but feel free to experiment with different settings for different models. + +Being experimental and unique, XTC is disabled by default. The recommended combination of samplers is Min-P followed by XTC on its default settings: `--sampling-seq mx --min-p 0.02 --xtc-probability 0.5`. + +Example usage: `--xtc-probability 0.5 --xtc-threshold 0.1` + ### Logit Bias - `-l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS`: Modify the likelihood of a token appearing in the generated text completion. diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 6bbb1e13e..65483c45f 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -33,8 +33,8 @@ static llama_context ** g_ctx; static llama_model ** g_model; -static gpt_sampler ** g_smpl; -static gpt_params * g_params; +static common_sampler ** g_smpl; +static common_params * g_params; static std::vector * g_input_tokens; static std::ostringstream * g_output_ss; static std::vector * g_output_tokens; @@ -63,7 +63,7 @@ static bool file_is_empty(const std::string & path) { } static void write_logfile( - const llama_context * ctx, const gpt_params & params, const llama_model * model, + const llama_context * ctx, const common_params & params, const llama_model * model, const std::vector & input_tokens, const std::string & output, const std::vector & output_tokens ) { @@ -114,12 +114,12 @@ static void sigint_handler(int signo) { } else { console::cleanup(); LOG("\n"); - gpt_perf_print(*g_ctx, *g_smpl); + common_perf_print(*g_ctx, *g_smpl); write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens); // make sure all logs are flushed LOG("Interrupted by user\n"); - gpt_log_pause(gpt_log_main()); + common_log_pause(common_log_main()); _exit(130); } @@ -127,22 +127,22 @@ static void sigint_handler(int signo) { } #endif -static std::string chat_add_and_format(struct llama_model * model, std::vector & chat_msgs, const std::string & role, const std::string & content) { - llama_chat_msg new_msg{role, content}; - auto formatted = llama_chat_format_single(model, g_params->chat_template, chat_msgs, new_msg, role == "user"); +static std::string chat_add_and_format(struct llama_model * model, std::vector & chat_msgs, const std::string & role, const std::string & content) { + common_chat_msg new_msg{role, content}; + auto formatted = common_chat_format_single(model, g_params->chat_template, chat_msgs, new_msg, role == "user"); chat_msgs.push_back({role, content}); LOG_DBG("formatted: '%s'\n", formatted.c_str()); return formatted; } int main(int argc, char ** argv) { - gpt_params params; + common_params params; g_params = ¶ms; - if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_MAIN, print_usage)) { + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_MAIN, print_usage)) { return 1; } - gpt_init(); + common_init(); auto & sparams = params.sparams; @@ -187,9 +187,9 @@ int main(int argc, char ** argv) { llama_model * model = nullptr; llama_context * ctx = nullptr; - gpt_sampler * smpl = nullptr; + common_sampler * smpl = nullptr; - std::vector chat_msgs; + std::vector chat_msgs; g_model = &model; g_ctx = &ctx; @@ -197,7 +197,7 @@ int main(int argc, char ** argv) { // load the model and apply lora adapter, if any LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__); - llama_init_result llama_init = llama_init_from_gpt_params(params); + common_init_result llama_init = common_init_from_params(params); model = llama_init.model; ctx = llama_init.context; @@ -246,7 +246,7 @@ int main(int argc, char ** argv) { // print chat template example in conversation mode if (params.conversation) { if (params.enable_chat_template) { - LOG_INF("%s: chat template example:\n%s\n", __func__, llama_chat_format_example(model, params.chat_template).c_str()); + LOG_INF("%s: chat template example:\n%s\n", __func__, common_chat_format_example(model, params.chat_template).c_str()); } else { LOG_INF("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__); } @@ -255,7 +255,7 @@ int main(int argc, char ** argv) { // print system information { LOG_INF("\n"); - LOG_INF("%s\n", gpt_params_get_system_info(params).c_str()); + LOG_INF("%s\n", common_params_get_system_info(params).c_str()); LOG_INF("\n"); } @@ -296,7 +296,7 @@ int main(int argc, char ** argv) { : params.prompt; if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) { LOG_DBG("tokenize the prompt\n"); - embd_inp = ::llama_tokenize(ctx, prompt, true, true); + embd_inp = common_tokenize(ctx, prompt, true, true); } else { LOG_DBG("use session tokens\n"); embd_inp = session_tokens; @@ -379,13 +379,13 @@ int main(int argc, char ** argv) { LOG_INF("%s: prompt: '%s'\n", __func__, params.prompt.c_str()); LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); for (int i = 0; i < (int) embd_inp.size(); i++) { - LOG_INF("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str()); + LOG_INF("%6d -> '%s'\n", embd_inp[i], common_token_to_piece(ctx, embd_inp[i]).c_str()); } if (params.n_keep > add_bos) { LOG_INF("%s: static prompt based on n_keep: '", __func__); for (int i = 0; i < params.n_keep; i++) { - LOG_CNT("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str()); + LOG_CNT("%s", common_token_to_piece(ctx, embd_inp[i]).c_str()); } LOG_CNT("'\n"); } @@ -415,9 +415,9 @@ int main(int argc, char ** argv) { for (const auto & antiprompt : params.antiprompt) { LOG_INF("Reverse prompt: '%s'\n", antiprompt.c_str()); if (params.verbose_prompt) { - auto tmp = ::llama_tokenize(ctx, antiprompt, false, true); + auto tmp = common_tokenize(ctx, antiprompt, false, true); for (int i = 0; i < (int) tmp.size(); i++) { - LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str()); + LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx, tmp[i]).c_str()); } } } @@ -430,9 +430,9 @@ int main(int argc, char ** argv) { if (!params.input_prefix.empty()) { LOG_INF("Input prefix: '%s'\n", params.input_prefix.c_str()); if (params.verbose_prompt) { - auto tmp = ::llama_tokenize(ctx, params.input_prefix, true, true); + auto tmp = common_tokenize(ctx, params.input_prefix, true, true); for (int i = 0; i < (int) tmp.size(); i++) { - LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str()); + LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx, tmp[i]).c_str()); } } } @@ -440,23 +440,23 @@ int main(int argc, char ** argv) { if (!params.input_suffix.empty()) { LOG_INF("Input suffix: '%s'\n", params.input_suffix.c_str()); if (params.verbose_prompt) { - auto tmp = ::llama_tokenize(ctx, params.input_suffix, false, true); + auto tmp = common_tokenize(ctx, params.input_suffix, false, true); for (int i = 0; i < (int) tmp.size(); i++) { - LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str()); + LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx, tmp[i]).c_str()); } } } } - smpl = gpt_sampler_init(model, sparams); + smpl = common_sampler_init(model, sparams); if (!smpl) { LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__); return 1; } - LOG_INF("sampler seed: %u\n", gpt_sampler_get_seed(smpl)); + LOG_INF("sampler seed: %u\n", common_sampler_get_seed(smpl)); LOG_INF("sampler params: \n%s\n", sparams.print().c_str()); - LOG_INF("sampler chain: %s\n", gpt_sampler_print(smpl).c_str()); + LOG_INF("sampler chain: %s\n", common_sampler_print(smpl).c_str()); LOG_INF("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep); @@ -521,7 +521,7 @@ int main(int argc, char ** argv) { antiprompt_ids.reserve(params.antiprompt.size()); for (const std::string & antiprompt : params.antiprompt) { - antiprompt_ids.emplace_back(::llama_tokenize(ctx, antiprompt, false, true)); + antiprompt_ids.emplace_back(::common_tokenize(ctx, antiprompt, false, true)); } if (llama_model_has_encoder(model)) { @@ -569,30 +569,30 @@ int main(int argc, char ** argv) { if (!params.ctx_shift){ LOG_DBG("\n\n%s: context full and context shift is disabled => stopping\n", __func__); break; - } else { - if (params.n_predict == -2) { - LOG_DBG("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict); - break; - } - - const int n_left = n_past - params.n_keep; - const int n_discard = n_left/2; - - LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n", - n_past, n_left, n_ctx, params.n_keep, n_discard); - - llama_kv_cache_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard); - llama_kv_cache_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard); - - n_past -= n_discard; - - LOG_DBG("after swap: n_past = %d\n", n_past); - - LOG_DBG("embd: %s\n", string_from(ctx, embd).c_str()); - - LOG_DBG("clear session path\n"); - path_session.clear(); } + + if (params.n_predict == -2) { + LOG_DBG("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict); + break; + } + + const int n_left = n_past - params.n_keep; + const int n_discard = n_left/2; + + LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n", + n_past, n_left, n_ctx, params.n_keep, n_discard); + + llama_kv_cache_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard); + llama_kv_cache_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard); + + n_past -= n_discard; + + LOG_DBG("after swap: n_past = %d\n", n_past); + + LOG_DBG("embd: %s\n", string_from(ctx, embd).c_str()); + + LOG_DBG("clear session path\n"); + path_session.clear(); } } else { // context extension via Self-Extend @@ -679,9 +679,9 @@ int main(int argc, char ** argv) { LOG_DBG("saved session to %s\n", path_session.c_str()); } - const llama_token id = gpt_sampler_sample(smpl, ctx, -1); + const llama_token id = common_sampler_sample(smpl, ctx, -1); - gpt_sampler_accept(smpl, id, /* accept_grammar= */ true); + common_sampler_accept(smpl, id, /* accept_grammar= */ true); // LOG_DBG("last: %s\n", string_from(ctx, smpl->prev.to_vector()).c_str()); @@ -702,7 +702,7 @@ int main(int argc, char ** argv) { // push the prompt in the sampling context in order to apply repetition penalties later // for the prompt, we don't apply grammar rules - gpt_sampler_accept(smpl, embd_inp[n_consumed], /* accept_grammar= */ false); + common_sampler_accept(smpl, embd_inp[n_consumed], /* accept_grammar= */ false); ++n_consumed; if ((int) embd.size() >= params.n_batch) { @@ -714,7 +714,7 @@ int main(int argc, char ** argv) { // display text if (input_echo && display) { for (auto id : embd) { - const std::string token_str = llama_token_to_piece(ctx, id, params.special); + const std::string token_str = common_token_to_piece(ctx, id, params.special); // Console/Stream Output LOG("%s", token_str.c_str()); @@ -743,7 +743,7 @@ int main(int argc, char ** argv) { // check for reverse prompt in the last n_prev tokens if (!params.antiprompt.empty()) { const int n_prev = 32; - const std::string last_output = gpt_sampler_prev_str(smpl, ctx, n_prev); + const std::string last_output = common_sampler_prev_str(smpl, ctx, n_prev); is_antiprompt = false; // Check if each of the reverse prompts appears at the end of the output. @@ -765,7 +765,7 @@ int main(int argc, char ** argv) { } // check for reverse prompt using special tokens - llama_token last_token = gpt_sampler_last(smpl); + llama_token last_token = common_sampler_last(smpl); for (std::vector ids : antiprompt_ids) { if (ids.size() == 1 && last_token == ids[0]) { if (params.interactive) { @@ -782,13 +782,13 @@ int main(int argc, char ** argv) { } // deal with end of generation tokens in interactive mode - if (llama_token_is_eog(model, gpt_sampler_last(smpl))) { + if (llama_token_is_eog(model, common_sampler_last(smpl))) { LOG_DBG("found an EOG token\n"); if (params.interactive) { if (!params.antiprompt.empty()) { // tokenize and inject first reverse prompt - const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false, true); + const auto first_antiprompt = common_tokenize(ctx, params.antiprompt.front(), false, true); embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end()); is_antiprompt = true; } @@ -803,8 +803,8 @@ int main(int argc, char ** argv) { // if current token is not EOG, we add it to current assistant message if (params.conversation) { - const auto id = gpt_sampler_last(smpl); - assistant_ss << llama_token_to_piece(ctx, id, false); + const auto id = common_sampler_last(smpl); + assistant_ss << common_token_to_piece(ctx, id, false); } if (n_past > 0 && is_interacting) { @@ -862,9 +862,9 @@ int main(int argc, char ** argv) { ? chat_add_and_format(model, chat_msgs, "user", std::move(buffer)) : std::move(buffer); // TODO: one inconvenient of current chat template implementation is that we can't distinguish between user input and special tokens (prefix/postfix) - const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true); - const auto line_inp = ::llama_tokenize(ctx, user_inp, false, format_chat); - const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true); + const auto line_pfx = common_tokenize(ctx, params.input_prefix, false, true); + const auto line_inp = common_tokenize(ctx, user_inp, false, format_chat); + const auto line_sfx = common_tokenize(ctx, params.input_suffix, false, true); LOG_DBG("input tokens: %s\n", string_from(ctx, line_inp).c_str()); @@ -882,7 +882,7 @@ int main(int argc, char ** argv) { for (size_t i = original_size; i < embd_inp.size(); ++i) { const llama_token token = embd_inp[i]; output_tokens.push_back(token); - output_ss << llama_token_to_piece(ctx, token); + output_ss << common_token_to_piece(ctx, token); } // reset assistant message @@ -899,7 +899,7 @@ int main(int argc, char ** argv) { if (n_past > 0) { if (is_interacting) { - gpt_sampler_reset(smpl); + common_sampler_reset(smpl); } is_interacting = false; } @@ -925,10 +925,10 @@ int main(int argc, char ** argv) { } LOG("\n\n"); - gpt_perf_print(ctx, smpl); + common_perf_print(ctx, smpl); write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens); - gpt_sampler_free(smpl); + common_sampler_free(smpl); llama_free(ctx); llama_free_model(model); diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp index 81e2f7ed7..20274c147 100644 --- a/examples/parallel/parallel.cpp +++ b/examples/parallel/parallel.cpp @@ -54,7 +54,7 @@ static std::vector k_prompts = { struct client { ~client() { if (smpl) { - gpt_sampler_free(smpl); + common_sampler_free(smpl); } } @@ -75,7 +75,7 @@ struct client { std::string prompt; std::string response; - struct gpt_sampler * smpl = nullptr; + struct common_sampler * smpl = nullptr; }; static void print_date_time() { @@ -103,13 +103,13 @@ static std::vector split_string(const std::string& input, char deli int main(int argc, char ** argv) { srand(1234); - gpt_params params; + common_params params; - if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_PARALLEL)) { + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PARALLEL)) { return 1; } - gpt_init(); + common_init(); // number of simultaneous "clients" to simulate const int32_t n_clients = params.n_parallel; @@ -130,7 +130,7 @@ int main(int argc, char ** argv) { llama_numa_init(params.numa); // load the target model - llama_init_result llama_init = llama_init_from_gpt_params(params); + common_init_result llama_init = common_init_from_params(params); llama_model * model = llama_init.model; llama_context * ctx = llama_init.context; @@ -160,11 +160,11 @@ int main(int argc, char ** argv) { for (size_t i = 0; i < clients.size(); ++i) { auto & client = clients[i]; client.id = i; - client.smpl = gpt_sampler_init(model, params.sparams); + client.smpl = common_sampler_init(model, params.sparams); } std::vector tokens_system; - tokens_system = ::llama_tokenize(ctx, k_system, true); + tokens_system = common_tokenize(ctx, k_system, true); const int32_t n_tokens_system = tokens_system.size(); llama_seq_id g_seq_id = 0; @@ -189,7 +189,7 @@ int main(int argc, char ** argv) { LOG_INF("%s: Evaluating the system prompt ...\n", __func__); for (int32_t i = 0; i < n_tokens_system; ++i) { - llama_batch_add(batch, tokens_system[i], i, { 0 }, false); + common_batch_add(batch, tokens_system[i], i, { 0 }, false); } if (llama_decode(ctx, batch) != 0) { @@ -210,10 +210,10 @@ int main(int argc, char ** argv) { while (true) { if (dump_kv_cache) { llama_kv_cache_view_update(ctx, &kvc_view); - llama_kv_cache_dump_view_seqs(kvc_view, 40); + common_kv_cache_dump_view_seqs(kvc_view, 40); } - llama_batch_clear(batch); + common_batch_clear(batch); // decode any currently ongoing sequences for (auto & client : clients) { @@ -223,7 +223,7 @@ int main(int argc, char ** argv) { client.i_batch = batch.n_tokens; - llama_batch_add(batch, client.sampled, n_tokens_system + client.n_prompt + client.n_decoded, { client.id + 1 }, true); + common_batch_add(batch, client.sampled, n_tokens_system + client.n_prompt + client.n_decoded, { client.id + 1 }, true); client.n_decoded += 1; } @@ -252,14 +252,14 @@ int main(int argc, char ** argv) { client.prompt = client.input + "\nAssistant:"; client.response = ""; - gpt_sampler_reset(client.smpl); + common_sampler_reset(client.smpl); // do not prepend BOS because we have a system prompt! std::vector tokens_prompt; - tokens_prompt = ::llama_tokenize(ctx, client.prompt, false); + tokens_prompt = common_tokenize(ctx, client.prompt, false); for (size_t i = 0; i < tokens_prompt.size(); ++i) { - llama_batch_add(batch, tokens_prompt[i], i + n_tokens_system, { client.id + 1 }, false); + common_batch_add(batch, tokens_prompt[i], i + n_tokens_system, { client.id + 1 }, false); } // extract the logits only for the last token @@ -340,9 +340,9 @@ int main(int argc, char ** argv) { //printf("client %d, seq %d, token %d, pos %d, batch %d\n", // client.id, client.seq_id, client.sampled, client.n_decoded, client.i_batch); - const llama_token id = gpt_sampler_sample(client.smpl, ctx, client.i_batch - i); + const llama_token id = common_sampler_sample(client.smpl, ctx, client.i_batch - i); - gpt_sampler_accept(client.smpl, id, true); + common_sampler_accept(client.smpl, id, true); if (client.n_decoded == 1) { // start measuring generation time after the first token to make sure all concurrent clients @@ -350,7 +350,7 @@ int main(int argc, char ** argv) { client.t_start_gen = ggml_time_us(); } - const std::string token_str = llama_token_to_piece(ctx, id); + const std::string token_str = common_token_to_piece(ctx, id); client.response += token_str; client.sampled = id; diff --git a/examples/passkey/passkey.cpp b/examples/passkey/passkey.cpp index 7ef8d14f3..09bba708f 100644 --- a/examples/passkey/passkey.cpp +++ b/examples/passkey/passkey.cpp @@ -15,17 +15,17 @@ static void print_usage(int, char ** argv) { } int main(int argc, char ** argv) { - gpt_params params; + common_params params; params.n_junk = 250; params.n_keep = 32; params.i_pos = -1; - if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_PASSKEY, print_usage)) { + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PASSKEY, print_usage)) { return 1; } - gpt_init(); + common_init(); int n_junk = params.n_junk; int n_keep = params.n_keep; @@ -61,7 +61,7 @@ int main(int argc, char ** argv) { // initialize the model - llama_model_params model_params = llama_model_params_from_gpt_params(params); + llama_model_params model_params = common_model_params_to_llama(params); llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params); @@ -72,7 +72,7 @@ int main(int argc, char ** argv) { // initialize the context - llama_context_params ctx_params = llama_context_params_from_gpt_params(params); + llama_context_params ctx_params = common_context_params_to_llama(params); ctx_params.n_ctx = llama_n_ctx_train(model)*n_grp + n_keep; @@ -92,10 +92,10 @@ int main(int argc, char ** argv) { // tokenize the prompt std::vector tokens_list; - tokens_list = ::llama_tokenize(ctx, params.prompt, true); + tokens_list = common_tokenize(ctx, params.prompt, true); // tokenize the prefix and use it as a sink - const int n_tokens_prefix = ::llama_tokenize(ctx, prompt_prefix, true).size(); + const int n_tokens_prefix = common_tokenize(ctx, prompt_prefix, true).size(); const int n_tokens_all = tokens_list.size(); @@ -137,10 +137,10 @@ int main(int argc, char ** argv) { n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1; } - llama_batch_clear(batch); + common_batch_clear(batch); for (int j = 0; j < n_batch && i + j < n_tokens_all; j++) { - llama_batch_add(batch, tokens_list[i + j], n_past++, { 0 }, false); + common_batch_add(batch, tokens_list[i + j], n_past++, { 0 }, false); } if (i + n_batch >= n_tokens_all) { @@ -171,10 +171,10 @@ int main(int argc, char ** argv) { n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1; - llama_batch_clear(batch); + common_batch_clear(batch); for (int j = 0; j < n_batch && i + j < n_tokens_all; j++) { - llama_batch_add(batch, tokens_list[i + j], n_past++, { 0 }, false); + common_batch_add(batch, tokens_list[i + j], n_past++, { 0 }, false); } if (i + n_batch >= n_tokens_all) { @@ -229,15 +229,15 @@ int main(int argc, char ** argv) { break; } - LOG("%s", llama_token_to_piece(ctx, new_token_id).c_str()); + LOG("%s", common_token_to_piece(ctx, new_token_id).c_str()); n_decode += 1; // prepare the next batch - llama_batch_clear(batch); + common_batch_clear(batch); // push this new token for next evaluation - llama_batch_add(batch, new_token_id, n_past++, { 0 }, true); + common_batch_add(batch, new_token_id, n_past++, { 0 }, true); } n_cur += 1; diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index 87347135e..efb41b80a 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -35,7 +35,7 @@ struct results_log_softmax { }; static void write_logfile( - const llama_context * ctx, const gpt_params & params, const llama_model * model, + const llama_context * ctx, const common_params & params, const llama_model * model, const struct results_perplexity & results ) { if (params.logdir.empty()) { @@ -169,7 +169,7 @@ static void process_logits( break; } lock.unlock(); - const results_log_softmax results = log_softmax(n_vocab, logits + i*n_vocab, tokens[i+1]); + const results_log_softmax results = log_softmax(n_vocab, logits + size_t(i)*n_vocab, tokens[i+1]); const double v = -results.log_softmax; local_nll += v; local_nll2 += v*v; @@ -203,7 +203,7 @@ static void process_logits(std::ostream& out, int n_vocab, const float * logits, break; } lock.unlock(); - const double v = log_softmax(n_vocab, logits + i*n_vocab, log_probs.data() + i*nv, tokens[i+1]); + const double v = log_softmax(n_vocab, logits + size_t(i)*n_vocab, log_probs.data() + i*nv, tokens[i+1]); local_nll += v; local_nll2 += v*v; } @@ -281,7 +281,9 @@ static std::pair log_softmax(int n_vocab, const float * logits, c kld.sum_kld += sum; kld.sum_kld2 += sum*sum; ++kld.count; - if (imax == imax_base) ++kld.n_same_top; + if (imax == imax_base) { + ++kld.n_same_top; + } const float p_base = expf(-nll_base); const float p = expf(-nll); @@ -323,7 +325,7 @@ static void process_logits(int n_vocab, const float * logits, const int * tokens break; } lock.unlock(); - std::pair v = log_softmax(n_vocab, logits + i*n_vocab, base_log_probs.data() + i*nv, tokens[i+1], local_kld); + std::pair v = log_softmax(n_vocab, logits + size_t(i)*n_vocab, base_log_probs.data() + i*nv, tokens[i+1], local_kld); kld_values[i] = (float)v.first; p_diff_values[i] = v.second; } @@ -337,7 +339,7 @@ static void process_logits(int n_vocab, const float * logits, const int * tokens } } -static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & params) { +static results_perplexity perplexity_v2(llama_context * ctx, const common_params & params) { // Download: https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip // Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw` // Output: `perplexity: 13.5106 [114/114]` @@ -348,7 +350,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & LOG_INF("%s: tokenizing the input ..\n", __func__); - std::vector tokens = ::llama_tokenize(ctx, params.prompt, true); + std::vector tokens = common_tokenize(ctx, params.prompt, true); const int n_ctx = llama_n_ctx(ctx); @@ -383,9 +385,10 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & const int n_chunk_max = (tokens.size() - calc_chunk + params.ppl_stride - 1) / params.ppl_stride; const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max); - const int n_vocab = llama_n_vocab(llama_get_model(ctx)); const int n_batch = params.n_batch; + const int n_vocab = llama_n_vocab(llama_get_model(ctx)); + int count = 0; double nll = 0.0; @@ -424,8 +427,8 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & tokens[batch_start] = llama_token_bos(llama_get_model(ctx)); } - const auto batch_logits = llama_get_logits(ctx); - logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab); + const auto * batch_logits = llama_get_logits(ctx); + logits.insert(logits.end(), batch_logits, batch_logits + size_t(batch_size) * n_vocab); if (j == 0) { tokens[batch_start] = token_org; @@ -447,11 +450,10 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & //LOG_DBG("%s: using tokens %d...%d\n",__func__,params.n_ctx - params.ppl_stride + start, params.n_ctx + start); for (int j = n_ctx - params.ppl_stride - 1; j < n_ctx - 1; ++j) { - // Calculate probability of next token, given the previous ones. const std::vector tok_logits( - logits.begin() + (j + 0) * n_vocab, - logits.begin() + (j + 1) * n_vocab); + logits.begin() + size_t(j + 0) * n_vocab, + logits.begin() + size_t(j + 1) * n_vocab); const float prob = softmax(tok_logits)[tokens[start + j + 1]]; logit_history[start + j + 1] = tok_logits[tokens[start + j + 1]]; @@ -472,7 +474,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & return {tokens, std::exp(nll / count), logit_history, prob_history}; } -static results_perplexity perplexity(llama_context * ctx, const gpt_params & params, const int32_t n_ctx) { +static results_perplexity perplexity(llama_context * ctx, const common_params & params, const int32_t n_ctx) { if (params.ppl_stride > 0) { return perplexity_v2(ctx, params); } @@ -500,7 +502,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par auto tim1 = std::chrono::high_resolution_clock::now(); LOG_INF("%s: tokenizing the input ..\n", __func__); - std::vector tokens = ::llama_tokenize(ctx, params.prompt, true); + std::vector tokens = common_tokenize(ctx, params.prompt, true); auto tim2 = std::chrono::high_resolution_clock::now(); LOG_INF("%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast(tim2-tim1).count()); @@ -521,9 +523,10 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par const int n_chunk_max = tokens.size() / n_ctx; const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max); - const int n_vocab = llama_n_vocab(llama_get_model(ctx)); const int n_batch = params.n_batch; + const int n_vocab = llama_n_vocab(llama_get_model(ctx)); + int count = 0; double nll = 0.0; double nll2 = 0.0; @@ -538,7 +541,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par std::vector logits; if (num_batches > 1) { - logits.reserve((size_t)n_ctx * n_vocab); + logits.reserve(size_t(n_ctx) * n_vocab); } LOG_INF("%s: calculating perplexity over %d chunks, n_ctx=%d, batch_size=%d, n_seq=%d\n", __func__, n_chunk, n_ctx, n_batch, n_seq); @@ -620,7 +623,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par if (num_batches > 1 && n_outputs > 0) { const auto * batch_logits = llama_get_logits(ctx); - logits.insert(logits.end(), batch_logits, batch_logits + n_outputs * n_vocab); + logits.insert(logits.end(), batch_logits, batch_logits + size_t(n_outputs) * n_vocab); } } @@ -661,7 +664,9 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par } else { double av = nll/count; double av2 = nll2/count - av*av; - if (av2 > 0) av2 = sqrt(av2/(count-1)); + if (av2 > 0) { + av2 = sqrt(av2/(count-1)); + } LOG("%8d %.4lf %4lf %4lf\n", i*n_ctx, std::exp(nll / count), av, av2); } } @@ -686,10 +691,10 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par return {tokens, ppl, logit_history, prob_history}; } -static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector & batch_logits, int32_t n_batch, int32_t n_vocab) { +static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector & batch_logits, int n_batch, int n_vocab) { int prev_outputs = 0; - for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) { - const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i)); + for (int i = 0; i < (int) batch.n_tokens; i += n_batch) { + const int n_tokens = std::min(n_batch, batch.n_tokens - i); llama_batch batch_view = { n_tokens, @@ -713,7 +718,7 @@ static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector< n_outputs += batch_view.logits[i] != 0; } - memcpy(batch_logits.data() + prev_outputs*n_vocab, llama_get_logits(ctx), n_outputs*n_vocab*sizeof(float)); + memcpy(batch_logits.data() + size_t(prev_outputs)*n_vocab, llama_get_logits(ctx), size_t(n_outputs)*n_vocab*sizeof(float)); prev_outputs += n_outputs; } @@ -728,7 +733,9 @@ static void compute_logprobs(const float * batch_logits, int n_vocab, std::vecto if (eval_results.size() != eval_pairs.size()) { eval_results.resize(eval_pairs.size()); } - if (eval_pairs.empty()) return; + if (eval_pairs.empty()) { + return; + } size_t max_threads = std::min((eval_pairs.size() + K_TOKEN_CHUNK - 1)/K_TOKEN_CHUNK, workers.size()); @@ -736,11 +743,13 @@ static void compute_logprobs(const float * batch_logits, int n_vocab, std::vecto auto compute = [&counter, &eval_pairs, &eval_results, batch_logits, n_vocab] () { float local_logprobs[K_TOKEN_CHUNK]; while (true) { - size_t first = counter.fetch_add(K_TOKEN_CHUNK, std::memory_order_relaxed); - if (first >= eval_results.size()) break; - size_t last = std::min(first + K_TOKEN_CHUNK, eval_results.size()); + const size_t first = counter.fetch_add(K_TOKEN_CHUNK, std::memory_order_relaxed); + if (first >= eval_results.size()) { + break; + } + const size_t last = std::min(first + K_TOKEN_CHUNK, eval_results.size()); for (size_t i = first; i < last; ++i) { - auto logits = batch_logits + eval_pairs[i].first * n_vocab; + const auto * logits = batch_logits + eval_pairs[i].first * n_vocab; float max_logit = logits[0]; for (int j = 1; j < n_vocab; ++j) { max_logit = std::max(max_logit, logits[j]); @@ -763,7 +772,7 @@ static void compute_logprobs(const float * batch_logits, int n_vocab, std::vecto } } -static void hellaswag_score(llama_context * ctx, const gpt_params & params) { +static void hellaswag_score(llama_context * ctx, const common_params & params) { // Calculates hellaswag score (acc_norm) from prompt // // Data extracted from the HellaSwag validation dataset (MIT license) https://github.com/rowanz/hellaswag/blob/master/data/hellaswag_val.jsonl @@ -844,7 +853,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) { hs_cur.gold_ending_idx = std::stoi( prompt_lines[idx*6+1] ); for (size_t j = 0; j < 4; j++) { hs_cur.ending[j] = prompt_lines[idx*6+2+j]; - hs_cur.seq_tokens[j] = ::llama_tokenize(ctx, hs_cur.context + " " + hs_cur.ending[j], true); + hs_cur.seq_tokens[j] = common_tokenize(ctx, hs_cur.context + " " + hs_cur.ending[j], true); } // determine the common prefix of the endings @@ -877,10 +886,11 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) { double acc = 0.0f; - const int n_vocab = llama_n_vocab(llama_get_model(ctx)); const int n_ctx = llama_n_ctx(ctx); const int n_batch = params.n_batch; + const int n_vocab = llama_n_vocab(llama_get_model(ctx)); + const int max_tasks_per_batch = 32; const int max_seq = std::min(4*max_tasks_per_batch, (int) llama_n_seq_max(ctx)); @@ -888,7 +898,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) { std::vector tok_logits(n_vocab); // TODO: this could be made smaller; it's currently the worst-case size - std::vector batch_logits(n_vocab*n_ctx); + std::vector batch_logits(size_t(n_ctx)*n_vocab); std::vector> eval_pairs; std::vector eval_results; @@ -900,7 +910,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) { size_t i1 = i0; size_t i_logits = 0; // this tells us how many logits were needed before this point in the batch - llama_batch_clear(batch); + common_batch_clear(batch); // batch as much tasks as possible into the available context // each task has 4 unique sequence ids - one for each ending @@ -916,7 +926,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) { } for (size_t i = 0; i < hs_cur.common_prefix; ++i) { - llama_batch_add(batch, hs_cur.seq_tokens[0][i], i, { s0 + 0, s0 + 1, s0 + 2, s0 + 3 }, false); + common_batch_add(batch, hs_cur.seq_tokens[0][i], i, { s0 + 0, s0 + 1, s0 + 2, s0 + 3 }, false); } batch.logits[batch.n_tokens - 1] = true; // we need logits for the last token of the common prefix n_logits += 1; @@ -926,7 +936,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) { // TODO: don't evaluate the last token of each sequence for (size_t i = hs_cur.common_prefix; i < seq_tokens_size; ++i) { const bool needs_logits = i < seq_tokens_size - 1; - llama_batch_add(batch, hs_cur.seq_tokens[s][i], i, { s0 + s }, needs_logits); + common_batch_add(batch, hs_cur.seq_tokens[s][i], i, { s0 + s }, needs_logits); n_logits += needs_logits; } } @@ -975,7 +985,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) { auto & hs_cur = hs_data[i]; // get the logits of the last token of the common prefix - std::memcpy(tok_logits.data(), batch_logits.data() + n_vocab*hs_cur.i_logits, n_vocab*sizeof(float)); + std::memcpy(tok_logits.data(), batch_logits.data() + hs_cur.i_logits*n_vocab, n_vocab*sizeof(float)); const auto first_probs = softmax(tok_logits); @@ -1102,7 +1112,7 @@ static std::vector load_winogrande_from_csv(const std::string * 0,Sarah was a much better surgeon than Maria so _ always got the easier cases.,Sarah,Maria,2 * */ -static void winogrande_score(llama_context * ctx, const gpt_params & params) { +static void winogrande_score(llama_context * ctx, const common_params & params) { constexpr int k_min_trailing_ctx = 3; @@ -1136,8 +1146,8 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) { LOG_INF("%s : tokenizing selected tasks\n", __func__); for (auto & task : data) { - task.seq_tokens[0] = ::llama_tokenize(ctx, task.first + task.choices[0] + task.second, true); - task.seq_tokens[1] = ::llama_tokenize(ctx, task.first + task.choices[1] + task.second, true); + task.seq_tokens[0] = common_tokenize(ctx, task.first + task.choices[0] + task.second, true); + task.seq_tokens[1] = common_tokenize(ctx, task.first + task.choices[1] + task.second, true); task.common_prefix = 0; for (size_t k = 0; k < task.seq_tokens[0].size(); k++) { @@ -1152,16 +1162,17 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) { task.seq_tokens[0].size() - task.common_prefix + task.seq_tokens[1].size() - task.common_prefix; - task.n_base1 = ::llama_tokenize(ctx, task.first + task.choices[0], true).size(); - task.n_base2 = ::llama_tokenize(ctx, task.first + task.choices[1], true).size(); + task.n_base1 = common_tokenize(ctx, task.first + task.choices[0], true).size(); + task.n_base2 = common_tokenize(ctx, task.first + task.choices[1], true).size(); } LOG_INF("%s : calculating winogrande score over selected tasks.\n", __func__); - const int n_vocab = llama_n_vocab(llama_get_model(ctx)); const int n_ctx = llama_n_ctx(ctx); const int n_batch = params.n_batch; + const int n_vocab = llama_n_vocab(llama_get_model(ctx)); + const int max_tasks_per_batch = 128; const int max_seq = std::min(2*max_tasks_per_batch, (int) llama_n_seq_max(ctx)); @@ -1169,7 +1180,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) { std::vector tok_logits(n_vocab); // TODO: this could be made smaller; it's currently the worst-case size - std::vector batch_logits(n_vocab*n_ctx); + std::vector batch_logits(size_t(n_ctx)*n_vocab); std::vector> eval_pairs; std::vector eval_results; @@ -1184,7 +1195,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) { size_t i1 = i0; size_t i_logits = 0; - llama_batch_clear(batch); + common_batch_clear(batch); while (n_cur + (int) data[i1].required_tokens <= n_ctx) { int n_logits = 0; @@ -1194,7 +1205,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) { } for (size_t i = 0; i < data[i1].common_prefix; ++i) { - llama_batch_add(batch, data[i1].seq_tokens[0][i], i, { s0 + 0, s0 + 1 }, false); + common_batch_add(batch, data[i1].seq_tokens[0][i], i, { s0 + 0, s0 + 1 }, false); } batch.logits[batch.n_tokens - 1] = true; n_logits += 1; @@ -1202,7 +1213,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) { for (int s = 0; s < 2; ++s) { // TODO: end before the last token, no need to predict past the end of the sequences for (size_t i = data[i1].common_prefix; i < data[i1].seq_tokens[s].size(); ++i) { - llama_batch_add(batch, data[i1].seq_tokens[s][i], i, { s0 + s }, true); + common_batch_add(batch, data[i1].seq_tokens[s][i], i, { s0 + s }, true); n_logits += 1; } } @@ -1359,7 +1370,7 @@ static bool multiple_choice_prepare_one_task(llama_context * ctx, multiple_choic } return false; } - task.seq_tokens.emplace_back(::llama_tokenize(ctx, task.question + " " + answer, true)); + task.seq_tokens.emplace_back(::common_tokenize(ctx, task.question + " " + answer, true)); } auto min_len = task.seq_tokens.front().size(); for (auto& seq : task.seq_tokens) { @@ -1403,7 +1414,7 @@ static bool multiple_choice_prepare_one_task(llama_context * ctx, multiple_choic // git@hf.co:datasets/Stevross/mmlu // https://huggingface.co/datasets/truthful_qa // -static void multiple_choice_score(llama_context * ctx, const gpt_params & params) { +static void multiple_choice_score(llama_context * ctx, const common_params & params) { std::istringstream strstream(params.prompt); uint32_t n_task; @@ -1509,17 +1520,18 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params LOG("\ntask\tacc_norm\n"); - const int n_vocab = llama_n_vocab(llama_get_model(ctx)); const int n_ctx = llama_n_ctx(ctx); const int n_batch = params.n_batch; + const int n_vocab = llama_n_vocab(llama_get_model(ctx)); + const int max_tasks_per_batch = 32; const int max_seq = std::min(4*max_tasks_per_batch, (int) llama_n_seq_max(ctx)); llama_batch batch = llama_batch_init(n_ctx, 0, max_seq); std::vector tok_logits(n_vocab); - std::vector batch_logits(n_vocab*n_ctx); + std::vector batch_logits(size_t(n_ctx)*n_vocab); std::vector> eval_pairs; std::vector eval_results; @@ -1536,7 +1548,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params size_t i1 = i0; size_t i_logits = 0; // this tells us how many logits were needed before this point in the batch - llama_batch_clear(batch); + common_batch_clear(batch); // batch as much tasks as possible into the available context // each task has 4 unique sequence ids - one for each ending @@ -1559,7 +1571,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params for (size_t i = 0; i < cur_task.common_prefix; ++i) { //llama_batch_add(batch, cur_task.seq_tokens[0][i], i, { s0 + 0, s0 + 1, s0 + 2, s0 + 3}, false); - llama_batch_add(batch, cur_task.seq_tokens[0][i], i, batch_indeces, false); + common_batch_add(batch, cur_task.seq_tokens[0][i], i, batch_indeces, false); } batch.logits[batch.n_tokens - 1] = true; // we need logits for the last token of the common prefix n_logits += 1; @@ -1569,7 +1581,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params // TODO: don't evaluate the last token of each sequence for (size_t i = cur_task.common_prefix; i < seq_tokens_size; ++i) { const bool needs_logits = i < seq_tokens_size - 1; - llama_batch_add(batch, cur_task.seq_tokens[s][i], i, { s0 + s }, needs_logits); + common_batch_add(batch, cur_task.seq_tokens[s][i], i, { s0 + s }, needs_logits); n_logits += needs_logits; } } @@ -1627,7 +1639,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params //LOG("\n common_prefix: %zu\n", cur_task.common_prefix); // get the logits of the last token of the common prefix - std::memcpy(tok_logits.data(), batch_logits.data() + n_vocab*cur_task.i_logits, n_vocab*sizeof(float)); + std::memcpy(tok_logits.data(), batch_logits.data() + cur_task.i_logits*n_vocab, n_vocab*sizeof(float)); const auto first_probs = softmax(tok_logits); @@ -1683,7 +1695,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params LOG_INF("\n"); } -static void kl_divergence(llama_context * ctx, const gpt_params & params) { +static void kl_divergence(llama_context * ctx, const common_params & params) { if (params.logits_file.empty()) { LOG_ERR("%s: you must provide a name of a file containing the log probabilities of the base model\n", __func__); return; @@ -1709,7 +1721,8 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) { __func__, params.logits_file.c_str(), n_ctx, params.n_ctx); } - int n_vocab, n_chunk; + int n_vocab; + int n_chunk; in.read((char *)&n_vocab, sizeof(n_vocab)); in.read((char *)&n_chunk, sizeof(n_chunk)); if (in.fail()) { @@ -1720,7 +1733,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) { LOG_ERR("%s: inconsistent vocabulary (%d vs %d)\n", __func__, n_vocab, llama_n_vocab(llama_get_model(ctx))); } - std::vector tokens(n_ctx * n_chunk); + std::vector tokens(size_t(n_ctx) * n_chunk); if (in.read((char *)tokens.data(), tokens.size()*sizeof(tokens[0])).fail()) { LOG_ERR("%s: failed reading evaluation tokens from %s\n", __func__, params.logits_file.c_str()); return; @@ -1737,7 +1750,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) { std::vector p_diff_values(size_t(n_ctx - 1 - n_ctx/2)*n_chunk); std::vector logits; if (num_batches > 1) { - logits.reserve(n_ctx * n_vocab); + logits.reserve(size_t(n_ctx) * n_vocab); } std::vector workers(std::thread::hardware_concurrency() - 1); @@ -1801,7 +1814,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) { if (num_batches > 1) { const auto * batch_logits = llama_get_logits(ctx); - logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab); + logits.insert(logits.end(), batch_logits, batch_logits + size_t(batch_size) * n_vocab); } } @@ -1822,7 +1835,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) { const int first = n_ctx/2; const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx); - process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first, + process_logits(n_vocab, all_logits + size_t(first)*n_vocab, tokens.data() + start + first, n_ctx - 1 - first, workers, log_probs_uint16, kld, kld_ptr, p_diff_ptr); p_diff_ptr += n_ctx - 1 - first; kld_ptr += n_ctx - 1 - first; @@ -1955,17 +1968,17 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) { } int main(int argc, char ** argv) { - gpt_params params; + common_params params; params.n_ctx = 512; params.logits_all = true; params.escape = false; - if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_PERPLEXITY)) { + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PERPLEXITY)) { return 1; } - gpt_init(); + common_init(); const int32_t n_ctx = params.n_ctx; @@ -2004,7 +2017,7 @@ int main(int argc, char ** argv) { llama_numa_init(params.numa); // load the model and apply lora adapter, if any - llama_init_result llama_init = llama_init_from_gpt_params(params); + common_init_result llama_init = common_init_from_params(params); llama_model * model = llama_init.model; llama_context * ctx = llama_init.context; @@ -2023,7 +2036,7 @@ int main(int argc, char ** argv) { // print system information { LOG_INF("\n"); - LOG_INF("%s\n", gpt_params_get_system_info(params).c_str()); + LOG_INF("%s\n", common_params_get_system_info(params).c_str()); } struct results_perplexity results; diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp index 498cbbe3c..e372856c6 100644 --- a/examples/quantize-stats/quantize-stats.cpp +++ b/examples/quantize-stats/quantize-stats.cpp @@ -142,7 +142,7 @@ static bool tensor_is_contiguous(const struct ggml_tensor * tensor) { } static void test_roundtrip_on_chunk( - const ggml_tensor * layer, int64_t offset, int64_t chunk_size, const ggml_type_traits_t & qfns, bool use_reference, + const ggml_tensor * layer, int64_t offset, int64_t chunk_size, const ggml_type_traits & qfns, bool use_reference, float * input_scratch, char * quantized_scratch, float * output_scratch, error_stats & stats ) { if (layer->type == GGML_TYPE_F16) { @@ -166,7 +166,7 @@ static void test_roundtrip_on_chunk( // Run quantization function for a single layer and update error stats static void test_roundtrip_on_layer( - std::string & name, bool print_layer_stats, const ggml_type_traits_t & qfns, bool use_reference, + std::string & name, bool print_layer_stats, const ggml_type_traits & qfns, bool use_reference, const ggml_tensor * layer, std::vector & input_scratch, std::vector & quantized_scratch, std::vector & output_scratch, error_stats & total_error, int max_thread = 0 ) { @@ -371,8 +371,8 @@ int main(int argc, char ** argv) { if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) { continue; } - ggml_type_traits_t qfns = ggml_internal_get_type_traits(type); - if (qfns.from_float && qfns.to_float) { + const auto * qfns = ggml_get_type_traits(type); + if (qfns->from_float && qfns->to_float) { if (params.verbose) { printf("testing %s ...\n", ggml_type_name(type)); } @@ -393,7 +393,7 @@ int main(int argc, char ** argv) { test_roundtrip_on_layer( layer_name, params.per_layer_stats, - qfns, + *qfns, params.reference, kv_tensor.second, input_scratch, diff --git a/examples/retrieval/retrieval.cpp b/examples/retrieval/retrieval.cpp index 5971690f1..1768aae51 100644 --- a/examples/retrieval/retrieval.cpp +++ b/examples/retrieval/retrieval.cpp @@ -77,7 +77,7 @@ static std::vector chunk_file(const std::string & filename, int chunk_siz static void batch_add_seq(llama_batch & batch, const std::vector & tokens, llama_seq_id seq_id) { size_t n_tokens = tokens.size(); for (size_t i = 0; i < n_tokens; i++) { - llama_batch_add(batch, tokens[i], i, { seq_id }, true); + common_batch_add(batch, tokens[i], i, { seq_id }, true); } } @@ -107,18 +107,18 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu } float * out = output + batch.seq_id[i][0] * n_embd; - llama_embd_normalize(embd, out, n_embd); + common_embd_normalize(embd, out, n_embd); } } int main(int argc, char ** argv) { - gpt_params params; + common_params params; - if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_RETRIEVAL, print_usage)) { + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_RETRIEVAL, print_usage)) { return 1; } - gpt_init(); + common_init(); // For BERT models, batch size must be equal to ubatch size params.n_ubatch = params.n_batch; @@ -149,7 +149,7 @@ int main(int argc, char ** argv) { llama_numa_init(params.numa); // load the model - llama_init_result llama_init = llama_init_from_gpt_params(params); + common_init_result llama_init = common_init_from_params(params); llama_model * model = llama_init.model; llama_context * ctx = llama_init.context; @@ -176,7 +176,7 @@ int main(int argc, char ** argv) { // print system information { LOG_INF("\n"); - LOG_INF("%s\n", gpt_params_get_system_info(params).c_str()); + LOG_INF("%s\n", common_params_get_system_info(params).c_str()); } // max batch size @@ -185,7 +185,7 @@ int main(int argc, char ** argv) { // tokenize the prompts and trim for (auto & chunk : chunks) { - auto inp = ::llama_tokenize(ctx, chunk.textdata, true, false); + auto inp = common_tokenize(ctx, chunk.textdata, true, false); if (inp.size() > n_batch) { LOG_ERR("%s: chunk size (%lld) exceeds batch size (%lld), increase batch size and re-run\n", __func__, (long long int) inp.size(), (long long int) n_batch); @@ -204,7 +204,7 @@ int main(int argc, char ** argv) { LOG_INF("%s: prompt %d: '%s'\n", __func__, i, chunks[i].textdata.c_str()); LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, chunks[i].tokens.size()); for (int j = 0; j < (int) chunks[i].tokens.size(); j++) { - LOG_INF("%6d -> '%s'\n", chunks[i].tokens[j], llama_token_to_piece(ctx, chunks[i].tokens[j]).c_str()); + LOG_INF("%6d -> '%s'\n", chunks[i].tokens[j], common_token_to_piece(ctx, chunks[i].tokens[j]).c_str()); } LOG_INF("\n\n"); } @@ -232,7 +232,7 @@ int main(int argc, char ** argv) { if (batch.n_tokens + n_toks > n_batch) { float * out = emb + p * n_embd; batch_decode(ctx, batch, out, s, n_embd); - llama_batch_clear(batch); + common_batch_clear(batch); p += s; s = 0; } @@ -260,20 +260,20 @@ int main(int argc, char ** argv) { while (true) { LOG("Enter query: "); std::getline(std::cin, query); - std::vector query_tokens = llama_tokenize(ctx, query, true); + std::vector query_tokens = common_tokenize(ctx, query, true); batch_add_seq(query_batch, query_tokens, 0); std::vector query_emb(n_embd, 0); batch_decode(ctx, query_batch, query_emb.data(), 1, n_embd); - llama_batch_clear(query_batch); + common_batch_clear(query_batch); // compute cosine similarities { std::vector> similarities; for (int i = 0; i < n_chunks; i++) { - float sim = llama_embd_similarity_cos(chunks[i].embedding.data(), query_emb.data(), n_embd); + float sim = common_embd_similarity_cos(chunks[i].embedding.data(), query_emb.data(), n_embd); similarities.push_back(std::make_pair(i, sim)); } diff --git a/examples/rpc/rpc-server.cpp b/examples/rpc/rpc-server.cpp index 355125831..8354e37e5 100644 --- a/examples/rpc/rpc-server.cpp +++ b/examples/rpc/rpc-server.cpp @@ -151,7 +151,7 @@ int main(int argc, char * argv[]) { get_backend_memory(&free_mem, &total_mem); } printf("Starting RPC server on %s, backend memory: %zu MB\n", endpoint.c_str(), free_mem / (1024 * 1024)); - start_rpc_server(backend, endpoint.c_str(), free_mem, total_mem); + ggml_backend_rpc_start_server(backend, endpoint.c_str(), free_mem, total_mem); ggml_backend_free(backend); return 0; } diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp index 0117d9357..3866cfa27 100644 --- a/examples/save-load-state/save-load-state.cpp +++ b/examples/save-load-state/save-load-state.cpp @@ -6,12 +6,12 @@ #include int main(int argc, char ** argv) { - gpt_params params; + common_params params; params.prompt = "The quick brown fox"; params.sparams.seed = 1234; - if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) { + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) { return 1; } @@ -28,7 +28,7 @@ int main(int argc, char ** argv) { std::string result2; // init - llama_init_result llama_init = llama_init_from_gpt_params(params); + common_init_result llama_init = common_init_from_params(params); llama_model * model = llama_init.model; llama_context * ctx = llama_init.context; @@ -46,7 +46,7 @@ int main(int argc, char ** argv) { llama_sampler_chain_add(smpl, llama_sampler_init_dist(params.sparams.seed)); // tokenize prompt - auto tokens = llama_tokenize(ctx, params.prompt, true); + auto tokens = common_tokenize(ctx, params.prompt, true); // evaluate prompt llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), n_past, 0)); @@ -72,7 +72,7 @@ int main(int argc, char ** argv) { for (auto i = 0; i < params.n_predict; i++) { auto next_token = llama_sampler_sample(smpl, ctx, -1); - auto next_token_str = llama_token_to_piece(ctx, next_token); + auto next_token_str = common_token_to_piece(ctx, next_token); printf("%s", next_token_str.c_str()); result0 += next_token_str; @@ -92,7 +92,7 @@ int main(int argc, char ** argv) { llama_free(ctx); // make new context - auto * ctx2 = llama_new_context_with_model(model, llama_context_params_from_gpt_params(params)); + auto * ctx2 = llama_new_context_with_model(model, common_context_params_to_llama(params)); llama_sampler * smpl2 = llama_sampler_chain_init(sparams); @@ -128,7 +128,7 @@ int main(int argc, char ** argv) { // second run for (auto i = 0; i < params.n_predict; i++) { auto next_token = llama_sampler_sample(smpl2, ctx2, -1); - auto next_token_str = llama_token_to_piece(ctx2, next_token); + auto next_token_str = common_token_to_piece(ctx2, next_token); printf("%s", next_token_str.c_str()); result1 += next_token_str; @@ -152,7 +152,7 @@ int main(int argc, char ** argv) { } // make new context - auto * ctx3 = llama_new_context_with_model(model, llama_context_params_from_gpt_params(params)); + auto * ctx3 = llama_new_context_with_model(model, common_context_params_to_llama(params)); llama_sampler * smpl3 = llama_sampler_chain_init(sparams); @@ -216,7 +216,7 @@ int main(int argc, char ** argv) { // third run with seq 1 instead of 0 for (auto i = 0; i < params.n_predict; i++) { auto next_token = llama_sampler_sample(smpl3, ctx3, -1); - auto next_token_str = llama_token_to_piece(ctx3, next_token); + auto next_token_str = common_token_to_piece(ctx3, next_token); printf("%s", next_token_str.c_str()); result2 += next_token_str; diff --git a/examples/server/README.md b/examples/server/README.md index 6253de43c..fcdb02afd 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -18,6 +18,8 @@ The project is under active development, and we are [looking for feedback and co ## Usage + + **Common params** | Argument | Explanation | @@ -58,8 +60,6 @@ The project is under active development, and we are [looking for feedback and co | `--yarn-attn-factor N` | YaRN: scale sqrt(t) or attention magnitude (default: 1.0)
(env: LLAMA_ARG_YARN_ATTN_FACTOR) | | `--yarn-beta-slow N` | YaRN: high correction dim or alpha (default: 1.0)
(env: LLAMA_ARG_YARN_BETA_SLOW) | | `--yarn-beta-fast N` | YaRN: low correction dim or beta (default: 32.0)
(env: LLAMA_ARG_YARN_BETA_FAST) | -| `-gan, --grp-attn-n N` | group-attention factor (default: 1)
(env: LLAMA_ARG_GRP_ATTN_N) | -| `-gaw, --grp-attn-w N` | group-attention width (default: 512.0)
(env: LLAMA_ARG_GRP_ATTN_W) | | `-dkvc, --dump-kv-cache` | verbose print of the KV cache | | `-nkvo, --no-kv-offload` | disable KV offload
(env: LLAMA_ARG_NO_KV_OFFLOAD) | | `-ctk, --cache-type-k TYPE` | KV cache data type for K (default: f16)
(env: LLAMA_ARG_CACHE_TYPE_K) | @@ -147,9 +147,11 @@ The project is under active development, and we are [looking for feedback and co | `--ssl-cert-file FNAME` | path to file a PEM-encoded SSL certificate
(env: LLAMA_ARG_SSL_CERT_FILE) | | `-to, --timeout N` | server read/write timeout in seconds (default: 600)
(env: LLAMA_ARG_TIMEOUT) | | `--threads-http N` | number of threads used to process HTTP requests (default: -1)
(env: LLAMA_ARG_THREADS_HTTP) | -| `-spf, --system-prompt-file FNAME` | set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications | +| `--cache-reuse N` | min chunk size to attempt reusing from the cache via KV shifting (default: 0)
(env: LLAMA_ARG_CACHE_REUSE) | | `--metrics` | enable prometheus compatible metrics endpoint (default: disabled)
(env: LLAMA_ARG_ENDPOINT_METRICS) | -| `--no-slots` | disables slots monitoring endpoint (default: enabled)
(env: LLAMA_ARG_NO_ENDPOINT_SLOTS) | +| `--slots` | enable slots monitoring endpoint (default: disabled)
(env: LLAMA_ARG_ENDPOINT_SLOTS) | +| `--props` | enable changing global properties via POST /props (default: disabled)
(env: LLAMA_ARG_ENDPOINT_PROPS) | +| `--no-slots` | disables slots monitoring endpoint
(env: LLAMA_ARG_NO_ENDPOINT_SLOTS) | | `--slot-save-path PATH` | path to save slot kv cache (default: disabled) | | `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted:
https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
(env: LLAMA_ARG_CHAT_TEMPLATE) | | `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.50, 0.0 = disabled)
| @@ -316,7 +318,6 @@ node index.js - The prompt is a string or an array with the first element given as a string - The model's `tokenizer.ggml.add_bos_token` metadata is `true` - - The system prompt is empty `temperature`: Adjust the randomness of the generated text. Default: `0.8` @@ -374,14 +375,14 @@ node index.js `min_keep`: If greater than 0, force samplers to return N possible tokens at minimum. Default: `0` + `t_max_predict_ms`: Set a time limit in milliseconds for the prediction (a.k.a. text-generation) phase. The timeout will trigger if the generation takes more than the specified time (measured since the first token was generated) and if a new-line character has already been generated. Useful for FIM applications. Default: `0`, which is disabled. + `image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be reference in `prompt`. You can determine the place of the image in the prompt as in the following: `USER:[img-12]Describe the image in detail.\nASSISTANT:`. In this case, `[img-12]` will be replaced by the embeddings of the image with id `12` in the following `image_data` array: `{..., "image_data": [{"data": "", "id": 12}]}`. Use `image_data` only with multimodal models, e.g., LLaVA. `id_slot`: Assign the completion task to an specific slot. If is -1 the task will be assigned to a Idle slot. Default: `-1` `cache_prompt`: Re-use KV cache from a previous request if possible. This way the common prefix does not have to be re-processed, only the suffix that differs between the requests. Because (depending on the backend) the logits are **not** guaranteed to be bit-for-bit identical for different batch sizes (prompt processing vs. token generation) enabling this option can cause nondeterministic results. Default: `false` - `system_prompt`: Change the system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime) - `samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. Default: `["top_k", "tfs_z", "typical_p", "top_p", "min_p", "temperature"]` - these are all the available values. **Response format** @@ -519,34 +520,62 @@ Requires a reranker model (such as [bge-reranker-v2-m3](https://huggingface.co/B Takes a prefix and a suffix and returns the predicted completion as stream. - *Options:* +*Options:* - `input_prefix`: Set the prefix of the code to infill. +- `input_prefix`: Set the prefix of the code to infill. +- `input_suffix`: Set the suffix of the code to infill. +- `input_extra`: Additional context inserted before the FIM prefix. +- `prompt`: Added after the `FIM_MID` token - `input_suffix`: Set the suffix of the code to infill. +`input_extra` is array of `{"filename": string, "text": string}` objects. - It also accepts all the options of `/completion` except `stream` and `prompt`. +The endpoint also accepts all the options of `/completion`. -- **GET** `/props`: Return current server settings. +If the model has `FIM_REPO` and `FIM_FILE_SEP` tokens, the [repo-level pattern](https://arxiv.org/pdf/2409.12186) is used: + +```txt +myproject +{chunk 0 filename} +{chunk 0 text} +{chunk 1 filename} +{chunk 1 text} +... +filename +[input_prefix][input_suffix][prompt] +``` + +If the tokens are missing, then the extra context is simply prefixed at the start: + +```txt +[input_extra][input_prefix][input_suffix][prompt] +``` + +### **GET** `/props`: Get server global properties. + +This endpoint is public (no API key check). By default, it is read-only. To make POST request to change global properties, you need to start server with `--props` **Response format** ```json { - "assistant_name": "", - "user_name": "", "default_generation_settings": { ... }, "total_slots": 1, "chat_template": "" } ``` -- `assistant_name` - the required assistant name to generate the prompt in case you have specified a system prompt for all slots. -- `user_name` - the required anti-prompt to generate the prompt in case you have specified a system prompt for all slots. - `default_generation_settings` - the default generation settings for the `/completion` endpoint, which has the same fields as the `generation_settings` response object from the `/completion` endpoint. - `total_slots` - the total number of slots for process requests (defined by `--parallel` option) - `chat_template` - the model's original Jinja2 prompt template +### POST `/props`: Change server global properties. + +To use this endpoint with POST method, you need to start server with `--props` + +*Options:* + +- None yet + ### POST `/v1/chat/completions`: OpenAI-compatible Chat Completions API Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming mode are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. Only models with a [supported chat template](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) can be used optimally with this endpoint. By default, the ChatML template will be used. @@ -813,28 +842,6 @@ To know the `id` of the adapter, use GET `/lora-adapters` ## More examples -### Change system prompt on runtime - -To use the server example to serve multiple chat-type clients while keeping the same system prompt, you can utilize the option `system_prompt`. This only needs to be used once. - -`prompt`: Specify a context that you want all connecting clients to respect. - -`anti_prompt`: Specify the word you want to use to instruct the model to stop. This must be sent to each client through the `/props` endpoint. - -`assistant_name`: The bot's name is necessary for each customer to generate the prompt. This must be sent to each client through the `/props` endpoint. - -```json -{ - "system_prompt": { - "prompt": "Transcript of a never ending dialog, where the User interacts with an Assistant.\nThe Assistant is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.\nUser: Recommend a nice restaurant in the area.\nAssistant: I recommend the restaurant \"The Golden Duck\". It is a 5 star restaurant with a great view of the city. The food is delicious and the service is excellent. The prices are reasonable and the portions are generous. The restaurant is located at 123 Main Street, New York, NY 10001. The phone number is (212) 555-1234. The hours are Monday through Friday from 11:00 am to 10:00 pm. The restaurant is closed on Saturdays and Sundays.\nUser: Who is Richard Feynman?\nAssistant: Richard Feynman was an American physicist who is best known for his work in quantum mechanics and particle physics. He was awarded the Nobel Prize in Physics in 1965 for his contributions to the development of quantum electrodynamics. He was a popular lecturer and author, and he wrote several books, including \"Surely You're Joking, Mr. Feynman!\" and \"What Do You Care What Other People Think?\".\nUser:", - "anti_prompt": "User:", - "assistant_name": "Assistant:" - } -} -``` - -**NOTE**: You can do this automatically when starting the server by simply creating a .json file with these options and using the CLI option `-spf FNAME` or `--system-prompt-file FNAME`. - ### Interactive mode Check the sample in [chat.mjs](chat.mjs). diff --git a/examples/server/public/index-new.html b/examples/server/public/index-new.html index c87dd8f1e..ad4183cd9 100644 --- a/examples/server/public/index-new.html +++ b/examples/server/public/index-new.html @@ -43,6 +43,8 @@ top_k: 0, // <= 0 to use vocab size top_p: 1.0, // 1.0 = disabled min_p: 0.05, // 0 = disabled; recommended for non-english: ~ 0.4 + xtc_probability: 0.0, // 0 = disabled; + xtc_threshold: 0.1, // > 0.5 disables XTC; tfs_z: 1.0, // 1.0 = disabled typical_p: 1.0, // 1.0 = disabled presence_penalty: 0.0, // 0.0 = disabled @@ -836,6 +838,8 @@ return html` ${FloatField({ label: "TFS-Z", title: "Activates tail-free sampling, a method used to limit the prediction of tokens that are too frequent. The parameter z controls the strength of this limitation. A value of 1.0 means that this function is deactivated.", max: 1.0, min: 0.0, name: "tfs_z", step: 0.01, value: params.value.tfs_z })} ${FloatField({ label: "Frequency Penalty", title: "A penalty that is applied based on the frequency with which certain tokens occur in the training data set. A higher value results in rare tokens being favoured.", max: 1.0, min: 0.0, name: "frequency_penalty", step: 0.01, value: params.value.frequency_penalty })} ${FloatField({ label: "Typical-P", title: "Activates local typical sampling, a method used to limit the prediction of tokens that are atypical in the current context. The parameter p controls the strength of this limitation. A value of 1.0 means that this function is deactivated.", max: 1.0, min: 0.0, name: "typical_p", step: 0.01, value: params.value.typical_p })} + ${FloatField({ label: "XTC probability", title: "Sets the chance for token removal (checked once on sampler start)", max: 1.0, min: 0.0, name: "xtc_probability", step: 0.01, value: params.value.xtc_probability })} + ${FloatField({ label: "XTC threshold", title: "Sets a minimum probability threshold for tokens to be removed", max: 0.5, min: 0.0, name: "xtc_threshold", step: 0.01, value: params.value.xtc_threshold })} ${IntField({ label: "Min Keep", title: "If greater than 0, samplers are forced to return N possible tokens at minimum. Default is 0", max: 10, min: 0, name: "min_keep", value: params.value.min_keep })} @@ -1132,6 +1136,8 @@ document.addEventListener('DOMContentLoaded', (event) => { const snapSettings = { temperature: { snapValue: 1.0, snapRangeMultiplier: 6 }, min_p: { snapValue: 0.05, snapRangeMultiplier: 2 }, + xtc_probability: { snapValue: 0.0, snapRangeMultiplier: 4 }, + xtc_threshold: { snapValue: 0.5, snapRangeMultiplier: 4 }, top_p: { snapValue: 1.0, snapRangeMultiplier: 4 }, tfs_z: { snapValue: 1.0, snapRangeMultiplier: 4 }, typical_p: { snapValue: 1.0, snapRangeMultiplier: 4 }, diff --git a/examples/server/public/index.html b/examples/server/public/index.html index 07fec6a38..88065705f 100644 --- a/examples/server/public/index.html +++ b/examples/server/public/index.html @@ -307,6 +307,8 @@ top_k: 40, // <= 0 to use vocab size top_p: 0.95, // 1.0 = disabled min_p: 0.05, // 0 = disabled + xtc_probability: 0.0, // 0 = disabled; + xtc_threshold: 0.1, // > 0.5 disables XTC; tfs_z: 1.0, // 1.0 = disabled typical_p: 1.0, // 1.0 = disabled presence_penalty: 0.0, // 0.0 = disabled @@ -1013,6 +1015,8 @@ ${FloatField({ label: "Typical P", max: 1.0, min: 0.0, name: "typical_p", step: 0.01, value: params.value.typical_p })} ${FloatField({ label: "Presence penalty", max: 1.0, min: 0.0, name: "presence_penalty", step: 0.01, value: params.value.presence_penalty })} ${FloatField({ label: "Frequency penalty", max: 1.0, min: 0.0, name: "frequency_penalty", step: 0.01, value: params.value.frequency_penalty })} + ${FloatField({ label: "XTC probability", max: 1.0, min: 0.0, name: "xtc_probability", step: 0.01, value: params.value.xtc_probability })} + ${FloatField({ label: "XTC threshold", max: 0.5, min: 0.0, name: "xtc_threshold", step: 0.01, value: params.value.xtc_threshold })}
diff --git a/examples/server/public/index.js b/examples/server/public/index.js index fe615ca25..32ec6e9e1 100644 --- a/examples/server/public/index.js +++ b/examples/server/public/index.js @@ -1 +1 @@ -const t=Symbol.for("preact-signals");function n(){if(r>1){r--;return}let t,n=!1;while(void 0!==i){let _=i;i=void 0;u++;while(void 0!==_){const i=_.o;_.o=void 0;_.f&=-3;if(!(8&_.f)&&h(_))try{_.c()}catch(e){if(!n){t=e;n=!0}}_=i}}u=0;r--;if(n)throw t}function e(t){if(r>0)return t();r++;try{return t()}finally{n()}}let _,i;function o(t){const n=_;_=void 0;try{return t()}finally{_=n}}let r=0,u=0,l=0;function f(t){if(void 0===_)return;let n=t.n;if(void 0===n||n.t!==_){n={i:0,S:t,p:_.s,n:void 0,t:_,e:void 0,x:void 0,r:n};if(void 0!==_.s)_.s.n=n;_.s=n;t.n=n;if(32&_.f)t.S(n);return n}else if(-1===n.i){n.i=0;if(void 0!==n.n){n.n.p=n.p;if(void 0!==n.p)n.p.n=n.n;n.p=_.s;n.n=void 0;_.s.n=n;_.s=n}return n}}function s(t){this.v=t;this.i=0;this.n=void 0;this.t=void 0}s.prototype.brand=t;s.prototype.h=function(){return!0};s.prototype.S=function(t){if(this.t!==t&&void 0===t.e){t.x=this.t;if(void 0!==this.t)this.t.e=t;this.t=t}};s.prototype.U=function(t){if(void 0!==this.t){const n=t.e,e=t.x;if(void 0!==n){n.x=e;t.e=void 0}if(void 0!==e){e.e=n;t.x=void 0}if(t===this.t)this.t=e}};s.prototype.subscribe=function(t){return k(()=>{const n=this.value,e=_;_=void 0;try{t(n)}finally{_=e}})};s.prototype.valueOf=function(){return this.value};s.prototype.toString=function(){return this.value+""};s.prototype.toJSON=function(){return this.value};s.prototype.peek=function(){const t=_;_=void 0;try{return this.value}finally{_=t}};Object.defineProperty(s.prototype,"value",{get(){const t=f(this);if(void 0!==t)t.i=this.i;return this.v},set(t){if(t!==this.v){if(u>100)throw new Error("Cycle detected");this.v=t;this.i++;l++;r++;try{for(let t=this.t;void 0!==t;t=t.x)t.t.N()}finally{n()}}}});function c(t){return new s(t)}function h(t){for(let n=t.s;void 0!==n;n=n.n)if(n.S.i!==n.i||!n.S.h()||n.S.i!==n.i)return!0;return!1}function a(t){for(let n=t.s;void 0!==n;n=n.n){const e=n.S.n;if(void 0!==e)n.r=e;n.S.n=n;n.i=-1;if(void 0===n.n){t.s=n;break}}}function p(t){let n,e=t.s;while(void 0!==e){const t=e.p;if(-1===e.i){e.S.U(e);if(void 0!==t)t.n=e.n;if(void 0!==e.n)e.n.p=t}else n=e;e.S.n=e.r;if(void 0!==e.r)e.r=void 0;e=t}t.s=n}function d(t){s.call(this,void 0);this.x=t;this.s=void 0;this.g=l-1;this.f=4}(d.prototype=new s).h=function(){this.f&=-3;if(1&this.f)return!1;if(32==(36&this.f))return!0;this.f&=-5;if(this.g===l)return!0;this.g=l;this.f|=1;if(this.i>0&&!h(this)){this.f&=-2;return!0}const t=_;try{a(this);_=this;const t=this.x();if(16&this.f||this.v!==t||0===this.i){this.v=t;this.f&=-17;this.i++}}catch(t){this.v=t;this.f|=16;this.i++}_=t;p(this);this.f&=-2;return!0};d.prototype.S=function(t){if(void 0===this.t){this.f|=36;for(let t=this.s;void 0!==t;t=t.n)t.S.S(t)}s.prototype.S.call(this,t)};d.prototype.U=function(t){if(void 0!==this.t){s.prototype.U.call(this,t);if(void 0===this.t){this.f&=-33;for(let t=this.s;void 0!==t;t=t.n)t.S.U(t)}}};d.prototype.N=function(){if(!(2&this.f)){this.f|=6;for(let t=this.t;void 0!==t;t=t.x)t.t.N()}};Object.defineProperty(d.prototype,"value",{get(){if(1&this.f)throw new Error("Cycle detected");const t=f(this);this.h();if(void 0!==t)t.i=this.i;if(16&this.f)throw this.v;return this.v}});function v(t){return new d(t)}function y(t){const e=t.u;t.u=void 0;if("function"==typeof e){r++;const i=_;_=void 0;try{e()}catch(n){t.f&=-2;t.f|=8;m(t);throw n}finally{_=i;n()}}}function m(t){for(let n=t.s;void 0!==n;n=n.n)n.S.U(n);t.x=void 0;t.s=void 0;y(t)}function g(t){if(_!==this)throw new Error("Out-of-order effect");p(this);_=t;this.f&=-2;if(8&this.f)m(this);n()}function b(t){this.x=t;this.u=void 0;this.s=void 0;this.o=void 0;this.f=32}b.prototype.c=function(){const t=this.S();try{if(8&this.f)return;if(void 0===this.x)return;const n=this.x();if("function"==typeof n)this.u=n}finally{t()}};b.prototype.S=function(){if(1&this.f)throw new Error("Cycle detected");this.f|=1;this.f&=-9;y(this);a(this);r++;const t=_;_=this;return g.bind(this,t)};b.prototype.N=function(){if(!(2&this.f)){this.f|=2;this.o=i;i=this}};b.prototype.d=function(){this.f|=8;if(!(1&this.f))m(this)};function k(t){const n=new b(t);try{n.c()}catch(t){n.d();throw t}return n.d.bind(n)}var w,S,x,C,U,E,H,P,N,$,T,D,M={},F=[],A=/acit|ex(?:s|g|n|p|$)|rph|grid|ows|mnc|ntw|ine[ch]|zoo|^ord|itera/i,W=Array.isArray;function L(t,n){for(var e in n)t[e]=n[e];return t}function O(t){var n=t.parentNode;n&&n.removeChild(t)}function R(t,n,e){var _,i,o,r={};for(o in n)"key"==o?_=n[o]:"ref"==o?i=n[o]:r[o]=n[o];if(arguments.length>2&&(r.children=arguments.length>3?w.call(arguments,2):e),"function"==typeof t&&null!=t.defaultProps)for(o in t.defaultProps)void 0===r[o]&&(r[o]=t.defaultProps[o]);return I(t,r,_,i,null)}function I(t,n,e,_,i){var o={type:t,props:n,key:e,ref:_,__k:null,__:null,__b:0,__e:null,__d:void 0,__c:null,constructor:void 0,__v:null==i?++x:i,__i:-1,__u:0};return null==i&&null!=S.vnode&&S.vnode(o),o}function V(){return{current:null}}function j(t){return t.children}function q(t,n){this.props=t,this.context=n}function B(t,n){if(null==n)return t.__?B(t.__,t.__i+1):null;for(var e;nn&&U.sort(P));J.__r=0}function K(t,n,e,_,i,o,r,u,l,f,s){var c,h,a,p,d,v=_&&_.__k||F,y=n.length;for(e.__d=l,Q(e,n,v),l=e.__d,c=0;c0?I(i.type,i.props,i.key,i.ref?i.ref:null,i.__v):i)?(i.__=t,i.__b=t.__b+1,u=Z(i,e,r,s),i.__i=u,o=null,-1!==u&&(s--,(o=e[u])&&(o.__u|=131072)),null==o||null===o.__v?(-1==u&&c--,"function"!=typeof i.type&&(i.__u|=65536)):u!==r&&(u==r-1?c--:u==r+1?c++:u>r?s>l-r?c+=u-r:c--:u(null!=l&&0==(131072&l.__u)?1:0))for(;r>=0||u=0){if((l=n[r])&&0==(131072&l.__u)&&i==l.key&&o===l.type)return r;r--}if(u2&&(u.children=arguments.length>3?w.call(arguments,2):e),I(t.type,u,_||t.key,i||t.ref,null)}function ht(t,n){var e={__c:n="__cC"+D++,__:t,Consumer:function(t,n){return t.children(n)},Provider:function(t){var e,_;return this.getChildContext||(e=[],(_={})[n]=this,this.getChildContext=function(){return _},this.componentWillUnmount=function(){e=null},this.shouldComponentUpdate=function(t){this.props.value!==t.value&&e.some((function(t){t.__e=!0,G(t)}))},this.sub=function(t){e.push(t);var n=t.componentWillUnmount;t.componentWillUnmount=function(){e&&e.splice(e.indexOf(t),1),n&&n.call(t)}}),t.children}};return e.Provider.__=e.Consumer.contextType=e}w=F.slice,S={__e:function(t,n,e,_){for(var i,o,r;n=n.__;)if((i=n.__c)&&!i.__)try{if((o=i.constructor)&&null!=o.getDerivedStateFromError&&(i.setState(o.getDerivedStateFromError(t)),r=i.__d),null!=i.componentDidCatch&&(i.componentDidCatch(t,_||{}),r=i.__d),r)return i.__E=i}catch(n){t=n}throw t}},x=0,C=function(t){return null!=t&&null==t.constructor},q.prototype.setState=function(t,n){var e;e=null!=this.__s&&this.__s!==this.state?this.__s:this.__s=L({},this.state),"function"==typeof t&&(t=t(L({},e),this.props)),t&&L(e,t),null!=t&&this.__v&&(n&&this._sb.push(n),G(this))},q.prototype.forceUpdate=function(t){this.__v&&(this.__e=!0,t&&this.__h.push(t),G(this))},q.prototype.render=j,U=[],H="function"==typeof Promise?Promise.prototype.then.bind(Promise.resolve()):setTimeout,P=function(t,n){return t.__v.__b-n.__v.__b},J.__r=0,N=0,$=et(!1),T=et(!0),D=0;var at,pt,dt,vt,yt=0,mt=[],gt=S,bt=gt.__b,kt=gt.__r,wt=gt.diffed,St=gt.__c,xt=gt.unmount,Ct=gt.__;function Ut(t,n){gt.__h&>.__h(pt,t,yt||n),yt=0;var e=pt.__H||(pt.__H={__:[],__h:[]});return t>=e.__.length&&e.__.push({}),e.__[t]}function Et(t){return yt=1,Ht(Bt,t)}function Ht(t,n,e){var _=Ut(at++,2);if(_.t=t,!_.__c&&(_.__=[e?e(n):Bt(void 0,n),function(t){var n=_.__N?_.__N[0]:_.__[0],e=_.t(n,t);n!==e&&(_.__N=[e,_.__[1]],_.__c.setState({}))}],_.__c=pt,!pt.u)){var i=function(t,n,e){if(!_.__c.__H)return!0;var i=_.__c.__H.__.filter((function(t){return!!t.__c}));if(i.every((function(t){return!t.__N})))return!o||o.call(this,t,n,e);var r=!1;return i.forEach((function(t){if(t.__N){var n=t.__[0];t.__=t.__N,t.__N=void 0,n!==t.__[0]&&(r=!0)}})),!(!r&&_.__c.props===t)&&(!o||o.call(this,t,n,e))};pt.u=!0;var o=pt.shouldComponentUpdate,r=pt.componentWillUpdate;pt.componentWillUpdate=function(t,n,e){if(this.__e){var _=o;o=void 0,i(t,n,e),o=_}r&&r.call(this,t,n,e)},pt.shouldComponentUpdate=i}return _.__N||_.__}function Pt(t,n){var e=Ut(at++,3);!gt.__s&&qt(e.__H,n)&&(e.__=t,e.i=n,pt.__H.__h.push(e))}function Nt(t,n){var e=Ut(at++,4);!gt.__s&&qt(e.__H,n)&&(e.__=t,e.i=n,pt.__h.push(e))}function $t(t){return yt=5,Dt((function(){return{current:t}}),[])}function Tt(t,n,e){yt=6,Nt((function(){return"function"==typeof t?(t(n()),function(){return t(null)}):t?(t.current=n(),function(){return t.current=null}):void 0}),null==e?e:e.concat(t))}function Dt(t,n){var e=Ut(at++,7);return qt(e.__H,n)&&(e.__=t(),e.__H=n,e.__h=t),e.__}function Mt(t,n){return yt=8,Dt((function(){return t}),n)}function Ft(t){var n=pt.context[t.__c],e=Ut(at++,9);return e.c=t,n?(null==e.__&&(e.__=!0,n.sub(pt)),n.props.value):t.__}function At(t,n){gt.useDebugValue&>.useDebugValue(n?n(t):t)}function Wt(t){var n=Ut(at++,10),e=Et();return n.__=t,pt.componentDidCatch||(pt.componentDidCatch=function(t,_){n.__&&n.__(t,_),e[1](t)}),[e[0],function(){e[1](void 0)}]}function Lt(){var t=Ut(at++,11);if(!t.__){for(var n=pt.__v;null!==n&&!n.__m&&null!==n.__;)n=n.__;var e=n.__m||(n.__m=[0,0]);t.__="P"+e[0]+"-"+e[1]++}return t.__}function Ot(){for(var t;t=mt.shift();)if(t.__P&&t.__H)try{t.__H.__h.forEach(Vt),t.__H.__h.forEach(jt),t.__H.__h=[]}catch(n){t.__H.__h=[],gt.__e(n,t.__v)}}gt.__b=function(t){pt=null,bt&&bt(t)},gt.__=function(t,n){t&&n.__k&&n.__k.__m&&(t.__m=n.__k.__m),Ct&&Ct(t,n)},gt.__r=function(t){kt&&kt(t),at=0;var n=(pt=t.__c).__H;n&&(dt===pt?(n.__h=[],pt.__h=[],n.__.forEach((function(t){t.__N&&(t.__=t.__N),t.i=t.__N=void 0}))):(n.__h.forEach(Vt),n.__h.forEach(jt),n.__h=[],at=0)),dt=pt},gt.diffed=function(t){wt&&wt(t);var n=t.__c;n&&n.__H&&(n.__H.__h.length&&(1!==mt.push(n)&&vt===gt.requestAnimationFrame||((vt=gt.requestAnimationFrame)||It)(Ot)),n.__H.__.forEach((function(t){t.i&&(t.__H=t.i),t.i=void 0}))),dt=pt=null},gt.__c=function(t,n){n.some((function(t){try{t.__h.forEach(Vt),t.__h=t.__h.filter((function(t){return!t.__||jt(t)}))}catch(r){n.some((function(t){t.__h&&(t.__h=[])})),n=[],gt.__e(r,t.__v)}})),St&&St(t,n)},gt.unmount=function(t){xt&&xt(t);var n,e=t.__c;e&&e.__H&&(e.__H.__.forEach((function(t){try{Vt(t)}catch(t){n=t}})),e.__H=void 0,n&>.__e(n,e.__v))};var Rt="function"==typeof requestAnimationFrame;function It(t){var n,e=function(){clearTimeout(_),Rt&&cancelAnimationFrame(n),setTimeout(t)},_=setTimeout(e,100);Rt&&(n=requestAnimationFrame(e))}function Vt(t){var n=pt,e=t.__c;"function"==typeof e&&(t.__c=void 0,e()),pt=n}function jt(t){var n=pt;t.__c=t.__(),pt=n}function qt(t,n){return!t||t.length!==n.length||n.some((function(n,e){return n!==t[e]}))}function Bt(t,n){return"function"==typeof n?n(t):n}function zt(t,n){S[t]=n.bind(null,S[t]||(()=>{}))}let Gt,Jt;function Kt(t){if(Jt)Jt();Jt=t&&t.S()}function Qt({data:t}){const n=Yt(t);n.value=t;const e=Dt(()=>{let t=this.__v;while(t=t.__)if(t.__c){t.__c.__$f|=4;break}this.__$u.c=()=>{var t;if(!C(e.peek())&&3===(null==(t=this.base)?void 0:t.nodeType))this.base.data=e.peek();else{this.__$f|=1;this.setState({})}};return v(()=>{let t=n.value.value;return 0===t?0:!0===t?"":t||""})},[]);return e.value}Qt.displayName="_st";Object.defineProperties(s.prototype,{constructor:{configurable:!0,value:void 0},type:{configurable:!0,value:Qt},props:{configurable:!0,get(){return{data:this}}},__b:{configurable:!0,value:1}});zt("__b",(t,n)=>{if("string"==typeof n.type){let t,e=n.props;for(let _ in e){if("children"===_)continue;let i=e[_];if(i instanceof s){if(!t)n.__np=t={};t[_]=i;e[_]=i.peek()}}}t(n)});zt("__r",(t,n)=>{Kt();let e,_=n.__c;if(_){_.__$f&=-2;e=_.__$u;if(void 0===e)_.__$u=e=function(t){let n;k((function(){n=this}));n.c=()=>{_.__$f|=1;_.setState({})};return n}()}Gt=_;Kt(e);t(n)});zt("__e",(t,n,e,_)=>{Kt();Gt=void 0;t(n,e,_)});zt("diffed",(t,n)=>{Kt();Gt=void 0;let e;if("string"==typeof n.type&&(e=n.__e)){let t=n.__np,_=n.props;if(t){let n=e.U;if(n)for(let e in n){let _=n[e];if(void 0!==_&&!(e in t)){_.d();n[e]=void 0}}else{n={};e.U=n}for(let i in t){let o=n[i],r=t[i];if(void 0===o){o=Xt(e,i,r,_);n[i]=o}else o.o(r,_)}}}t(n)});function Xt(t,n,e,_){const i=n in t&&void 0===t.ownerSVGElement,o=c(e);return{o:(t,n)=>{o.value=t;_=n},d:k(()=>{const e=o.value.value;if(_[n]!==e){_[n]=e;if(i)t[n]=e;else if(e)t.setAttribute(n,e);else t.removeAttribute(n)}})}}zt("unmount",(t,n)=>{if("string"==typeof n.type){let t=n.__e;if(t){const n=t.U;if(n){t.U=void 0;for(let t in n){let e=n[t];if(e)e.d()}}}}else{let t=n.__c;if(t){const n=t.__$u;if(n){t.__$u=void 0;n.d()}}}t(n)});zt("__h",(t,n,e,_)=>{if(_<3||9===_)n.__$f|=2;t(n,e,_)});q.prototype.shouldComponentUpdate=function(t,n){const e=this.__$u;if(!(e&&void 0!==e.s||4&this.__$f))return!0;if(3&this.__$f)return!0;for(let _ in n)return!0;for(let _ in t)if("__source"!==_&&t[_]!==this.props[_])return!0;for(let _ in this.props)if(!(_ in t))return!0;return!1};function Yt(t){return Dt(()=>c(t),[])}function Zt(t){const n=$t(t);n.current=t;Gt.__$f|=4;return Dt(()=>v(()=>n.current()),[])}function tn(t){const n=$t(t);n.current=t;Pt(()=>k(()=>n.current()),[])}var nn=function(t,n,e,_){var i;n[0]=0;for(var o=1;o=5&&((i||!t&&5===_)&&(r.push(_,0,i,e),_=6),t&&(r.push(_,t,0,e),_=6)),i=""},l=0;l"===n?(_=1,i=""):i=n+i[0]:o?n===o?o="":i+=n:'"'===n||"'"===n?o=n:">"===n?(u(),_=1):_&&("="===n?(_=5,e=i,i=""):"/"===n&&(_<5||">"===t[l][f+1])?(u(),3===_&&(r=r[0]),_=r,(r=r[0]).push(2,0,_),_=0):" "===n||"\t"===n||"\n"===n||"\r"===n?(u(),_=2):i+=n),3===_&&"!--"===i&&(_=4,r=r[0])}return u(),r}(t)),n),arguments,[])).length>1?n:n[0]}var on=_n.bind(R);export{q as Component,j as Fragment,s as Signal,e as batch,ct as cloneElement,v as computed,ht as createContext,R as createElement,V as createRef,k as effect,R as h,on as html,st as hydrate,C as isValidElement,S as options,ft as render,c as signal,Y as toChildArray,o as untracked,Mt as useCallback,Zt as useComputed,Ft as useContext,At as useDebugValue,Pt as useEffect,Wt as useErrorBoundary,Lt as useId,Tt as useImperativeHandle,Nt as useLayoutEffect,Dt as useMemo,Ht as useReducer,$t as useRef,Yt as useSignal,tn as useSignalEffect,Et as useState}; +const t=Symbol.for("preact-signals");function n(){if(r>1){r--;return}let t,n=!1;while(void 0!==i){let _=i;i=void 0;u++;while(void 0!==_){const i=_.o;_.o=void 0;_.f&=-3;if(!(8&_.f)&&h(_))try{_.c()}catch(e){if(!n){t=e;n=!0}}_=i}}u=0;r--;if(n)throw t}function e(t){if(r>0)return t();r++;try{return t()}finally{n()}}let _,i;function o(t){const n=_;_=void 0;try{return t()}finally{_=n}}let r=0,u=0,l=0;function s(t){if(void 0===_)return;let n=t.n;if(void 0===n||n.t!==_){n={i:0,S:t,p:_.s,n:void 0,t:_,e:void 0,x:void 0,r:n};if(void 0!==_.s)_.s.n=n;_.s=n;t.n=n;if(32&_.f)t.S(n);return n}else if(-1===n.i){n.i=0;if(void 0!==n.n){n.n.p=n.p;if(void 0!==n.p)n.p.n=n.n;n.p=_.s;n.n=void 0;_.s.n=n;_.s=n}return n}}function f(t){this.v=t;this.i=0;this.n=void 0;this.t=void 0}f.prototype.brand=t;f.prototype.h=function(){return!0};f.prototype.S=function(t){if(this.t!==t&&void 0===t.e){t.x=this.t;if(void 0!==this.t)this.t.e=t;this.t=t}};f.prototype.U=function(t){if(void 0!==this.t){const n=t.e,e=t.x;if(void 0!==n){n.x=e;t.e=void 0}if(void 0!==e){e.e=n;t.x=void 0}if(t===this.t)this.t=e}};f.prototype.subscribe=function(t){return k(()=>{const n=this.value,e=_;_=void 0;try{t(n)}finally{_=e}})};f.prototype.valueOf=function(){return this.value};f.prototype.toString=function(){return this.value+""};f.prototype.toJSON=function(){return this.value};f.prototype.peek=function(){const t=_;_=void 0;try{return this.value}finally{_=t}};Object.defineProperty(f.prototype,"value",{get(){const t=s(this);if(void 0!==t)t.i=this.i;return this.v},set(t){if(t!==this.v){if(u>100)throw new Error("Cycle detected");this.v=t;this.i++;l++;r++;try{for(let t=this.t;void 0!==t;t=t.x)t.t.N()}finally{n()}}}});function c(t){return new f(t)}function h(t){for(let n=t.s;void 0!==n;n=n.n)if(n.S.i!==n.i||!n.S.h()||n.S.i!==n.i)return!0;return!1}function a(t){for(let n=t.s;void 0!==n;n=n.n){const e=n.S.n;if(void 0!==e)n.r=e;n.S.n=n;n.i=-1;if(void 0===n.n){t.s=n;break}}}function p(t){let n,e=t.s;while(void 0!==e){const t=e.p;if(-1===e.i){e.S.U(e);if(void 0!==t)t.n=e.n;if(void 0!==e.n)e.n.p=t}else n=e;e.S.n=e.r;if(void 0!==e.r)e.r=void 0;e=t}t.s=n}function d(t){f.call(this,void 0);this.x=t;this.s=void 0;this.g=l-1;this.f=4}(d.prototype=new f).h=function(){this.f&=-3;if(1&this.f)return!1;if(32==(36&this.f))return!0;this.f&=-5;if(this.g===l)return!0;this.g=l;this.f|=1;if(this.i>0&&!h(this)){this.f&=-2;return!0}const t=_;try{a(this);_=this;const t=this.x();if(16&this.f||this.v!==t||0===this.i){this.v=t;this.f&=-17;this.i++}}catch(t){this.v=t;this.f|=16;this.i++}_=t;p(this);this.f&=-2;return!0};d.prototype.S=function(t){if(void 0===this.t){this.f|=36;for(let t=this.s;void 0!==t;t=t.n)t.S.S(t)}f.prototype.S.call(this,t)};d.prototype.U=function(t){if(void 0!==this.t){f.prototype.U.call(this,t);if(void 0===this.t){this.f&=-33;for(let t=this.s;void 0!==t;t=t.n)t.S.U(t)}}};d.prototype.N=function(){if(!(2&this.f)){this.f|=6;for(let t=this.t;void 0!==t;t=t.x)t.t.N()}};Object.defineProperty(d.prototype,"value",{get(){if(1&this.f)throw new Error("Cycle detected");const t=s(this);this.h();if(void 0!==t)t.i=this.i;if(16&this.f)throw this.v;return this.v}});function v(t){return new d(t)}function y(t){const e=t.u;t.u=void 0;if("function"==typeof e){r++;const i=_;_=void 0;try{e()}catch(n){t.f&=-2;t.f|=8;m(t);throw n}finally{_=i;n()}}}function m(t){for(let n=t.s;void 0!==n;n=n.n)n.S.U(n);t.x=void 0;t.s=void 0;y(t)}function g(t){if(_!==this)throw new Error("Out-of-order effect");p(this);_=t;this.f&=-2;if(8&this.f)m(this);n()}function b(t){this.x=t;this.u=void 0;this.s=void 0;this.o=void 0;this.f=32}b.prototype.c=function(){const t=this.S();try{if(8&this.f)return;if(void 0===this.x)return;const n=this.x();if("function"==typeof n)this.u=n}finally{t()}};b.prototype.S=function(){if(1&this.f)throw new Error("Cycle detected");this.f|=1;this.f&=-9;y(this);a(this);r++;const t=_;_=this;return g.bind(this,t)};b.prototype.N=function(){if(!(2&this.f)){this.f|=2;this.o=i;i=this}};b.prototype.d=function(){this.f|=8;if(!(1&this.f))m(this)};function k(t){const n=new b(t);try{n.c()}catch(t){n.d();throw t}return n.d.bind(n)}var w,S,x,C,U,E,H,P,N,$,T,D,M={},A=[],F=/acit|ex(?:s|g|n|p|$)|rph|grid|ows|mnc|ntw|ine[ch]|zoo|^ord|itera/i,W=Array.isArray;function L(t,n){for(var e in n)t[e]=n[e];return t}function O(t){t&&t.parentNode&&t.parentNode.removeChild(t)}function R(t,n,e){var _,i,o,r={};for(o in n)"key"==o?_=n[o]:"ref"==o?i=n[o]:r[o]=n[o];if(arguments.length>2&&(r.children=arguments.length>3?w.call(arguments,2):e),"function"==typeof t&&null!=t.defaultProps)for(o in t.defaultProps)void 0===r[o]&&(r[o]=t.defaultProps[o]);return I(t,r,_,i,null)}function I(t,n,e,_,i){var o={type:t,props:n,key:e,ref:_,__k:null,__:null,__b:0,__e:null,__d:void 0,__c:null,constructor:void 0,__v:null==i?++x:i,__i:-1,__u:0};return null==i&&null!=S.vnode&&S.vnode(o),o}function V(){return{current:null}}function j(t){return t.children}function q(t,n){this.props=t,this.context=n}function B(t,n){if(null==n)return t.__?B(t.__,t.__i+1):null;for(var e;nn&&U.sort(P));J.__r=0}function K(t,n,e,_,i,o,r,u,l,s,f){var c,h,a,p,d,v=_&&_.__k||A,y=n.length;for(e.__d=l,Q(e,n,v),l=e.__d,c=0;c0?I(i.type,i.props,i.key,i.ref?i.ref:null,i.__v):i).__=t,i.__b=t.__b+1,o=null,-1!==(u=i.__i=Z(i,e,r,f))&&(f--,(o=e[u])&&(o.__u|=131072)),null==o||null===o.__v?(-1==u&&c--,"function"!=typeof i.type&&(i.__u|=65536)):u!==r&&(u==r-1?c--:u==r+1?c++:(u>r?c--:c++,i.__u|=65536))):i=t.__k[_]=null;if(f)for(_=0;_(null!=l&&0==(131072&l.__u)?1:0))for(;r>=0||u=0){if((l=n[r])&&0==(131072&l.__u)&&i==l.key&&o===l.type)return r;r--}if(u2&&(u.children=arguments.length>3?w.call(arguments,2):e),I(t.type,u,_||t.key,i||t.ref,null)}function ht(t,n){var e={__c:n="__cC"+D++,__:t,Consumer:function(t,n){return t.children(n)},Provider:function(t){var e,_;return this.getChildContext||(e=new Set,(_={})[n]=this,this.getChildContext=function(){return _},this.componentWillUnmount=function(){e=null},this.shouldComponentUpdate=function(t){this.props.value!==t.value&&e.forEach((function(t){t.__e=!0,G(t)}))},this.sub=function(t){e.add(t);var n=t.componentWillUnmount;t.componentWillUnmount=function(){e&&e.delete(t),n&&n.call(t)}}),t.children}};return e.Provider.__=e.Consumer.contextType=e}w=A.slice,S={__e:function(t,n,e,_){for(var i,o,r;n=n.__;)if((i=n.__c)&&!i.__)try{if((o=i.constructor)&&null!=o.getDerivedStateFromError&&(i.setState(o.getDerivedStateFromError(t)),r=i.__d),null!=i.componentDidCatch&&(i.componentDidCatch(t,_||{}),r=i.__d),r)return i.__E=i}catch(n){t=n}throw t}},x=0,C=function(t){return null!=t&&null==t.constructor},q.prototype.setState=function(t,n){var e;e=null!=this.__s&&this.__s!==this.state?this.__s:this.__s=L({},this.state),"function"==typeof t&&(t=t(L({},e),this.props)),t&&L(e,t),null!=t&&this.__v&&(n&&this._sb.push(n),G(this))},q.prototype.forceUpdate=function(t){this.__v&&(this.__e=!0,t&&this.__h.push(t),G(this))},q.prototype.render=j,U=[],H="function"==typeof Promise?Promise.prototype.then.bind(Promise.resolve()):setTimeout,P=function(t,n){return t.__v.__b-n.__v.__b},J.__r=0,N=0,$=et(!1),T=et(!0),D=0;var at,pt,dt,vt,yt=0,mt=[],gt=S,bt=gt.__b,kt=gt.__r,wt=gt.diffed,St=gt.__c,xt=gt.unmount,Ct=gt.__;function Ut(t,n){gt.__h&>.__h(pt,t,yt||n),yt=0;var e=pt.__H||(pt.__H={__:[],__h:[]});return t>=e.__.length&&e.__.push({}),e.__[t]}function Et(t){return yt=1,Ht(Bt,t)}function Ht(t,n,e){var _=Ut(at++,2);if(_.t=t,!_.__c&&(_.__=[e?e(n):Bt(void 0,n),function(t){var n=_.__N?_.__N[0]:_.__[0],e=_.t(n,t);n!==e&&(_.__N=[e,_.__[1]],_.__c.setState({}))}],_.__c=pt,!pt.u)){var i=function(t,n,e){if(!_.__c.__H)return!0;var i=_.__c.__H.__.filter((function(t){return!!t.__c}));if(i.every((function(t){return!t.__N})))return!o||o.call(this,t,n,e);var r=!1;return i.forEach((function(t){if(t.__N){var n=t.__[0];t.__=t.__N,t.__N=void 0,n!==t.__[0]&&(r=!0)}})),!(!r&&_.__c.props===t)&&(!o||o.call(this,t,n,e))};pt.u=!0;var o=pt.shouldComponentUpdate,r=pt.componentWillUpdate;pt.componentWillUpdate=function(t,n,e){if(this.__e){var _=o;o=void 0,i(t,n,e),o=_}r&&r.call(this,t,n,e)},pt.shouldComponentUpdate=i}return _.__N||_.__}function Pt(t,n){var e=Ut(at++,3);!gt.__s&&qt(e.__H,n)&&(e.__=t,e.i=n,pt.__H.__h.push(e))}function Nt(t,n){var e=Ut(at++,4);!gt.__s&&qt(e.__H,n)&&(e.__=t,e.i=n,pt.__h.push(e))}function $t(t){return yt=5,Dt((function(){return{current:t}}),[])}function Tt(t,n,e){yt=6,Nt((function(){return"function"==typeof t?(t(n()),function(){return t(null)}):t?(t.current=n(),function(){return t.current=null}):void 0}),null==e?e:e.concat(t))}function Dt(t,n){var e=Ut(at++,7);return qt(e.__H,n)&&(e.__=t(),e.__H=n,e.__h=t),e.__}function Mt(t,n){return yt=8,Dt((function(){return t}),n)}function At(t){var n=pt.context[t.__c],e=Ut(at++,9);return e.c=t,n?(null==e.__&&(e.__=!0,n.sub(pt)),n.props.value):t.__}function Ft(t,n){gt.useDebugValue&>.useDebugValue(n?n(t):t)}function Wt(t){var n=Ut(at++,10),e=Et();return n.__=t,pt.componentDidCatch||(pt.componentDidCatch=function(t,_){n.__&&n.__(t,_),e[1](t)}),[e[0],function(){e[1](void 0)}]}function Lt(){var t=Ut(at++,11);if(!t.__){for(var n=pt.__v;null!==n&&!n.__m&&null!==n.__;)n=n.__;var e=n.__m||(n.__m=[0,0]);t.__="P"+e[0]+"-"+e[1]++}return t.__}function Ot(){for(var t;t=mt.shift();)if(t.__P&&t.__H)try{t.__H.__h.forEach(Vt),t.__H.__h.forEach(jt),t.__H.__h=[]}catch(n){t.__H.__h=[],gt.__e(n,t.__v)}}gt.__b=function(t){pt=null,bt&&bt(t)},gt.__=function(t,n){t&&n.__k&&n.__k.__m&&(t.__m=n.__k.__m),Ct&&Ct(t,n)},gt.__r=function(t){kt&&kt(t),at=0;var n=(pt=t.__c).__H;n&&(dt===pt?(n.__h=[],pt.__h=[],n.__.forEach((function(t){t.__N&&(t.__=t.__N),t.i=t.__N=void 0}))):(n.__h.forEach(Vt),n.__h.forEach(jt),n.__h=[],at=0)),dt=pt},gt.diffed=function(t){wt&&wt(t);var n=t.__c;n&&n.__H&&(n.__H.__h.length&&(1!==mt.push(n)&&vt===gt.requestAnimationFrame||((vt=gt.requestAnimationFrame)||It)(Ot)),n.__H.__.forEach((function(t){t.i&&(t.__H=t.i),t.i=void 0}))),dt=pt=null},gt.__c=function(t,n){n.some((function(t){try{t.__h.forEach(Vt),t.__h=t.__h.filter((function(t){return!t.__||jt(t)}))}catch(r){n.some((function(t){t.__h&&(t.__h=[])})),n=[],gt.__e(r,t.__v)}})),St&&St(t,n)},gt.unmount=function(t){xt&&xt(t);var n,e=t.__c;e&&e.__H&&(e.__H.__.forEach((function(t){try{Vt(t)}catch(t){n=t}})),e.__H=void 0,n&>.__e(n,e.__v))};var Rt="function"==typeof requestAnimationFrame;function It(t){var n,e=function(){clearTimeout(_),Rt&&cancelAnimationFrame(n),setTimeout(t)},_=setTimeout(e,100);Rt&&(n=requestAnimationFrame(e))}function Vt(t){var n=pt,e=t.__c;"function"==typeof e&&(t.__c=void 0,e()),pt=n}function jt(t){var n=pt;t.__c=t.__(),pt=n}function qt(t,n){return!t||t.length!==n.length||n.some((function(n,e){return n!==t[e]}))}function Bt(t,n){return"function"==typeof n?n(t):n}function zt(t,n){S[t]=n.bind(null,S[t]||(()=>{}))}let Gt,Jt;function Kt(t){if(Jt)Jt();Jt=t&&t.S()}function Qt({data:t}){const n=Yt(t);n.value=t;const e=Dt(()=>{let t=this.__v;while(t=t.__)if(t.__c){t.__c.__$f|=4;break}this.__$u.c=()=>{var t;if(!C(e.peek())&&3===(null==(t=this.base)?void 0:t.nodeType))this.base.data=e.peek();else{this.__$f|=1;this.setState({})}};return v(()=>{let t=n.value.value;return 0===t?0:!0===t?"":t||""})},[]);return e.value}Qt.displayName="_st";Object.defineProperties(f.prototype,{constructor:{configurable:!0,value:void 0},type:{configurable:!0,value:Qt},props:{configurable:!0,get(){return{data:this}}},__b:{configurable:!0,value:1}});zt("__b",(t,n)=>{if("string"==typeof n.type){let t,e=n.props;for(let _ in e){if("children"===_)continue;let i=e[_];if(i instanceof f){if(!t)n.__np=t={};t[_]=i;e[_]=i.peek()}}}t(n)});zt("__r",(t,n)=>{Kt();let e,_=n.__c;if(_){_.__$f&=-2;e=_.__$u;if(void 0===e)_.__$u=e=function(t){let n;k((function(){n=this}));n.c=()=>{_.__$f|=1;_.setState({})};return n}()}Gt=_;Kt(e);t(n)});zt("__e",(t,n,e,_)=>{Kt();Gt=void 0;t(n,e,_)});zt("diffed",(t,n)=>{Kt();Gt=void 0;let e;if("string"==typeof n.type&&(e=n.__e)){let t=n.__np,_=n.props;if(t){let n=e.U;if(n)for(let e in n){let _=n[e];if(void 0!==_&&!(e in t)){_.d();n[e]=void 0}}else{n={};e.U=n}for(let i in t){let o=n[i],r=t[i];if(void 0===o){o=Xt(e,i,r,_);n[i]=o}else o.o(r,_)}}}t(n)});function Xt(t,n,e,_){const i=n in t&&void 0===t.ownerSVGElement,o=c(e);return{o:(t,n)=>{o.value=t;_=n},d:k(()=>{const e=o.value.value;if(_[n]!==e){_[n]=e;if(i)t[n]=e;else if(e)t.setAttribute(n,e);else t.removeAttribute(n)}})}}zt("unmount",(t,n)=>{if("string"==typeof n.type){let t=n.__e;if(t){const n=t.U;if(n){t.U=void 0;for(let t in n){let e=n[t];if(e)e.d()}}}}else{let t=n.__c;if(t){const n=t.__$u;if(n){t.__$u=void 0;n.d()}}}t(n)});zt("__h",(t,n,e,_)=>{if(_<3||9===_)n.__$f|=2;t(n,e,_)});q.prototype.shouldComponentUpdate=function(t,n){const e=this.__$u;if(!(e&&void 0!==e.s||4&this.__$f))return!0;if(3&this.__$f)return!0;for(let _ in n)return!0;for(let _ in t)if("__source"!==_&&t[_]!==this.props[_])return!0;for(let _ in this.props)if(!(_ in t))return!0;return!1};function Yt(t){return Dt(()=>c(t),[])}function Zt(t){const n=$t(t);n.current=t;Gt.__$f|=4;return Dt(()=>v(()=>n.current()),[])}function tn(t){const n=$t(t);n.current=t;Pt(()=>k(()=>n.current()),[])}var nn=function(t,n,e,_){var i;n[0]=0;for(var o=1;o=5&&((i||!t&&5===_)&&(r.push(_,0,i,e),_=6),t&&(r.push(_,t,0,e),_=6)),i=""},l=0;l"===n?(_=1,i=""):i=n+i[0]:o?n===o?o="":i+=n:'"'===n||"'"===n?o=n:">"===n?(u(),_=1):_&&("="===n?(_=5,e=i,i=""):"/"===n&&(_<5||">"===t[l][s+1])?(u(),3===_&&(r=r[0]),_=r,(r=r[0]).push(2,0,_),_=0):" "===n||"\t"===n||"\n"===n||"\r"===n?(u(),_=2):i+=n),3===_&&"!--"===i&&(_=4,r=r[0])}return u(),r}(t)),n),arguments,[])).length>1?n:n[0]}var on=_n.bind(R);export{q as Component,j as Fragment,f as Signal,e as batch,ct as cloneElement,v as computed,ht as createContext,R as createElement,V as createRef,k as effect,R as h,on as html,ft as hydrate,C as isValidElement,S as options,st as render,c as signal,Y as toChildArray,o as untracked,Mt as useCallback,Zt as useComputed,At as useContext,Ft as useDebugValue,Pt as useEffect,Wt as useErrorBoundary,Lt as useId,Tt as useImperativeHandle,Nt as useLayoutEffect,Dt as useMemo,Ht as useReducer,$t as useRef,Yt as useSignal,tn as useSignalEffect,Et as useState}; diff --git a/examples/server/public/json-schema-to-grammar.mjs b/examples/server/public/json-schema-to-grammar.mjs index 7267f3f9c..e67bb15c1 100644 --- a/examples/server/public/json-schema-to-grammar.mjs +++ b/examples/server/public/json-schema-to-grammar.mjs @@ -529,7 +529,7 @@ export class SchemaConverter { return joinSeq(); }; - return this._addRule(name, "\"\\\"\" " + toRule(transform()) + " \"\\\"\" space") + return this._addRule(name, "\"\\\"\" (" + toRule(transform()) + ") \"\\\"\" space") } _notStrings(strings) { diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 13e54e501..b5e63384c 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -128,14 +128,14 @@ struct slot_params { bool stream = true; bool cache_prompt = false; // remember the prompt to avoid reprocessing all prompt - int32_t n_keep = 0; // number of tokens to keep from initial prompt - int32_t n_discard = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half - int32_t n_predict = -1; // new tokens to predict + int32_t n_keep = 0; // number of tokens to keep from initial prompt + int32_t n_discard = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half + int32_t n_predict = -1; // new tokens to predict + + int64_t t_max_prompt_ms = -1; // TODO: implement + int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit std::vector antiprompt; - - json input_prefix; - json input_suffix; }; struct server_slot { @@ -165,8 +165,13 @@ struct server_slot { json prompt; // can be either a string, array of strings or array of token ids + json input_prefix; + json input_suffix; + json input_extra; + // when a task is submitted, we first tokenize the prompt and store it here std::vector prompt_tokens; + std::vector extra_tokens; std::string generated_text; std::vector cache_tokens; @@ -175,6 +180,7 @@ struct server_slot { server_task_cmpl_type cmpl_type = SERVER_TASK_CMPL_TYPE_NORMAL; bool has_next_token = true; + bool has_new_line = false; bool truncated = false; bool stopped_eos = false; bool stopped_word = false; @@ -188,26 +194,20 @@ struct server_slot { // sampling json json_schema; - struct gpt_sampler_params sparams; - struct gpt_sampler * smpl = nullptr; + struct common_sampler_params sparams; + struct common_sampler * smpl = nullptr; llama_token sampled; - int32_t ga_i = 0; // group-attention state - int32_t ga_n = 1; // group-attention factor - int32_t ga_w = 512; // group-attention width - - int32_t n_past_se = 0; // self-extend - // stats - size_t n_sent_text = 0; // number of sent text character + size_t n_sent_text = 0; // number of sent text character size_t n_sent_token_probs = 0; int64_t t_start_process_prompt; int64_t t_start_generation; double t_prompt_processing; // ms - double t_token_generation; // ms + double t_token_generation; // ms std::function callback_on_release; @@ -216,6 +216,7 @@ struct server_slot { n_prompt_tokens = 0; generated_text = ""; + has_new_line = false; truncated = false; stopped_eos = false; stopped_word = false; @@ -225,13 +226,11 @@ struct server_slot { n_sent_text = 0; n_sent_token_probs = 0; cmpl_type = SERVER_TASK_CMPL_TYPE_NORMAL; - ga_i = 0; - n_past_se = 0; generated_token_probs.clear(); } - bool has_budget(gpt_params &global_params) { + bool has_budget(common_params &global_params) { if (params.n_predict == -1 && global_params.n_predict == -1) { return true; // limitless } @@ -611,9 +610,9 @@ struct server_response { struct server_context { llama_model * model = nullptr; llama_context * ctx = nullptr; - std::vector loras; + std::vector loras; - gpt_params params; + common_params params; llama_batch batch = {}; @@ -623,12 +622,6 @@ struct server_context { int32_t n_ctx; // total context for all clients / slots - // system prompt - bool system_need_update = false; - - std::string system_prompt; - std::vector system_tokens; - // slots / clients std::vector slots; json default_generation_settings_for_props; @@ -655,20 +648,20 @@ struct server_context { // Clear any sampling context for (server_slot & slot : slots) { if (slot.smpl != nullptr) { - gpt_sampler_free(slot.smpl); + common_sampler_free(slot.smpl); } } llama_batch_free(batch); } - bool load_model(const gpt_params & params_) { + bool load_model(const common_params & params_) { params = params_; - // dedicate one sequence to the system prompt + // reserve one extra sequence (seq_id == 0) for extra features params.n_parallel += 1; - llama_init_result llama_init = llama_init_from_gpt_params(params); + common_init_result llama_init = common_init_from_params(params); model = llama_init.model; ctx = llama_init.context; @@ -711,22 +704,6 @@ struct server_context { SLT_INF(slot, "new slot n_ctx_slot = %d\n", slot.n_ctx); - const int ga_n = params.grp_attn_n; - const int ga_w = params.grp_attn_w; - - if (ga_n != 1) { - GGML_ASSERT(ga_n > 0 && "ga_n must be positive"); // NOLINT - GGML_ASSERT(ga_w % ga_n == 0 && "ga_w must be a multiple of ga_n"); // NOLINT - //GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of ga_w"); // NOLINT - //GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * ga_n"); // NOLINT - - SLT_INF(slot, "slot self-extend: ga_n = %d, ga_w = %d\n", ga_n, ga_w); - } - - slot.ga_i = 0; - slot.ga_n = ga_n; - slot.ga_w = ga_w; - slot.sparams = params.sparams; slot.callback_on_release = [this](int) { @@ -753,12 +730,7 @@ struct server_context { metrics.init(); } - std::vector tokenize(const json & json_prompt, bool add_special) const { - // TODO: currently, we tokenize using special tokens by default - // this is not always correct (see https://github.com/ggerganov/llama.cpp/pull/4160#issuecomment-1824826216) - // but it's better compared to completely ignoring ChatML and other chat templates - const bool TMP_FORCE_SPECIAL = true; - + std::vector tokenize(const json & json_prompt, bool add_special, bool parse_special) const { // If `add_bos` is true, we only add BOS, when json_prompt is a string, // or the first element of the json_prompt array is a string. std::vector prompt_tokens; @@ -771,10 +743,10 @@ struct server_context { std::vector p; if (first) { - p = ::llama_tokenize(ctx, s, add_special, TMP_FORCE_SPECIAL); + p = common_tokenize(ctx, s, add_special, parse_special); first = false; } else { - p = ::llama_tokenize(ctx, s, false, TMP_FORCE_SPECIAL); + p = common_tokenize(ctx, s, false, parse_special); } prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end()); @@ -788,7 +760,7 @@ struct server_context { } } else { auto s = json_prompt.template get(); - prompt_tokens = ::llama_tokenize(ctx, s, add_special, TMP_FORCE_SPECIAL); + prompt_tokens = common_tokenize(ctx, s, add_special, parse_special); } return prompt_tokens; @@ -830,7 +802,7 @@ struct server_context { int slot_prompt_len = slot_prompt.size(); // length of the Longest Common Prefix between the current slot's prompt and the input prompt - int lcp_len = common_part(slot_prompt, prompt); + int lcp_len = longest_common_prefix(slot_prompt, prompt); // fraction of the common substring length compared to the current slot's prompt length similarity = static_cast(lcp_len) / slot_prompt_len; @@ -891,6 +863,8 @@ struct server_context { slot.sparams.top_k = json_value(data, "top_k", default_sparams.top_k); slot.sparams.top_p = json_value(data, "top_p", default_sparams.top_p); slot.sparams.min_p = json_value(data, "min_p", default_sparams.min_p); + slot.sparams.xtc_probability = json_value(data, "xtc_probability", default_sparams.xtc_probability); + slot.sparams.xtc_threshold = json_value(data, "xtc_threshold", default_sparams.xtc_threshold); slot.sparams.tfs_z = json_value(data, "tfs_z", default_sparams.tfs_z); slot.sparams.typ_p = json_value(data, "typical_p", default_sparams.typ_p); slot.sparams.temp = json_value(data, "temperature", default_sparams.temp); @@ -909,6 +883,8 @@ struct server_context { slot.sparams.seed = json_value(data, "seed", default_sparams.seed); slot.sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs); slot.sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep); + //slot.params.t_max_prompt_ms = json_value(data, "t_max_prompt_ms", default_params.t_max_prompt_ms); // TODO: implement + slot.params.t_max_predict_ms = json_value(data, "t_max_predict_ms", default_params.t_max_predict_ms); // process "json_schema" and "grammar" if (data.contains("json_schema") && !data.at("json_schema").is_null() && data.contains("grammar") && !data.at("grammar").is_null()) { @@ -917,19 +893,14 @@ struct server_context { } if (data.contains("json_schema") && !data.contains("grammar")) { try { - auto schema = json_value(data, "json_schema", json::object()); - slot.sparams.grammar = json_schema_to_grammar(schema); + auto schema = json_value(data, "json_schema", json::object()); + slot.sparams.grammar = json_schema_to_grammar(schema); } catch (const std::exception & e) { send_error(task, std::string("\"json_schema\": ") + e.what(), ERROR_TYPE_INVALID_REQUEST); return false; } } else { - slot.sparams.grammar = json_value(data, "grammar", default_sparams.grammar); - } - - if (slot.params.cache_prompt && slot.ga_n != 1) { - slot.params.cache_prompt = false; - SLT_WRN(slot, "%s", "group-attention is not supported with prompt caching. disabling cache\n"); + slot.sparams.grammar = json_value(data, "grammar", default_sparams.grammar); } if (slot.n_predict > 0 && slot.params.n_predict > slot.n_predict) { @@ -939,11 +910,29 @@ struct server_context { } // infill - slot.params.input_prefix = json_value(data, "input_prefix", default_params.input_prefix); - slot.params.input_suffix = json_value(data, "input_suffix", default_params.input_suffix); + slot.input_prefix = json_value(data, "input_prefix", json()); + slot.input_suffix = json_value(data, "input_suffix", json()); + slot.input_extra = json_value(data, "input_extra", json()); + + SLT_DBG(slot, "extra_context chunks: %d\n", (int) slot.input_extra.size()); + for (const auto & chunk : slot.input_extra) { + // { "text": string, "filename": string } + if (!chunk.contains("text") || !chunk["text"].is_string()) { + send_error(task, "extra_context chunk must contain a \"text\" field with a string value", ERROR_TYPE_INVALID_REQUEST); + return false; + } + + // filename is optional + if (chunk.contains("filename") && !chunk["filename"].is_string()) { + send_error(task, "extra_context chunk's \"filename\" field must be a string", ERROR_TYPE_INVALID_REQUEST); + return false; + } + + SLT_DBG(slot, "extra_context chunk in file '%s':\n%s\n", chunk.value("filename", "").c_str(), chunk.value("text", "").c_str()); + } // get prompt - if (task.cmpl_type != SERVER_TASK_CMPL_TYPE_INFILL) { + { const auto & prompt = data.find("prompt"); if (prompt == data.end()) { send_error(task, "\"prompt\" must be provided", ERROR_TYPE_INVALID_REQUEST); @@ -999,7 +988,7 @@ struct server_context { slot.sparams.logit_bias.push_back({tok, bias}); } } else if (el[0].is_string()) { - auto toks = llama_tokenize(model, el[0].get(), false); + auto toks = common_tokenize(model, el[0].get(), false); for (auto tok : toks) { slot.sparams.logit_bias.push_back({tok, bias}); } @@ -1031,7 +1020,7 @@ struct server_context { sampler_names.emplace_back(name); } } - slot.sparams.samplers = gpt_sampler_types_from_names(sampler_names, false); + slot.sparams.samplers = common_sampler_types_from_names(sampler_names, false); } else { slot.sparams.samplers = default_sparams.samplers; } @@ -1039,10 +1028,10 @@ struct server_context { { if (slot.smpl != nullptr) { - gpt_sampler_free(slot.smpl); + common_sampler_free(slot.smpl); } - slot.smpl = gpt_sampler_init(model, slot.sparams); + slot.smpl = common_sampler_init(model, slot.sparams); if (slot.smpl == nullptr) { // for now, the only error that may happen here is invalid grammar send_error(task, "Failed to parse grammar", ERROR_TYPE_INVALID_REQUEST); @@ -1066,59 +1055,9 @@ struct server_context { clean_kv_cache = false; } - void system_prompt_update() { - SRV_DBG("updating system prompt: '%s'\n", system_prompt.c_str()); - - kv_cache_clear(); - system_tokens.clear(); - - if (!system_prompt.empty()) { - system_tokens = ::llama_tokenize(ctx, system_prompt, true); - - const int32_t n_batch = llama_n_batch(ctx); - const int32_t n_tokens_prompt = system_tokens.size(); - - for (int32_t i = 0; i < n_tokens_prompt; i += n_batch) { - const int32_t n_tokens = std::min(n_batch, n_tokens_prompt - i); - - llama_batch_clear(batch); - - for (int32_t j = 0; j < n_tokens; ++j) { - llama_batch_add(batch, system_tokens[i + j], i + j, { 0 }, false); - } - - if (llama_decode(ctx, batch) != 0) { - SRV_ERR("%s", "llama_decode() failed\n"); - return; - } - } - - // assign the system KV cache to all parallel sequences - for (int32_t i = 1; i <= params.n_parallel; ++i) { - llama_kv_cache_seq_cp(ctx, 0, i, -1, -1); - } - } - - system_need_update = false; - } - - bool system_prompt_set(const std::string & sys_prompt) { - SRV_DBG("system prompt set: '%s'\n", system_prompt.c_str()); - - system_prompt = sys_prompt; - - // release all slots - for (server_slot & slot : slots) { - slot.release(); - } - - system_need_update = true; - return true; - } - bool process_token(completion_token_output & result, server_slot & slot) { // remember which tokens were sampled - used for repetition penalties during sampling - const std::string token_str = llama_token_to_piece(ctx, result.tok, params.special); + const std::string token_str = common_token_to_piece(ctx, result.tok, params.special); slot.sampled = result.tok; // search stop word and delete it @@ -1151,22 +1090,21 @@ struct server_context { size_t pos = std::min(slot.n_sent_text, slot.generated_text.size()); const std::string str_test = slot.generated_text.substr(pos); - bool is_stop_full = false; + bool send_text = true; size_t stop_pos = slot.find_stopping_strings(str_test, token_str.size(), STOP_TYPE_FULL); if (stop_pos != std::string::npos) { - is_stop_full = true; slot.generated_text.erase( slot.generated_text.begin() + pos + stop_pos, slot.generated_text.end()); pos = std::min(slot.n_sent_text, slot.generated_text.size()); - } else { - is_stop_full = false; + } else if (slot.has_next_token) { stop_pos = slot.find_stopping_strings(str_test, token_str.size(), STOP_TYPE_PARTIAL); + send_text = stop_pos == std::string::npos; } // check if there is any token to predict - if (stop_pos == std::string::npos || (!slot.has_next_token && !is_stop_full && stop_pos > 0)) { + if (send_text) { // no send the stop word in the response result.text_to_send = slot.generated_text.substr(pos, std::string::npos); slot.n_sent_text += result.text_to_send.size(); @@ -1191,13 +1129,28 @@ struct server_context { SLT_DBG(slot, "stopped by limit, n_decoded = %d, n_predict = %d\n", slot.n_decoded, slot.params.n_predict); } + // if we have already seen a new line, we stop after a certain time limit + if (slot.has_new_line && slot.params.t_max_predict_ms > 0 && + (ggml_time_us() - slot.t_start_generation > 1000.0f*slot.params.t_max_predict_ms)) { + slot.stopped_limit = true; + slot.has_next_token = false; + + SLT_DBG(slot, "stopped by time limit, n_decoded = %d, t_max_predict_ms = %d ms\n", slot.n_decoded, (int) slot.params.t_max_predict_ms); + } + + // check if there is a new line in the generated text + if (result.text_to_send.find('\n') != std::string::npos) { + slot.has_new_line = true; + } + // if context shift is disabled, we stop when it reaches the context limit - if (slot.n_decoded >= slot.n_ctx) { + if (slot.n_past >= slot.n_ctx) { slot.truncated = true; slot.stopped_limit = true; slot.has_next_token = false; - SLT_DBG(slot, "stopped due to running out of context capacity, n_decoded = %d, n_ctx = %d\n", slot.n_decoded, slot.n_ctx); + SLT_DBG(slot, "stopped due to running out of context capacity, n_past = %d, n_prompt_tokens = %d, n_decoded = %d, n_ctx = %d\n", + slot.n_decoded, slot.n_prompt_tokens, slot.n_past, slot.n_ctx); } if (llama_token_is_eog(model, result.tok)) { @@ -1209,18 +1162,18 @@ struct server_context { const auto n_ctx_train = llama_n_ctx_train(model); - if (slot.params.n_predict < 1 && slot.n_predict < 1 && slot.ga_n == 1 && slot.n_prompt_tokens + slot.n_decoded >= n_ctx_train) { + if (slot.params.n_predict < 1 && slot.n_predict < 1 && slot.n_prompt_tokens + slot.n_decoded >= n_ctx_train) { slot.truncated = true; slot.stopped_limit = true; slot.has_next_token = false; // stop prediction SLT_WRN(slot, - "n_predict (%d) is not set and self-context extend is disabled. " + "n_predict (%d) is set for infinite generation. " "Limiting generated tokens to n_ctx_train (%d) to avoid EOS-less generation infinite loop\n", slot.params.n_predict, n_ctx_train); } - SLT_DBG(slot, "n_decoded = %d, n_remaining = %d, next token: '%s'\n", slot.n_decoded, slot.n_remaining, token_str.c_str()); + SLT_DBG(slot, "n_decoded = %d, n_remaining = %d, next token: %5d '%s'\n", slot.n_decoded, slot.n_remaining, result.tok, token_str.c_str()); return slot.has_next_token; // continue } @@ -1229,7 +1182,7 @@ struct server_context { std::vector samplers; samplers.reserve(slot.sparams.samplers.size()); for (const auto & sampler : slot.sparams.samplers) { - samplers.emplace_back(gpt_sampler_type_to_str(sampler)); + samplers.emplace_back(common_sampler_type_to_str(sampler)); } return json { @@ -1237,13 +1190,15 @@ struct server_context { {"n_predict", slot.n_predict}, // Server configured n_predict {"model", params.model_alias}, {"seed", slot.sparams.seed}, - {"seed_cur", slot.smpl ? gpt_sampler_get_seed(slot.smpl) : 0}, + {"seed_cur", slot.smpl ? common_sampler_get_seed(slot.smpl) : 0}, {"temperature", slot.sparams.temp}, {"dynatemp_range", slot.sparams.dynatemp_range}, {"dynatemp_exponent", slot.sparams.dynatemp_exponent}, {"top_k", slot.sparams.top_k}, {"top_p", slot.sparams.top_p}, {"min_p", slot.sparams.min_p}, + {"xtc_probability", slot.sparams.xtc_probability}, + {"xtc_threshold", slot.sparams.xtc_threshold}, {"tfs_z", slot.sparams.tfs_z}, {"typical_p", slot.sparams.typ_p}, {"repeat_last_n", slot.sparams.penalty_last_n}, @@ -1302,7 +1257,7 @@ struct server_context { }; if (slot.sparams.n_probs > 0) { - const std::vector to_send_toks = llama_tokenize(ctx, tkn.text_to_send, false); + const std::vector to_send_toks = common_tokenize(ctx, tkn.text_to_send, false); const size_t probs_pos = std::min(slot.n_sent_token_probs, slot.generated_token_probs.size()); const size_t probs_stop_pos = std::min(slot.n_sent_token_probs + to_send_toks.size(), slot.generated_token_probs.size()); @@ -1339,6 +1294,7 @@ struct server_context { {"tokens_evaluated", slot.n_prompt_tokens}, {"generation_settings", get_formated_generation(slot)}, {"prompt", slot.prompt}, + {"has_new_line", slot.has_new_line}, {"truncated", slot.truncated}, {"stopped_eos", slot.stopped_eos}, {"stopped_word", slot.stopped_word}, @@ -1352,7 +1308,7 @@ struct server_context { if (slot.sparams.n_probs > 0) { std::vector probs; if (!slot.params.stream && slot.stopped_word) { - const std::vector stop_word_toks = llama_tokenize(ctx, slot.stopping_word, false); + const std::vector stop_word_toks = common_tokenize(ctx, slot.stopping_word, false); size_t safe_offset = std::min(slot.generated_token_probs.size(), stop_word_toks.size()); probs = std::vector( @@ -1406,7 +1362,7 @@ struct server_context { continue; } - llama_embd_normalize(embd, embd_res.data(), n_embd); + common_embd_normalize(embd, embd_res.data(), n_embd); res.data = json { {"embedding", embd_res}, @@ -1488,9 +1444,8 @@ struct server_context { if (prompt.is_string() || json_is_array_of_numbers(prompt)) { data["index"] = 0; create_task(data, false, nullptr); - } - // otherwise, it's a multiple-prompt task, we break it into smaller tasks - else if (prompt.is_array()) { + } else if (prompt.is_array()) { + // otherwise, it's a multiple-prompt task, we break it into smaller tasks std::vector prompts = prompt; if (cmpl_type == SERVER_TASK_CMPL_TYPE_RERANK) { // prompts[0] is the question @@ -1515,9 +1470,8 @@ struct server_context { } } } - } - // invalid case - else { + } else { + // invalid case throw std::runtime_error(error_msg); } @@ -1627,16 +1581,6 @@ struct server_context { break; } - if (task.data.contains("system_prompt")) { - std::string sys_prompt = json_value(task.data, "system_prompt", std::string()); - system_prompt_set(sys_prompt); - - for (server_slot & slot : slots) { - slot.n_past = 0; - slot.n_past_se = 0; - } - } - slot->reset(); slot->id_task = task.id; @@ -1677,6 +1621,7 @@ struct server_context { slot_data["prompt"] = slot.prompt; slot_data["next_token"] = { {"has_next_token", slot.has_next_token}, + {"has_new_line", slot.has_new_line}, {"n_remain", slot.n_remaining}, {"n_decoded", slot.n_decoded}, {"stopped_eos", slot.stopped_eos}, @@ -1800,6 +1745,9 @@ struct server_context { } slot->cache_tokens.resize(token_count); + // TODO: maybe detokenize the slot->cache_tokens instead? + slot->prompt = string_format("[restored %d tokens from file]", (int) token_count); + const int64_t t_end = ggml_time_us(); const double t_restore_ms = (t_end - t_start) / 1000.0; @@ -1850,7 +1798,7 @@ struct server_context { } break; case SERVER_TASK_TYPE_SET_LORA: { - llama_lora_adapters_apply(ctx, loras); + common_lora_adapters_apply(ctx, loras); server_task_result result; result.id = task.id; result.stop = true; @@ -1862,10 +1810,6 @@ struct server_context { } void update_slots() { - if (system_need_update) { - system_prompt_update(); - } - // check if all slots are idle { bool all_idle = true; @@ -1879,7 +1823,7 @@ struct server_context { if (all_idle) { SRV_INF("%s", "all slots are idle\n"); - if (system_prompt.empty() && clean_kv_cache) { + if (clean_kv_cache) { kv_cache_clear(); } @@ -1900,43 +1844,41 @@ struct server_context { // apply context-shift if needed // TODO: simplify and improve for (server_slot & slot : slots) { - if (slot.ga_n == 1) { - if (slot.is_processing() && (int) system_tokens.size() + slot.n_past >= slot.n_ctx - 1) { - if (!params.ctx_shift) { - // this check is redundant (for good) - // we should never get here, because generation should already stopped in process_token() - slot.release(); - send_error(slot, "context shift is disabled", ERROR_TYPE_SERVER); - continue; - } - - // Shift context - const int n_keep = slot.params.n_keep + add_bos_token; - const int n_left = (int) system_tokens.size() + slot.n_past - n_keep; - const int n_discard = slot.params.n_discard ? slot.params.n_discard : (n_left / 2); - - SLT_WRN(slot, "slot context shift, n_keep = %d, n_left = %d, n_discard = %d\n", n_keep, n_left, n_discard); - - llama_kv_cache_seq_rm (ctx, slot.id + 1, n_keep , n_keep + n_discard); - llama_kv_cache_seq_add(ctx, slot.id + 1, n_keep + n_discard, system_tokens.size() + slot.n_past, -n_discard); - - if (slot.params.cache_prompt) { - for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++) { - slot.cache_tokens[i - n_discard] = slot.cache_tokens[i]; - } - - slot.cache_tokens.resize(slot.cache_tokens.size() - n_discard); - } - - slot.n_past -= n_discard; - - slot.truncated = true; + if (slot.is_processing() && slot.n_past + 1 >= slot.n_ctx) { + if (!params.ctx_shift) { + // this check is redundant (for good) + // we should never get here, because generation should already stopped in process_token() + slot.release(); + send_error(slot, "context shift is disabled", ERROR_TYPE_SERVER); + continue; } + + // Shift context + const int n_keep = slot.params.n_keep + add_bos_token; + const int n_left = slot.n_past - n_keep; + const int n_discard = slot.params.n_discard ? slot.params.n_discard : (n_left / 2); + + SLT_WRN(slot, "slot context shift, n_keep = %d, n_left = %d, n_discard = %d\n", n_keep, n_left, n_discard); + + llama_kv_cache_seq_rm (ctx, slot.id + 1, n_keep , n_keep + n_discard); + llama_kv_cache_seq_add(ctx, slot.id + 1, n_keep + n_discard, slot.n_past, -n_discard); + + if (slot.params.cache_prompt) { + for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++) { + slot.cache_tokens[i - n_discard] = slot.cache_tokens[i]; + } + + slot.cache_tokens.resize(slot.cache_tokens.size() - n_discard); + } + + slot.n_past -= n_discard; + + slot.truncated = true; } } // start populating the batch for this iteration - llama_batch_clear(batch); + common_batch_clear(batch); // frist, add sampled tokens from any ongoing sequences for (auto & slot : slots) { @@ -1946,11 +1888,7 @@ struct server_context { slot.i_batch = batch.n_tokens; - const int32_t slot_npast = slot.n_past_se > 0 ? slot.n_past_se : slot.n_past; - - // TODO: we always have to take into account the "system_tokens" - // this is not great and needs to be improved somehow - llama_batch_add(batch, slot.sampled, system_tokens.size() + slot_npast, { slot.id + 1 }, true); + common_batch_add(batch, slot.sampled, slot.n_past, { slot.id + 1 }, true); slot.n_past += 1; @@ -1958,8 +1896,8 @@ struct server_context { slot.cache_tokens.push_back(slot.sampled); } - SLT_DBG(slot, "slot decode token, n_ctx = %d, n_past = %d, n_system_tokens = %d, n_cache_tokens = %d, truncated = %d\n", - slot.n_ctx, slot.n_past, (int) system_tokens.size(), (int) slot.cache_tokens.size(), slot.truncated); + SLT_DBG(slot, "slot decode token, n_ctx = %d, n_past = %d, n_cache_tokens = %d, truncated = %d\n", + slot.n_ctx, slot.n_past, (int) slot.cache_tokens.size(), slot.truncated); } // process in chunks of params.n_batch @@ -1986,63 +1924,126 @@ struct server_context { slot.t_start_process_prompt = ggml_time_us(); slot.t_start_generation = 0; - if (slot.cmpl_type == SERVER_TASK_CMPL_TYPE_INFILL) { - const bool add_bos = llama_add_bos_token(model); - bool suff_rm_leading_spc = true; - if (params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) { - params.input_suffix.erase(0, 1); - suff_rm_leading_spc = false; - } + switch (slot.cmpl_type) { + case SERVER_TASK_CMPL_TYPE_NORMAL: + case SERVER_TASK_CMPL_TYPE_EMBEDDING: + { + prompt_tokens = tokenize(slot.prompt, llama_add_bos_token(model), true); + } break; + case SERVER_TASK_CMPL_TYPE_RERANK: + { + // require slot.prompt to be array of 2 strings + if (!slot.prompt.is_array() || slot.prompt.size() != 2) { + SLT_ERR(slot, "%s", "invalid prompt for rerank task\n"); + slot.release(); + send_error(slot, "invalid prompt for rerank task", ERROR_TYPE_INVALID_REQUEST); + continue; + } - auto prefix_tokens = tokenize(slot.params.input_prefix, false); - auto suffix_tokens = tokenize(slot.params.input_suffix, false); + // prompt: [BOS]query[EOS][SEP]doc[EOS] + prompt_tokens.clear(); + prompt_tokens.push_back(llama_token_bos(model)); + { + const auto part = tokenize(slot.prompt[0], false, false); + prompt_tokens.insert(prompt_tokens.end(), part.begin(), part.end()); + } + prompt_tokens.push_back(llama_token_eos(model)); + prompt_tokens.push_back(llama_token_sep(model)); + { + const auto part = tokenize(slot.prompt[1], false, false); + prompt_tokens.insert(prompt_tokens.end(), part.begin(), part.end()); + } + prompt_tokens.push_back(llama_token_eos(model)); + } break; + case SERVER_TASK_CMPL_TYPE_INFILL: + { + // TODO: optimize this block by reducing memory allocations and movement - const int space_token = 29871; // TODO: this should not be hardcoded - if (suff_rm_leading_spc && !suffix_tokens.empty() && suffix_tokens[0] == space_token) { - suffix_tokens.erase(suffix_tokens.begin()); - } + // use FIM repo-level pattern: + // ref: https://arxiv.org/pdf/2409.12186 + // + // [FIM_REP]myproject + // [FIM_SEP]filename0 + // extra chunk 0 + // [FIM_SEP]filename1 + // extra chunk 1 + // ... + // [FIM_SEP]filename + // [FIM_PRE]prefix[FIM_SUF]suffix[FIM_MID]prompt + // + auto tokens_prefix = tokenize(slot.input_prefix, false, false); + auto tokens_suffix = tokenize(slot.input_suffix, false, false); + auto tokens_prompt = tokenize(slot.prompt, false, false); - prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(model)); - suffix_tokens.insert(suffix_tokens.begin(), llama_token_suffix(model)); + slot.extra_tokens.clear(); + if (llama_token_fim_rep(model) != LLAMA_TOKEN_NULL) { + static const auto k_fim_repo = tokenize("myproject\n", false, false); - auto embd_inp = params.spm_infill ? suffix_tokens : prefix_tokens; - auto embd_end = params.spm_infill ? prefix_tokens : suffix_tokens; - if (add_bos) { - embd_inp.insert(embd_inp.begin(), llama_token_bos(model)); - } - embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end()); + slot.extra_tokens.push_back(llama_token_fim_rep(model)); + slot.extra_tokens.insert(slot.extra_tokens.end(), k_fim_repo.begin(), k_fim_repo.end()); + } - const llama_token middle_token = llama_token_middle(model); - if (middle_token >= 0) { - embd_inp.push_back(middle_token); - } + for (const auto & chunk : slot.input_extra) { + // { "text": string, "filename": string } + const std::string text = chunk.value("text", ""); + const std::string filename = chunk.value("filename", "tmp"); - prompt_tokens = embd_inp; - } else if (slot.cmpl_type == SERVER_TASK_CMPL_TYPE_RERANK) { - // require slot.prompt to be array of 2 strings - if (!slot.prompt.is_array() || slot.prompt.size() != 2) { - SLT_ERR(slot, "%s", "invalid prompt for rerank task\n"); - slot.release(); - send_error(slot, "invalid prompt for rerank task", ERROR_TYPE_INVALID_REQUEST); - continue; - } + if (llama_token_fim_sep(model) != LLAMA_TOKEN_NULL) { + const auto k_fim_file = tokenize(filename + "\n", false, false); - // prompt: [BOS]query[EOS][SEP]doc[EOS] - prompt_tokens.clear(); - prompt_tokens.push_back(llama_token_bos(model)); - { - const auto part = tokenize(slot.prompt[0], false); - prompt_tokens.insert(prompt_tokens.end(), part.begin(), part.end()); - } - prompt_tokens.push_back(llama_token_eos(model)); - prompt_tokens.push_back(llama_token_sep(model)); - { - const auto part = tokenize(slot.prompt[1], false); - prompt_tokens.insert(prompt_tokens.end(), part.begin(), part.end()); - } - prompt_tokens.push_back(llama_token_eos(model)); - } else { - prompt_tokens = tokenize(slot.prompt, system_prompt.empty()); // add BOS if there isn't system prompt + slot.extra_tokens.insert(slot.extra_tokens.end(), llama_token_fim_sep(model)); + slot.extra_tokens.insert(slot.extra_tokens.end(), k_fim_file.begin(), k_fim_file.end()); + } else { + // chunk separator in binary form to avoid confusing the AI + static const char k_chunk_prefix_str[] = {0x0a, 0x0a, 0x2d, 0x2d, 0x2d, 0x20, 0x73, 0x6e, 0x69, 0x70, 0x70, 0x65, 0x74, 0x20, 0x2d, 0x2d, 0x2d, 0x0a, 0x0a, 0x00}; + static const auto k_chunk_prefix_tokens = tokenize(k_chunk_prefix_str, false, false); + + slot.extra_tokens.insert(slot.extra_tokens.end(), k_chunk_prefix_tokens.begin(), k_chunk_prefix_tokens.end()); + } + + const auto chunk_tokens = tokenize(text, false, false); + slot.extra_tokens.insert(slot.extra_tokens.end(), chunk_tokens.begin(), chunk_tokens.end()); + } + + if (llama_token_fim_sep(model) != LLAMA_TOKEN_NULL) { + // TODO: current filename + static const auto k_fim_file = tokenize("filename\n", false, false); + + slot.extra_tokens.insert(slot.extra_tokens.end(), llama_token_fim_sep(model)); + slot.extra_tokens.insert(slot.extra_tokens.end(), k_fim_file.begin(), k_fim_file.end()); + } + + // for now pick FIM context to fit in a batch (ratio prefix:suffix = 3:1, TODO: configurable?) + const int n_suffix_take = std::min(tokens_suffix.size(), (n_batch/4)); + const int n_prefix_take = std::min(tokens_prefix.size(), 3*(n_batch/4) - 3); + + // fill the rest of the context with extra chunks + const int n_extra_take = std::min(std::max(0, slot.n_ctx - (n_batch) - 2*slot.n_predict), slot.extra_tokens.size()); + + tokens_prefix.erase(tokens_prefix.begin(), tokens_prefix.begin() + tokens_prefix.size() - n_prefix_take); + tokens_suffix.resize(n_suffix_take); + + tokens_prefix.insert(tokens_prefix.begin(), llama_token_fim_pre(model)); + tokens_prefix.insert(tokens_prefix.end(), tokens_prompt.begin(), tokens_prompt.end()); + tokens_suffix.insert(tokens_suffix.begin(), llama_token_fim_suf(model)); + + auto embd_inp = params.spm_infill ? tokens_suffix : tokens_prefix; + auto embd_end = params.spm_infill ? tokens_prefix : tokens_suffix; + + if (llama_add_bos_token(model)) { + embd_inp.insert(embd_inp.begin(), llama_token_bos(model)); + } + + SLT_DBG(slot, "extra: n_ctx = %d, n_extra_take = %d, n_extra = %d\n", slot.n_ctx, n_extra_take, (int) slot.extra_tokens.size()); + + // put the extra context before the FIM prefix + embd_inp.insert(embd_inp.begin(), slot.extra_tokens.end() - n_extra_take, slot.extra_tokens.end()); + + embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end()); + embd_inp.push_back(llama_token_fim_mid(model)); + + prompt_tokens = std::move(embd_inp); + } break; } slot.n_past = 0; @@ -2050,6 +2051,19 @@ struct server_context { SLT_INF(slot, "prompt tokenized, n_ctx_slot = %d, n_keep = %d, n_prompt_tokens = %d\n", slot.n_ctx, slot.params.n_keep, slot.n_prompt_tokens); + // print prompt tokens (for debugging) + if (1) { + // first 16 tokens (avoid flooding logs) + for (int i = 0; i < std::min(16, prompt_tokens.size()); i++) { + SLT_DBG(slot, "prompt token %3d: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx, prompt_tokens[i]).c_str()); + } + } else { + // all + for (int i = 0; i < (int) prompt_tokens.size(); i++) { + SLT_DBG(slot, "prompt token %3d: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx, prompt_tokens[i]).c_str()); + } + } + // empty prompt passed -> release the slot and send empty response if (prompt_tokens.empty()) { SLT_WRN(slot, "%s", "empty prompt - releasing slot\n"); @@ -2070,7 +2084,9 @@ struct server_context { } else { if (!params.ctx_shift) { // if context shift is disabled, we make sure prompt size is smaller than KV size - if ((int) system_tokens.size() + slot.n_prompt_tokens >= slot.n_ctx) { + // TODO: there should be a separate parameter that control prompt truncation + // context shift should be applied only during the generation phase + if (slot.n_prompt_tokens >= slot.n_ctx) { slot.release(); send_error(slot, "the request exceeds the available context size. try increasing the context size or enable context shift", ERROR_TYPE_INVALID_REQUEST); continue; @@ -2081,8 +2097,8 @@ struct server_context { } slot.params.n_keep = std::min(slot.n_ctx - 4, slot.params.n_keep); - // if input prompt is too big, truncate it (if group attention self-extend is disabled) - if (slot.ga_n == 1 && slot.n_prompt_tokens >= slot.n_ctx) { + // if input prompt is too big, truncate it + if (slot.n_prompt_tokens >= slot.n_ctx) { const int n_left = slot.n_ctx - slot.params.n_keep; const int n_block_size = n_left / 2; @@ -2107,20 +2123,62 @@ struct server_context { GGML_ASSERT(slot.n_prompt_tokens < slot.n_ctx); } - gpt_sampler_reset(slot.smpl); - - if (!slot.params.cache_prompt) { - slot.n_past_se = 0; - slot.ga_i = 0; - } else { - GGML_ASSERT(slot.ga_n == 1); + common_sampler_reset(slot.smpl); + if (slot.params.cache_prompt) { // reuse any previously computed tokens that are common with the new prompt - slot.n_past = common_part(slot.cache_tokens, prompt_tokens); + slot.n_past = longest_common_prefix(slot.cache_tokens, prompt_tokens); // push the prompt into the sampling context (do not apply grammar) for (int i = 0; i < slot.n_past; ++i) { - gpt_sampler_accept(slot.smpl, slot.cache_tokens[i], false); + common_sampler_accept(slot.smpl, slot.cache_tokens[i], false); + } + + // reuse chunks from the cached prompt by shifting their KV cache in the new position + if (params.n_cache_reuse > 0) { + size_t head_c = slot.n_past; // cache + size_t head_p = slot.n_past; // current prompt + + SLT_DBG(slot, "trying to reuse chunks with size > %d, slot.n_past = %d\n", params.n_cache_reuse, slot.n_past); + + while (head_c < slot.cache_tokens.size() && + head_p < prompt_tokens.size()) { + + size_t n_match = 0; + while (head_c + n_match < slot.cache_tokens.size() && + head_p + n_match < prompt_tokens.size() && + slot.cache_tokens[head_c + n_match] == prompt_tokens[head_p + n_match]) { + + n_match++; + } + + if (n_match >= (size_t) params.n_cache_reuse) { + SLT_INF(slot, "reusing chunk with size %zu, shifting KV cache [%zu, %zu) -> [%zu, %zu)\n", n_match, head_c, head_c + n_match, head_p, head_p + n_match); + //for (size_t i = head_p; i < head_p + n_match; i++) { + // SLT_DBG(slot, "cache token %3zu: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx, prompt_tokens[i]).c_str()); + //} + + const int64_t kv_shift = (int64_t) head_p - (int64_t) head_c; + + llama_kv_cache_seq_rm (ctx, slot.id + 1, head_p, head_c); + llama_kv_cache_seq_add(ctx, slot.id + 1, head_c, -1, kv_shift); + + for (size_t i = 0; i < n_match; i++) { + slot.cache_tokens[head_p + i] = slot.cache_tokens[head_c + i]; + + common_sampler_accept(slot.smpl, slot.cache_tokens[head_p + i], false); + + slot.n_past++; + } + + head_c += n_match; + head_p += n_match; + } else { + head_c += 1; + } + } + + SLT_DBG(slot, "after context reuse, new slot.n_past = %d\n", slot.n_past); } } } @@ -2130,9 +2188,6 @@ struct server_context { SLT_WRN(slot, "need to evaluate at least 1 token to generate logits, n_past = %d, n_prompt_tokens = %d\n", slot.n_past, slot.n_prompt_tokens); slot.n_past--; - if (slot.ga_i > 0) { - slot.n_past_se--; - } } slot.n_prompt_tokens_processed = 0; @@ -2158,55 +2213,31 @@ struct server_context { } // keep only the common part - int p0 = (int) system_tokens.size() + slot.n_past; - if (!llama_kv_cache_seq_rm(ctx, slot.id + 1, p0, -1)) { + if (!llama_kv_cache_seq_rm(ctx, slot.id + 1, slot.n_past, -1)) { // could not partially delete (likely using a non-Transformer model) llama_kv_cache_seq_rm(ctx, slot.id + 1, -1, -1); - p0 = (int) system_tokens.size(); - if (p0 != 0) { - // copy over the system prompt when there is one - llama_kv_cache_seq_cp(ctx, 0, slot.id + 1, -1, -1); - } - - // there is no common part left (except for the system prompt) + // there is no common part left slot.n_past = 0; - slot.n_past_se = 0; - slot.ga_i = 0; - // TODO: is the system prompt ever in the sampling context? - gpt_sampler_reset(slot.smpl); + + common_sampler_reset(slot.smpl); } + SLT_INF(slot, "kv cache rm [%d, end)\n", slot.n_past); + // remove the non-common part from the cache slot.cache_tokens.resize(slot.n_past); - SLT_INF(slot, "kv cache rm [%d, end)\n", p0); - - int32_t slot_npast = slot.n_past_se > 0 ? slot.n_past_se : slot.n_past; - - int32_t ga_i = slot.ga_i; - int32_t ga_n = slot.ga_n; - int32_t ga_w = slot.ga_w; - // add prompt tokens for processing in the current batch - // TODO: the self-extend stuff here is a mess - simplify and/or abstract it somehow - for (; slot.n_past < slot.n_prompt_tokens && batch.n_tokens < n_batch; ++slot.n_past) { - if (slot.ga_n != 1) { - while (slot_npast >= ga_i + ga_w) { - const int bd = (ga_w/ga_n)*(ga_n - 1); - slot_npast -= bd; - ga_i += ga_w/ga_n; - } - } - - llama_batch_add(batch, prompt_tokens[slot.n_past], system_tokens.size() + slot_npast, { slot.id + 1 }, false); + while (slot.n_past < slot.n_prompt_tokens && batch.n_tokens < n_batch) { + common_batch_add(batch, prompt_tokens[slot.n_past], slot.n_past, { slot.id + 1 }, false); if (slot.params.cache_prompt) { slot.cache_tokens.push_back(prompt_tokens[slot.n_past]); } slot.n_prompt_tokens_processed++; - slot_npast++; + slot.n_past++; } SLT_INF(slot, "prompt processing progress, n_past = %d, n_tokens = %d, progress = %f\n", slot.n_past, batch.n_tokens, (float) slot.n_prompt_tokens_processed / slot.n_prompt_tokens); @@ -2247,34 +2278,6 @@ struct server_context { for (int32_t i = 0; i < batch.n_tokens; i += n_batch) { const int32_t n_tokens = std::min(n_batch, batch.n_tokens - i); - for (auto & slot : slots) { - if (slot.ga_n != 1) { - // context extension via Self-Extend - // TODO: simplify and/or abstract this - while (slot.n_past_se >= slot.ga_i + slot.ga_w) { - const int ib = (slot.ga_n * slot.ga_i) / slot.ga_w; - const int bd = (slot.ga_w / slot.ga_n) * (slot.ga_n - 1); - const int dd = (slot.ga_w / slot.ga_n) - ib * bd - slot.ga_w; - - SLT_DBG(slot, "shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i, slot.n_past_se, ib * bd, slot.ga_i + ib * bd, slot.n_past_se + ib * bd); - SLT_DBG(slot, "div: [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n); - SLT_DBG(slot, "shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd); - - llama_kv_cache_seq_add(ctx, slot.id + 1, slot.ga_i, slot.n_past_se, ib * bd); - llama_kv_cache_seq_div(ctx, slot.id + 1, slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n); - llama_kv_cache_seq_add(ctx, slot.id + 1, slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd); - - slot.n_past_se -= bd; - - slot.ga_i += slot.ga_w / slot.ga_n; - - SLT_DBG(slot, "\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", slot.n_past_se + bd, slot.n_past_se, slot.ga_i); - } - - slot.n_past_se += n_tokens; - } - } - llama_batch batch_view = { n_tokens, batch.token + i, @@ -2337,9 +2340,9 @@ struct server_context { } completion_token_output result; - const llama_token id = gpt_sampler_sample(slot.smpl, ctx, slot.i_batch - i); + const llama_token id = common_sampler_sample(slot.smpl, ctx, slot.i_batch - i); - gpt_sampler_accept(slot.smpl, id, true); + common_sampler_accept(slot.smpl, id, true); slot.n_decoded += 1; if (slot.n_decoded == 1) { @@ -2350,7 +2353,7 @@ struct server_context { result.tok = id; - const auto * cur_p = gpt_sampler_get_candidates(slot.smpl); + const auto * cur_p = common_sampler_get_candidates(slot.smpl); for (size_t i = 0; i < (size_t) slot.sparams.n_probs; ++i) { result.probs.push_back({ @@ -2414,13 +2417,13 @@ inline void signal_handler(int signal) { int main(int argc, char ** argv) { // own arguments required by this example - gpt_params params; + common_params params; - if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_SERVER)) { + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_SERVER)) { return 1; } - gpt_init(); + common_init(); // enabling this will output extra debug information in the HTTP responses from the server // see format_final_response_oaicompat() @@ -2429,10 +2432,6 @@ int main(int argc, char ** argv) { // struct that contains llama context and inference server_context ctx_server; - if (!params.system_prompt.empty()) { - ctx_server.system_prompt_set(params.system_prompt); - } - if (params.model_alias == "unknown") { params.model_alias = params.model; } @@ -2442,7 +2441,7 @@ int main(int argc, char ** argv) { LOG_INF("system info: n_threads = %d, n_threads_batch = %d, total_threads = %d\n", params.cpuparams.n_threads, params.cpuparams_batch.n_threads, std::thread::hardware_concurrency()); LOG_INF("\n"); - LOG_INF("%s\n", gpt_params_get_system_info(params).c_str()); + LOG_INF("%s\n", common_params_get_system_info(params).c_str()); LOG_INF("\n"); std::unique_ptr svr; @@ -2536,20 +2535,10 @@ int main(int argc, char ** argv) { // auto middleware_validate_api_key = [¶ms, &res_error](const httplib::Request & req, httplib::Response & res) { - // TODO: should we apply API key to all endpoints, including "/health" and "/models"? - static const std::unordered_set protected_endpoints = { - "/props", - "/completion", - "/completions", - "/v1/completions", - "/chat/completions", - "/v1/chat/completions", - "/infill", - "/tokenize", - "/detokenize", - "/embedding", - "/embeddings", - "/v1/embeddings", + static const std::unordered_set public_endpoints = { + "/health", + "/models", + "/v1/models", }; // If API key is not set, skip validation @@ -2557,8 +2546,8 @@ int main(int argc, char ** argv) { return true; } - // If path is not in protected_endpoints list, skip validation - if (protected_endpoints.find(req.path) == protected_endpoints.end()) { + // If path is public, skip validation + if (public_endpoints.find(req.path) != public_endpoints.end()) { return true; } @@ -2620,7 +2609,7 @@ int main(int argc, char ** argv) { const auto handle_slots = [&](const httplib::Request & req, httplib::Response & res) { if (!params.endpoint_slots) { - res_error(res, format_error_response("This server does not support slots endpoint. Start it without `--no-slots`", ERROR_TYPE_NOT_SUPPORTED)); + res_error(res, format_error_response("This server does not support slots endpoint. Start it with `--slots`", ERROR_TYPE_NOT_SUPPORTED)); return; } @@ -2869,24 +2858,28 @@ int main(int argc, char ** argv) { }; const auto handle_props = [&ctx_server, &res_ok](const httplib::Request &, httplib::Response & res) { - std::string template_key = "tokenizer.chat_template", curr_tmpl; - int32_t tlen = llama_model_meta_val_str(ctx_server.model, template_key.c_str(), nullptr, 0); - if (tlen > 0) { - std::vector curr_tmpl_buf(tlen + 1, 0); - if (llama_model_meta_val_str(ctx_server.model, template_key.c_str(), curr_tmpl_buf.data(), curr_tmpl_buf.size()) == tlen) { - curr_tmpl = std::string(curr_tmpl_buf.data(), tlen); - } - } json data = { - { "system_prompt", ctx_server.system_prompt.c_str() }, { "default_generation_settings", ctx_server.default_generation_settings_for_props }, { "total_slots", ctx_server.params.n_parallel }, - { "chat_template", curr_tmpl.c_str() }, + { "chat_template", llama_get_chat_template(ctx_server.model) }, }; res_ok(res, data); }; + const auto handle_props_change = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) { + if (!ctx_server.params.endpoint_props) { + res_error(res, format_error_response("This server does not support changing global properties. Start it with `--props`", ERROR_TYPE_NOT_SUPPORTED)); + return; + } + + json data = json::parse(req.body); + + // update any props here + + res_ok(res, {{ "success", true }}); + }; + const auto handle_completions_generic = [&ctx_server, &res_error, &res_ok](server_task_cmpl_type cmpl_type, json & data, httplib::Response & res) { if (ctx_server.params.embedding || ctx_server.params.reranking) { res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings` or `--reranking`", ERROR_TYPE_NOT_SUPPORTED)); @@ -2942,7 +2935,23 @@ int main(int argc, char ** argv) { return handle_completions_generic(SERVER_TASK_CMPL_TYPE_NORMAL, data, res); }; - const auto handle_infill = [&handle_completions_generic](const httplib::Request & req, httplib::Response & res) { + const auto handle_infill = [&ctx_server, &res_error, &handle_completions_generic](const httplib::Request & req, httplib::Response & res) { + std::string err; + if (llama_token_fim_pre(ctx_server.model) == LLAMA_TOKEN_NULL) { + err += "prefix token is missing. "; + } + if (llama_token_fim_suf(ctx_server.model) == LLAMA_TOKEN_NULL) { + err += "suffix token is missing. "; + } + if (llama_token_fim_mid(ctx_server.model) == LLAMA_TOKEN_NULL) { + err += "middle token is missing. "; + } + + if (!err.empty()) { + res_error(res, format_error_response(string_format("Infill is not supported by this model: %s", err.c_str()), ERROR_TYPE_NOT_SUPPORTED)); + return; + } + json data = json::parse(req.body); return handle_completions_generic(SERVER_TASK_CMPL_TYPE_INFILL, data, res); }; @@ -3028,11 +3037,12 @@ int main(int argc, char ** argv) { if (body.count("content") != 0) { const bool add_special = json_value(body, "add_special", false); const bool with_pieces = json_value(body, "with_pieces", false); - std::vector tokens = ctx_server.tokenize(body.at("content"), add_special); + + std::vector tokens = ctx_server.tokenize(body.at("content"), add_special, true); if (with_pieces) { for (const auto& token : tokens) { - std::string piece = llama_token_to_piece(ctx_server.ctx, token); + std::string piece = common_token_to_piece(ctx_server.ctx, token); json piece_json; // Check if the piece is valid UTF-8 @@ -3265,30 +3275,39 @@ int main(int argc, char ** argv) { svr->set_base_dir(params.public_path); } - // using embedded static files - svr->Get("/", handle_static_file(index_html, index_html_len, "text/html; charset=utf-8")); - svr->Get("/index.js", handle_static_file(index_js, index_js_len, "text/javascript; charset=utf-8")); - svr->Get("/completion.js", handle_static_file(completion_js, completion_js_len, "text/javascript; charset=utf-8")); - svr->Get("/json-schema-to-grammar.mjs", handle_static_file(json_schema_to_grammar_mjs, json_schema_to_grammar_mjs_len, "text/javascript; charset=utf-8")); + if (!params.api_keys.empty()) { + // for now, if API key is set, web UI is unusable + svr->Get("/", [&](const httplib::Request &, httplib::Response & res) { + return res.set_content("Web UI is disabled because API key is set.", "text/html; charset=utf-8"); + }); + } else { + // using embedded static files + svr->Get("/", handle_static_file(index_html, index_html_len, "text/html; charset=utf-8")); + svr->Get("/index.js", handle_static_file(index_js, index_js_len, "text/javascript; charset=utf-8")); + svr->Get("/completion.js", handle_static_file(completion_js, completion_js_len, "text/javascript; charset=utf-8")); + svr->Get("/json-schema-to-grammar.mjs", handle_static_file(json_schema_to_grammar_mjs, json_schema_to_grammar_mjs_len, "text/javascript; charset=utf-8")); - // add new-ui files - svr->Get("/colorthemes.css", handle_static_file(colorthemes_css, colorthemes_css_len, "text/css; charset=utf-8")); - svr->Get("/style.css", handle_static_file(style_css, style_css_len, "text/css; charset=utf-8")); - svr->Get("/theme-beeninorder.css", handle_static_file(theme_beeninorder_css, theme_beeninorder_css_len, "text/css; charset=utf-8")); - svr->Get("/theme-ketivah.css", handle_static_file(theme_ketivah_css, theme_ketivah_css_len, "text/css; charset=utf-8")); - svr->Get("/theme-mangotango.css", handle_static_file(theme_mangotango_css, theme_mangotango_css_len, "text/css; charset=utf-8")); - svr->Get("/theme-playground.css", handle_static_file(theme_playground_css, theme_playground_css_len, "text/css; charset=utf-8")); - svr->Get("/theme-polarnight.css", handle_static_file(theme_polarnight_css, theme_polarnight_css_len, "text/css; charset=utf-8")); - svr->Get("/theme-snowstorm.css", handle_static_file(theme_snowstorm_css, theme_snowstorm_css_len, "text/css; charset=utf-8")); - svr->Get("/index-new.html", handle_static_file(index_new_html, index_new_html_len, "text/html; charset=utf-8")); - svr->Get("/system-prompts.js", handle_static_file(system_prompts_js, system_prompts_js_len, "text/javascript; charset=utf-8")); - svr->Get("/prompt-formats.js", handle_static_file(prompt_formats_js, prompt_formats_js_len, "text/javascript; charset=utf-8")); + // add new-ui files + svr->Get("/colorthemes.css", handle_static_file(colorthemes_css, colorthemes_css_len, "text/css; charset=utf-8")); + svr->Get("/style.css", handle_static_file(style_css, style_css_len, "text/css; charset=utf-8")); + svr->Get("/theme-beeninorder.css", handle_static_file(theme_beeninorder_css, theme_beeninorder_css_len, "text/css; charset=utf-8")); + svr->Get("/theme-ketivah.css", handle_static_file(theme_ketivah_css, theme_ketivah_css_len, "text/css; charset=utf-8")); + svr->Get("/theme-mangotango.css", handle_static_file(theme_mangotango_css, theme_mangotango_css_len, "text/css; charset=utf-8")); + svr->Get("/theme-playground.css", handle_static_file(theme_playground_css, theme_playground_css_len, "text/css; charset=utf-8")); + svr->Get("/theme-polarnight.css", handle_static_file(theme_polarnight_css, theme_polarnight_css_len, "text/css; charset=utf-8")); + svr->Get("/theme-snowstorm.css", handle_static_file(theme_snowstorm_css, theme_snowstorm_css_len, "text/css; charset=utf-8")); + svr->Get("/index-new.html", handle_static_file(index_new_html, index_new_html_len, "text/html; charset=utf-8")); + svr->Get("/system-prompts.js", handle_static_file(system_prompts_js, system_prompts_js_len, "text/javascript; charset=utf-8")); + svr->Get("/prompt-formats.js", handle_static_file(prompt_formats_js, prompt_formats_js_len, "text/javascript; charset=utf-8")); + } // register API routes - svr->Get ("/health", handle_health); + svr->Get ("/health", handle_health); // public endpoint (no API key check) svr->Get ("/metrics", handle_metrics); svr->Get ("/props", handle_props); - svr->Get ("/v1/models", handle_models); + svr->Post("/props", handle_props_change); + svr->Get ("/models", handle_models); // public endpoint (no API key check) + svr->Get ("/v1/models", handle_models); // public endpoint (no API key check) svr->Post("/completion", handle_completions); // legacy svr->Post("/completions", handle_completions); svr->Post("/v1/completions", handle_completions); @@ -3366,10 +3385,11 @@ int main(int argc, char ** argv) { } // print sample chat example to make it clear which template is used - LOG_INF("%s: chat template, built_in: %d, chat_example: '%s'\n", __func__, params.chat_template.empty(), llama_chat_format_example(ctx_server.model, params.chat_template).c_str()); + LOG_INF("%s: chat template, built_in: %d, chat_example: '%s'\n", __func__, params.chat_template.empty(), common_chat_format_example(ctx_server.model, params.chat_template).c_str()); ctx_server.queue_tasks.on_new_task(std::bind( &server_context::process_single_task, &ctx_server, std::placeholders::_1)); + ctx_server.queue_tasks.on_update_slots(std::bind( &server_context::update_slots, &ctx_server)); diff --git a/examples/server/tests/features/ctx_shift.feature b/examples/server/tests/features/ctx_shift.feature index ba3afcf06..ae6c6b01b 100644 --- a/examples/server/tests/features/ctx_shift.feature +++ b/examples/server/tests/features/ctx_shift.feature @@ -13,6 +13,10 @@ Feature: llama.cpp server And 32 as batch size And 2 slots + # the prompt is 301 tokens + # the slot context is 256/2 = 128 tokens + # the prompt is truncated to keep the last 109 tokens + # 64 tokens are generated thanks to shifting the context when it gets full Scenario: Inference with context shift And 64 server max tokens to predict Then the server is starting diff --git a/examples/server/tests/features/security.feature b/examples/server/tests/features/security.feature index eb82e7aca..0a3c5cc77 100644 --- a/examples/server/tests/features/security.feature +++ b/examples/server/tests/features/security.feature @@ -5,7 +5,7 @@ Feature: Security Background: Server startup with an api key defined Given a server listening on localhost:8080 And a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models - And a server api key llama.cpp + And a server api key THIS_IS_THE_KEY Then the server is starting Then the server is healthy @@ -16,11 +16,11 @@ Feature: Security And a completion request with api error Examples: Prompts - | api_key | api_error | - | llama.cpp | no | - | llama.cpp | no | - | hackeme | raised | - | | raised | + | api_key | api_error | + | THIS_IS_THE_KEY | no | + | THIS_IS_THE_KEY | no | + | hackeme | raised | + | | raised | Scenario Outline: OAI Compatibility Given a system prompt test @@ -32,10 +32,10 @@ Feature: Security Given an OAI compatible chat completions request with api error Examples: Prompts - | api_key | api_error | - | llama.cpp | no | - | llama.cpp | no | - | hackme | raised | + | api_key | api_error | + | THIS_IS_THE_KEY | no | + | THIS_IS_THE_KEY | no | + | hackme | raised | Scenario Outline: OAI Compatibility (invalid response formats) Given a system prompt test @@ -55,7 +55,7 @@ Feature: Security Scenario Outline: CORS Options - Given a user api key llama.cpp + Given a user api key THIS_IS_THE_KEY When an OPTIONS request is sent from Then CORS header is set to diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py index 2611614ba..540a2ecd5 100644 --- a/examples/server/tests/features/steps/steps.py +++ b/examples/server/tests/features/steps/steps.py @@ -1299,7 +1299,8 @@ async def wait_for_slots_status(context, async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session: while True: - async with await session.get(f'{base_url}/slots', params=params) as slots_response: + headers = {'Authorization': f'Bearer {context.server_api_key}'} + async with await session.get(f'{base_url}/slots', params=params, headers=headers) as slots_response: status_code = slots_response.status slots = await slots_response.json() if context.debug: @@ -1387,6 +1388,7 @@ def start_server_background(context): context.server_path = os.environ['LLAMA_SERVER_BIN_PATH'] server_listen_addr = context.server_fqdn server_args = [ + '--slots', # requires to get slot status via /slots endpoint '--host', server_listen_addr, '--port', context.server_port, ] diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index 47dfdfde5..69519ef95 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -57,7 +57,7 @@ static T json_value(const json & body, const std::string & key, const T & defaul // Format given chat. If tmpl is empty, we take the template from model metadata inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector & messages) { - std::vector chat; + std::vector chat; for (size_t i = 0; i < messages.size(); ++i) { const auto & curr_msg = messages[i]; @@ -84,12 +84,25 @@ inline std::string format_chat(const struct llama_model * model, const std::stri chat.push_back({role, content}); } - const auto formatted_chat = llama_chat_apply_template(model, tmpl, chat, true); + const auto formatted_chat = common_chat_apply_template(model, tmpl, chat, true); LOG_DBG("formatted_chat: '%s'\n", formatted_chat.c_str()); return formatted_chat; } +static std::string llama_get_chat_template(const struct llama_model * model) { + std::string template_key = "tokenizer.chat_template"; + // call with NULL buffer to get the total size of the string + int32_t res = llama_model_meta_val_str(model, template_key.c_str(), NULL, 0); + if (res < 0) { + return ""; + } else { + std::vector model_template(res, 0); + llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size()); + return std::string(model_template.data(), model_template.size()); + } +} + // // base64 utils (TODO: move to common in the future) // @@ -182,14 +195,14 @@ static std::string gen_chatcmplid() { // other common utils // -static size_t common_part(const std::vector & a, const std::vector & b) { +static size_t longest_common_prefix(const std::vector & a, const std::vector & b) { size_t i; for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {} return i; } -static size_t common_part(const std::string & a, const std::string & b) { +static size_t longest_common_prefix(const std::string & a, const std::string & b) { size_t i; for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {} @@ -233,7 +246,7 @@ template static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) { std::string ret; for (; begin != end; ++begin) { - ret += llama_token_to_piece(ctx, *begin); + ret += common_token_to_piece(ctx, *begin); } return ret; @@ -241,7 +254,7 @@ static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) { // format incomplete utf-8 multibyte character for output static std::string tokens_to_output_formatted_string(const llama_context * ctx, const llama_token token) { - std::string out = token == -1 ? "" : llama_token_to_piece(ctx, token); + std::string out = token == -1 ? "" : common_token_to_piece(ctx, token); // if the size is 1 and first bit is 1, meaning it's a partial character // (size > 1 meaning it's already a known token) @@ -347,9 +360,9 @@ static json oaicompat_completion_params_parse( // Handle "logprobs" field // TODO: The response format of this option is not yet OAI-compatible, but seems like no one really using it; We may need to fix it in the future - if (body.contains("logprobs")) { + if (json_value(body, "logprobs", false)) { llama_params["n_probs"] = json_value(body, "top_logprobs", 20); - } else if (body.contains("top_logprobs")) { + } else if (body.contains("top_logprobs") && !body.at("top_logprobs").is_null()) { throw std::runtime_error("top_logprobs requires logprobs to be set to true"); } diff --git a/examples/simple/CMakeLists.txt b/examples/simple/CMakeLists.txt index 070cfbe7a..b63afbb8b 100644 --- a/examples/simple/CMakeLists.txt +++ b/examples/simple/CMakeLists.txt @@ -1,5 +1,5 @@ set(TARGET llama-simple) add_executable(${TARGET} simple.cpp) install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp index c2b7267c8..be91b2891 100644 --- a/examples/simple/simple.cpp +++ b/examples/simple/simple.cpp @@ -1,50 +1,112 @@ -#include "arg.h" -#include "common.h" -#include "log.h" #include "llama.h" - +#include +#include +#include #include static void print_usage(int, char ** argv) { - LOG("\nexample usage:\n"); - LOG("\n %s -m model.gguf -p \"Hello my name is\" -n 32\n", argv[0]); - LOG("\n"); + printf("\nexample usage:\n"); + printf("\n %s -m model.gguf [-n n_predict] [-ngl n_gpu_layers] [prompt]\n", argv[0]); + printf("\n"); } int main(int argc, char ** argv) { - gpt_params params; + // path to the model gguf file + std::string model_path; + // prompt to generate text from + std::string prompt = "Hello my name is"; + // number of layers to offload to the GPU + int ngl = 99; + // number of tokens to predict + int n_predict = 32; - params.prompt = "Hello my name is"; - params.n_predict = 32; + // parse command line arguments - if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) { - return 1; + { + int i = 1; + for (; i < argc; i++) { + if (strcmp(argv[i], "-m") == 0) { + if (i + 1 < argc) { + model_path = argv[++i]; + } else { + print_usage(argc, argv); + return 1; + } + } else if (strcmp(argv[i], "-n") == 0) { + if (i + 1 < argc) { + try { + n_predict = std::stoi(argv[++i]); + } catch (...) { + print_usage(argc, argv); + return 1; + } + } else { + print_usage(argc, argv); + return 1; + } + } else if (strcmp(argv[i], "-ngl") == 0) { + if (i + 1 < argc) { + try { + ngl = std::stoi(argv[++i]); + } catch (...) { + print_usage(argc, argv); + return 1; + } + } else { + print_usage(argc, argv); + return 1; + } + } else { + // prompt starts here + break; + } + } + if (model_path.empty()) { + print_usage(argc, argv); + return 1; + } + if (i < argc) { + prompt = argv[i++]; + for (; i < argc; i++) { + prompt += " "; + prompt += argv[i]; + } + } } - gpt_init(); - - // total length of the sequence including the prompt - const int n_predict = params.n_predict; - - // init LLM - - llama_backend_init(); - llama_numa_init(params.numa); - // initialize the model - llama_model_params model_params = llama_model_params_from_gpt_params(params); + llama_model_params model_params = llama_model_default_params(); + model_params.n_gpu_layers = ngl; - llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params); + llama_model * model = llama_load_model_from_file(model_path.c_str(), model_params); if (model == NULL) { fprintf(stderr , "%s: error: unable to load model\n" , __func__); return 1; } + // tokenize the prompt + + // find the number of tokens in the prompt + const int n_prompt = -llama_tokenize(model, prompt.c_str(), prompt.size(), NULL, 0, true, true); + + // allocate space for the tokens and tokenize the prompt + std::vector prompt_tokens(n_prompt); + if (llama_tokenize(model, prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), true, true) < 0) { + fprintf(stderr, "%s: error: failed to tokenize the prompt\n", __func__); + return 1; + } + // initialize the context - llama_context_params ctx_params = llama_context_params_from_gpt_params(params); + llama_context_params ctx_params = llama_context_default_params(); + // n_ctx is the context size + ctx_params.n_ctx = n_prompt + n_predict - 1; + // n_batch is the maximum number of tokens that can be processed in a single call to llama_decode + ctx_params.n_batch = n_prompt; + // enable performance counters + ctx_params.no_perf = false; llama_context * ctx = llama_new_context_with_model(model, ctx_params); @@ -53,117 +115,87 @@ int main(int argc, char ** argv) { return 1; } + // initialize the sampler + auto sparams = llama_sampler_chain_default_params(); - sparams.no_perf = false; - llama_sampler * smpl = llama_sampler_chain_init(sparams); llama_sampler_chain_add(smpl, llama_sampler_init_greedy()); - // tokenize the prompt - - std::vector tokens_list; - tokens_list = ::llama_tokenize(ctx, params.prompt, true); - - const int n_ctx = llama_n_ctx(ctx); - const int n_kv_req = tokens_list.size() + (n_predict - tokens_list.size()); - - LOG("\n"); - LOG_INF("%s: n_predict = %d, n_ctx = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, n_kv_req); - - // make sure the KV cache is big enough to hold all the prompt and generated tokens - if (n_kv_req > n_ctx) { - LOG_ERR("%s: error: n_kv_req > n_ctx, the required KV cache size is not big enough\n", __func__); - LOG_ERR("%s: either reduce n_predict or increase n_ctx\n", __func__); - return 1; - } - // print the prompt token-by-token - LOG("\n"); - - for (auto id : tokens_list) { - LOG("%s", llama_token_to_piece(ctx, id).c_str()); + for (auto id : prompt_tokens) { + char buf[128]; + int n = llama_token_to_piece(model, id, buf, sizeof(buf), 0, true); + if (n < 0) { + fprintf(stderr, "%s: error: failed to convert token to piece\n", __func__); + return 1; + } + std::string s(buf, n); + printf("%s", s.c_str()); } - // create a llama_batch with size 512 - // we use this object to submit token data for decoding + // prepare a batch for the prompt - llama_batch batch = llama_batch_init(512, 0, 1); - - // evaluate the initial prompt - for (size_t i = 0; i < tokens_list.size(); i++) { - llama_batch_add(batch, tokens_list[i], i, { 0 }, false); - } - - // llama_decode will output logits only for the last token of the prompt - batch.logits[batch.n_tokens - 1] = true; - - if (llama_decode(ctx, batch) != 0) { - LOG("%s: llama_decode() failed\n", __func__); - return 1; - } + llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size(), 0, 0); // main loop - int n_cur = batch.n_tokens; - int n_decode = 0; - const auto t_main_start = ggml_time_us(); + int n_decode = 0; + llama_token new_token_id; + + for (int n_pos = 0; n_pos + batch.n_tokens < n_prompt + n_predict; ) { + // evaluate the current batch with the transformer model + if (llama_decode(ctx, batch)) { + fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1); + return 1; + } + + n_pos += batch.n_tokens; - while (n_cur <= n_predict) { // sample the next token { - const llama_token new_token_id = llama_sampler_sample(smpl, ctx, -1); + new_token_id = llama_sampler_sample(smpl, ctx, -1); // is it an end of generation? - if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) { - LOG("\n"); - + if (llama_token_is_eog(model, new_token_id)) { break; } - LOG("%s", llama_token_to_piece(ctx, new_token_id).c_str()); + char buf[128]; + int n = llama_token_to_piece(model, new_token_id, buf, sizeof(buf), 0, true); + if (n < 0) { + fprintf(stderr, "%s: error: failed to convert token to piece\n", __func__); + return 1; + } + std::string s(buf, n); + printf("%s", s.c_str()); fflush(stdout); - // prepare the next batch - llama_batch_clear(batch); - - // push this new token for next evaluation - llama_batch_add(batch, new_token_id, n_cur, { 0 }, true); + // prepare the next batch with the sampled token + batch = llama_batch_get_one(&new_token_id, 1, n_pos, 0); n_decode += 1; } - - n_cur += 1; - - // evaluate the current batch with the transformer model - if (llama_decode(ctx, batch)) { - LOG_ERR("%s : failed to eval, return code %d\n", __func__, 1); - return 1; - } } - LOG("\n"); + printf("\n"); const auto t_main_end = ggml_time_us(); - LOG_INF("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n", + fprintf(stderr, "%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n", __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f)); - LOG("\n"); + fprintf(stderr, "\n"); llama_perf_sampler_print(smpl); llama_perf_context_print(ctx); + fprintf(stderr, "\n"); - LOG("\n"); - - llama_batch_free(batch); llama_sampler_free(smpl); llama_free(ctx); llama_free_model(model); - llama_backend_free(); - return 0; } diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index adf6255e1..5a7b3084f 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -26,20 +26,20 @@ struct seq_draft { std::vector tokens; std::vector> dists; - struct gpt_sampler * smpl = nullptr; + struct common_sampler * smpl = nullptr; }; int main(int argc, char ** argv) { - gpt_params params; + common_params params; // needed to get candidate probs even for temp <= 0.0 params.sparams.n_probs = 128; - if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_SPECULATIVE)) { + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_SPECULATIVE)) { return 1; } - gpt_init(); + common_init(); if (params.model_draft.empty()) { LOG_ERR("%s: --model-draft is required\n", __func__); @@ -66,7 +66,7 @@ int main(int argc, char ** argv) { llama_context * ctx_dft = NULL; // load the target model - llama_init_result llama_init_tgt = llama_init_from_gpt_params(params); + common_init_result llama_init_tgt = common_init_from_params(params); model_tgt = llama_init_tgt.model; ctx_tgt = llama_init_tgt.context; @@ -78,7 +78,7 @@ int main(int argc, char ** argv) { } params.cpuparams_batch.n_threads = params.draft_cpuparams_batch.n_threads; - llama_init_result llama_init_dft = llama_init_from_gpt_params(params); + common_init_result llama_init_dft = common_init_from_params(params); model_dft = llama_init_dft.model; ctx_dft = llama_init_dft.context; @@ -124,8 +124,8 @@ int main(int argc, char ** argv) { if (std::strcmp(token_text_tgt, token_text_dft) != 0) { LOG_ERR("%s: draft model vocab must match target model to use speculation but ", __func__); LOG_ERR("token %d content differs - target '%s', draft '%s'\n", i, - llama_token_to_piece(ctx_tgt, i).c_str(), - llama_token_to_piece(ctx_dft, i).c_str()); + common_token_to_piece(ctx_tgt, i).c_str(), + common_token_to_piece(ctx_dft, i).c_str()); return 1; } } @@ -134,7 +134,7 @@ int main(int argc, char ** argv) { // Tokenize the prompt std::vector inp; - inp = ::llama_tokenize(ctx_tgt, params.prompt, true, true); + inp = common_tokenize(ctx_tgt, params.prompt, true, true); const int max_context_size = llama_n_ctx(ctx_tgt); const int max_tokens_list_size = max_context_size - 4; @@ -147,7 +147,7 @@ int main(int argc, char ** argv) { LOG("\n\n"); for (auto id : inp) { - LOG("%s", llama_token_to_piece(ctx_tgt, id).c_str()); + LOG("%s", common_token_to_piece(ctx_tgt, id).c_str()); } const int n_input = inp.size(); @@ -178,7 +178,7 @@ int main(int argc, char ** argv) { bool has_eos = false; // target model sampling context (reuse the llama_context's sampling instance) - struct gpt_sampler * smpl = gpt_sampler_init(model_tgt, params.sparams); + struct common_sampler * smpl = common_sampler_init(model_tgt, params.sparams); struct llama_sampler * softmax = llama_sampler_init_softmax(); @@ -186,8 +186,8 @@ int main(int argc, char ** argv) { std::vector drafts(n_seq_dft); for (int s = 0; s < n_seq_dft; ++s) { - // allocate gpt_sampler for each draft sequence - drafts[s].smpl = gpt_sampler_init(model_dft, params.sparams); + // allocate llama_sampler for each draft sequence + drafts[s].smpl = common_sampler_init(model_dft, params.sparams); } llama_batch batch_dft = llama_batch_init(params.n_ctx, 0, 1); @@ -229,9 +229,9 @@ int main(int argc, char ** argv) { bool accept = false; if (params.sparams.temp > 0) { // stochastic verification - gpt_sampler_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft], true); + common_sampler_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft], true); - auto & dist_tgt = *gpt_sampler_get_candidates(smpl); + auto & dist_tgt = *common_sampler_get_candidates(smpl); float p_tgt = 0.0f; float p_dft = 0.0f; @@ -277,13 +277,13 @@ int main(int argc, char ** argv) { s_keep = s; accept = true; token_id = drafts[s].tokens[i_dft]; - token_str = llama_token_to_piece(ctx_tgt, token_id); - gpt_sampler_accept(smpl, token_id, true); + token_str = common_token_to_piece(ctx_tgt, token_id); + common_sampler_accept(smpl, token_id, true); LOG_DBG("draft token %d of sequence %d (%d, '%s') accepted\n", i_dft, s, token_id, token_str.c_str()); break; } else { - LOG_DBG("draft token %d of sequence %d (%d, '%s') rejected\n", i_dft, s, drafts[s].tokens[i_dft], llama_token_to_piece(ctx_tgt, drafts[s].tokens[i_dft]).c_str()); + LOG_DBG("draft token %d of sequence %d (%d, '%s') rejected\n", i_dft, s, drafts[s].tokens[i_dft], common_token_to_piece(ctx_tgt, drafts[s].tokens[i_dft]).c_str()); drafts[s].active = false; // calculate residual probability @@ -349,19 +349,19 @@ int main(int argc, char ** argv) { const int idx = dist(rng); token_id = dist_tgt.data[idx].id; - gpt_sampler_accept(smpl, token_id, true); - token_str = llama_token_to_piece(ctx_tgt, token_id); + common_sampler_accept(smpl, token_id, true); + token_str = common_token_to_piece(ctx_tgt, token_id); } } else { // greedy verification // sample from the target model LOG_DBG("sampling target: s_keep = %3d, i_dft = %3d, i_batch_tgt = %3d\n", s_keep, i_dft, drafts[s_keep].i_batch_tgt[i_dft]); - token_id = gpt_sampler_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft]); + token_id = common_sampler_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft]); - gpt_sampler_accept(smpl, token_id, true); + common_sampler_accept(smpl, token_id, true); - token_str = llama_token_to_piece(ctx_tgt, token_id); + token_str = common_token_to_piece(ctx_tgt, token_id); for (int s = 0; s < n_seq_dft; ++s) { if (!drafts[s].active) { @@ -431,8 +431,8 @@ int main(int argc, char ** argv) { drafts[0].dists.push_back(std::vector()); drafts[0].i_batch_tgt.push_back(0); - llama_batch_clear(batch_dft); - llama_batch_add (batch_dft, token_id, n_past_dft, { 0 }, true); + common_batch_clear(batch_dft); + common_batch_add (batch_dft, token_id, n_past_dft, { 0 }, true); llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, -1); // LOG_DBG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str()); @@ -446,9 +446,9 @@ int main(int argc, char ** argv) { } if (drafts[0].smpl) { - gpt_sampler_free(drafts[0].smpl); + common_sampler_free(drafts[0].smpl); } - drafts[0].smpl = gpt_sampler_clone(smpl); + drafts[0].smpl = common_sampler_clone(smpl); int n_seq_cur = 1; int n_past_cur = n_past_dft; @@ -461,8 +461,8 @@ int main(int argc, char ** argv) { drafts[0].drafting = true; drafts[0].i_batch_dft = 0; - llama_batch_clear(batch_tgt); - llama_batch_add (batch_tgt, drafts[0].tokens[0], n_past_tgt, { 0 }, true); + common_batch_clear(batch_tgt); + common_batch_add (batch_tgt, drafts[0].tokens[0], n_past_tgt, { 0 }, true); // sample n_draft tokens from the draft model using tree-based sampling for (int i = 0; i < n_draft; ++i) { @@ -477,13 +477,13 @@ int main(int argc, char ** argv) { continue; } - gpt_sampler_sample(drafts[s].smpl, ctx_dft, drafts[s].i_batch_dft, true); + common_sampler_sample(drafts[s].smpl, ctx_dft, drafts[s].i_batch_dft, true); - const auto * cur_p = gpt_sampler_get_candidates(drafts[s].smpl); + const auto * cur_p = common_sampler_get_candidates(drafts[s].smpl); for (int k = 0; k < std::min(n_seq_dft + 3, (int) cur_p->size); ++k) { LOG_DBG(" - draft candidate %3d for seq %3d, pos %3d: %6d (%8.3f) '%s'\n", - k, s, i, cur_p->data[k].id, cur_p->data[k].p, llama_token_to_piece(ctx_dft, cur_p->data[k].id).c_str()); + k, s, i, cur_p->data[k].id, cur_p->data[k].p, common_token_to_piece(ctx_dft, cur_p->data[k].id).c_str()); } std::vector sa(1, s); @@ -518,9 +518,9 @@ int main(int argc, char ** argv) { drafts[n_seq_cur].i_batch_tgt = drafts[s].i_batch_tgt; if (drafts[n_seq_cur].smpl) { - gpt_sampler_free(drafts[n_seq_cur].smpl); + common_sampler_free(drafts[n_seq_cur].smpl); } - drafts[n_seq_cur].smpl = gpt_sampler_clone(drafts[s].smpl); + drafts[n_seq_cur].smpl = common_sampler_clone(drafts[s].smpl); sa.push_back(n_seq_cur); @@ -536,7 +536,7 @@ int main(int argc, char ** argv) { const int s = sa[is]; - gpt_sampler_accept(drafts[s].smpl, id, true); + common_sampler_accept(drafts[s].smpl, id, true); drafts[s].tokens.push_back(id); // save cur_p.data into drafts[s].dists @@ -545,12 +545,12 @@ int main(int argc, char ** argv) { // add unique drafted tokens to the target batch drafts[s].i_batch_tgt.push_back(batch_tgt.n_tokens); - llama_batch_add(batch_tgt, id, n_past_tgt + i + 1, { s }, true); + common_batch_add(batch_tgt, id, n_past_tgt + i + 1, { s }, true); // add the token to the batch for batched decoding with the draft model drafts[s].i_batch_dft = batch_dft.n_tokens; - llama_batch_add(batch_dft, id, n_past_cur, { s }, true); + common_batch_add(batch_dft, id, n_past_cur, { s }, true); if (batch_tgt.n_tokens > n_draft) { drafts[s].drafting = false; @@ -617,11 +617,11 @@ int main(int argc, char ** argv) { LOG_INF("\n"); LOG_INF("target:\n\n"); - gpt_perf_print(ctx_tgt, smpl); + common_perf_print(ctx_tgt, smpl); - gpt_sampler_free(smpl); + common_sampler_free(smpl); for (int s = 0; s < n_seq_dft; ++s) { - gpt_sampler_free(drafts[s].smpl); + common_sampler_free(drafts[s].smpl); } llama_sampler_free(softmax); diff --git a/examples/tokenize/tokenize.cpp b/examples/tokenize/tokenize.cpp index a9af6471f..12ad54256 100644 --- a/examples/tokenize/tokenize.cpp +++ b/examples/tokenize/tokenize.cpp @@ -365,7 +365,7 @@ int main(int raw_argc, char ** raw_argv) { const bool parse_special = !no_parse_special; std::vector tokens; - tokens = ::llama_tokenize(model, prompt, add_bos, parse_special); + tokens = common_tokenize(model, prompt, add_bos, parse_special); if (printing_ids) { printf("["); @@ -380,7 +380,7 @@ int main(int raw_argc, char ** raw_argv) { } else { bool invalid_utf8 = false; printf("%6d -> '", tokens[i]); - write_utf8_cstr_to_stdout(llama_token_to_piece(ctx, tokens[i]).c_str(), invalid_utf8); + write_utf8_cstr_to_stdout(common_token_to_piece(ctx, tokens[i]).c_str(), invalid_utf8); if (invalid_utf8) { printf("' (utf-8 decode failure)\n"); } else { diff --git a/flake.lock b/flake.lock index 3fb6ced51..702527028 100644 --- a/flake.lock +++ b/flake.lock @@ -20,11 +20,11 @@ }, "nixpkgs": { "locked": { - "lastModified": 1728018373, - "narHash": "sha256-NOiTvBbRLIOe5F6RbHaAh6++BNjsb149fGZd1T4+KBg=", + "lastModified": 1728492678, + "narHash": "sha256-9UTxR8eukdg+XZeHgxW5hQA9fIKHsKCdOIUycTryeVw=", "owner": "NixOS", "repo": "nixpkgs", - "rev": "bc947f541ae55e999ffdb4013441347d83b00feb", + "rev": "5633bcff0c6162b9e4b5f1264264611e950c8ec7", "type": "github" }, "original": { diff --git a/ggml/include/ggml-rpc.h b/ggml/include/ggml-rpc.h index 64cde7f13..d57967368 100644 --- a/ggml/include/ggml-rpc.h +++ b/ggml/include/ggml-rpc.h @@ -17,7 +17,11 @@ GGML_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * en GGML_API void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total); -GGML_API void start_rpc_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem); +GGML_API void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem); + +GGML_API ggml_backend_reg_t ggml_backend_rpc_reg(void); + +GGML_API ggml_backend_dev_t ggml_backend_rpc_add_device(const char * endpoint); #ifdef __cplusplus } diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index e7678d071..4508da4fb 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -2535,7 +2535,7 @@ extern "C" { typedef void (*ggml_gemm_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y, int nr, int nc); - typedef struct { + struct ggml_type_traits { const char * type_name; int64_t blck_size; int64_t blck_size_interleave; // interleave elements in blocks @@ -2551,9 +2551,9 @@ extern "C" { int64_t ncols; // number of columns to process simultaneously ggml_gemv_t gemv; ggml_gemm_t gemm; - } ggml_type_traits_t; + }; - GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type); + GGML_API const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type); #ifdef __cplusplus } diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index f126ebf7e..676f85a36 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -163,8 +163,8 @@ if (GGML_OPENMP) list(APPEND GGML_EXTRA_LIBS_PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX) if (GGML_MUSA) - list(APPEND GGML_EXTRA_INCLUDES "/usr/lib/llvm-10/include/openmp") - list(APPEND GGML_EXTRA_LIBS_PRIVATE "/usr/lib/llvm-10/lib/libomp.so") + list(APPEND GGML_EXTRA_INCLUDES "/usr/lib/llvm-14/lib/clang/14.0.0/include") + list(APPEND GGML_EXTRA_LIBS_PRIVATE "/usr/lib/llvm-14/lib/libomp.so") endif() else() message(WARNING "OpenMP not found") diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c index 70187b9b6..041de9e3e 100644 --- a/ggml/src/ggml-alloc.c +++ b/ggml/src/ggml-alloc.c @@ -14,7 +14,7 @@ //#define GGML_ALLOCATOR_DEBUG -//#define AT_PRINTF(...) fprintf(stderr, __VA_ARGS__) +//#define AT_PRINTF(...) GGML_LOG_DEBUG(__VA_ARGS__) #define AT_PRINTF(...) @@ -89,7 +89,7 @@ void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tenso size = GGML_PAD(size, talloc->alignment); if (talloc->offset + size > ggml_backend_buffer_get_size(talloc->buffer)) { - fprintf(stderr, "%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n", + GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n", __func__, tensor->name, size, ggml_backend_buffer_get_size(talloc->buffer) - talloc->offset); GGML_ABORT("not enough space in the buffer"); } @@ -172,7 +172,7 @@ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t siz best_fit_block = alloc->n_free_blocks - 1; } else { // this should never happen - fprintf(stderr, "%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n", + GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n", __func__, size, max_avail); GGML_ABORT("not enough space in the buffer"); } @@ -209,16 +209,16 @@ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t siz } } } - fprintf(stderr, "max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0); + GGML_LOG_DEBUG("max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0); for (int i = 0; i < 1024; i++) { if (alloc->allocated_tensors[i].tensor) { - fprintf(stderr, "%s [%zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name, + GGML_LOG_DEBUG("%s [%zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name, alloc->allocated_tensors[i].offset, alloc->allocated_tensors[i].offset + ggml_nbytes(alloc->allocated_tensors[i].tensor), ggml_nbytes(alloc->allocated_tensors[i].tensor) / 1024.0 / 1024.0); } } - fprintf(stderr, "\n"); + GGML_LOG_DEBUG("\n"); } #endif @@ -348,7 +348,6 @@ struct tensor_alloc { }; struct leaf_alloc { - int buffer_id; struct tensor_alloc leaf; }; @@ -740,7 +739,6 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c for (int i = 0; i < graph->n_leafs; i++) { struct ggml_tensor * leaf = graph->leafs[i]; struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf); - galloc->leaf_allocs[i].buffer_id = hn->buffer_id; if (leaf->view_src || leaf->data) { galloc->leaf_allocs[i].leaf.buffer_id = -1; galloc->leaf_allocs[i].leaf.offset = SIZE_MAX; @@ -768,13 +766,13 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c // even if there are no tensors allocated in this buffer, we still need to allocate it to initialize views if (new_size > cur_size || galloc->buffers[i] == NULL) { #ifndef NDEBUG - fprintf(stderr, "%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0); + GGML_LOG_DEBUG("%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0); #endif ggml_backend_buffer_free(galloc->buffers[i]); galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size); if (galloc->buffers[i] == NULL) { - fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size); + GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size); return false; } ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE); @@ -825,14 +823,14 @@ static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_t static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph * graph) { if (galloc->n_nodes != graph->n_nodes) { #ifndef NDEBUG - fprintf(stderr, "%s: graph has different number of nodes\n", __func__); + GGML_LOG_DEBUG("%s: graph has different number of nodes\n", __func__); #endif return true; } if (galloc->n_leafs != graph->n_leafs) { #ifndef NDEBUG - fprintf(stderr, "%s: graph has different number of leafs\n", __func__); + GGML_LOG_DEBUG("%s: graph has different number of leafs\n", __func__); #endif return true; } @@ -843,7 +841,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph if (!ggml_gallocr_node_needs_realloc(galloc, node, &node_alloc->dst)) { #ifndef NDEBUG - fprintf(stderr, "%s: node %s is not valid\n", __func__, node->name); + GGML_LOG_DEBUG("%s: node %s is not valid\n", __func__, node->name); #endif return true; } @@ -855,7 +853,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph } if (!ggml_gallocr_node_needs_realloc(galloc, src, &node_alloc->src[j])) { #ifndef NDEBUG - fprintf(stderr, "%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name); + GGML_LOG_DEBUG("%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name); #endif return true; } @@ -869,14 +867,14 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph) if (ggml_gallocr_needs_realloc(galloc, graph)) { if (galloc->n_buffers == 1) { #ifndef NDEBUG - fprintf(stderr, "%s: reallocating buffers automatically\n", __func__); + GGML_LOG_DEBUG("%s: reallocating buffers automatically\n", __func__); #endif if (!ggml_gallocr_reserve(galloc, graph)) { return false; } } else { #ifndef NDEBUG - fprintf(stderr, "%s: cannot reallocate multi buffer graph automatically, call reserve\n", __func__); + GGML_LOG_DEBUG("%s: cannot reallocate multi buffer graph automatically, call reserve\n", __func__); #endif return false; } @@ -940,7 +938,7 @@ static bool alloc_tensor_range(struct ggml_context * ctx, ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size); if (buffer == NULL) { #ifndef NDEBUG - fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size); + GGML_LOG_DEBUG("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size); #endif for (size_t i = 0; i < *n_buffers; i++) { ggml_backend_buffer_free((*buffers)[i]); @@ -990,7 +988,7 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte } if (this_size > max_size) { - fprintf(stderr, "%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n", + GGML_LOG_ERROR("%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n", __func__, t->name, ggml_backend_buft_name(buft), this_size, max_size); @@ -1022,7 +1020,7 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte if (n_buffers == 0) { #ifndef NDEBUG - fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__); + GGML_LOG_DEBUG("%s: all tensors in the context are already allocated\n", __func__); #endif return NULL; } diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index 16ccd2595..a3bc79a46 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -379,7 +379,7 @@ void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src)); } else if (!ggml_backend_buffer_copy_tensor(src, dst)) { #ifndef NDEBUG - fprintf(stderr, "%s: warning: slow copy from %s to %s\n", __func__, ggml_backend_buffer_name(src->buffer), ggml_backend_buffer_name(dst->buffer)); + GGML_LOG_DEBUG("%s: warning: slow copy from %s to %s\n", __func__, ggml_backend_buffer_name(src->buffer), ggml_backend_buffer_name(dst->buffer)); #endif size_t nbytes = ggml_nbytes(src); void * data = malloc(nbytes); @@ -546,6 +546,10 @@ void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * na #include "ggml-blas.h" #endif +#ifdef GGML_USE_RPC +#include "ggml-rpc.h" +#endif + struct ggml_backend_registry { std::vector backends; std::vector devices; @@ -563,6 +567,9 @@ struct ggml_backend_registry { #ifdef GGML_USE_BLAS register_backend(ggml_backend_blas_reg()); #endif +#ifdef GGML_USE_RPC + register_backend(ggml_backend_rpc_reg()); +#endif // TODO: sycl, kompute, cann @@ -571,7 +578,7 @@ struct ggml_backend_registry { void register_backend(ggml_backend_reg_t reg) { #ifndef NDEBUG - fprintf(stderr, "%s: registered backend %s (%zu devices)\n", + GGML_LOG_DEBUG("%s: registered backend %s (%zu devices)\n", __func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg)); #endif backends.push_back(reg); @@ -582,7 +589,7 @@ struct ggml_backend_registry { void register_device(ggml_backend_dev_t device) { #ifndef NDEBUG - fprintf(stderr, "%s: registered device %s (%s)\n", __func__, ggml_backend_dev_name(device), ggml_backend_dev_description(device)); + GGML_LOG_DEBUG("%s: registered device %s (%s)\n", __func__, ggml_backend_dev_name(device), ggml_backend_dev_description(device)); #endif devices.push_back(device); } @@ -682,8 +689,6 @@ ggml_backend_t ggml_backend_init_best(void) { // backend CPU -static const size_t TENSOR_ALIGNMENT = 32; // required for mmap as gguf only guarantees 32-byte alignment - static const char * ggml_backend_cpu_buffer_get_name(ggml_backend_buffer_t buffer) { return "CPU"; @@ -702,7 +707,7 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) { } static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) { - free(buffer->context); + ggml_aligned_free(buffer->context, buffer->size); } static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { @@ -770,14 +775,19 @@ static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_ty } static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { - size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned - void * data = malloc(size); // TODO: use GGML_ALIGNED_MALLOC (move to ggml-impl.h) + auto alloc_size = size; + if (alloc_size == 0) { + alloc_size = 1; + } + + void * data = ggml_aligned_malloc(alloc_size); + if (data == NULL) { - fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size); + GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, alloc_size); return NULL; } - return ggml_backend_buffer_init(buft, ggml_backend_cpu_buffer_i, data, size); + return ggml_backend_buffer_init(buft, ggml_backend_cpu_buffer_i, data, alloc_size); } static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { @@ -836,7 +846,7 @@ static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_ void * ptr; int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size); if (result != 0) { - fprintf(stderr, "failed to allocate HBM buffer of size %zu\n", size); + GGML_LOG_ERROR("failed to allocate HBM buffer of size %zu\n", size); return NULL; } @@ -1184,7 +1194,7 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st op->type != GGML_TYPE_IQ1_S && op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float case GGML_OP_MUL_MAT: - return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type; + return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_get_type_traits(op->src[0]->type)->vec_dot_type; case GGML_OP_ROPE_BACK: return op->src[2] == NULL && (op->op_params[2] & 4) == 0; case GGML_OP_IM2COL_BACK: @@ -1459,7 +1469,7 @@ static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, co } #ifndef NDEBUG - fprintf(stderr, "%s: warning: no backend supports op %s with a weight with buffer type %s used in tensor %s, the weight will need to be copied\n", + GGML_LOG_DEBUG("%s: warning: no backend supports op %s with a weight with buffer type %s used in tensor %s, the weight will need to be copied\n", __func__, ggml_op_desc(tensor), ggml_backend_buffer_name(buffer), tensor->name); #endif @@ -1548,13 +1558,13 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str for (int i = 0; i < graph->n_nodes; i++) { if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) { ggml_backend_t split_backend = sched->backends[sched->splits[cur_split].backend_id]; - fprintf(stderr, "\n## SPLIT #%d: %s # %d inputs: ", cur_split, ggml_backend_name(split_backend), + GGML_LOG_DEBUG("\n## SPLIT #%d: %s # %d inputs: ", cur_split, ggml_backend_name(split_backend), sched->splits[cur_split].n_inputs); for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) { - fprintf(stderr, "[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name, + GGML_LOG_DEBUG("[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name, fmt_size(ggml_nbytes(sched->splits[cur_split].inputs[j]))); } - fprintf(stderr, "\n"); + GGML_LOG_DEBUG("\n"); cur_split++; } struct ggml_tensor * node = graph->nodes[i]; @@ -1562,7 +1572,7 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str continue; } ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node); - fprintf(stderr, "node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name, + GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name, fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node)); for (int j = 0; j < GGML_MAX_SRC; j++) { struct ggml_tensor * src = node->src[j]; @@ -1570,10 +1580,10 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str continue; } ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src); - fprintf(stderr, " %20.20s (%5.5s) [%5.5s %8.8s]", src->name, + GGML_LOG_DEBUG(" %20.20s (%5.5s) [%5.5s %8.8s]", src->name, fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src)); } - fprintf(stderr, "\n"); + GGML_LOG_DEBUG("\n"); } } @@ -2087,11 +2097,11 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) { // the re-allocation may cause the split inputs to be moved to a different address ggml_backend_sched_synchronize(sched); #ifndef NDEBUG - fprintf(stderr, "%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed); + GGML_LOG_DEBUG("%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed); #endif ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids); if (!ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) { - fprintf(stderr, "%s: failed to allocate graph\n", __func__); + GGML_LOG_ERROR("%s: failed to allocate graph\n", __func__); return false; } } @@ -2485,7 +2495,7 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s struct ggml_context * ctx_unallocated = ggml_init(params); if (ctx_allocated == NULL || ctx_unallocated == NULL) { - fprintf(stderr, "failed to allocate context for graph copy\n"); + GGML_LOG_ERROR("%s: failed to allocate context for graph copy\n", __func__); ggml_hash_set_free(&hash_set); free(node_copies); free(node_init); @@ -2508,7 +2518,7 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s // allocate nodes ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx_allocated, backend); if (buffer == NULL) { - fprintf(stderr, "failed to allocate buffer for graph copy\n"); + GGML_LOG_ERROR("%s: failed to allocate buffer for graph copy\n", __func__); ggml_hash_set_free(&hash_set); free(node_copies); free(node_init); diff --git a/ggml/src/ggml-blas.cpp b/ggml/src/ggml-blas.cpp index 0c6574de5..7875ec86d 100644 --- a/ggml/src/ggml-blas.cpp +++ b/ggml/src/ggml-blas.cpp @@ -65,8 +65,8 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg // convert src0 to float if (type != GGML_TYPE_F32) { - ggml_type_traits_t type_traits = ggml_internal_get_type_traits(type); - ggml_to_float_t const to_float = type_traits.to_float; + const auto * type_traits = ggml_get_type_traits(type); + ggml_to_float_t const to_float = type_traits->to_float; for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { @@ -297,14 +297,14 @@ ggml_backend_t ggml_backend_blas_init(void) { /* .context = */ ctx, }; -#if !defined(NDEBUG) && defined(OPENBLAS_VERSION) && defined(GGML_USE_OPENMP) +#if defined(OPENBLAS_VERSION) && defined(GGML_USE_OPENMP) if (openblas_get_parallel() != OPENBLAS_OPENMP) { - fprintf(stderr, "%s: warning: ggml is using OpenMP, but OpenBLAS was compiled without OpenMP support\n", __func__); + GGML_LOG_DEBUG("%s: warning: ggml is using OpenMP, but OpenBLAS was compiled without OpenMP support\n", __func__); } #endif -#if !defined(NDEBUG) && defined(BLIS_ENABLE_CBLAS) && defined(GGML_USE_OPENMP) && !defined(BLIS_ENABLE_OPENMP) - fprintf(stderr, "%s: warning: ggml is using OpenMP, but BLIS was compiled without OpenMP support\n", __func__); +#if defined(BLIS_ENABLE_CBLAS) && defined(GGML_USE_OPENMP) && !defined(BLIS_ENABLE_OPENMP) + GGML_LOG_DEBUG("%s: warning: ggml is using OpenMP, but BLIS was compiled without OpenMP support\n", __func__); #endif return backend; @@ -420,19 +420,21 @@ static bool ggml_backend_blas_device_supports_op(ggml_backend_dev_t dev, const s // TODO: find the optimal value const int64_t min_batch = 32; - return (ggml_is_contiguous(src0) && - ggml_is_contiguous(src1) && - src1->type == GGML_TYPE_F32 && - (ne0 >= min_batch && ne1 >= min_batch && ne10 >= min_batch)); + return ggml_is_contiguous(src0) && + ggml_is_contiguous(src1) && + src1->type == GGML_TYPE_F32 && + (ne0 >= min_batch && ne1 >= min_batch && ne10 >= min_batch) && + (src0->type == GGML_TYPE_F32 || ggml_get_type_traits(src0->type)->to_float != NULL); } case GGML_OP_OUT_PROD: - return (op->src[0]->type == GGML_TYPE_F32 && - op->src[1]->type == GGML_TYPE_F32 && - ggml_is_matrix(src0) && - ggml_is_matrix(src1) && - ggml_is_contiguous(src0) && - (ggml_is_contiguous(src1) || ggml_is_transposed(src1))); + return op->src[0]->type == GGML_TYPE_F32 && + op->src[1]->type == GGML_TYPE_F32 && + ggml_is_matrix(src0) && + ggml_is_matrix(src1) && + ggml_is_contiguous(src0) && + (ggml_is_contiguous(src1) || ggml_is_transposed(src1)) && + (src0->type == GGML_TYPE_F32 || ggml_get_type_traits(src0->type)->to_float != NULL); default: return false; diff --git a/ggml/src/ggml-cann.cpp b/ggml/src/ggml-cann.cpp index db5f8f186..ec3c0a688 100644 --- a/ggml/src/ggml-cann.cpp +++ b/ggml/src/ggml-cann.cpp @@ -1148,6 +1148,7 @@ ggml_backend_cann_buffer_type(int32_t device) { for (int32_t i = 0; i < GGML_CANN_MAX_DEVICES; i++) { ggml_backend_cann_buffer_types[i] = { /* .iface = */ ggml_backend_cann_buffer_type_interface, + /* .device = */ nullptr, /* .context = */ new ggml_backend_cann_buffer_type_context{ i, "CANN" + std::to_string(i)}, @@ -1868,7 +1869,7 @@ static ggml_backend_event_t ggml_backend_cann_event_new( ACL_CHECK(aclrtCreateEvent(&event)); return new ggml_backend_event{ - /* .backend = */ backend, + /* .device = */ nullptr, /* .context = */ event, }; } @@ -1895,10 +1896,9 @@ static void ggml_backend_cann_event_free(ggml_backend_event_t event) { * * @param event Pointer to the event structure to be recorded. */ -static void ggml_backend_cann_event_record(ggml_backend_event_t event) { +static void ggml_backend_cann_event_record(ggml_backend_t backend, ggml_backend_event_t event) { ggml_backend_cann_context* cann_ctx = - (ggml_backend_cann_context*)event->backend->context; - + (ggml_backend_cann_context*)backend->context; ACL_CHECK(aclrtRecordEvent((aclrtEvent)event->context, cann_ctx->stream())); } @@ -1916,8 +1916,7 @@ static void ggml_backend_cann_event_wait(ggml_backend_t backend, ggml_backend_event_t event) { ggml_backend_cann_context* cann_ctx = (ggml_backend_cann_context*)backend->context; - - if (ggml_backend_is_cann(event->backend)) { + if (ggml_backend_is_cann(backend)) { ACL_CHECK(aclrtStreamWaitEvent(cann_ctx->stream(), (aclrtEvent)event->context)); } else { diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu index edb61abdf..1338bd458 100644 --- a/ggml/src/ggml-cuda.cu +++ b/ggml/src/ggml-cuda.cu @@ -291,7 +291,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool { return; } } - GGML_LOG_WARN(GGML_CUDA_NAME " buffer pool full, increase MAX_CUDA_BUFFERS\n"); + GGML_LOG_DEBUG(GGML_CUDA_NAME " buffer pool full, increase MAX_CUDA_BUFFERS\n"); ggml_cuda_set_device(device); CUDA_CHECK(cudaFree(ptr)); pool_size -= size; @@ -980,7 +980,7 @@ static void * ggml_cuda_host_malloc(size_t size) { if (err != cudaSuccess) { // clear the error cudaGetLastError(); - GGML_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__, + GGML_LOG_DEBUG("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__, size / 1024.0 / 1024.0, cudaGetErrorString(err)); return nullptr; } @@ -2406,7 +2406,7 @@ static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_ if (cuda_ctx_src->device != buf_ctx_src->device || cuda_ctx_dst->device != buf_ctx_dst->device) { #ifndef NDEBUG - GGML_LOG_WARN("%s: backend and buffer devices do not match\n", __func__); + GGML_LOG_DEBUG("%s: backend and buffer devices do not match\n", __func__); #endif return false; } @@ -2524,7 +2524,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, if (ggml_cuda_info().devices[cuda_ctx->device].cc < CC_AMPERE) { cuda_ctx->cuda_graph->disable_due_to_gpu_arch = true; #ifndef NDEBUG - GGML_LOG_WARN("%s: disabling CUDA graphs due to GPU architecture\n", __func__); + GGML_LOG_DEBUG("%s: disabling CUDA graphs due to GPU architecture\n", __func__); #endif } } @@ -2575,14 +2575,14 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, if (node->src[0] && node->src[0]->buffer && ggml_backend_buffer_is_cuda_split(node->src[0]->buffer)) { use_cuda_graph = false; // Split buffers are not supported by CUDA graph capture #ifndef NDEBUG - GGML_LOG_WARN("%s: disabling CUDA graphs due to split buffer\n", __func__); + GGML_LOG_DEBUG("%s: disabling CUDA graphs due to split buffer\n", __func__); #endif } if (node->op == GGML_OP_MUL_MAT_ID) { use_cuda_graph = false; // This node type is not supported by CUDA graph capture #ifndef NDEBUG - GGML_LOG_WARN("%s: disabling CUDA graphs due to mul_mat_id\n", __func__); + GGML_LOG_DEBUG("%s: disabling CUDA graphs due to mul_mat_id\n", __func__); #endif } @@ -2591,7 +2591,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, // Changes in batch size or context size can cause changes to the grid size of some kernels. use_cuda_graph = false; #ifndef NDEBUG - GGML_LOG_WARN("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]); + GGML_LOG_DEBUG("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]); #endif } @@ -2603,7 +2603,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, if (!ptr) { use_cuda_graph = false; #ifndef NDEBUG - GGML_LOG_WARN("%s: disabling CUDA graphs due to unsupported copy op\n", __func__); + GGML_LOG_DEBUG("%s: disabling CUDA graphs due to unsupported copy op\n", __func__); #endif } else { if (std::find(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), ptr) == ggml_cuda_cpy_fn_ptrs.end()) { @@ -2627,7 +2627,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, if (cuda_ctx->cuda_graph->number_consecutive_updates >= 4) { cuda_ctx->cuda_graph->disable_due_to_too_many_updates = true; #ifndef NDEBUG - GGML_LOG_WARN("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__); + GGML_LOG_DEBUG("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__); #endif } } @@ -2685,7 +2685,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, use_cuda_graph = false; cuda_ctx->cuda_graph->disable_due_to_failed_graph_capture = true; #ifndef NDEBUG - GGML_LOG_WARN("%s: disabling CUDA graphs due to failed graph capture\n", __func__); + GGML_LOG_DEBUG("%s: disabling CUDA graphs due to failed graph capture\n", __func__); #endif } else { graph_evaluated_or_captured = true; // CUDA graph has been captured @@ -2854,7 +2854,7 @@ bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size) { // clear the error cudaGetLastError(); - GGML_LOG_WARN("%s: failed to register %.2f MiB of pinned memory: %s\n", __func__, + GGML_LOG_DEBUG("%s: failed to register %.2f MiB of pinned memory: %s\n", __func__, size / 1024.0 / 1024.0, cudaGetErrorString(err)); return false; } diff --git a/ggml/src/ggml-cuda/dmmv.cu b/ggml/src/ggml-cuda/dmmv.cu index 96a5adef5..00e21b5d7 100644 --- a/ggml/src/ggml-cuda/dmmv.cu +++ b/ggml/src/ggml-cuda/dmmv.cu @@ -416,10 +416,11 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, static __device__ void convert_f16(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){ const half * x = (const half *) vx; - + // load 2 halfs into register in a single instruction + const half2 x_reg = *((half2 *) &(x[ib + iqs])); // automatic half -> float type cast if dfloat == float - v.x = x[ib + iqs + 0]; - v.y = x[ib + iqs + 1]; + v.x = __low2float(x_reg); + v.y = __high2float(x_reg); } static constexpr __device__ dequantize_kernel_t get_dequantize_kernel(ggml_type type) { @@ -476,13 +477,28 @@ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, cons // matrix multiplication // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2 #ifdef GGML_CUDA_F16 - tmp += __hmul2(v, { - y[iybs + iqs + j/qr + 0], - y[iybs + iqs + j/qr + y_offset] - }); + if ( y_offset == 1 ) { + // load 2 dfloats into register in a single instruction + const dfloat2 y_reg = *((dfloat2 *) &(y[iybs + iqs + j/qr])); + tmp += __hmul2(v, y_reg); + } + else { + tmp += __hmul2(v, { + y[iybs + iqs + j/qr + 0], + y[iybs + iqs + j/qr + y_offset] + }); + } #else - tmp += v.x * y[iybs + iqs + j/qr + 0]; - tmp += v.y * y[iybs + iqs + j/qr + y_offset]; + if ( y_offset == 1 ) { + // load 2 dfloats into register in a single instruction + const dfloat2 y_reg = *((dfloat2 *) &(y[iybs + iqs + j/qr])); + tmp += v.x * y_reg.x; + tmp += v.y * y_reg.y; + } + else { + tmp += v.x * y[iybs + iqs + j/qr + 0]; + tmp += v.y * y[iybs + iqs + j/qr + y_offset]; + } #endif // GGML_CUDA_F16 } } diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h index d3f4bad8c..65c4f8119 100644 --- a/ggml/src/ggml-impl.h +++ b/ggml/src/ggml-impl.h @@ -19,6 +19,9 @@ extern "C" { #define MIN(a, b) ((a) < (b) ? (a) : (b)) #define MAX(a, b) ((a) > (b) ? (a) : (b)) +// required for mmap as gguf only guarantees 32-byte alignment +#define TENSOR_ALIGNMENT 32 + // static_assert should be a #define, but if it's not, // fall back to the _Static_assert C11 keyword. // if C99 - static_assert is noop @@ -196,6 +199,11 @@ struct ggml_cgraph { struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1); +// Memory allocation + +void * ggml_aligned_malloc(size_t size); +void ggml_aligned_free(void * ptr, size_t size); + #ifdef __cplusplus } #endif diff --git a/ggml/src/ggml-rpc.cpp b/ggml/src/ggml-rpc.cpp index ab7298cba..13c7dd436 100644 --- a/ggml/src/ggml-rpc.cpp +++ b/ggml/src/ggml-rpc.cpp @@ -25,7 +25,7 @@ # include # include #endif -#include +#include #define UNUSED GGML_UNUSED @@ -630,22 +630,6 @@ static enum ggml_status ggml_backend_rpc_graph_compute(ggml_backend_t backend, g return (enum ggml_status)output[0]; } -static bool ggml_backend_rpc_supports_op(ggml_backend_t backend, const ggml_tensor * op) { - UNUSED(backend); - UNUSED(op); - //TODO: call the remote backend and cache the results - return true; -} - -static bool ggml_backend_rpc_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) { - if (!buft || buft->iface.get_name != ggml_backend_rpc_buffer_type_name) { - return false; - } - ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context; - ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context; - return buft_ctx->endpoint == rpc_ctx->endpoint; -} - static ggml_backend_i ggml_backend_rpc_interface = { /* .get_name = */ ggml_backend_rpc_name, /* .free = */ ggml_backend_rpc_free, @@ -659,8 +643,8 @@ static ggml_backend_i ggml_backend_rpc_interface = { /* .graph_plan_update = */ NULL, /* .graph_plan_compute = */ NULL, /* .graph_compute = */ ggml_backend_rpc_graph_compute, - /* .supports_op = */ ggml_backend_rpc_supports_op, - /* .supports_buft = */ ggml_backend_rpc_supports_buft, + /* .supports_op = */ NULL, + /* .supports_buft = */ NULL, /* .offload_op = */ NULL, /* .event_record = */ NULL, /* .event_wait = */ NULL, @@ -691,7 +675,7 @@ GGML_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * en ggml_backend_buffer_type_t buft = new ggml_backend_buffer_type { /* .iface = */ ggml_backend_rpc_buffer_type_interface, - /* .device = */ nullptr, + /* .device = */ ggml_backend_rpc_add_device(endpoint), /* .context = */ buft_ctx }; buft_map[endpoint] = buft; @@ -707,7 +691,7 @@ ggml_backend_t ggml_backend_rpc_init(const char * endpoint) { ggml_backend_t backend = new ggml_backend { /* .guid = */ ggml_backend_rpc_guid(), /* .interface = */ ggml_backend_rpc_interface, - /* .device = */ nullptr, + /* .device = */ ggml_backend_rpc_add_device(endpoint), /* .context = */ ctx }; return backend; @@ -1189,7 +1173,7 @@ static void rpc_serve_client(ggml_backend_t backend, sockfd_t sockfd, size_t fre } } -void start_rpc_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem) { +void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem) { std::string host; int port; if (!parse_endpoint(endpoint, host, port)) { @@ -1226,3 +1210,179 @@ void start_rpc_server(ggml_backend_t backend, const char * endpoint, size_t free WSACleanup(); #endif } + +// device interface + +struct ggml_backend_rpc_device_context { + std::string endpoint; + std::string name; +}; + +static const char * ggml_backend_rpc_device_get_name(ggml_backend_dev_t dev) { + ggml_backend_rpc_device_context * ctx = (ggml_backend_rpc_device_context *)dev->context; + + return ctx->name.c_str(); +} + +static const char * ggml_backend_rpc_device_get_description(ggml_backend_dev_t dev) { + ggml_backend_rpc_device_context * ctx = (ggml_backend_rpc_device_context *)dev->context; + + return ctx->name.c_str(); +} + +static void ggml_backend_rpc_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { + ggml_backend_rpc_device_context * ctx = (ggml_backend_rpc_device_context *)dev->context; + + ggml_backend_rpc_get_device_memory(ctx->endpoint.c_str(), free, total); + + UNUSED(dev); +} + +static enum ggml_backend_dev_type ggml_backend_rpc_device_get_type(ggml_backend_dev_t dev) { + // TODO: obtain value from the server + return GGML_BACKEND_DEVICE_TYPE_GPU_FULL; + + UNUSED(dev); +} + +static void ggml_backend_rpc_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) { + props->name = ggml_backend_rpc_device_get_name(dev); + props->description = ggml_backend_rpc_device_get_description(dev); + props->type = ggml_backend_rpc_device_get_type(dev); + ggml_backend_rpc_device_get_memory(dev, &props->memory_free, &props->memory_total); + props->caps = { + /* .async = */ false, + /* .host_buffer = */ false, + /* .buffer_from_host_ptr = */ false, + /* .events = */ false, + }; +} + +static ggml_backend_t ggml_backend_rpc_device_init(ggml_backend_dev_t dev, const char * params) { + ggml_backend_rpc_device_context * ctx = (ggml_backend_rpc_device_context *)dev->context; + + return ggml_backend_rpc_init(ctx->endpoint.c_str()); + + UNUSED(params); +} + +static ggml_backend_buffer_type_t ggml_backend_rpc_device_get_buffer_type(ggml_backend_dev_t dev) { + ggml_backend_rpc_device_context * ctx = (ggml_backend_rpc_device_context *)dev->context; + + return ggml_backend_rpc_buffer_type(ctx->endpoint.c_str()); + + UNUSED(dev); +} + +static ggml_backend_buffer_t ggml_backend_rpc_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) { + return ggml_backend_cpu_buffer_from_ptr(ptr, size); + + UNUSED(dev); + UNUSED(max_tensor_size); +} + +static bool ggml_backend_rpc_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) { + UNUSED(dev); + UNUSED(op); + //TODO: call the remote backend and cache the results + return true; +} + +static bool ggml_backend_rpc_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { + if (!buft || buft->iface.get_name != ggml_backend_rpc_buffer_type_name) { + return false; + } + ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context; + ggml_backend_rpc_device_context * dev_ctx = (ggml_backend_rpc_device_context *)dev->context; + return buft_ctx->endpoint == dev_ctx->endpoint; +} + +static const struct ggml_backend_device_i ggml_backend_rpc_device_i = { + /* .get_name = */ ggml_backend_rpc_device_get_name, + /* .get_description = */ ggml_backend_rpc_device_get_description, + /* .get_memory = */ ggml_backend_rpc_device_get_memory, + /* .get_type = */ ggml_backend_rpc_device_get_type, + /* .get_props = */ ggml_backend_rpc_device_get_props, + /* .init_backend = */ ggml_backend_rpc_device_init, + /* .get_buffer_type = */ ggml_backend_rpc_device_get_buffer_type, + /* .get_host_buffer_type = */ NULL, + /* .buffer_from_host_ptr = */ ggml_backend_rpc_device_buffer_from_ptr, + /* .supports_op = */ ggml_backend_rpc_device_supports_op, + /* .supports_buft = */ ggml_backend_rpc_device_supports_buft, + /* .offload_op = */ NULL, + /* .event_new = */ NULL, + /* .event_free = */ NULL, + /* .event_synchronize = */ NULL, +}; + +// backend reg interface + +static const char * ggml_backend_rpc_reg_get_name(ggml_backend_reg_t reg) { + return "RPC"; + + UNUSED(reg); +} + +static size_t ggml_backend_rpc_reg_get_device_count(ggml_backend_reg_t reg) { + return 0; + + UNUSED(reg); +} + +static ggml_backend_dev_t ggml_backend_rpc_reg_get_device(ggml_backend_reg_t reg, size_t index) { + GGML_ABORT("The RPC backend does not have enumerated devices - use ggml_backend_add_device instead"); + + UNUSED(reg); + UNUSED(index); +} + +static void * ggml_backend_rpc_get_proc_address(ggml_backend_reg_t reg, const char * name) { + if (std::strcmp(name, "ggml_backend_rpc_add_device") == 0) { + return (void *)ggml_backend_rpc_add_device; + } + return NULL; + + UNUSED(reg); +} + +static const struct ggml_backend_reg_i ggml_backend_rpc_reg_i = { + /* .get_name = */ ggml_backend_rpc_reg_get_name, + /* .get_device_count = */ ggml_backend_rpc_reg_get_device_count, + /* .get_device = */ ggml_backend_rpc_reg_get_device, + /* .get_proc_address = */ ggml_backend_rpc_get_proc_address, +}; + +ggml_backend_reg_t ggml_backend_rpc_reg(void) { + static struct ggml_backend_reg ggml_backend_rpc_reg = { + /* .iface = */ ggml_backend_rpc_reg_i, + /* .context = */ NULL, + }; + + return &ggml_backend_rpc_reg; +} + +ggml_backend_dev_t ggml_backend_rpc_add_device(const char * endpoint) { + static std::unordered_map dev_map; + + static std::mutex mutex; + std::lock_guard lock(mutex); + + if (dev_map.find(endpoint) != dev_map.end()) { + return dev_map[endpoint]; + } + + ggml_backend_rpc_device_context * ctx = new ggml_backend_rpc_device_context { + /* .endpoint = */ endpoint, + /* .name = */ "RPC[" + std::string(endpoint) + "]", + }; + + ggml_backend_dev_t dev = new ggml_backend_device { + /* .iface = */ ggml_backend_rpc_device_i, + /* .reg = */ ggml_backend_rpc_reg(), + /* .context = */ ctx, + }; + + dev_map[endpoint] = dev; + + return dev; +} diff --git a/ggml/src/ggml-vulkan.cpp b/ggml/src/ggml-vulkan.cpp index 4e5e86b29..e749bbe70 100644 --- a/ggml/src/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan.cpp @@ -5287,9 +5287,9 @@ static void ggml_vk_dequantize_data(const void * from, float * to, size_t ne, gg return; } - ggml_type_traits_t tt = ggml_internal_get_type_traits(quant); + const auto * tt = ggml_get_type_traits(quant); - ggml_to_float_t dequant_fn = tt.to_float; + ggml_to_float_t dequant_fn = tt->to_float; dequant_fn(from, to, ne); } diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 03b832d0f..779b38d12 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -35,10 +35,6 @@ #include #endif -#ifdef GGML_USE_METAL -#include -#endif - #if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8) #undef GGML_USE_LLAMAFILE #endif @@ -189,6 +185,8 @@ typedef pthread_t ggml_thread_t; #endif #if defined(__APPLE__) +#include +#include #include #endif @@ -386,22 +384,40 @@ void ggml_log_callback_default(enum ggml_log_level level, const char * text, voi //#define GGML_SOFT_MAX_ACCELERATE #endif + +void * ggml_aligned_malloc(size_t size) { #if defined(_MSC_VER) || defined(__MINGW32__) -#define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN) -#define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr) + return _aligned_malloc(size, TENSOR_ALIGNMENT); #else -inline static void * ggml_aligned_malloc(size_t size) { if (size == 0) { GGML_LOG_WARN("Behavior may be unexpected when allocating 0 bytes for ggml_aligned_malloc!\n"); return NULL; } void * aligned_memory = NULL; #ifdef GGML_USE_CPU_HBM - int result = hbw_posix_memalign(&aligned_memory, 16, size); + int result = hbw_posix_memalign(&aligned_memory, TENSOR_ALIGNMENT, size); +#elif TARGET_OS_OSX + kern_return_t alloc_status = vm_allocate((vm_map_t) mach_task_self(), (vm_address_t *) &aligned_memory, size, VM_FLAGS_ANYWHERE); + int result = EFAULT; + switch (alloc_status) { + case KERN_SUCCESS: + result = 0; + break; + case KERN_INVALID_ADDRESS: + result = EINVAL; + break; + case KERN_NO_SPACE: + result = ENOMEM; + break; + default: + result = EFAULT; + break; + } #elif GGML_USE_METAL - int result = posix_memalign(&aligned_memory, sysconf(_SC_PAGESIZE), size); + const long page_size = sysconf(_SC_PAGESIZE); + int result = posix_memalign(&aligned_memory, MAX(TENSOR_ALIGNMENT, page_size), size); #else - int result = posix_memalign(&aligned_memory, GGML_MEM_ALIGN, size); + int result = posix_memalign(&aligned_memory, TENSOR_ALIGNMENT, size); #endif if (result != 0) { // Handle allocation failure @@ -419,14 +435,26 @@ inline static void * ggml_aligned_malloc(size_t size) { return NULL; } return aligned_memory; +#endif } -#define GGML_ALIGNED_MALLOC(size) ggml_aligned_malloc(size) -#ifdef GGML_USE_CPU_HBM -#define GGML_ALIGNED_FREE(ptr) if(NULL != ptr) hbw_free(ptr) + +void ggml_aligned_free(void * ptr, size_t size) { + GGML_UNUSED(size); +#if defined(_MSC_VER) || defined(__MINGW32__) + _aligned_free(ptr); +#elif GGML_USE_CPU_HBM + if (ptr != NULL) { + hbw_free(ptr); + } +#elif TARGET_OS_OSX + if (ptr != NULL) { + vm_deallocate((vm_map_t)mach_task_self(), (vm_address_t)ptr, size); + } #else -#define GGML_ALIGNED_FREE(ptr) free(ptr) -#endif + free(ptr); #endif +} + inline static void * ggml_malloc(size_t size) { if (size == 0) { @@ -729,7 +757,7 @@ static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc); static void ggml_vec_dot_bf16(int n, float * restrict s, size_t bs, ggml_bf16_t * restrict x, size_t bx, ggml_bf16_t * restrict y, size_t by, int nrc); -static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { +static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { [GGML_TYPE_I8] = { .type_name = "i8", .blck_size = 1, @@ -1151,9 +1179,9 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { }; // For internal test use -ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) { +const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type) { GGML_ASSERT(type < GGML_TYPE_COUNT); - return type_traits[type]; + return &type_traits[type]; } // @@ -3869,7 +3897,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { *ctx = (struct ggml_context) { /*.mem_size =*/ mem_size, - /*.mem_buffer =*/ params.mem_buffer ? params.mem_buffer : GGML_ALIGNED_MALLOC(mem_size), + /*.mem_buffer =*/ params.mem_buffer ? params.mem_buffer : ggml_aligned_malloc(mem_size), /*.mem_buffer_owned =*/ params.mem_buffer ? false : true, /*.no_alloc =*/ params.no_alloc, /*.no_alloc_save =*/ params.no_alloc, @@ -3909,7 +3937,7 @@ void ggml_free(struct ggml_context * ctx) { __func__, i, ggml_used_mem(ctx)); if (ctx->mem_buffer_owned) { - GGML_ALIGNED_FREE(ctx->mem_buffer); + ggml_aligned_free(ctx->mem_buffer, ctx->mem_size); } found = true; @@ -19608,9 +19636,10 @@ static void ggml_thread_cpumask_next(const bool * global_mask, bool * local_mask void ggml_threadpool_free(struct ggml_threadpool* threadpool) { if (!threadpool) return; + const int n_threads = threadpool->n_threads_max; + #ifndef GGML_USE_OPENMP struct ggml_compute_state* workers = threadpool->workers; - const int n_threads = threadpool->n_threads_max; ggml_mutex_lock(&threadpool->mutex); @@ -19630,8 +19659,9 @@ void ggml_threadpool_free(struct ggml_threadpool* threadpool) { ggml_cond_destroy(&threadpool->cond); #endif // GGML_USE_OPENMP - GGML_ALIGNED_FREE(threadpool->workers); - GGML_ALIGNED_FREE(threadpool); + const size_t workers_size = sizeof(struct ggml_compute_state) * n_threads; + ggml_aligned_free(threadpool->workers, workers_size); + ggml_aligned_free(threadpool, sizeof(struct ggml_threadpool)); } #ifndef GGML_USE_OPENMP @@ -20063,7 +20093,7 @@ static struct ggml_threadpool * ggml_threadpool_new_impl( struct ggml_cplan * cplan) { struct ggml_threadpool * threadpool = - GGML_ALIGNED_MALLOC(sizeof(struct ggml_threadpool)); + ggml_aligned_malloc(sizeof(struct ggml_threadpool)); { threadpool->cgraph = cgraph; threadpool->cplan = cplan; @@ -20084,7 +20114,7 @@ static struct ggml_threadpool * ggml_threadpool_new_impl( // Allocate and init workers state const size_t workers_size = sizeof(struct ggml_compute_state) * tpp->n_threads; - struct ggml_compute_state * workers = GGML_ALIGNED_MALLOC(workers_size); + struct ggml_compute_state * workers = ggml_aligned_malloc(workers_size); memset(workers, 0, workers_size); for (int j = 0; j < tpp->n_threads; j++) { diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index e08617ba2..7ab08b036 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -152,6 +152,8 @@ class Keys: MERGES = "tokenizer.ggml.merges" BOS_ID = "tokenizer.ggml.bos_token_id" EOS_ID = "tokenizer.ggml.eos_token_id" + EOT_ID = "tokenizer.ggml.eot_token_id" + EOM_ID = "tokenizer.ggml.eom_token_id" UNK_ID = "tokenizer.ggml.unknown_token_id" SEP_ID = "tokenizer.ggml.seperator_token_id" PAD_ID = "tokenizer.ggml.padding_token_id" @@ -168,11 +170,16 @@ class Keys: CHAT_TEMPLATE_N = "tokenizer.chat_template.{name}" CHAT_TEMPLATES = "tokenizer.chat_templates" # FIM/Infill special tokens constants + FIM_PRE_ID = "tokenizer.ggml.fim_pre_token_id" + FIM_SUF_ID = "tokenizer.ggml.fim_suf_token_id" + FIM_MID_ID = "tokenizer.ggml.fim_mid_token_id" + FIM_PAD_ID = "tokenizer.ggml.fim_pad_token_id" + FIM_REP_ID = "tokenizer.ggml.fim_rep_token_id" + FIM_SEP_ID = "tokenizer.ggml.fim_sep_token_id" + # deprecated: PREFIX_ID = "tokenizer.ggml.prefix_token_id" SUFFIX_ID = "tokenizer.ggml.suffix_token_id" MIDDLE_ID = "tokenizer.ggml.middle_token_id" - EOT_ID = "tokenizer.ggml.eot_token_id" - EOM_ID = "tokenizer.ggml.eom_token_id" class Adapter: TYPE = "adapter.type" @@ -1579,6 +1586,8 @@ KEY_TOKENIZER_SCORES = Keys.Tokenizer.SCORES KEY_TOKENIZER_MERGES = Keys.Tokenizer.MERGES KEY_TOKENIZER_BOS_ID = Keys.Tokenizer.BOS_ID KEY_TOKENIZER_EOS_ID = Keys.Tokenizer.EOS_ID +KEY_TOKENIZER_EOT_ID = Keys.Tokenizer.EOT_ID +KEY_TOKENIZER_EOM_ID = Keys.Tokenizer.EOM_ID KEY_TOKENIZER_UNK_ID = Keys.Tokenizer.UNK_ID KEY_TOKENIZER_SEP_ID = Keys.Tokenizer.SEP_ID KEY_TOKENIZER_PAD_ID = Keys.Tokenizer.PAD_ID @@ -1586,8 +1595,15 @@ KEY_TOKENIZER_CLS_ID = Keys.Tokenizer.CLS_ID KEY_TOKENIZER_MASK_ID = Keys.Tokenizer.MASK_ID KEY_TOKENIZER_HF_JSON = Keys.Tokenizer.HF_JSON KEY_TOKENIZER_RWKV = Keys.Tokenizer.RWKV -KEY_TOKENIZER_PRIFIX_ID = Keys.Tokenizer.PREFIX_ID + +KEY_TOKENIZER_FIM_PRE_ID = Keys.Tokenizer.FIM_PRE_ID +KEY_TOKENIZER_FIM_SUF_ID = Keys.Tokenizer.FIM_SUF_ID +KEY_TOKENIZER_FIM_MID_ID = Keys.Tokenizer.FIM_MID_ID +KEY_TOKENIZER_FIM_PAD_ID = Keys.Tokenizer.FIM_PAD_ID +KEY_TOKENIZER_FIM_REP_ID = Keys.Tokenizer.FIM_REP_ID +KEY_TOKENIZER_FIM_SEP_ID = Keys.Tokenizer.FIM_SEP_ID + +# deprecated +KEY_TOKENIZER_PREFIX_ID = Keys.Tokenizer.PREFIX_ID KEY_TOKENIZER_SUFFIX_ID = Keys.Tokenizer.SUFFIX_ID KEY_TOKENIZER_MIDDLE_ID = Keys.Tokenizer.MIDDLE_ID -KEY_TOKENIZER_EOT_ID = Keys.Tokenizer.EOT_ID -KEY_TOKENIZER_EOM_ID = Keys.Tokenizer.EOM_ID diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 5c460ef1b..0d8d8a0b0 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -843,15 +843,6 @@ class GGUFWriter: self.add_string(Keys.Tokenizer.CHAT_TEMPLATE, value) - def add_prefix_token_id(self, id: int) -> None: - self.add_uint32(Keys.Tokenizer.PREFIX_ID, id) - - def add_suffix_token_id(self, id: int) -> None: - self.add_uint32(Keys.Tokenizer.SUFFIX_ID, id) - - def add_middle_token_id(self, id: int) -> None: - self.add_uint32(Keys.Tokenizer.MIDDLE_ID, id) - def add_eot_token_id(self, id: int) -> None: self.add_uint32(Keys.Tokenizer.EOT_ID, id) diff --git a/include/llama.h b/include/llama.h index 7cae1bbe2..02bc7f087 100644 --- a/include/llama.h +++ b/include/llama.h @@ -433,6 +433,7 @@ extern "C" { LLAMA_API bool llama_supports_mmap (void); LLAMA_API bool llama_supports_mlock (void); LLAMA_API bool llama_supports_gpu_offload(void); + LLAMA_API bool llama_supports_rpc (void); LLAMA_API uint32_t llama_n_ctx (const struct llama_context * ctx); LLAMA_API uint32_t llama_n_batch (const struct llama_context * ctx); @@ -896,6 +897,7 @@ extern "C" { // Special tokens LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence + LLAMA_API llama_token llama_token_eot(const struct llama_model * model); // end-of-turn LLAMA_API llama_token llama_token_cls(const struct llama_model * model); // classification LLAMA_API llama_token llama_token_sep(const struct llama_model * model); // sentence separator LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line @@ -904,11 +906,17 @@ extern "C" { LLAMA_API bool llama_add_bos_token(const struct llama_model * model); LLAMA_API bool llama_add_eos_token(const struct llama_model * model); - // Codellama infill tokens - LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix - LLAMA_API llama_token llama_token_middle(const struct llama_model * model); // Beginning of infill middle - LLAMA_API llama_token llama_token_suffix(const struct llama_model * model); // Beginning of infill suffix - LLAMA_API llama_token llama_token_eot (const struct llama_model * model); // End of infill middle + // infill tokens + DEPRECATED(LLAMA_API llama_token llama_token_prefix(const struct llama_model * model), "use llama_token_fim_pre instead"); + DEPRECATED(LLAMA_API llama_token llama_token_middle(const struct llama_model * model), "use llama_token_fim_mid instead"); + DEPRECATED(LLAMA_API llama_token llama_token_suffix(const struct llama_model * model), "use llama_token_fim_suf instead"); + + LLAMA_API llama_token llama_token_fim_pre(const struct llama_model * model); + LLAMA_API llama_token llama_token_fim_suf(const struct llama_model * model); + LLAMA_API llama_token llama_token_fim_mid(const struct llama_model * model); + LLAMA_API llama_token llama_token_fim_pad(const struct llama_model * model); + LLAMA_API llama_token llama_token_fim_rep(const struct llama_model * model); + LLAMA_API llama_token llama_token_fim_sep(const struct llama_model * model); // // Tokenization @@ -945,6 +953,12 @@ extern "C" { int32_t lstrip, bool special); + // check if token0 is contained as a prefix in token1 + LLAMA_API bool llama_token_is_prefix( + const struct llama_model * model, + llama_token token0, + llama_token token1); + /// @details Convert the provided tokens into text (inverse of llama_tokenize()). /// @param text The char pointer must be large enough to hold the resulting text. /// @return Returns the number of chars/bytes on success, no more than text_len_max. @@ -1093,6 +1107,9 @@ extern "C" { /// @details Dynamic temperature implementation (a.k.a. entropy) described in the paper https://arxiv.org/abs/2309.02772. LLAMA_API struct llama_sampler * llama_sampler_init_temp_ext (float t, float delta, float exponent); + /// @details XTC sampler as described in https://github.com/oobabooga/text-generation-webui/pull/6335 + LLAMA_API struct llama_sampler * llama_sampler_init_xtc (float p, float t, size_t min_keep, uint32_t seed); + /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. @@ -1137,6 +1154,28 @@ extern "C" { int32_t n_logit_bias, const llama_logit_bias * logit_bias); + // this sampler is meant to be used for fill-in-the-middle infilling + // it's supposed to be used after top_k + top_p sampling + // + // 1. if the sum of the EOG probs times the number of candidates is higher than the sum of the other probs -> pick EOG + // 2. combine probs of tokens that have the same prefix + // + // example: + // + // - before: + // "hel": 0.5 + // "hell": 0.2 + // "hello": 0.1 + // "dummy": 0.1 + // + // - after: + // "hel": 0.8 + // "dummy": 0.1 + // + // 3. discard non-EOG tokens with low prob + // 4. if no tokens are left -> pick EOT + // + LLAMA_API struct llama_sampler * llama_sampler_init_infill(const struct llama_model * model); // Returns the seed used by the sampler if applicable, LLAMA_DEFAULT_SEED otherwise LLAMA_API uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl); diff --git a/pocs/vdot/q8dot.cpp b/pocs/vdot/q8dot.cpp index 1a52ff5e9..131d7c177 100644 --- a/pocs/vdot/q8dot.cpp +++ b/pocs/vdot/q8dot.cpp @@ -136,7 +136,7 @@ int main(int argc, char** argv) { auto ggml_type = type == 0 ? GGML_TYPE_Q4_0 : GGML_TYPE_Q4_1; - auto funcs = ggml_internal_get_type_traits(ggml_type); + const auto * funcs = ggml_get_type_traits(ggml_type); Stat simple, ggml; @@ -156,8 +156,8 @@ int main(int argc, char** argv) { t1 = std::chrono::high_resolution_clock::now(); float fs; - if (type == 0) funcs.vec_dot(kVecSize * QK4_1, &fs, 0, x40.data(), 0, y.data(), 0, 1); - else funcs.vec_dot(kVecSize * QK4_1, &fs, 0, x41.data(), 0, y.data(), 0, 1); + if (type == 0) funcs->vec_dot(kVecSize * QK4_1, &fs, 0, x40.data(), 0, y.data(), 0, 1); + else funcs->vec_dot(kVecSize * QK4_1, &fs, 0, x41.data(), 0, y.data(), 0, 1); t2 = std::chrono::high_resolution_clock::now(); t = 1e-3*std::chrono::duration_cast(t2-t1).count(); if (iloop > 3) ggml.addResult(fs, t); diff --git a/pocs/vdot/vdot.cpp b/pocs/vdot/vdot.cpp index 17e9e4482..88e66ea13 100644 --- a/pocs/vdot/vdot.cpp +++ b/pocs/vdot/vdot.cpp @@ -236,7 +236,7 @@ int main(int argc, char** argv) { int n4 = useQ4_1 ? kVecSize / QK4_1 : kVecSize / QK4_0; n4 = 64*((n4 + 63)/64); int n8 = kVecSize / QK8_0; n8 = 64*((n8 + 63)/64); - auto funcs = useQ4_1 ? ggml_internal_get_type_traits(GGML_TYPE_Q4_1) : ggml_internal_get_type_traits(GGML_TYPE_Q4_0); + const auto * funcs = useQ4_1 ? ggml_get_type_traits(GGML_TYPE_Q4_1) : ggml_get_type_traits(GGML_TYPE_Q4_0); std::vector q40; std::vector q41; @@ -261,9 +261,9 @@ int main(int argc, char** argv) { // Note, we do not include this in the timing as in practical application // we already have the quantized model weights. if (useQ4_1) { - funcs.from_float(x1.data(), q41.data(), kVecSize); + funcs->from_float(x1.data(), q41.data(), kVecSize); } else { - funcs.from_float(x1.data(), q40.data(), kVecSize); + funcs->from_float(x1.data(), q40.data(), kVecSize); } // Now measure time the dot product needs using the "scalar" version above @@ -282,10 +282,10 @@ int main(int argc, char** argv) { dot_q4_q8(kVecSize, &result, q40.data(), q8.data()); } else { - auto vdot = ggml_internal_get_type_traits(funcs.vec_dot_type); - vdot.from_float(y1.data(), q8.data(), kVecSize); - if (useQ4_1) funcs.vec_dot(kVecSize, &result, 0, q41.data(), 0, q8.data(), 0, 1); - else funcs.vec_dot(kVecSize, &result, 0, q40.data(), 0, q8.data(), 0, 1); + const auto * vdot = ggml_get_type_traits(funcs->vec_dot_type); + vdot->from_float(y1.data(), q8.data(), kVecSize); + if (useQ4_1) funcs->vec_dot(kVecSize, &result, 0, q41.data(), 0, q8.data(), 0, 1); + else funcs->vec_dot(kVecSize, &result, 0, q40.data(), 0, q8.data(), 0, 1); } sumq += result; t2 = std::chrono::high_resolution_clock::now(); diff --git a/scripts/debug-test.sh b/scripts/debug-test.sh index 91946c514..c6c1e988a 100755 --- a/scripts/debug-test.sh +++ b/scripts/debug-test.sh @@ -110,7 +110,7 @@ rm -rf "$build_dir" && mkdir "$build_dir" || abort "Failed to make $build_dir" ########################################################### # Note: test-eval-callback requires -DLLAMA_CURL -cmake -B "./$build_dir" -DCMAKE_BUILD_TYPE=Debug -DGGML_CUDA=1 -DLLAMA_CURL=1 || abort "Failed to build enviroment" +cmake -B "./$build_dir" -DCMAKE_BUILD_TYPE=Debug -DGGML_CUDA=1 -DLLAMA_CURL=1 || abort "Failed to build environment" pushd "$build_dir" make -j || abort "Failed to compile" popd > /dev/null || exit 1 @@ -127,7 +127,7 @@ printf "\n\nGathering tests that fit REGEX: ${test_suite} ...\n" pushd "$build_dir" tests=($(ctest -R ${test_suite} -V -N | grep -E " +Test +#[0-9]+*" | cut -d':' -f2 | awk '{$1=$1};1')) if [ ${#tests[@]} -eq 0 ]; then - abort "No tests avaliable... check your compliation process..." + abort "No tests available... check your compilation process..." fi popd > /dev/null || exit 1 @@ -137,7 +137,7 @@ popd > /dev/null || exit 1 # Select test number if [ -z $test_number ]; then - # List out avaliable tests + # List out available tests printf "Which test would you like to debug?\n" id=0 for s in "${tests[@]}" diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last index 3cca9cc2f..6d31b21b9 100644 --- a/scripts/sync-ggml.last +++ b/scripts/sync-ggml.last @@ -1 +1 @@ -564f42082f858f9674b2a2e06e9e779d9ed2c754 +2327bda7a55ac6b72614ac5ebd5c5a5e02553b9b diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index e255a8fc4..2e6550682 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -1059,6 +1059,101 @@ struct llama_sampler * llama_sampler_init_temp_ext(float temp, float delta, floa }; } +// xtc + +struct llama_sampler_xtc { + const float probability; + const float threshold; + const size_t min_keep; + + const uint32_t seed; + uint32_t seed_cur; + + std::mt19937 rng; +}; + +static const char * llama_sampler_xtc_name(const struct llama_sampler * /*smpl*/) { + return "xtc"; +} + +static void llama_sample_xtc_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { + auto * ctx = (llama_sampler_xtc *) smpl->ctx; + + if (ctx->probability <= 0.0f + || ctx->threshold > 0.5f + || cur_p->size < 2) { + return; + } + + std::uniform_real_distribution distribution(0.0f, 1.0f); + float chance = distribution(ctx->rng); + if (chance > ctx->probability) return; + + // in case it's not sorted/recalculated yet + llama_sampler_softmax_impl(cur_p); + + int pos_last = 0; + + for (size_t i = 0; i < cur_p->size; ++i) { + if (cur_p->data[i].p >= ctx->threshold) { + pos_last = i; + } else break; + } + + if (cur_p->size - pos_last >= ctx->min_keep && pos_last > 0) { + cur_p->data += pos_last; + cur_p->size -= pos_last; + } +} + +static struct llama_sampler * llama_sampler_xtc_clone(const struct llama_sampler * smpl) { + const auto * ctx = (const llama_sampler_xtc *) smpl->ctx; + auto * result = llama_sampler_init_xtc(ctx->probability, ctx->threshold, ctx->min_keep, ctx->seed); + + // copy the state + { + auto * result_ctx = (llama_sampler_xtc *) result->ctx; + + result_ctx->rng = ctx->rng; + } + + return result; +} + +static void llama_sampler_xtc_free(struct llama_sampler * smpl) { + delete (llama_sampler_xtc *) smpl->ctx; +} + +static void llama_sampler_xtc_reset(struct llama_sampler * smpl) { + auto * ctx = (llama_sampler_xtc *) smpl->ctx; + ctx->seed_cur = get_rng_seed(ctx->seed); + ctx->rng.seed(ctx->seed_cur); +} + +static struct llama_sampler_i llama_sampler_xtc_i = { + /* .name = */ llama_sampler_xtc_name, + /* .accept = */ nullptr, + /* .apply = */ llama_sample_xtc_apply, + /* .reset = */ llama_sampler_xtc_reset, + /* .clone = */ llama_sampler_xtc_clone, + /* .free = */ llama_sampler_xtc_free, +}; + +struct llama_sampler * llama_sampler_init_xtc(float p, float t, size_t min_keep, uint32_t seed) { + auto seed_cur = get_rng_seed(seed); + return new llama_sampler { + /* .iface = */ &llama_sampler_xtc_i, + /* .ctx = */ new llama_sampler_xtc { + /* .probability = */ p, + /* .threshold = */ t, + /* .min_keep = */ min_keep, + /* .seed = */ seed, + /* .seed_cur = */ seed_cur, + /* .rng = */ std::mt19937(seed_cur), + }, + }; +} + // mirostat struct llama_sampler_mirostat { @@ -1644,6 +1739,207 @@ struct llama_sampler * llama_sampler_init_logit_bias( }; } +// infill + +//#define GGML_DEBUG_SAMPLER_INFILL + +struct llama_sampler_infill { + const struct llama_vocab * vocab; +}; + +static const char * llama_sampler_infill_name(const struct llama_sampler * /*smpl*/) { + return "infill"; +} + +static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { + auto * ctx = (llama_sampler_infill *) smpl->ctx; + + llama_sampler_softmax_impl(cur_p); + +#if defined(GGML_DEBUG_SAMPLER_INFILL) +#define LOG_DBG_CUR LLAMA_LOG_DEBUG +#else +#define LOG_DBG_CUR(...) +#endif + + for (size_t i = 0; i < cur_p->size; ++i) { + LOG_DBG_CUR("%s: cur_p[%3zu] = { id: %6d, p: %.6f, logit: %6.3f }\n", __func__, i, cur_p->data[i].id, cur_p->data[i].p, cur_p->data[i].logit); + } + + float p_txt_sum = 0.0f; + float p_eog_sum = 0.0f; + + for (size_t i = 0; i < cur_p->size; ++i) { + if (llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id)) { + p_eog_sum += cur_p->data[i].p; + } else { + p_txt_sum += cur_p->data[i].p; + } + } + + const float rat = p_eog_sum == 0.0 ? INFINITY : p_txt_sum / p_eog_sum; GGML_UNUSED(rat); + + LOG_DBG_CUR("%s: p_txt_sum = %.2f, p_eog_sum = %.2f, rat = %.2f, n = %zu\n", __func__, p_txt_sum, p_eog_sum, rat, cur_p->size); + + if (3*p_eog_sum*cur_p->size > p_txt_sum) { + LOG_DBG_CUR("%s: the ratio p_txt/p_eog = %.2f is too low -> sampling EOG\n", __func__, p_txt_sum/p_eog_sum); + + // keep just the EOG tokens + const auto size_org = cur_p->size; + + cur_p->size = 0; + + float p_sum = 0.0f; + + for (size_t i = 0; i < size_org; ++i) { + if (llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id)) { + p_sum += cur_p->data[i].p; + + cur_p->data[cur_p->size++] = cur_p->data[i]; + } + } + + // normalize probs + for (size_t i = 0; i < cur_p->size; ++i) { + cur_p->data[i].p /= p_sum; + } + + return; + } + + size_t n_combined = 0; GGML_UNUSED(n_combined); + + // combine tokens with common prefix + for (size_t i = 0; i < cur_p->size; ++i) { + for (size_t j = 0; j < cur_p->size; ++j) { + if (cur_p->data[i].logit == -INFINITY) { + break; + } + + if (i == j || cur_p->data[j].logit == -INFINITY) { + continue; + } + + if (llama_token_is_prefix_impl(*ctx->vocab, cur_p->data[i].id, cur_p->data[j].id)) { + if (cur_p->data[i].p > cur_p->data[j].p) { + cur_p->data[i].p += cur_p->data[j].p; + cur_p->data[j].logit = -INFINITY; + cur_p->data[j].p = 0.0f; + } else { + cur_p->data[j].p += cur_p->data[i].p; + cur_p->data[i].logit = -INFINITY; + cur_p->data[i].p = 0.0f; + } + + n_combined++; + } + } + } + + size_t n_non_eog = 0; + + size_t size_org = cur_p->size; + + float p_sum = 0.0f; + float thold = 0.2f; + + cur_p->size = 0; + + LOG_DBG_CUR("%s: n_combined = %zu, applying thold = %.3f\n", __func__, n_combined, thold); + + for (size_t i = 0; i < size_org; ++i) { + const bool is_eog = llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id); + + if (cur_p->data[i].p < thold && !is_eog) { + continue; + } + + if (!is_eog) { + ++n_non_eog; + } + + p_sum += cur_p->data[i].p; + + // keep this token + cur_p->data[cur_p->size++] = cur_p->data[i]; + } + + LOG_DBG_CUR("%s: n_non_eog = %zu\n", __func__, n_non_eog); + + // if no non-EOG tokens are left -> reduce cur_p to single EOT token + if (n_non_eog == 0) { + cur_p->size = 1; + cur_p->data[0].id = llama_token_eot_impl(*ctx->vocab); + cur_p->data[0].logit = 1.0f; + + return; + } + + // normalize probs + for (size_t i = 0; i < cur_p->size; ++i) { + cur_p->data[i].p /= p_sum; + + LOG_DBG_CUR("%s: cur_p[%3zu] = { id: %6d, p: %.6f, logit: %6.3f }\n", __func__, i, cur_p->data[i].id, cur_p->data[i].p, cur_p->data[i].logit); + } + + size_org = cur_p->size; + p_sum = 0.0f; + thold = 1.0/(n_non_eog + 1); + + cur_p->size = 0; + + LOG_DBG_CUR("%s: applying thold = %.3f\n", __func__, thold); + + for (size_t i = 0; i < size_org; ++i) { + const bool is_eog = llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id); + + if (cur_p->data[i].p < thold && !is_eog) { + continue; + } + + p_sum += cur_p->data[i].p; + + cur_p->data[cur_p->size++] = cur_p->data[i]; + } + + // normalize probs + for (size_t i = 0; i < cur_p->size; ++i) { + cur_p->data[i].p /= p_sum; + + LOG_DBG_CUR("%s: cur_p[%3zu] = { id: %6d, p: %.6f, logit: %6.3f }\n", __func__, i, cur_p->data[i].id, cur_p->data[i].p, cur_p->data[i].logit); + } + +#undef LOG_DBG_CUR +} + +static struct llama_sampler * llama_sampler_infill_clone(const struct llama_sampler * smpl) { + const auto * ctx = (const llama_sampler_infill *) smpl->ctx; + return llama_sampler_init_infill_impl(*ctx->vocab); +} + +static void llama_sampler_infill_free(struct llama_sampler * smpl) { + delete (llama_sampler_infill *) smpl->ctx; +} + +static struct llama_sampler_i llama_sampler_infill_i = { + /* .name = */ llama_sampler_infill_name, + /* .accept = */ nullptr, + /* .apply = */ llama_sampler_infill_apply, + /* .reset = */ nullptr, + /* .clone = */ llama_sampler_infill_clone, + /* .free = */ llama_sampler_infill_free, +}; + +struct llama_sampler * llama_sampler_init_infill_impl( + const struct llama_vocab & vocab) { + return new llama_sampler { + /* .iface = */ &llama_sampler_infill_i, + /* .ctx = */ new llama_sampler_infill { + /* .vocab = */ &vocab, + }, + }; +} + // utils uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl) { diff --git a/src/llama-sampling.h b/src/llama-sampling.h index d90b14713..2683f1b92 100644 --- a/src/llama-sampling.h +++ b/src/llama-sampling.h @@ -4,8 +4,6 @@ #include "llama-grammar.h" -#include - struct llama_vocab; struct llama_grammar; @@ -27,3 +25,6 @@ struct llama_sampler * llama_sampler_init_grammar_impl( const struct llama_vocab & vocab, const char * grammar_str, const char * grammar_root); + +struct llama_sampler * llama_sampler_init_infill_impl( + const struct llama_vocab & vocab); diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index d2f34ddd6..57d56a3d3 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -221,7 +221,7 @@ struct llm_tokenizer_spm_session { } // seed the work queue with all possible 2-character tokens. - for (size_t i = 1; i < symbols.size(); ++i) { + for (int i = 1; i < (int) symbols.size(); ++i) { try_add_bigram(i - 1, i); } @@ -563,7 +563,7 @@ struct llm_tokenizer_bpe_session { index++; symbols.emplace_back(sym); } - for (size_t i = 1; i < symbols.size(); ++i) { + for (int i = 1; i < (int) symbols.size(); ++i) { add_new_bigram(i - 1, i); } @@ -1663,6 +1663,14 @@ llama_token llama_token_eos_impl(const struct llama_vocab & vocab) { return vocab.special_eos_id; } +llama_token llama_token_eot_impl(const struct llama_vocab & vocab) { + return vocab.special_eot_id; +} + +llama_token llama_token_eom_impl(const struct llama_vocab & vocab) { + return vocab.special_eom_id; +} + llama_token llama_token_cls_impl(const struct llama_vocab & vocab) { return vocab.special_cls_id; } @@ -1688,23 +1696,39 @@ bool llama_add_eos_token_impl(const struct llama_vocab & vocab) { } llama_token llama_token_prefix_impl(const struct llama_vocab & vocab) { - return vocab.special_prefix_id; + return vocab.special_fim_pre_id; } llama_token llama_token_middle_impl(const struct llama_vocab & vocab) { - return vocab.special_middle_id; + return vocab.special_fim_mid_id; } llama_token llama_token_suffix_impl(const struct llama_vocab & vocab) { - return vocab.special_suffix_id; + return vocab.special_fim_suf_id; } -llama_token llama_token_eot_impl(const struct llama_vocab & vocab) { - return vocab.special_eot_id; +llama_token llama_token_fim_pre_impl(const struct llama_vocab & vocab) { + return vocab.special_fim_pre_id; } -llama_token llama_token_eom_impl(const struct llama_vocab & vocab) { - return vocab.special_eom_id; +llama_token llama_token_fim_suf_impl(const struct llama_vocab & vocab) { + return vocab.special_fim_suf_id; +} + +llama_token llama_token_fim_mid_impl(const struct llama_vocab & vocab) { + return vocab.special_fim_mid_id; +} + +llama_token llama_token_fim_pad_impl(const struct llama_vocab & vocab) { + return vocab.special_fim_pad_id; +} + +llama_token llama_token_fim_rep_impl(const struct llama_vocab & vocab) { + return vocab.special_fim_rep_id; +} + +llama_token llama_token_fim_sep_impl(const struct llama_vocab & vocab) { + return vocab.special_fim_sep_id; } int32_t llama_tokenize_impl( @@ -1834,6 +1858,23 @@ int32_t llama_token_to_piece_impl(const struct llama_vocab & vocab, llama_token return 0; } +bool llama_token_is_prefix_impl( + const struct llama_vocab & vocab, + llama_token token0, + llama_token token1) { + char text_buf_0[128]; + char text_buf_1[128]; + + const int32_t len0 = llama_token_to_piece_impl(vocab, token0, text_buf_0, sizeof(text_buf_0) - 1, 0, false); + const int32_t len1 = llama_token_to_piece_impl(vocab, token1, text_buf_1, sizeof(text_buf_1) - 1, 0, false); + + if (len0 <= 0 || len1 <= 0) { + return false; + } + + return len0 <= len1 && memcmp(text_buf_0, text_buf_1, len0) == 0; +} + int32_t llama_detokenize_impl( const struct llama_vocab & vocab, const llama_token * tokens, diff --git a/src/llama-vocab.h b/src/llama-vocab.h index 28bad9135..d958d0073 100644 --- a/src/llama-vocab.h +++ b/src/llama-vocab.h @@ -37,20 +37,26 @@ struct llama_vocab { std::map, int> bpe_ranks; // default LLaMA special tokens + // TODO: should we set all of these to LLAMA_TOKEN_NULL? id special_bos_id = 1; id special_eos_id = 2; + id special_eot_id = LLAMA_TOKEN_NULL; + id special_eom_id = LLAMA_TOKEN_NULL; id special_unk_id = 0; id special_sep_id = LLAMA_TOKEN_NULL; id special_pad_id = LLAMA_TOKEN_NULL; id special_cls_id = LLAMA_TOKEN_NULL; id special_mask_id = LLAMA_TOKEN_NULL; - id linefeed_id = 13; - id special_prefix_id = LLAMA_TOKEN_NULL; - id special_suffix_id = LLAMA_TOKEN_NULL; - id special_middle_id = LLAMA_TOKEN_NULL; - id special_eot_id = LLAMA_TOKEN_NULL; // TODO: move above after "eos_id", and here add "file separator" token - id special_eom_id = LLAMA_TOKEN_NULL; + id linefeed_id = 13; + + // fim tokens + id special_fim_pre_id = LLAMA_TOKEN_NULL; + id special_fim_suf_id = LLAMA_TOKEN_NULL; + id special_fim_mid_id = LLAMA_TOKEN_NULL; + id special_fim_pad_id = LLAMA_TOKEN_NULL; + id special_fim_rep_id = LLAMA_TOKEN_NULL; // repo + id special_fim_sep_id = LLAMA_TOKEN_NULL; // file separator // set of all tokens that cause "end of generation" std::set special_eog_ids; @@ -104,19 +110,26 @@ bool llama_token_is_control_impl(const struct llama_vocab & vocab, llama_token t llama_token llama_token_bos_impl(const struct llama_vocab & vocab); llama_token llama_token_eos_impl(const struct llama_vocab & vocab); +llama_token llama_token_eot_impl(const struct llama_vocab & vocab); +llama_token llama_token_eom_impl(const struct llama_vocab & vocab); llama_token llama_token_cls_impl(const struct llama_vocab & vocab); llama_token llama_token_sep_impl(const struct llama_vocab & vocab); llama_token llama_token_nl_impl (const struct llama_vocab & vocab); llama_token llama_token_pad_impl(const struct llama_vocab & vocab); -bool llama_add_bos_token_impl(const struct llama_vocab & vocab); -bool llama_add_eos_token_impl(const struct llama_vocab & vocab); - llama_token llama_token_prefix_impl(const struct llama_vocab & vocab); llama_token llama_token_middle_impl(const struct llama_vocab & vocab); llama_token llama_token_suffix_impl(const struct llama_vocab & vocab); -llama_token llama_token_eot_impl (const struct llama_vocab & vocab); -llama_token llama_token_eom_impl (const struct llama_vocab & vocab); + +llama_token llama_token_fim_pre_impl(const struct llama_vocab & vocab); +llama_token llama_token_fim_suf_impl(const struct llama_vocab & vocab); +llama_token llama_token_fim_mid_impl(const struct llama_vocab & vocab); +llama_token llama_token_fim_pad_impl(const struct llama_vocab & vocab); +llama_token llama_token_fim_rep_impl(const struct llama_vocab & vocab); +llama_token llama_token_fim_sep_impl(const struct llama_vocab & vocab); + +bool llama_add_bos_token_impl(const struct llama_vocab & vocab); +bool llama_add_eos_token_impl(const struct llama_vocab & vocab); int32_t llama_tokenize_impl( const struct llama_vocab & vocab, @@ -136,6 +149,12 @@ int32_t llama_token_to_piece_impl( int32_t lstrip, bool special); +// check if token0 is contained as a prefix in token1 +bool llama_token_is_prefix_impl( + const struct llama_vocab & vocab, + llama_token token0, + llama_token token1); + int32_t llama_detokenize_impl( const struct llama_vocab & vocab, const llama_token * tokens, diff --git a/src/llama.cpp b/src/llama.cpp index 9e83c7913..68479c6db 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -8,10 +8,6 @@ #include "ggml-alloc.h" #include "ggml-backend.h" -#ifdef GGML_USE_RPC -# include "ggml-rpc.h" -#endif - #if defined(GGML_USE_SYCL) # include "ggml-sycl.h" #elif defined(GGML_USE_KOMPUTE) @@ -347,6 +343,8 @@ enum llm_kv { LLM_KV_TOKENIZER_MERGES, LLM_KV_TOKENIZER_BOS_ID, LLM_KV_TOKENIZER_EOS_ID, + LLM_KV_TOKENIZER_EOT_ID, + LLM_KV_TOKENIZER_EOM_ID, LLM_KV_TOKENIZER_UNK_ID, LLM_KV_TOKENIZER_SEP_ID, LLM_KV_TOKENIZER_PAD_ID, @@ -359,14 +357,20 @@ enum llm_kv { LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP, LLM_KV_TOKENIZER_HF_JSON, LLM_KV_TOKENIZER_RWKV, - LLM_KV_TOKENIZER_PREFIX_ID, - LLM_KV_TOKENIZER_SUFFIX_ID, - LLM_KV_TOKENIZER_MIDDLE_ID, - LLM_KV_TOKENIZER_EOT_ID, - LLM_KV_TOKENIZER_EOM_ID, + LLM_KV_TOKENIZER_FIM_PRE_ID, + LLM_KV_TOKENIZER_FIM_SUF_ID, + LLM_KV_TOKENIZER_FIM_MID_ID, + LLM_KV_TOKENIZER_FIM_PAD_ID, + LLM_KV_TOKENIZER_FIM_REP_ID, + LLM_KV_TOKENIZER_FIM_SEP_ID, LLM_KV_ADAPTER_TYPE, LLM_KV_ADAPTER_LORA_ALPHA, + + // deprecated: + LLM_KV_TOKENIZER_PREFIX_ID, + LLM_KV_TOKENIZER_SUFFIX_ID, + LLM_KV_TOKENIZER_MIDDLE_ID, }; static const std::map LLM_KV_NAMES = { @@ -424,57 +428,65 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" }, { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" }, - { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" }, - { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" }, - { LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" }, - { LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" }, - { LLM_KV_ROPE_SCALING_FACTOR, "%s.rope.scaling.factor" }, - { LLM_KV_ROPE_SCALING_ATTN_FACTOR, "%s.rope.scaling.attn_factor" }, - { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" }, - { LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" }, - { LLM_KV_ROPE_SCALING_YARN_LOG_MUL, "%s.rope.scaling.yarn_log_multiplier" }, + { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" }, + { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" }, + { LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" }, + { LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" }, + { LLM_KV_ROPE_SCALING_FACTOR, "%s.rope.scaling.factor" }, + { LLM_KV_ROPE_SCALING_ATTN_FACTOR, "%s.rope.scaling.attn_factor" }, + { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" }, + { LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" }, + { LLM_KV_ROPE_SCALING_YARN_LOG_MUL, "%s.rope.scaling.yarn_log_multiplier" }, - { LLM_KV_SPLIT_NO, "split.no" }, - { LLM_KV_SPLIT_COUNT, "split.count" }, - { LLM_KV_SPLIT_TENSORS_COUNT, "split.tensors.count" }, + { LLM_KV_SPLIT_NO, "split.no" }, + { LLM_KV_SPLIT_COUNT, "split.count" }, + { LLM_KV_SPLIT_TENSORS_COUNT, "split.tensors.count" }, - { LLM_KV_SSM_CONV_KERNEL, "%s.ssm.conv_kernel" }, - { LLM_KV_SSM_INNER_SIZE, "%s.ssm.inner_size" }, - { LLM_KV_SSM_STATE_SIZE, "%s.ssm.state_size" }, - { LLM_KV_SSM_TIME_STEP_RANK, "%s.ssm.time_step_rank" }, - { LLM_KV_SSM_DT_B_C_RMS, "%s.ssm.dt_b_c_rms" }, + { LLM_KV_SSM_CONV_KERNEL, "%s.ssm.conv_kernel" }, + { LLM_KV_SSM_INNER_SIZE, "%s.ssm.inner_size" }, + { LLM_KV_SSM_STATE_SIZE, "%s.ssm.state_size" }, + { LLM_KV_SSM_TIME_STEP_RANK, "%s.ssm.time_step_rank" }, + { LLM_KV_SSM_DT_B_C_RMS, "%s.ssm.dt_b_c_rms" }, - { LLM_KV_WKV_HEAD_SIZE, "%s.wkv.head_size" }, + { LLM_KV_WKV_HEAD_SIZE, "%s.wkv.head_size" }, - { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" }, - { LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" }, - { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" }, - { LLM_KV_TOKENIZER_TOKEN_TYPE, "tokenizer.ggml.token_type" }, - { LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, "tokenizer.ggml.token_type_count" }, - { LLM_KV_TOKENIZER_SCORES, "tokenizer.ggml.scores" }, - { LLM_KV_TOKENIZER_MERGES, "tokenizer.ggml.merges" }, - { LLM_KV_TOKENIZER_BOS_ID, "tokenizer.ggml.bos_token_id" }, - { LLM_KV_TOKENIZER_EOS_ID, "tokenizer.ggml.eos_token_id" }, - { LLM_KV_TOKENIZER_UNK_ID, "tokenizer.ggml.unknown_token_id" }, - { LLM_KV_TOKENIZER_SEP_ID, "tokenizer.ggml.seperator_token_id" }, - { LLM_KV_TOKENIZER_PAD_ID, "tokenizer.ggml.padding_token_id" }, - { LLM_KV_TOKENIZER_CLS_ID, "tokenizer.ggml.cls_token_id" }, - { LLM_KV_TOKENIZER_MASK_ID, "tokenizer.ggml.mask_token_id" }, - { LLM_KV_TOKENIZER_ADD_BOS, "tokenizer.ggml.add_bos_token" }, - { LLM_KV_TOKENIZER_ADD_EOS, "tokenizer.ggml.add_eos_token" }, - { LLM_KV_TOKENIZER_ADD_PREFIX, "tokenizer.ggml.add_space_prefix" }, - { LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, "tokenizer.ggml.remove_extra_whitespaces" }, - { LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP, "tokenizer.ggml.precompiled_charsmap" }, - { LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" }, - { LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" }, - { LLM_KV_TOKENIZER_PREFIX_ID, "tokenizer.ggml.prefix_token_id" }, - { LLM_KV_TOKENIZER_SUFFIX_ID, "tokenizer.ggml.suffix_token_id" }, - { LLM_KV_TOKENIZER_MIDDLE_ID, "tokenizer.ggml.middle_token_id" }, - { LLM_KV_TOKENIZER_EOT_ID, "tokenizer.ggml.eot_token_id" }, - { LLM_KV_TOKENIZER_EOM_ID, "tokenizer.ggml.eom_token_id" }, + { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" }, + { LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" }, + { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" }, + { LLM_KV_TOKENIZER_TOKEN_TYPE, "tokenizer.ggml.token_type" }, + { LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, "tokenizer.ggml.token_type_count" }, + { LLM_KV_TOKENIZER_SCORES, "tokenizer.ggml.scores" }, + { LLM_KV_TOKENIZER_MERGES, "tokenizer.ggml.merges" }, + { LLM_KV_TOKENIZER_BOS_ID, "tokenizer.ggml.bos_token_id" }, + { LLM_KV_TOKENIZER_EOS_ID, "tokenizer.ggml.eos_token_id" }, + { LLM_KV_TOKENIZER_EOT_ID, "tokenizer.ggml.eot_token_id" }, + { LLM_KV_TOKENIZER_EOM_ID, "tokenizer.ggml.eom_token_id" }, + { LLM_KV_TOKENIZER_UNK_ID, "tokenizer.ggml.unknown_token_id" }, + { LLM_KV_TOKENIZER_SEP_ID, "tokenizer.ggml.seperator_token_id" }, + { LLM_KV_TOKENIZER_PAD_ID, "tokenizer.ggml.padding_token_id" }, + { LLM_KV_TOKENIZER_CLS_ID, "tokenizer.ggml.cls_token_id" }, + { LLM_KV_TOKENIZER_MASK_ID, "tokenizer.ggml.mask_token_id" }, + { LLM_KV_TOKENIZER_ADD_BOS, "tokenizer.ggml.add_bos_token" }, + { LLM_KV_TOKENIZER_ADD_EOS, "tokenizer.ggml.add_eos_token" }, + { LLM_KV_TOKENIZER_ADD_PREFIX, "tokenizer.ggml.add_space_prefix" }, + { LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, "tokenizer.ggml.remove_extra_whitespaces" }, + { LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP, "tokenizer.ggml.precompiled_charsmap" }, + { LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" }, + { LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" }, + { LLM_KV_TOKENIZER_FIM_PRE_ID, "tokenizer.ggml.fim_pre_token_id" }, + { LLM_KV_TOKENIZER_FIM_SUF_ID, "tokenizer.ggml.fim_suf_token_id" }, + { LLM_KV_TOKENIZER_FIM_MID_ID, "tokenizer.ggml.fim_mid_token_id" }, + { LLM_KV_TOKENIZER_FIM_PAD_ID, "tokenizer.ggml.fim_pad_token_id" }, + { LLM_KV_TOKENIZER_FIM_REP_ID, "tokenizer.ggml.fim_rep_token_id" }, + { LLM_KV_TOKENIZER_FIM_SEP_ID, "tokenizer.ggml.fim_sep_token_id" }, - { LLM_KV_ADAPTER_TYPE, "adapter.type" }, - { LLM_KV_ADAPTER_LORA_ALPHA, "adapter.lora.alpha" }, + { LLM_KV_ADAPTER_TYPE, "adapter.type" }, + { LLM_KV_ADAPTER_LORA_ALPHA, "adapter.lora.alpha" }, + + // deprecated + { LLM_KV_TOKENIZER_PREFIX_ID, "tokenizer.ggml.prefix_token_id" }, + { LLM_KV_TOKENIZER_SUFFIX_ID, "tokenizer.ggml.suffix_token_id" }, + { LLM_KV_TOKENIZER_MIDDLE_ID, "tokenizer.ggml.middle_token_id" }, }; struct LLM_KV { @@ -3402,10 +3414,6 @@ struct llama_lora_adapter { static int llama_get_device_count(const llama_model & model) { int count = (int) model.devices.size(); -#if defined(GGML_USE_RPC) - count += (int) model.rpc_servers.size(); -#endif - #if defined(GGML_USE_SYCL) count += ggml_backend_sycl_get_device_count(); #elif defined(GGML_USE_CANN) @@ -3452,15 +3460,6 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(const llama_mode static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int device) { ggml_backend_buffer_type_t buft = nullptr; -#if defined(GGML_USE_RPC) - int rpc_count = (int)model.rpc_servers.size(); - if (device < rpc_count) { - const char * endpoint = model.rpc_servers[device].c_str(); - return ggml_backend_rpc_buffer_type(endpoint); - } - device -= rpc_count; -#endif - if (device < (int)model.devices.size()) { return ggml_backend_dev_buffer_type(model.devices[device]); } @@ -3513,18 +3512,6 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_mo } static size_t llama_get_device_memory(const llama_model & model, int device) { -#if defined(GGML_USE_RPC) - int rpc_count = (int)model.rpc_servers.size(); - if (device < rpc_count) { - size_t total; - size_t free; - const char * endpoint = model.rpc_servers[device].c_str(); - ggml_backend_rpc_get_device_memory(endpoint, &free, &total); - return free; - } - device = device - rpc_count; -#endif - if (device < (int)model.devices.size()) { ggml_backend_dev_t dev = model.devices[device]; size_t total; @@ -6178,14 +6165,14 @@ static void llm_load_vocab( vocab.type = LLAMA_VOCAB_TYPE_NONE; // default special tokens - vocab.special_bos_id = -1; - vocab.special_eos_id = -1; - vocab.special_unk_id = -1; - vocab.special_sep_id = -1; - vocab.special_pad_id = -1; - vocab.special_cls_id = -1; - vocab.special_mask_id = -1; - vocab.linefeed_id = -1; + vocab.special_bos_id = LLAMA_TOKEN_NULL; + vocab.special_eos_id = LLAMA_TOKEN_NULL; + vocab.special_unk_id = LLAMA_TOKEN_NULL; + vocab.special_sep_id = LLAMA_TOKEN_NULL; + vocab.special_pad_id = LLAMA_TOKEN_NULL; + vocab.special_cls_id = LLAMA_TOKEN_NULL; + vocab.special_mask_id = LLAMA_TOKEN_NULL; + vocab.linefeed_id = LLAMA_TOKEN_NULL; // read vocab size from metadata if (!ml.get_key(LLM_KV_VOCAB_SIZE, vocab.n_vocab, false)) { @@ -6202,16 +6189,16 @@ static void llm_load_vocab( vocab.special_bos_id = 1; vocab.special_eos_id = 2; vocab.special_unk_id = 0; - vocab.special_sep_id = -1; - vocab.special_pad_id = -1; - vocab.special_cls_id = -1; - vocab.special_mask_id = -1; + vocab.special_sep_id = LLAMA_TOKEN_NULL; + vocab.special_pad_id = LLAMA_TOKEN_NULL; + vocab.special_cls_id = LLAMA_TOKEN_NULL; + vocab.special_mask_id = LLAMA_TOKEN_NULL; } else if (tokenizer_model == "bert") { vocab.type = LLAMA_VOCAB_TYPE_WPM; // default special tokens - vocab.special_bos_id = -1; - vocab.special_eos_id = -1; + vocab.special_bos_id = LLAMA_TOKEN_NULL; + vocab.special_eos_id = LLAMA_TOKEN_NULL; vocab.special_unk_id = 100; vocab.special_sep_id = 102; vocab.special_pad_id = 0; @@ -6247,22 +6234,22 @@ static void llm_load_vocab( // default special tokens vocab.special_bos_id = 11; vocab.special_eos_id = 11; - vocab.special_unk_id = -1; - vocab.special_sep_id = -1; - vocab.special_pad_id = -1; - vocab.special_cls_id = -1; - vocab.special_mask_id = -1; + vocab.special_unk_id = LLAMA_TOKEN_NULL; + vocab.special_sep_id = LLAMA_TOKEN_NULL; + vocab.special_pad_id = LLAMA_TOKEN_NULL; + vocab.special_cls_id = LLAMA_TOKEN_NULL; + vocab.special_mask_id = LLAMA_TOKEN_NULL; } else if (tokenizer_model == "t5") { vocab.type = LLAMA_VOCAB_TYPE_UGM; // default special tokens - vocab.special_bos_id = -1; + vocab.special_bos_id = LLAMA_TOKEN_NULL; vocab.special_eos_id = 1; vocab.special_unk_id = 2; - vocab.special_sep_id = -1; + vocab.special_sep_id = LLAMA_TOKEN_NULL; vocab.special_pad_id = 0; - vocab.special_cls_id = -1; - vocab.special_mask_id = -1; + vocab.special_cls_id = LLAMA_TOKEN_NULL; + vocab.special_mask_id = LLAMA_TOKEN_NULL; const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str()); if (precompiled_charsmap_keyidx != -1) { @@ -6285,11 +6272,11 @@ static void llm_load_vocab( vocab.type = LLAMA_VOCAB_TYPE_RWKV; // default special tokens - vocab.special_bos_id = -1; - vocab.special_eos_id = -1; - vocab.special_unk_id = -1; - vocab.special_sep_id = -1; - vocab.special_pad_id = -1; + vocab.special_bos_id = LLAMA_TOKEN_NULL; + vocab.special_eos_id = LLAMA_TOKEN_NULL; + vocab.special_unk_id = LLAMA_TOKEN_NULL; + vocab.special_sep_id = LLAMA_TOKEN_NULL; + vocab.special_pad_id = LLAMA_TOKEN_NULL; } else { throw std::runtime_error(format("unknown tokenizer: '%s'", tokenizer_model.c_str())); } @@ -6373,7 +6360,7 @@ static void llm_load_vocab( } else if ( tokenizer_pre == "chatglm-bpe") { vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CHATGLM4; - vocab.special_bos_id = -1; + vocab.special_bos_id = LLAMA_TOKEN_NULL; } else if ( tokenizer_pre == "viking") { vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_VIKING; @@ -6499,44 +6486,6 @@ static void llm_load_vocab( // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n' if (vocab.type == LLAMA_VOCAB_TYPE_SPM) { - // For Fill-In-the-Middle (FIM)/infill models which where converted - // prior to support of FIM special tokens in GGUF, the following - // will allow those models to continue to work. The general names - // of the known models are currently CodeLlama (LLM_ARCH_LLAMA) and - // CodeGemma (LLM_ARCH_GEMMA). This can potentially be removed once - // new versions of these models have been published. - std::string gen_name; - ml.get_key(LLM_KV_GENERAL_NAME, gen_name, false); - - std::transform(gen_name.begin(), gen_name.end(), gen_name.begin(), - [](unsigned char c){ return std::tolower(c); }); - - if (gen_name.find("code") != std::string::npos) { - if (model.arch == LLM_ARCH_LLAMA - && 32010 < vocab.id_to_token.size() - && vocab.id_to_token[32007].text.find("
") != std::string::npos
-              && vocab.id_to_token[32008].text.find("") != std::string::npos
-              && vocab.id_to_token[32009].text.find("") != std::string::npos
-              && vocab.id_to_token[32010].text.find("") != std::string::npos) {
-                vocab.special_prefix_id = 32007;
-                vocab.special_suffix_id = 32008;
-                vocab.special_middle_id = 32009;
-                vocab.special_eot_id    = 32010;
-            } else if (model.arch == LLM_ARCH_GEMMA
-              && 107 < vocab.id_to_token.size()
-              && vocab.id_to_token[67].text == "<|fim_prefix|>"
-              && vocab.id_to_token[69].text == "<|fim_suffix|>"
-              && vocab.id_to_token[68].text == "<|fim_middle|>"
-              && vocab.id_to_token[107].text == "") {
-                vocab.special_prefix_id = 67;
-                vocab.special_suffix_id = 69;
-                vocab.special_middle_id = 68;
-                // TODO: this is not EOT, it is "file separator" token, needs fix
-                //       https://huggingface.co/google/codegemma-7b-it/blob/9b1d9231388358c04d90bd003458f5070d97db44/tokenizer_config.json#L565-L572
-                //vocab.special_eot_id    = 70;
-                vocab.special_eot_id    = 107;
-            }
-        }
         try {
             vocab.linefeed_id = llama_byte_to_token_impl(vocab, '\n');
         } catch (const std::exception & e) {
@@ -6564,18 +6513,26 @@ static void llm_load_vocab(
     // special tokens
     {
         const std::vector> special_token_types = {
-            { LLM_KV_TOKENIZER_BOS_ID,    vocab.special_bos_id    },
-            { LLM_KV_TOKENIZER_EOS_ID,    vocab.special_eos_id    },
-            { LLM_KV_TOKENIZER_UNK_ID,    vocab.special_unk_id    },
-            { LLM_KV_TOKENIZER_SEP_ID,    vocab.special_sep_id    },
-            { LLM_KV_TOKENIZER_PAD_ID,    vocab.special_pad_id    },
-            { LLM_KV_TOKENIZER_CLS_ID,    vocab.special_cls_id    },
-            { LLM_KV_TOKENIZER_MASK_ID,   vocab.special_mask_id   },
-            { LLM_KV_TOKENIZER_PREFIX_ID, vocab.special_prefix_id },
-            { LLM_KV_TOKENIZER_SUFFIX_ID, vocab.special_suffix_id },
-            { LLM_KV_TOKENIZER_MIDDLE_ID, vocab.special_middle_id },
-            { LLM_KV_TOKENIZER_EOT_ID,    vocab.special_eot_id    },
-            { LLM_KV_TOKENIZER_EOM_ID,    vocab.special_eom_id    },
+            { LLM_KV_TOKENIZER_BOS_ID,     vocab.special_bos_id     },
+            { LLM_KV_TOKENIZER_EOS_ID,     vocab.special_eos_id     },
+            { LLM_KV_TOKENIZER_EOT_ID,     vocab.special_eot_id     },
+            { LLM_KV_TOKENIZER_EOM_ID,     vocab.special_eom_id     },
+            { LLM_KV_TOKENIZER_UNK_ID,     vocab.special_unk_id     },
+            { LLM_KV_TOKENIZER_SEP_ID,     vocab.special_sep_id     },
+            { LLM_KV_TOKENIZER_PAD_ID,     vocab.special_pad_id     },
+            { LLM_KV_TOKENIZER_CLS_ID,     vocab.special_cls_id     },
+            { LLM_KV_TOKENIZER_MASK_ID,    vocab.special_mask_id    },
+            { LLM_KV_TOKENIZER_FIM_PRE_ID, vocab.special_fim_pre_id },
+            { LLM_KV_TOKENIZER_FIM_SUF_ID, vocab.special_fim_suf_id },
+            { LLM_KV_TOKENIZER_FIM_MID_ID, vocab.special_fim_mid_id },
+            { LLM_KV_TOKENIZER_FIM_PAD_ID, vocab.special_fim_pad_id },
+            { LLM_KV_TOKENIZER_FIM_REP_ID, vocab.special_fim_rep_id },
+            { LLM_KV_TOKENIZER_FIM_SEP_ID, vocab.special_fim_sep_id },
+
+            // deprecated
+            { LLM_KV_TOKENIZER_PREFIX_ID, vocab.special_fim_pre_id },
+            { LLM_KV_TOKENIZER_SUFFIX_ID, vocab.special_fim_suf_id },
+            { LLM_KV_TOKENIZER_MIDDLE_ID, vocab.special_fim_mid_id },
         };
 
         for (const auto & it : special_token_types) {
@@ -6606,46 +6563,140 @@ static void llm_load_vocab(
             }
         }
 
-        // find EOT token: "<|eot_id|>", "<|im_end|>", "", etc.
-        //
-        // TODO: convert scripts should provide this token through the KV metadata LLAMA_KV_TOKENIZER_EOT_ID
-        //       for now, we apply this workaround to find the EOT token based on its text
-        if (vocab.special_eot_id == -1) {
-            for (const auto & t : vocab.token_to_id) {
+        // auto-detect special tokens by text
+        // TODO: convert scripts should provide these tokens through the KV metadata LLM_KV_TOKENIZER_...
+        //       for now, we apply this workaround to find the tokens based on their text
+
+        for (const auto & t : vocab.token_to_id) {
+            // find EOT token: "<|eot_id|>", "<|im_end|>", "", etc.
+            if (vocab.special_eot_id == LLAMA_TOKEN_NULL) {
                 if (false
-                        // TODO: gemma "" is exported as a normal token, so the following check does not work
-                        //       need to fix convert script
-                        //vocab.id_to_token[t.second].type == LLAMA_TOKEN_TYPE_CONTROL &&
                         || t.first == "<|eot_id|>"
                         || t.first == "<|im_end|>"
                         || t.first == "<|end|>"
                         || t.first == ""
                         || t.first == "<|endoftext|>"
                         || t.first == ""
+                        || t.first == "<|end▁of▁sentence|>" // DeepSeek
                    ) {
                     vocab.special_eot_id = t.second;
                     if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
-                        LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
-                                __func__, t.first.c_str());
+                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                                __func__, t.second, t.first.c_str());
                         vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
                     }
-                    break;
                 }
             }
-        }
 
-        // find EOM token: "<|eom_id|>"
-        //
-        // TODO: convert scripts should provide this token through the KV metadata LLAMA_KV_TOKENIZER_EOM_ID
-        //       for now, we apply this workaround to find the EOM token based on its text
-        if (vocab.special_eom_id == -1) {
-            const auto & t = vocab.token_to_id.find("<|eom_id|>");
-            if (t != vocab.token_to_id.end()) {
-                vocab.special_eom_id = t->second;
-                if ((vocab.id_to_token[t->second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
-                    LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
-                        __func__, t->first.c_str());
-                    vocab.id_to_token[t->second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+            // find EOM token: "<|eom_id|>"
+            if (vocab.special_eom_id == LLAMA_TOKEN_NULL) {
+                if (false
+                        || t.first == "<|eom_id|>"
+                        ) {
+                    vocab.special_eom_id = t.second;
+                    if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                                __func__, t.second, t.first.c_str());
+                        vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+                    }
+                }
+            }
+
+            // find FIM_PRE token: "<|fim_prefix|>", "", "
", etc.
+            if (vocab.special_fim_pre_id == LLAMA_TOKEN_NULL) {
+                if (false
+                        || t.first == "<|fim_prefix|>"  // Qwen
+                        || t.first == ""
+                        || t.first == "<|fim▁begin|>" // DeepSeek
+                        || t.first == "
"
+                        ) {
+                    vocab.special_fim_pre_id = t.second;
+                    if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                                __func__, t.second, t.first.c_str());
+                        vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+                    }
+                }
+            }
+
+            // find FIM_SUF token: "<|fim_suffix|>", "", "", etc.
+            if (vocab.special_fim_suf_id == LLAMA_TOKEN_NULL) {
+                if (false
+                        || t.first == "<|fim_suffix|>" // Qwen
+                        || t.first == ""
+                        || t.first == "<|fim▁hole|>" // DeepSeek
+                        || t.first == ""
+                        ) {
+                    vocab.special_fim_suf_id = t.second;
+                    if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                                __func__, t.second, t.first.c_str());
+                        vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+                    }
+                }
+            }
+
+            // find FIM_MID token: "<|fim_middle|>", "", "", etc.
+            if (vocab.special_fim_mid_id == LLAMA_TOKEN_NULL) {
+                if (false
+                        || t.first == "<|fim_middle|>" // Qwen
+                        || t.first == ""
+                        || t.first == "<|fim▁end|>"  // DeepSeek
+                        || t.first == ""
+                        ) {
+                    vocab.special_fim_mid_id = t.second;
+                    if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                                __func__, t.second, t.first.c_str());
+                        vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+                    }
+                }
+            }
+
+            // find FIM_PAD token: "<|fim_pad|>", "", "", etc.
+            if (vocab.special_fim_pad_id == LLAMA_TOKEN_NULL) {
+                if (false
+                        || t.first == "<|fim_pad|>" // Qwen
+                        || t.first == ""
+                        || t.first == ""
+                        ) {
+                    vocab.special_fim_pad_id = t.second;
+                    if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                                __func__, t.second, t.first.c_str());
+                        vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+                    }
+                }
+            }
+
+            // find FIM_REP token: "<|fim_repo|>", "", "", etc.
+            if (vocab.special_fim_rep_id == LLAMA_TOKEN_NULL) {
+                if (false
+                        || t.first == "<|fim_repo|>"  // Qwen
+                        || t.first == "<|repo_name|>"
+                        || t.first == ""
+                        || t.first == ""
+                        ) {
+                    vocab.special_fim_rep_id = t.second;
+                    if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                                __func__, t.second, t.first.c_str());
+                        vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+                    }
+                }
+            }
+
+            // find FIM_SEP token: "<|file_sep|>"
+            if (vocab.special_fim_sep_id == LLAMA_TOKEN_NULL) {
+                if (false
+                        || t.first == "<|file_sep|>" // Qwen
+                        ) {
+                    vocab.special_fim_sep_id = t.second;
+                    if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                                __func__, t.second, t.first.c_str());
+                        vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+                    }
                 }
             }
         }
@@ -6654,6 +6705,19 @@ static void llm_load_vocab(
         // this is currently determined based on the token text, which is obviously not ideal
         // ref: https://github.com/ggerganov/llama.cpp/issues/9606
         vocab.special_eog_ids.clear();
+
+        if (vocab.special_fim_pad_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_fim_pad_id) == 0) {
+            vocab.special_eog_ids.insert(vocab.special_fim_pad_id);
+        }
+
+        if (vocab.special_fim_rep_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_fim_rep_id) == 0) {
+            vocab.special_eog_ids.insert(vocab.special_fim_rep_id);
+        }
+
+        if (vocab.special_fim_sep_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_fim_sep_id) == 0) {
+            vocab.special_eog_ids.insert(vocab.special_fim_sep_id);
+        }
+
         for (const auto & t : vocab.token_to_id) {
             if (false
                     || t.first == "<|eot_id|>"
@@ -6666,24 +6730,31 @@ static void llm_load_vocab(
                ) {
                 vocab.special_eog_ids.insert(t.second);
                 if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
-                    LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
-                            __func__, t.first.c_str());
+                    LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                            __func__, t.second, t.first.c_str());
                     vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
                 }
+            } else {
+                // token is control, but not marked as EOG -> print a warning
+                if (vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL && vocab.special_eog_ids.count(t.second) == 0) {
+                    LLAMA_LOG_WARN("%s: control token: %6d '%s' is not marked as EOG\n",
+                            __func__, t.second, t.first.c_str());
+                }
             }
         }
 
-        if (vocab.special_eos_id != -1 && vocab.special_eog_ids.count(vocab.special_eos_id) == 0) {
+        // sanity checks
+        if (vocab.special_eos_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_eos_id) == 0) {
             vocab.special_eog_ids.insert(vocab.special_eos_id);
             LLAMA_LOG_WARN("%s: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
         }
 
-        if (vocab.special_eot_id != -1 && vocab.special_eog_ids.count(vocab.special_eot_id) == 0) {
+        if (vocab.special_eot_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_eot_id) == 0) {
             vocab.special_eog_ids.insert(vocab.special_eot_id);
             LLAMA_LOG_WARN("%s: special_eot_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
         }
 
-        if (vocab.special_eom_id != -1 && vocab.special_eog_ids.count(vocab.special_eom_id) == 0) {
+        if (vocab.special_eom_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_eom_id) == 0) {
             vocab.special_eog_ids.insert(vocab.special_eom_id);
             LLAMA_LOG_WARN("%s: special_eom_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
         }
@@ -6877,20 +6948,24 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     LLAMA_LOG_INFO("%s: general.name     = %s\n",    __func__, model.name.c_str());
 
     // special tokens
-    if (vocab.special_bos_id    != -1) { LLAMA_LOG_INFO( "%s: BOS token        = %d '%s'\n", __func__, vocab.special_bos_id,  vocab.id_to_token[vocab.special_bos_id].text.c_str() );  }
-    if (vocab.special_eos_id    != -1) { LLAMA_LOG_INFO( "%s: EOS token        = %d '%s'\n", __func__, vocab.special_eos_id,  vocab.id_to_token[vocab.special_eos_id].text.c_str() );  }
-    if (vocab.special_unk_id    != -1) { LLAMA_LOG_INFO( "%s: UNK token        = %d '%s'\n", __func__, vocab.special_unk_id,  vocab.id_to_token[vocab.special_unk_id].text.c_str() );  }
-    if (vocab.special_sep_id    != -1) { LLAMA_LOG_INFO( "%s: SEP token        = %d '%s'\n", __func__, vocab.special_sep_id,  vocab.id_to_token[vocab.special_sep_id].text.c_str() );  }
-    if (vocab.special_pad_id    != -1) { LLAMA_LOG_INFO( "%s: PAD token        = %d '%s'\n", __func__, vocab.special_pad_id,  vocab.id_to_token[vocab.special_pad_id].text.c_str() );  }
-    if (vocab.special_cls_id    != -1) { LLAMA_LOG_INFO( "%s: CLS token        = %d '%s'\n", __func__, vocab.special_cls_id,  vocab.id_to_token[vocab.special_cls_id].text.c_str() );  }
-    if (vocab.special_mask_id   != -1) { LLAMA_LOG_INFO( "%s: MASK token       = %d '%s'\n", __func__, vocab.special_mask_id, vocab.id_to_token[vocab.special_mask_id].text.c_str() ); }
+    if (vocab.special_bos_id  != -1)    { LLAMA_LOG_INFO( "%s: BOS token        = %d '%s'\n", __func__, vocab.special_bos_id,     vocab.id_to_token[vocab.special_bos_id].text.c_str() );  }
+    if (vocab.special_eos_id  != -1)    { LLAMA_LOG_INFO( "%s: EOS token        = %d '%s'\n", __func__, vocab.special_eos_id,     vocab.id_to_token[vocab.special_eos_id].text.c_str() );  }
+    if (vocab.special_eot_id  != -1)    { LLAMA_LOG_INFO( "%s: EOT token        = %d '%s'\n", __func__, vocab.special_eot_id,     vocab.id_to_token[vocab.special_eot_id].text.c_str() );  }
+    if (vocab.special_eom_id  != -1)    { LLAMA_LOG_INFO( "%s: EOM token        = %d '%s'\n", __func__, vocab.special_eom_id,     vocab.id_to_token[vocab.special_eom_id].text.c_str() );  }
+    if (vocab.special_unk_id  != -1)    { LLAMA_LOG_INFO( "%s: UNK token        = %d '%s'\n", __func__, vocab.special_unk_id,     vocab.id_to_token[vocab.special_unk_id].text.c_str() );  }
+    if (vocab.special_sep_id  != -1)    { LLAMA_LOG_INFO( "%s: SEP token        = %d '%s'\n", __func__, vocab.special_sep_id,     vocab.id_to_token[vocab.special_sep_id].text.c_str() );  }
+    if (vocab.special_pad_id  != -1)    { LLAMA_LOG_INFO( "%s: PAD token        = %d '%s'\n", __func__, vocab.special_pad_id,     vocab.id_to_token[vocab.special_pad_id].text.c_str() );  }
+    if (vocab.special_cls_id  != -1)    { LLAMA_LOG_INFO( "%s: CLS token        = %d '%s'\n", __func__, vocab.special_cls_id,     vocab.id_to_token[vocab.special_cls_id].text.c_str() );  }
+    if (vocab.special_mask_id != -1)    { LLAMA_LOG_INFO( "%s: MASK token       = %d '%s'\n", __func__, vocab.special_mask_id,    vocab.id_to_token[vocab.special_mask_id].text.c_str() ); }
 
-    if (vocab.linefeed_id       != -1) { LLAMA_LOG_INFO( "%s: LF token         = %d '%s'\n", __func__, vocab.linefeed_id,       vocab.id_to_token[vocab.linefeed_id].text.c_str() );       }
-    if (vocab.special_prefix_id != -1) { LLAMA_LOG_INFO( "%s: PRE token        = %d '%s'\n", __func__, vocab.special_prefix_id, vocab.id_to_token[vocab.special_prefix_id].text.c_str() ); }
-    if (vocab.special_suffix_id != -1) { LLAMA_LOG_INFO( "%s: SUF token        = %d '%s'\n", __func__, vocab.special_suffix_id, vocab.id_to_token[vocab.special_suffix_id].text.c_str() ); }
-    if (vocab.special_middle_id != -1) { LLAMA_LOG_INFO( "%s: MID token        = %d '%s'\n", __func__, vocab.special_middle_id, vocab.id_to_token[vocab.special_middle_id].text.c_str() ); }
-    if (vocab.special_eot_id    != -1) { LLAMA_LOG_INFO( "%s: EOT token        = %d '%s'\n", __func__, vocab.special_eot_id,    vocab.id_to_token[vocab.special_eot_id].text.c_str() );    }
-    if (vocab.special_eom_id    != -1) { LLAMA_LOG_INFO( "%s: EOM token        = %d '%s'\n", __func__, vocab.special_eom_id,    vocab.id_to_token[vocab.special_eom_id].text.c_str() );    }
+    if (vocab.linefeed_id != -1)        { LLAMA_LOG_INFO( "%s: LF token         = %d '%s'\n", __func__, vocab.linefeed_id,        vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
+
+    if (vocab.special_fim_pre_id != -1) { LLAMA_LOG_INFO( "%s: FIM PRE token    = %d '%s'\n", __func__, vocab.special_fim_pre_id, vocab.id_to_token[vocab.special_fim_pre_id].text.c_str() ); }
+    if (vocab.special_fim_suf_id != -1) { LLAMA_LOG_INFO( "%s: FIM SUF token    = %d '%s'\n", __func__, vocab.special_fim_suf_id, vocab.id_to_token[vocab.special_fim_suf_id].text.c_str() ); }
+    if (vocab.special_fim_mid_id != -1) { LLAMA_LOG_INFO( "%s: FIM MID token    = %d '%s'\n", __func__, vocab.special_fim_mid_id, vocab.id_to_token[vocab.special_fim_mid_id].text.c_str() ); }
+    if (vocab.special_fim_pad_id != -1) { LLAMA_LOG_INFO( "%s: FIM PAD token    = %d '%s'\n", __func__, vocab.special_fim_pad_id, vocab.id_to_token[vocab.special_fim_pad_id].text.c_str() ); }
+    if (vocab.special_fim_rep_id != -1) { LLAMA_LOG_INFO( "%s: FIM REP token    = %d '%s'\n", __func__, vocab.special_fim_rep_id, vocab.id_to_token[vocab.special_fim_rep_id].text.c_str() ); }
+    if (vocab.special_fim_sep_id != -1) { LLAMA_LOG_INFO( "%s: FIM SEP token    = %d '%s'\n", __func__, vocab.special_fim_sep_id, vocab.id_to_token[vocab.special_fim_sep_id].text.c_str() ); }
 
     for (const auto & id : vocab.special_eog_ids) {
         LLAMA_LOG_INFO( "%s: EOG token        = %d '%s'\n", __func__, id, vocab.id_to_token[id].text.c_str() );
@@ -16005,9 +16080,11 @@ struct llm_build_context {
         cur = ggml_get_rows(ctx0, cur, inp_out_ids);
 
         cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, model.output_norm_b, LLM_NORM, cb, -1);
-        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+        cb(cur, "result_norm", -1);
 
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
         cb(cur, "result_output", -1);
+
         ggml_build_forward_expand(gf, cur);
 
         return gf;
@@ -17857,10 +17934,9 @@ static void llama_tensor_dequantize_internal(
     }
     float * f32_output = (float *) output.data();
 
-    ggml_type_traits_t qtype;
+    const ggml_type_traits * qtype = ggml_get_type_traits(tensor->type);
     if (ggml_is_quantized(tensor->type)) {
-        qtype = ggml_internal_get_type_traits(tensor->type);
-        if (qtype.to_float == NULL) {
+        if (qtype->to_float == NULL) {
             throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor->type)));
         }
     } else if (tensor->type != GGML_TYPE_F16 &&
@@ -17874,7 +17950,7 @@ static void llama_tensor_dequantize_internal(
         } else if (tensor->type == GGML_TYPE_BF16) {
             ggml_bf16_to_fp32_row((ggml_bf16_t *)tensor->data, f32_output, nelements);
         } else if (ggml_is_quantized(tensor->type)) {
-            qtype.to_float(tensor->data, f32_output, nelements);
+            qtype->to_float(tensor->data, f32_output, nelements);
         } else {
             GGML_ABORT("fatal error"); // unreachable
         }
@@ -17910,7 +17986,7 @@ static void llama_tensor_dequantize_internal(
             } else if (typ == GGML_TYPE_BF16) {
                 ggml_bf16_to_fp32_row((ggml_bf16_t *)inbuf, outbuf, nels);
             } else {
-                qtype.to_float(inbuf, outbuf, nels);
+                qtype->to_float(inbuf, outbuf, nels);
             }
         };
         workers.emplace_back(compute, tensor->type, (uint8_t *) tensor->data + in_buff_offs, f32_output + out_buff_offs, thr_elems);
@@ -19004,15 +19080,20 @@ bool llama_supports_mlock(void) {
 }
 
 bool llama_supports_gpu_offload(void) {
-#if defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_RPC)
+#if defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE)
     // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
     return true;
 #else
     return ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU) != nullptr ||
-        ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU_FULL) != nullptr;
+           ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU_FULL) != nullptr ||
+           llama_supports_rpc();
 #endif
 }
 
+bool llama_supports_rpc(void) {
+    return ggml_backend_reg_by_name("RPC") != nullptr;
+}
+
 void llama_backend_init(void) {
     ggml_time_init();
 
@@ -19087,6 +19168,36 @@ struct llama_model * llama_load_model_from_file(
         model->rpc_servers.push_back(servers);
     }
 
+    // add RPC devices
+    if (!model->rpc_servers.empty()) {
+        ggml_backend_reg_t rpc_reg = ggml_backend_reg_by_name("RPC");
+        if (!rpc_reg) {
+            LLAMA_LOG_ERROR("%s: failed to find RPC backend\n", __func__);
+            llama_free_model(model);
+            return nullptr;
+        }
+
+        // ggml_backend_dev_t ggml_backend_rpc_add_device(const char * endpoint);
+        using ggml_backend_rpc_add_device_t = ggml_backend_dev_t (*)(const char *);
+        ggml_backend_rpc_add_device_t ggml_backend_rpc_add_device_fn = (ggml_backend_rpc_add_device_t) ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_device");
+        if (!ggml_backend_rpc_add_device_fn) {
+            LLAMA_LOG_ERROR("%s: failed to find RPC device add function\n", __func__);
+            llama_free_model(model);
+            return nullptr;
+        }
+
+        for (const std::string & server : model->rpc_servers) {
+            ggml_backend_dev_t dev = ggml_backend_rpc_add_device_fn(server.c_str());
+            if (dev) {
+                model->devices.push_back(dev);
+            } else {
+                LLAMA_LOG_ERROR("%s: failed to add RPC device for server '%s'\n", __func__, server.c_str());
+                llama_free_model(model);
+                return nullptr;
+            }
+        }
+    }
+
     // create list of devices to use with this model
     // currently, we use all available devices
     // TODO: rework API to give user more control over device selection
@@ -19118,7 +19229,7 @@ struct llama_model * llama_load_model_from_file(
         } else if (status == -2) {
             LLAMA_LOG_INFO("%s: cancelled model load\n", __func__);
         }
-        delete model;
+        llama_free_model(model);
         return nullptr;
     }
 
@@ -19301,23 +19412,6 @@ struct llama_context * llama_new_context_with_model(
             main_gpu -= (int)model->devices.size();
         }
 
-#if defined(GGML_USE_RPC)
-        if (model->n_gpu_layers > 0) {
-            for (const auto & endpoint : model->rpc_servers) {
-                ggml_backend_t backend = ggml_backend_rpc_init(endpoint.c_str());
-                if (backend == nullptr) {
-                    LLAMA_LOG_ERROR("%s: failed to initialize RPC to '%s'\n", __func__, endpoint.c_str());
-                    llama_free(ctx);
-                    return nullptr;
-                }
-                ctx->backends.push_back(backend);
-            }
-        }
-        if (main_gpu >= (int)model->rpc_servers.size()) {
-            main_gpu -= (int)model->rpc_servers.size();
-        }
-#endif
-
 #if defined(GGML_USE_SYCL)
         // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
         if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
@@ -19429,7 +19523,7 @@ struct llama_context * llama_new_context_with_model(
             }
 
             LLAMA_LOG_INFO("%s: KV self size  = %7.2f MiB, K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__,
-                (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f),
+                      (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f),
                 ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f),
                 ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
         }
@@ -21283,6 +21377,10 @@ llama_token llama_token_eos(const struct llama_model * model) {
     return llama_token_eos_impl(model->vocab);
 }
 
+llama_token llama_token_eot(const struct llama_model * model) {
+    return llama_token_eot_impl(model->vocab);
+}
+
 llama_token llama_token_cls(const struct llama_model * model) {
     return llama_token_cls_impl(model->vocab);
 }
@@ -21319,8 +21417,28 @@ llama_token llama_token_suffix(const struct llama_model * model) {
     return llama_token_suffix_impl(model->vocab);
 }
 
-llama_token llama_token_eot(const struct llama_model * model) {
-    return llama_token_eot_impl(model->vocab);
+llama_token llama_token_fim_pre(const struct llama_model * model) {
+    return llama_token_fim_pre_impl(model->vocab);
+}
+
+llama_token llama_token_fim_suf(const struct llama_model * model) {
+    return llama_token_fim_suf_impl(model->vocab);
+}
+
+llama_token llama_token_fim_mid(const struct llama_model * model) {
+    return llama_token_fim_mid_impl(model->vocab);
+}
+
+llama_token llama_token_fim_pad(const struct llama_model * model) {
+    return llama_token_fim_pad_impl(model->vocab);
+}
+
+llama_token llama_token_fim_rep(const struct llama_model * model) {
+    return llama_token_fim_rep_impl(model->vocab);
+}
+
+llama_token llama_token_fim_sep(const struct llama_model * model) {
+    return llama_token_fim_sep_impl(model->vocab);
 }
 
 //
@@ -21348,6 +21466,13 @@ int32_t llama_token_to_piece(
     return llama_token_to_piece_impl(model->vocab, token, buf, length, lstrip, special);
 }
 
+bool llama_token_is_prefix(
+    const struct llama_model * model,
+                 llama_token   token0,
+                 llama_token   token1) {
+    return llama_token_is_prefix_impl(model->vocab, token0, token1);
+}
+
 int32_t llama_detokenize(
     const struct llama_model * model,
            const llama_token * tokens,
@@ -21678,6 +21803,10 @@ struct llama_sampler * llama_sampler_init_grammar(const struct llama_model * mod
     return llama_sampler_init_grammar_impl(model->vocab, grammar_str, grammar_root);
 }
 
+struct llama_sampler * llama_sampler_init_infill(const struct llama_model * model) {
+    return llama_sampler_init_infill_impl(model->vocab);
+}
+
 //
 // model split
 //
diff --git a/src/unicode-data.cpp b/src/unicode-data.cpp
index 07424bbab..04dcd7fcf 100644
--- a/src/unicode-data.cpp
+++ b/src/unicode-data.cpp
@@ -2311,7 +2311,7 @@ const std::unordered_set unicode_set_whitespace = {
 0x003000,
 };
 
-// list is always in ascending order, to enable binary searh
+// list is always in ascending order, to enable binary search
 const std::initializer_list> unicode_map_lowercase = {
 {0x000041, 0x000061},
 {0x000042, 0x000062},
@@ -3748,7 +3748,7 @@ const std::initializer_list> unicode_map_lowercase
 {0x01E921, 0x01E943},
 };
 
-// list is always in ascending order, to enable binary searh
+// list is always in ascending order, to enable binary search
 const std::initializer_list> unicode_map_uppercase = {
 {0x000061, 0x000041},
 {0x000062, 0x000042},
diff --git a/tests/test-arg-parser.cpp b/tests/test-arg-parser.cpp
index e07d09733..3665238b5 100644
--- a/tests/test-arg-parser.cpp
+++ b/tests/test-arg-parser.cpp
@@ -10,12 +10,12 @@
 #include 
 
 int main(void) {
-    gpt_params params;
+    common_params params;
 
     printf("test-arg-parser: make sure there is no duplicated arguments in any examples\n\n");
     for (int ex = 0; ex < LLAMA_EXAMPLE_COUNT; ex++) {
         try {
-            auto ctx_arg = gpt_params_parser_init(params, (enum llama_example)ex);
+            auto ctx_arg = common_params_parser_init(params, (enum llama_example)ex);
             std::unordered_set seen_args;
             std::unordered_set seen_env_vars;
             for (const auto & opt : ctx_arg.options) {
@@ -58,44 +58,44 @@ int main(void) {
 
     // missing value
     argv = {"binary_name", "-m"};
-    assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
+    assert(false == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
 
     // wrong value (int)
     argv = {"binary_name", "-ngl", "hello"};
-    assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
+    assert(false == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
 
     // wrong value (enum)
     argv = {"binary_name", "-sm", "hello"};
-    assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
+    assert(false == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
 
     // non-existence arg in specific example (--draft cannot be used outside llama-speculative)
     argv = {"binary_name", "--draft", "123"};
-    assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_SERVER));
+    assert(false == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_SERVER));
 
 
     printf("test-arg-parser: test valid usage\n\n");
 
     argv = {"binary_name", "-m", "model_file.gguf"};
-    assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
+    assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
     assert(params.model == "model_file.gguf");
 
     argv = {"binary_name", "-t", "1234"};
-    assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
+    assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
     assert(params.cpuparams.n_threads == 1234);
 
     argv = {"binary_name", "--verbose"};
-    assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
+    assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
     assert(params.verbosity > 1);
 
     argv = {"binary_name", "-m", "abc.gguf", "--predict", "6789", "--batch-size", "9090"};
-    assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
+    assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
     assert(params.model == "abc.gguf");
     assert(params.n_predict == 6789);
     assert(params.n_batch == 9090);
 
     // --draft cannot be used outside llama-speculative
     argv = {"binary_name", "--draft", "123"};
-    assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_SPECULATIVE));
+    assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_SPECULATIVE));
     assert(params.n_draft == 123);
 
 // skip this part on windows, because setenv is not supported
@@ -106,12 +106,12 @@ int main(void) {
 
     setenv("LLAMA_ARG_THREADS", "blah", true);
     argv = {"binary_name"};
-    assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
+    assert(false == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
 
     setenv("LLAMA_ARG_MODEL", "blah.gguf", true);
     setenv("LLAMA_ARG_THREADS", "1010", true);
     argv = {"binary_name"};
-    assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
+    assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
     assert(params.model == "blah.gguf");
     assert(params.cpuparams.n_threads == 1010);
 
@@ -121,7 +121,7 @@ int main(void) {
     setenv("LLAMA_ARG_MODEL", "blah.gguf", true);
     setenv("LLAMA_ARG_THREADS", "1010", true);
     argv = {"binary_name", "-m", "overwritten.gguf"};
-    assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
+    assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
     assert(params.model == "overwritten.gguf");
     assert(params.cpuparams.n_threads == 1010);
 #endif // _WIN32
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index fa26cc653..ee1a8877e 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -133,7 +133,7 @@ static std::vector tensor_to_float(const ggml_tensor * t) {
     std::vector buf(ggml_nbytes(t));
     ggml_backend_tensor_get(t, buf.data(), 0, ggml_nbytes(t));
 
-    ggml_type_traits_t tt = ggml_internal_get_type_traits(t->type);
+    const auto * tt = ggml_get_type_traits(t->type);
     size_t bs = ggml_blck_size(t->type);
     std::vector vq(ggml_blck_size(t->type));
     bool quantized = ggml_is_quantized(t->type);
@@ -159,7 +159,7 @@ static std::vector tensor_to_float(const ggml_tensor * t) {
                     } else if (t->type == GGML_TYPE_I8) {
                         tv.push_back((float)*(int8_t *) &buf[i]);
                     } else if (quantized) {
-                        tt.to_float(&buf[i], vq.data(), bs);
+                        tt->to_float(&buf[i], vq.data(), bs);
                         tv.insert(tv.end(), vq.begin(), vq.end());
                     } else {
                         GGML_ABORT("fatal error");
diff --git a/tests/test-chat-template.cpp b/tests/test-chat-template.cpp
index a8222caee..6f046249f 100644
--- a/tests/test-chat-template.cpp
+++ b/tests/test-chat-template.cpp
@@ -140,11 +140,11 @@ int main(void) {
 
     // test llama_chat_format_single for system message
     printf("\n\n=== llama_chat_format_single (system message) ===\n\n");
-    std::vector chat2;
-    llama_chat_msg sys_msg{"system", "You are a helpful assistant"};
+    std::vector chat2;
+    common_chat_msg sys_msg{"system", "You are a helpful assistant"};
 
     auto fmt_sys = [&](std::string tmpl) {
-        auto output = llama_chat_format_single(nullptr, tmpl, chat2, sys_msg, false);
+        auto output = common_chat_format_single(nullptr, tmpl, chat2, sys_msg, false);
         printf("fmt_sys(%s) : %s\n", tmpl.c_str(), output.c_str());
         printf("-------------------------\n");
         return output;
@@ -160,10 +160,10 @@ int main(void) {
     chat2.push_back({"system", "You are a helpful assistant"});
     chat2.push_back({"user", "Hello"});
     chat2.push_back({"assistant", "I am assistant"});
-    llama_chat_msg new_msg{"user", "How are you"};
+    common_chat_msg new_msg{"user", "How are you"};
 
     auto fmt_single = [&](std::string tmpl) {
-        auto output = llama_chat_format_single(nullptr, tmpl, chat2, new_msg, true);
+        auto output = common_chat_format_single(nullptr, tmpl, chat2, new_msg, true);
         printf("fmt_single(%s) : %s\n", tmpl.c_str(), output.c_str());
         printf("-------------------------\n");
         return output;
diff --git a/tests/test-json-schema-to-grammar.cpp b/tests/test-json-schema-to-grammar.cpp
index 3a89598a8..9d2db91f5 100755
--- a/tests/test-json-schema-to-grammar.cpp
+++ b/tests/test-json-schema-to-grammar.cpp
@@ -696,7 +696,7 @@ static void test_all(const std::string & lang, std::function tmp_q(2*test_size);
     std::vector tmp_out(test_size);
 
-    qfns.from_float(test_data, tmp_q.data(), test_size);
-    qfns.to_float(tmp_q.data(), tmp_out.data(), test_size);
+    qfns->from_float(test_data, tmp_q.data(), test_size);
+    qfns->to_float(tmp_q.data(), tmp_out.data(), test_size);
     return array_rmse(test_data, tmp_out.data(), test_size);
 }
 
 // Total quantization error on test data
-static float reference_quantization_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data) {
+static float reference_quantization_error(const ggml_type_traits * qfns, size_t test_size, const float * test_data) {
     std::vector tmp_q(2*test_size);
     std::vector tmp_out(test_size);
     std::vector tmp_out_ref(test_size);
 
-    qfns.from_float(test_data, tmp_q.data(), test_size);
-    qfns.to_float(tmp_q.data(), tmp_out.data(), test_size);
+    qfns->from_float(test_data, tmp_q.data(), test_size);
+    qfns->to_float(tmp_q.data(), tmp_out.data(), test_size);
 
-    qfns.from_float_ref(test_data, tmp_q.data(), test_size);
-    qfns.to_float(tmp_q.data(), tmp_out_ref.data(), test_size);
+    qfns->from_float_ref(test_data, tmp_q.data(), test_size);
+    qfns->to_float(tmp_q.data(), tmp_out_ref.data(), test_size);
 
     return array_rmse(tmp_out.data(), tmp_out_ref.data(), test_size);
 }
@@ -78,18 +78,18 @@ static float dot_product(const float * a1, const float * a2, size_t test_size) {
 
 // Total dot product error
 static float dot_product_error(
-    ggml_type_traits_t & qfns, size_t test_size, const float * test_data1, const float *test_data2
+    const ggml_type_traits * qfns, size_t test_size, const float * test_data1, const float *test_data2
 ) {
     std::vector tmp_q1(2*test_size);
     std::vector tmp_q2(2*test_size);
 
-    auto vdot = ggml_internal_get_type_traits(qfns.vec_dot_type);
+    const auto * vdot = ggml_get_type_traits(qfns->vec_dot_type);
 
-    qfns.from_float(test_data1, tmp_q1.data(), test_size);
-    vdot.from_float(test_data2, tmp_q2.data(), test_size);
+    qfns->from_float(test_data1, tmp_q1.data(), test_size);
+    vdot->from_float(test_data2, tmp_q2.data(), test_size);
 
     float result = INFINITY;
-    qfns.vec_dot(test_size, &result, 0, tmp_q1.data(), 0, tmp_q2.data(), 0, 1);
+    qfns->vec_dot(test_size, &result, 0, tmp_q1.data(), 0, tmp_q2.data(), 0, 1);
 
     const float dot_ref = dot_product(test_data1, test_data2, test_size);
 
@@ -131,10 +131,10 @@ int main(int argc, char * argv[]) {
 
     for (int i = 0; i < GGML_TYPE_COUNT; i++) {
         ggml_type type = (ggml_type) i;
-        ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);
+        const auto * qfns = ggml_get_type_traits(type);
 
         // deprecated - skip
-        if (qfns.blck_size == 0) {
+        if (qfns->blck_size == 0) {
             continue;
         }
 
@@ -143,7 +143,7 @@ int main(int argc, char * argv[]) {
         printf("Testing %s\n", ggml_type_name((ggml_type) i));
         ggml_quantize_init(ei);
 
-        if (qfns.from_float && qfns.to_float) {
+        if (qfns->from_float && qfns->to_float) {
             const float total_error = total_quantization_error(qfns, test_size, test_data.data());
             const float max_quantization_error =
                 type == GGML_TYPE_TQ1_0   ? MAX_QUANTIZATION_TOTAL_ERROR_TERNARY :
diff --git a/tests/test-quantize-perf.cpp b/tests/test-quantize-perf.cpp
index 24e066053..bdbdd90a8 100644
--- a/tests/test-quantize-perf.cpp
+++ b/tests/test-quantize-perf.cpp
@@ -122,9 +122,9 @@ static void usage(char * argv[]) {
     printf("  --type TYPE           set test type as");
     for (int i = 0; i < GGML_TYPE_COUNT; i++) {
         ggml_type type = (ggml_type) i;
-        ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);
+        const auto * qfns = ggml_get_type_traits(type);
         if (ggml_type_name(type) != NULL) {
-            if (qfns.from_float && qfns.to_float) {
+            if (qfns->from_float && qfns->to_float) {
                 printf(" %s", ggml_type_name(type));
             }
         }
@@ -270,12 +270,12 @@ int main(int argc, char * argv[]) {
 
     for (int i = 0; i < GGML_TYPE_COUNT; i++) {
         ggml_type type = (ggml_type) i;
-        ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);
+        const auto * qfns = ggml_get_type_traits(type);
         if (!params.include_types.empty() && ggml_type_name(type) && std::find(params.include_types.begin(), params.include_types.end(), ggml_type_name(type)) == params.include_types.end()) {
             continue;
         }
 
-        if (qfns.from_float && qfns.to_float) {
+        if (qfns->from_float && qfns->to_float) {
             printf("%s\n", ggml_type_name(type));
 
             ggml_quantize_init(type);
@@ -285,7 +285,7 @@ int main(int argc, char * argv[]) {
                 for (size_t size : params.test_sizes) {
                     printf("    %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
                     auto quantize_fn = [&](void) -> float {
-                        qfns.from_float_ref(test_data1, test_q1, size);
+                        qfns->from_float_ref(test_data1, test_q1, size);
                         return test_q1[0];
                     };
                     size_t quantized_size = ggml_row_size(type, size);
@@ -299,7 +299,7 @@ int main(int argc, char * argv[]) {
                 for (size_t size : params.test_sizes) {
                     printf("    %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
                     auto quantize_fn = [&](void) -> float {
-                        qfns.from_float(test_data1, test_q1, size);
+                        qfns->from_float(test_data1, test_q1, size);
                         return test_q1[0];
                     };
                     size_t quantized_size = ggml_row_size(type, size);
@@ -310,11 +310,11 @@ int main(int argc, char * argv[]) {
 
             if (params.op_dequantize_row_q) {
                 printf("  dequantize_row_q\n");
-                qfns.from_float(test_data1, test_q1, largest);
+                qfns->from_float(test_data1, test_q1, largest);
                 for (size_t size : params.test_sizes) {
                     printf("    %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
                     auto quantize_fn = [&](void) -> float {
-                        qfns.to_float(test_q1, test_out, size);
+                        qfns->to_float(test_q1, test_out, size);
                         return test_out[0];
                     };
                     size_t quantized_size = ggml_row_size(type, size);
@@ -328,8 +328,8 @@ int main(int argc, char * argv[]) {
                 for (size_t size : params.test_sizes) {
                     printf("    %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
                     auto quantize_fn = [&](void) -> float {
-                        auto vdot = ggml_internal_get_type_traits(qfns.vec_dot_type);
-                        vdot.from_float(test_data1, test_q1, size);
+                        const auto * vdot = ggml_get_type_traits(qfns->vec_dot_type);
+                        vdot->from_float(test_data1, test_q1, size);
                         return test_q1[0];
                     };
                     size_t quantized_size = ggml_row_size(type, size);
@@ -340,13 +340,13 @@ int main(int argc, char * argv[]) {
 
             if (params.op_vec_dot_q) {
                 printf("  vec_dot_q\n");
-                qfns.from_float(test_data1, test_q1, largest);
-                qfns.from_float(test_data2, test_q2, largest);
+                qfns->from_float(test_data1, test_q1, largest);
+                qfns->from_float(test_data2, test_q2, largest);
                 for (size_t size : params.test_sizes) {
                     printf("    %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
                     auto quantize_fn = [&](void) -> float {
                         float result;
-                        qfns.vec_dot(size, &result, 0, test_q1, 0, test_q2, 0, 1);
+                        qfns->vec_dot(size, &result, 0, test_q1, 0, test_q2, 0, 1);
                         return result;
                     };
                     size_t quantized_size = ggml_row_size(type, size);
diff --git a/tests/test-sampling.cpp b/tests/test-sampling.cpp
index 6e021c4c7..1372bdf13 100644
--- a/tests/test-sampling.cpp
+++ b/tests/test-sampling.cpp
@@ -111,6 +111,28 @@ static void test_min_p(const std::vector & probs, const std::vector & probs, const std::vector & expected_probs, float p, float t) {
+    const size_t n_vocab = probs.size();
+
+    std::vector cur;
+    cur.reserve(n_vocab);
+    for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
+        const float logit = logf(probs[token_id]);
+        cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
+    }
+
+    llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false };
+    APPLY(llama_sampler_init_softmax(), &cur_p);
+    DUMP(&cur_p);
+    APPLY(llama_sampler_init_xtc(p, t, 0, 0), &cur_p);
+    DUMP(&cur_p);
+
+    GGML_ASSERT(cur_p.size == expected_probs.size());
+    for (size_t i = 0; i < cur_p.size; i++) {
+        GGML_ASSERT(fabs(cur_p.data[i].p - expected_probs[i]) < 1e-5);
+    }
+}
+
 static void test_typical(const std::vector & probs, const std::vector & expected_probs, float p) {
     const size_t n_vocab = probs.size();
 
@@ -263,7 +285,7 @@ static void bench(llama_sampler * cnstr, const char * cnstr_name, const std::vec
     }
     const int64_t t_end = ggml_time_us();
     llama_sampler_free(cnstr);
-    printf("%-42s: %8.3f us/iter\n", cnstr_name, (t_end - t_start) / (float)n_iter);
+    printf("%-43s: %8.3f us/iter\n", cnstr_name, (t_end - t_start) / (float)n_iter);
 }
 
 #define BENCH(__cnstr, __data, __n_iter) bench((__cnstr), #__cnstr, (__data), (__n_iter))
@@ -279,12 +301,13 @@ static void test_perf() {
         data.emplace_back(llama_token_data{i, logit, 0.0f});
     }
 
-    BENCH(llama_sampler_init_top_k    (40),      data, 32);
-    BENCH(llama_sampler_init_top_p    (0.8f, 1), data, 32);
-    BENCH(llama_sampler_init_min_p    (0.2f, 1), data, 32);
-    BENCH(llama_sampler_init_tail_free(0.5f, 1), data, 32);
-    BENCH(llama_sampler_init_typical  (0.5f, 1), data, 32);
-    BENCH(llama_sampler_init_softmax  (),        data, 32);
+    BENCH(llama_sampler_init_top_k    (40),                     data, 32);
+    BENCH(llama_sampler_init_top_p    (0.8f, 1),                data, 32);
+    BENCH(llama_sampler_init_min_p    (0.2f, 1),                data, 32);
+    BENCH(llama_sampler_init_tail_free(0.5f, 1),                data, 32);
+    BENCH(llama_sampler_init_typical  (0.5f, 1),                data, 32);
+    BENCH(llama_sampler_init_xtc      (1.0f, 0.1f, 1, 1),       data, 32);
+    BENCH(llama_sampler_init_softmax  (),                       data, 32);
 }
 
 int main(void) {
@@ -309,6 +332,14 @@ int main(void) {
     test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.4f},                                  0.76f);
     test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.4f},                                  1.00f);
 
+    printf("XTC should:\n");
+    test_xtc({0.4f, 0.3f, 0.2f, 0.1f},   {0.1f},                                0.99f, 0.09f);
+    test_xtc({0.4f, 0.3f, 0.2f, 0.1f},   {0.2f, 0.1f},                          0.99f, 0.19f);
+    test_xtc({0.4f, 0.3f, 0.2f, 0.1f},   {0.3f, 0.2f, 0.1f},                    0.99f, 0.29f);
+
+    printf("XTC should not:\n");
+    test_xtc({0.4f, 0.3f, 0.2f, 0.1f},   {0.4f, 0.3f, 0.2f, 0.1f},              0.99f, 0.39f);
+
     test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f}, 0.25f);
     test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f, 0.25f}, 0.75f);
     test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f, 0.25f}, 0.99f);
diff --git a/tests/test-tokenizer-0.cpp b/tests/test-tokenizer-0.cpp
index 4d49850c9..0af85f002 100644
--- a/tests/test-tokenizer-0.cpp
+++ b/tests/test-tokenizer-0.cpp
@@ -202,7 +202,7 @@ int main(int argc, char **argv) {
     for (int i = 0; i < nthread; i++) {
         threads[i] = std::thread([&, i]() {
             for (const auto & test_kv : k_tests) {
-                const std::vector res = llama_tokenize(ctx, test_kv.first, add_special, false);
+                const std::vector res = common_tokenize(ctx, test_kv.first, add_special, false);
 
                 // here only print the result of the first thread
                 // because the other threads are running the same tests
@@ -212,7 +212,7 @@ int main(int argc, char **argv) {
 
                 printf("\n");
                 printf("src: '%s'\n", test_kv.first.c_str());
-                printf("res: '%s'\n", llama_detokenize(ctx, res).c_str());
+                printf("res: '%s'\n", common_detokenize(ctx, res).c_str());
                 printf("tok: ");
                 for (const auto & tok : res) {
                     printf("%d ", tok);
@@ -229,16 +229,16 @@ int main(int argc, char **argv) {
                 if (!correct) {
                     fprintf(stderr, "%s : failed test:    '%s'\n", __func__, test_kv.first.c_str());
                     fprintf(stderr, "%s : detokenized to: '%s' instead of '%s'\n", __func__,
-                        llama_detokenize(ctx, res).c_str(),
-                        llama_detokenize(ctx, test_kv.second).c_str());
+                        common_detokenize(ctx, res).c_str(),
+                        common_detokenize(ctx, test_kv.second).c_str());
                     fprintf(stderr, "%s : expected tokens: ", __func__);
                     for (const auto & t : test_kv.second) {
-                        fprintf(stderr, "%6d '%s', ", t, llama_token_to_piece(ctx, t).c_str());
+                        fprintf(stderr, "%6d '%s', ", t, common_token_to_piece(ctx, t).c_str());
                     }
                     fprintf(stderr, "\n");
                     fprintf(stderr, "%s : got tokens:      ", __func__);
                     for (const auto & t : res) {
-                        fprintf(stderr, "%6d '%s', ", t, llama_token_to_piece(ctx, t).c_str());
+                        fprintf(stderr, "%6d '%s', ", t, common_token_to_piece(ctx, t).c_str());
                     }
                     fprintf(stderr, "\n");
 
@@ -273,7 +273,7 @@ int main(int argc, char **argv) {
         {
             const auto t_start = ggml_time_us();
 
-            res = llama_tokenize(ctx, text, add_special, false);
+            res = common_tokenize(ctx, text, add_special, false);
 
             const auto t_end = ggml_time_us();
 
diff --git a/tests/test-tokenizer-1-bpe.cpp b/tests/test-tokenizer-1-bpe.cpp
index 9498387e0..0ff7fc833 100644
--- a/tests/test-tokenizer-1-bpe.cpp
+++ b/tests/test-tokenizer-1-bpe.cpp
@@ -78,10 +78,10 @@ int main(int argc, char **argv) {
     const int n_vocab = llama_n_vocab(model);
 
     for (int i = 0; i < n_vocab; ++i) {
-        std::string str = llama_detokenize(ctx, std::vector(1, i));
+        std::string str = common_detokenize(ctx, std::vector(1, i));
         try {
             auto cps = unicode_cpts_from_utf8(str);
-            std::vector tokens = llama_tokenize(ctx, str, false, true);
+            std::vector tokens = common_tokenize(ctx, str, false, true);
             if (ignore_merges && tokens.size() > 1) {
                 fprintf(stderr,
                         "%s : error: token %d detokenizes to '%s'(%zu) but "
@@ -94,7 +94,7 @@ int main(int argc, char **argv) {
                 fprintf(stderr, "]\n");
                 return 2;
             }
-            std::string check = llama_detokenize(ctx, tokens);
+            std::string check = common_detokenize(ctx, tokens);
             if (check != str) {
                 fprintf(stderr, "%s : error: token %d detokenizes to '%s'(%zu) but tokenization of this detokenizes to '%s'(%zu)\n",
                     __func__, i, str.c_str(), str.length(), check.c_str(), check.length());
@@ -123,8 +123,8 @@ int main(int argc, char **argv) {
                     }
 
                     std::string str = unicode_cpt_to_utf8(cp);
-                    std::vector tokens = llama_tokenize(ctx, str, false);
-                    std::string check = llama_detokenize(ctx, tokens);
+                    std::vector tokens = common_tokenize(ctx, str, false);
+                    std::string check = common_detokenize(ctx, tokens);
                     if (cp != 9601 && str != check) {
                         fprintf(stderr, "error: codepoint 0x%x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
                                 cp, check.c_str(), check.length(), str.c_str(), str.length());
diff --git a/tests/test-tokenizer-1-spm.cpp b/tests/test-tokenizer-1-spm.cpp
index 7ca9e2ca6..9b0716a43 100644
--- a/tests/test-tokenizer-1-spm.cpp
+++ b/tests/test-tokenizer-1-spm.cpp
@@ -66,9 +66,9 @@ int main(int argc, char ** argv) {
     const int n_vocab = llama_n_vocab(model);
 
     for (int i = 0; i < n_vocab; ++i) {
-        std::string str = llama_detokenize(ctx, std::vector(1, i), true);
-        std::vector tokens = llama_tokenize(ctx, str, false, true);
-        std::string check = llama_detokenize(ctx, tokens);
+        std::string str = common_detokenize(ctx, std::vector(1, i), true);
+        std::vector tokens = common_tokenize(ctx, str, false, true);
+        std::string check = common_detokenize(ctx, tokens);
         if (check != str) {
             fprintf(stderr, "%s : error: token %d detokenizes to '%s'(%zu) but tokenization of this detokenizes to '%s'(%zu)\n",
                 __func__, i, str.c_str(), str.length(), check.c_str(), check.length());
@@ -93,8 +93,8 @@ int main(int argc, char ** argv) {
                     }
 
                     std::string str = unicode_cpt_to_utf8(cp);
-                    std::vector tokens = llama_tokenize(ctx, str, false, true);
-                    std::string check = llama_detokenize(ctx, tokens);
+                    std::vector tokens = common_tokenize(ctx, str, false, true);
+                    std::string check = common_detokenize(ctx, tokens);
                     if (cp != 9601 && str != check) {
                         fprintf(stderr, "error: codepoint 0x%x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
                                 cp, check.c_str(), check.length(), str.c_str(), str.length());